diff --git a/deepspeed/.github/workflows/amd-mi200.yml b/deepspeed/.github/workflows/amd-mi200.yml
new file mode 100644
index 000000000000..ea8d2f5f806f
--- /dev/null
+++ b/deepspeed/.github/workflows/amd-mi200.yml
@@ -0,0 +1,86 @@
+name: amd-mi200
+
+on:
+ workflow_dispatch:
+ pull_request:
+ paths:
+ - '.github/workflows/amd-mi200.yml'
+ - 'requirements/**'
+ schedule:
+ - cron: "0 0 * * *"
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+permissions:
+ contents: read
+ issues: write
+
+jobs:
+ amd-tests:
+ # The type of runner that the job will run on
+ runs-on: [self-hosted, amd, mi200]
+
+ # Steps represent a sequence of tasks that will be executed as part of the job
+ steps:
+ # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
+ - uses: actions/checkout@v4
+
+ - id: setup-venv
+ uses: ./.github/workflows/setup-venv
+
+ - name: Install pytorch
+ run: |
+ pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/rocm6.0
+ python -c "import torch; print('torch:', torch.__version__, torch)"
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+
+ - name: Install transformers
+ run: |
+ git clone https://github.com/huggingface/transformers
+ cd transformers
+ # if needed switch to the last known good SHA until transformers@master is fixed
+ # git checkout 1cc453d33
+ git rev-parse --short HEAD
+ pip install .
+
+ - name: Install (ROCm) apex
+ run: |
+ git clone https://github.com/ROCmSoftwarePlatform/apex.git
+ cd apex
+ git checkout torch_2.1_higher
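+ # Rebuild and re-cache apex only when the checked-out commit differs from the version recorded in /blob/amd-apex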
+ CURRENT_VER=$(git rev-parse HEAD)
+ INSTALLED_VER=$(cat /blob/amd-apex/.venv_installed_version)
+ if [[ "$CURRENT_VER" != "$INSTALLED_VER" ]]; then
+ pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings="--global-option=--cpp_ext" --config-settings="--global-option=--cuda_ext" --target=/blob/amd-apex/ --upgrade .
+ git rev-parse HEAD > /blob/amd-apex/.venv_installed_version
+ fi
+ echo PYTHONPATH=$PYTHONPATH:/blob/amd-apex/ >> $GITHUB_ENV
+ # Runs a set of commands using the runners shell
+ - name: Install deepspeed
+ run: |
+ pip install .[dev,1bit,autotuning]
+ #python -c "from deepspeed.env_report import cli_main; cli_main()"
+ ds_report
+
+ - name: Python environment
+ run: |
+ pip list
+
+ # Runs a set of commands using the runners shell
+ - name: Unit tests
+ run: |
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+ cd tests
+ pytest $PYTEST_OPTS -n 4 --verbose unit/
+ pytest $PYTEST_OPTS -m 'sequential' unit/
+
+ - name: Open GitHub issue if nightly CI fails
+ if: ${{ failure() && (github.event_name == 'schedule') }}
+ uses: JasonEtco/create-an-issue@v2
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ with:
+ filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+ update_existing: true
diff --git a/deepspeed/.github/workflows/nv-mii.yml b/deepspeed/.github/workflows/nv-mii.yml
new file mode 100644
index 000000000000..d394b7e24bd6
--- /dev/null
+++ b/deepspeed/.github/workflows/nv-mii.yml
@@ -0,0 +1,74 @@
+name: nv-mii
+
+on:
+ workflow_dispatch:
+ inputs:
+ mii_branch:
+ description: 'DeepSpeed-MII Branch'
+ required: false
+ default: 'main'
+ type: string
+ pull_request:
+ paths:
+ - '.github/workflows/nv-mii.yml'
+ - 'requirements/**'
+ - 'setup.py'
+ - 'deepspeed/__init__.py'
+ - 'deepspeed/inference/**'
+ - '!deepspeed/inference/v2/**' # exclude v2 dir
+ merge_group:
+ branches: [ master ]
+ schedule:
+ - cron: "0 0 * * *"
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ unit-tests:
+ runs-on: [self-hosted, nvidia, cu117, v100]
+
+ steps:
+ - uses: actions/checkout@v4
+
+ - id: setup-venv
+ uses: ./.github/workflows/setup-venv
+
+ - name: Install pytorch
+ run: |
+ pip3 install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118
+ python -c "import torch; print('torch:', torch.__version__, torch)"
+ python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+
+ - name: Install transformers
+ run: |
+ git clone https://github.com/huggingface/transformers
+ cd transformers
+ # if needed switch to the last known good SHA until transformers@master is fixed
+ git checkout v4.42.4
+ git rev-parse --short HEAD
+ pip install .
+
+ - name: Install deepspeed
+ run: |
+ pip install .[dev]
+ ds_report
+
+ - name: Python environment
+ run: |
+ pip list
+
+ - name: MII unit tests
+ run: |
+ BRANCH="main"
+ if [[ ! -z "${{ github.event.inputs.mii_branch }}" ]]; then
+ BRANCH="${{ github.event.inputs.mii_branch }}"
+ fi
+ echo "Cloning DeepSpeed-MII branch: $BRANCH"
+ git clone -b $BRANCH --depth=1 https://github.com/microsoft/DeepSpeed-MII.git
+ cd DeepSpeed-MII
+ pip install .[dev]
+ unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+ cd tests/legacy
+ pytest $PYTEST_OPTS --forked -m "deepspeed" ./
diff --git a/deepspeed/blogs/deepspeed-fastgen/README.md b/deepspeed/blogs/deepspeed-fastgen/README.md
new file mode 100644
index 000000000000..e287af2540ed
--- /dev/null
+++ b/deepspeed/blogs/deepspeed-fastgen/README.md
@@ -0,0 +1,309 @@
+
+
+# DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference
+
+
+
+
+
+
+
+
+## Table of Contents
+1. [Introduction](#introduction)
+2. [Key LLM Serving Techniques](#background)
+3. [Dynamic SplitFuse: A Novel Prompt and Generation Composition Strategy](#technical-approach)
+4. [Performance Evaluation](#performance-evaluation)
+5. [DeepSpeed-FastGen: Implementation and Usage](#using-deepspeed-fastgen)
+6. [Try out DeepSpeed-FastGen](#try)
+7. [Acknowledgements](#acknowledgements)
+
+
+## 1. Introduction
+
+Large language models (LLMs) like GPT-4 and LLaMA have emerged as a dominant workload in serving a wide range of applications infused with AI at every level. From general chat models to document summarization, and from autonomous driving to copilots at every layer of the software stack, the demand to deploy and serve these models at scale has skyrocketed. While frameworks like DeepSpeed, PyTorch, and several others can regularly achieve good hardware utilization during LLM training, the interactive nature of these applications and the poor arithmetic intensity of tasks like open-ended text generation have become the bottleneck for inference throughput in existing systems.
+
+To this end, frameworks like [vLLM](https://arxiv.org/pdf/2309.06180.pdf) powered by PagedAttention and research systems like [Orca](https://www.usenix.org/system/files/osdi22-yu.pdf) have significantly improved the performance of inference for LLMs. However, these systems still struggle to provide consistent quality of service, particularly for workloads with longer prompts. These long prompt workloads are becoming increasingly important as more and more models, like [MPT-StoryWriter](https://www.mosaicml.com/blog/mpt-7b), and systems, such as [DeepSpeed Ulysses](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ulysses), support context windows stretching to tens of thousands of tokens. To better understand the problem space, we provide detailed examples of how text generation works for LLMs in two distinct phases called prompt processing and generation. When systems treat them as distinct phases, generation is preempted by prompt processing, which risks breaking the service level agreements (SLAs).
+
+Today, we are glad to present DeepSpeed-FastGen, a system that overcomes these limitations by leveraging the proposed Dynamic SplitFuse technique and offers up to 2.3x higher effective throughput compared to state-of-the-art systems like vLLM. DeepSpeed-FastGen leverages the combination of DeepSpeed-MII and DeepSpeed-Inference to provide an easy-to-use serving system.
+
+**Quick Start:** Trying DeepSpeed-FastGen is as simple as installing the latest [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII) release:
+
+```bash
+pip install deepspeed-mii
+```
+
+To generate text using a simple non-persistent pipeline deployment, run the following code. For more details, please see [Section 5](#using-deepspeed-fastgen).
+
+```python
+from mii import pipeline
+pipe = pipeline("mistralai/Mistral-7B-v0.1")
+output = pipe(["Hello, my name is", "DeepSpeed is"], max_new_tokens=128)
+print(output)
+```
+
+## 2. Existing LLM Serving Techniques in Literature
+
+A text generation workload for a single sequence consists of two phases: 1) prompt processing, in which the user-provided text is efficiently processed as a batch of tokens to build a key-value (KV) cache for attention, and 2) token generation, which will add a single token to that cache and generate a new token. Over the course of generating a sequence of text, many forward calls are made to the model to produce the full sequence. Two major techniques have been proposed in the literature and deployed in systems that address various limitations and bottlenecks that may arise during these phases.
+
+_Blocked KV Caching:_
+
+vLLM identified that memory fragmentation due to large monolithic KV-caches significantly reduced the concurrency of LLM serving systems and proposed [Paged Attention](https://arxiv.org/pdf/2309.06180.pdf) to enable non-contiguous caches and increase total system throughput. Rather than assign individual variable-sized contiguous chunks of memory, the underlying storage in the KV cache is fixed-sized blocks (also known as pages). The blocked KV-cache increases system throughput by increasing the amount of potential sequence concurrency by eliminating KV-cache induced memory fragmentation. Non-contiguous KV cache implementations are also included in [HuggingFace TGI](https://github.com/huggingface/text-generation-inference) and [NVIDIA TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM).
+
+_Continuous Batching:_
+
+In the past, dynamic batching, in which a server would wait for multiple requests to process in phase with each other, was used to improve GPU utilization. However, this approach has drawbacks, as it typically requires padding inputs to identical lengths or stalling the system to wait to construct a larger batch.
+
+Recent advancements in large language model (LLM) inference and serving have focused on fine-grained scheduling and memory-efficiency optimizations. For instance, Orca proposes _iteration-level scheduling_ (also known as continuous batching), which makes distinct scheduling decisions at each forward pass of the model. This allows requests to join and leave the batch as needed, eliminating the need to pad requests and thus improving overall throughput. In addition to Orca, continuous batching has been implemented in NVIDIA TRT-LLM, HuggingFace TGI, and vLLM.
+
+In current systems, there are two primary approaches to implement continuous batching. In TGI and vLLM, the generation phase is preempted to perform prompt processing (called infill in TGI) before continuing with generation. In Orca, these phases are not distinguished; instead, Orca will add a prompt into the running batch so long as the total number of sequences doesn't reach a fixed bound. Both of these approaches to varying degrees need to stall generation to process long prompts (see [Section 3B](#splitfuse)).
+
+To address these shortcomings, we propose a novel prompt and generation composition strategy, Dynamic SplitFuse.
+
+## 3. Dynamic SplitFuse: A Novel Prompt and Generation Composition Strategy
+
+DeepSpeed-FastGen is built to leverage continuous batching and non-contiguous KV caches to enable increased occupancy and higher responsivity for serving LLMs in the data center, similar to existing frameworks such as TRT-LLM, TGI, and vLLM. In order to achieve a new level of performance, DeepSpeed-FastGen introduces SplitFuse which leverages dynamic prompt and generation decomposition and unification to further improve continuous batching and system throughput.
+
+### A. Three Performance Insights
+Before describing Dynamic SplitFuse, we answer three key performance questions that together motivate its design.
+
+*__1. What factors impact the forward pass of a single LLM?__* In order to effectively schedule, it is necessary to understand the relevant independent variables that the scheduling loop should control. We observe below that the composition of sequences in a forward pass (the batch size in sequences) has a negligible impact on performance compared to the raw number of tokens in the forward pass. This means an effective scheduler can be built around a single signal: the number of tokens in the forward pass.
+
+
+
+
+
+*__2. How does a model's throughput respond to changing the number of tokens in the forward pass?__* An LLM has two key operating regions with a relatively steep transition. With a small number of tokens, the GPU bottleneck is reading the model from memory and so throughput scales with the number of tokens, whereas with many tokens the model is compute bound and sees near-constant throughput. The model should run highly efficiently if all forward passes are in the throughput-saturating region.
+
+
+
+
+
+*__3. How should a pool of tokens be scheduled across multiple forward passes?__* We observe above that for well-aligned inputs the token-throughput curve is concave, which means its second derivative is less than or equal to 0. As an example, let $f(x)$ be the model's throughput as a concave function of the number of tokens in the forward pass. For a concave function $f(x)$, the following holds:
+
+ $$0 \geq \lim_{h \to 0} \frac{f(x + h) - 2f(x) + f(x - h)}{h^2}$$
+
+ $$0 \geq f(x + h) - 2f(x) + f(x - h)$$
+
+ $$2f(x) \geq f(x + h) + f(x - h)$$
+
+This states that for a given pool of `2x` tokens to process, the schedule that maximizes throughput is the one that splits them evenly between two forward passes. More generally, in a system that must consume and process P tokens over F forward passes, the ideal partitioning scheme divides them equally.
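+For example, taking $x = 512$ and $h = 256$ gives $2f(512) \geq f(768) + f(256)$: splitting a pool of 1024 tokens into two passes of 512 tokens each achieves at least the combined throughput of an uneven 768/256 split.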
+
+### B. Dynamic SplitFuse
+
+Dynamic SplitFuse is a novel token composition strategy for prompt processing and token generation. DeepSpeed-FastGen utilizes Dynamic SplitFuse to run at a consistent forward size by leveraging the capability to take partial tokens from prompts and compose this with generation. In particular, Dynamic SplitFuse performs two key behaviors:
+
+1. Long prompts are decomposed into much smaller chunks and scheduled across multiple forward passes (iterations) with only the final pass performing any generation.
+2. Short prompts will be composed to exactly fill a target token budget. Even short prompts may be decomposed to ensure the budget is precisely met and the forward sizes are well-aligned.
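+
+To make this composition policy concrete, below is a minimal, hypothetical sketch of a single Dynamic SplitFuse-style scheduling step: it first gives every in-flight sequence its one generation token and then fills the remaining fixed token budget with (possibly partial) prompt chunks. The function and the `remaining_tokens` attribute are illustrative assumptions, not the DeepSpeed-FastGen scheduler.
+
+```python
+# Hypothetical sketch of one Dynamic SplitFuse scheduling step (illustrative only,
+# not the DeepSpeed-FastGen implementation).
+def compose_forward_batch(generation_seqs, pending_prompts, token_budget=2048):
+    """Return (sequence, n_tokens) pairs whose token counts sum to <= token_budget."""
+    batch, budget = [], token_budget
+
+    # 1) Every in-flight sequence contributes exactly one generation token.
+    for seq in generation_seqs:
+        if budget == 0:
+            break
+        batch.append((seq, 1))
+        budget -= 1
+
+    # 2) Fill the rest of the budget with prompt chunks: long prompts are split
+    #    across multiple forward passes, short prompts are fused together.
+    for prompt in pending_prompts:
+        if budget == 0:
+            break
+        chunk = min(prompt.remaining_tokens, budget)  # `remaining_tokens` is assumed
+        batch.append((prompt, chunk))
+        budget -= chunk
+
+    return batch
+```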
+
+Together, these two techniques provide concrete benefits on all user metrics:
+
+1. *__Better Responsiveness__:* Since long prompts no longer require extremely long forward passes to process, the model will provide lower client latency. More forward passes are performed within the same window of time.
+2. *__Higher Efficiency:__* Fusion of short prompts to larger token budgets enables the model to consistently operate in the high throughput regime.
+3. *__Lower variance and better consistency:__* Since forward passes are of consistent size and forward pass size is the primary determinant of performance, the latency of each forward pass is much more consistent than in competing systems, as is the perceived generation frequency. There is no pre-emption or long-running prompt processing to increase latency as in prior work.
+
+Consequently, DeepSpeed-FastGen will consume tokens from incoming prompts at a rate that permits fast ongoing generation while adding tokens to the system that increase system utilization, providing lower latency and higher throughput streaming generation to all clients as compared to other state-of-the-art serving systems.
+
+
+
+
+
+ *Figure 1: Illustration of continuous batching strategies. Each block shows the execution of a forward pass. An arrow indicates that the forward pass has sequences with one or more tokens generated. vLLM performs either token generations or prompt processing in a forward pass; token generation preempts prompt processing. Orca runs prompts at their complete length alongside generation. Dynamic SplitFuse performs dynamic composition of fixed-sized batches composed of both generation and prompt tokens.*
+
+
+
+## 4. Performance Evaluation
+
+DeepSpeed-FastGen provides state-of-the-art LLM serving performance leveraging its blocked KV cache and Dynamic SplitFuse continuous batching. We evaluate DeepSpeed-FastGen against vLLM on a range of models and hardware configurations following the benchmarking methodology discussed below.
+
+### A. Benchmarking Methodology
+
+We use two primary quantitative schemes for measuring performance.
+
+**Throughput-Latency Curves:** Two key metrics for production readiness are throughput (measured in requests per second) and latency (the responsiveness of each request). To measure this, we instantiate multiple clients (ranging from 1 to 32) concurrently and send requests (512 in total) to the server. The resulting latency of each request is measured at the endpoint and throughput is measured by the end-to-end time to complete the experiment.
+
+**Effective Throughput:** Interactive applications, such as chat applications, can have more stringent and complex requirements than can be captured by top-level metrics like end-to-end latency. In particular, we focus on the increasingly popular chat user scenario:
+
+ 1. A user initiates a task by sending a prompt.
+ 2. The system processes the prompt and returns the first token.
+ 3. Subsequent tokens are streamed to the user as they are produced.
+
+At each point in this process there is an opportunity for a system to provide an adverse user experience; for example, if the first token arrives too slowly or the generation appears to stop for some time. We propose an SLA framework that considers both of these dimensions.
+
+As the lengths of prompts and generated texts vary significantly, affecting computational costs, it is impractical to set rigid SLA values for throughput and latency. Therefore, we define the SLA for prompt latency as |tokens in prompt| / 512 seconds (= 512 tokens/s). Additionally, considering human reading speed, we set the SLA for generation latency, measured as an Exponential Moving Average (EMA) over token latencies, to 2, 4, or 6 tokens/sec. Requests that adhere to these SLAs are deemed successful, and the throughput of these successful requests is referred to as **effective throughput**.
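+
+As a concrete illustration of this definition, the sketch below classifies each request against the two SLAs and reports effective throughput. It is illustrative only (the field names are hypothetical) and is not the benchmark code used to produce the figures in this blog.
+
+```python
+# Illustrative effective-throughput computation; field names are hypothetical.
+def effective_throughput(requests, total_time_s, ema_sla_tokens_per_s=4.0):
+    """requests: iterable of dicts with 'prompt_tokens', 'first_token_latency_s',
+    and 'ema_token_latency_s' (EMA of per-token generation latency in seconds)."""
+    ok = 0
+    for r in requests:
+        prompt_sla_s = r["prompt_tokens"] / 512.0   # prompt SLA: 512 tokens/s
+        gen_sla_s = 1.0 / ema_sla_tokens_per_s      # e.g. 4 tokens/s -> 0.25 s/token
+        if r["first_token_latency_s"] <= prompt_sla_s and r["ema_token_latency_s"] <= gen_sla_s:
+            ok += 1
+    return ok / total_time_s                        # successful requests per second
+```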
+
+We evaluate vLLM and DeepSpeed-FastGen on Llama-2 7B, Llama-2 13B, and Llama-2 70B using NVIDIA A100, H100, and A6000 GPUs.
+
+### B. Throughput-Latency Analysis
+
+In this experiment, DeepSpeed-FastGen outperforms vLLM in both throughput and latency, providing either greater throughput at equivalent latency or lower latency at the same throughput. On Llama-2 70B with 4 A100x80GB, DeepSpeed-FastGen demonstrates up to 2x higher throughput (1.36 rps vs. 0.67 rps) at identical latency (9 seconds) or up to 50% latency reduction (7 seconds vs. 14 seconds) while achieving the same throughput (1.2 rps), as shown in Figure 2. These trends hold when evaluating Llama-2 13B as shown in Figure 3.
+
+
+
+
+ *Figure 2: Throughput and latency of text generation using Llama 2 70B (Tensor parallelism across 4 A100-80GB GPUs). A normal distribution was applied to prompt and generation lengths with averages of 1200/2600 and 128/60, respectively, and a 30% variance*
+
+
+
+
+
+ *Figure 3: Throughput and latency of text generation using Llama 2 13B (A100-80GB GPU, no tensor parallelism). A normal distribution was applied to prompt and generation lengths with averages of 1200/2600 and 60/128, respectively, and a 30% variance*
+
+
+### C. Effective Throughput Analysis
+
+Under the effective throughput analysis that considers both first token latency and the rate at which generation occurs, DeepSpeed-FastGen provides up to 2.3x higher throughput than vLLM. Figure 4 presents a comparative analysis of the effective throughputs of DeepSpeed-FastGen and vLLM. Each plotted point denotes the effective throughput derived from a specific number of clients. As we scaled the number of clients, we initially observed an increase in effective throughput. However, the latency also significantly increases as the number of clients approaches the system's capacity, causing many requests to fail in meeting the SLA. Consequently, the effective throughput will either saturate or decrease at some point. From a usability perspective, it's not particularly relevant how many clients are required to achieve the max effective throughput; the maximum point of the line is the optimal serving point.
+
+
+
+
+ *Figure 4: Effective throughput of DeepSpeed-FastGen and vLLM (Llama 2 70B/A100-80GB using tensor parallelism across 4 A100-80GB GPUs. A normal distribution was applied to prompt and generation lengths with averages of 2600 and 60, respectively, and a 30% variance)*
+
+
+When vLLM preempts the ongoing generation of previous requests, the generation latency experiences a notable increase. This leads to vLLM's effective throughput appearing lower than its directly measured throughput. At vLLM's peak, the effective throughput was 0.63 queries/sec and around 28% of requests failed to meet the 4 tokens/s SLA. At the same SLA, DeepSpeed-FastGen achieved 1.42 queries/sec (less than 1% of requests failed to meet the SLA), which is 2.3x higher than vLLM.
+
+### D. Token Level Timing Analysis
+
+Figure 5 displays the P50, P90, and P95 latencies of the generation processes. Both vLLM and DeepSpeed-FastGen exhibit similar P50 latencies, but vLLM demonstrates significantly higher latencies for P90 and P95.
+For P95 latency, DeepSpeed-FastGen achieves a 3.7x reduction.
+
+This discrepancy is due to a noticeable spike in vLLM's generation latency when it preempts the ongoing generation to process new prompts.
+In contrast, DeepSpeed-FastGen typically processes the prompt and generation for previous requests concurrently, leading to much more consistent generation latency.
+
+
+
+
+
+ *Figure 5: Per-Token generation Latency of Llama 2 70B/A100-80GB using tensor parallelism across 4 A100-80GB GPUs, 16 clients. A normal distribution was applied to prompt and generation lengths with averages of 2600 and 128, respectively, and a 30% variance.*
+
+
+
+### E. Scalability using Load Balancing
+
+DeepSpeed-FastGen offers replica-level load balancing that evenly distributes requests across multiple servers, allowing you to effortlessly scale up your application.
+
+Figure 6 illustrates the scalability of DeepSpeed-FastGen when employing the load balancer and up to 16 replicas. Note that we utilized 4 A100 GPUs per replica to run the Llama 2 70B model. In total, we employed 8 nodes to run the 16 replicas. The results demonstrate nearly perfect scalability with DeepSpeed-FastGen.
+Given that the throughput of a single replica is 1.46 queries/sec, the throughput with 16 replicas reaches 23.7 queries/sec, marking a linear 16x increase compared to a single replica.
+
+
+
+
+ *Figure 6: Scalability using the load balancing feature. A normal distribution was applied to prompt and generation lengths with averages of 2600 and 60, respectively, and a 30% variance*
+
+
+### F. Other Hardware Platforms
+
+In addition to the in-depth analysis on A100, we provide additional benchmarking results for H100 and A6000. The performance trends observed on A100 also hold on both H100 and A6000.
+
+
+
+
+ *Figure 7: Throughput-latency curve and effective throughput of Llama 2 70b using 8 H100 GPUs. A normal distribution was applied to prompt and generation lengths with averages of 2600 and 60, respectively, and a 30% variance*
+
+
+
+
+
+ *Figure 8: Throughput-latency curve and effective throughput of Llama 2 7b using A6000. A normal distribution was applied to prompt and generation lengths with averages of 2600 and 60, respectively, and a 30% variance*
+
+
+## 5. DeepSpeed-FastGen: Implementation and Usage
+
+DeepSpeed-FastGen is the synergistic composition of [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII) and [DeepSpeed-Inference](https://github.com/microsoft/DeepSpeed) as illustrated in the figure below. Together, both of these software packages provide various components of the system including the frontend APIs, the host and device infrastructure to schedule batches using Dynamic SplitFuse, optimized kernel implementations, and the tools to construct new model implementations.
+
+
+
+
+
+
+
+The fastest way to get started with our alpha release of DeepSpeed-FastGen is: `pip install deepspeed-mii`.
+
+Please follow our [Getting Started](https://github.com/microsoft/deepspeed-mii#getting-started-with-mii) guide for more details. For usage and reporting issues, please use the [DeepSpeed-MII Github repository](https://github.com/microsoft/DeepSpeed-MII).
+
+### A. Supported Models
+
+We currently support the following model architectures in this alpha release of DeepSpeed-FastGen:
+
+* [LLaMA](https://huggingface.co/models?other=llama) and [LLaMA-2](https://huggingface.co/models?other=llama-2)
+* [Mistral](https://huggingface.co/models?other=mistral)
+* [OPT](https://huggingface.co/models?other=opt)
+* [Falcon](https://huggingface.co/models?other=falcon)
+* [Mixtral](https://huggingface.co/models?other=mixtral)
+* [Phi-2](https://huggingface.co/models?other=phi-msft)
+* [Phi-3](https://huggingface.co/models?other=phi3)
+* [Qwen](https://huggingface.co/models?other=qwen)
+* [Qwen2](https://huggingface.co/models?other=qwen2)
+* [Qwen2-MoE](https://huggingface.co/models?other=qwen2_moe)
+
+All current models leverage [HuggingFace](https://github.com/huggingface) APIs in our backend to provide both the model weights and the model's corresponding tokenizer.
+
+We plan to add additional models in the coming weeks and months after the initial release. If there are specific model architectures you would like supported, please [file an issue](https://github.com/microsoft/DeepSpeed-MII/issues) and let us know.
+
+### B. Deployment options
+All of the examples below are runnable in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/inference/mii). Once installed, you have two options for deployment: an interactive non-persistent pipeline or a persistent serving deployment:
+
+#### Non-persistent pipeline
+
+The non-persistent pipeline deployment is a great and fast way to get started and can be done with only a few lines of code. Non-persistent deployments live only for the duration of the Python script you are running and are useful for temporary interactive sessions.
+
+```python
+from mii import pipeline
+pipe = pipeline("mistralai/Mistral-7B-v0.1")
+output = pipe(["Hello, my name is", "DeepSpeed is"], max_new_tokens=128)
+print(output)
+```
+
+#### Persistent deployment
+
+A persistent deployment is ideal for use with long-running and production applications. The persistent deployment uses a lightweight gRPC server that can be created using the following 2 lines:
+
+
+```python
+import mii
+mii.serve("mistralai/Mistral-7B-v0.1")
+```
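+
+If you have multiple GPUs available, the same call can also launch a sharded and replicated deployment. The option names below (`tensor_parallel`, `replica_num`) are taken from the DeepSpeed-MII deployment configuration as we understand it; please check the MII documentation for the authoritative list.
+
+```python
+import mii
+# Shard the model across 2 GPUs per replica and run 2 replicas behind the
+# built-in load balancer (option names per DeepSpeed-MII; verify against its docs).
+mii.serve("mistralai/Mistral-7B-v0.1", tensor_parallel=2, replica_num=2)
+```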
+
+The above server can be queried by multiple clients at once thanks to the built-in load balancer from DeepSpeed-MII. Creating a client also just takes 2 lines of code:
+
+```python
+client = mii.client("mistralai/Mistral-7B-v0.1")
+output = client.generate("Deepspeed is", max_new_tokens=128)
+print(output)
+```
+
+A persistent deployment can be terminated when it is no longer needed:
+
+```python
+client.terminate_server()
+```
+
+### C. Advanced Installation Information
+
+For ease of use and a significant reduction in lengthy compile times that many projects require in this space, we distribute a pre-compiled Python wheel covering the majority of our custom kernels through a new library called [DeepSpeed-Kernels](https://github.com/microsoft/DeepSpeed-Kernels). We have found this library to be very portable across environments with NVIDIA GPUs with compute capabilities 8.0+ (Ampere+), CUDA 11.6+, and Ubuntu 20+. In most cases, you shouldn't even need to know this library exists as it is a dependency of DeepSpeed-MII and will be installed with it. However, if for whatever reason you need to compile our kernels manually please see our [advanced installation docs](https://github.com/microsoft/DeepSpeed-Kernels#source).
+
+
+## 6. Try Out DeepSpeed-FastGen
+We are very excited to share this DeepSpeed-FastGen alpha release.
+
+* To get started, please visit our GitHub page for DeepSpeed-MII: [GitHub Landing Page](https://github.com/microsoft/DeepSpeed-MII)
+
+DeepSpeed-FastGen is part of the bigger DeepSpeed ecosystem comprising a multitude of Deep Learning systems and modeling technologies. To learn more,
+
+* Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation.
+* You can also follow us on our [English Twitter](https://twitter.com/MSFTDeepSpeed), [Japanese Twitter](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed.
+
+DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page. Please see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, and companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please email deepspeed-info@microsoft.com directly.
+
+The following items are on our roadmap and we plan to engage with our community on these through our GitHub issues and PRs:
+
+- Performance improvements
+- Broader model support
+- New hardware backends through collaboration with partners
+- Release performance benchmarks (used to generate plots in this blog)
+
+**"Star" our [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) and [DeepSpeedMII GitHub](https://github.com/microsoft/DeepSpeed-MII/) repositories if you like our work!**
+
+## 7. Acknowledgements
+
+We would like to thank various open-source community projects including HuggingFace, vLLM, and HuggingFace TGI. We have leveraged HF APIs to support models and tokenizers in our alpha release and will continue to add more models. We especially acknowledge and thank the developers of [Flash Attention](https://github.com/Dao-AILab/flash-attention) for their great work. We have extensively leveraged FlashAttention kernels in our system with modifications that have been acknowledged in our code repositories at appropriate file headers. Finally, we want to thank the developers of [FasterTransformer](https://github.com/NVIDIA/FasterTransformer) kernels that we have used in our MoE kernels (released as part of DeepSpeed-Kernels repository).
diff --git a/deepspeed/blogs/deepspeed-gds/README.md b/deepspeed/blogs/deepspeed-gds/README.md
new file mode 100644
index 000000000000..34416c07ea4d
--- /dev/null
+++ b/deepspeed/blogs/deepspeed-gds/README.md
@@ -0,0 +1,88 @@
+
+
+# DeepNVMe: Improving DL Applications through I/O Optimizations
+
+
+
+# Introduction
+
+Deep Learning (DL) continues to drive unprecedented advancements across important
+Artificial Intelligence domains including language, speech, video, and multimodal applications.
+A key factor behind these advancements is dramatic scalability on multiple dimensions including model size,
+sequence length, and hardware parallelism. From a system perspective, DL scalability puts significant
+pressure on essential subsystems including computation, memory, communication, and storage. However,
+existing DL optimization efforts have mostly neglected the storage subsystem, making I/O operations such
+as data loading, model checkpointing, and offloading the main bottlenecks of large-scale DL. To address
+this problem, DeepSpeed has created a suite of I/O optimizations collectively called DeepNVMe.
+
+DeepNVMe improves the performance and efficiency of I/O-bound DL applications by accelerating I/O operations
+and reducing hardware requirements. It achieves this by leveraging storage innovations such as Non-Volatile
+Memory Express (NVMe) Solid State Drives (SSDs) and NVIDIA Magnum IO™ GPUDirect® Storage (GDS). In this
+blog we show the benefits of DeepNVMe using microbenchmarks and an inference application. In experiments
+conducted on an Azure NC96ads\_A100\_v4 VM, we observed that DeepNVMe saturates available NVMe bandwidth for
+data transfers with GPU or CPU memory, achieving up to 10 GB/sec reads and 5 GB/sec writes.
+
+# Background
+High-performance access to persistent storage is a common challenge in many computing domains, including DL. Thus, a significant number of hardware and software solutions have been proposed. DeepNVMe builds on three such solutions: (1) NVMe SSDs, (2) NVIDIA GDS, and (3) Linux Asynchronous I/O (libaio). We will briefly describe each of these technologies.
+
+NVMe SSDs are Flash-based storage devices that are replacing much slower hard disk drives (HDD) as primary persistent storage in modern servers. For example, an Azure NC96ads\_A100\_v4 VM is equipped with four NVMe SSDs which are individually capable of 3.25 GB/sec reads and can be combined in a RAID-0 configuration for a theoretical aggregate read bandwidth of 13 GB/sec. NVIDIA GDS enables direct transfers between NVMe and GPU memory thus avoiding the inefficiencies of the traditional approach of using intermediate CPU memory (bounce buffer). NVIDIA GDS is generally available in CUDA versions 11.4 and above. Finally, libaio is an asynchronous I/O stack introduced in Linux to better extract raw performance of fast storage devices like NVMe SSDs compared to the traditional I/O stack.
+
+# DeepNVMe: an Optimization Module for Deep Learning I/O
+
+DeepNVMe is a Python module that we developed with two key design principles. First, it leverages the above discussed storage technologies to implement powerful optimizations such as non-blocking I/O operations, bulk submission of I/O operations, parallelization of an individual I/O operation, and a lightweight runtime. Second, it exposes these I/O optimizations through a simple POSIX-like interface to foster easy integration into DL applications while avoiding the complexities of the underlying technologies.
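+
+As an illustration of this POSIX-like flavor, the sketch below reads a file into a pinned CPU tensor through DeepSpeed's asynchronous I/O handle. The builder and method names follow DeepSpeed's async_io op as described in its tutorials, but treat them as assumptions and consult the DeepNVMe documentation for the exact interface.
+
+```python
+# Rough sketch of DeepNVMe-style asynchronous reads; names per DeepSpeed's async_io
+# op builder (verify against the current DeepNVMe documentation).
+import torch
+from deepspeed.ops.op_builder import AsyncIOBuilder
+
+handle = AsyncIOBuilder().load().aio_handle()            # default block size, queue depth, etc.
+buffer = torch.empty(1024 * 1024, dtype=torch.uint8).pin_memory()
+
+handle.async_pread(buffer, "/local_nvme/file_1MB.bin")   # non-blocking bulk read submission
+# ... overlap other work here ...
+handle.wait()                                            # block until the read completes
+```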
+
+# Evaluation
+
+Our experiments are conducted on an Azure NC96ads\_A100\_v4 VM with setup details summarized in Table 1. For multi-device experiments, the SSDs are combined in a RAID-0 configuration.
+
+
+
+
+Table 1: Experimental setup details
+
+
+## Microbenchmark Performance
+
+We used three benchmarking tools for our evaluations. The first is fio, the popular I/O benchmarking tool written in C. The second is gdsio from NVIDIA for benchmarking GDS performance. The third is ds\_io, a Python tool that we created for easy integration with DeepNVMe and to be more representative of DL applications which are commonly Python-based.
+
+## High-Performance I/O with CPU Buffers via NVMe Scaling
+
+Our first set of microbenchmark evaluations used fio and ds\_io to measure the performance of transferring 1GB of data between NVMe and CPU memory. We configure fio to use the libaio backend for these experiments. The results are summarized in Figure 1, from which we make two observations. First, DeepNVMe demonstrates high performance as it roughly matches fio, despite being more representative of DL applications. Second, DeepNVMe scales I/O performance almost linearly with available NVMe bandwidth, achieving rates of 10 GB/sec reads and 5 GB/sec writes.
+
+
+
+
+Figure 1: Using DeepNVMe to scale data transfers between NVMe and CPU buffer
+
+
+## High-Performance I/O with GPU Buffers via NVMe Scaling
+
+Our second set of microbenchmark evaluations used gdsio and ds\_io to measure the performance of 1GB data transfer between NVMe and GPU memory. For this experiment, we configure ds\_io to use both the traditional bounce buffer approach and the more efficient GDS approach. The results are summarized in Figure 2, from which we make three observations. First, we see that GDS improves performance in DeepNVMe compared to the traditional bounce buffer approach, with up to 37% speedup. Second, DeepNVMe demonstrates high performance by matching (and sometimes surpassing) gdsio despite being more representative of DL applications. Third, we see that DeepNVMe, with and without GDS, scales I/O performance with available NVMe bandwidth. With GDS, DeepNVMe achieves a maximum of 9.6GB/sec reads and 5GB/sec writes, and without GDS achieves 7GB/sec reads and 4GB/sec writes.
+
+
+
+
+Figure 2: Using DeepNVMe to scale data transfers between NVMe and GPU memory
+
+
+## ZeRO-Inference: Generative AI Performance
+
+ZeRO-Inference is an AI democratization technology that reduces the hardware cost of inferencing massive models by using DeepNVMe to offload model weights to CPU or NVMe memory. ZeRO-Inference is well suited for throughput-oriented applications, such as offline inferencing, and for scenarios with a limited hardware budget. We use a token generation workload to evaluate DeepNVMe performance for NVMe offloading.
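+
+For reference, NVMe offloading in ZeRO-Inference is driven by the DeepSpeed configuration. The sketch below shows only the relevant sections; the path and tuning values are placeholders, so please consult the ZeRO-Inference tutorial for a complete, validated config.
+
+```python
+# Minimal sketch of a DeepSpeed config enabling NVMe parameter offload for
+# ZeRO-Inference; values are placeholders, not the settings used in this blog.
+ds_config = {
+    "zero_optimization": {
+        "stage": 3,
+        "offload_param": {
+            "device": "nvme",
+            "nvme_path": "/local_nvme",   # e.g. a RAID-0 mount of the NVMe SSDs
+            "pin_memory": True,
+        },
+    },
+    "aio": {                              # asynchronous I/O tuning knobs
+        "block_size": 1048576,
+        "queue_depth": 8,
+        "single_submit": False,
+        "overlap_events": True,
+    },
+}
+```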
+
+## High-Performance Offloading via NVMe Scaling
+
+We measure the generation throughput of inferencing a LLAMA3-70B model on a single NVIDIA A100-80GB with a prompt length of 512, generation length of 32, and batch size of 96. We scale the number of NVMe SSDs from 1 to 4 and present the results for ZeRO-Inference with and without GDS in Figure 3. We make two observations from these results. First, GDS consistently provides better performance compared to the bounce buffer approach, achieving 10-18% faster token generation. Second, DeepNVMe, with and without GDS, scales generation performance with available NVMe bandwidth. With four NVMe SSDs, DeepNVMe achieves generation throughput rates of 7 tokens per second with GDS and 6 tokens per second without GDS. Our profiling results suggest that DeepNVMe will continue to scale with more NVMe bandwidth, making it an economical option for boosting generative application performance.
+
+
+
+
+Figure 3: Using DeepNVMe to scale LLAMA3-70B token generation performance with NVMe offloading.
+
+
+# Summary
+
+In this blog post, we introduced DeepNVMe, an I/O optimization technology created to tackle the emergence of I/O operations as key bottlenecks of Deep Learning scalability. DeepNVMe enables fast and efficient data transfers between persistent storage and DL application memory through optimizations built on popular storage technologies such as NVMe SSDs and NVIDIA GDS. We showed the benefits of using DeepNVMe for LLAMA3-70B token generation on a single A100-80GB GPU with NVMe offloading, where it achieves up to 7 tokens per second of generation throughput on an Azure NC96ads\_A100\_v4 VM. DeepNVMe will be open-sourced and generally available in DeepSpeed versions >= [0.15.0](https://github.com/microsoft/DeepSpeed/releases/tag/v0.15.0). In future blogs, we will report DeepNVMe improvements for other I/O-bound DL applications such as model checkpointing and data loading.
+
+
+# Acknowledgements
+This work is the result of a deep collaboration between Microsoft and NVIDIA. The contributors include Joe Mayer, Martin Cai, and Olatunji Ruwase from Microsoft; Kiran Modukuri, Vahid Noormofidi, Sourab Gupta, and Sandeep Joshi from NVIDIA.
diff --git a/deepspeed/blogs/deepspeed-gds/media/figure1.png b/deepspeed/blogs/deepspeed-gds/media/figure1.png
new file mode 100644
index 000000000000..08db7d2f8afa
Binary files /dev/null and b/deepspeed/blogs/deepspeed-gds/media/figure1.png differ
diff --git a/deepspeed/blogs/deepspeed-gds/media/figure2.png b/deepspeed/blogs/deepspeed-gds/media/figure2.png
new file mode 100644
index 000000000000..35be5d4c4015
Binary files /dev/null and b/deepspeed/blogs/deepspeed-gds/media/figure2.png differ
diff --git a/deepspeed/blogs/deepspeed-gds/media/figure3.png b/deepspeed/blogs/deepspeed-gds/media/figure3.png
new file mode 100644
index 000000000000..7175236f886b
Binary files /dev/null and b/deepspeed/blogs/deepspeed-gds/media/figure3.png differ
diff --git a/deepspeed/blogs/deepspeed-gds/media/table1.png b/deepspeed/blogs/deepspeed-gds/media/table1.png
new file mode 100644
index 000000000000..bba571369932
Binary files /dev/null and b/deepspeed/blogs/deepspeed-gds/media/table1.png differ
diff --git a/deepspeed/csrc/fp_quantizer/fp_quantize.cpp b/deepspeed/csrc/fp_quantizer/fp_quantize.cpp
new file mode 100644
index 000000000000..1a887b50e1a3
--- /dev/null
+++ b/deepspeed/csrc/fp_quantizer/fp_quantize.cpp
@@ -0,0 +1,124 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+#include "fp_quantize.h"
+
+#include <c10/cuda/CUDAStream.h>
+#include <torch/extension.h>
+#include <vector>
+
+#define DISPATCH_QUANTIZE(T_TYPE, C_TYPE, mantisa, exponent) \
+ if (val.options().dtype() == torch::T_TYPE) { \
+ launch_quantization((C_TYPE*)val.data_ptr(), \
+ (uint8_t*)out.data_ptr(), \
+ num_groups, \
+ group_size, \
+ at::cuda::getCurrentCUDAStream(), \
+ q_range, \
+ q_bits, \
+ q_mantisa_bits, \
+ stochastic_rounding); \
+ }
+
+at::Tensor quantize(torch::Tensor& out,
+ torch::Tensor& val,
+ int group_size,
+ int stochastic_rounding,
+ int q_bits,
+ int q_mantisa_bits)
+{
+ int total_elems = at::numel(val);
+ float q_range = q_bits == 8 ? (q_mantisa_bits == 3 ? 480.0 : 114688.0) : // fp8 ranges
+ (q_bits == 12 ? 510.0 : // fp12 range
+ (q_bits == 6 ? 28.0 : // fp6 range
+ 6.0)); // fp4 range (using power 2); TODO (Reza): add the power-4
+ // in case accuracy is not matching!
+ int num_groups = total_elems / group_size;
+
+ DISPATCH_QUANTIZE(kHalf, __half, 23, 8);
+#ifdef BF16_AVAILABLE
+ DISPATCH_QUANTIZE(kBFloat16, __nv_bfloat16, 23, 8);
+#endif
+
+ return out;
+}
+
+#define DISPATCH_DEQUANTIZE(T_TYPE, C_TYPE, mantisa) \
+ if (val.options().dtype() == torch::T_TYPE) { \
+ launch_dequantization((uint8_t*)val_q.data_ptr(), \
+ (C_TYPE*)val.data_ptr(), \
+ num_groups, \
+ group_size, \
+ q_mantisa_bits, \
+ q_exponent_bits, \
+ at::cuda::getCurrentCUDAStream()); \
+ return; \
+ }
+
+void dequantize(torch::Tensor& val,
+ torch::Tensor& val_q,
+ int group_size,
+ int q_mantisa_bits,
+ int q_exponent_bits)
+{
+ int total_elems = at::numel(val);
+
+ int num_groups = total_elems / group_size;
+
+ DISPATCH_DEQUANTIZE(kHalf, __half, 10);
+#ifdef BF16_AVAILABLE
+ DISPATCH_DEQUANTIZE(kBFloat16, __nv_bfloat16, 7);
+#endif
+}
+
+#define DISPATCH_DEQUANTIZE_INDEX(T_TYPE, C_TYPE, mantisa) \
+ if (val.options().dtype() == torch::T_TYPE) { \
+ launch_selective_dequantization((uint8_t*)val_q.data_ptr(), \
+ (C_TYPE*)val.data_ptr(), \
+ (int32_t*)indexes.data_ptr(), \
+ num_groups, \
+ group_size, \
+ num_indexes, \
+ q_mantisa_bits, \
+ q_exponent_bits, \
+ at::cuda::getCurrentCUDAStream()); \
+ return; \
+ }
+void selective_dequantize(torch::Tensor& val,
+ torch::Tensor& val_q,
+ torch::Tensor& indexes,
+ int group_size,
+ int q_mantisa_bits,
+ int q_exponent_bits)
+{
+ int total_elems = at::numel(val);
+ int num_indexes = indexes.size(0);
+ int num_groups = total_elems / group_size;
+
+ DISPATCH_DEQUANTIZE_INDEX(kHalf, __half, 10);
+#ifdef BF16_AVAILABLE
+ DISPATCH_DEQUANTIZE_INDEX(kBFloat16, __nv_bfloat16, 7);
+#endif
+}
+
+at::Tensor get_scales(torch::Tensor& out, int num_groups)
+{
+ auto options = at::TensorOptions()
+ .dtype(torch::kFloat)
+ .layout(at::kStrided)
+ .device(at::kCUDA)
+ .requires_grad(false);
+ auto scales =
+ torch::from_blob(out.data_ptr(), {num_groups, 1}, {out.stride(0) / 4, 1}, options);
+ return scales;
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+ m.def("quantize", &quantize, "quantize function");
+ m.def("dequantize", &dequantize, "dequantize function");
+ m.def("get_scales", &get_scales, "get scales function");
+ m.def("selective_dequantize", &selective_dequantize, "selective dequantize function");
+}
diff --git a/deepspeed/deepspeed/inference/v2/model_implementations/qwen_v2_moe/__init__.py b/deepspeed/deepspeed/inference/v2/model_implementations/qwen_v2_moe/__init__.py
new file mode 100644
index 000000000000..23e06a770023
--- /dev/null
+++ b/deepspeed/deepspeed/inference/v2/model_implementations/qwen_v2_moe/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from .policy import Qwen2MoePolicy
diff --git a/deepspeed/deepspeed/inference/v2/model_implementations/qwen_v2_moe/container.py b/deepspeed/deepspeed/inference/v2/model_implementations/qwen_v2_moe/container.py
new file mode 100644
index 000000000000..b4621257ff82
--- /dev/null
+++ b/deepspeed/deepspeed/inference/v2/model_implementations/qwen_v2_moe/container.py
@@ -0,0 +1,103 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# Create a container object to save model-specific tensors using the policy file above.
+
+from ..common_parameters import *
+from ..layer_container_base import LayerContainer
+'''
+ # HF Qwen1.5-MoE-A2.7B model looks like this:
+
+Qwen2MoeForCausalLM(
+ (model): Qwen2MoeModel(
+ (embed_tokens): Embedding(151936, 2048)
+ (layers): ModuleList(
+ (0-23): 24 x Qwen2MoeDecoderLayer(
+ (self_attn): Qwen2MoeSdpaAttention(
+ (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
+ (k_proj): Linear(in_features=2048, out_features=2048, bias=True)
+ (v_proj): Linear(in_features=2048, out_features=2048, bias=True)
+ (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
+ (rotary_emb): Qwen2MoeRotaryEmbedding()
+ )
+ (mlp): Qwen2MoeSparseMoeBlock(
+ (gate): Linear(in_features=2048, out_features=60, bias=False)
+ (experts): ModuleList(
+ (0-59): 60 x Qwen2MoeMLP(
+ (gate_proj): Linear(in_features=2048, out_features=1408, bias=False)
+ (up_proj): Linear(in_features=2048, out_features=1408, bias=False)
+ (down_proj): Linear(in_features=1408, out_features=2048, bias=False)
+ (act_fn): SiLU()
+ )
+ )
+ (shared_expert): Qwen2MoeMLP(
+ (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
+ (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
+ (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
+ (act_fn): SiLU()
+ )
+ (shared_expert_gate): Linear(in_features=2048, out_features=1, bias=False)
+ )
+ (input_layernorm): Qwen2MoeRMSNorm()
+ (post_attention_layernorm): Qwen2MoeRMSNorm()
+ )
+ )
+ (norm): Qwen2MoeRMSNorm()
+ )
+ (lm_head): Linear(in_features=2048, out_features=151936, bias=False)
+)
+'''
+
+
+class Qwen2MoeTransformerContainer(LayerContainer):
+ """
+ Transformer layer container for the Qwen2Moe model.
+ """
+ qkv_w: UnfusedQKVParameter
+ qkv_b: UnfusedQKVParameter
+ attn_out_w: AttentionOutputParameter
+ moe_gate: MoEGatingWeightParameter
+ moe_mlp_1: UnfusedMoEGatedMLPParameter
+ moe_mlp_2: UnfusedMoEMLP2Parameter
+ shared_moe_mlp_1: GatedMLPParameter
+ shared_moe_mlp_2: MLP2Parameter
+ shared_moe_gate: MoEGatingWeightParameter
+ attn_norm_gamma: NormParameter
+ mlp_norm_gamma: NormParameter
+
+ PARAM_MAPPING = {
+ "self_attn.q_proj.weight": "qkv_w.q_params",
+ "self_attn.k_proj.weight": "qkv_w.k_params",
+ "self_attn.v_proj.weight": "qkv_w.v_params",
+ "self_attn.q_proj.bias": "qkv_b.q_params",
+ "self_attn.k_proj.bias": "qkv_b.k_params",
+ "self_attn.v_proj.bias": "qkv_b.v_params",
+ "self_attn.o_proj.weight": "attn_out_w.params",
+ "mlp.gate.weight": "moe_gate.params",
+ "mlp.experts.*.gate_proj.weight": "moe_mlp_1.gating_experts",
+ "mlp.experts.*.up_proj.weight": "moe_mlp_1.up_experts",
+ "mlp.experts.*.down_proj.weight": "moe_mlp_2.experts",
+ "mlp.shared_expert.gate_proj.weight": "shared_moe_mlp_1.gate_params",
+ "mlp.shared_expert.up_proj.weight": "shared_moe_mlp_1.up_params",
+ "mlp.shared_expert.down_proj.weight": "shared_moe_mlp_2.params",
+ "mlp.shared_expert_gate.weight": "shared_moe_gate.params",
+ "input_layernorm.weight": "attn_norm_gamma.params",
+ "post_attention_layernorm.weight": "mlp_norm_gamma.params",
+ }
+
+
+class Qwen2MoeNonTransformerContainer(LayerContainer):
+ """
+ Non-Transformer layer container for the Qwen2Moe model.
+ """
+ word_emb: EmbeddingParameter
+ word_unembed: UnembedParameter
+ final_norm: NormParameter
+
+ PARAM_MAPPING = {
+ "model.embed_tokens.weight": "word_emb.params",
+ "model.norm.weight": "final_norm.params",
+ "lm_head.weight": "word_unembed.params",
+ }
diff --git a/deepspeed/deepspeed/inference/v2/model_implementations/qwen_v2_moe/model.py b/deepspeed/deepspeed/inference/v2/model_implementations/qwen_v2_moe/model.py
new file mode 100644
index 000000000000..7cddbf978369
--- /dev/null
+++ b/deepspeed/deepspeed/inference/v2/model_implementations/qwen_v2_moe/model.py
@@ -0,0 +1,359 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+
+import deepspeed.comm as dist
+
+from ...allocator import empty_from
+from ...config_v2 import RaggedInferenceEngineConfig
+from ...inference_utils import ActivationType, DtypeEnum
+from ...model_implementations import *
+from ...modules.configs import *
+from ...modules.interfaces import *
+from ...modules import heuristics
+from ...ragged import RaggedBatchWrapper
+from ..inference_model_base import (
+ DSModelImplementationConfig,
+ MPType,
+)
+
+from .container import Qwen2MoeNonTransformerContainer, Qwen2MoeTransformerContainer
+
+
+class Qwen2MoeInferenceModel(DSMoETransformerModelBase):
+ """
+ Inference model implementation for Qwen2MoE models.
+ """
+
+ _non_transformer: Optional[Qwen2MoeNonTransformerContainer]
+ """
+ Embed + unembed container. Specializing the type annotation.
+ """
+
+ _transformer: Optional[Iterable[Qwen2MoeTransformerContainer]]
+ """
+ Per-layer transformer container. Specializing the type annotation.
+ """
+ """
+ Properties inherited from `DSInferenceModelBase`
+ """
+
+ @property
+ def max_sequence_length(self) -> int:
+ return self._config.max_position_embeddings
+
+ """
+ Properties inherited from `DSTransformerModelBase`
+ """
+
+ @property
+ def num_layers(self) -> int:
+ return self._config.num_hidden_layers
+
+ @property
+ def model_dim(self) -> int:
+ return self._config.hidden_size
+
+ @property
+ def vocab_size(self) -> int:
+ return self._config.vocab_size
+
+ @property
+ def head_size(self) -> int:
+ return self.model_dim // self.n_heads
+
+ @property
+ def n_heads(self) -> int:
+ return self._config.num_attention_heads
+
+ @property
+ def intermediate_dim(self) -> int:
+ return self._config.intermediate_size
+
+ @property
+ def n_heads_kv(self) -> int:
+ return self._config.num_key_value_heads
+
+ @property
+ def activation_dtype(self) -> DtypeEnum:
+ # TODO(ZonePG): bf16 inference results may be different from huggingface bf16,
+ # because in rms_norm, Qwen still use float() instead of bf16
+ # if self._config.torch_dtype == torch.float16:
+ # return DtypeEnum.fp16
+ # elif self._config.torch_dtype == torch.bfloat16:
+ # return DtypeEnum.bf16
+ # else:
+ # raise NotImplementedError("Only fp16 and bf16 are supported")
+ return DtypeEnum.fp16
+
+ @property
+ def mlp_activation_fn(self) -> ActivationType:
+ return ActivationType.SiGLU
+
+ @property
+ def norm_type(self) -> NormTypeEnum:
+ return NormTypeEnum.RMSNorm
+
+ @property
+ def positional_embedding_type(self) -> PositionalEmbeddingType:
+ return PositionalEmbeddingType.rotate_half
+
+ @property
+ def positional_embedding_config(self) -> Optional[RotateHalfConfig]:
+ return RotateHalfConfig(theta_base=self._config.rope_theta)
+
+ """
+ Inherited from `DSMoETransformerModelBase`
+ """
+
+ @property
+ def n_experts(self) -> int:
+ return self._config.num_experts
+
+ @property
+ def n_top_k(self) -> int:
+ return self._config.num_experts_per_tok
+
+ @property
+ def normalize_expert_scores(self) -> bool:
+ return self._config.norm_topk_prob
+
+ def make_moe_layer(self) -> None:
+ """
+ Instantiates the MoE layer for the model. This sets the `self.moe` attribute.
+ """
+ sharded_dim = sharded_intermediate_dim(self.intermediate_dim // self.n_top_k, self.tp_size, self.tp_rank)
+
+ moe_config = DSMoEConfig(
+ max_tokens=self._engine_config.state_manager.max_ragged_batch_size,
+ model_dim=self.model_dim,
+ intermediate_features=sharded_dim,
+ activation=self.mlp_activation_fn,
+ n_experts=self.n_experts,
+ top_k=self.n_top_k,
+ input_dtype=self.activation_dtype,
+ output_dtype=self.activation_dtype,
+ normalize_scores=self.normalize_expert_scores,
+ )
+
+ self.moe = heuristics.instantiate_moe(moe_config, self._engine_config)
+
+ ######### Shared expert MLP 1 #########
+ def make_shared_expert_mlp_1_layer(self) -> None:
+ """
+ Instantiates the linear projection layer for the first MLP of the shared expert.
+ This sets the `self.shared_expert_mlp_1` attribute.
+ """
+ shard_size = sharded_intermediate_dim(self.intermediate_dim, self.tp_size, self.tp_rank)
+
+ linear_config = DSLinearConfig(
+ max_tokens=self._engine_config.state_manager.max_ragged_batch_size,
+ in_channels=self.model_dim,
+ out_channels=shard_size,
+ activation=self.mlp_activation_fn,
+ input_dtype=self.activation_dtype,
+ output_dtype=self.activation_dtype,
+ )
+
+ self.shared_expert_mlp_1 = heuristics.instantiate_linear(linear_config, self._engine_config)
+
+ ######### Shared expert MLP 2 #########
+ def make_shared_expert_mlp_2_layer(self) -> None:
+ """
+ Instantiates the linear projection layer for the second MLP of the shared expert.
+ This sets the `self.shared_expert_mlp_2` attribute.
+ """
+ shard_size = sharded_intermediate_dim(self.intermediate_dim, self.tp_size, self.tp_rank)
+
+ linear_config = DSLinearConfig(
+ max_tokens=self._engine_config.state_manager.max_ragged_batch_size,
+ in_channels=shard_size,
+ out_channels=self.model_dim,
+ input_dtype=self.activation_dtype,
+ output_dtype=self.activation_dtype,
+ )
+
+ self.shared_expert_mlp_2 = heuristics.instantiate_linear(linear_config, self._engine_config)
+
+ ######### Shared expert gate #########
+ def make_shared_expert_gate_layer(self) -> None:
+ """
+ Instantiates the linear projection layer for the shared expert gate.
+ This sets the `self.shared_expert_gate` attribute.
+ """
+ shard_size = sharded_intermediate_dim(self.model_dim, self.tp_size, self.tp_rank)
+
+ linear_config = DSLinearConfig(
+ max_tokens=self._engine_config.state_manager.max_ragged_batch_size,
+ in_channels=shard_size,
+ out_channels=8,
+ input_dtype=self.activation_dtype,
+ output_dtype=self.activation_dtype,
+ )
+
+ self.shared_expert_gate = heuristics.instantiate_linear(linear_config, self._engine_config)
+
+ def make_norm_layer(self) -> None:
+ """
+ Instantiates the normalization layer for the model. This sets the `self.norm` attribute.
+
+ TODO(cmikeh2): In the future we'll distinguish between the different norm objects,
+ but for now we'll just use the same one for all of them.
+ """
+ norm_config = DSNormConfig(
+ max_tokens=self._engine_config.state_manager.max_ragged_batch_size,
+ type=self.norm_type,
+ channels=self.model_dim,
+ residual_dtype=self.activation_dtype,
+ input_dtype=self.activation_dtype,
+ output_dtype=self.activation_dtype,
+ eps=self._config.rms_norm_eps,
+ )
+
+ self.norm = heuristics.instantiate_pre_norm(norm_config, self._engine_config)
+
+ """
+ Model implementation
+ """
+
+ def __init__(self, config: DSModelImplementationConfig, engine_config: RaggedInferenceEngineConfig,
+ base_mp_group: MPType) -> None:
+ """
+ Base implementation for initialization. By default, this will initialize
+ the traditional components of a transformer model:
+ - Embedding
+ - QKV projection
+ - Self attention
+ - Attention output projection
+ - Feed forward network
+ - Normalization
+ - Unembedding
+
+ Arguments:
+ config (DSModelImplementationConfig): Model-specific configuration. No assumptions
+ should be made about this config that are not closely tied to the specific
+ model implementation.
+ engine_config (RaggedInferenceEngineConfig): Engine configuration.
+ base_mp_group (MPType): Base communication group for Tensor-parallel inference.
+ """
+ super().__init__(config, engine_config, base_mp_group)
+
+ self.make_norm_layer()
+ self.make_qkv_layer()
+ self.make_attn_layer()
+ self.make_attn_out_layer()
+ self.make_moe_layer()
+ self.make_shared_expert_mlp_1_layer()
+ self.make_shared_expert_mlp_2_layer()
+ self.make_shared_expert_gate_layer()
+ self.make_embedding_layer()
+ self.make_unembedding_layer()
+ self._kv_cache_config = None
+
+ """
+ Forward implementations
+ """
+
+ def _forward_embed(self, ragged_batch: RaggedBatchWrapper) -> torch.Tensor:
+ """
+ Performs the embedding lookup prior to running the transformer of the model.
+
+ Arguments:
+ ragged_batch (RaggedBatchWrapper): The batch to embed.
+
+ Returns:
+ torch.Tensor: The embedded batch.
+ """
+ embed = self.embed(ragged_batch, self._non_transformer.word_emb)
+
+ if embed.shape[-1] != self.model_dim:
+ raise ValueError(f"Embedding output shape {embed.shape} does not match model_dim {self.model_dim}")
+
+ return embed
+
+ def _forward_transformer(self, layer_idx: int, residual: torch.Tensor, hidden_states: torch.Tensor,
+ ragged_batch_info: RaggedBatchWrapper) -> Tuple[torch.Tensor, torch.Tensor]:
+ """
+ Executes one (slightly offset) layer of the transformer. This implementation does a peek-ahead
+ optimization to fuse the layer norm of the next layer into the current layer.
+
+ Arguments:
+ layer_idx (int): The index of the layer to execute.
+ residual (torch.Tensor): The residual tensor from the previous layer.
+ hidden_states (torch.Tensor): The hidden states from the previous layer. This is the
+ hidden states after pre normalization.
+ ragged_batch_info (RaggedBatchWrapper): The batch metadata.
+ """
+ # TODO(cmikeh2): Distribute ragged_batch_info to all modules
+
+ cur_params = self._transformer[layer_idx]
+ kv_cache = self.state_manager.get_cache(layer_idx)
+
+ hidden_states = self.qkv(hidden_states, cur_params.qkv_w, b=cur_params.qkv_b)
+ hidden_states = self.attn(hidden_states, kv_cache, ragged_batch_info)
+ hidden_states = self.attn_out(hidden_states, cur_params.attn_out_w, b=None)
+
+ if self.tp_size > 1:
+ dist.all_reduce(hidden_states, group=self._base_mp_group)
+
+ residual, hidden_states = self.norm(residual, hidden_states, cur_params.mlp_norm_gamma, beta=None)
+
+ shared_expert_output = self.shared_expert_mlp_1(hidden_states, cur_params.shared_moe_mlp_1, b=None)
+ shared_expert_output = self.shared_expert_mlp_2(shared_expert_output, cur_params.shared_moe_mlp_2, b=None)
+ shared_expert_gate_output = self.shared_expert_gate(hidden_states, cur_params.shared_moe_gate, b=None)[..., :1]
+ # shared_expert_gate_output shape[-1] is 1
+ shared_expert_output.mul_(torch.sigmoid(shared_expert_gate_output))
+ hidden_states = self.moe(hidden_states, ragged_batch_info, cur_params.moe_gate, cur_params.moe_mlp_1,
+ cur_params.moe_mlp_2)
+ hidden_states.add_(shared_expert_output)
+
+ if self.tp_size > 1:
+ dist.all_reduce(hidden_states, group=self._base_mp_group)
+
+ if layer_idx != self.num_layers - 1:
+ next_params = self._transformer[layer_idx + 1]
+ residual, hidden_states = self.norm(residual, hidden_states, next_params.attn_norm_gamma, beta=None)
+ else:
+ # On last layer, we just need to perform the residual add. Adding into the residual
+ # here is safe.
+ residual.add_(hidden_states)
+
+ return residual, hidden_states
+
+ def _forward_unembed(self, hidden_states: torch.Tensor, ragged_batch_info: RaggedBatchWrapper) -> torch.Tensor:
+ """
+ Performs unembedding of the hidden states to logits. This will only sample the final
+ token of each sequence.
+ """
+ logits = self.unembed(hidden_states,
+ self._non_transformer.word_unembed,
+ ragged_batch_info,
+ gamma=self._non_transformer.final_norm)
+
+ if self.tp_size > 1:
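+ # Gather the logit shards from all tensor-parallel ranks and concatenate them
+ # along the vocabulary dimension to recover the full-vocab logits.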
+ comm_buffer = empty_from(self._comm_logits, (self.tp_size, logits.shape[0], logits.shape[1]))
+ full_logits = empty_from(self._return_logits, (logits.shape[0], self.vocab_size))
+
+ dist.all_gather_into_tensor(comm_buffer, logits, group=self._base_mp_group)
+
+ full_logits.copy_(comm_buffer.permute(1, 0, 2).reshape(logits.shape[0], self.vocab_size))
+
+ return full_logits
+ else:
+ return logits
+
+ def forward(self, wrapped_batch: RaggedBatchWrapper) -> torch.Tensor:
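+ """
+ Executes a full forward pass over the ragged batch: embedding lookup, pre-normalization
+ for the first transformer layer, the stack of transformer layers, and unembedding of the
+ final residual into logits for the last token of each sequence.
+ """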
+
+ residual = self._forward_embed(wrapped_batch)
+
+ residual, hidden_states = self.norm(residual, None, self._transformer[0].attn_norm_gamma, beta=None)
+
+ for layer_idx in range(self.num_layers):
+ residual, hidden_states = self._forward_transformer(layer_idx, residual, hidden_states, wrapped_batch)
+
+ return self._forward_unembed(residual, wrapped_batch)
diff --git a/deepspeed/deepspeed/inference/v2/model_implementations/qwen_v2_moe/policy.py b/deepspeed/deepspeed/inference/v2/model_implementations/qwen_v2_moe/policy.py
new file mode 100644
index 000000000000..630bafe993a8
--- /dev/null
+++ b/deepspeed/deepspeed/inference/v2/model_implementations/qwen_v2_moe/policy.py
@@ -0,0 +1,30 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from typing import Any
+
+from ...config_v2 import RaggedInferenceEngineConfig
+from ..inference_policy_base import ContainerMap, InferenceV2Policy
+from .container import Qwen2MoeNonTransformerContainer, Qwen2MoeTransformerContainer
+from .model import Qwen2MoeInferenceModel
+
+
+class Qwen2MoePolicy(InferenceV2Policy):
+
+ def instantiate_model(self, engine_config: RaggedInferenceEngineConfig, mp_group: Any) -> Qwen2MoeInferenceModel:
+ return Qwen2MoeInferenceModel(config=self._model_config, engine_config=engine_config, base_mp_group=mp_group)
+
+ def build_container_map(self) -> ContainerMap:
+ map = ContainerMap()
+
+ transformer_containers = [Qwen2MoeTransformerContainer(self.model) for _ in range(self.model.num_layers)]
+
+ map.set_transformer_params(['model.layers'], transformer_containers)
+
+ map.set_non_transformer_params(Qwen2MoeNonTransformerContainer(self.model))
+
+ map.set_unmapped_params([])
+
+ return map
diff --git a/deepspeed/deepspeed/ops/fp_quantizer/fp8_gemm.py b/deepspeed/deepspeed/ops/fp_quantizer/fp8_gemm.py
new file mode 100644
index 000000000000..55504e3af8c9
--- /dev/null
+++ b/deepspeed/deepspeed/ops/fp_quantizer/fp8_gemm.py
@@ -0,0 +1,171 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+######## Fused FP8 GeMM kernels ########
+# These kernels fuse the GeMM with the
+# dequantization of FP8 weight data when
+# the activations are in a 16-bit format
+# (bf16/fp16).
+########################################
+
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def matmul_kernel_fp8_bf16(inp_ptr, weight_ptr, out_ptr, scale_ptr, M, N, K, stride_am, stride_ak, stride_bk,
+ stride_bn, stride_cm, stride_cn, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr,
+ BLOCK_SIZE_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr,
+ quantization_group_size: tl.constexpr):
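+ # Map the flat program id to a (pid_m, pid_n) output tile using grouped ordering
+ # (GROUP_SIZE_M rows of tiles per group) to improve L2 reuse of the input blocks.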
+ pid = tl.program_id(axis=0)
+ num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+ num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+ num_pid_in_group = GROUP_SIZE_M * num_pid_n
+ group_id = pid // num_pid_in_group
+ first_pid_m = group_id * GROUP_SIZE_M
+ group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+ pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
+ pid_n = (pid % num_pid_in_group) // group_size_m
+
+ offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
+ offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
+ offs_k = tl.arange(0, BLOCK_SIZE_K)
+
+ inp_data = inp_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
+ weight_data = weight_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
+ weight_ptrs_offset = offs_k[:, None] * (stride_bk // quantization_group_size) + (
+ (pid_n * BLOCK_SIZE_N) // quantization_group_size)
+
+ weight = tl.load(weight_data, mask=offs_k[:, None] < K, other=0.0)
+ scale = tl.load(scale_ptr + weight_ptrs_offset)
+
+ accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+ for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+ inp = tl.load(inp_data, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
+ # Dequantize weight (fp8 -> bf16)
+ w = (((weight & 0x80) << 8) | ((weight & 0x7f) << 4)).to(tl.uint16)
+ w = (w + 0x3C00).to(tl.uint16)
+ w = (w.to(tl.bfloat16, bitcast=True) * scale).to(tl.bfloat16)
+
+ inp_data += BLOCK_SIZE_K * stride_ak
+ weight_data += BLOCK_SIZE_K * stride_bk
+ weight_mask = offs_k[:, None] < K - (k + 1) * BLOCK_SIZE_K
+ weight = tl.load(weight_data, mask=weight_mask, other=0.0)
+ scale = tl.load(scale_ptr + (weight_ptrs_offset +
+ (((k + 1) * BLOCK_SIZE_K * stride_bk) // quantization_group_size)),
+ mask=weight_mask,
+ other=0.0)
+
+ accumulator += tl.dot(inp, w)
+
+ out = accumulator.to(tl.bfloat16)
+
+ offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+ offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+ out_data = out_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
+ tl.store(out_data, out, mask=(offs_cm[:, None] < M) & (offs_cn[None, :] < N))
+
+
+@triton.jit
+def matmul_kernel_fp8_fp16(inp_ptr, weight_ptr, out_ptr, scale_ptr, M, N, K, stride_am, stride_ak, stride_bk,
+ stride_bn, stride_cm, stride_cn, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr,
+ BLOCK_SIZE_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr,
+ quantization_group_size: tl.constexpr):
+ pid = tl.program_id(axis=0)
+ num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+ num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+ num_pid_in_group = GROUP_SIZE_M * num_pid_n
+ group_id = pid // num_pid_in_group
+ first_pid_m = group_id * GROUP_SIZE_M
+ group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+ pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
+ pid_n = (pid % num_pid_in_group) // group_size_m
+
+ offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
+ offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
+ offs_k = tl.arange(0, BLOCK_SIZE_K)
+
+ inp_data = inp_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
+ weight_data = weight_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
+ weight_ptrs_offset = offs_k[:, None] * (stride_bk // quantization_group_size) + (
+ (pid_n * BLOCK_SIZE_N) // quantization_group_size)
+
+ weight = tl.load(weight_data, mask=offs_k[:, None] < K, other=0.0)
+ scale = tl.load(scale_ptr + weight_ptrs_offset)
+
+ accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+ for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+ inp = tl.load(inp_data, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
+ # Dequantize weight (fp8 -> fp16)
+ w = (((weight & 0x80) << 8) | ((weight & 0x7f) << 7)).to(tl.uint16)
+ w = (w + 0x2000).to(tl.uint16)
+ w = (w.to(tl.float16, bitcast=True) * scale).to(tl.float16)
+
+ inp_data += BLOCK_SIZE_K * stride_ak
+ weight_data += BLOCK_SIZE_K * stride_bk
+
+ weight = tl.load(weight_data, mask=offs_k[:, None] < K - (k + 1) * BLOCK_SIZE_K, other=0.0)
+ scale = tl.load(scale_ptr + (weight_ptrs_offset +
+ (((k + 1) * BLOCK_SIZE_K * stride_bk) // quantization_group_size)))
+
+ accumulator += tl.dot(inp, w)
+
+ out = accumulator.to(tl.float16)
+
+ offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+ offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+ out_data = out_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
+ tl.store(out_data, out, mask=(offs_cm[:, None] < M) & (offs_cn[None, :] < N))
+
+
+def matmul_fp8(inp, weight, scale, quantization_group_size):
+
+ assert inp.shape[1] == weight.shape[0], \
+ f"Incompatible dimensions (input: {inp.shape}, weight: {weight.shape})"
+
+ M, K = inp.shape
+ K, N = weight.shape
+
+ out = torch.empty((M, N), device=inp.device, dtype=inp.dtype)
+
+ # GEMM tuning parameters!
+ # TODO: Add a more configurable tuning for selecting the best GeMM
+ BLOCK_SIZE_M = 16 if M <= 16 else 32 if M <= 32 else 64 if M <= 64 else 128
+ BLOCK_SIZE_N = 64
+ BLOCK_SIZE_K = max(64, quantization_group_size)
+ GROUP_SIZE_M = 8
+ num_stages = 4
+ num_warps = 4
+ if M >= 256:
+ BLOCK_SIZE_M = 256
+ BLOCK_SIZE_N = 128
+ BLOCK_SIZE_K = max(128, quantization_group_size)
+ num_stages = 3
+ num_warps = 8
+
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']), )
+ kernel = matmul_kernel_fp8_bf16 if inp.dtype == torch.bfloat16 else matmul_kernel_fp8_fp16
+ kernel[grid](inp,
+ weight,
+ out,
+ scale,
+ M,
+ N,
+ K,
+ inp.stride(0),
+ inp.stride(1),
+ weight.stride(0),
+ weight.stride(1),
+ out.stride(0),
+ out.stride(1),
+ quantization_group_size=quantization_group_size,
+ BLOCK_SIZE_M=BLOCK_SIZE_M,
+ BLOCK_SIZE_N=BLOCK_SIZE_N,
+ BLOCK_SIZE_K=BLOCK_SIZE_K,
+ GROUP_SIZE_M=GROUP_SIZE_M,
+ num_stages=num_stages,
+ num_warps=num_warps)
+ return out
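+
+
+if __name__ == "__main__":
+ # Minimal usage sketch: the shapes and the quantized-weight/scale layouts below are
+ # illustrative assumptions only; in practice these tensors are produced by the FP8
+ # quantizer (assumed here: row-major uint8 weight of shape (K, N) and one scale value
+ # per `quantization_group_size` weight elements).
+ M, K, N, group_size = 32, 4096, 4096, 128
+ inp = torch.randn((M, K), device='cuda', dtype=torch.bfloat16)
+ weight = torch.randint(0, 256, (K, N), device='cuda', dtype=torch.uint8)
+ scale = torch.ones(K * N // group_size, device='cuda', dtype=torch.bfloat16)
+ out = matmul_fp8(inp, weight, scale, group_size)
+ print(out.shape, out.dtype)  # expected: torch.Size([32, 4096]) torch.bfloat16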
diff --git a/deepspeed/deepspeed/sequence/layer.py b/deepspeed/deepspeed/sequence/layer.py
new file mode 100644
index 000000000000..f17cfa883cc6
--- /dev/null
+++ b/deepspeed/deepspeed/sequence/layer.py
@@ -0,0 +1,212 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+import torch
+
+from typing import Any, Tuple
+from torch import Tensor
+from torch.nn import Module
+
+import deepspeed.comm as dist
+from deepspeed.accelerator import get_accelerator
+
+
+def post_all2all(transpose, res_shape):
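+ """
+ Returns a closure that post-processes an all-to-all output buffer: it optionally
+ transposes the gathered dimension back into place and reshapes the buffer to `res_shape`.
+ """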
+
+ def post_func(input):
+ if transpose:
+ input = input.transpose(0, 2).contiguous()
+ input = input.reshape(res_shape)
+ return input
+
+ return post_func
+
+
+def single_all_to_all(input, scatter_idx, gather_idx, group, async_op=False, handle=None, type=None):
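+ """
+ Performs a single all-to-all over the sequence-parallel group, splitting `input` along
+ `scatter_idx` and concatenating the shards along `gather_idx` (e.g., trading a sharded
+ sequence dimension for sharded attention heads, or vice versa). With `async_op=True` the
+ call returns before the communication completes and, for the 'dq'/'dk' gradient paths,
+ the work handle, output buffer, and post-processing function are stashed in `handle` so
+ the caller can overlap the communication with computation.
+ """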
+ seq_world_size = dist.get_world_size(group)
+ inp_shape = list(input.shape)
+ inp_shape[scatter_idx] = inp_shape[scatter_idx] // seq_world_size
+ if scatter_idx < 2:
+ input_t = input.reshape(
+ [seq_world_size, inp_shape[scatter_idx]] + \
+ inp_shape[scatter_idx + 1:]
+ ).contiguous()
+ else:
+ # transpose groups of heads with the seq-len parallel dimension, so that we can scatter them!
+ input_t = input.reshape(
+ [-1, seq_world_size, inp_shape[scatter_idx]] + \
+ inp_shape[scatter_idx + 1:]
+ ).transpose(0, 1).contiguous()
+
+ output = torch.empty_like(input_t)
+ work = dist.all_to_all_single(output, input_t, group=group, async_op=async_op)
+
+ res_shape=( inp_shape[: gather_idx] + \
+ [inp_shape[gather_idx] * seq_world_size,] + \
+ inp_shape[gather_idx + 1:])
+ transpose = True if scatter_idx < 2 else False
+ post_all2all_fun = post_all2all(transpose, res_shape)
+
+ if async_op:
+ if type in ('dq', 'dk'):
+ handle[type + '_work'] = work
+ handle[type + '_grad'] = output
+ handle[type + '_post_all2all_func'] = post_all2all_fun
+ return output.view(res_shape)
+
+ res = post_all2all_fun(output)
+ return res
+
+
+class _SeqAllToAll(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any,
+ group: dist.ProcessGroup,
+ input: Tensor,
+ scatter_idx: int,
+ gather_idx: int,
+ stream=None,
+ handle=None,
+ type=None,
+ is_fwd=True) -> Tensor:
+ ctx.group = group
+ ctx.scatter_idx = scatter_idx
+ ctx.gather_idx = gather_idx
+ ctx.stream = stream
+ ctx.handle = handle
+ ctx.type = type
+ if ctx.handle is None:
+ res = single_all_to_all(input, scatter_idx, gather_idx, group, False)
+
+ else:
+ # overlap communication path
+ if not is_fwd and type == 'o':
+ assert ctx.stream is not None
+ res = single_all_to_all(input, scatter_idx, gather_idx, group, False)
+ get_accelerator().current_stream().wait_stream(ctx.stream)
+ del ctx.stream.activation_buffer_list
+ # The computation of d o_weight can overlap with the communication of d o_input
+
+ elif not is_fwd and type in ('q', 'k'):
+ # Achieve communication overlap by pipelining the matrix computation and communication of dq, dk, and dv
+ type = 'd' + type
+ res = single_all_to_all(input, scatter_idx, gather_idx, group, True, handle, type)
+
+ elif is_fwd and type in ('q', 'k'):
+ # Achieve communication overlap by pipelining the matrix computation and communication of q, k, and v
+ type = 'fwd_' + type
+ res = single_all_to_all(input, scatter_idx, gather_idx, group, False, handle, type)
+
+ else:
+ res = single_all_to_all(input, scatter_idx, gather_idx, group, False)
+
+ return res
+
+ @staticmethod
+ def backward(ctx: Any, *grad_output: Tensor) -> Tuple[None, Tensor, None, None, None, None, None, None]:
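+ # The backward of an all-to-all is another all-to-all with the scatter and gather
+ # dimensions swapped; the trailing Nones correspond to the non-tensor forward arguments.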
+
+ return (None,
+ _SeqAllToAll.apply(ctx.group, *grad_output, ctx.gather_idx, ctx.scatter_idx, ctx.stream, ctx.handle,
+ ctx.type, False), None, None, None, None, None, None)
+
+
+class DistributedAttention(torch.nn.Module):
+ """Initialization.
+
+ Arguments:
+ local_attention (Module): local attention with q,k,v
+ sequence_process_group (ProcessGroup): sequence parallel process group
+ scatter_idx (int): scatter_idx for all2all comm
+ gather_idx (int): gather_idx for all2all comm
+ """
+
+ def __init__(
+ self,
+ local_attention: Module,
+ sequence_process_group: dist.ProcessGroup,
+ scatter_idx: int = 2,
+ gather_idx: int = 0,
+ sp_stream=None,
+ ) -> None:
+
+ super(DistributedAttention, self).__init__()
+ self.local_attn = local_attention
+ self.spg = sequence_process_group
+ self.scatter_idx = scatter_idx
+ self.gather_idx = gather_idx
+ self.sp_overlap_comm = False
+ self.overlap_handles = None
+ self.sp_stream = sp_stream
+ if sp_stream is not None:
+ self.overlap_handles = {}
+ self.sp_overlap_comm = True
+ self.default_stream = get_accelerator().default_stream()
+
+ def layer_sync(self, layer):
+ if self.sp_overlap_comm and hasattr(layer, 'done_event'):
+ self.default_stream.wait_event(layer.done_event)
+
+ def forward(self, query: Tensor, key: Tensor, value: Tensor, *args: Any, **kwargs) -> Tensor:
+ """ forward
+
+ Arguments:
+ query (Tensor): query input to the layer
+ key (Tensor): key input to the layer
+ value (Tensor): value input to the layer
+ args: other args
+
+ Returns:
+ * output (Tensor): context output
+ """
+
+ # TODO Merge three alltoall calls into one
+ # TODO (Reza): change the api on the megatron-deepspeed side so that we only receive all data (q,k, and v) together!
+ #in shape : e.g., [s/p:h:]
+
+ def bwd_hook(layer_type):
+
+ def pre_hook_fun(grad):
+ type = 'd' + layer_type
+ self.overlap_handles[type + '_work'].wait()
+ self.sp_stream.wait_stream(self.default_stream)
+ all2all_output = self.overlap_handles[type + '_grad']
+ grad = list(grad)
+ grad[0] = self.overlap_handles[type + '_post_all2all_func'](all2all_output)
+ grad = tuple(grad)
+
+ return pre_hook_fun
+
+ self.layer_sync(query)
+ query_layer = _SeqAllToAll.apply(self.spg, query, self.scatter_idx, self.gather_idx, None,
+ self.overlap_handles, 'q')
+ self.layer_sync(key)
+ key_layer = _SeqAllToAll.apply(self.spg, key, self.scatter_idx, self.gather_idx, None, self.overlap_handles,
+ 'k')
+ if self.sp_overlap_comm:
+ self.default_stream.wait_stream(self.sp_stream)
+
+ value_layer = _SeqAllToAll.apply(self.spg, value, self.scatter_idx, self.gather_idx, None,
+ self.overlap_handles, 'v')
+
+ if self.sp_overlap_comm:
+ # Register hooks that synchronize dq and dk with their all-to-all
+ # operations at the point where the gradient data is first used.
+ # Registering the hooks after the q, k, v all-to-all calls keeps the
+ # Python-side overhead from delaying the launch of the forward
+ # all-to-all communication.
+ grad_fn_q = query.grad_fn.next_functions[0][0]
+ grad_fn_q.register_prehook(bwd_hook(layer_type='q'))
+ grad_fn_k = key.grad_fn.next_functions[0][0]
+ grad_fn_k.register_prehook(bwd_hook(layer_type='k'))
+
+ #out shape : e.g., [s:h/p:]
+
+ context_layer = self.local_attn(query_layer, key_layer, value_layer, *args, **kwargs)
+
+ output = _SeqAllToAll.apply(self.spg, context_layer, self.gather_idx, self.scatter_idx, self.sp_stream,
+ self.overlap_handles, 'o')
+
+ #out e.g., [s/p::h]
+ return output
diff --git a/deepspeed/docs/_tutorials/onebit-adam.md b/deepspeed/docs/_tutorials/onebit-adam.md
new file mode 100644
index 000000000000..b1a8b5369761
--- /dev/null
+++ b/deepspeed/docs/_tutorials/onebit-adam.md
@@ -0,0 +1,304 @@
+---
+title: "1-bit Adam: Up to 5x less communication volume and up to 3.4x faster training"
+tags: training IO
+toc: false
+---
+
+**Note:**
+On 03/07/2022 we released 0/1 Adam, a new communication-efficient Adam optimizer that partially follows 1-bit Adam's design. Compared to the 1-bit Adam described below, 0/1 Adam provides better communication efficiency and the same final model quality on different tasks including BERT, GPT-2, and ImageNet. Thus we recommend first trying 0/1 Adam ([tutorial](/tutorials/zero-one-adam/)), and then trying 1-bit Adam if 0/1 Adam cannot match baseline Adam's convergence on your task.
+{: .notice--info}
+
+**Note:**
+This tutorial was updated on 03/04/2021 to reflect 1-bit Adam v2. Changes include: 1) an NCCL-based implementation which provides better performance and usability than the MPI-based implementation; 2) support for momentum masks for parameters with constant zero gradients during training; 3) bug fixes. See details below.
+{: .notice--info}
+
+**Watch out!**
+1) The NCCL-based implementation requires PyTorch >= 1.8 (and NCCL >= 2.8.3 when you have 64 or more GPUs). See details below. 2) Although 1-bit Adam is compatible with both FP16 and FP32, currently we only verified the convergence under mixed precision/FP16 training. 3) Currently the MPI-based implementation is not compatible with pipeline parallelism. 4) Frequent checkpoint loading could hurt 1-bit Adam's convergence. See details below.
+{: .notice--warning}
+
+In this tutorial, we are going to introduce the 1-bit Adam optimizer in DeepSpeed. 1-bit Adam can improve model training speed on communication-constrained clusters, especially for communication-intensive large models by reducing the overall communication volume by up to 5x. Detailed description of the 1-bit Adam algorithm, its implementation in DeepSpeed, and performance evaluation is available from our [blog post](https://www.deepspeed.ai/2020/09/08/onebit-adam-blog-post.html). We also have a [paper](https://arxiv.org/abs/2102.02888) which provides the most complete details including algorithm, system implementation, theoretical analysis, and more evaluations.
+
+To illustrate the benefits and usage of 1-bit Adam optimizer in DeepSpeed, we use the following two training tasks as examples:
+
+1. BingBertSQuAD Fine-tuning
+2. BERT Pre-training
+
+For more details on these tasks, please refer to the tutorial posts on [BingBertSQuAD Fine-tuning](/tutorials/bert-finetuning/) and [BERT Pre-training](/tutorials/bert-pretraining/).
+
+## 1. Overview
+
+### 1.1 Pre-requisites for installing DeepSpeed
+
+If you don't already have a copy of the DeepSpeed repository, please clone it
+now and checkout the DeepSpeedExamples submodule that contains the BingBertSQuAD and BERT Pre-training examples.
+
+```shell
+git clone https://github.com/microsoft/DeepSpeed
+cd DeepSpeed
+git submodule update --init --recursive
+cd DeepSpeedExamples/
+```
+
+### 1.2 Pre-requisites for 1-bit Adam
+
+#### 1.2.1 (New in v2) NCCL-based implementation
+
+In 1-bit Adam v2, we introduce a new system implementation for compressed communication using the NCCL backend of PyTorch distributed. This significantly improves usability thanks to NCCL’s integration with PyTorch distributed. The performance of our new NCCL-based implementation is also better than our earlier MPI-based implementation on Ethernet-based systems and on par on InfiniBand-based systems. Thus we highly recommend that users choose this implementation.
+
+**Watch out!**
+This NCCL-based implementation requires PyTorch >= 1.8. It also requires NCCL >= 2.8.3 when you have 64 or more GPUs, to avoid certain NCCL runtime bugs. Currently (2021/03/16) NCCL 2.8.3 is not officially supported by PyTorch. The workaround we used is to load NCCL 2.8.3 via `LD_PRELOAD`: 1) Install NCCL 2.8.3. This works for us on a CUDA 11 system: `apt-get install -y libnccl2=2.8.3-1+cuda11.0 libnccl-dev=2.8.3-1+cuda11.0`. 2) Set `LD_PRELOAD` to the library path. This works for us: `LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libnccl.so.2.8.3`. To confirm `LD_PRELOAD` is working, check the version reported in the NCCL logs (with `NCCL_DEBUG=INFO`); it should say: NCCL version 2.8.3+cuda11.0.
+{: .notice--warning}
+
+#### 1.2.2 MPI-based implementation
+
+For this implementation, we rely on Message Passing Interface (MPI) for advanced communication primitives.
+
+We package the necessary dependencies in the DeepSpeed docker images. However, if you are using a different build system, please install MPI and mpi4py on your system. To install the prerequisites run:
+
+```shell
+pip install deepspeed[1bit_adam]
+```
+
+We have tested CUDA-Aware MPI communication using the [MVAPICH2-GDR](http://mvapich.cse.ohio-state.edu/userguide/gdr/) library. However, any CUDA-Aware communication library including [OpenMPI](https://www.open-mpi.org/) should work fine with these examples.
+
+An example launch command for 1-bit Adam using the `deepspeed` launcher is as follows:
+
+```shell
+deepspeed --launcher=[mvapich|openmpi] script.py
+```
+
+Please note that for MPI-based implementation of 1-bit Adam, the `--launcher=[mvapich|openmpi]` flag is required when using the `deepspeed` launcher.
+
+Alternatively, the standard mpirun launcher can also be used as follows:
+
+```shell
+mpirun -np [#processes] -ppn [#GPUs on each node] -hostfile [hostfile] [MPI flags] python [training_script.py]
+```
+
+#### 1.2.3 Compressed implementation
+
+This backend abstracts the generic part of the one-bit optimizers and implements the accelerator-dependent part with the DeepSpeed custom op builder. To use this `CompressedBackend`, make sure that your current accelerator supports `PackbitsBuilder`, so that it can be loaded to perform high-performance packing and unpacking between float and byte datatypes, which the one-bit algorithms rely on. An example can be found in `Deepspeed/op_builder/xpu/packbits.py`.
+
+This approach does not require an NCCL- or MPI-based communication library. It automatically uses the default communication library selected by your accelerator in `deepspeed/comm`.
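+
+As a sketch of how this fits together, selecting the compressed backend only requires setting `comm_backend_name` in the optimizer config (the config keys are the same ones described in Section 1.4 below; the model here is a placeholder and the exact `deepspeed.initialize` call may differ in your training script):
+
+```python
+import torch
+import deepspeed
+
+ds_config = {
+ "train_batch_size": 4096,
+ "train_micro_batch_size_per_gpu": 16,
+ "optimizer": {
+ "type": "OneBitAdam",
+ "params": {
+ "lr": 4e-4,
+ "freeze_step": 23000,
+ "cuda_aware": False,
+ "comm_backend_name": "compressed"
+ }
+ },
+ "fp16": {"enabled": True}
+}
+
+model = torch.nn.Linear(1024, 1024)  # placeholder model for illustration
+engine, optimizer, _, _ = deepspeed.initialize(model=model,
+ model_parameters=model.parameters(),
+ config=ds_config)
+```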
+
+### 1.3 1-bit Algorithm
+
+The detailed description of the 1-bit Algorithm can be seen from our [blog post](https://www.deepspeed.ai/2020/09/08/onebit-adam-blog-post.html) and our [paper](https://arxiv.org/abs/2102.02888).
+
+### 1.4 Configuration of 1-bit Adam
+The 1-bit Adam feature can be used by setting the optimizer configuration options as follows. An example json config file is shown below.
+
+```json
+{
+ "train_batch_size": 4096,
+ "train_micro_batch_size_per_gpu": 16,
+ "optimizer": {
+ "type": "OneBitAdam",
+ "params": {
+ "lr": 4e-4,
+ "freeze_step": 23000,
+ "cuda_aware": false,
+ "comm_backend_name": "nccl"
+ }
+ },
+ "fp16": {
+ "enabled": true,
+ }
+}
+```
+Please note three new parameters `freeze_step`, `cuda_aware`, and `comm_backend_name` that have been added to support the 1-bit Adam feature.
+
+`freeze_step` is the number of warm-up steps before 1-bit compression gets applied to the communication. To determine the number of warm-up steps, one strategy is to use 15-25% of the total training steps for a given model (this is related to Adam's variance/second-moment term; see the detailed analysis in our [paper](https://arxiv.org/abs/2102.02888)). If this provides the desired outcome, one can try to extract more performance by reducing the number of steps systematically. In the future, we plan to introduce a threshold that can automatically determine the number of warm-up steps for different models. The examples below have been tuned for the number of warm-up steps, and the `freeze_step` parameter has already been set to the best number we found in the corresponding run scripts.
+
+`cuda_aware` is used for MPI-based implementation to indicate that the underlying MPI library supports CUDA-Aware communication. This feature is only supported on systems with InfiniBand interconnect and a CUDA-Aware MPI library like [MVAPICH2-GDR](http://mvapich.cse.ohio-state.edu/userguide/gdr/) or OpenMPI built with CUDA-Aware support. Setting `cuda_aware` to False will allow training on Ethernet based systems. However, the communication will happen using sender as well as receiver side memory copies between CPU and GPU buffers before and after communication.
+
+(New in v2) `comm_backend_name` is used to indicate which backend implementation to use. You can choose between NCCL, MPI-based and compressed implementations by setting `comm_backend_name` to "nccl", "mpi" or "compressed". When using NCCL-based implementation, there is no need to set `cuda_aware`.
+
+#### 1.4.1 (New in v2) Momentum masks for parameters with constant zero gradients
+Because 1-bit compression cannot represent exact zero, the compression error would keep accumulating in the momentum if a parameter has constant zero gradients during training. For example, for BERT pre-training with sequence length 128, `bert.embeddings.position_embeddings.weight` has constant zeros in its gradient and momentum for rows 129 to 512, because it only learns up to sequence length 128 while the model supports up to sequence length 512. Thus in 1-bit Adam v2 we added support for a momentum mask that lets users specify the parameters that have constant exact zeros in their gradients. See the [example script](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/deepspeed_train.py) for how to configure this momentum mask. Note that we don't use the momentum mask saved in checkpoints, since this mask could change during training (e.g., BERT sequence lengths 128 and 512 require different masks), so you have to provide this mask every time in your training script.
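+
+As a rough illustration of the idea (the exact wiring is in the linked example script; the `exp_avg_mask` parameter-group key used below is an assumption for illustration, and the parameter path is a BERT-specific placeholder):
+
+```python
+import torch
+
+def build_position_embedding_mask(pos_emb_weight: torch.Tensor, max_seq_len: int) -> torch.Tensor:
+ # 1.0 for rows that actually receive gradients (positions < max_seq_len), 0.0 for the
+ # rows that always see exact-zero gradients, so their momentum stays exactly zero.
+ mask = torch.zeros_like(pos_emb_weight)
+ mask[:max_seq_len, :] = 1.0
+ return mask
+
+# Sketch: attach the mask to the parameter group holding the position embeddings, e.g.
+# pos_emb = model.bert.embeddings.position_embeddings.weight
+# param_groups = [{'params': [pos_emb], 'exp_avg_mask': build_position_embedding_mask(pos_emb, 128)}, ...]
+```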
+
+**Watch out!**
+1-bit Adam relies on a compression error compensation mechanism to maintain the convergence speed during the compression stage. When loading checkpoints, we actually reset the compression errors for 3 reasons: 1) The worker and server errors at each GPU are distinct, so in the current implementation only rank 0's errors are saved in the checkpoint. Thus we have to reset the errors. If we wanted to save them correctly we would need O(num_gpu*model_size) memory to gather all the errors, which is a very large memory requirement. It is possible to save them in a distributed way, but it would make checkpoint saving/loading much more complicated. 2) Even if we were able to save the compression errors correctly, you would need the exact same number of GPUs in order to load them correctly. 3) We verified on BERT pre-training that occasionally resetting the compression error at checkpoint loading does not affect the convergence. However, please avoid frequent checkpoint loading, which could break the error compensation mechanism and thus affect convergence.
+{: .notice--warning}
+
+## 2. BingBertSQuAD Fine-tuning with 1-bit Adam
+
+* Download the SQuAD dataset:
+ * Training set: [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
+ * Validation set: [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
+* Download the HuggingFace checkpoint and config files:
+ * [bert-large-uncased-whole-word-masking](https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin)
+ * [bert json config](https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json)
+
+You can also use a pre-trained BERT model checkpoint from either DeepSpeed, [HuggingFace](https://github.com/huggingface/transformers), or [TensorFlow](https://github.com/google-research/bert#pre-trained-models) to run the fine-tuning.
+
+**Note:** For details about loading checkpoint, argument parsing, initialization, forward pass, backward pass, weight update and evaluation, please refer to the [BingBertSQuAD Fine-tuning](/tutorials/bert-finetuning/) tutorial.
+
+### 2.1 Running BingBertSQuAD with DeepSpeed and 1-bit Adam
+
+We provide example scripts under [DeepSpeedExamples/BingBertSquad/1-bit_adam/](https://github.com/microsoft/DeepSpeedExamples/tree/master/BingBertSquad/1-bit_adam). There are 3 sets of scripts corresponding to the NCCL-based implementation, the MPI-based implementation on Ethernet systems, and the MPI-based implementation on InfiniBand systems. For the MPI-based implementation, we provide example scripts for launching with either deepspeed or mpirun.
+
+
+
+### 2.2 Configuration for BingBertSQuAD with DeepSpeed and 1-bit Adam enabled
+
+The `deepspeed_onebitadam_bsz96_config.json` file gives the user the ability to specify DeepSpeed
+options in terms of batch size, micro batch size, optimizer, learning rate, and other parameters.
+When running `nvidia_run_squad_deepspeed.py`, in addition to the
+`--deepspeed` flag to enable DeepSpeed, the appropriate DeepSpeed configuration
+file must be specified using `--deepspeed_config deepspeed_onebitadam_bsz96_config.json`.
+
+Table 1 shows the fine-tuning configuration we used in our experiments.
+
+| Parameters | Value |
+| ------------------------------ | ---------------------|
+| Total batch size | 96 |
+| Train micro batch size per GPU | 3 |
+| Optimizer | **"OnebitAdam"** |
+| Learning rate | 3e-5 |
+| Sequence-length | 384 |
+| Weight-decay | 0.0 |
+| Epoch count | 2 |
+| **freeze_step** | 400 |
+| **comm_backend_name** | "nccl" |
+
+Table 1. Fine-tuning configuration
+
+### 2.3 Performance Results for BingBertSQuAD Fine-tuning
+
+**Accuracy:**
+The results are summarized in the table below. The total batch size is set to 96 and training is conducted
+on 32 GPUs for 2 epochs. A set of hyperparameters (seeds and learning rates) was tried and the best ones were selected.
+We fixed the learning rate to 3e-5. The table below shows the F1 and EM scores we achieved, which are on par with or better than the [HuggingFace results](https://github.com/huggingface/transformers/tree/master/examples/question-answering).
+
+| Case | Model | Precision | EM | F1 |
+| ----------- | ------------------------------------- | --------- | ----- | ----- |
+| HuggingFace | [Bert-large-uncased-whole-word-masking](https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin) | FP16 | 87.26 | 93.32 |
+
+
+***Training Speed and Scalability:***
+
+
+
+Performance results of SQuAD Fine-tuning can be seen from our [blog post](https://www.deepspeed.ai/2020/09/08/onebit-adam-blog-post.html) and our [paper](https://arxiv.org/abs/2102.02888).
+
+
+
+## 3. BERT Pre-training with 1-bit Adam
+For data downloading and pre-processing, please refer to the [BERT Pre-training](/tutorials/bert-pretraining/) tutorial.
+
+### 3.1 Running Pre-training with DeepSpeed and 1-bit Adam
+
+We provide example scripts under [DeepSpeedExamples/bing_bert/1-bit_adam/](https://github.com/microsoft/DeepSpeedExamples/tree/master/bing_bert/1-bit_adam). There are 3 sets of scripts corresponding to the NCCL-based implementation, the MPI-based implementation on Ethernet systems, and the MPI-based implementation on InfiniBand systems. For the MPI-based implementation, we provide example scripts for launching with either deepspeed or mpirun.
+
+
+
+### 3.2 Configuration for BERT Pre-training with DeepSpeed and 1-bit Adam enabled
+
+The `deepspeed_bsz4k_onebit_config_seq128_*.json` file gives the user the ability to specify DeepSpeed
+options in terms of batch size, micro batch size, optimizer, learning rate, and other parameters.
+
+Below is the DeepSpeed configuration file for running BERT-large pre-training with sequence length of 128 using the 1-bit Adam optimizer.
+
+```json
+{
+ "train_batch_size": 4096,
+ "train_micro_batch_size_per_gpu": 16,
+ "steps_per_print": 100,
+ "prescale_gradients": false,
+ "optimizer": {
+ "type": "OneBitAdam",
+ "params": {
+ "lr": 4e-4,
+ "weight_decay": 0.01,
+ "bias_correction": false,
+ "freeze_step": 23000,
+ "comm_backend_name": "nccl"
+ }
+ },
+ "gradient_clipping": 1.0,
+ "fp16": {
+ "enabled": true,
+ "loss_scale": 0,
+ "initial_scale_power": 16
+ }
+}
+```
+The above file is for BERT-large. For BERT-base training (sequence length 128), the suggested `freeze_step` is 16000. For sequence length 512 pre-training, we suggest a `freeze_step` of 1500 for both BERT-base and BERT-large. Also make sure to set `comm_backend_name` and `cuda_aware` correctly as described above.
+
+### 3.3 Performance Results for BERT Pre-training
+
+Performance results of BERT Pre-training can be seen from our [blog post](https://www.deepspeed.ai/2020/09/08/onebit-adam-blog-post.html) and our [paper](https://arxiv.org/abs/2102.02888).
diff --git a/deepspeed/docs/_tutorials/onebit-lamb.md b/deepspeed/docs/_tutorials/onebit-lamb.md
new file mode 100644
index 000000000000..b6c6ef075036
--- /dev/null
+++ b/deepspeed/docs/_tutorials/onebit-lamb.md
@@ -0,0 +1,135 @@
+---
+title: "1-bit LAMB: Communication Efficient Large-Scale Large-Batch Training with LAMB's Convergence Speed"
+tags: training IO
+---
+
+**Watch out!**
+1) The NCCL-based implementation requires PyTorch >= 1.8 (and NCCL >= 2.8.3 when you have 64 or more GPUs). See details below. 2) Although 1-bit LAMB is compatible with both FP16 and FP32, currently we only verified the convergence under mixed precision/FP16 training. 3) Currently the MPI-based implementation is not compatible with pipeline parallelism. 4) Frequent checkpoint loading could hurt 1-bit LAMB's convergence. See details below.
+{: .notice--warning}
+
+In this tutorial, we introduce DeepSpeed's 1-bit LAMB optimizer which enables communication-efficient large-scale large-batch training with LAMB's convergence speed. 1-bit LAMB can improve model training speed on communication-constrained clusters, especially for communication-intensive large models by reducing the overall communication volume by up to 4.6x. We also have a [paper](https://arxiv.org/abs/2104.06069) which provides the technical details including algorithm, system implementation, and evaluations.
+
+To illustrate the benefits and usage of 1-bit LAMB optimizer, we use the BERT Pre-training task as example. For more details on this task, please refer to the [tutorial](/tutorials/bert-pretraining/).
+
+## 1. Overview
+
+### 1.1 Pre-requisites for installing DeepSpeed
+
+If you don't already have a copy of the DeepSpeed repository, please clone it
+now and checkout the DeepSpeedExamples submodule that contains the BERT Pre-training example.
+
+```shell
+git clone https://github.com/microsoft/DeepSpeed
+cd DeepSpeed
+git submodule update --init --recursive
+cd DeepSpeedExamples/
+```
+
+### 1.2 Pre-requisites for 1-bit LAMB
+
+#### 1.2.1 NCCL-based implementation
+
+In DeepSpeed, we introduce a system implementation for compressed communication using the NCCL backend of PyTorch distributed. This implementation provides better performance and usability than the MPI-based implementation below. Thus we highly recommend that users choose this implementation.
+
+**Watch out!**
+This NCCL-based implementation requires PyTorch >= 1.8. It also requires NCCL >= 2.8.3 when you have 64 or more GPUs, to avoid certain NCCL runtime bugs. Currently (2021/03/16) NCCL 2.8.3 is not officially supported by PyTorch. The workaround we used is to load NCCL 2.8.3 via `LD_PRELOAD`: 1) Install NCCL 2.8.3. This works for us on a CUDA 11 system: `apt-get install -y libnccl2=2.8.3-1+cuda11.0 libnccl-dev=2.8.3-1+cuda11.0`. 2) Set `LD_PRELOAD` to the library path. This works for us: `LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libnccl.so.2.8.3`. To confirm `LD_PRELOAD` is working, check the version reported in the NCCL logs (with `NCCL_DEBUG=INFO`); it should say: NCCL version 2.8.3+cuda11.0.
+{: .notice--warning}
+
+#### 1.2.2 MPI-based implementation
+
+For this implementation, we rely on Message Passing Interface (MPI) for advanced communication primitives.
+
+We package the necessary dependencies in the DeepSpeed docker images. However, if you are using a different build system, please install MPI and mpi4py on your system. To install the prerequisites run:
+
+```shell
+pip install deepspeed[1bit_adam]
+```
+
+We have tested CUDA-Aware MPI communication using the [MVAPICH2-GDR](http://mvapich.cse.ohio-state.edu/userguide/gdr/) library. However, any CUDA-Aware communication library including [OpenMPI](https://www.open-mpi.org/) should work fine with these examples.
+
+An example launch command for 1-bit LAMB using the `deepspeed` launcher is as follows:
+
+```shell
+deepspeed --launcher=[mvapich|openmpi] script.py
+```
+
+Please note that for MPI-based implementation of 1-bit LAMB, the `--launcher=[mvapich|openmpi]` flag is required when using the `deepspeed` launcher.
+
+Alternatively, the standard mpirun launcher can also be used as follows:
+
+```shell
+mpirun -np [num processes] -ppn [num GPUs on each node] -hostfile [hostfile] [MPI flags] python [training_script.py]
+```
+
+#### 1.2.3 Compressed implementation
+This backend abstracts the generic part of the one-bit optimizers and implements the accelerator-dependent part with the DeepSpeed custom op builder. To use this `CompressedBackend`, make sure that your current accelerator supports `PackbitsBuilder`, so that it can be loaded to perform high-performance packing and unpacking between float and byte datatypes, which the one-bit algorithms rely on. An example can be found in `Deepspeed/op_builder/xpu/packbits.py`.
+This approach does not require an NCCL- or MPI-based communication library. It automatically uses the default communication library selected by your accelerator in `deepspeed/comm`.
+
+### 1.3 1-bit LAMB Algorithm
+
+The detailed description of the 1-bit LAMB algorithm can be seen from our [paper](https://arxiv.org/abs/2104.06069).
+
+### 1.4 Configuration of 1-bit LAMB
+The 1-bit LAMB feature can be used by setting the optimizer configuration options as follows. An example json config file is shown below.
+
+```json
+{
+ "train_batch_size": 65536,
+ "train_micro_batch_size_per_gpu": 64,
+ "optimizer": {
+ "type": "OneBitLamb",
+ "params": {
+ "lr": 11e-3,
+ "max_coeff": 0.3,
+ "min_coeff": 0.01,
+ "freeze_step": 1000,
+ "cuda_aware": false,
+ "comm_backend_name": "nccl",
+ "coeff_beta": 0.9,
+ "factor_max": 4.0,
+ "factor_min": 0.5,
+ "factor_threshold": 0.1
+ }
+ },
+ "gradient_clipping": 1.0,
+ "fp16": {
+ "enabled": true,
+ "loss_scale": 0,
+ "initial_scale_power": 16
+ }
+}
+```
+Please note the new parameters `freeze_step`, `cuda_aware`, `comm_backend_name`, `coeff_beta`, `factor_max`, `factor_min`, and `factor_threshold` that have been added to support the 1-bit LAMB feature:
+
+`freeze_step` is the number of warm-up steps before 1-bit compression gets applied to the communication. To determine the number of warm-up steps, one strategy is to use 15-25% of the total training steps for a given model (this is related to LAMB's variance/second-moment term and scaling coefficient; see the detailed analysis in our [paper](https://arxiv.org/abs/2104.06069)). If this provides the desired outcome, one can try to extract more performance by reducing the number of steps systematically. In the future, we plan to introduce a threshold that can automatically determine the number of warm-up steps for different models. The examples below have been tuned for the number of warm-up steps, and the `freeze_step` parameter has already been set to the best number we found in the corresponding run scripts.
+
+`cuda_aware` is used for MPI-based implementation to indicate that the underlying MPI library supports CUDA-Aware communication. This feature is only supported on systems with InfiniBand interconnect and a CUDA-Aware MPI library like [MVAPICH2-GDR](http://mvapich.cse.ohio-state.edu/userguide/gdr/) or OpenMPI built with CUDA-Aware support. Setting `cuda_aware` to False will allow training on Ethernet based systems. However, the communication will happen using sender as well as receiver side memory copies between CPU and GPU buffers before and after communication.
+
+`comm_backend_name` is used to indicate which backend implementation to use. You can choose between NCCL, MPI-based and compressed implementations by setting `comm_backend_name` to "nccl", "mpi" or "compressed". When using NCCL-based implementation, there is no need to set `cuda_aware`.
+
+`coeff_beta` is used when calculating a moving average of the LAMB scaling coefficient during the warmup stage. This moving average is then used as the frozen base scaling coefficient during the compression stage.
+
+`factor_max`, `factor_min`, and `factor_threshold` are used to regularize the adaptive scaling of the frozen base scaling coefficient during the compression stage. `factor_max` and `factor_min` are the scaling factor upper/lower bound. `factor_threshold` defines the threshold of how much the scaling factor can fluctuate between steps.
+
+#### 1.4.1 Momentum masks for parameters with constant zero gradients
+Because 1-bit compression cannot represent exact zero, the compression error would keep accumulating in the momentum if a parameter has constant zero gradients during training. For example, for BERT pre-training with sequence length 128, `bert.embeddings.position_embeddings.weight` has constant zeros in its gradient and momentum for rows 129 to 512, because it only learns up to sequence length 128 while the model supports up to sequence length 512. Thus in 1-bit LAMB we added support for a momentum mask that lets users specify the parameters that have constant exact zeros in their gradients. See the [example script](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/deepspeed_train.py) for how to configure this momentum mask. Note that we don't use the momentum mask saved in checkpoints, since this mask could change during training (e.g., BERT sequence lengths 128 and 512 require different masks), so you have to provide this mask every time in your training script.
+
+**Watch out!**
+1-bit LAMB relies on a compression error compensation mechanism to maintain the convergence speed during the compression stage. When loading checkpoints, we actually reset the compression errors for 3 reasons: 1) The worker and server errors at each GPU are distinct, so in the current implementation only rank 0's errors are saved in the checkpoint. Thus we have to reset the errors. If we wanted to save them correctly we would need O(num_gpu*model_size) memory to gather all the errors, which is a very large memory requirement. It is possible to save them in a distributed way, but it would make checkpoint saving/loading much more complicated. 2) Even if we were able to save the compression errors correctly, you would need the exact same number of GPUs in order to load them correctly. 3) We verified on BERT pre-training that occasionally resetting the compression error at checkpoint loading does not affect the convergence. However, please avoid frequent checkpoint loading, which could break the error compensation mechanism and thus affect convergence.
+{: .notice--warning}
+
+## 2. BERT Pre-training with 1-bit LAMB
+For data downloading and pre-processing, please refer to the [BERT Pre-training tutorial](/tutorials/bert-pretraining/).
+
+### 2.1 Running Pre-training with DeepSpeed and 1-bit LAMB
+
+We provide example scripts under [DeepSpeedExamples/bing_bert/1-bit_lamb/](https://github.com/microsoft/DeepSpeedExamples/tree/master/bing_bert/1-bit_lamb). There are 3 sets of scripts corresponding to the NCCL-based implementation, the MPI-based implementation on Ethernet systems, and the MPI-based implementation on InfiniBand systems. For the MPI-based implementation, we provide example scripts for launching with either deepspeed or mpirun.
+
+### 2.2 Configuration for BERT Pre-training with DeepSpeed and 1-bit LAMB enabled
+
+The `deepspeed_bsz64k_onebitlamb_config_seq128_*.json` and `deepspeed_bsz32k_onebitlamb_config_seq512_*.json` files give the user the ability to specify DeepSpeed
+options in terms of batch size, micro batch size, optimizer, learning rate, and other parameters. In these files we include the tuned hyperparameters to reproduce experiments in our [paper](https://arxiv.org/abs/2104.06069).
+
+### 2.3 Performance Results for BERT Pre-training
+
+Performance results can be seen in our [paper](https://arxiv.org/abs/2104.06069).
diff --git a/deepspeed/docs/_tutorials/zero-one-adam.md b/deepspeed/docs/_tutorials/zero-one-adam.md
new file mode 100644
index 000000000000..055c685faf89
--- /dev/null
+++ b/deepspeed/docs/_tutorials/zero-one-adam.md
@@ -0,0 +1,145 @@
+---
+title: "Maximizing Communication Efficiency for Large-scale Training via 0/1 Adam"
+---
+
+**Watch out!**
+1) The NCCL-based implementation requires PyTorch >= 1.8 (and NCCL >= 2.8.3 when you have 64 or more GPUs). See details below. 2) Although 0/1 Adam is compatible with both FP16 and FP32, currently we only verified the convergence under mixed precision/FP16 training. 3) Currently the MPI-based implementation is not compatible with pipeline parallelism. 4) Frequent checkpoint loading could hurt 0/1 Adam's convergence. See details below.
+{: .notice--warning}
+
+In this tutorial, we introduce DeepSpeed's 0/1 Adam optimizer, which can improve model training speed on communication-constrained clusters, especially for communication-intensive large models. For instance, it is able to reduce the overall communication volume on BERT-large pre-training by up to 26x without affecting the end-to-end model accuracy.
+Compared to the 1-bit Adam optimizer, 0/1 Adam provides a more flexible way of using compressed communication via adaptive variance state freezing. Additionally, it allows the computing nodes to skip communication rounds during training using a technique called 1-bit sync, without compromising the convergence speed.
+We have a [paper](https://arxiv.org/abs/2202.06009) which provides the technical details including algorithm, system implementation, and evaluations.
+
+To illustrate the benefits and usage of 0/1 Adam optimizer, we use the BERT Pre-training task as example. For more details on this task, please refer to the [tutorial](/tutorials/bert-pretraining/).
+
+## 1. Overview
+
+### 1.1 Pre-requisites for installing DeepSpeed
+
+If you don't already have a copy of the DeepSpeed repository, please clone it
+now and checkout the DeepSpeedExamples submodule that contains the BERT Pre-training example.
+
+```shell
+git clone https://github.com/microsoft/DeepSpeed
+cd DeepSpeed
+git submodule update --init --recursive
+cd DeepSpeedExamples/
+```
+
+### 1.2 Pre-requisites for 0/1 Adam
+
+#### 1.2.1 NCCL-based implementation
+
+In DeepSpeed, we introduce a system implementation for compressed communication using the NCCL backend of PyTorch distributed. This implementation provides better performance and usability than the MPI-based implementation below. Thus we highly recommend that users choose this implementation.
+
+**Watch out!**
+This NCCL-based implementation requires PyTorch >= 1.8. It also requires NCCL >= 2.8.3 when you have 64 or more GPUs, to avoid certain NCCL runtime bugs. Currently (2021/03/16) NCCL 2.8.3 is not officially supported by PyTorch. The workaround we used is to load NCCL 2.8.3 via `LD_PRELOAD`: 1) Install NCCL 2.8.3. This works for us on a CUDA 11 system: `apt-get install -y libnccl2=2.8.3-1+cuda11.0 libnccl-dev=2.8.3-1+cuda11.0`. 2) Set `LD_PRELOAD` to the library path. This works for us: `LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libnccl.so.2.8.3`. To confirm `LD_PRELOAD` is working, check the version reported in the NCCL logs (with `NCCL_DEBUG=INFO`); it should say: NCCL version 2.8.3+cuda11.0.
+{: .notice--warning}
+
+#### 1.2.2 MPI-based implementation
+
+For this implementation, we rely on Message Passing Interface (MPI) for advanced communication primitives.
+
+We package the necessary dependencies in the DeepSpeed docker images. However, if you are using a different build system, please install MPI and mpi4py on your system. To install the prerequisites run:
+
+```shell
+pip install deepspeed[1bit_adam]
+```
+
+We have tested CUDA-Aware MPI communication using the [MVAPICH2-GDR](http://mvapich.cse.ohio-state.edu/userguide/gdr/) library. However, any CUDA-Aware communication library including [OpenMPI](https://www.open-mpi.org/) should work fine with these examples.
+
+An example launch command for 0/1 Adam using the `deepspeed` launcher is as follows:
+
+```shell
+deepspeed --launcher=[mvapich|openmpi] script.py
+```
+
+Please note that for MPI-based implementation of 0/1 Adam, the `--launcher=[mvapich|openmpi]` flag is required when using the `deepspeed` launcher.
+
+Alternatively, the standard mpirun launcher can also be used as follows:
+
+```shell
+mpirun -np [num processes] -ppn [num GPUs on each node] -hostfile [hostfile] [MPI flags] python [training_script.py]
+```
+
+#### 1.2.3 Compressed implementation
+This backend abstracts the generic part of the one-bit optimizers and implements the accelerator-dependent part with the DeepSpeed custom op builder. To use this `CompressedBackend`, make sure that your current accelerator supports `PackbitsBuilder`, so that it can be loaded to perform high-performance packing and unpacking between float and byte datatypes, which the one-bit algorithms rely on. An example can be found in `Deepspeed/op_builder/xpu/packbits.py`.
+This approach does not require an NCCL- or MPI-based communication library. It automatically uses the default communication library selected by your accelerator in `deepspeed/comm`.
+
+### 1.3 0/1 Adam Algorithm
+
+The detailed description of the 0/1 Adam algorithm can be seen from our [paper](https://arxiv.org/abs/2202.06009).
+
+### 1.4 Configuration of 0/1 Adam
+The 0/1 Adam feature can be used by setting the optimizer configuration options as follows. An example json config file is shown below.
+
+```json
+{
+ "train_batch_size": 4096,
+ "train_micro_batch_size_per_gpu": 16,
+ "optimizer": {
+ "type": "ZeroOneAdam",
+ "params": {
+ "lr": 1e-3,
+ "weight_decay": 0.01,
+ "bias_correction": false,
+ "var_freeze_step": 1000,
+ "var_update_scaler": 16,
+ "local_step_scaler": 1000,
+ "local_step_clipper": 16,
+ "cuda_aware": false,
+ "comm_backend_name": "nccl"
+ }
+ },
+ "gradient_clipping": 1.0,
+ "fp16": {
+ "enabled": true,
+ "loss_scale": 0,
+ "initial_scale_power": 16
+ }
+}
+```
+Please note the new parameters `var_freeze_step`, `var_update_scaler`, `local_step_scaler`, `local_step_clipper`, `cuda_aware` and `comm_backend_name` that have been added to support the 0/1 Adam feature:
+
+`var_freeze_step` is the latest step to update the variance. Using the notation from [0/1 Adam paper](https://arxiv.org/abs/2202.06009), it denotes the $\max\{i|i \in \mathcal{T}_v\}$. Note that this is different from the `freeze_step` in 1-bit Adam. The `var_freeze_step` is usually the last step of the learning rate warmup and thus does not require tuning. Note that this hyperparameter is optional. In practice, we can avoid tuning this parameter by setting it to a sufficiently large number (larger than the total number of steps). Following this, 0/1 Adam still enjoys the non-trivial communication reduction without affecting the convergence speed.
+
+`var_update_scaler` is the interval to update the variance. Note that the update policy for variance follows an exponential rule. Formally, if we denote $k_j$ as the step where $j$-th variance update takes place, then it follows that $k_{j+1} - k_j = 2\cdot\exp\{\lfloor j/\kappa\rfloor\}$ (please refer to the [0/1 Adam paper](https://arxiv.org/abs/2202.06009) for detailed explanation), and the `var_update_scaler` denotes the $\kappa$ factor in such expression.
+In practice, we found its default value (16) is able to work well on most of the tasks, including BERT-Base/Large pretraining, GPT pretraining, and ImageNet training.
+
+`local_step_scaler` and `local_step_clipper` are two hyperparameters for learning rate based local step policy in 0/1 Adam. Formally, if we denote $k_j$ as the step where $j$-th synchronization takes place among all the workers, then it follows that $k_{j+1} - k_j = 2\cdot\exp\{\min(\lfloor j/\alpha\rfloor, \beta )\}$ (please refer to the [0/1 Adam paper](https://arxiv.org/abs/2202.06009) for detailed explanation). Following such notations, `local_step_scaler` and `local_step_clipper` denote the $\alpha$ and $\beta$, respectively. Informally, `local_step_scaler` decides the frequency of synchronization while `local_step_clipper` denotes the maximal local step interval 0/1 Adam can use.
+The learning rate policy is the default policy used in 0/1 Adam, and the value of `local_step_scaler` can be pre-calculated (see [0/1 Adam paper](https://arxiv.org/abs/2202.06009) Section 6). We can also trivially construct other policies by setting these two hyperparameters such as constant local step interval policy by setting `local_step_scaler=1` and `local_step_clipper=constant`.
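+
+To make these rules concrete, the snippet below simply evaluates the variance-update interval formula quoted above for the default `var_update_scaler` of 16 (purely illustrative arithmetic, not the optimizer's internal bookkeeping; the local step policy has the same form with the exponent additionally clipped at `local_step_clipper`):
+
+```python
+import math
+
+kappa = 16  # var_update_scaler
+# Interval between the j-th and (j+1)-th variance update: 2 * exp(floor(j / kappa))
+intervals = [2 * math.exp(j // kappa) for j in range(0, 80, 16)]
+print([round(v, 1) for v in intervals])  # [2.0, 5.4, 14.8, 40.2, 109.2] -> updates become exponentially sparser
+```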
+
+`cuda_aware` is used for MPI-based implementation to indicate that the underlying MPI library supports CUDA-Aware communication. This feature is only supported on systems with InfiniBand interconnect and a CUDA-Aware MPI library like [MVAPICH2-GDR](http://mvapich.cse.ohio-state.edu/userguide/gdr/) or OpenMPI built with CUDA-Aware support. Setting `cuda_aware` to False will allow training on Ethernet based systems. However, the communication will happen using sender as well as receiver side memory copies between CPU and GPU buffers before and after communication.
+
+`comm_backend_name` is used to indicate which backend implementation to use. You can choose between NCCL, MPI-based and compressed implementations by setting `comm_backend_name` to "nccl", "mpi" or "compressed". When using NCCL-based implementation, there is no need to set `cuda_aware`.
+
+#### 1.4.1 Momentum masks for parameters with constant zero gradients
+Because 1-bit compression cannot represent exact zero, the compression error would keep accumulating in the momentum if a parameter has constant zero gradients during training. For example, for BERT pre-training with sequence length 128, `bert.embeddings.position_embeddings.weight` has constant zeros in its gradient and momentum for rows 129 to 512, because it only learns up to sequence length 128 while the model supports up to sequence length 512. Thus in 0/1 Adam we added support for a momentum mask that lets users specify those parameters that have constant exact zeros in their gradients. See the [example script](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/deepspeed_train.py) for how to configure this momentum mask. One thing to note is that we don't use the momentum mask saved in checkpoints, since this mask could change during training (e.g., BERT sequence lengths 128 and 512 require different masks). So you have to provide this mask every time in your training script.
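+
+As an illustration only, a mask for the case described above might be built as follows; `build_position_embedding_mask` is a hypothetical helper, and how the mask is handed to the optimizer is shown in the example script linked above:
+
+```python
+import torch
+
+# Hypothetical sketch: build a momentum mask for a position-embedding table in which
+# only the first `learned_rows` rows ever receive non-zero gradients (e.g., BERT
+# pre-training with seq length 128 while the table supports seq length 512).
+def build_position_embedding_mask(weight: torch.Tensor, learned_rows: int = 128) -> torch.Tensor:
+    mask = torch.zeros_like(weight)
+    mask[:learned_rows, :] = 1.0  # keep momentum only where gradients are actually non-zero
+    return mask
+```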
+
+**Watch out!**
+0/1 Adam relies on a compression-error compensation mechanism to maintain the convergence speed during the compression stage. When loading checkpoints, in addition to resetting the compression errors as in 1-bit Adam, we also need to reset the local step buffer, since the local step buffer can fail to capture the training dynamics if the checkpoints are loaded by a different number of nodes (GPUs).
+{: .notice--warning}
+
+## 2. BERT Pre-training with 0/1 Adam
+For data downloading and pre-processing, please refer to the [BERT Pre-training tutorial](/tutorials/bert-pretraining/).
+
+### 2.1 Running Pre-training with DeepSpeed and 0/1 Adam
+
+We provide example scripts under [DeepSpeedExamples/bing_bert/01_adam/](https://github.com/microsoft/DeepSpeedExamples/tree/master/bing_bert/01_adam). There are 3 sets of scripts corresponding to the NCCL-based implementation, the MPI-based implementation on Ethernet systems, and the MPI-based implementation on InfiniBand systems. For the MPI-based implementation, we provide example scripts for launching with either the `deepspeed` launcher or `mpirun`.
+
+### 2.2 Configuration for BERT Pre-training with DeepSpeed and 0/1 Adam enabled
+
+The `deepspeed_bsz4k_01adam_config_seq128_*.json` and `deepspeed_bsz4k_01adam_config_seq512_*.json` files give the user the ability to specify DeepSpeed
+options in terms of batch size, micro batch size, optimizer, learning rate, and other parameters. In these files we include the tuned hyperparameters to reproduce experiments in our [paper](https://arxiv.org/abs/2202.06009).
+
+### 2.3 Performance Results for BERT Pre-training
+
+Performance results can be seen in our [paper](https://arxiv.org/abs/2202.06009).
+
+### 2.4 GLUE Fine-tuning
+We additionally provide fine-tuning scripts for BERT pre-training checkpoints on the [GLUE tasks](https://gluebenchmark.com/). The scripts are available at [DeepSpeedExamples/BingBertGlue](https://github.com/microsoft/DeepSpeedExamples/tree/master/BingBertGlue). The `glue_bert_base.json` and `glue_bert_large.json` files give the user the ability to specify DeepSpeed
+options/parameters like micro batch size for the BERT-base and BERT-large checkpoints, respectively. Currently we use Adam as the default optimizer for GLUE fine-tuning, since the fine-tuning tasks usually use small batch sizes (~32) and do not require large-scale systems. `run_glue_bert_base_finetune.sh` and `run_glue_bert_large_finetune.sh` are the scripts for launching the fine-tuning tasks, where we can modify variables like the task name, number of epochs, model, etc. Note that to launch the fine-tuning we must specify the checkpoint path, for instance:
+```
+bash run_glue_bert_base_finetune.sh
+```
+Specific GLUE scores and hyperparameters for 0/1 Adam are included in our [paper](https://arxiv.org/abs/2202.06009) Table 1.
diff --git a/deepspeed/docs/index.md b/deepspeed/docs/index.md
new file mode 100644
index 000000000000..127c7226e6d4
--- /dev/null
+++ b/deepspeed/docs/index.md
@@ -0,0 +1,176 @@
+---
+layout: single
+toc: true
+toc_label: "Contents"
+title: "Latest News"
+
+---
+ DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat).
+
+* [2024/08] [DeepNVMe: Improving DL Applications through I/O Optimizations](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-gds/README.md)[[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-gds/japanese/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-gds/chinese/README.md)]
+* [2024/07] [DeepSpeed Universal Checkpointing: Efficient and Flexible Checkpointing for Large Scale Distributed Training](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/README.md)[[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/japanese/README.md)]
+* [2024/03] [DeepSpeed-FP6: The Power of FP6-Centric Serving for Large Language Models](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md)]
+* [2024/01] [DeepSpeed-FastGen: Introducing Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19)
+* [2023/11] [Llama 2 Inference on 4th Gen Intel® Xeon® Scalable Processor with DeepSpeed](https://github.com/microsoft/DeepSpeed/tree/master/blogs/intel-inference) [[Intel version]](https://www.intel.com/content/www/us/en/developer/articles/technical/xllama-2-on-xeon-scalable-processor-with-deepspeed.html)
+
+
+
+# Extreme Speed and Scale for DL Training and Inference
+
+ ***[DeepSpeed](https://www.deepspeed.ai/) enables the world's most powerful language models, such as [MT-530B](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/) and [BLOOM](https://huggingface.co/blog/bloom-megatron-deepspeed)***. It is an easy-to-use deep learning optimization software suite that powers unprecedented scale and speed for both training and inference. With DeepSpeed you can:
+
+* Train/Inference dense or sparse models with billions or trillions of parameters
+* Achieve excellent system throughput and efficiently scale to thousands of GPUs
+* Train/Inference on resource-constrained GPU systems
+* Achieve unprecedented low latency and high throughput for inference
+* Achieve extreme compression for unparalleled inference latency and model size reduction at low cost
+
+
+# DeepSpeed has four innovation pillars:
+
+[![Four innovation pillars](/assets/images/DeepSpeed-pillars.png){: .align-center}](https://deepspeed4science.ai/)
+
+
+## DeepSpeed-Training
+
+DeepSpeed offers a confluence of system innovations that have made large-scale DL training effective and efficient, greatly improved ease of use, and redefined the DL training landscape in terms of the scale that is possible. Innovations such as ZeRO, 3D-Parallelism, DeepSpeed-MoE, and ZeRO-Infinity fall under the DeepSpeed-Training pillar. Learn more: [DeepSpeed-Training](https://www.deepspeed.ai/training)
+
+## DeepSpeed-Inference
+
+DeepSpeed brings together innovations in parallelism technology such as tensor, pipeline, expert and ZeRO-parallelism, and combines them with high-performance custom inference kernels, communication optimizations and heterogeneous memory technologies to enable inference at an unprecedented scale, while achieving unparalleled latency, throughput and cost reduction. This systematic composition of system technologies for inference falls under the DeepSpeed-Inference pillar. Learn more: [DeepSpeed-Inference](https://www.deepspeed.ai/inference)
+
+## DeepSpeed-Compression
+
+To further increase the inference efficiency, DeepSpeed offers easy-to-use and flexible-to-compose compression techniques for researchers and practitioners to compress their models while delivering faster speed, smaller model size, and significantly reduced compression cost. Moreover, SoTA innovations on compression like ZeroQuant and XTC are included under the DeepSpeed-Compression pillar. Learn more: [DeepSpeed-Compression](https://www.deepspeed.ai/compression)
+
+## DeepSpeed4Science
+
+In line with Microsoft's mission to solve humanity's most pressing challenges, the DeepSpeed team at Microsoft is responding to this opportunity by launching a new initiative called *DeepSpeed4Science*, aiming to build unique capabilities through AI system technology innovations to help domain experts to unlock today's biggest science mysteries. Learn more: [DeepSpeed4Science website](https://deepspeed4science.ai/) and [tutorials](/deepspeed4science/)
+
+# DeepSpeed Software Suite
+
+## DeepSpeed Library
+
+ The [DeepSpeed](https://github.com/microsoft/deepspeed) library implements and packages the innovations and technologies in DeepSpeed Training, Inference and Compression Pillars into a single easy-to-use, open-sourced repository. It allows for an easy composition of a multitude of features within a single training, inference or compression pipeline. The DeepSpeed Library is heavily adopted by the DL community, and has been used to enable some of the most powerful models (see [DeepSpeed Adoption](#deepspeed-adoption)).
+
+## Model Implementations for Inference (MII)
+
+ [Model Implementations for Inference (MII)](https://github.com/microsoft/deepspeed-mii) is an open-sourced repository for making low-latency and high-throughput inference accessible to all data scientists by alleviating the need to apply complex system optimization techniques themselves. Out of the box, MII offers support for thousands of widely used DL models, optimized using DeepSpeed-Inference, that can be deployed with a few lines of code, while achieving significant latency reduction compared to their vanilla open-sourced versions.
+
+## DeepSpeed on Azure
+
+ DeepSpeed users are diverse and have access to different environments. We recommend trying DeepSpeed on Azure as it is the simplest and easiest method. The recommended method to try DeepSpeed on Azure is through AzureML [recipes](https://github.com/Azure/azureml-examples/tree/main/python-sdk/workflows/train/deepspeed). The job submission and data preparation scripts have been made available [here](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/azureml). For more details on how to use DeepSpeed on Azure, please follow the [Azure tutorial](https://www.deepspeed.ai/tutorials/azure/).
+
+# DeepSpeed Adoption
+
+DeepSpeed has been used to train many different large-scale models. Below is a list of several examples that we are aware of (if you'd like to include your model please submit a PR):
+
+ * [Megatron-Turing NLG (530B)](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/)
+ * [Jurassic-1 (178B)](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf)
+ * [BLOOM (176B)](https://huggingface.co/blog/bloom-megatron-deepspeed)
+ * [GLM (130B)](https://github.com/THUDM/GLM-130B)
+ * [YaLM (100B)](https://github.com/yandex/YaLM-100B)
+ * [GPT-NeoX (20B)](https://github.com/EleutherAI/gpt-neox)
+ * [AlexaTM (20B)](https://www.amazon.science/blog/20b-parameter-alexa-model-sets-new-marks-in-few-shot-learning)
+ * [Turing NLG (17B)](https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft/)
+ * [METRO-LM (5.4B)](https://arxiv.org/pdf/2204.06644.pdf)
+
+DeepSpeed has been integrated with several different popular open-source DL frameworks such as:
+
+| Framework | Documentation |
+| ------------------------- | -------------------------------------------- |
+| Hugging Face Transformers | [Transformers with DeepSpeed](https://huggingface.co/docs/transformers/main/main_classes/deepspeed) |
+| Hugging Face Accelerate | [Accelerate with DeepSpeed](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) |
+| PyTorch Lightning | [Lightning with DeepSpeed](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.strategies.DeepSpeedStrategy.html) |
+| MosaicML | [MosaicML with DeepSpeed](https://docs.mosaicml.com/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration) |
+
+DeepSpeed is an integral part of [Microsoft’s AI at Scale initiative](https://www.microsoft.com/en-us/research/project/ai-at-scale/) to enable next-generation AI capabilities at scale.
+
+
+# Contributing
+DeepSpeed welcomes your contributions! Please see our
+[contributing](/contributing/) guide for more details on formatting, testing,
+etc.
+
+## Contributor License Agreement
+This project welcomes contributions and suggestions. Most contributions require you to
+agree to a Contributor License Agreement (CLA) declaring that you have the right to, and
+actually do, grant us the rights to use your contribution. For details, visit
+https://cla.opensource.microsoft.com.
+
+When you submit a pull request, a CLA bot will automatically determine whether you need
+to provide a CLA and decorate the PR appropriately (e.g., status check, comment). Simply
+follow the instructions provided by the bot. You will only need to do this once across
+all repos using our CLA.
+
+## Code of Conduct
+This project has adopted the [Microsoft Open Source Code of
+Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the
+[Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact
+[opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or
+comments.
+
+# Publications
+1. Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, Yuxiong He. (2019) ZeRO: memory optimizations toward training trillion parameter models. [arXiv:1910.02054](https://arxiv.org/abs/1910.02054) and [In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC '20)](https://dl.acm.org/doi/10.5555/3433701.3433727).
+2. Jeff Rasley, Samyam Rajbhandari, Olatunji Ruwase, and Yuxiong He. (2020) DeepSpeed: System Optimizations Enable Training Deep Learning Models with Over 100 Billion Parameters. [In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (KDD '20, Tutorial)](https://dl.acm.org/doi/10.1145/3394486.3406703).
+3. Minjia Zhang, Yuxiong He. (2020) Accelerating Training of Transformer-Based Language Models with Progressive Layer Dropping. [arXiv:2010.13369](https://arxiv.org/abs/2010.13369) and [NeurIPS 2020](https://proceedings.neurips.cc/paper/2020/hash/a1140a3d0df1c81e24ae954d935e8926-Abstract.html).
+4. Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, Yuxiong He. (2021) ZeRO-Offload: Democratizing Billion-Scale Model Training. [arXiv:2101.06840](https://arxiv.org/abs/2101.06840) and [USENIX ATC 2021](https://www.usenix.org/conference/atc21/presentation/ren-jie). [[paper]](https://arxiv.org/abs/2101.06840) [[slides]](https://www.usenix.org/system/files/atc21_slides_ren-jie.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/)
+5. Hanlin Tang, Shaoduo Gan, Ammar Ahmad Awan, Samyam Rajbhandari, Conglong Li, Xiangru Lian, Ji Liu, Ce Zhang, Yuxiong He. (2021) 1-bit Adam: Communication Efficient Large-Scale Training with Adam's Convergence Speed. [arXiv:2102.02888](https://arxiv.org/abs/2102.02888) and [ICML 2021](http://proceedings.mlr.press/v139/tang21a.html).
+6. Samyam Rajbhandari, Olatunji Ruwase, Jeff Rasley, Shaden Smith, Yuxiong He. (2021) ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning. [arXiv:2104.07857](https://arxiv.org/abs/2104.07857) and [SC 2021](https://dl.acm.org/doi/abs/10.1145/3458817.3476205). [[paper]](https://arxiv.org/abs/2104.07857) [[slides]](https://github.com/microsoft/DeepSpeed/blob/master/docs/assets/files/SC21-ZeRO-Infinity.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/)
+7. Conglong Li, Ammar Ahmad Awan, Hanlin Tang, Samyam Rajbhandari, Yuxiong He. (2021) 1-bit LAMB: Communication Efficient Large-Scale Large-Batch Training with LAMB's Convergence Speed. [arXiv:2104.06069](https://arxiv.org/abs/2104.06069) and [HiPC 2022](https://hipc.org/advance-program/).
+8. Conglong Li, Minjia Zhang, Yuxiong He. (2021) The Stability-Efficiency Dilemma: Investigating Sequence Length Warmup for Training GPT Models. [arXiv:2108.06084](https://arxiv.org/abs/2108.06084) and [NeurIPS 2022](https://openreview.net/forum?id=JpZ5du_Kdh).
+9. Yucheng Lu, Conglong Li, Minjia Zhang, Christopher De Sa, Yuxiong He. (2022) Maximizing Communication Efficiency for Large-scale Training via 0/1 Adam. [arXiv:2202.06009](https://arxiv.org/abs/2202.06009).
+10. Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza Yazdani Aminabadi, Ammar Ahmad Awan, Jeff Rasley, Yuxiong He. (2022) DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale [arXiv:2201.05596](https://arxiv.org/abs/2201.05596) and [ICML 2022](https://proceedings.mlr.press/v162/rajbhandari22a.html). [[pdf]](https://arxiv.org/abs/2201.05596) [[slides]](https://github.com/microsoft/DeepSpeed/blob/master/docs/assets/files/ICML-5mins.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/deepspeed-advancing-moe-inference-and-training-to-power-next-generation-ai-scale/)
+11. Shaden Smith, Mostofa Patwary, Brandon Norick, Patrick LeGresley, Samyam Rajbhandari, Jared Casper, Zhun Liu, Shrimai Prabhumoye, George Zerveas, Vijay Korthikanti, Elton Zhang, Rewon Child, Reza Yazdani Aminabadi, Julie Bernauer, Xia Song, Mohammad Shoeybi, Yuxiong He, Michael Houston, Saurabh Tiwary, Bryan Catanzaro. (2022) Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model [arXiv:2201.11990](https://arxiv.org/abs/2201.11990).
+12. Xiaoxia Wu, Zhewei Yao, Minjia Zhang, Conglong Li, Yuxiong He. (2022) Extreme Compression for Pre-trained Transformers Made Simple and Efficient. [arXiv:2206.01859](https://arxiv.org/abs/2206.01859) and [NeurIPS 2022](https://openreview.net/forum?id=xNeAhc2CNAl).
+13. Zhewei Yao, Reza Yazdani Aminabadi, Minjia Zhang, Xiaoxia Wu, Conglong Li, Yuxiong He. (2022) ZeroQuant: Efficient and Affordable Post-Training Quantization for Large-Scale Transformers. [arXiv:2206.01861](https://arxiv.org/abs/2206.01861) and [NeurIPS 2022](https://openreview.net/forum?id=f-fVCElZ-G1) [[slides]](https://github.com/microsoft/DeepSpeed/blob/master/docs/assets/files/zeroquant_series.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/deepspeed-compression-a-composable-library-for-extreme-compression-and-zero-cost-quantization/)
+14. Reza Yazdani Aminabadi, Samyam Rajbhandari, Minjia Zhang, Ammar Ahmad Awan, Cheng Li, Du Li, Elton Zheng, Jeff Rasley, Shaden Smith, Olatunji Ruwase, Yuxiong He. (2022) DeepSpeed Inference: Enabling Efficient Inference of Transformer Models at Unprecedented Scale. [arXiv:2207.00032](https://arxiv.org/abs/2207.00032) and [SC 2022](https://dl.acm.org/doi/abs/10.5555/3571885.3571946). [[paper]](https://arxiv.org/abs/2207.00032) [[slides]](https://github.com/microsoft/DeepSpeed/blob/master/docs/assets/files/sc22-ds-inference.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/deepspeed-accelerating-large-scale-model-inference-and-training-via-system-optimizations-and-compression/)
+15. Zhewei Yao, Xiaoxia Wu, Conglong Li, Connor Holmes, Minjia Zhang, Cheng Li, Yuxiong He. (2022) Random-LTD: Random and Layerwise Token Dropping Brings Efficient Training for Large-scale Transformers. [arXiv:2211.11586](https://arxiv.org/abs/2211.11586).
+16. Conglong Li, Zhewei Yao, Xiaoxia Wu, Minjia Zhang, Yuxiong He. (2022) DeepSpeed Data Efficiency: Improving Deep Learning Model Quality and Training Efficiency via Efficient Data Sampling and Routing. [arXiv:2212.03597](https://arxiv.org/abs/2212.03597) [ENLSP2023 Workshop at NeurIPS2023](https://neurips2023-enlsp.github.io/)
+17. Xiaoxia Wu, Cheng Li, Reza Yazdani Aminabadi, Zhewei Yao, Yuxiong He. (2023) Understanding INT4 Quantization for Transformer Models: Latency Speedup, Composability, and Failure Cases. [arXiv:2301.12017](https://arxiv.org/abs/2301.12017) and [ICML2023](https://icml.cc/Conferences/2023).
+18. Syed Zawad, Cheng Li, Zhewei Yao, Elton Zheng, Yuxiong He, Feng Yan. (2023) DySR: Adaptive Super-Resolution via Algorithm and System Co-design. [ICLR:2023](https://openreview.net/forum?id=Pgtn4l6eKjv).
+19. Sheng Shen, Zhewei Yao, Chunyuan Li, Trevor Darrell, Kurt Keutzer, Yuxiong He. (2023) Scaling Vision-Language Models with Sparse Mixture of Experts. [arXiv:2303.07226](https://arxiv.org/abs/2303.07226) and [Finding at EMNLP2023](https://2023.emnlp.org/).
+20. Quentin Anthony, Ammar Ahmad Awan, Jeff Rasley, Yuxiong He, Aamir Shafi, Mustafa Abduljabbar, Hari Subramoni, Dhabaleswar Panda. (2023) MCR-DL: Mix-and-Match Communication Runtime for Deep Learning [arXiv:2303.08374](https://arxiv.org/abs/2303.08374) and will appear at IPDPS 2023.
+21. Siddharth Singh, Olatunji Ruwase, Ammar Ahmad Awan, Samyam Rajbhandari, Yuxiong He, Abhinav Bhatele. (2023) A Hybrid Tensor-Expert-Data Parallelism Approach to Optimize Mixture-of-Experts Training [arXiv:2303.06318](https://arxiv.org/abs/2303.06318) and will appear at ICS 2023.
+22. Guanhua Wang, Heyang Qin, Sam Ade Jacobs, Xiaoxia Wu, Connor Holmes, Zhewei Yao, Samyam Rajbhandari, Olatunji Ruwase, Feng Yan, Lei Yang, Yuxiong He. (2023) ZeRO++: Extremely Efficient Collective Communication for Giant Model Training [arXiv:2306.10209](https://arxiv.org/abs/2306.10209) and [ML for Sys Workshop at NeurIPS2023](http://mlforsystems.org/) [[blog]](https://www.microsoft.com/en-us/research/blog/deepspeed-zero-a-leap-in-speed-for-llm-and-chat-model-training-with-4x-less-communication/)
+23. Zhewei Yao, Xiaoxia Wu, Cheng Li, Stephen Youn, Yuxiong He. (2023) ZeroQuant-V2: Exploring Post-training Quantization in LLMs from Comprehensive Study to Low Rank Compensation [arXiv:2303.08302](https://arxiv.org/abs/2303.08302) and [ENLSP2023 Workshop at NeurIPS2023](https://neurips2023-enlsp.github.io/) [[slides]](https://github.com/microsoft/DeepSpeed/blob/master/docs/assets/files/zeroquant_series.pdf)
+24. Pareesa Ameneh Golnari, Zhewei Yao, Yuxiong He. (2023) Selective Guidance: Are All the Denoising Steps of Guided Diffusion Important? [arXiv:2305.09847](https://arxiv.org/abs/2305.09847)
+25. Zhewei Yao, Reza Yazdani Aminabadi, Olatunji Ruwase, Samyam Rajbhandari, Xiaoxia Wu, Ammar Ahmad Awan, Jeff Rasley, Minjia Zhang, Conglong Li, Connor Holmes, Zhongzhu Zhou, Michael Wyatt, Molly Smith, Lev Kurilenko, Heyang Qin, Masahiro Tanaka, Shuai Che, Shuaiwen Leon Song, Yuxiong He. (2023) DeepSpeed-Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales [arXiv:2308.01320](https://arxiv.org/abs/2308.01320).
+26. Xiaoxia Wu, Zhewei Yao, Yuxiong He. (2023) ZeroQuant-FP: A Leap Forward in LLMs Post-Training W4A8 Quantization Using Floating-Point Formats [arXiv:2307.09782](https://arxiv.org/abs/2307.09782) and [ENLSP2023 Workshop at NeurIPS2023](https://neurips2023-enlsp.github.io/) [[slides]](https://github.com/microsoft/DeepSpeed/blob/master/docs/assets/files/zeroquant_series.pdf)
+27. Zhewei Yao, Xiaoxia Wu, Conglong Li, Minjia Zhang, Heyang Qin, Olatunji Ruwase, Ammar Ahmad Awan, Samyam Rajbhandari, Yuxiong He. (2023) DeepSpeed-VisualChat: Multi-Round Multi-Image Interleave Chat via Multi-Modal Causal Attention [arXiv:2309.14327](https://arxiv.org/pdf/2309.14327.pdf)
+28. Shuaiwen Leon Song, Bonnie Kruft, Minjia Zhang, Conglong Li, Shiyang Chen, Chengming Zhang, Masahiro Tanaka, Xiaoxia Wu, Jeff Rasley, Ammar Ahmad Awan, Connor Holmes, Martin Cai, Adam Ghanem, Zhongzhu Zhou, Yuxiong He, et al. (2023) DeepSpeed4Science Initiative: Enabling Large-Scale Scientific Discovery through Sophisticated AI System Technologies [arXiv:2310.04610](https://arxiv.org/abs/2310.04610) [[blog]](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)
+29. Zhewei Yao, Reza Yazdani Aminabadi, Stephen Youn, Xiaoxia Wu, Elton Zheng, Yuxiong He. (2023) ZeroQuant-HERO: Hardware-Enhanced Robust Optimized Post-Training Quantization Framework for W8A8 Transformers [arXiv:2310.17723](https://arxiv.org/abs/2310.17723)
+30. Sam Ade Jacobs, Masahiro Tanaka, Chengming Zhang, Minjia Zhang, Reza Yazdani Aminadabi, Shuaiwen Leon Song, Samyam Rajbhandari, Yuxiong He. (2024) [System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models](https://dl.acm.org/doi/10.1145/3662158.3662806)
+31. Xinyu Lian, Sam Ade Jacobs, Lev Kurilenko, Masahiro Tanaka, Stas Bekman, Olatunji Ruwase, Minjia Zhang. (2024) Universal Checkpointing: Efficient and Flexible Checkpointing for Large Scale Distributed Training [arXiv:2406.18820](https://arxiv.org/abs/2406.18820)
+
+# Videos
+1. DeepSpeed KDD 2020 Tutorial
+ 1. [Overview](https://www.youtube.com/watch?v=CaseqC45DNc&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=29)
+ 2. [ZeRO + large model training](https://www.youtube.com/watch?v=y4_bCiAsIAk&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=28)
+ 3. [17B T-NLG demo](https://www.youtube.com/watch?v=9V-ZbP92drg&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=27)
+ 4. [Fastest BERT training + RScan tuning](https://www.youtube.com/watch?v=o1K-ZG9F6u0&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=26)
+ 5. DeepSpeed hands on deep dive: [part 1](https://www.youtube.com/watch?v=_NOk-mBwDYg&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=92), [part 2](https://www.youtube.com/watch?v=sG6_c4VXLww&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=94), [part 3](https://www.youtube.com/watch?v=k9yPkBTayos&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=93)
+ 6. [FAQ](https://www.youtube.com/watch?v=nsHu6vEgPew&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=24)
+2. Microsoft Research Webinar
+ * Registration is free and all videos are available on-demand.
+ * [ZeRO & Fastest BERT: Increasing the scale and speed of deep learning training in DeepSpeed](https://note.microsoft.com/MSR-Webinar-DeepSpeed-Registration-On-Demand.html).
+3. [DeepSpeed on AzureML](https://youtu.be/yBVXR8G8Bg8)
+4. [Large Model Training and Inference with DeepSpeed // Samyam Rajbhandari // LLMs in Prod Conference](https://www.youtube.com/watch?v=cntxC3g22oU) [[slides]](docs/assets/files/presentation-mlops.pdf)
+5. Community Tutorials
+ * [DeepSpeed: All the tricks to scale to gigantic models (Mark Saroufim)](https://www.youtube.com/watch?v=pDGI668pNg0)
+ * [Turing-NLG, DeepSpeed and the ZeRO optimizer (Yannic Kilcher)](https://www.youtube.com/watch?v=tC01FRB0M7w)
+ * [Ultimate Guide To Scaling ML Models (The AI Epiphany)](https://www.youtube.com/watch?v=hc0u4avAkuM)
diff --git a/deepspeed/inference/v2/model_implementations/qwen_v2_moe/__init__.py b/deepspeed/inference/v2/model_implementations/qwen_v2_moe/__init__.py
index 23e06a770023..902d282b8584 100644
--- a/deepspeed/inference/v2/model_implementations/qwen_v2_moe/__init__.py
+++ b/deepspeed/inference/v2/model_implementations/qwen_v2_moe/__init__.py
@@ -3,4 +3,14 @@
# DeepSpeed Team
 from .policy import Qwen2MoePolicy
diff --git a/deepspeed/op_builder/builder.py b/deepspeed/op_builder/builder.py
new file mode 100644
index 000000000000..8998fc0eddb8
--- /dev/null
+++ b/deepspeed/op_builder/builder.py
@@ -0,0 +1,848 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import os
+import re
+import sys
+import time
+import importlib
+from pathlib import Path
+import subprocess
+import shlex
+import shutil
+import tempfile
+import distutils.ccompiler
+import distutils.log
+import distutils.sysconfig
+from distutils.errors import CompileError, LinkError
+from abc import ABC, abstractmethod
+from typing import List
+
+YELLOW = '\033[93m'
+END = '\033[0m'
+WARNING = f"{YELLOW} [WARNING] {END}"
+
+DEFAULT_TORCH_EXTENSION_PATH = "/tmp/torch_extensions"
+DEFAULT_COMPUTE_CAPABILITIES = "6.0;6.1;7.0"
+
+try:
+ import torch
+except ImportError:
+ print(f"{WARNING} unable to import torch, please install it if you want to pre-compile any deepspeed ops.")
+else:
+ TORCH_MAJOR = int(torch.__version__.split('.')[0])
+ TORCH_MINOR = int(torch.__version__.split('.')[1])
+
+
+class MissingCUDAException(Exception):
+ pass
+
+
+class CUDAMismatchException(Exception):
+ pass
+
+
+def installed_cuda_version(name=""):
+ import torch.utils.cpp_extension
+ cuda_home = torch.utils.cpp_extension.CUDA_HOME
+ if cuda_home is None:
+ raise MissingCUDAException("CUDA_HOME does not exist, unable to compile CUDA op(s)")
+ # Ensure there is not a cuda version mismatch between torch and nvcc compiler
+ output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"], universal_newlines=True)
+ output_split = output.split()
+ release_idx = output_split.index("release")
+ release = output_split[release_idx + 1].replace(',', '').split(".")
+ # Ignore patch versions, only look at major + minor
+ cuda_major, cuda_minor = release[:2]
+ return int(cuda_major), int(cuda_minor)
+
+
+def get_default_compute_capabilities():
+ compute_caps = DEFAULT_COMPUTE_CAPABILITIES
+ import torch.utils.cpp_extension
+ if torch.utils.cpp_extension.CUDA_HOME is not None and installed_cuda_version()[0] >= 11:
+ if installed_cuda_version()[0] == 11 and installed_cuda_version()[1] == 0:
+ # Special treatment of CUDA 11.0 because compute_86 is not supported.
+ compute_caps += ";8.0"
+ else:
+ compute_caps += ";8.0;8.6"
+ return compute_caps
+
+
+# list compatible minor CUDA versions - so that for example pytorch built with cuda-11.0 can be used
+# to build deepspeed and system-wide installed cuda 11.2
+cuda_minor_mismatch_ok = {
+ 10: ["10.0", "10.1", "10.2"],
+ 11: ["11.0", "11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8"],
+ 12: ["12.0", "12.1", "12.2", "12.3", "12.4", "12.5"],
+}
+
+
+def assert_no_cuda_mismatch(name=""):
+ cuda_major, cuda_minor = installed_cuda_version(name)
+ sys_cuda_version = f'{cuda_major}.{cuda_minor}'
+ torch_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
+ # This is a show-stopping error, should probably not proceed past this
+ if sys_cuda_version != torch_cuda_version:
+ if (cuda_major in cuda_minor_mismatch_ok and sys_cuda_version in cuda_minor_mismatch_ok[cuda_major]
+ and torch_cuda_version in cuda_minor_mismatch_ok[cuda_major]):
+ print(f"Installed CUDA version {sys_cuda_version} does not match the "
+ f"version torch was compiled with {torch.version.cuda} "
+ "but since the APIs are compatible, accepting this combination")
+ return True
+ elif os.getenv("DS_SKIP_CUDA_CHECK", "0") == "1":
+ print(
+ f"{WARNING} DeepSpeed Op Builder: Installed CUDA version {sys_cuda_version} does not match the "
+ f"version torch was compiled with {torch.version.cuda}."
+                " Detected `DS_SKIP_CUDA_CHECK=1`: Allowing this combination of CUDA, but it may result in unexpected behavior."
+ )
+ return True
+ raise CUDAMismatchException(
+ f">- DeepSpeed Op Builder: Installed CUDA version {sys_cuda_version} does not match the "
+ f"version torch was compiled with {torch.version.cuda}, unable to compile "
+ "cuda/cpp extensions without a matching cuda version.")
+ return True
+
+
+class OpBuilder(ABC):
+ _rocm_version = None
+ _rocm_gpu_arch = None
+ _rocm_wavefront_size = None
+ _is_rocm_pytorch = None
+ _is_sycl_enabled = None
+ _loaded_ops = {}
+
+ def __init__(self, name):
+ self.name = name
+ self.jit_mode = False
+ self.build_for_cpu = False
+ self.enable_bf16 = False
+ self.error_log = None
+
+ @abstractmethod
+ def absolute_name(self):
+ '''
+ Returns absolute build path for cases where the op is pre-installed, e.g., deepspeed.ops.adam.cpu_adam
+ will be installed as something like: deepspeed/ops/adam/cpu_adam.so
+ '''
+ pass
+
+ @abstractmethod
+ def sources(self):
+ '''
+ Returns list of source files for your op, relative to root of deepspeed package (i.e., DeepSpeed/deepspeed)
+ '''
+ pass
+
+ def hipify_extension(self):
+ pass
+
+ def sycl_extension(self):
+ pass
+
+ @staticmethod
+ def validate_torch_version(torch_info):
+ install_torch_version = torch_info['version']
+ current_torch_version = ".".join(torch.__version__.split('.')[:2])
+ if install_torch_version != current_torch_version:
+ raise RuntimeError("PyTorch version mismatch! DeepSpeed ops were compiled and installed "
+ "with a different version than what is being used at runtime. "
+ f"Please re-install DeepSpeed or switch torch versions. "
+ f"Install torch version={install_torch_version}, "
+ f"Runtime torch version={current_torch_version}")
+
+ @staticmethod
+ def validate_torch_op_version(torch_info):
+ if not OpBuilder.is_rocm_pytorch():
+ current_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
+ install_cuda_version = torch_info['cuda_version']
+ if install_cuda_version != current_cuda_version:
+ raise RuntimeError("CUDA version mismatch! DeepSpeed ops were compiled and installed "
+ "with a different version than what is being used at runtime. "
+ f"Please re-install DeepSpeed or switch torch versions. "
+ f"Install CUDA version={install_cuda_version}, "
+ f"Runtime CUDA version={current_cuda_version}")
+ else:
+ current_hip_version = ".".join(torch.version.hip.split('.')[:2])
+ install_hip_version = torch_info['hip_version']
+ if install_hip_version != current_hip_version:
+ raise RuntimeError("HIP version mismatch! DeepSpeed ops were compiled and installed "
+ "with a different version than what is being used at runtime. "
+ f"Please re-install DeepSpeed or switch torch versions. "
+ f"Install HIP version={install_hip_version}, "
+ f"Runtime HIP version={current_hip_version}")
+
+ @staticmethod
+ def is_rocm_pytorch():
+ if OpBuilder._is_rocm_pytorch is not None:
+ return OpBuilder._is_rocm_pytorch
+
+ _is_rocm_pytorch = False
+ try:
+ import torch
+ except ImportError:
+ pass
+ else:
+ if TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 5):
+ _is_rocm_pytorch = hasattr(torch.version, 'hip') and torch.version.hip is not None
+ if _is_rocm_pytorch:
+ from torch.utils.cpp_extension import ROCM_HOME
+ _is_rocm_pytorch = ROCM_HOME is not None
+ OpBuilder._is_rocm_pytorch = _is_rocm_pytorch
+ return OpBuilder._is_rocm_pytorch
+
+ @staticmethod
+ def is_sycl_enabled():
+ if OpBuilder._is_sycl_enabled is not None:
+ return OpBuilder._is_sycl_enabled
+
+ _is_sycl_enabled = False
+ try:
+ result = subprocess.run(["c2s", "--version"], capture_output=True)
+ except:
+ pass
+ else:
+ _is_sycl_enabled = True
+
+ OpBuilder._is_sycl_enabled = _is_sycl_enabled
+ return OpBuilder._is_sycl_enabled
+
+ @staticmethod
+ def installed_rocm_version():
+ if OpBuilder._rocm_version:
+ return OpBuilder._rocm_version
+
+ ROCM_MAJOR = '0'
+ ROCM_MINOR = '0'
+ ROCM_VERSION_DEV_RAW = ""
+ if OpBuilder.is_rocm_pytorch():
+ from torch.utils.cpp_extension import ROCM_HOME
+ rocm_ver_file = Path(ROCM_HOME).joinpath(".info/version")
+ if rocm_ver_file.is_file():
+ with open(rocm_ver_file, 'r') as file:
+ ROCM_VERSION_DEV_RAW = file.read()
+ elif "rocm" in torch.__version__:
+ ROCM_VERSION_DEV_RAW = torch.__version__.split("rocm")[1]
+ if ROCM_VERSION_DEV_RAW != "":
+ ROCM_MAJOR = ROCM_VERSION_DEV_RAW.split('.')[0]
+ ROCM_MINOR = ROCM_VERSION_DEV_RAW.split('.')[1]
+ else:
+            # Look in /usr/include/rocm_version.h
+ rocm_ver_file = Path("/usr/include/rocm_version.h")
+ if rocm_ver_file.is_file():
+ with open(rocm_ver_file, 'r') as file:
+ for ln in file.readlines():
+ if "#define ROCM_VERSION_MAJOR" in ln:
+ ROCM_MAJOR = re.findall(r'\S+', ln)[2]
+ elif "#define ROCM_VERSION_MINOR" in ln:
+ ROCM_MINOR = re.findall(r'\S+', ln)[2]
+ if ROCM_MAJOR == '0':
+ assert False, "Could not detect ROCm version"
+
+ OpBuilder._rocm_version = (int(ROCM_MAJOR), int(ROCM_MINOR))
+ return OpBuilder._rocm_version
+
+ @staticmethod
+ def get_rocm_gpu_arch():
+ if OpBuilder._rocm_gpu_arch:
+ return OpBuilder._rocm_gpu_arch
+ rocm_info = Path("/opt/rocm/bin/rocminfo")
+ if (not rocm_info.is_file()):
+ rocm_info = Path("rocminfo")
+ rocm_gpu_arch_cmd = str(rocm_info) + " | grep -o -m 1 'gfx.*'"
+ try:
+ result = subprocess.check_output(rocm_gpu_arch_cmd, shell=True)
+ rocm_gpu_arch = result.decode('utf-8').strip()
+ except subprocess.CalledProcessError:
+ rocm_gpu_arch = ""
+ OpBuilder._rocm_gpu_arch = rocm_gpu_arch
+ return OpBuilder._rocm_gpu_arch
+
+ @staticmethod
+ def get_rocm_wavefront_size():
+ if OpBuilder._rocm_wavefront_size:
+ return OpBuilder._rocm_wavefront_size
+
+ rocm_info = Path("/opt/rocm/bin/rocminfo")
+ if (not rocm_info.is_file()):
+ rocm_info = Path("rocminfo")
+ rocm_wavefront_size_cmd = str(
+ rocm_info) + " | grep -Eo -m1 'Wavefront Size:[[:space:]]+[0-9]+' | grep -Eo '[0-9]+'"
+ try:
+ result = subprocess.check_output(rocm_wavefront_size_cmd, shell=True)
+ rocm_wavefront_size = result.decode('utf-8').strip()
+ except subprocess.CalledProcessError:
+ rocm_wavefront_size = "32"
+ OpBuilder._rocm_wavefront_size = rocm_wavefront_size
+ return OpBuilder._rocm_wavefront_size
+
+ def include_paths(self):
+ '''
+ Returns list of include paths, relative to root of deepspeed package (i.e., DeepSpeed/deepspeed)
+ '''
+ return []
+
+ def nvcc_args(self):
+ '''
+ Returns optional list of compiler flags to forward to nvcc when building CUDA sources
+ '''
+ return []
+
+ def cxx_args(self):
+ '''
+ Returns optional list of compiler flags to forward to the build
+ '''
+ return []
+
+ def is_compatible(self, verbose=True):
+ '''
+ Check if all non-python dependencies are satisfied to build this op
+ '''
+ return True
+
+ def extra_ldflags(self):
+ return []
+
+ def has_function(self, funcname, libraries, verbose=False):
+ '''
+ Test for existence of a function within a tuple of libraries.
+
+ This is used as a smoke test to check whether a certain library is available.
+ As a test, this creates a simple C program that calls the specified function,
+ and then distutils is used to compile that program and link it with the specified libraries.
+ Returns True if both the compile and link are successful, False otherwise.
+ '''
+ tempdir = None # we create a temporary directory to hold various files
+ filestderr = None # handle to open file to which we redirect stderr
+ oldstderr = None # file descriptor for stderr
+ try:
+ # Echo compile and link commands that are used.
+ if verbose:
+ distutils.log.set_verbosity(1)
+
+ # Create a compiler object.
+ compiler = distutils.ccompiler.new_compiler(verbose=verbose)
+
+ # Configure compiler and linker to build according to Python install.
+ distutils.sysconfig.customize_compiler(compiler)
+
+ # Create a temporary directory to hold test files.
+ tempdir = tempfile.mkdtemp()
+
+ # Define a simple C program that calls the function in question
+ prog = "void %s(void); int main(int argc, char** argv) { %s(); return 0; }" % (funcname, funcname)
+
+ # Write the test program to a file.
+ filename = os.path.join(tempdir, 'test.c')
+ with open(filename, 'w') as f:
+ f.write(prog)
+
+ # Redirect stderr file descriptor to a file to silence compile/link warnings.
+ if not verbose:
+ filestderr = open(os.path.join(tempdir, 'stderr.txt'), 'w')
+ oldstderr = os.dup(sys.stderr.fileno())
+ os.dup2(filestderr.fileno(), sys.stderr.fileno())
+
+ # Workaround for behavior in distutils.ccompiler.CCompiler.object_filenames()
+ # Otherwise, a local directory will be used instead of tempdir
+ drive, driveless_filename = os.path.splitdrive(filename)
+ root_dir = driveless_filename[0] if os.path.isabs(driveless_filename) else ''
+ output_dir = os.path.join(drive, root_dir)
+
+ # Attempt to compile the C program into an object file.
+ cflags = shlex.split(os.environ.get('CFLAGS', ""))
+ objs = compiler.compile([filename], output_dir=output_dir, extra_preargs=self.strip_empty_entries(cflags))
+
+ # Attempt to link the object file into an executable.
+ # Be sure to tack on any libraries that have been specified.
+ ldflags = shlex.split(os.environ.get('LDFLAGS', ""))
+ compiler.link_executable(objs,
+ os.path.join(tempdir, 'a.out'),
+ extra_preargs=self.strip_empty_entries(ldflags),
+ libraries=libraries)
+
+ # Compile and link succeeded
+ return True
+
+ except CompileError:
+ return False
+
+ except LinkError:
+ return False
+
+ except:
+ return False
+
+ finally:
+ # Restore stderr file descriptor and close the stderr redirect file.
+ if oldstderr is not None:
+ os.dup2(oldstderr, sys.stderr.fileno())
+ if filestderr is not None:
+ filestderr.close()
+
+ # Delete the temporary directory holding the test program and stderr files.
+ if tempdir is not None:
+ shutil.rmtree(tempdir)
+
+ def strip_empty_entries(self, args):
+ '''
+ Drop any empty strings from the list of compile and link flags
+ '''
+ return [x for x in args if len(x) > 0]
+
+ def cpu_arch(self):
+ try:
+ from cpuinfo import get_cpu_info
+ except ImportError as e:
+ cpu_info = self._backup_cpuinfo()
+ if cpu_info is None:
+ return "-march=native"
+
+ try:
+ cpu_info = get_cpu_info()
+ except Exception as e:
+ self.warning(f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), "
+ "falling back to `lscpu` to get this information.")
+ cpu_info = self._backup_cpuinfo()
+ if cpu_info is None:
+ return "-march=native"
+
+ if cpu_info['arch'].startswith('PPC_'):
+ # gcc does not provide -march on PowerPC, use -mcpu instead
+ return '-mcpu=native'
+ return '-march=native'
+
+ def is_cuda_enable(self):
+ try:
+ assert_no_cuda_mismatch(self.name)
+ return '-D__ENABLE_CUDA__'
+ except MissingCUDAException:
+ print(f"{WARNING} {self.name} cuda is missing or is incompatible with installed torch, "
+ "only cpu ops can be compiled!")
+ return '-D__DISABLE_CUDA__'
+ return '-D__DISABLE_CUDA__'
+
+ def _backup_cpuinfo(self):
+ # Construct cpu_info dict from lscpu that is similar to what py-cpuinfo provides
+ if not self.command_exists('lscpu'):
+ self.warning(f"{self.name} attempted to query 'lscpu' after failing to use py-cpuinfo "
+ "to detect the CPU architecture. 'lscpu' does not appear to exist on "
+ "your system, will fall back to use -march=native and non-vectorized execution.")
+ return None
+ result = subprocess.check_output('lscpu', shell=True)
+ result = result.decode('utf-8').strip().lower()
+
+ cpu_info = {}
+ cpu_info['arch'] = None
+ cpu_info['flags'] = ""
+ if 'genuineintel' in result or 'authenticamd' in result:
+ cpu_info['arch'] = 'X86_64'
+ if 'avx512' in result:
+ cpu_info['flags'] += 'avx512,'
+ elif 'avx512f' in result:
+ cpu_info['flags'] += 'avx512f,'
+ if 'avx2' in result:
+ cpu_info['flags'] += 'avx2'
+ elif 'ppc64le' in result:
+ cpu_info['arch'] = "PPC_"
+
+ return cpu_info
+
+ def simd_width(self):
+ try:
+ from cpuinfo import get_cpu_info
+ except ImportError as e:
+ cpu_info = self._backup_cpuinfo()
+ if cpu_info is None:
+ return '-D__SCALAR__'
+
+ try:
+ cpu_info = get_cpu_info()
+ except Exception as e:
+ self.warning(f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), "
+ "falling back to `lscpu` to get this information.")
+ cpu_info = self._backup_cpuinfo()
+ if cpu_info is None:
+ return '-D__SCALAR__'
+
+ if cpu_info['arch'] == 'X86_64':
+ if 'avx512' in cpu_info['flags'] or 'avx512f' in cpu_info['flags']:
+ return '-D__AVX512__'
+ elif 'avx2' in cpu_info['flags']:
+ return '-D__AVX256__'
+ return '-D__SCALAR__'
+
+ def command_exists(self, cmd):
+ if '|' in cmd:
+ cmds = cmd.split("|")
+ else:
+ cmds = [cmd]
+ valid = False
+ for cmd in cmds:
+ result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True)
+ valid = valid or result.wait() == 0
+
+ if not valid and len(cmds) > 1:
+ print(f"{WARNING} {self.name} requires one of the following commands '{cmds}', but it does not exist!")
+ elif not valid and len(cmds) == 1:
+ print(f"{WARNING} {self.name} requires the '{cmd}' command, but it does not exist!")
+ return valid
+
+ def warning(self, msg):
+ self.error_log = f"{msg}"
+ print(f"{WARNING} {msg}")
+
+ def deepspeed_src_path(self, code_path):
+ if os.path.isabs(code_path):
+ return code_path
+ else:
+ return os.path.join(Path(__file__).parent.parent.absolute(), code_path)
+
+ def builder(self):
+ from torch.utils.cpp_extension import CppExtension
+ include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())]
+ return CppExtension(name=self.absolute_name(),
+ sources=self.strip_empty_entries(self.sources()),
+ include_dirs=include_dirs,
+ extra_compile_args={'cxx': self.strip_empty_entries(self.cxx_args())},
+ extra_link_args=self.strip_empty_entries(self.extra_ldflags()))
+
+ def load(self, verbose=True):
+ if self.name in __class__._loaded_ops:
+ return __class__._loaded_ops[self.name]
+
+ from deepspeed.git_version_info import installed_ops, torch_info, accelerator_name
+ from deepspeed.accelerator import get_accelerator
+ if installed_ops.get(self.name, False) and accelerator_name == get_accelerator()._name:
+ # Ensure the op we're about to load was compiled with the same
+ # torch/cuda versions we are currently using at runtime.
+ self.validate_torch_version(torch_info)
+ if torch.cuda.is_available() and isinstance(self, CUDAOpBuilder):
+ self.validate_torch_op_version(torch_info)
+
+ op_module = importlib.import_module(self.absolute_name())
+ __class__._loaded_ops[self.name] = op_module
+ return op_module
+ else:
+ return self.jit_load(verbose)
+
+ def jit_load(self, verbose=True):
+ if not self.is_compatible(verbose):
+ raise RuntimeError(
+ f"Unable to JIT load the {self.name} op due to it not being compatible due to hardware/software issue. {self.error_log}"
+ )
+ try:
+ import ninja # noqa: F401 # type: ignore
+ except ImportError:
+ raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")
+
+ if isinstance(self, CUDAOpBuilder) and not self.is_rocm_pytorch():
+ self.build_for_cpu = not torch.cuda.is_available()
+
+ self.jit_mode = True
+ from torch.utils.cpp_extension import load
+
+ start_build = time.time()
+ sources = [os.path.abspath(self.deepspeed_src_path(path)) for path in self.sources()]
+ extra_include_paths = [os.path.abspath(self.deepspeed_src_path(path)) for path in self.include_paths()]
+
+ # Torch will try and apply whatever CCs are in the arch list at compile time,
+ # we have already set the intended targets ourselves we know that will be
+ # needed at runtime. This prevents CC collisions such as multiple __half
+ # implementations. Stash arch list to reset after build.
+ torch_arch_list = None
+ if "TORCH_CUDA_ARCH_LIST" in os.environ:
+ torch_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST")
+ os.environ["TORCH_CUDA_ARCH_LIST"] = ""
+
+ nvcc_args = self.strip_empty_entries(self.nvcc_args())
+ cxx_args = self.strip_empty_entries(self.cxx_args())
+
+ if isinstance(self, CUDAOpBuilder):
+ if not self.build_for_cpu and self.enable_bf16:
+ cxx_args.append("-DBF16_AVAILABLE")
+ nvcc_args.append("-DBF16_AVAILABLE")
+ nvcc_args.append("-U__CUDA_NO_BFLOAT16_OPERATORS__")
+ nvcc_args.append("-U__CUDA_NO_BFLOAT162_OPERATORS__")
+ nvcc_args.append("-U__CUDA_NO_BFLOAT16_CONVERSIONS__")
+
+ if self.is_rocm_pytorch():
+ cxx_args.append("-D__HIP_PLATFORM_AMD__=1")
+ os.environ["PYTORCH_ROCM_ARCH"] = self.get_rocm_gpu_arch()
+ cxx_args.append('-DROCM_WAVEFRONT_SIZE=%s' % self.get_rocm_wavefront_size())
+
+ op_module = load(name=self.name,
+ sources=self.strip_empty_entries(sources),
+ extra_include_paths=self.strip_empty_entries(extra_include_paths),
+ extra_cflags=cxx_args,
+ extra_cuda_cflags=nvcc_args,
+ extra_ldflags=self.strip_empty_entries(self.extra_ldflags()),
+ verbose=verbose)
+
+ build_duration = time.time() - start_build
+ if verbose:
+ print(f"Time to load {self.name} op: {build_duration} seconds")
+
+ # Reset arch list so we are not silently removing it for other possible use cases
+ if torch_arch_list:
+ os.environ["TORCH_CUDA_ARCH_LIST"] = torch_arch_list
+
+ __class__._loaded_ops[self.name] = op_module
+
+ return op_module
+
+
+class CUDAOpBuilder(OpBuilder):
+
+ def compute_capability_args(self, cross_compile_archs=None):
+ """
+ Returns nvcc compute capability compile flags.
+
+ 1. `TORCH_CUDA_ARCH_LIST` takes priority over `cross_compile_archs`.
+ 2. If neither is set default compute capabilities will be used
+ 3. Under `jit_mode` compute capabilities of all visible cards will be used plus PTX
+
+ Format:
+
+ - `TORCH_CUDA_ARCH_LIST` may use ; or whitespace separators. Examples:
+
+ TORCH_CUDA_ARCH_LIST="6.1;7.5;8.6" pip install ...
+ TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6+PTX" pip install ...
+
+ - `cross_compile_archs` uses ; separator.
+
+ """
+ ccs = []
+ if self.jit_mode:
+ # Compile for underlying architectures since we know those at runtime
+ for i in range(torch.cuda.device_count()):
+ CC_MAJOR, CC_MINOR = torch.cuda.get_device_capability(i)
+ cc = f"{CC_MAJOR}.{CC_MINOR}"
+ if cc not in ccs:
+ ccs.append(cc)
+ ccs = sorted(ccs)
+ ccs[-1] += '+PTX'
+ else:
+ # Cross-compile mode, compile for various architectures
+ # env override takes priority
+ cross_compile_archs_env = os.environ.get('TORCH_CUDA_ARCH_LIST', None)
+ if cross_compile_archs_env is not None:
+ if cross_compile_archs is not None:
+ print(
+ f"{WARNING} env var `TORCH_CUDA_ARCH_LIST={cross_compile_archs_env}` overrides `cross_compile_archs={cross_compile_archs}`"
+ )
+ cross_compile_archs = cross_compile_archs_env.replace(' ', ';')
+ else:
+ if cross_compile_archs is None:
+ cross_compile_archs = get_default_compute_capabilities()
+ ccs = cross_compile_archs.split(';')
+
+ ccs = self.filter_ccs(ccs)
+ if len(ccs) == 0:
+ raise RuntimeError(
+ f"Unable to load {self.name} op due to no compute capabilities remaining after filtering")
+
+ args = []
+ self.enable_bf16 = True
+ for cc in ccs:
+ num = cc[0] + cc[2]
+ args.append(f'-gencode=arch=compute_{num},code=sm_{num}')
+ if cc.endswith('+PTX'):
+ args.append(f'-gencode=arch=compute_{num},code=compute_{num}')
+
+ if int(cc[0]) <= 7:
+ self.enable_bf16 = False
+
+ return args
+
+ def filter_ccs(self, ccs: List[str]):
+ """
+ Prune any compute capabilities that are not compatible with the builder. Should log
+ which CCs have been pruned.
+ """
+ return ccs
+
+ def version_dependent_macros(self):
+ # Fix from apex that might be relevant for us as well, related to https://github.com/NVIDIA/apex/issues/456
+ version_ge_1_1 = []
+ if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
+ version_ge_1_1 = ['-DVERSION_GE_1_1']
+ version_ge_1_3 = []
+ if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2):
+ version_ge_1_3 = ['-DVERSION_GE_1_3']
+ version_ge_1_5 = []
+ if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4):
+ version_ge_1_5 = ['-DVERSION_GE_1_5']
+ return version_ge_1_1 + version_ge_1_3 + version_ge_1_5
+
+ def is_compatible(self, verbose=True):
+ return super().is_compatible(verbose)
+
+ def builder(self):
+ try:
+ if not self.is_rocm_pytorch():
+ assert_no_cuda_mismatch(self.name)
+ self.build_for_cpu = False
+ except MissingCUDAException:
+ self.build_for_cpu = True
+
+ if self.build_for_cpu:
+ from torch.utils.cpp_extension import CppExtension as ExtensionBuilder
+ else:
+ from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder
+ include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())]
+ compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())} if self.build_for_cpu else \
+ {'cxx': self.strip_empty_entries(self.cxx_args()), \
+ 'nvcc': self.strip_empty_entries(self.nvcc_args())}
+
+ if not self.build_for_cpu and self.enable_bf16:
+ compile_args['cxx'].append("-DBF16_AVAILABLE")
+ compile_args['nvcc'].append("-DBF16_AVAILABLE")
+
+ if self.is_rocm_pytorch():
+ compile_args['cxx'].append("-D__HIP_PLATFORM_AMD__=1")
+ #cxx compiler args are required to compile cpp files
+ compile_args['cxx'].append('-DROCM_WAVEFRONT_SIZE=%s' % self.get_rocm_wavefront_size())
+ #nvcc compiler args are required to compile hip files
+ compile_args['nvcc'].append('-DROCM_WAVEFRONT_SIZE=%s' % self.get_rocm_wavefront_size())
+ if self.get_rocm_gpu_arch():
+ os.environ["PYTORCH_ROCM_ARCH"] = self.get_rocm_gpu_arch()
+
+ cuda_ext = ExtensionBuilder(name=self.absolute_name(),
+ sources=self.strip_empty_entries(self.sources()),
+ include_dirs=include_dirs,
+ libraries=self.strip_empty_entries(self.libraries_args()),
+ extra_compile_args=compile_args,
+ extra_link_args=self.strip_empty_entries(self.extra_ldflags()))
+
+ if self.is_rocm_pytorch():
+ # hip converts paths to absolute, this converts back to relative
+ sources = cuda_ext.sources
+ curr_file = Path(__file__).parent.parent # ds root
+ for i in range(len(sources)):
+ src = Path(sources[i])
+ if src.is_absolute():
+ sources[i] = str(src.relative_to(curr_file))
+ else:
+ sources[i] = str(src)
+ cuda_ext.sources = sources
+ return cuda_ext
+
+ def hipify_extension(self):
+ if self.is_rocm_pytorch():
+ from torch.utils.hipify import hipify_python
+ hipify_python.hipify(
+ project_directory=os.getcwd(),
+ output_directory=os.getcwd(),
+ header_include_dirs=self.include_paths(),
+ includes=[os.path.join(os.getcwd(), '*')],
+ extra_files=[os.path.abspath(s) for s in self.sources()],
+ show_detailed=True,
+ is_pytorch_extension=True,
+ hipify_extra_files_only=True,
+ )
+
+ def cxx_args(self):
+ if sys.platform == "win32":
+ return ['-O2']
+ else:
+ return ['-O3', '-std=c++17', '-g', '-Wno-reorder']
+
+ def nvcc_args(self):
+ if self.build_for_cpu:
+ return []
+ args = ['-O3']
+ if self.is_rocm_pytorch():
+ ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version()
+ args += [
+ '-std=c++17', '-U__HIP_NO_HALF_OPERATORS__', '-U__HIP_NO_HALF_CONVERSIONS__',
+ '-U__HIP_NO_HALF2_OPERATORS__',
+ '-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR,
+ '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR
+ ]
+ else:
+ try:
+ nvcc_threads = int(os.getenv("DS_NVCC_THREADS", ""))
+ if nvcc_threads <= 0:
+ raise ValueError("")
+ except ValueError:
+ nvcc_threads = min(os.cpu_count(), 8)
+
+ cuda_major, cuda_minor = installed_cuda_version()
+ if cuda_major > 10:
+ if cuda_major == 12 and cuda_minor >= 5:
+ std_lib = '-std=c++20'
+ else:
+ std_lib = '-std=c++17'
+ else:
+ std_lib = '-std=c++14'
+ args += [
+ '-allow-unsupported-compiler' if sys.platform == "win32" else '', '--use_fast_math', std_lib,
+ '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__',
+ f'--threads={nvcc_threads}'
+ ]
+ if os.environ.get('DS_DEBUG_CUDA_BUILD', '0') == '1':
+ args.append('--ptxas-options=-v')
+ args += self.compute_capability_args()
+ return args
+
+ def libraries_args(self):
+ if self.build_for_cpu:
+ return []
+
+ if sys.platform == "win32":
+ return ['cublas', 'curand']
+ else:
+ return []
+
+
+class TorchCPUOpBuilder(CUDAOpBuilder):
+
+ def get_cuda_lib64_path(self):
+ import torch
+ if not self.is_rocm_pytorch():
+ CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
+ if not os.path.exists(CUDA_LIB64):
+ CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib")
+ else:
+ CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.ROCM_HOME, "lib")
+ return CUDA_LIB64
+
+ def extra_ldflags(self):
+ if self.build_for_cpu:
+ return ['-fopenmp']
+
+ if not self.is_rocm_pytorch():
+ ld_flags = ['-lcurand']
+ if not self.build_for_cpu:
+ ld_flags.append(f'-L{self.get_cuda_lib64_path()}')
+ return ld_flags
+
+ return []
+
+ def cxx_args(self):
+ args = []
+ if not self.build_for_cpu:
+ CUDA_LIB64 = self.get_cuda_lib64_path()
+
+ args += super().cxx_args()
+ args += [
+ f'-L{CUDA_LIB64}',
+ '-lcudart',
+ '-lcublas',
+ '-g',
+ ]
+
+ CPU_ARCH = self.cpu_arch()
+ SIMD_WIDTH = self.simd_width()
+ CUDA_ENABLE = self.is_cuda_enable()
+ args += [
+ CPU_ARCH,
+ '-fopenmp',
+ SIMD_WIDTH,
+ CUDA_ENABLE,
+ ]
+
+ return args
diff --git a/deepspeed/op_builder/fp_quantizer.py b/deepspeed/op_builder/fp_quantizer.py
new file mode 100644
index 000000000000..c7d2e72b5408
--- /dev/null
+++ b/deepspeed/op_builder/fp_quantizer.py
@@ -0,0 +1,91 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+try:
+ from packaging import version as pkg_version
+except ImportError:
+ pkg_version = None
+
+from .builder import CUDAOpBuilder, installed_cuda_version
+
+
+class FPQuantizerBuilder(CUDAOpBuilder):
+ BUILD_VAR = "DS_BUILD_FP_QUANTIZER"
+ NAME = "fp_quantizer"
+
+ def __init__(self, name=None):
+ name = self.NAME if name is None else name
+ super().__init__(name=name)
+
+ def absolute_name(self):
+ return f'deepspeed.ops.fp_quantizer.{self.NAME}_op'
+
+ def is_compatible(self, verbose=True):
+ try:
+ import torch
+ except ImportError:
+ self.warning("Please install torch if trying to pre-compile inference kernels")
+ return False
+
+ cuda_okay = True
+ if not self.is_rocm_pytorch() and torch.cuda.is_available(): #ignore-cuda
+ sys_cuda_major, _ = installed_cuda_version()
+ torch_cuda_major = int(torch.version.cuda.split('.')[0])
+ cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda
+ if cuda_capability < 8:
+ self.warning("NVIDIA Inference is only supported on Ampere and newer architectures")
+ cuda_okay = False
+ if cuda_capability >= 8:
+ if torch_cuda_major < 11 or sys_cuda_major < 11:
+ self.warning("On Ampere and higher architectures please use CUDA 11+")
+ cuda_okay = False
+
+ try:
+ import triton
+ except ImportError:
+ self.warning(f"please install triton==2.3.0 or 2.3.1 if you want to use the FP Quantizer Kernels")
+ return False
+
+        # triton 2.3.0 and 2.3.1 are the only 2.3.x releases and are the versions validated with these kernels
+ if pkg_version:
+ allowed = pkg_version.parse("2.3")
+ installed_triton = pkg_version.parse(triton.__version__)
+ triton_mismatch = installed_triton.major != allowed.major or installed_triton.minor != allowed.minor
+ else:
+ installed_triton = triton.__version__
+            major, minor = installed_triton.split(".")[:2]
+ triton_mismatch = major != "2" or minor != "3"
+
+ if triton_mismatch:
+ self.warning(
+ f"FP Quantizer is using an untested triton version ({installed_triton}), only 2.3.0 and 2.3.1 are known to be compatible with these kernels"
+ )
+ return False
+
+ return super().is_compatible(verbose) and cuda_okay
+
+ def filter_ccs(self, ccs):
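+        # FP quantizer kernels require Ampere (compute capability 8.0) or newer; prune older CCs.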
+ ccs_retained = []
+ ccs_pruned = []
+ for cc in ccs:
+ if int(cc[0]) >= 8:
+ ccs_retained.append(cc)
+ else:
+ ccs_pruned.append(cc)
+ if len(ccs_pruned) > 0:
+ self.warning(f"Filtered compute capabilities {ccs_pruned}")
+ return ccs_retained
+
+ def sources(self):
+ return [
+ "csrc/fp_quantizer/fp_quantize.cu",
+ "csrc/fp_quantizer/fp_quantize.cpp",
+ ]
+
+ def extra_ldflags(self):
+ return ['-lcurand']
+
+ def include_paths(self):
+ return ['csrc/fp_quantizer/includes', 'csrc/includes']
diff --git a/deepspeed/setup.py b/deepspeed/setup.py
new file mode 100644
index 000000000000..2b7555361655
--- /dev/null
+++ b/deepspeed/setup.py
@@ -0,0 +1,326 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""
+DeepSpeed library
+
+To build wheel on Windows:
+1. Install PyTorch, such as PyTorch 1.12 + CUDA 11.6.
+2. Install the Visual C++ build tools.
+3. Install the CUDA toolkit.
+4. Launch a cmd console with Administrator privileges so that the required symlink folders can be created.
+
+
+Create a new wheel via the following command:
+build_win.bat
+
+The wheel will be located at: dist/*.whl
+"""
+
+import pathlib
+import os
+import shutil
+import sys
+import subprocess
+from setuptools import setup, find_packages
+from setuptools.command import egg_info
+import time
+import typing
+
+torch_available = True
+try:
+ import torch
+except ImportError:
+ torch_available = False
+ print('[WARNING] Unable to import torch, pre-compiling ops will be disabled. ' \
+ 'Please visit https://pytorch.org/ to see how to properly install torch on your system.')
+
+from op_builder import get_default_compute_capabilities, OpBuilder
+from op_builder.all_ops import ALL_OPS, accelerator_name
+from op_builder.builder import installed_cuda_version
+
+from accelerator import get_accelerator
+
+# Fetch rocm state.
+is_rocm_pytorch = OpBuilder.is_rocm_pytorch()
+rocm_version = OpBuilder.installed_rocm_version()
+
+RED_START = '\033[31m'
+RED_END = '\033[0m'
+ERROR = f"{RED_START} [ERROR] {RED_END}"
+
+
+def abort(msg):
+ print(f"{ERROR} {msg}")
+ assert False, msg
+
+
+def fetch_requirements(path):
+ with open(path, 'r') as fd:
+ return [r.strip() for r in fd.readlines()]
+
+
+def is_env_set(key):
+ """
+ Checks if an environment variable is set and not "".
+ """
+ return bool(os.environ.get(key, None))
+
+
+def get_env_if_set(key, default: typing.Any = ""):
+ """
+ Returns an environment variable if it is set and not "",
+ otherwise returns a default value. In contrast, the fallback
+ parameter of os.environ.get() is skipped if the variable is set to "".
+ """
+ return os.environ.get(key, None) or default
+
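+# Illustrative example: with FOO="" exported in the environment,
+#   os.environ.get("FOO", "1") returns ""  (the empty string wins),
+#   get_env_if_set("FOO", "1") returns "1" (empty values fall back to the default).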
+
+install_requires = fetch_requirements('requirements/requirements.txt')
+extras_require = {
+ '1bit': [], # add cupy based on cuda/rocm version
+ '1bit_mpi': fetch_requirements('requirements/requirements-1bit-mpi.txt'),
+ 'readthedocs': fetch_requirements('requirements/requirements-readthedocs.txt'),
+ 'dev': fetch_requirements('requirements/requirements-dev.txt'),
+ 'autotuning': fetch_requirements('requirements/requirements-autotuning.txt'),
+ 'autotuning_ml': fetch_requirements('requirements/requirements-autotuning-ml.txt'),
+ 'sparse_attn': fetch_requirements('requirements/requirements-sparse_attn.txt'),
+ 'sparse': fetch_requirements('requirements/requirements-sparse_pruning.txt'),
+ 'inf': fetch_requirements('requirements/requirements-inf.txt'),
+ 'sd': fetch_requirements('requirements/requirements-sd.txt'),
+ 'triton': fetch_requirements('requirements/requirements-triton.txt'),
+}
+
+# Add specific cupy version to both onebit extension variants.
+if torch_available and get_accelerator().device_name() == 'cuda':
+ cupy = None
+ if is_rocm_pytorch:
+ rocm_major, rocm_minor = rocm_version
+ # XXX cupy support for rocm 5 is not available yet.
+ if rocm_major <= 4:
+ cupy = f"cupy-rocm-{rocm_major}-{rocm_minor}"
+ else:
+ cuda_major_ver, cuda_minor_ver = installed_cuda_version()
+ if (cuda_major_ver < 11) or ((cuda_major_ver == 11) and (cuda_minor_ver < 3)):
+ cupy = f"cupy-cuda{cuda_major_ver}{cuda_minor_ver}"
+ else:
+ cupy = f"cupy-cuda{cuda_major_ver}x"
+
+ if cupy:
+ extras_require['1bit'].append(cupy)
+ extras_require['1bit_mpi'].append(cupy)
+
+# Make an [all] extra that installs all needed dependencies.
+all_extras = set()
+for extra in extras_require.items():
+ for req in extra[1]:
+ all_extras.add(req)
+extras_require['all'] = list(all_extras)
+
+cmdclass = {}
+
+# Ninja is disabled by default when pre-compiling ops; set DS_ENABLE_NINJA to a non-empty value to enable it.
+if torch_available:
+ use_ninja = is_env_set("DS_ENABLE_NINJA")
+ cmdclass['build_ext'] = get_accelerator().build_extension().with_options(use_ninja=use_ninja)
+
+if torch_available:
+ TORCH_MAJOR = torch.__version__.split('.')[0]
+ TORCH_MINOR = torch.__version__.split('.')[1]
+else:
+ TORCH_MAJOR = "0"
+ TORCH_MINOR = "0"
+
+if torch_available and not get_accelerator().device_name() == 'cuda':
+ # Fix to allow docker builds, similar to https://github.com/NVIDIA/apex/issues/486.
+ print("[WARNING] Torch did not find cuda available, if cross-compiling or running with cpu only "
+ "you can ignore this message. Adding compute capability for Pascal, Volta, and Turing "
+ "(compute capabilities 6.0, 6.1, 6.2)")
+ if not is_env_set("TORCH_CUDA_ARCH_LIST"):
+ os.environ["TORCH_CUDA_ARCH_LIST"] = get_default_compute_capabilities()
+
+ext_modules = []
+
+# Default to JIT-compiling kernels on Linux (no pre-installed ops) and to pre-installing them on Windows.
+BUILD_OP_PLATFORM = 1 if sys.platform == "win32" else 0
+BUILD_OP_DEFAULT = int(get_env_if_set('DS_BUILD_OPS', BUILD_OP_PLATFORM))
+print(f"DS_BUILD_OPS={BUILD_OP_DEFAULT}")
+
+if BUILD_OP_DEFAULT:
+ assert torch_available, "Unable to pre-compile ops without torch installed. Please install torch before attempting to pre-compile ops."
+
+
+def command_exists(cmd):
+ if sys.platform == "win32":
+ result = subprocess.Popen(f'{cmd}', stdout=subprocess.PIPE, shell=True)
+ return result.wait() == 1
+ else:
+ result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True)
+ return result.wait() == 0
+
+
+def op_envvar(op_name):
+ assert hasattr(ALL_OPS[op_name], 'BUILD_VAR'), \
+ f"{op_name} is missing BUILD_VAR field"
+ return ALL_OPS[op_name].BUILD_VAR
+
+
+def op_enabled(op_name):
+ env_var = op_envvar(op_name)
+ return int(get_env_if_set(env_var, BUILD_OP_DEFAULT))
+
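+# Individual ops can also be toggled via their BUILD_VAR, e.g. DS_BUILD_FP_QUANTIZER=0 skips that
+# op even when DS_BUILD_OPS=1.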
+
+install_ops = dict.fromkeys(ALL_OPS.keys(), False)
+for op_name, builder in ALL_OPS.items():
+ op_compatible = builder.is_compatible()
+
+ # If op is requested but not available, throw an error.
+ if op_enabled(op_name) and not op_compatible:
+ env_var = op_envvar(op_name)
+ if not is_env_set(env_var):
+ builder.warning(f"One can disable {op_name} with {env_var}=0")
+ abort(f"Unable to pre-compile {op_name}")
+
+ # If op is compatible but install is not enabled (JIT mode).
+ if is_rocm_pytorch and op_compatible and not op_enabled(op_name):
+ builder.hipify_extension()
+
+ # If op install enabled, add builder to extensions.
+ if op_enabled(op_name) and op_compatible:
+ assert torch_available, f"Unable to pre-compile {op_name}, please first install torch"
+ install_ops[op_name] = op_enabled(op_name)
+ ext_modules.append(builder.builder())
+
+print(f'Install Ops={install_ops}')
+
+# Write out version/git info.
+git_hash_cmd = "git rev-parse --short HEAD"
+git_branch_cmd = "git rev-parse --abbrev-ref HEAD"
+if command_exists('git') and not is_env_set('DS_BUILD_STRING'):
+ try:
+ result = subprocess.check_output(git_hash_cmd, shell=True)
+ git_hash = result.decode('utf-8').strip()
+ result = subprocess.check_output(git_branch_cmd, shell=True)
+ git_branch = result.decode('utf-8').strip()
+ except subprocess.CalledProcessError:
+ git_hash = "unknown"
+ git_branch = "unknown"
+else:
+ git_hash = "unknown"
+ git_branch = "unknown"
+
+if sys.platform == "win32":
+ shutil.rmtree('.\\deepspeed\\ops\\csrc', ignore_errors=True)
+ pathlib.Path('.\\deepspeed\\ops\\csrc').unlink(missing_ok=True)
+ shutil.copytree('.\\csrc', '.\\deepspeed\\ops\\csrc', dirs_exist_ok=True)
+ shutil.rmtree('.\\deepspeed\\ops\\op_builder', ignore_errors=True)
+ pathlib.Path('.\\deepspeed\\ops\\op_builder').unlink(missing_ok=True)
+ shutil.copytree('.\\op_builder', '.\\deepspeed\\ops\\op_builder', dirs_exist_ok=True)
+ shutil.rmtree('.\\deepspeed\\accelerator', ignore_errors=True)
+ pathlib.Path('.\\deepspeed\\accelerator').unlink(missing_ok=True)
+ shutil.copytree('.\\accelerator', '.\\deepspeed\\accelerator', dirs_exist_ok=True)
+ egg_info.manifest_maker.template = 'MANIFEST_win.in'
+
+# Parse the DeepSpeed version string from version.txt.
+with open('version.txt', 'r') as fd:
+    version_str = fd.read().strip()
+
+# Build specifiers like .devX can be added at install time. Otherwise, add the git hash.
+# Example: DS_BUILD_STRING=".dev20201022" python setup.py sdist bdist_wheel.
+
+# Building wheel for distribution, update version file.
+if is_env_set('DS_BUILD_STRING'):
+ # Build string env specified, probably building for distribution.
+ with open('build.txt', 'w') as fd:
+ fd.write(os.environ['DS_BUILD_STRING'])
+ version_str += os.environ['DS_BUILD_STRING']
+elif os.path.isfile('build.txt'):
+ # build.txt exists, probably installing from distribution.
+ with open('build.txt', 'r') as fd:
+ version_str += fd.read().strip()
+else:
+ # None of the above, probably installing from source.
+ version_str += f'+{git_hash}'
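+# The resulting version string is either "<version.txt><DS_BUILD_STRING>" or "<version.txt>+<git_hash>",
+# e.g. "X.Y.Z+abc1234" for a source install (values illustrative).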
+
+torch_version = ".".join([TORCH_MAJOR, TORCH_MINOR])
+bf16_support = False
+# Set cuda_version to 0.0 if cpu-only.
+cuda_version = "0.0"
+nccl_version = "0.0"
+# Set hip_version to 0.0 if cpu-only.
+hip_version = "0.0"
+if torch_available and torch.version.cuda is not None:
+ cuda_version = ".".join(torch.version.cuda.split('.')[:2])
+ if sys.platform != "win32":
+ if isinstance(torch.cuda.nccl.version(), int):
+ # This will break if minor version > 9.
+ nccl_version = ".".join(str(torch.cuda.nccl.version())[:2])
+ else:
+ nccl_version = ".".join(map(str, torch.cuda.nccl.version()[:2]))
+ if hasattr(torch.cuda, 'is_bf16_supported') and torch.cuda.is_available():
+ bf16_support = torch.cuda.is_bf16_supported()
+if torch_available and hasattr(torch.version, 'hip') and torch.version.hip is not None:
+ hip_version = ".".join(torch.version.hip.split('.')[:2])
+torch_info = {
+ "version": torch_version,
+ "bf16_support": bf16_support,
+ "cuda_version": cuda_version,
+ "nccl_version": nccl_version,
+ "hip_version": hip_version
+}
+
+print(f"version={version_str}, git_hash={git_hash}, git_branch={git_branch}")
+with open('deepspeed/git_version_info_installed.py', 'w') as fd:
+ fd.write(f"version='{version_str}'\n")
+ fd.write(f"git_hash='{git_hash}'\n")
+ fd.write(f"git_branch='{git_branch}'\n")
+ fd.write(f"installed_ops={install_ops}\n")
+ fd.write(f"accelerator_name='{accelerator_name}'\n")
+ fd.write(f"torch_info={torch_info}\n")
+
+print(f'install_requires={install_requires}')
+print(f'ext_modules={ext_modules}')
+
+# Parse README.md to make long_description for PyPI page.
+thisdir = os.path.abspath(os.path.dirname(__file__))
+with open(os.path.join(thisdir, 'README.md'), encoding='utf-8') as fin:
+ readme_text = fin.read()
+
+if sys.platform == "win32":
+ scripts = ['bin/deepspeed.bat', 'bin/ds', 'bin/ds_report.bat', 'bin/ds_report']
+else:
+ scripts = [
+ 'bin/deepspeed', 'bin/deepspeed.pt', 'bin/ds', 'bin/ds_ssh', 'bin/ds_report', 'bin/ds_bench', 'bin/dsr',
+ 'bin/ds_elastic'
+ ]
+
+start_time = time.time()
+
+setup(name='deepspeed',
+ version=version_str,
+ description='DeepSpeed library',
+ long_description=readme_text,
+ long_description_content_type='text/markdown',
+ author='DeepSpeed Team',
+ author_email='deepspeed-info@microsoft.com',
+ url='http://deepspeed.ai',
+ project_urls={
+ 'Documentation': 'https://deepspeed.readthedocs.io',
+ 'Source': 'https://github.com/microsoft/DeepSpeed',
+ },
+ install_requires=install_requires,
+ extras_require=extras_require,
+ packages=find_packages(include=['deepspeed', 'deepspeed.*']),
+ include_package_data=True,
+ scripts=scripts,
+ classifiers=[
+ 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7',
+ 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9',
+ 'Programming Language :: Python :: 3.10'
+ ],
+ license='Apache Software License 2.0',
+ ext_modules=ext_modules,
+ cmdclass=cmdclass)
+
+end_time = time.time()
+print(f'deepspeed build time = {end_time - start_time} secs')
diff --git a/deepspeed/tests/unit/inference/test_inference.py b/deepspeed/tests/unit/inference/test_inference.py
new file mode 100644
index 000000000000..eadf670d9328
--- /dev/null
+++ b/deepspeed/tests/unit/inference/test_inference.py
@@ -0,0 +1,703 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import pytest
+
+import itertools
+import pickle
+import os
+import time
+import requests
+
+from dataclasses import dataclass
+from typing import List
+
+import deepspeed
+import torch
+
+from huggingface_hub import HfApi
+from packaging import version as pkg_version
+from torch import nn
+from transformers import pipeline
+from transformers.models.t5.modeling_t5 import T5Block
+from transformers.models.roberta.modeling_roberta import RobertaLayer
+
+from deepspeed.accelerator import get_accelerator
+from deepspeed.git_version_info import torch_info
+from deepspeed.model_implementations import DeepSpeedTransformerInference
+from deepspeed.ops.op_builder import InferenceBuilder
+from deepspeed.ops.op_builder import OpBuilder
+
+from unit.common import DistributedTest
+
+rocm_version = OpBuilder.installed_rocm_version()
+if rocm_version != (0, 0):
+ pytest.skip("skip inference tests on rocm for now", allow_module_level=True)
+
+_bert_models = [
+ "google-bert/bert-base-cased",
+ "google-bert/bert-base-uncased",
+ "google-bert/bert-large-cased",
+ "google-bert/bert-large-uncased",
+ "google-bert/bert-base-multilingual-cased",
+ "google-bert/bert-base-multilingual-uncased",
+ "deepset/minilm-uncased-squad2",
+ "cross-encoder/ms-marco-MiniLM-L-12-v2",
+ "dslim/bert-base-NER",
+ "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad",
+ "distilbert/distilbert-base-cased-distilled-squad",
+]
+_roberta_models = [
+ "FacebookAI/roberta-large",
+ "FacebookAI/roberta-base",
+ "deepset/roberta-base-squad2",
+ "j-hartmann/emotion-english-distilroberta-base",
+ "Jean-Baptiste/roberta-large-ner-english",
+]
+_gpt_models = [
+ "openai-community/gpt2",
+ "distilbert/distilgpt2",
+ "Norod78/hebrew-bad_wiki-gpt_neo-tiny",
+ "EleutherAI/gpt-j-6b",
+ "EleutherAI/pythia-70m-deduped",
+ "bigscience/bloom-560m",
+]
+_opt_models = [
+ "facebook/opt-125m", # 125m, 1.7B, ..., 175B variants have the same model architecture.
+ "facebook/opt-350m", # 350m applies layer norm after attention layer which is different than other variants.
+]
+_test_models = set(_bert_models + _roberta_models + _gpt_models + _opt_models)
+_test_tasks = [
+ "fill-mask", "question-answering", "text-classification", "token-classification", "text-generation",
+ "text2text-generation", "summarization", "translation"
+]
+
+
+@dataclass
+class ModelInfo:
+ id: str
+ pipeline_tag: str
+ tags: List[str]
+
+
+def _hf_model_list() -> List[ModelInfo]:
+ """ Caches HF model list to avoid repeated API calls """
+
+    cache_dir = os.path.expanduser(os.getenv("HF_HOME", "~/.cache/huggingface"))
+ cache_file_path = os.path.join(cache_dir, "DS_model_cache.pkl")
+    num_days = int(os.getenv("HF_CACHE_EXPIRY_DAYS", 1))
+ cache_expiration_seconds = num_days * 60 * 60 * 24
+
+ # Load or initialize the cache
+ model_data = {"cache_time": 0, "model_list": []}
+ if os.path.isfile(cache_file_path):
+ with open(cache_file_path, 'rb') as f:
+ try:
+ model_data = pickle.load(f)
+ except Exception as e:
+ print(f"Error loading cache file {cache_file_path}: {e}")
+
+ current_time = time.time()
+
+ # Update the cache if it has expired
+ if ((model_data["cache_time"] + cache_expiration_seconds) < current_time) or os.getenv("FORCE_UPDATE_HF_CACHE",
+ default=False):
+ api = HfApi()
+ while True:
+ try:
+ model_list = []
+ for model in _test_models:
+ model_list.extend(api.list_models(model_name=model))
+ model_data["model_list"] = [
+ ModelInfo(id=m.id, pipeline_tag=m.pipeline_tag, tags=m.tags) for m in model_list
+ ]
+ break # Exit the loop if the operation is successful
+ except requests.exceptions.HTTPError as e:
+ if e.response.status_code == 429:
+ print("Rate limit exceeded. Retrying in 60 seconds...")
+ time.sleep(60)
+ else:
+ raise # Re-raise the exception if it's not a 429 error
+ model_data["cache_time"] = current_time
+
+ # Save the updated cache
+ os.makedirs(cache_dir, exist_ok=True)
+ with open(cache_file_path, 'wb') as f:
+ pickle.dump(model_data, f)
+
+ return model_data["model_list"]
+
+
+# Get a list of all models and mapping from task to supported models
+_hf_models = _hf_model_list()
+_hf_model_names = [m.id for m in _hf_models]
+_hf_task_to_models = {task: [m.id for m in _hf_models if m.pipeline_tag == task] for task in _test_tasks}
+
+# Get all combinations of task:model to test
+_model_w_tasks = [(m, t) for m, t in itertools.product(*[_test_models, _test_tasks]) if m in _hf_task_to_models[t]]
+
+# Assign to pytest variables for testing
+pytest.model_w_tasks = _model_w_tasks
+pytest.mt_names = [f"{m}-{t}" for m, t in pytest.model_w_tasks]
+
+
+@pytest.fixture(scope="module", autouse=True)
+def verify_models():
+ # Verify all test models are registered in HF
+ _test_models_not_found = [m for m in _test_models if m not in _hf_model_names]
+ if _test_models_not_found:
+ pytest.fail(f"Model(s) not found in HuggingFace: {_test_models_not_found}")
+
+ # Verify all models are assigned to at least one task
+ _models_to_be_tested = set(m for m, t in _model_w_tasks)
+    _missing_task_models = _test_models.difference(_models_to_be_tested)
+ if _missing_task_models:
+ pytest.fail(f"Model(s) do not have an assigned task: {_missing_task_models}")
+
+
+""" Fixtures for inference config """
+
+
+@pytest.fixture(params=pytest.model_w_tasks, ids=pytest.mt_names)
+def model_w_task(request):
+ return request.param
+
+
+@pytest.fixture(params=[torch.float, torch.half], ids=["fp32", "fp16"])
+def dtype(request):
+ return request.param
+
+
+@pytest.fixture(params=[True, False], ids=["CG", "noCG"])
+def enable_cuda_graph(request):
+ return request.param
+
+
+@pytest.fixture(params=[True, False], ids=["Triton", "noTriton"])
+def enable_triton(request):
+ return request.param
+
+
+@pytest.fixture(params=[1, 2], ids=["ws1", "ws2"])
+def world_size(request):
+ return request.param
+
+
+""" Fixtures for running query """
+
+
+@pytest.fixture
+def query(model_w_task):
+ model, task = model_w_task
+    angle_bracket_mask_models = ["roberta", "camembert", "esm", "ibert", "luke", "mpnet", "yoso"]
+
+ if task == "fill-mask":
+ if any(map(lambda x: x in model, angle_bracket_mask_models)):
+ return "Hello I'm a model."
+ else:
+ return "Hell I'm a [MASK] model."
+ elif task == "question-answering":
+ return {
+ "question": "What's my name?",
+ "context": "My name is Clara and I live in Berkeley",
+ }
+ elif task == "text-classification":
+ return "DeepSpeed is the greatest"
+ elif task == "token-classification":
+ return "My name is jean-baptiste and I live in montreal."
+ elif task == "text-generation":
+ return "DeepSpeed is the greatest"
+ elif task == "text2text-generation":
+ return "Is this review positive or negative? Review: this is the best cast iron skillet you will ever buy"
+ elif task == "translation" or task == "summarization":
+ return "Hello, my dog is cute"
+ else:
+        raise NotImplementedError(f'query for task "{task}" is not implemented')
+
+
+@pytest.fixture
+def inf_kwargs(model_w_task):
+ model, task = model_w_task
+ if task == "text-generation":
+ if model == "EleutherAI/gpt-j-6b":
+ # This model on V100 is hitting memory problems that limit the number of output tokens
+ return {"do_sample": False, "temperature": 1.0, "max_length": 12}
+ return {"do_sample": False, "temperature": 1.0, "max_length": 20}
+ else:
+ return {}
+
+
+""" Assertion fixture for verifying model outputs """
+
+
+def fill_mask_assert(x, y):
+ return set(res["token_str"] for res in x) == set(res["token_str"] for res in y)
+
+
+def question_answering_assert(x, y):
+ return x["answer"] == y["answer"]
+
+
+def text_classification_assert(x, y):
+ return set(res["label"] for res in x) == set(res["label"] for res in y)
+
+
+def token_classification_assert(x, y):
+ return set(ent["word"] for ent in x) == set(ent["word"] for ent in y)
+
+
+def text_generation_assert(x, y):
+ return set(res["generated_text"] for res in x) == set(res["generated_text"] for res in y)
+
+
+def text2text_generation_assert(x, y):
+ return set(res["generated_text"] for res in x) == set(res["generated_text"] for res in y)
+
+
+def translation_assert(x, y):
+ return set(res["translation_text"] for res in x) == set(res["translation_text"] for res in y)
+
+
+def summarization_assert(x, y):
+ return set(res["summary_text"] for res in x) == set(res["summary_text"] for res in y)
+
+
+@pytest.fixture
+def assert_fn(model_w_task):
+ model, task = model_w_task
+ assert_fn_dict = {
+ "fill-mask": fill_mask_assert,
+ "question-answering": question_answering_assert,
+ "text-classification": text_classification_assert,
+ "token-classification": token_classification_assert,
+ "text-generation": text_generation_assert,
+ "text2text-generation": text2text_generation_assert,
+ "translation": translation_assert,
+ "summarization": summarization_assert
+ }
+ assert_fn = assert_fn_dict.get(task, None)
+ if assert_fn is None:
+        raise NotImplementedError(f'assert_fn for task "{task}" is not implemented')
+ return assert_fn
+
+
+# Used to verify DeepSpeed kernel injection worked with a model
+def check_injection(model):
+
+ def verify_injection(module):
+ for child in module.children():
+ if isinstance(child, nn.ModuleList):
+ assert isinstance(child[0], DeepSpeedTransformerInference),\
+ "DeepSpeed-Inference Transformer kernels has not been injected in the model"
+ break
+ else:
+ verify_injection(child)
+
+ verify_injection(model)
+
+
+# Verify that test is valid
+def validate_test(model_w_task, dtype, enable_cuda_graph, enable_triton):
+ model, task = model_w_task
+ msg = ""
+ if enable_cuda_graph and (torch_info["cuda_version"] == "0.0"):
+ msg = "CUDA not detected, cannot use CUDA Graph"
+ elif enable_cuda_graph and pkg_version.parse(torch.__version__) < pkg_version.parse("1.10"):
+ msg = "CUDA Graph is only available in torch versions >= 1.10"
+ elif "gpt-j-6b" in model:
+ if dtype != torch.half:
+ msg = f"Not enough GPU memory to run {model} with dtype {dtype}"
+ elif enable_cuda_graph:
+ msg = f"Not enough GPU memory to run {model} with CUDA Graph enabled"
+ elif "gpt-neox-20b" in model: # TODO: remove this when neox issues resolved
+ msg = "Skipping gpt-neox-20b for now"
+ elif ("gpt-neox-20b" in model) and (dtype != torch.half):
+ msg = f"Not enough GPU memory to run {model} with dtype {dtype}"
+ elif ("bloom" in model) and (dtype != torch.half):
+ msg = f"Bloom models only support half precision, cannot use dtype {dtype}"
+ elif (model not in _bert_models + _roberta_models) and enable_cuda_graph:
+ msg = "Non bert/roberta models do no support CUDA Graph"
+ elif enable_triton and not (dtype in [torch.half]):
+ msg = "Triton is for fp16"
+ elif enable_triton and not deepspeed.HAS_TRITON:
+ msg = "triton needs to be installed for the test"
+ elif (model not in _bert_models + _roberta_models) and enable_triton:
+ msg = "Triton kernels do not support Non bert/roberta models yet"
+
+ # These should be removed once we fix several inference tests failing
+ if model in [
+ "EleutherAI/pythia-70m-deduped", "distilbert/distilbert-base-cased-distilled-squad", "EleutherAI/gpt-j-6b"
+ ]:
+ msg = "Test is currently broken"
+ return msg
+
+
+@pytest.mark.inference
+class TestModelTask(DistributedTest):
+ world_size = 1
+
+ def test(
+ self,
+ model_w_task,
+ dtype,
+ enable_cuda_graph,
+ enable_triton,
+ query,
+ inf_kwargs,
+ assert_fn,
+ perf_meas=True,
+ ):
+ invalid_test_msg = validate_test(model_w_task, dtype, enable_cuda_graph, enable_triton)
+ if invalid_test_msg:
+ pytest.skip(invalid_test_msg)
+
+ if dtype not in get_accelerator().supported_dtypes():
+ pytest.skip(f"Acceleraor {get_accelerator().device_name()} does not support {dtype}.")
+
+ if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
+ pytest.skip("This op had not been implemented on this system.", allow_module_level=True)
+
+ model, task = model_w_task
+ local_rank = int(os.getenv("LOCAL_RANK", "0"))
+
+ # Load the model on CPU first to avoid OOM for large models @fp32
+ pipe = pipeline(task, model=model, device=torch.device("cpu"), framework="pt")
+ if dtype == torch.half:
+ pipe.model.half()
+
+ # Switch device to GPU after converting to half
+ device = torch.device(get_accelerator().device_name(local_rank))
+ pipe.device = device
+ pipe.model.to(device)
+
+ # Warm-up queries for perf measurement
+ #for i in range(10):
+ # _ = pipe(query, **inf_kwargs)
+ get_accelerator().synchronize()
+ start = time.time()
+ bs_output = pipe(query, **inf_kwargs)
+ get_accelerator().synchronize()
+ bs_time = time.time() - start
+
+ args = {
+ 'mp_size': 1,
+ 'dtype': dtype,
+ 'replace_with_kernel_inject': True,
+ 'enable_cuda_graph': enable_cuda_graph,
+ 'use_triton': enable_triton,
+ 'triton_autotune': False,
+ }
+ if pipe.tokenizer.model_max_length < deepspeed.ops.transformer.inference.config.DeepSpeedInferenceConfig(
+ ).max_out_tokens:
+ args.update({'max_out_tokens': pipe.tokenizer.model_max_length})
+ pipe.model = deepspeed.init_inference(pipe.model, **args)
+ check_injection(pipe.model)
+ # Warm-up queries for perf measurement
+ #for i in range(10):
+ # _ = pipe(query, **inf_kwargs)
+ get_accelerator().synchronize()
+ start = time.time()
+ ds_output = pipe(query, **inf_kwargs)
+ get_accelerator().synchronize()
+ ds_time = time.time() - start
+
+ if perf_meas:
+ print(
+ f"model={model}, task={task}, dtype={dtype}, cuda_graph={enable_cuda_graph}, triton={enable_triton}, bs_time={bs_time}, ds_time={ds_time}"
+ )
+
+        # facebook/opt* and some bigscience/bloom* models do not match the baseline
+        # exactly, so add an exception for them for now
+ if ("opt" in model) or ("bloom" in model):
+ bs_output = pipe(query, **inf_kwargs)
+
+ # These performance tests are only measuring the time for a single
+ # inference request, we just want to check that performance isn't terrible
+ #assert ds_time <= (bs_time * 1.1)
+
+ assert assert_fn(bs_output, ds_output)
+
+
+@pytest.mark.seq_inference
+@pytest.mark.parametrize("model_w_task", [("EleutherAI/gpt-neo-1.3B", "text-generation"),
+ ("EleutherAI/gpt-neox-20b", "text-generation"),
+ ("bigscience/bloom-3b", "text-generation"),
+ ("EleutherAI/gpt-j-6b", "text-generation")],
+ ids=["gpt-neo", "gpt-neox", "bloom", "gpt-j"])
+class TestMPSize(DistributedTest):
+ world_size = 2
+
+ def test(
+ self,
+ model_w_task,
+ dtype,
+ query,
+ inf_kwargs,
+ assert_fn,
+ ):
+ invalid_test_msg = validate_test(model_w_task, dtype, enable_cuda_graph=False, enable_triton=False)
+ if invalid_test_msg:
+ pytest.skip(invalid_test_msg)
+
+ if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
+ pytest.skip("This op had not been implemented on this system.", allow_module_level=True)
+
+ model, task = model_w_task
+ local_rank = int(os.getenv("LOCAL_RANK", "0"))
+
+ # We have to load these large models on CPU with pipeline because not
+ # enough GPU memory
+ pipe = pipeline(task, model=model, device=torch.device("cpu"), framework="pt")
+ bs_output = pipe(query, **inf_kwargs)
+
+ pipe.model = deepspeed.init_inference(pipe.model,
+ mp_size=self.world_size,
+ dtype=dtype,
+ replace_with_kernel_inject=True)
+ check_injection(pipe.model)
+ # Switch device to GPU so that input tensors are not on CPU
+ pipe.device = torch.device(get_accelerator().device_name(local_rank))
+ ds_output = pipe(query, **inf_kwargs)
+
+ print(local_rank, "baseline", bs_output)
+ print(local_rank, "deepspeed", ds_output)
+ assert assert_fn(bs_output, ds_output)
+
+
+@pytest.mark.inference
+@pytest.mark.parametrize("model_w_task", [("openai-community/gpt2", "text-generation")], ids=["gpt2"])
+class TestLowCpuMemUsage(DistributedTest):
+ world_size = 1
+
+ def test(
+ self,
+ model_w_task,
+ query,
+ inf_kwargs,
+ assert_fn,
+ ):
+ model, task = model_w_task
+ dtype = torch.float16
+ if dtype not in get_accelerator().supported_dtypes():
+ pytest.skip(f"Acceleraor {get_accelerator().device_name()} does not support {dtype}.")
+
+ local_rank = int(os.getenv("LOCAL_RANK", "0"))
+
+ pipe = pipeline(task, model=model, model_kwargs={"low_cpu_mem_usage": True}, device=local_rank, framework="pt")
+ bs_output = pipe(query, **inf_kwargs)
+ pipe.model = deepspeed.init_inference(pipe.model,
+ mp_size=self.world_size,
+ dtype=dtype,
+ replace_method="auto",
+ replace_with_kernel_inject=True)
+
+ ds_output = pipe(query, **inf_kwargs)
+
+ assert assert_fn(bs_output, ds_output)
+
+
+@pytest.mark.seq_inference
+@pytest.mark.parametrize(
+ "model_w_task, injection_policy",
+ [
+ (("google/t5-v1_1-small", "text2text-generation"), {
+ T5Block: ('SelfAttention.o', 'EncDecAttention.o', 'DenseReluDense.wo')
+ }),
+ (("FacebookAI/roberta-large", "fill-mask"), {
+ RobertaLayer: ('output.dense')
+ }),
+ ],
+ ids=["t5", "roberta"],
+)
+@pytest.mark.parametrize("dtype", [torch.float], ids=["fp32"])
+class TestInjectionPolicy(DistributedTest):
+
+ def test(self, model_w_task, injection_policy, query, inf_kwargs, assert_fn, dtype, world_size):
+ invalid_test_msg = validate_test(model_w_task, dtype, enable_cuda_graph=False, enable_triton=False)
+ if invalid_test_msg:
+ pytest.skip(invalid_test_msg)
+
+ model, task = model_w_task
+ local_rank = int(os.getenv("LOCAL_RANK", "0"))
+
+ pipe = pipeline(task,
+ model=model,
+ device=torch.device(get_accelerator().device_name(local_rank)),
+ framework="pt")
+ bs_output = pipe(query, **inf_kwargs)
+
+ pipe.model = deepspeed.init_inference(pipe.model,
+ mp_size=world_size,
+ dtype=dtype,
+ injection_policy=injection_policy)
+ ds_output = pipe(query, **inf_kwargs)
+
+ print(local_rank, "baseline", bs_output)
+ print(local_rank, "deepspeed", ds_output)
+ assert assert_fn(bs_output, ds_output)
+
+
+@pytest.mark.seq_inference
+@pytest.mark.parametrize(
+ "model_w_task",
+ [("Helsinki-NLP/opus-mt-en-de", "translation"), ("Salesforce/codegen-350M-mono", "text-generation")],
+ ids=["marian", "codegen"], #codegen has fusedqkv weight.
+)
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
+class TestAutoTensorParallelism(DistributedTest):
+ world_size = [2]
+
+ def test(
+ self,
+ model_w_task,
+ query,
+ inf_kwargs,
+ assert_fn,
+ dtype,
+ ):
+ invalid_test_msg = validate_test(model_w_task, dtype, enable_cuda_graph=False, enable_triton=False)
+ if invalid_test_msg:
+ pytest.skip(invalid_test_msg)
+
+ model, task = model_w_task
+ local_rank = int(os.getenv("LOCAL_RANK", "0"))
+ world_size = int(os.getenv("WORLD_SIZE", "2"))
+
+ if dtype not in get_accelerator().supported_dtypes():
+ pytest.skip(f"Acceleraor {get_accelerator().device_name()} does not support {dtype}.")
+
+ if model == "Salesforce/codegen-350M-mono":
+ pytest.skip("Disable Codegen model due to slight result difference")
+ #TODO: re-enable this test once we have a fix for the slight result difference
+
+ pipe = pipeline(task,
+ model=model,
+ device=torch.device(get_accelerator().device_name(local_rank)),
+ framework="pt")
+ bs_output = pipe(query, **inf_kwargs)
+
+ pipe.model = deepspeed.init_inference(pipe.model, mp_size=world_size, dtype=dtype)
+ ds_output = pipe(query, **inf_kwargs)
+
+ print(local_rank, "baseline", bs_output)
+ print(local_rank, "deepspeed", ds_output)
+ assert assert_fn(bs_output, ds_output)
+
+ @pytest.mark.world_size(3)
+ def test_odd_world_size(
+ self,
+ model_w_task,
+ query,
+ inf_kwargs,
+ assert_fn,
+ dtype,
+ ):
+ invalid_test_msg = validate_test(model_w_task, dtype, enable_cuda_graph=False, enable_triton=False)
+ if invalid_test_msg:
+ pytest.skip(invalid_test_msg)
+
+ model, task = model_w_task
+ if model == "Salesforce/codegen-350M-mono":
+ pytest.skip("codegen does not supported by odd world_size")
+ local_rank = int(os.getenv("LOCAL_RANK", "0"))
+ world_size = int(os.getenv("WORLD_SIZE", "3"))
+
+ pipe = pipeline(task,
+ model=model,
+ device=torch.device(get_accelerator().device_name(local_rank)),
+ framework="pt")
+ bs_output = pipe(query, **inf_kwargs)
+
+ pipe.model = deepspeed.init_inference(pipe.model, mp_size=world_size, dtype=dtype)
+ ds_output = pipe(query, **inf_kwargs)
+
+ print(local_rank, "baseline", bs_output)
+ print(local_rank, "deepspeed", ds_output)
+ assert assert_fn(bs_output, ds_output)
+
+
+@pytest.mark.nightly
+@pytest.mark.parametrize(
+ "model_family, model_name",
+ (
+ ["gpt2", "EleutherAI/gpt-neo-2.7B"],
+ #["gpt2", "EleutherAI/gpt-j-6b"], # Causing OOM for this test
+ ["gpt2", "openai-community/gpt2-xl"],
+ ),
+)
+@pytest.mark.parametrize("task", ["lambada_standard"])
+class TestLMCorrectness(DistributedTest):
+ world_size = 1
+ exec_timeout = 1200 # Give these tests longer to complete
+
+ def test(self, model_family, model_name, task):
+ # imports here to avoid import errors when pytest collects tests
+ import lm_eval
+ import lm_eval.models
+ import lm_eval.tasks
+ import lm_eval.evaluator
+
+ # The bootstrap_stderr function in lm_eval.metrics uses a
+ # multiprocessing Pool to increase performance. Since we use a Pool for
+ # our distributed tests and cannot nest Pools, we must redefine and
+ # patch this function with a version that does not use Pool.
+ def no_pool_bootstrap_stderr(f, xs, iters):
+ from lm_eval.metrics import _bootstrap_internal
+ from lm_eval.metrics import sample_stddev
+ res = []
+ chunk_size = min(1000, iters)
+ for i in range(iters // chunk_size):
+ res.extend(_bootstrap_internal(f, chunk_size)((i, xs)))
+ return sample_stddev(res)
+
+ lm_eval.metrics.bootstrap_stderr = no_pool_bootstrap_stderr
+
+ local_rank = os.getenv("LOCAL_RANK", "0")
+ device = torch.device(get_accelerator().device_name(local_rank))
+ dtype = torch.float
+ task_dict = lm_eval.tasks.get_task_dict([task])
+
+ if 'gpt-j-6b' in model_name:
+ dtype = torch.half
+ lm = lm_eval.models.get_model(model_family).create_from_arg_string(f"pretrained={model_name}",
+ {"device": "cpu"})
+ setattr(lm, model_family, getattr(lm, model_family).half().to(device))
+ lm._device = device
+ else:
+ if get_accelerator().device_name() == 'hpu':
+                # lm_eval does not support the HPU device, so load the model on CPU and move it to HPU.
+ lm = lm_eval.models.get_model(model_family).create_from_arg_string(f"pretrained={model_name}",
+ {"device": "cpu"})
+ setattr(lm, model_family, getattr(lm, model_family).to(device))
+ lm._device = device
+ else:
+ lm = lm_eval.models.get_model(model_family).create_from_arg_string(
+ f"pretrained={model_name}", {"device": get_accelerator().device_name()})
+
+ get_accelerator().synchronize()
+ start = time.time()
+ bs_output = lm_eval.evaluator.evaluate(lm=lm, task_dict=task_dict)
+ get_accelerator().synchronize()
+ bs_time = time.time() - start
+
+ getattr(lm, model_family).to("cpu")
+ ds_model = deepspeed.init_inference(
+ getattr(lm, model_family),
+ mp_size=1,
+ dtype=dtype,
+ replace_with_kernel_inject=True,
+ enable_cuda_graph=False,
+ )
+ check_injection(ds_model)
+ setattr(lm, model_family, ds_model)
+ get_accelerator().synchronize()
+ start = time.time()
+ ds_output = lm_eval.evaluator.evaluate(lm=lm, task_dict=task_dict)
+ get_accelerator().synchronize()
+ ds_time = time.time() - start
+
+ ppl_diff = abs(bs_output["results"][task]["ppl"] - ds_output["results"][task]["ppl"])
+ #assert ds_time <= bs_time
+ assert ppl_diff < 0.01
diff --git a/deepspeed/tests/unit/linear/test_quant_param.py b/deepspeed/tests/unit/linear/test_quant_param.py
new file mode 100644
index 000000000000..84a9f766ef74
--- /dev/null
+++ b/deepspeed/tests/unit/linear/test_quant_param.py
@@ -0,0 +1,58 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import pytest
+import torch
+import deepspeed
+
+from deepspeed.accelerator import get_accelerator
+from deepspeed.linear.quantization import QuantizedParameter
+from deepspeed.linear.config import QuantizationConfig
+
+from deepspeed.ops.op_builder import FPQuantizerBuilder
+
+from unit.common import DistributedTest
+
+if not deepspeed.ops.__compatible_ops__[FPQuantizerBuilder.NAME]:
+ pytest.skip("FPQuantizer op is not available on this system", allow_module_level=True)
+
+
+class TestQuantParam(DistributedTest):
+ world_size = 1
+
+ @pytest.mark.parametrize('dtype', [torch.half, torch.float])
+ def test_unsupported_dtypes(self, dtype):
+ device = get_accelerator().current_device_name()
+ data = torch.rand(5, 5, device='cpu', dtype=dtype)
+ qp = QuantizedParameter(data)
+ with pytest.raises(AssertionError):
+ qp.to(device)
+
+ def test_requires_grad(self):
+ data = torch.rand(5, 5, dtype=torch.bfloat16)
+ with pytest.raises(ValueError):
+ QuantizedParameter(data, requires_grad=True)
+
+ def test_move_to_accelerator(self):
+ device = get_accelerator().current_device()
+ data = torch.rand(5, 5, device='cpu', dtype=torch.bfloat16)
+ qp = QuantizedParameter(data)
+ assert qp.device == torch.device('cpu')
+ qp = qp.to(get_accelerator().current_device_name())
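+        # After the move the parameter holds its quantized payload, hence the uint8 dtype below.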
+ assert qp.device == torch.device(device)
+ assert qp.dtype == torch.uint8
+
+ def test_hf_clone(self):
+ device = get_accelerator().current_device_name()
+ data = torch.rand(5, 5, device=device, dtype=torch.bfloat16)
+
+ quantization_config = QuantizationConfig(q_bits=6)
+ qp = QuantizedParameter(data, quantization_config=quantization_config)
+
+ # should be able to clone parameter via dict, HF expects this to work
+ qp_copy = QuantizedParameter(qp.data, **qp.__dict__)
+
+ assert all(qp.data == qp_copy.data)
+ assert qp.quantization_config == qp_copy.quantization_config
diff --git a/deepspeed/tests/unit/ops/fp_quantizer/test_fp8_gemm.py b/deepspeed/tests/unit/ops/fp_quantizer/test_fp8_gemm.py
new file mode 100644
index 000000000000..d66f7c8cb4cc
--- /dev/null
+++ b/deepspeed/tests/unit/ops/fp_quantizer/test_fp8_gemm.py
@@ -0,0 +1,45 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import pytest
+import torch
+import deepspeed
+
+from deepspeed.ops.op_builder import FPQuantizerBuilder
+
+if not deepspeed.ops.__compatible_ops__[FPQuantizerBuilder.NAME]:
+ pytest.skip("FPQuantizer op is not available on this system", allow_module_level=True)
+
+from deepspeed.ops.fp_quantizer import FP_Quantize, matmul_fp8
+
+
+@pytest.mark.parametrize("dtype", [torch.bfloat16], ids=["bf16"])
+@pytest.mark.parametrize("q_bits", [8], ids=[
+ "qbits8",
+])
+@pytest.mark.parametrize("M", [1, 2, 4, 8, 32, 64, 128, 256, 512, 1024, 2048])
+def test_fp_quant(dtype, q_bits, M):
+ quantization_group_size = 128
+ fpq = FP_Quantize(group_size=quantization_group_size)
+
+ N = 8192
+ H = 4096
+
+ x = torch.randn(M, H, dtype=dtype, device='cuda')
+ weight_bf16 = torch.randn(H, N, dtype=dtype, device='cuda')
+
+ weight, _ = fpq.quantize(weight_bf16.data, q_bits=8, return_meta_tensor=True)
+ scale = fpq.get_scales()
+ out = matmul_fp8(
+ x,
+ weight,
+ scale,
+ quantization_group_size,
+ )
+
+ out_q = torch.matmul(x, fpq.dequantize(weight, scale=fpq.scale))
+
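+    # Mean relative error of the fused fp8 matmul against a dequantize-then-matmul reference.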
+ error = ((out - out_q).abs() / (out.abs() + 1e-5)).sum() / out.numel()
+    assert error < 0.004, f"failed on batch-size {M} with error {error}"