From 513f8618de41d882c0831e71c6b939f00a6bfc76 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Tue, 5 May 2020 22:11:10 +0000
Subject: [PATCH 01/66] 1. Disable third_party_install so flow doesn't
 automatically build APEX. 2. Comment out pip uninstall command as it errors
 out if deepspeed isn't already installed

---
 install.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/install.sh b/install.sh
index 8644142d3158..10fc0e77a533 100755
--- a/install.sh
+++ b/install.sh
@@ -30,7 +30,7 @@ hostfile (hostfile: /job/hostfile). If no hostfile exists, will only install loc
 ds_only=0
 tp_only=0
 deepspeed_install=1
-third_party_install=1
+third_party_install=0
 local_only=0
 pip_sudo=0
 entire_dlts_job=1
@@ -162,7 +162,7 @@ fi
 if [ "$local_only" == "1" ]; then
     if [ "$deepspeed_install" == "1" ]; then
         echo "Installing deepspeed"
-        $PIP_SUDO pip uninstall -y deepspeed
+#        $PIP_SUDO pip uninstall -y deepspeed
         $PIP_SUDO $PIP_INSTALL dist/deepspeed*.whl
         python basic_install_test.py
         if [ $? == 0 ]; then

From ed421e9f64bd08cbde2aed3c7a1bfe068c9cc6e9 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Wed, 6 May 2020 00:52:08 +0000
Subject: [PATCH 02/66] Update setup.py to hipify before building extension

---
 setup.py | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 7957e371bf22..fae21fe4ad5d 100644
--- a/setup.py
+++ b/setup.py
@@ -13,6 +13,7 @@
 from deepspeed import __version__ as ds_version
 from setuptools import setup, find_packages
 from torch.utils.cpp_extension import CUDAExtension, BuildExtension
+from torch.utils.hipify import hipify_python
 
 cmdclass = {}
 ext_modules = []
@@ -42,7 +43,21 @@
     version_ge_1_5 = ['-DVERSION_GE_1_5']
 version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5
 
-ext_modules.append(
+is_rocm_pytorch = False
+if torch.__version__ >= '1.5':
+    from torch.utils.cpp_extension import ROCM_HOME
+    is_rocm_pytorch = True if ((torch.version.hip is not None) and (ROCM_HOME is not None)) else False
+
+if is_rocm_pytorch:
+    import shutil
+    this_dir = os.path.dirname(os.path.abspath(__file__))
+#    with hipify_python.GeneratedFileCleaner(keep_intermediates=True) as clean_ctx:
+    hipify_python.hipify(project_directory=this_dir, output_directory=this_dir, includes="csrc/*",
+                                show_detailed=True, is_pytorch_extension=True) #, clean_ctx=clean_ctx)
+    shutil.copy("csrc/type_shim.h", "csrc/hip/type_shim.h")
+
+if not is_rocm_pytorch:
+  ext_modules.append(
     CUDAExtension(name='fused_lamb_cuda',
                   sources=['csrc/fused_lamb_cuda.cpp',
                            'csrc/fused_lamb_cuda_kernel.cu'],
@@ -53,6 +68,18 @@
                       'nvcc': ['-O3',
                                '--use_fast_math'] + version_dependent_macros
                   }))
+else:
+  ext_modules.append(
+    CUDAExtension(name='fused_lamb_cuda',
+                  sources=['csrc/fused_lamb_cuda.cpp',
+                           'csrc/hip/fused_lamb_hip_kernel.hip'],
+                  extra_compile_args={
+                      'cxx': [
+                          '-O3',
+                      ] + version_dependent_macros,
+                      'nvcc': []
+                  }))
+
 
 setup(name='deepspeed',
       version=ds_version,

From e82fa34cfe357ea151ad292ee6e0d1098ceb3caf Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Wed, 6 May 2020 00:53:11 +0000
Subject: [PATCH 03/66] Cooperative groups is not supported by HIP yet, so
 replace with workaround

---
 csrc/fused_lamb_cuda_kernel.cu | 56 +++++++++++++++++-----------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/csrc/fused_lamb_cuda_kernel.cu b/csrc/fused_lamb_cuda_kernel.cu
index b79f5af82332..27250314bd93 100644
--- a/csrc/fused_lamb_cuda_kernel.cu
+++ b/csrc/fused_lamb_cuda_kernel.cu
@@ -15,11 +15,8 @@
 
 //#include <helper_functions.h>
 #include <cuda_runtime_api.h>
-#include <cooperative_groups.h>
 #include <stdio.h>
 
-namespace cg = cooperative_groups;
-
 // Utility class used to avoid linker errors with extern
 // unsized shared memory arrays with templated type
 namespace {
@@ -72,16 +69,13 @@ template <typename T, int blockSize>
 __device__ void
 reduce_block_in_shared_memory(T *s_a, T *s_b, T* g_a, T* g_b)
 {
-    // Handle to thread block group
-    cg::thread_block cta = cg::this_thread_block();
-
     // perform block reduction in shared memory,
-    unsigned int tid = cta.thread_rank();
+    unsigned int tid = threadIdx.x + blockDim.x * threadIdx.y;
 
     T a_sum = s_a[tid];
     T b_sum = s_b[tid];
 
-    cg::sync(cta);
+    __syncthreads();
 
     // do reduction in shared mem
     if ((blockSize >= 512) && (tid < 256))
@@ -91,7 +85,7 @@ reduce_block_in_shared_memory(T *s_a, T *s_b, T* g_a, T* g_b)
 
     }
 
-    cg::sync(cta);
+    __syncthreads();
 
     if ((blockSize >= 256) && (tid < 128))
     {
@@ -100,7 +94,7 @@ reduce_block_in_shared_memory(T *s_a, T *s_b, T* g_a, T* g_b)
 
     }
 
-    cg::sync(cta);
+    __syncthreads();
 
     if ((blockSize >= 128) && (tid < 64))
     {
@@ -109,13 +103,19 @@ reduce_block_in_shared_memory(T *s_a, T *s_b, T* g_a, T* g_b)
 
     }
 
-    cg::sync(cta);
+    __syncthreads();
 
-#if (__CUDA_ARCH__ >= 300 )
-    if ( tid < 32 )
+#if defined(__HIP_PLATFORM_HCC__)
+    // Reduce final warp using shuffle
+    for (int offset = warpSize/2; offset > 0; offset /= 2)
     {
-        cg::coalesced_group active = cg::coalesced_threads();
+         a_sum += __shfl_down(a_sum, offset);
+         b_sum += __shfl_down(b_sum, offset);
 
+    }
+#elif (__CUDA_ARCH__ >= 300 )
+    if ( tid < 32 )
+    {
         // Fetch final intermediate sum from 2nd warp
         if (blockSize >=  64)
         {
@@ -126,8 +126,8 @@ reduce_block_in_shared_memory(T *s_a, T *s_b, T* g_a, T* g_b)
         // Reduce final warp using shuffle
         for (int offset = warpSize/2; offset > 0; offset /= 2)
         {
-             a_sum += active.shfl_down(a_sum, offset);
-             b_sum += active.shfl_down(b_sum, offset);
+             a_sum += __shfl_down(a_sum, offset);
+             b_sum += __shfl_down(b_sum, offset);
 
         }
     }
@@ -139,7 +139,7 @@ reduce_block_in_shared_memory(T *s_a, T *s_b, T* g_a, T* g_b)
 
     }
 
-    cg::sync(cta);
+    __syncthreads();
 
     if ((blockSize >= 32) && (tid < 16))
     {
@@ -148,7 +148,7 @@ reduce_block_in_shared_memory(T *s_a, T *s_b, T* g_a, T* g_b)
 
     }
 
-    cg::sync(cta);
+    __syncthreads();
 
     if ((blockSize >= 16) && (tid < 8))
     {
@@ -157,7 +157,7 @@ reduce_block_in_shared_memory(T *s_a, T *s_b, T* g_a, T* g_b)
 
     }
 
-    cg::sync(cta);
+    __syncthreads();
 
     if ((blockSize >= 8) && (tid < 4))
     {
@@ -166,7 +166,7 @@ reduce_block_in_shared_memory(T *s_a, T *s_b, T* g_a, T* g_b)
 
     }
 
-    cg::sync(cta);
+    __syncthreads();
 
     if ((blockSize >= 4) && (tid < 2))
     {
@@ -175,7 +175,7 @@ reduce_block_in_shared_memory(T *s_a, T *s_b, T* g_a, T* g_b)
 
     }
 
-    cg::sync(cta);
+    __syncthreads();
 
     if ((blockSize >= 2) && (tid < 1))
     {
@@ -184,7 +184,7 @@ reduce_block_in_shared_memory(T *s_a, T *s_b, T* g_a, T* g_b)
 
     }
 
-    cg::sync(cta);
+    __syncthreads();
 
 #endif
 
@@ -198,10 +198,10 @@ reduce_block_in_shared_memory(T *s_a, T *s_b, T* g_a, T* g_b)
 template <typename T, int blockSize>
 __device__ void reduce_two_vectors_in_register(T a, T b, T* g_a, T* g_b){
 
-    const int threadIdInBlock = cg::this_thread_block().thread_rank();
+    const int threadIdInBlock = threadIdx.x + blockDim.x * threadIdx.y;
 
     T *s_a = SharedMemory<T>();
-    T *s_b = SharedMemory<T>() + cg::this_thread_block().size();
+    T *s_b = SharedMemory<T>() + (blockDim.x * blockDim.y);
 
     s_a[threadIdInBlock] = a;
     s_b[threadIdInBlock] = b;
@@ -232,7 +232,7 @@ __global__ void lamb_cuda_kernel_part1(
         //Assuming 2D grids and 2D blocks
         const int blockId = gridDim.x * blockIdx.y + blockIdx.x;
         const int threadsPerBlock = blockDim.x * blockDim.y;
-        const int threadIdInBlock = cg::this_thread_block().thread_rank();
+        const int threadIdInBlock = threadIdx.x + blockDim.x * threadIdx.y;
         const int i = (blockId * threadsPerBlock + threadIdInBlock);
         const int totThreads = gridDim.x*gridDim.y*threadsPerBlock;
 
@@ -268,9 +268,9 @@ __global__ void lamb_cuda_kernel_part2(
 {
 
     T *s_a = SharedMemory<T>() ;
-    T *s_b = SharedMemory<T>() + cg::this_thread_block().size();
+    T *s_b = SharedMemory<T>() + (blockDim.x * blockDim.y);
 
-    const int threadIdInBlock = cg::this_thread_block().thread_rank();
+    const int threadIdInBlock = threadIdx.x + blockDim.x * threadIdx.y;
 
     s_a[threadIdInBlock] = g_a[threadIdInBlock];
     s_b[threadIdInBlock] = g_b[threadIdInBlock];
@@ -309,7 +309,7 @@ __global__ void lamb_cuda_kernel_part2(
         //Assuming 2D grids and 2D blocks
         const int blockId = gridDim.x * blockIdx.y + blockIdx.x;
         const int threadsPerBlock = blockDim.x * blockDim.y;
-        const int threadIdInBlock = cg::this_thread_block().thread_rank();
+        const int threadIdInBlock = threadIdx.x + blockDim.x * threadIdx.y;
         const int i = (blockId * threadsPerBlock + threadIdInBlock);
         const int totThreads = gridDim.x*gridDim.y*threadsPerBlock;
 

From bdb8421925eda4b080ecbf59e50940bda0405d60 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Fri, 8 May 2020 00:27:20 +0000
Subject: [PATCH 04/66] Use ROCm APEX

---
 .gitmodules | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitmodules b/.gitmodules
index 1257dc13e0f4..1443f33f54a5 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "third_party/apex"]
 	path = third_party/apex
-	url = https://github.com/NVIDIA/apex.git
+	url = https://github.com/ROCmSoftwarePlatform/apex.git
 [submodule "DeepSpeedExamples"]
 	path = DeepSpeedExamples
 	url = https://github.com/microsoft/DeepSpeedExamples

From 7c0e6ac8e94835addb70d3431a4ab0b947910c16 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Fri, 8 May 2020 00:55:27 +0000
Subject: [PATCH 05/66] Update ROCm APEX commit to get FusedLayerNorm and
 hipification fixes

---
 third_party/apex | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/apex b/third_party/apex
index 494f8ab3fc1b..2d0f9cf20f3c 160000
--- a/third_party/apex
+++ b/third_party/apex
@@ -1 +1 @@
-Subproject commit 494f8ab3fc1b0b26949a3bcbb2bcac78008d48c1
+Subproject commit 2d0f9cf20f3c998293225c633e3ec42f68edbba4

From 23200d4d9c943a12c6befbf9b2ae5308173c4cc5 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Fri, 8 May 2020 01:14:03 +0000
Subject: [PATCH 06/66] Update requirements to use tensorflow-rocm package
 instead of tensorflow-gpu

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 6ac3b2a446a0..b7af2bb78c69 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ pillow==6.2.2
 tqdm
 psutil
 tensorboardX==1.8
-tensorflow-gpu==1.15.2
+tensorflow-rocm=2.1.1
 pytest
 pytest-forked
 pre-commit

From c10bdcbd0391d9958d8d2a8cad291628baaef753 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Fri, 8 May 2020 01:34:54 +0000
Subject: [PATCH 07/66] Use DeepSpeedExamples fork

---
 .gitmodules | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitmodules b/.gitmodules
index 1443f33f54a5..2d83bbf9eb3e 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -3,5 +3,5 @@
 	url = https://github.com/ROCmSoftwarePlatform/apex.git
 [submodule "DeepSpeedExamples"]
 	path = DeepSpeedExamples
-	url = https://github.com/microsoft/DeepSpeedExamples
+	url = https://github.com/jithunnair-amd/DeepSpeedExamples.git
 	branch = master

From 43212b37480663159e0abc268b0481922c3cf691 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Fri, 8 May 2020 22:01:31 +0000
Subject: [PATCH 08/66] Typo

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index b7af2bb78c69..bea33f2c9ccf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ pillow==6.2.2
 tqdm
 psutil
 tensorboardX==1.8
-tensorflow-rocm=2.1.1
+tensorflow-rocm==2.1.1
 pytest
 pytest-forked
 pre-commit

From 54ad8a5455ab5b5ffd11b8c7dc3789f6fdd5b15d Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Sat, 16 May 2020 22:13:37 +0000
Subject: [PATCH 09/66] Use changes_for_rocm_build branch for jithunnair-amd
 fork of DeepSpeedExamples

---
 .gitmodules | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitmodules b/.gitmodules
index 2d83bbf9eb3e..695218a37e0a 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,4 +4,4 @@
 [submodule "DeepSpeedExamples"]
 	path = DeepSpeedExamples
 	url = https://github.com/jithunnair-amd/DeepSpeedExamples.git
-	branch = master
+	branch = changes_for_rocm_build 

From 453d50102707920ca233fdfb2eaac501d3f0f9b7 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Thu, 28 May 2020 20:51:09 +0000
Subject: [PATCH 10/66] Update ROCm APEX commit

---
 third_party/apex | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/apex b/third_party/apex
index 2d0f9cf20f3c..38ade0a025c1 160000
--- a/third_party/apex
+++ b/third_party/apex
@@ -1 +1 @@
-Subproject commit 2d0f9cf20f3c998293225c633e3ec42f68edbba4
+Subproject commit 38ade0a025c1dc256262af48db3a9e0f890e8def

From 4454bc2de66b93a601ee69e17e1e529a17abe7bf Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Fri, 29 May 2020 20:52:20 +0000
Subject: [PATCH 11/66] Update DeepSpeedExamples commit

---
 DeepSpeedExamples | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DeepSpeedExamples b/DeepSpeedExamples
index 274787a189b2..e23370ae5a03 160000
--- a/DeepSpeedExamples
+++ b/DeepSpeedExamples
@@ -1 +1 @@
-Subproject commit 274787a189b265814ed75dd5ddeae2dce026ea88
+Subproject commit e23370ae5a038bc3e0b1d2d86d23df07daafbdb9

From db28f752522aac7dc070e40d521fc3fe00d4bf8f Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Thu, 25 Jun 2020 21:27:00 +0000
Subject: [PATCH 12/66] Update ROCm Apex commit

---
 third_party/apex | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/apex b/third_party/apex
index 38ade0a025c1..7e099371e7cc 160000
--- a/third_party/apex
+++ b/third_party/apex
@@ -1 +1 @@
-Subproject commit 38ade0a025c1dc256262af48db3a9e0f890e8def
+Subproject commit 7e099371e7ccdaf82058d7db9646269f4756a21b

From 077638dabbd8a6183758f88352ba69fad53bf98a Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Tue, 15 Sep 2020 18:02:32 +0000
Subject: [PATCH 13/66] Enable cooperative groups for ROCm

---
 csrc/lamb/fused_lamb_cuda_kernel.cu | 44 +++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/csrc/lamb/fused_lamb_cuda_kernel.cu b/csrc/lamb/fused_lamb_cuda_kernel.cu
index c94e9bb9562c..89ba068fbc1f 100644
--- a/csrc/lamb/fused_lamb_cuda_kernel.cu
+++ b/csrc/lamb/fused_lamb_cuda_kernel.cu
@@ -14,7 +14,11 @@
 #include <iostream>
 
 //#include <helper_functions.h>
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+#include <hip/hip_cooperative_groups.h>
+#else
 #include <cooperative_groups.h>
+#endif
 #include <cuda_runtime_api.h>
 #include <stdio.h>
 
@@ -76,7 +80,11 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
     T a_sum = s_a[tid];
     T b_sum = s_b[tid];
 
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
     cg::sync(cta);
+#endif
 
     // do reduction in shared mem
     if ((blockSize >= 512) && (tid < 256)) {
@@ -84,21 +92,33 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
         s_b[tid] = b_sum = b_sum + s_b[tid + 256];
     }
 
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
     cg::sync(cta);
+#endif
 
     if ((blockSize >= 256) && (tid < 128)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 128];
         s_b[tid] = b_sum = b_sum + s_b[tid + 128];
     }
 
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
     cg::sync(cta);
+#endif
 
     if ((blockSize >= 128) && (tid < 64)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 64];
         s_b[tid] = b_sum = b_sum + s_b[tid + 64];
     }
 
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
     cg::sync(cta);
+#endif
 
 #if (__CUDA_ARCH__ >= 300)
     if (tid < 32) {
@@ -122,42 +142,66 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
         s_b[tid] = b_sum = b_sum + s_b[tid + 32];
     }
 
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
     cg::sync(cta);
+#endif
 
     if ((blockSize >= 32) && (tid < 16)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 16];
         s_b[tid] = b_sum = b_sum + s_b[tid + 16];
     }
 
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
     cg::sync(cta);
+#endif
 
     if ((blockSize >= 16) && (tid < 8)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 8];
         s_b[tid] = b_sum = b_sum + s_b[tid + 8];
     }
 
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
     cg::sync(cta);
+#endif
 
     if ((blockSize >= 8) && (tid < 4)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 4];
         s_b[tid] = b_sum = b_sum + s_b[tid + 4];
     }
 
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
     cg::sync(cta);
+#endif
 
     if ((blockSize >= 4) && (tid < 2)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 2];
         s_b[tid] = b_sum = b_sum + s_b[tid + 2];
     }
 
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
     cg::sync(cta);
+#endif
 
     if ((blockSize >= 2) && (tid < 1)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 1];
         s_b[tid] = b_sum = b_sum + s_b[tid + 1];
     }
 
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
     cg::sync(cta);
+#endif
 
 #endif
 

From 66c135e2a5dfca226166d5ae4eafbe1e827e6ef6 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Tue, 15 Sep 2020 18:05:23 +0000
Subject: [PATCH 14/66] Update setup.py to build lamb extension for ROCm

---
 setup.py | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/setup.py b/setup.py
index 55459395ec79..30f8812b0cf2 100755
--- a/setup.py
+++ b/setup.py
@@ -16,6 +16,7 @@
 import cpufeature
 from setuptools import setup, find_packages
 from torch.utils.cpp_extension import CUDAExtension, BuildExtension, CppExtension
+from torch.utils.hipify import hipify_python
 
 VERSION = "0.3.0"
 
@@ -119,23 +120,37 @@ def fetch_requirements(path):
     SIMD_WIDTH = '-D__AVX256__'
 print("SIMD_WIDTH = ", SIMD_WIDTH)
 
+is_rocm_pytorch = False
+if torch.__version__ >= '1.5':
+    from torch.utils.cpp_extension import ROCM_HOME
+    is_rocm_pytorch = True if ((torch.version.hip is not None) and (ROCM_HOME is not None)) else False
+
+if is_rocm_pytorch:
+    import shutil
+    this_dir = os.path.dirname(os.path.abspath(__file__))
+    hipify_python.hipify(project_directory=this_dir, output_directory=this_dir, includes="csrc/*",
+                                show_detailed=True, is_pytorch_extension=True)
+
 ext_modules = []
 
 ## Lamb ##
 if BUILD_MASK & DS_BUILD_LAMB:
+    nvcc_flags=['-O3'] + version_dependent_macros
+    if is_rocm_pytorch:
+        sources = ['csrc/lamb/hip/fused_lamb_hip.cpp', 'csrc/lamb/hip/fused_lamb_hip_kernel.hip']
+    else:
+        sources = ['csrc/lamb/fused_lamb_cuda.cpp', 'csrc/lamb/fused_lamb_cuda_kernel.cu']
+        nvcc_flags.extend(['--use_fast_math'])
+
     ext_modules.append(
         CUDAExtension(name='deepspeed.ops.lamb.fused_lamb_cuda',
-                      sources=[
-                          'csrc/lamb/fused_lamb_cuda.cpp',
-                          'csrc/lamb/fused_lamb_cuda_kernel.cu'
-                      ],
+                      sources=sources,
                       include_dirs=['csrc/includes'],
                       extra_compile_args={
                           'cxx': [
                               '-O3',
                           ] + version_dependent_macros,
-                          'nvcc': ['-O3',
-                                   '--use_fast_math'] + version_dependent_macros
+                          'nvcc': nvcc_flags
                       }))
 
 ## Adam ##

From 9379918063a48595ad648e5b725eea46fc1dfa2c Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Tue, 15 Sep 2020 22:43:42 +0000
Subject: [PATCH 15/66] Do not install torch and torchvision for ROCm using pip

---
 requirements/requirements-rocm.txt |  6 ++++++
 setup.py                           | 12 +++++++-----
 2 files changed, 13 insertions(+), 5 deletions(-)
 create mode 100644 requirements/requirements-rocm.txt

diff --git a/requirements/requirements-rocm.txt b/requirements/requirements-rocm.txt
new file mode 100644
index 000000000000..54ed5f4b9e0c
--- /dev/null
+++ b/requirements/requirements-rocm.txt
@@ -0,0 +1,6 @@
+#torch>=1.2
+#torchvision>=0.4.0
+tqdm
+psutil
+cpufeature
+tensorboardX==1.8
diff --git a/setup.py b/setup.py
index 30f8812b0cf2..451e63333083 100755
--- a/setup.py
+++ b/setup.py
@@ -20,6 +20,10 @@
 
 VERSION = "0.3.0"
 
+is_rocm_pytorch = False
+if torch.__version__ >= '1.5':
+    from torch.utils.cpp_extension import ROCM_HOME
+    is_rocm_pytorch = True if ((torch.version.hip is not None) and (ROCM_HOME is not None)) else False
 
 def fetch_requirements(path):
     with open(path, 'r') as fd:
@@ -27,6 +31,9 @@ def fetch_requirements(path):
 
 
 install_requires = fetch_requirements('requirements/requirements.txt')
+if is_rocm_pytorch:
+    print("NOTE: Please manually install torch and torchvision packages for ROCm")
+    install_requires = fetch_requirements('requirements/requirements-rocm.txt')
 dev_requires = fetch_requirements('requirements/requirements-dev.txt')
 sparse_attn_requires = fetch_requirements('requirements/requirements-sparse-attn.txt')
 
@@ -120,11 +127,6 @@ def fetch_requirements(path):
     SIMD_WIDTH = '-D__AVX256__'
 print("SIMD_WIDTH = ", SIMD_WIDTH)
 
-is_rocm_pytorch = False
-if torch.__version__ >= '1.5':
-    from torch.utils.cpp_extension import ROCM_HOME
-    is_rocm_pytorch = True if ((torch.version.hip is not None) and (ROCM_HOME is not None)) else False
-
 if is_rocm_pytorch:
     import shutil
     this_dir = os.path.dirname(os.path.abspath(__file__))

From b5866a62001f93c823982ae9bb8f8fee142a1b49 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Wed, 16 Sep 2020 21:51:45 +0000
Subject: [PATCH 16/66] Use ROCm fork of DeepSpeedExamples

---
 .gitmodules | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 695218a37e0a..57c5c32b3b44 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -3,5 +3,4 @@
 	url = https://github.com/ROCmSoftwarePlatform/apex.git
 [submodule "DeepSpeedExamples"]
 	path = DeepSpeedExamples
-	url = https://github.com/jithunnair-amd/DeepSpeedExamples.git
-	branch = changes_for_rocm_build 
+	url = https://github.com/ROCmSoftwarePlatform/DeepSpeedExamples.git

From 9c624c211a7129b26e6e6cc6991a0b04c45dc5f6 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Wed, 16 Sep 2020 22:12:07 +0000
Subject: [PATCH 17/66] Update DeepSpeedExamples commit to use ROCm fork master
 branch

---
 DeepSpeedExamples | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DeepSpeedExamples b/DeepSpeedExamples
index e23370ae5a03..c43a022931a2 160000
--- a/DeepSpeedExamples
+++ b/DeepSpeedExamples
@@ -1 +1 @@
-Subproject commit e23370ae5a038bc3e0b1d2d86d23df07daafbdb9
+Subproject commit c43a022931a2946a945a6931788be640aafa59db

From ab6aca12e12fb772d045713b49796f04e9555237 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Sat, 26 Sep 2020 05:33:43 +0000
Subject: [PATCH 18/66] Update DeepSpeedExamples commit

---
 DeepSpeedExamples | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DeepSpeedExamples b/DeepSpeedExamples
index c43a022931a2..5e63c68085ad 160000
--- a/DeepSpeedExamples
+++ b/DeepSpeedExamples
@@ -1 +1 @@
-Subproject commit c43a022931a2946a945a6931788be640aafa59db
+Subproject commit 5e63c68085adab099a78f57bc0fa88664f540fba

From 884f08ec9406b17f5e98d82a550db6a08993185a Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Sat, 26 Sep 2020 08:38:17 +0000
Subject: [PATCH 19/66] ROCm PyTorch can be installed in the user local area in
 some cases

---
 install.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/install.sh b/install.sh
index 587866a49751..d8a28a757f7f 100755
--- a/install.sh
+++ b/install.sh
@@ -201,7 +201,7 @@ if [ "$local_only" == "1" ]; then
 #        $PIP_SUDO pip uninstall -y deepspeed
         $PIP_SUDO $PIP_INSTALL dist/deepspeed*.whl
 	# -I to exclude local directory files
-        python -I basic_install_test.py
+        python basic_install_test.py
         if [ $? == 0 ]; then
             echo "Installation is successful"
         else

From 17febe56774a1542479d2f2555e96aae7ff84fd7 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Tue, 29 Sep 2020 17:33:40 +0000
Subject: [PATCH 20/66] Remove requirements.txt since upstream moved it to
 requirements folder

---
 requirements.txt | 10 ----------
 1 file changed, 10 deletions(-)
 delete mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index bea33f2c9ccf..000000000000
--- a/requirements.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-torch>=1.2
-torchvision>=0.4.0
-pillow==6.2.2
-tqdm
-psutil
-tensorboardX==1.8
-tensorflow-rocm==2.1.1
-pytest
-pytest-forked
-pre-commit

From 46d64e2d87bd28a7d52a78d8c9bad452c6243ed3 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Wed, 30 Sep 2020 15:56:16 +0000
Subject: [PATCH 21/66] Add Dockerfile for ROCm

---
 docker/Dockerfile.rocm | 174 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 174 insertions(+)
 create mode 100644 docker/Dockerfile.rocm

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
new file mode 100644
index 000000000000..2578d98f0749
--- /dev/null
+++ b/docker/Dockerfile.rocm
@@ -0,0 +1,174 @@
+FROM rocm/pytorch:latest
+
+
+##############################################################################
+# Temporary Installation Directory
+##############################################################################
+ENV STAGE_DIR=/tmp
+RUN mkdir -p ${STAGE_DIR}
+
+##############################################################################
+# Installation/Basic Utilities
+##############################################################################
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        software-properties-common build-essential autotools-dev \
+        nfs-common pdsh \
+        cmake g++ gcc \
+        curl wget vim tmux emacs less unzip \
+        htop iftop iotop ca-certificates openssh-client openssh-server \
+        rsync iputils-ping net-tools sudo \
+        llvm-9-dev
+
+##############################################################################
+# Installation Latest Git
+##############################################################################
+RUN add-apt-repository ppa:git-core/ppa -y && \
+    apt-get update && \
+    apt-get install -y git && \
+    git --version
+
+##############################################################################
+# Client Liveness & Uncomment Port 22 for SSH Daemon
+##############################################################################
+# Keep SSH client alive from server side
+RUN echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config
+RUN cp /etc/ssh/sshd_config ${STAGE_DIR}/sshd_config && \
+    sed "0,/^#Port 22/s//Port 22/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
+
+##############################################################################
+# Mellanox OFED
+##############################################################################
+#ENV MLNX_OFED_VERSION=4.6-1.0.1.1
+#RUN apt-get install -y libnuma-dev
+#RUN cd ${STAGE_DIR} && \
+#    wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64.tgz | tar xzf - && \
+#    cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64 && \
+#    ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \
+#    cd ${STAGE_DIR} && \
+#    rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64*
+
+##############################################################################
+# OPENMPI
+##############################################################################
+ENV OPENMPI_BASEVERSION=4.0
+ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.1
+RUN cd ${STAGE_DIR} && \
+    wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \
+    cd openmpi-${OPENMPI_VERSION} && \
+    ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \
+    make -j"$(nproc)" install && \
+    ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \
+    # Sanity check:
+    test -f /usr/local/mpi/bin/mpic++ && \
+    cd ${STAGE_DIR} && \
+    rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
+ENV PATH=/usr/local/mpi/bin:${PATH} \
+    LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
+# Create a wrapper for OpenMPI to allow running as root by default
+RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
+    echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
+    echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
+    chmod a+x /usr/local/mpi/bin/mpirun
+
+##############################################################################
+# Python
+##############################################################################
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHON_VERSION=3.6
+RUN apt-get install -y python3.6 python3.6-dev && \
+    rm -f /usr/bin/python && \
+    ln -s /usr/bin/python3.6 /usr/bin/python && \
+    curl -O https://bootstrap.pypa.io/get-pip.py && \
+        python get-pip.py && \
+        rm get-pip.py && \
+    pip install --upgrade pip && \
+    # Print python an pip version
+    python -V && pip -V
+RUN pip install pyyaml
+RUN pip install ipython
+
+##############################################################################
+# TensorFlow
+##############################################################################
+RUN pip install tensorflow-rocm
+
+##############################################################################
+# Some Packages
+##############################################################################
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        libsndfile-dev \
+        libjpeg-dev \
+        libpng-dev \
+        screen
+RUN pip install psutil \
+                yappi \
+                cffi \
+                ipdb \
+                pandas \
+                matplotlib \
+                py3nvml \
+                pyarrow \
+                graphviz \
+                astor \
+                boto3 \
+                tqdm \
+                sentencepiece \
+                msgpack \
+                requests \
+                pandas \
+                sphinx \
+                sphinx_rtd_theme \
+                scipy \
+                numpy \
+                sklearn \
+                scikit-learn \
+                mpi4py
+
+##############################################################################
+## SSH daemon port inside container cannot conflict with host OS port
+###############################################################################
+ENV SSH_PORT=2222
+RUN cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
+    sed "0,/^#Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
+
+##############################################################################
+# PyTorch
+##############################################################################
+#ENV PYTORCH_VERSION=1.2.0
+#ENV TORCHVISION_VERSION=0.4.0
+#ENV TENSORBOARDX_VERSION=1.8
+#RUN pip install torch==${PYTORCH_VERSION}
+#RUN pip install torchvision==${TORCHVISION_VERSION}
+#RUN pip install tensorboardX==${TENSORBOARDX_VERSION}
+
+##############################################################################
+# PyYAML build issue
+# https://stackoverflow.com/a/53926898
+##############################################################################
+RUN rm -rf /usr/lib/python3/dist-packages/yaml && \
+    rm -rf /usr/lib/python3/dist-packages/PyYAML-*
+
+##############################################################################
+## Add deepspeed user
+###############################################################################
+# Add a deepspeed user with user id 8877
+#RUN useradd --create-home --uid 8877 deepspeed
+#RUN useradd --create-home --uid 1000 --shell /bin/bash deepspeed
+#RUN usermod -aG sudo deepspeed
+#RUN echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
+# # Change to non-root privilege
+#USER deepspeed
+
+##############################################################################
+# DeepSpeed
+##############################################################################
+RUN git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
+RUN cd ${STAGE_DIR}/DeepSpeed && \
+    git checkout . && \
+    git checkout master && \
+    ./install.sh --third_party_only --allow_sudo && \
+    DS_BUILD_CUDA=0 DS_BUILD_LAMB=1 ./install.sh --allow_sudo
+RUN rm -rf ${STAGE_DIR}/DeepSpeed
+RUN cd ~ && python -c "import deepspeed; print(deepspeed.__version__)"

From c2d4cc0896eac34c1f8d9e17ec7048015f30a5ff Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Wed, 30 Sep 2020 22:31:42 +0000
Subject: [PATCH 22/66] Add skips for unit tests that fail on ROCm. Current
 status: 72 passed, 149 skipped

---
 tests/unit/common.py                  | 12 ++++++++++++
 tests/unit/test_adam_acuracy.py       |  2 ++
 tests/unit/test_checkpointing.py      |  9 ++++++++-
 tests/unit/test_config.py             |  5 ++++-
 tests/unit/test_cuda_backward.py      |  2 ++
 tests/unit/test_cuda_forward.py       |  4 ++++
 tests/unit/test_dist.py               |  6 ++++--
 tests/unit/test_dynamic_loss_scale.py |  8 +++++++-
 tests/unit/test_fp16.py               | 19 ++++++++++++++++++-
 tests/unit/test_lr_schedulers.py      |  3 ++-
 tests/unit/test_multi_output_model.py |  4 +++-
 tests/unit/test_partition.py          |  4 +++-
 tests/unit/test_pipe.py               |  3 ++-
 tests/unit/test_pipe_module.py        |  3 ++-
 tests/unit/test_topology.py           |  4 +++-
 15 files changed, 76 insertions(+), 12 deletions(-)

diff --git a/tests/unit/common.py b/tests/unit/common.py
index c04dfb72fd46..87f2fce2ade6 100644
--- a/tests/unit/common.py
+++ b/tests/unit/common.py
@@ -6,10 +6,22 @@
 from torch.multiprocessing import Process
 
 import pytest
+from functools import wraps
+import unittest
 
 # Worker timeout *after* the first worker has completed.
 DEEPSPEED_UNIT_WORKER_TIMEOUT = 120
 
+TEST_WITH_ROCM = os.getenv('DEEPSPEED_TEST_WITH_ROCM', '0') == '1'
+
+def skipIfRocm(fn):
+    @wraps(fn)
+    def wrapper(*args, **kwargs):
+        if TEST_WITH_ROCM:
+            raise unittest.SkipTest("test doesn't currently work on the ROCm stack")
+        else:
+            fn(*args, **kwargs)
+    return wrapper
 
 def distributed_test(world_size=2, backend='nccl'):
     """A decorator for executing a function (e.g., a unit test) in a distributed manner.
diff --git a/tests/unit/test_adam_acuracy.py b/tests/unit/test_adam_acuracy.py
index f61b6ecba58b..3e94ec6fe613 100755
--- a/tests/unit/test_adam_acuracy.py
+++ b/tests/unit/test_adam_acuracy.py
@@ -6,6 +6,7 @@
 import pytest
 import copy
 
+from common import skipIfRocm
 from deepspeed.ops.adam import DeepSpeedCPUAdam
 
 
@@ -27,6 +28,7 @@ def check_equal(first, second, atol=1e-2, verbose=False):
                              (1024),
                              (1048576),
                          ]) # yapf: disable
+@skipIfRocm
 def test_adam_opt(model_size):
     device = 'cpu'
     rng_state = torch.get_rng_state()
diff --git a/tests/unit/test_checkpointing.py b/tests/unit/test_checkpointing.py
index d08addb936d8..ab28a7d5da29 100755
--- a/tests/unit/test_checkpointing.py
+++ b/tests/unit/test_checkpointing.py
@@ -16,7 +16,7 @@
 import json
 import os
 import numbers
-from common import distributed_test
+from common import distributed_test, skipIfRocm
 from simple_model import *
 
 
@@ -151,6 +151,7 @@ def checkpoint_correctness_verification(args,
         compare_lr_scheduler_states(trained_model, loaded_model)
 
 
+@skipIfRocm
 def test_checkpoint_unfused_optimizer(tmpdir):
     config_dict = {
         "train_batch_size": 2,
@@ -209,6 +210,7 @@ def _test_checkpoint_unfused_optimizer(args,
                                        load_optimizer_states=False)
 
 
+@skipIfRocm
 def test_checkpoint_fused_optimizer(tmpdir):
     config_dict = {
         "train_batch_size": 2,
@@ -263,6 +265,7 @@ def _test_checkpoint_fused_optimizer(args, model, hidden_dim, load_optimizer_sta
                               True,
                               'deepspeed_adam'),
                          ])
+@skipIfRocm
 def test_checkpoint_zero_optimizer(tmpdir, zero_stage, use_cpu_offload, adam_optimizer):
     config_dict = {
         "train_batch_size": 2,
@@ -316,6 +319,7 @@ def _test_checkpoint_zero_optimizer(args, model, hidden_dim, load_optimizer_stat
                               True,
                               'deepspeed_adam'),
                          ])
+@skipIfRocm
 def test_checkpoint_zero_no_optimizer(tmpdir,
                                       zero_stage,
                                       use_cpu_offload,
@@ -378,6 +382,7 @@ def _test_checkpoint_zero_no_optimizer(args,
                               True,
                               'deepspeed_adam'),
                          ])
+@skipIfRocm
 def test_checkpoint_lr_scheduler(tmpdir, zero_stage, use_cpu_offload, adam_optimizer):
     config_dict = {
         "train_batch_size": 2,
@@ -449,6 +454,7 @@ def _test_checkpoint_lr_scheduler(args,
                               True,
                               'deepspeed_adam'),
                          ])
+@skipIfRocm
 def test_checkpoint_no_lr_scheduler(tmpdir, zero_stage, use_cpu_offload, adam_optimizer):
     config_dict = {
         "train_batch_size": 2,
@@ -501,6 +507,7 @@ def _test_checkpoint_no_lr_scheduler(args,
                                      load_lr_scheduler_states=False)
 
 
+@skipIfRocm
 def test_checkpoint_fp32_optimizer(tmpdir):
     config_dict = {
         "train_batch_size": 2,
diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py
index e5fe75b281e0..baf27c165ba0 100755
--- a/tests/unit/test_config.py
+++ b/tests/unit/test_config.py
@@ -3,7 +3,7 @@
 import pytest
 import json
 import argparse
-from common import distributed_test
+from common import distributed_test, skipIfRocm
 from simple_model import SimpleModel, create_config_from_dict, random_dataloader
 import torch.distributed as dist
 
@@ -56,6 +56,7 @@ def _batch_assert(status, ds_config, batch, micro_batch, gas, success):
                          (2,32,8,2,True),
                          (2,33,17,2,False),
                          (2,32,18,1,False)]) # yapf: disable
+@skipIfRocm
 def test_batch_config(num_ranks, batch, micro_batch, gas, success):
     @distributed_test(world_size=2)
     def _test_batch_config(num_ranks, batch, micro_batch, gas, success):
@@ -114,6 +115,7 @@ def test_temp_config_json(tmpdir):
     assert 'train_batch_size' in config_json
 
 
+@skipIfRocm
 def test_deprecated_deepscale_config(tmpdir):
     config_dict = {
         "train_batch_size": 1,
@@ -155,6 +157,7 @@ def _test_deprecated_deepscale_config(args, model, hidden_dim):
     _test_deprecated_deepscale_config(args=args, model=model, hidden_dim=hidden_dim)
 
 
+@skipIfRocm
 def test_dist_init_true(tmpdir):
     config_dict = {
         "train_batch_size": 1,
diff --git a/tests/unit/test_cuda_backward.py b/tests/unit/test_cuda_backward.py
index bf0e5955d62c..e2563f41b2ca 100755
--- a/tests/unit/test_cuda_backward.py
+++ b/tests/unit/test_cuda_backward.py
@@ -8,6 +8,7 @@
 import time
 import copy
 from torch import nn
+from common import skipIfRocm
 from modelingpreln import BertEncoder as BertEncoderPreln
 from modeling import BertEncoder as BertEncoderPostln
 from modeling import BertConfig, BertLayerNorm
@@ -257,6 +258,7 @@ def run_backward(ds_config, atol=1e-2, verbose=False):
                              (3,1024,128,16,24,False,False, 0.1),
                              (3,1024,128,16,24,False,True, 0.2),
                          ]) # yapf: disable
+@skipIfRocm
 def test_backward(batch_size,
                   hidden_size,
                   seq_len,
diff --git a/tests/unit/test_cuda_forward.py b/tests/unit/test_cuda_forward.py
index 4e995a34448f..5d79b9b5dacb 100755
--- a/tests/unit/test_cuda_forward.py
+++ b/tests/unit/test_cuda_forward.py
@@ -8,6 +8,7 @@
 import time
 import copy
 from torch import nn
+from common import skipIfRocm
 from modelingpreln import BertEncoder as BertEncoderPreln
 from modeling import BertEncoder as BertEncoderPostln
 from modeling import BertLayerNorm, BertConfig
@@ -226,6 +227,7 @@ def run_forward(ds_config, atol=1e-2, verbose=False, test_bsz=None):
                              (8,2560,128,40,3,False,False),
                              (8,2560,128,40,3,False,True),
                          ]) # yapf: disable
+@skipIfRocm
 def test_forward(batch_size,
                  hidden_size,
                  seq_len,
@@ -261,6 +263,7 @@ def test_forward(batch_size,
                              (8,3,1024,512,16,3,False,False),
                              (8,7,1024,512,16,3,False,True),
                          ]) # yapf: disable
+@skipIfRocm
 def test_forward_with_small_bsz(batch_size,
                                 small_bsz,
                                 hidden_size,
@@ -296,6 +299,7 @@ def test_forward_with_small_bsz(batch_size,
                              (64,1024,128,16,3,False,False),
                              (64,1024,128,16,3,False,True),
                          ]) # yapf: disable
+@skipIfRocm
 def test_forward_stochastic(batch_size,
                             hidden_size,
                             seq_len,
diff --git a/tests/unit/test_dist.py b/tests/unit/test_dist.py
index 04b97031b3e5..61433e1ada93 100644
--- a/tests/unit/test_dist.py
+++ b/tests/unit/test_dist.py
@@ -1,11 +1,11 @@
 import torch
 import torch.distributed as dist
 
-from common import distributed_test
+from common import distributed_test, skipIfRocm
 
 import pytest
 
-
+@skipIfRocm
 @distributed_test(world_size=3)
 def test_init():
     assert dist.is_initialized()
@@ -15,6 +15,7 @@ def test_init():
 
 # Demonstration of pytest's paramaterization
 @pytest.mark.parametrize('number,color', [(1138, 'purple')])
+@skipIfRocm
 def test_dist_args(number, color):
     """Outer test function with inputs from pytest.mark.parametrize(). Uses a distributed
     helper function.
@@ -29,6 +30,7 @@ def _test_dist_args_helper(x, color='red'):
     _test_dist_args_helper(number, color=color)
 
 
+@skipIfRocm
 @distributed_test(world_size=[1, 2, 4])
 def test_dist_allreduce():
     x = torch.ones(1, 3).cuda() * (dist.get_rank() + 1)
diff --git a/tests/unit/test_dynamic_loss_scale.py b/tests/unit/test_dynamic_loss_scale.py
index 7575d6b49454..799571fff8a4 100755
--- a/tests/unit/test_dynamic_loss_scale.py
+++ b/tests/unit/test_dynamic_loss_scale.py
@@ -5,7 +5,7 @@
 import json
 import os
 import numpy as np
-from common import distributed_test
+from common import distributed_test, skipIfRocm
 from simple_model import SimpleModel, args_from_dict
 
 
@@ -17,6 +17,7 @@ def run_model_step(model, gradient_list):
         model.step()
 
 
+@skipIfRocm
 def test_fused_no_overflow(tmpdir):
     config_dict = {
         "train_batch_size": 1,
@@ -61,6 +62,7 @@ def _test_fused_no_overflow(args):
     _test_fused_no_overflow(args)
 
 
+@skipIfRocm
 def test_fused_all_overflow(tmpdir):
     config_dict = {
         "train_batch_size": 1,
@@ -103,6 +105,7 @@ def _test_fused_all_overflow(args):
     _test_fused_all_overflow(args)
 
 
+@skipIfRocm
 def test_fused_some_overflow(tmpdir):
     config_dict = {
         "train_batch_size": 1,
@@ -165,6 +168,7 @@ def _test_fused_some_overflow(args):
     _test_fused_some_overflow(args)
 
 
+@skipIfRocm
 def test_unfused_no_overflow(tmpdir):
     config_dict = {
         "train_batch_size": 1,
@@ -208,6 +212,7 @@ def _test_unfused_no_overflow(args):
     _test_unfused_no_overflow(args)
 
 
+@skipIfRocm
 def test_unfused_all_overflow(tmpdir):
     config_dict = {
         "train_batch_size": 1,
@@ -253,6 +258,7 @@ def _test_unfused_all_overflow(args):
     _test_unfused_all_overflow(args)
 
 
+@skipIfRocm
 def test_unfused_some_overflow(tmpdir):
     config_dict = {
         "train_batch_size": 1,
diff --git a/tests/unit/test_fp16.py b/tests/unit/test_fp16.py
index f5176294a549..7c5e01c58f90 100755
--- a/tests/unit/test_fp16.py
+++ b/tests/unit/test_fp16.py
@@ -5,10 +5,11 @@
 import pytest
 import json
 import os
-from common import distributed_test
+from common import distributed_test, skipIfRocm
 from simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict
 
 
+@skipIfRocm
 def test_lamb_fp32_grad_clip(tmpdir):
     config_dict = {
         "train_batch_size": 2,
@@ -44,6 +45,7 @@ def _test_lamb_fp32_grad_clip(args, model, hidden_dim):
     _test_lamb_fp32_grad_clip(args=args, model=model, hidden_dim=hidden_dim)
 
 
+@skipIfRocm
 def test_lamb_fp16_basic(tmpdir):
     config_dict = {
         "train_batch_size": 2,
@@ -81,6 +83,7 @@ def _test_lamb_fp16_basic(args, model, hidden_dim):
     _test_lamb_fp16_basic(args=args, model=model, hidden_dim=hidden_dim)
 
 
+@skipIfRocm
 def test_lamb_fp16_empty_grad(tmpdir):
     config_dict = {
         "train_batch_size": 2,
@@ -118,6 +121,7 @@ def _test_lamb_fp16_empty_grad(args, model, hidden_dim):
     _test_lamb_fp16_empty_grad(args=args, model=model, hidden_dim=hidden_dim)
 
 
+@skipIfRocm
 def test_adam_fp32_empty_grad(tmpdir):
     config_dict = {
         "train_batch_size": 2,
@@ -156,6 +160,7 @@ def _test_adam_fp32_empty_grad(args, model, hidden_dim):
     _test_adam_fp32_empty_grad(args=args, model=model, hidden_dim=hidden_dim)
 
 
+@skipIfRocm
 def test_adamw_fp16_basic(tmpdir):
     config_dict = {
         "train_batch_size": 1,
@@ -187,6 +192,7 @@ def _test_adamw_fp16_basic(args, model, hidden_dim):
     _test_adamw_fp16_basic(args=args, model=model, hidden_dim=hidden_dim)
 
 
+@skipIfRocm
 def test_adamw_fp16_empty_grad(tmpdir):
     config_dict = {
         "train_batch_size": 1,
@@ -227,6 +233,7 @@ def _test_adamw_fp16_empty_grad(args, model, hidden_dim):
                              (2,
                               True),
                          ])
+@skipIfRocm
 def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offload):
     config_dict = {
         "train_batch_size": 1,
@@ -293,6 +300,7 @@ def _test_adam_fp16_zero_onecycle_compatibility(args, model, hidden_dim):
                              (2,
                               True),
                          ])
+@skipIfRocm
 def test_zero_static_scale(tmpdir, zero_stage, use_cpu_offload):
     config_dict = {
         "train_batch_size": 4,
@@ -339,6 +347,7 @@ def _test_zero_static_scale(args):
     _test_zero_static_scale(args)
 
 
+@skipIfRocm
 def test_zero_static_scale_deprecated_format(tmpdir):
     config_dict = {
         "train_batch_size": 4,
@@ -391,6 +400,7 @@ def _test_zero_static_scale(args):
                              (2,
                               True),
                          ])
+@skipIfRocm
 def test_zero_allow_untested_optimizer(tmpdir, zero_stage, use_cpu_offload):
     config_dict = {
         "train_batch_size": 4,
@@ -429,6 +439,7 @@ def _test_zero_allow_untested_optimizer(args):
                              (2,
                               True),
                          ])
+@skipIfRocm
 def test_zero_empty_partition(tmpdir, zero_stage, use_cpu_offload):
     config_dict = {
         "train_micro_batch_size_per_gpu": 1,
@@ -475,6 +486,7 @@ def _test_zero_empty_partition(args):
     _test_zero_empty_partition(args)
 
 
+@skipIfRocm
 def test_adam_amp_basic(tmpdir):
     config_dict = {"train_batch_size": 1, "steps_per_print": 1, "amp": {"enabled": True}}
     args = args_from_dict(tmpdir, config_dict)
@@ -500,6 +512,7 @@ def _test_adam_amp_basic(args, model, hidden_dim):
     _test_adam_amp_basic(args=args, model=model, hidden_dim=hidden_dim)
 
 
+@skipIfRocm
 def test_lamb_amp_basic(tmpdir):
     config_dict = {
         "train_batch_size": 2,
@@ -537,6 +550,7 @@ def _test_lamb_amp_basic(args, model, hidden_dim):
     _test_lamb_amp_basic(args=args, model=model, hidden_dim=hidden_dim)
 
 
+@skipIfRocm
 def test_adam_amp_o2(tmpdir):
     config_dict = {
         "train_batch_size": 2,
@@ -575,6 +589,7 @@ def _test_adam_amp_o2(args, model, hidden_dim):
     _test_adam_amp_o2(args=args, model=model, hidden_dim=hidden_dim)
 
 
+@skipIfRocm
 def test_adam_amp_o2_empty_grad(tmpdir):
     config_dict = {
         "train_batch_size": 2,
@@ -620,6 +635,7 @@ def _test_adam_amp_o2_empty_grad(args, model, hidden_dim):
                            torch.optim.Adam),
                           (2,
                            apex.optimizers.FusedAdam)])
+@skipIfRocm
 def test_zero_supported_client_optimizer(tmpdir, zero_stage, optimizer_constructor):
     config_dict = {
         "train_batch_size": 2,
@@ -648,6 +664,7 @@ def _test_zero_supported_client_optimizer(args, model, optimizer_constructor):
                                           optimizer_constructor=optimizer_constructor)
 
 
+@skipIfRocm
 def test_zero2_reduce_scatter_off(tmpdir):
     config_dict = {
         "train_batch_size": 2,
diff --git a/tests/unit/test_lr_schedulers.py b/tests/unit/test_lr_schedulers.py
index 0c388627a38f..d15d1b4fdc78 100644
--- a/tests/unit/test_lr_schedulers.py
+++ b/tests/unit/test_lr_schedulers.py
@@ -4,7 +4,7 @@
 import pytest
 import json
 import os
-from common import distributed_test
+from common import distributed_test, skipIfRocm
 from simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict
 
 
@@ -18,6 +18,7 @@
                            }),
                           ("LRRangeTest",
                            {})])
+@skipIfRocm
 def test_get_lr_before_train(tmpdir, scheduler_type, params):
     config_dict = {
         "train_batch_size": 2,
diff --git a/tests/unit/test_multi_output_model.py b/tests/unit/test_multi_output_model.py
index ccbe7f484e29..fbca3250cf4e 100755
--- a/tests/unit/test_multi_output_model.py
+++ b/tests/unit/test_multi_output_model.py
@@ -5,7 +5,7 @@
 from pytest import approx
 import json
 import os
-from common import distributed_test
+from common import distributed_test, skipIfRocm
 from simple_model import args_from_dict
 from multi_output_model import MultiOutputModel, multi_output_dataloader
 
@@ -28,6 +28,7 @@ def create_config_dict(micro_batch_size, grad_accumulation_steps, world_size):
     }
 
 
+@skipIfRocm
 def test_two_output_model(tmpdir):
     gradient_accumulation_steps = 2
     micro_batch_size = 1
@@ -81,6 +82,7 @@ def _test_two_output_model(args, model, hidden_dim):
     _test_two_output_model(args=args, model=model, hidden_dim=hidden_dim)
 
 
+@skipIfRocm
 def test_three_output_model(tmpdir):
     gradient_accumulation_steps = 3
     micro_batch_size = 1
diff --git a/tests/unit/test_partition.py b/tests/unit/test_partition.py
index 7cd264752c6f..4655d470ccee 100644
--- a/tests/unit/test_partition.py
+++ b/tests/unit/test_partition.py
@@ -8,9 +8,10 @@
 from deepspeed.runtime.utils import prefix_sum_inc
 from deepspeed.runtime.utils import PartitionedTensor
 
-from common import distributed_test
+from common import distributed_test, skipIfRocm
 
 
+@skipIfRocm
 @distributed_test(world_size=4)
 def test_partitioned_tensor():
     world = dist.get_world_size()
@@ -32,6 +33,7 @@ def test_partitioned_tensor():
     assert torch.equal(full, reconstructed)
 
 
+@skipIfRocm
 @distributed_test(world_size=4)
 def test_partitioned_tensor_meta():
     world = dist.get_world_size()
diff --git a/tests/unit/test_pipe.py b/tests/unit/test_pipe.py
index 11c76fff926e..507e0d5119f5 100644
--- a/tests/unit/test_pipe.py
+++ b/tests/unit/test_pipe.py
@@ -15,7 +15,7 @@
 import deepspeed.runtime.pipe.module as PipelineModule
 from deepspeed.runtime.pipe.module import LayerSpec
 
-from common import distributed_test
+from common import distributed_test, skipIfRocm
 
 
 def rel_diff(A, B):
@@ -170,6 +170,7 @@ def train_cifar(model, args, num_steps=400, average_dp_losses=True, fp16=True, s
                               PipeTopo(num_pp=4,
                                        num_dp=1)),
                          ])
+@skipIfRocm
 def test_pipe_cifar10_seedlayers(base_topo, test_topo, tmpdir):
     config_dict = {
         "train_batch_size": 16,
diff --git a/tests/unit/test_pipe_module.py b/tests/unit/test_pipe_module.py
index 61f07a196971..382242a506f8 100644
--- a/tests/unit/test_pipe_module.py
+++ b/tests/unit/test_pipe_module.py
@@ -14,7 +14,7 @@
 from deepspeed.pipe import PipelineModule, LayerSpec
 from deepspeed.utils import RepeatingLoader
 
-from common import distributed_test
+from common import distributed_test, skipIfRocm
 from simple_model import args_from_dict
 
 HIDDEN_DIM = 32
@@ -56,6 +56,7 @@ def simple_args(tmpdir):
     return args
 
 
+@skipIfRocm
 def test_pipe_module_sequential(sequential_model, simple_args):
     batch_input = torch.randn(1, HIDDEN_DIM)
 
diff --git a/tests/unit/test_topology.py b/tests/unit/test_topology.py
index 176363688de4..ab6a95f28aab 100644
--- a/tests/unit/test_topology.py
+++ b/tests/unit/test_topology.py
@@ -7,7 +7,7 @@
 from deepspeed.runtime.pipe.topology import ProcessTopology as Topo
 from deepspeed.runtime.pipe.topology import _prime_factors
 
-from common import distributed_test
+from common import distributed_test, skipIfRocm
 
 
 def test_topology_2d():
@@ -157,6 +157,7 @@ def test_topology_comm_list():
     assert topo.get_axis_comm_lists('jeff') == []
 
 
+@skipIfRocm
 @distributed_test(world_size=4)
 def test_grid_pipe_data():
     topo = Topo(axes=['pipe', 'data'], dims=[2, 2])
@@ -183,6 +184,7 @@ def test_grid_pipe_data():
     assert torch.all(rank_tensor == sum(data_group))
 
 
+@skipIfRocm
 @distributed_test(world_size=4)
 def test_stage_to_global():
     topo = Topo(axes=['pipe', 'data'], dims=[2, 2])

From 9f0c80d9ed6f85a54f3068c224f716eaf011c415 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Mon, 19 Oct 2020 20:28:00 +0000
Subject: [PATCH 23/66] Enable CPU adam extension for ROCm

---
 csrc/adam/cpu_adam.cpp             |   8 ++
 csrc/adam/custom_cuda_kernel.cu    |   4 +
 csrc/includes/custom_cuda_layers.h |   4 +
 setup.py                           | 135 +++++++++++++++--------------
 4 files changed, 86 insertions(+), 65 deletions(-)

diff --git a/csrc/adam/cpu_adam.cpp b/csrc/adam/cpu_adam.cpp
index 380bc4ea0ab0..b629ae71d71f 100644
--- a/csrc/adam/cpu_adam.cpp
+++ b/csrc/adam/cpu_adam.cpp
@@ -1,4 +1,8 @@
+#ifdef __HIP_PLATFORM_HCC__
+#include "hip/cpu_adam.h"
+#else
 #include "cpu_adam.h"
+#endif
 #include <cuda_runtime_api.h>
 #include <math.h>
 #include <omp.h>
@@ -10,7 +14,11 @@
 #include "cublas_v2.h"
 #include "cuda.h"
 #include "curand.h"
+#ifdef __HIP_PLATFORM_HCC__
+#include "hip/custom_hip_layers.h"
+#else
 #include "custom_cuda_layers.h"
+#endif
 
 static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;
 
diff --git a/csrc/adam/custom_cuda_kernel.cu b/csrc/adam/custom_cuda_kernel.cu
index 8f8d2c826771..ac3c9fe5929a 100644
--- a/csrc/adam/custom_cuda_kernel.cu
+++ b/csrc/adam/custom_cuda_kernel.cu
@@ -1,6 +1,10 @@
 
 
+#ifdef __HIP_PLATFORM_HCC__
+#include "hip/custom_hip_layers.h"
+#else
 #include "custom_cuda_layers.h"
+#endif
 
 __global__ void param_update_kernel(const float* input, __half* output, int size)
 {
diff --git a/csrc/includes/custom_cuda_layers.h b/csrc/includes/custom_cuda_layers.h
index 2e72a35292c6..4f8c6b0c36d3 100644
--- a/csrc/includes/custom_cuda_layers.h
+++ b/csrc/includes/custom_cuda_layers.h
@@ -5,7 +5,11 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+#ifdef __HIP_PLATFORM_HCC__
+#include <hip/hip_cooperative_groups.h>
+#else
 #include <cooperative_groups.h>
+#endif
 #include <curand_kernel.h>
 
 #include "context.h"
diff --git a/setup.py b/setup.py
index 451e63333083..aeb616d11074 100755
--- a/setup.py
+++ b/setup.py
@@ -157,14 +157,33 @@ def fetch_requirements(path):
 
 ## Adam ##
 if BUILD_MASK & DS_BUILD_CPU_ADAM:
+    nvcc_flags= ['-O3'] + version_dependent_macros
+    include_dirs=['csrc/includes']
+    if is_rocm_pytorch:
+        sources = ['csrc/adam/hip/cpu_adam.cpp', 'csrc/adam/hip/custom_hip_kernel.hip']
+        include_dirs.extend(['/opt/rocm/include/rocrand', '/opt/rocm/include/hiprand'])
+        nvcc_flags.extend(['-U__HIP_NO_HALF_OPERATORS__',
+                           '-U__HIP_NO_HALF_CONVERSIONS__',
+                           '-U__HIP_NO_HALF2_OPERATORS__'
+        ])
+    else:
+        sources=['csrc/adam/cpu_adam.cpp','csrc/adam/custom_cuda_kernel.cu']
+        include_dirs.extend(['/usr/local/cuda/include'])
+        nvcc_flags.extend(['--use_fast_math',
+                           '-gencode',
+                           'arch=compute_61,code=compute_61',
+                           '-gencode',
+                           'arch=compute_70,code=compute_70',
+                           '-std=c++14',
+                           '-U__CUDA_NO_HALF_OPERATORS__',
+                           '-U__CUDA_NO_HALF_CONVERSIONS__',
+                           '-U__CUDA_NO_HALF2_OPERATORS__'
+        ])
+
     ext_modules.append(
         CUDAExtension(name='deepspeed.ops.adam.cpu_adam_op',
-                      sources=[
-                          'csrc/adam/cpu_adam.cpp',
-                          'csrc/adam/custom_cuda_kernel.cu',
-                      ],
-                      include_dirs=['csrc/includes',
-                                    '/usr/local/cuda/include'],
+                      sources=sources,
+                      include_dirs=include_dirs,
                       extra_compile_args={
                           'cxx': [
                               '-O3',
@@ -178,84 +197,70 @@ def fetch_requirements(path):
                               '-fopenmp',
                               SIMD_WIDTH
                           ],
-                          'nvcc': [
-                              '-O3',
-                              '--use_fast_math',
-                              '-gencode',
-                              'arch=compute_61,code=compute_61',
-                              '-gencode',
-                              'arch=compute_70,code=compute_70',
-                              '-std=c++14',
-                              '-U__CUDA_NO_HALF_OPERATORS__',
-                              '-U__CUDA_NO_HALF_CONVERSIONS__',
-                              '-U__CUDA_NO_HALF2_OPERATORS__'
-                          ]
+                          'nvcc': nvcc_flags
                       }))
 
 ## Transformer ##
 if BUILD_MASK & DS_BUILD_TRANSFORMER:
+    nvcc_flags= ['-O3', '-std=c++14'] + version_dependent_macros
+    include_dirs=['csrc/includes']
+    if is_rocm_pytorch:
+        sources = [
+            'csrc/transformer/hip/ds_transformer_hip.cpp',
+            'csrc/transformer/hip/cublas_wrappers.hip',
+            'csrc/transformer/hip/transform_kernels.hip',
+            'csrc/transformer/hip/gelu_kernels.hip',
+            'csrc/transformer/hip/dropout_kernels.hip',
+#            'csrc/transformer/hip/normalize_kernels.hip',
+            'csrc/transformer/hip/softmax_kernels.hip',
+            'csrc/transformer/hip/general_kernels.hip'
+        ]
+        include_dirs.extend(['/opt/rocm/include/rocrand', '/opt/rocm/include/hiprand'])
+        nvcc_flags.extend(['-U__HIP_NO_HALF_OPERATORS__',
+                           '-U__HIP_NO_HALF_CONVERSIONS__',
+                           '-U__HIP_NO_HALF2_OPERATORS__'
+        ])
+    else:
+        sources=[
+            'csrc/transformer/ds_transformer_cuda.cpp',
+            'csrc/transformer/cublas_wrappers.cu',
+            'csrc/transformer/transform_kernels.cu',
+            'csrc/transformer/gelu_kernels.cu',
+            'csrc/transformer/dropout_kernels.cu',
+            'csrc/transformer/normalize_kernels.cu',
+            'csrc/transformer/softmax_kernels.cu',
+            'csrc/transformer/general_kernels.cu'
+        ]
+        nvcc_flags.extend(['--use_fast_math',
+                           '-gencode',
+                           'arch=compute_61,code=compute_61',
+                           '-gencode',
+                           'arch=compute_70,code=compute_70',
+                           '-U__CUDA_NO_HALF_OPERATORS__',
+                           '-U__CUDA_NO_HALF_CONVERSIONS__',
+                           '-U__CUDA_NO_HALF2_OPERATORS__'
+        ])
     ext_modules.append(
         CUDAExtension(name='deepspeed.ops.transformer.transformer_cuda',
-                      sources=[
-                          'csrc/transformer/ds_transformer_cuda.cpp',
-                          'csrc/transformer/cublas_wrappers.cu',
-                          'csrc/transformer/transform_kernels.cu',
-                          'csrc/transformer/gelu_kernels.cu',
-                          'csrc/transformer/dropout_kernels.cu',
-                          'csrc/transformer/normalize_kernels.cu',
-                          'csrc/transformer/softmax_kernels.cu',
-                          'csrc/transformer/general_kernels.cu'
-                      ],
-                      include_dirs=['csrc/includes'],
+                      sources=sources,
+                      include_dirs=include_dirs,
                       extra_compile_args={
                           'cxx': ['-O3',
                                   '-std=c++14',
                                   '-g',
                                   '-Wno-reorder'],
-                          'nvcc': [
-                              '-O3',
-                              '--use_fast_math',
-                              '-gencode',
-                              'arch=compute_61,code=compute_61',
-                              '-gencode',
-                              'arch=compute_70,code=compute_70',
-                              '-std=c++14',
-                              '-U__CUDA_NO_HALF_OPERATORS__',
-                              '-U__CUDA_NO_HALF_CONVERSIONS__',
-                              '-U__CUDA_NO_HALF2_OPERATORS__'
-                          ]
+                          'nvcc': nvcc_flags
                       }))
     ext_modules.append(
         CUDAExtension(name='deepspeed.ops.transformer.stochastic_transformer_cuda',
-                      sources=[
-                          'csrc/transformer/ds_transformer_cuda.cpp',
-                          'csrc/transformer/cublas_wrappers.cu',
-                          'csrc/transformer/transform_kernels.cu',
-                          'csrc/transformer/gelu_kernels.cu',
-                          'csrc/transformer/dropout_kernels.cu',
-                          'csrc/transformer/normalize_kernels.cu',
-                          'csrc/transformer/softmax_kernels.cu',
-                          'csrc/transformer/general_kernels.cu'
-                      ],
-                      include_dirs=['csrc/includes'],
+                      sources=sources,
+                      include_dirs=include_dirs,
                       extra_compile_args={
                           'cxx': ['-O3',
                                   '-std=c++14',
                                   '-g',
                                   '-Wno-reorder'],
-                          'nvcc': [
-                              '-O3',
-                              '--use_fast_math',
-                              '-gencode',
-                              'arch=compute_61,code=compute_61',
-                              '-gencode',
-                              'arch=compute_70,code=compute_70',
-                              '-std=c++14',
-                              '-U__CUDA_NO_HALF_OPERATORS__',
-                              '-U__CUDA_NO_HALF_CONVERSIONS__',
-                              '-U__CUDA_NO_HALF2_OPERATORS__',
-                              '-D__STOCHASTIC_MODE__'
-                          ]
+                          'nvcc': nvcc_flags + ['-D__STOCHASTIC_MODE__']
                       }))
 
 

From cb3f83a538e543ace1c71bb9d4ceab7690a78a39 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Tue, 27 Oct 2020 03:52:20 +0000
Subject: [PATCH 24/66] Install requirements as appropriate for ROCm

---
 install.sh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/install.sh b/install.sh
index d8a28a757f7f..fd70e28663df 100755
--- a/install.sh
+++ b/install.sh
@@ -166,7 +166,11 @@ fi
 
 if [ "$skip_requirements" == "0" ]; then
    # Ensure dependencies are installed locally
-   $PIP_SUDO $PIP_INSTALL -r requirements/requirements.txt
+   if [ -e "/opt/rocm" ]; then
+       $PIP_SUDO $PIP_INSTALL -r requirements/requirements-rocm.txt
+   else
+       $PIP_SUDO $PIP_INSTALL -r requirements/requirements.txt
+   fi
 fi
 
 # Build wheels

From 617027fa83c9e73e21bdef78c97ccb843bbcc561 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Wed, 28 Oct 2020 16:49:20 +0000
Subject: [PATCH 25/66] Skip additional unit tests that fail on CI (but not
 locally)

---
 tests/unit/test_checkpointing.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/unit/test_checkpointing.py b/tests/unit/test_checkpointing.py
index ab28a7d5da29..2a721a316839 100755
--- a/tests/unit/test_checkpointing.py
+++ b/tests/unit/test_checkpointing.py
@@ -540,6 +540,7 @@ def _test_checkpoint_fp32_optimizer(args, model, hidden_dim):
 
 
 @pytest.mark.parametrize("zero_stage", [0, 1])
+@skipIfRocm
 def test_checkpoint_pipe_engine(zero_stage, tmpdir, stages=2):
     config_dict = {
         "train_batch_size": 2,
@@ -606,6 +607,7 @@ def _test(save_folder, num_stages):
                               PipeTopo(num_pp=2,
                                        num_dp=2)),
                          ])
+@skipIfRocm
 def test_checkpoint_pipe_module(base_topo, test_topo, tmpdir):
     @distributed_test(world_size=4)
     def _test(base_topo, test_topo, save_folder):

From a508e6229ec750553ad0f5dfbb9320f97786b347 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Tue, 3 Nov 2020 05:00:02 +0000
Subject: [PATCH 26/66] Do not skip unit tests which pass with latest PyTorch

---
 tests/unit/test_config.py             | 2 --
 tests/unit/test_dist.py               | 3 ---
 tests/unit/test_dynamic_loss_scale.py | 6 ------
 tests/unit/test_fp16.py               | 6 ------
 tests/unit/test_lr_schedulers.py      | 1 -
 tests/unit/test_multi_output_model.py | 2 --
 tests/unit/test_partition.py          | 2 --
 tests/unit/test_topology.py           | 1 -
 8 files changed, 23 deletions(-)

diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py
index baf27c165ba0..291caa8895b8 100755
--- a/tests/unit/test_config.py
+++ b/tests/unit/test_config.py
@@ -115,7 +115,6 @@ def test_temp_config_json(tmpdir):
     assert 'train_batch_size' in config_json
 
 
-@skipIfRocm
 def test_deprecated_deepscale_config(tmpdir):
     config_dict = {
         "train_batch_size": 1,
@@ -157,7 +156,6 @@ def _test_deprecated_deepscale_config(args, model, hidden_dim):
     _test_deprecated_deepscale_config(args=args, model=model, hidden_dim=hidden_dim)
 
 
-@skipIfRocm
 def test_dist_init_true(tmpdir):
     config_dict = {
         "train_batch_size": 1,
diff --git a/tests/unit/test_dist.py b/tests/unit/test_dist.py
index 61433e1ada93..b3aaf9baa4af 100644
--- a/tests/unit/test_dist.py
+++ b/tests/unit/test_dist.py
@@ -5,7 +5,6 @@
 
 import pytest
 
-@skipIfRocm
 @distributed_test(world_size=3)
 def test_init():
     assert dist.is_initialized()
@@ -15,7 +14,6 @@ def test_init():
 
 # Demonstration of pytest's paramaterization
 @pytest.mark.parametrize('number,color', [(1138, 'purple')])
-@skipIfRocm
 def test_dist_args(number, color):
     """Outer test function with inputs from pytest.mark.parametrize(). Uses a distributed
     helper function.
@@ -30,7 +28,6 @@ def _test_dist_args_helper(x, color='red'):
     _test_dist_args_helper(number, color=color)
 
 
-@skipIfRocm
 @distributed_test(world_size=[1, 2, 4])
 def test_dist_allreduce():
     x = torch.ones(1, 3).cuda() * (dist.get_rank() + 1)
diff --git a/tests/unit/test_dynamic_loss_scale.py b/tests/unit/test_dynamic_loss_scale.py
index 799571fff8a4..0fffea0e3eed 100755
--- a/tests/unit/test_dynamic_loss_scale.py
+++ b/tests/unit/test_dynamic_loss_scale.py
@@ -17,7 +17,6 @@ def run_model_step(model, gradient_list):
         model.step()
 
 
-@skipIfRocm
 def test_fused_no_overflow(tmpdir):
     config_dict = {
         "train_batch_size": 1,
@@ -62,7 +61,6 @@ def _test_fused_no_overflow(args):
     _test_fused_no_overflow(args)
 
 
-@skipIfRocm
 def test_fused_all_overflow(tmpdir):
     config_dict = {
         "train_batch_size": 1,
@@ -105,7 +103,6 @@ def _test_fused_all_overflow(args):
     _test_fused_all_overflow(args)
 
 
-@skipIfRocm
 def test_fused_some_overflow(tmpdir):
     config_dict = {
         "train_batch_size": 1,
@@ -168,7 +165,6 @@ def _test_fused_some_overflow(args):
     _test_fused_some_overflow(args)
 
 
-@skipIfRocm
 def test_unfused_no_overflow(tmpdir):
     config_dict = {
         "train_batch_size": 1,
@@ -212,7 +208,6 @@ def _test_unfused_no_overflow(args):
     _test_unfused_no_overflow(args)
 
 
-@skipIfRocm
 def test_unfused_all_overflow(tmpdir):
     config_dict = {
         "train_batch_size": 1,
@@ -258,7 +253,6 @@ def _test_unfused_all_overflow(args):
     _test_unfused_all_overflow(args)
 
 
-@skipIfRocm
 def test_unfused_some_overflow(tmpdir):
     config_dict = {
         "train_batch_size": 1,
diff --git a/tests/unit/test_fp16.py b/tests/unit/test_fp16.py
index 7c5e01c58f90..6d564eb1535e 100755
--- a/tests/unit/test_fp16.py
+++ b/tests/unit/test_fp16.py
@@ -160,7 +160,6 @@ def _test_adam_fp32_empty_grad(args, model, hidden_dim):
     _test_adam_fp32_empty_grad(args=args, model=model, hidden_dim=hidden_dim)
 
 
-@skipIfRocm
 def test_adamw_fp16_basic(tmpdir):
     config_dict = {
         "train_batch_size": 1,
@@ -192,7 +191,6 @@ def _test_adamw_fp16_basic(args, model, hidden_dim):
     _test_adamw_fp16_basic(args=args, model=model, hidden_dim=hidden_dim)
 
 
-@skipIfRocm
 def test_adamw_fp16_empty_grad(tmpdir):
     config_dict = {
         "train_batch_size": 1,
@@ -233,7 +231,6 @@ def _test_adamw_fp16_empty_grad(args, model, hidden_dim):
                              (2,
                               True),
                          ])
-@skipIfRocm
 def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offload):
     config_dict = {
         "train_batch_size": 1,
@@ -400,7 +397,6 @@ def _test_zero_static_scale(args):
                              (2,
                               True),
                          ])
-@skipIfRocm
 def test_zero_allow_untested_optimizer(tmpdir, zero_stage, use_cpu_offload):
     config_dict = {
         "train_batch_size": 4,
@@ -486,7 +482,6 @@ def _test_zero_empty_partition(args):
     _test_zero_empty_partition(args)
 
 
-@skipIfRocm
 def test_adam_amp_basic(tmpdir):
     config_dict = {"train_batch_size": 1, "steps_per_print": 1, "amp": {"enabled": True}}
     args = args_from_dict(tmpdir, config_dict)
@@ -635,7 +630,6 @@ def _test_adam_amp_o2_empty_grad(args, model, hidden_dim):
                            torch.optim.Adam),
                           (2,
                            apex.optimizers.FusedAdam)])
-@skipIfRocm
 def test_zero_supported_client_optimizer(tmpdir, zero_stage, optimizer_constructor):
     config_dict = {
         "train_batch_size": 2,
diff --git a/tests/unit/test_lr_schedulers.py b/tests/unit/test_lr_schedulers.py
index d15d1b4fdc78..5498481bfeb5 100644
--- a/tests/unit/test_lr_schedulers.py
+++ b/tests/unit/test_lr_schedulers.py
@@ -18,7 +18,6 @@
                            }),
                           ("LRRangeTest",
                            {})])
-@skipIfRocm
 def test_get_lr_before_train(tmpdir, scheduler_type, params):
     config_dict = {
         "train_batch_size": 2,
diff --git a/tests/unit/test_multi_output_model.py b/tests/unit/test_multi_output_model.py
index fbca3250cf4e..1c8b8b39d779 100755
--- a/tests/unit/test_multi_output_model.py
+++ b/tests/unit/test_multi_output_model.py
@@ -28,7 +28,6 @@ def create_config_dict(micro_batch_size, grad_accumulation_steps, world_size):
     }
 
 
-@skipIfRocm
 def test_two_output_model(tmpdir):
     gradient_accumulation_steps = 2
     micro_batch_size = 1
@@ -82,7 +81,6 @@ def _test_two_output_model(args, model, hidden_dim):
     _test_two_output_model(args=args, model=model, hidden_dim=hidden_dim)
 
 
-@skipIfRocm
 def test_three_output_model(tmpdir):
     gradient_accumulation_steps = 3
     micro_batch_size = 1
diff --git a/tests/unit/test_partition.py b/tests/unit/test_partition.py
index 4655d470ccee..8919450f4153 100644
--- a/tests/unit/test_partition.py
+++ b/tests/unit/test_partition.py
@@ -11,7 +11,6 @@
 from common import distributed_test, skipIfRocm
 
 
-@skipIfRocm
 @distributed_test(world_size=4)
 def test_partitioned_tensor():
     world = dist.get_world_size()
@@ -33,7 +32,6 @@ def test_partitioned_tensor():
     assert torch.equal(full, reconstructed)
 
 
-@skipIfRocm
 @distributed_test(world_size=4)
 def test_partitioned_tensor_meta():
     world = dist.get_world_size()
diff --git a/tests/unit/test_topology.py b/tests/unit/test_topology.py
index ab6a95f28aab..5dc6d2444b85 100644
--- a/tests/unit/test_topology.py
+++ b/tests/unit/test_topology.py
@@ -184,7 +184,6 @@ def test_grid_pipe_data():
     assert torch.all(rank_tensor == sum(data_group))
 
 
-@skipIfRocm
 @distributed_test(world_size=4)
 def test_stage_to_global():
     topo = Topo(axes=['pipe', 'data'], dims=[2, 2])

From 3dd5e2d95350886224697ff057f0a0a8edee574c Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Tue, 3 Nov 2020 06:41:39 +0000
Subject: [PATCH 27/66] Modify include files to build CPU Adam extension

---
 csrc/includes/cublas_wrappers.h | 18 +++++++++++++++++
 csrc/includes/gemm_test.h       | 36 +++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+)

diff --git a/csrc/includes/cublas_wrappers.h b/csrc/includes/cublas_wrappers.h
index 19d726c3bcd3..9bb6cc30f6ae 100644
--- a/csrc/includes/cublas_wrappers.h
+++ b/csrc/includes/cublas_wrappers.h
@@ -5,7 +5,9 @@
 #include <cuda.h>
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
+#ifndef __HIP_PLATFORM_HCC__
 #include <mma.h>
+#endif
 #include <stdio.h>
 
 int cublas_gemm_ex(cublasHandle_t handle,
@@ -19,7 +21,11 @@ int cublas_gemm_ex(cublasHandle_t handle,
                    const float* A,
                    const float* B,
                    float* C,
+#ifdef __HIP_PLATFORM_HCC__
+                   rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
+#else
                    cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
+#endif
 
 int cublas_gemm_ex(cublasHandle_t handle,
                    cublasOperation_t transa,
@@ -32,7 +38,11 @@ int cublas_gemm_ex(cublasHandle_t handle,
                    const __half* A,
                    const __half* B,
                    __half* C,
+#ifdef __HIP_PLATFORM_HCC__
+                   rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
+#else
                    cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
+#endif
 
 int cublas_strided_batched_gemm(cublasHandle_t handle,
                                 int m,
@@ -49,7 +59,11 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
                                 int stride_B,
                                 int stride_C,
                                 int batch,
+#ifdef __HIP_PLATFORM_HCC__
+                                rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
+#else
                                 cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
+#endif
 
 int cublas_strided_batched_gemm(cublasHandle_t handle,
                                 int m,
@@ -66,4 +80,8 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
                                 int stride_B,
                                 int stride_C,
                                 int batch,
+#ifdef __HIP_PLATFORM_HCC__
+                                rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
+#else
                                 cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
+#endif
diff --git a/csrc/includes/gemm_test.h b/csrc/includes/gemm_test.h
index ff06f884351c..05f7b6f62586 100644
--- a/csrc/includes/gemm_test.h
+++ b/csrc/includes/gemm_test.h
@@ -2,7 +2,9 @@
 #pragma once
 
 #include <cuda_fp16.h>
+#ifndef __HIP_PLATFORM_HCC__
 #include <cuda_profiler_api.h>
+#endif
 #include <array>
 #include <cstdio>
 #include <cstdlib>
@@ -58,7 +60,11 @@ class GemmTest {
                            B,
                            A,
                            C,
+#ifdef __HIP_PLATFORM_HCC__
+                           static_cast<rocblas_gemm_algo>(algo));
+#else
                            static_cast<cublasGemmAlgo_t>(algo));
+#endif
         });
 
         int algo_bw1 = Run(loops, [=](int algo) {
@@ -73,7 +79,11 @@ class GemmTest {
                            A,
                            C,
                            B,
+#ifdef __HIP_PLATFORM_HCC__
+                           static_cast<rocblas_gemm_algo>(algo));
+#else
                            static_cast<cublasGemmAlgo_t>(algo));
+#endif
         });
 
         int algo_bw2 = Run(loops, [=](int algo) {
@@ -88,7 +98,11 @@ class GemmTest {
                            B,
                            C,
                            A,
+#ifdef __HIP_PLATFORM_HCC__
+                           static_cast<rocblas_gemm_algo>(algo));
+#else
                            static_cast<cublasGemmAlgo_t>(algo));
+#endif
         });
 
         return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
@@ -100,8 +114,13 @@ class GemmTest {
         float fast_latency = std::numeric_limits<float>::max();
         int fast_algo = 0;
 
+#ifdef __HIP_PLATFORM_HCC__
+        for (int algo = (int)rocblas_gemm_algo_standard;
+             algo <= (int)rocblas_gemm_algo_standard;
+#else
         for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
              algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
+#endif
              algo++) {
             int warm_up = 5;
             for (int i = 0; i < warm_up; ++i) f(algo);
@@ -186,7 +205,11 @@ class StridedGemmTest {
                                         stride_b,
                                         stride_c,
                                         bsz,
+#ifdef __HIP_PLATFORM_HCC__
+                                        static_cast<rocblas_gemm_algo>(algo));
+#else
                                         static_cast<cublasGemmAlgo_t>(algo));
+#endif
         });
 
         int algo_bw1 = Run(loops, [=](int algo) {
@@ -216,7 +239,11 @@ class StridedGemmTest {
                                         stride_b,
                                         stride_c,
                                         bsz,
+#ifdef __HIP_PLATFORM_HCC__
+                                        static_cast<rocblas_gemm_algo>(algo));
+#else
                                         static_cast<cublasGemmAlgo_t>(algo));
+#endif
         });
 
         int algo_bw2 = Run(loops, [=](int algo) {
@@ -243,7 +270,11 @@ class StridedGemmTest {
                                         stride_b,
                                         stride_c,
                                         bsz,
+#ifdef __HIP_PLATFORM_HCC__
+                                        static_cast<rocblas_gemm_algo>(algo));
+#else
                                         static_cast<cublasGemmAlgo_t>(algo));
+#endif
         });
 
         return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
@@ -255,8 +286,13 @@ class StridedGemmTest {
         float fast_latency = std::numeric_limits<float>::max();
         int fast_algo = 0;
 
+#ifdef __HIP_PLATFORM_HCC__
+        for (int algo = (int)rocblas_gemm_algo_standard;
+             algo <= (int)rocblas_gemm_algo_standard;
+#else
         for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
              algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
+#endif
              algo++) {
             int warm_up = 5;
             for (int i = 0; i < warm_up; ++i) f(algo);

From 77cd5c398a2dd46a4bc85eae21b3db1b0d4af589 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Wed, 16 Dec 2020 17:47:19 +0000
Subject: [PATCH 28/66] Update setup.py for latest hipify

---
 setup.py | 66 +++++++++++++++++++++++++-------------------------------
 1 file changed, 29 insertions(+), 37 deletions(-)

diff --git a/setup.py b/setup.py
index aeb616d11074..c012f09dafb6 100755
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,6 @@
 import cpufeature
 from setuptools import setup, find_packages
 from torch.utils.cpp_extension import CUDAExtension, BuildExtension, CppExtension
-from torch.utils.hipify import hipify_python
 
 VERSION = "0.3.0"
 
@@ -127,26 +126,20 @@ def fetch_requirements(path):
     SIMD_WIDTH = '-D__AVX256__'
 print("SIMD_WIDTH = ", SIMD_WIDTH)
 
-if is_rocm_pytorch:
-    import shutil
-    this_dir = os.path.dirname(os.path.abspath(__file__))
-    hipify_python.hipify(project_directory=this_dir, output_directory=this_dir, includes="csrc/*",
-                                show_detailed=True, is_pytorch_extension=True)
-
 ext_modules = []
 
 ## Lamb ##
 if BUILD_MASK & DS_BUILD_LAMB:
     nvcc_flags=['-O3'] + version_dependent_macros
-    if is_rocm_pytorch:
-        sources = ['csrc/lamb/hip/fused_lamb_hip.cpp', 'csrc/lamb/hip/fused_lamb_hip_kernel.hip']
-    else:
-        sources = ['csrc/lamb/fused_lamb_cuda.cpp', 'csrc/lamb/fused_lamb_cuda_kernel.cu']
+    if not is_rocm_pytorch:
         nvcc_flags.extend(['--use_fast_math'])
 
     ext_modules.append(
         CUDAExtension(name='deepspeed.ops.lamb.fused_lamb_cuda',
-                      sources=sources,
+                      sources=[
+                          'csrc/lamb/fused_lamb_cuda.cpp',
+                          'csrc/lamb/fused_lamb_cuda_kernel.cu'
+                      ],
                       include_dirs=['csrc/includes'],
                       extra_compile_args={
                           'cxx': [
@@ -160,14 +153,12 @@ def fetch_requirements(path):
     nvcc_flags= ['-O3'] + version_dependent_macros
     include_dirs=['csrc/includes']
     if is_rocm_pytorch:
-        sources = ['csrc/adam/hip/cpu_adam.cpp', 'csrc/adam/hip/custom_hip_kernel.hip']
         include_dirs.extend(['/opt/rocm/include/rocrand', '/opt/rocm/include/hiprand'])
         nvcc_flags.extend(['-U__HIP_NO_HALF_OPERATORS__',
                            '-U__HIP_NO_HALF_CONVERSIONS__',
                            '-U__HIP_NO_HALF2_OPERATORS__'
         ])
     else:
-        sources=['csrc/adam/cpu_adam.cpp','csrc/adam/custom_cuda_kernel.cu']
         include_dirs.extend(['/usr/local/cuda/include'])
         nvcc_flags.extend(['--use_fast_math',
                            '-gencode',
@@ -182,7 +173,10 @@ def fetch_requirements(path):
 
     ext_modules.append(
         CUDAExtension(name='deepspeed.ops.adam.cpu_adam_op',
-                      sources=sources,
+                      sources=[
+                          'csrc/adam/cpu_adam.cpp',
+                          'csrc/adam/custom_cuda_kernel.cu',
+                      ],
                       include_dirs=include_dirs,
                       extra_compile_args={
                           'cxx': [
@@ -205,32 +199,12 @@ def fetch_requirements(path):
     nvcc_flags= ['-O3', '-std=c++14'] + version_dependent_macros
     include_dirs=['csrc/includes']
     if is_rocm_pytorch:
-        sources = [
-            'csrc/transformer/hip/ds_transformer_hip.cpp',
-            'csrc/transformer/hip/cublas_wrappers.hip',
-            'csrc/transformer/hip/transform_kernels.hip',
-            'csrc/transformer/hip/gelu_kernels.hip',
-            'csrc/transformer/hip/dropout_kernels.hip',
-#            'csrc/transformer/hip/normalize_kernels.hip',
-            'csrc/transformer/hip/softmax_kernels.hip',
-            'csrc/transformer/hip/general_kernels.hip'
-        ]
         include_dirs.extend(['/opt/rocm/include/rocrand', '/opt/rocm/include/hiprand'])
         nvcc_flags.extend(['-U__HIP_NO_HALF_OPERATORS__',
                            '-U__HIP_NO_HALF_CONVERSIONS__',
                            '-U__HIP_NO_HALF2_OPERATORS__'
         ])
     else:
-        sources=[
-            'csrc/transformer/ds_transformer_cuda.cpp',
-            'csrc/transformer/cublas_wrappers.cu',
-            'csrc/transformer/transform_kernels.cu',
-            'csrc/transformer/gelu_kernels.cu',
-            'csrc/transformer/dropout_kernels.cu',
-            'csrc/transformer/normalize_kernels.cu',
-            'csrc/transformer/softmax_kernels.cu',
-            'csrc/transformer/general_kernels.cu'
-        ]
         nvcc_flags.extend(['--use_fast_math',
                            '-gencode',
                            'arch=compute_61,code=compute_61',
@@ -242,7 +216,16 @@ def fetch_requirements(path):
         ])
     ext_modules.append(
         CUDAExtension(name='deepspeed.ops.transformer.transformer_cuda',
-                      sources=sources,
+                      sources=[
+                          'csrc/transformer/ds_transformer_cuda.cpp',
+                          'csrc/transformer/cublas_wrappers.cu',
+                          'csrc/transformer/transform_kernels.cu',
+                          'csrc/transformer/gelu_kernels.cu',
+                          'csrc/transformer/dropout_kernels.cu',
+                          'csrc/transformer/normalize_kernels.cu',
+                          'csrc/transformer/softmax_kernels.cu',
+                          'csrc/transformer/general_kernels.cu'
+                      ],
                       include_dirs=include_dirs,
                       extra_compile_args={
                           'cxx': ['-O3',
@@ -253,7 +236,16 @@ def fetch_requirements(path):
                       }))
     ext_modules.append(
         CUDAExtension(name='deepspeed.ops.transformer.stochastic_transformer_cuda',
-                      sources=sources,
+                      sources=[
+                          'csrc/transformer/ds_transformer_cuda.cpp',
+                          'csrc/transformer/cublas_wrappers.cu',
+                          'csrc/transformer/transform_kernels.cu',
+                          'csrc/transformer/gelu_kernels.cu',
+                          'csrc/transformer/dropout_kernels.cu',
+                          'csrc/transformer/normalize_kernels.cu',
+                          'csrc/transformer/softmax_kernels.cu',
+                          'csrc/transformer/general_kernels.cu'
+                      ],
                       include_dirs=include_dirs,
                       extra_compile_args={
                           'cxx': ['-O3',

From 7f9bbeb39955be7ab446f209c546f2faf9be9acc Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Wed, 16 Dec 2020 19:08:55 +0000
Subject: [PATCH 29/66] Update CPU Adam header files to remove ifdefing
 unnecessary with latest hipify

---
 csrc/adam/cpu_adam.cpp          | 8 --------
 csrc/adam/custom_cuda_kernel.cu | 4 ----
 2 files changed, 12 deletions(-)

diff --git a/csrc/adam/cpu_adam.cpp b/csrc/adam/cpu_adam.cpp
index b629ae71d71f..380bc4ea0ab0 100644
--- a/csrc/adam/cpu_adam.cpp
+++ b/csrc/adam/cpu_adam.cpp
@@ -1,8 +1,4 @@
-#ifdef __HIP_PLATFORM_HCC__
-#include "hip/cpu_adam.h"
-#else
 #include "cpu_adam.h"
-#endif
 #include <cuda_runtime_api.h>
 #include <math.h>
 #include <omp.h>
@@ -14,11 +10,7 @@
 #include "cublas_v2.h"
 #include "cuda.h"
 #include "curand.h"
-#ifdef __HIP_PLATFORM_HCC__
-#include "hip/custom_hip_layers.h"
-#else
 #include "custom_cuda_layers.h"
-#endif
 
 static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;
 
diff --git a/csrc/adam/custom_cuda_kernel.cu b/csrc/adam/custom_cuda_kernel.cu
index ac3c9fe5929a..8f8d2c826771 100644
--- a/csrc/adam/custom_cuda_kernel.cu
+++ b/csrc/adam/custom_cuda_kernel.cu
@@ -1,10 +1,6 @@
 
 
-#ifdef __HIP_PLATFORM_HCC__
-#include "hip/custom_hip_layers.h"
-#else
 #include "custom_cuda_layers.h"
-#endif
 
 __global__ void param_update_kernel(const float* input, __half* output, int size)
 {

From ea7100504414f7b28b33bfe0ac2c6929de61d91b Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Wed, 23 Dec 2020 23:31:52 +0000
Subject: [PATCH 30/66] Hipified transformer kernel extensions

---
 csrc/includes/feed_forward.h             |  12 ++
 csrc/includes/general_kernels.h          |   4 +
 csrc/includes/strided_batch_gemm.h       |  16 ++
 csrc/transformer/cublas_wrappers.cu      | 202 +++++++++++++++++++++++
 csrc/transformer/ds_transformer_cuda.cpp |   2 +
 5 files changed, 236 insertions(+)

diff --git a/csrc/includes/feed_forward.h b/csrc/includes/feed_forward.h
index 7b7379d9b998..3a59d56ee6cd 100644
--- a/csrc/includes/feed_forward.h
+++ b/csrc/includes/feed_forward.h
@@ -43,7 +43,11 @@ class FeedForward {
                        weights,
                        input_ptr,
                        out,
+#ifdef __HIP_PLATFORM_HCC__
+                       rocblas_gemm_algo(config_.gemm_algos[0]));
+#else
                        cublasGemmAlgo_t(config_.gemm_algos[0]));
+#endif
     }
     void Backward(int bsz,
                   const T* out_grad,
@@ -68,7 +72,11 @@ class FeedForward {
                        input_ptr,
                        out_grad,
                        weights_grad,
+#ifdef __HIP_PLATFORM_HCC__
+                       rocblas_gemm_algo(config_.gemm_algos[1]));
+#else
                        cublasGemmAlgo_t(config_.gemm_algos[1]));
+#endif
 
         cublas_gemm_ex(_cublasHandle,
                        CUBLAS_OP_N,
@@ -81,7 +89,11 @@ class FeedForward {
                        weights,
                        out_grad,
                        inp_grad_out,
+#ifdef __HIP_PLATFORM_HCC__
+                       rocblas_gemm_algo(config_.gemm_algos[2]));
+#else
                        cublasGemmAlgo_t(config_.gemm_algos[2]));
+#endif
 
         launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz, config_.outputSize, stream);
     }
diff --git a/csrc/includes/general_kernels.h b/csrc/includes/general_kernels.h
index 588cf2aaa048..62416f0124dc 100644
--- a/csrc/includes/general_kernels.h
+++ b/csrc/includes/general_kernels.h
@@ -3,7 +3,11 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+#ifdef __HIP_PLATFORM_HCC__
+#include <hip/hip_cooperative_groups.h>
+#else
 #include <cooperative_groups.h>
+#endif
 #include <curand_kernel.h>
 
 #include "context.h"
diff --git a/csrc/includes/strided_batch_gemm.h b/csrc/includes/strided_batch_gemm.h
index 8c43608e2ecf..d882dc1be1fa 100644
--- a/csrc/includes/strided_batch_gemm.h
+++ b/csrc/includes/strided_batch_gemm.h
@@ -65,7 +65,11 @@ class StridedBatchGemm {
                                     stride_b,
                                     stride_c,
                                     bsz,
+#ifdef __HIP_PLATFORM_HCC__
+                                    rocblas_gemm_algo(_config.gemm_algos[0]));
+#else
                                     cublasGemmAlgo_t(_config.gemm_algos[0]));
+#endif
     }
 
     void ForwardPlusSave(T* output, const T* _buffer_a, const T* _buffer_b, cublasHandle_t handle)
@@ -89,7 +93,11 @@ class StridedBatchGemm {
                                     stride_b,
                                     stride_c,
                                     _config.batch_size,
+#ifdef __HIP_PLATFORM_HCC__
+                                    rocblas_gemm_algo(_config.gemm_algos[0]));
+#else
                                     cublasGemmAlgo_t(_config.gemm_algos[0]));
+#endif
 
         k_buf = _buffer_a;
         q_buf = _buffer_b;
@@ -129,7 +137,11 @@ class StridedBatchGemm {
                                     stride_b,
                                     stride_c,
                                     bsz,
+#ifdef __HIP_PLATFORM_HCC__
+                                    rocblas_gemm_algo(_config.gemm_algos[1]));
+#else
                                     cublasGemmAlgo_t(_config.gemm_algos[1]));
+#endif
 
         // A need to transpose.
         cublasOperation_t op_a = (_config.op_A == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
@@ -154,7 +166,11 @@ class StridedBatchGemm {
                                     stride_b,
                                     stride_c,
                                     bsz,
+#ifdef __HIP_PLATFORM_HCC__
+                                    rocblas_gemm_algo(_config.gemm_algos[2]));
+#else
                                     cublasGemmAlgo_t(_config.gemm_algos[2]));
+#endif
     }
 
     inline int GetN() const { return _config.k; }
diff --git a/csrc/transformer/cublas_wrappers.cu b/csrc/transformer/cublas_wrappers.cu
index 7b0016bcae5e..2068fe668360 100644
--- a/csrc/transformer/cublas_wrappers.cu
+++ b/csrc/transformer/cublas_wrappers.cu
@@ -1,5 +1,19 @@
 #include "cublas_wrappers.h"
 
+#ifdef __HIP_PLATFORM_HCC__
+int cublas_gemm_ex(rocblas_handle handle,
+                   rocblas_operation transa,
+                   rocblas_operation transb,
+                   int m,
+                   int n,
+                   int k,
+                   const float* alpha,
+                   const float* beta,
+                   const float* A,
+                   const float* B,
+                   float* C,
+                   rocblas_gemm_algo algo)
+#else
 int cublas_gemm_ex(cublasHandle_t handle,
                    cublasOperation_t transa,
                    cublasOperation_t transb,
@@ -12,7 +26,34 @@ int cublas_gemm_ex(cublasHandle_t handle,
                    const float* B,
                    float* C,
                    cublasGemmAlgo_t algo)
+#endif
 {
+#ifdef __HIP_PLATFORM_HCC__
+    rocblas_status status = rocblas_gemm_ex(handle,
+                                            transa,
+                                            transb,
+                                            m,
+                                            n,
+                                            k,
+                                            (const void*)alpha,
+                                            (const void*)A,
+                                            rocblas_datatype_f32_r,
+                                            (transa == rocblas_operation_none) ? m : k,
+                                            (const void*)B,
+                                            rocblas_datatype_f32_r,
+                                            (transb == rocblas_operation_none) ? k : n,
+                                            (const void*)beta,
+                                            C,
+                                            rocblas_datatype_f32_r,
+                                            m,
+                                            C,
+                                            rocblas_datatype_f32_r,
+                                            m,
+                                            rocblas_datatype_f32_r,
+                                            algo,
+					    0,
+					    0);
+#else
     cublasStatus_t status = cublasGemmEx(handle,
                                          transa,
                                          transb,
@@ -32,14 +73,33 @@ int cublas_gemm_ex(cublasHandle_t handle,
                                          m,
                                          CUDA_R_32F,
                                          algo);
+#endif
 
+#ifdef __HIP_PLATFORM_HCC__
+    if (status != rocblas_status_success) {
+#else
     if (status != CUBLAS_STATUS_SUCCESS) {
+#endif
         fprintf(stderr, "!!!! kernel execution error.\n");
         return EXIT_FAILURE;
     }
     return 0;
 }
 
+#ifdef __HIP_PLATFORM_HCC__
+int cublas_gemm_ex(rocblas_handle handle,
+                   rocblas_operation transa,
+                   rocblas_operation transb,
+                   int m,
+                   int n,
+                   int k,
+                   const float* alpha,
+                   const float* beta,
+                   const __half* A,
+                   const __half* B,
+                   __half* C,
+                   rocblas_gemm_algo algo)
+#else
 int cublas_gemm_ex(cublasHandle_t handle,
                    cublasOperation_t transa,
                    cublasOperation_t transb,
@@ -52,7 +112,34 @@ int cublas_gemm_ex(cublasHandle_t handle,
                    const __half* B,
                    __half* C,
                    cublasGemmAlgo_t algo)
+#endif
 {
+#ifdef __HIP_PLATFORM_HCC__
+    rocblas_status status = rocblas_gemm_ex(handle,
+                                         transa,
+                                         transb,
+                                         m,
+                                         n,
+                                         k,
+                                         (const void*)alpha,
+                                         (const void*)A,
+                                         rocblas_datatype_f16_r ,
+                                         (transa == rocblas_operation_none) ? m : k,
+                                         (const void*)B,
+                                         rocblas_datatype_f16_r,
+                                         (transb == rocblas_operation_none) ? k : n,
+                                         (const void*)beta,
+                                         (void*)C,
+                                         rocblas_datatype_f16_r,
+                                         m,
+                                         (void*)C,
+                                         rocblas_datatype_f16_r,
+                                         m,
+                                         rocblas_datatype_f32_r,
+                                         algo,
+					 0,
+					 0);
+#else
     cublasStatus_t status = cublasGemmEx(handle,
                                          transa,
                                          transb,
@@ -72,14 +159,37 @@ int cublas_gemm_ex(cublasHandle_t handle,
                                          m,
                                          CUDA_R_32F,
                                          algo);
+#endif
 
+#ifdef __HIP_PLATFORM_HCC__
+    if (status != rocblas_status_success) {
+#else
     if (status != CUBLAS_STATUS_SUCCESS) {
+#endif
         fprintf(stderr, "!!!! kernel execution error.\n");
         return EXIT_FAILURE;
     }
     return 0;
 }
 
+#ifdef __HIP_PLATFORM_HCC__
+int cublas_strided_batched_gemm(rocblas_handle handle,
+                                int m,
+                                int n,
+                                int k,
+                                const float* alpha,
+                                const float* beta,
+                                const float* A,
+                                const float* B,
+                                float* C,
+                                rocblas_operation op_A,
+                                rocblas_operation op_B,
+                                int stride_A,
+                                int stride_B,
+                                int stride_C,
+                                int batch,
+                                rocblas_gemm_algo algo)
+#else
 int cublas_strided_batched_gemm(cublasHandle_t handle,
                                 int m,
                                 int n,
@@ -96,7 +206,39 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
                                 int stride_C,
                                 int batch,
                                 cublasGemmAlgo_t algo)
+#endif
 {
+#ifdef __HIP_PLATFORM_HCC__
+    rocblas_status status = rocblas_gemm_strided_batched_ex(handle,
+                                                            op_A,
+                                                            op_B,
+                                                            m,
+                                                            n,
+                                                            k,
+                                                            alpha,
+                                                            A,
+                                                            rocblas_datatype_f32_r,
+                                                            (op_A == rocblas_operation_none) ? m : k,
+                                                            stride_A,
+                                                            B,
+                                                            rocblas_datatype_f32_r,
+                                                            (op_B == rocblas_operation_none) ? k : n,
+                                                            stride_B,
+                                                            beta,
+                                                            C,
+                                                            rocblas_datatype_f32_r,
+                                                            m,
+                                                            stride_C,
+                                                            C,
+                                                            rocblas_datatype_f32_r,
+                                                            m,
+                                                            stride_C,
+                                                            batch,
+                                                            rocblas_datatype_f32_r,
+                                                            algo,
+							    0,
+							    0);
+#else
     cublasStatus_t status = cublasGemmStridedBatchedEx(handle,
                                                        op_A,
                                                        op_B,
@@ -120,14 +262,37 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
                                                        batch,
                                                        CUDA_R_32F,
                                                        algo);
+#endif
 
+#ifdef __HIP_PLATFORM_HCC__
     if (status != CUBLAS_STATUS_SUCCESS) {
+#else
+    if (status != rocblas_status_success) {
+#endif
         fprintf(stderr, "!!!! kernel execution error.\n");
         return EXIT_FAILURE;
     }
     return 0;
 }
 
+#ifdef __HIP_PLATFORM_HCC__
+int cublas_strided_batched_gemm(rocblas_handle handle,
+                                int m,
+                                int n,
+                                int k,
+                                const float* alpha,
+                                const float* beta,
+                                const __half* A,
+                                const __half* B,
+                                __half* C,
+                                rocblas_operation op_A,
+                                rocblas_operation op_B,
+                                int stride_A,
+                                int stride_B,
+                                int stride_C,
+                                int batch,
+                                rocblas_gemm_algo algo)
+#else
 int cublas_strided_batched_gemm(cublasHandle_t handle,
                                 int m,
                                 int n,
@@ -144,7 +309,39 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
                                 int stride_C,
                                 int batch,
                                 cublasGemmAlgo_t algo)
+#endif
 {
+#ifdef __HIP_PLATFORM_HCC__
+    rocblas_status status = rocblas_gemm_strided_batched_ex(handle,
+                                                            op_A,
+                                                            op_B,
+                                                            m,
+                                                            n,
+                                                            k,
+                                                            alpha,
+                                                            A,
+                                                            rocblas_datatype_f16_r,
+                                                            (op_A == rocblas_operation_none) ? m : k,
+                                                            stride_A,
+                                                            B,
+                                                            rocblas_datatype_f16_r,
+                                                            (op_B == rocblas_operation_none) ? k : n,
+                                                            stride_B,
+                                                            beta,
+                                                            C,
+                                                            rocblas_datatype_f16_r,
+                                                            m,
+                                                            stride_C,
+                                                            C,
+                                                            rocblas_datatype_f16_r,
+                                                            m,
+                                                            stride_C,
+                                                            batch,
+                                                            rocblas_datatype_f32_r,
+                                                            algo,
+							    0,
+							    0);
+#else
     cublasStatus_t status = cublasGemmStridedBatchedEx(handle,
                                                        op_A,
                                                        op_B,
@@ -168,8 +365,13 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
                                                        batch,
                                                        CUDA_R_32F,
                                                        algo);
+#endif
 
+#ifdef __HIP_PLATFORM_HCC__
+    if (status != rocblas_status_success) {
+#else
     if (status != CUBLAS_STATUS_SUCCESS) {
+#endif
         fprintf(stderr, "!!!! kernel execution error.\n");
         return EXIT_FAILURE;
     }
diff --git a/csrc/transformer/ds_transformer_cuda.cpp b/csrc/transformer/ds_transformer_cuda.cpp
index 269468bdfdb4..b469eef2dde7 100755
--- a/csrc/transformer/ds_transformer_cuda.cpp
+++ b/csrc/transformer/ds_transformer_cuda.cpp
@@ -146,7 +146,9 @@ void BertTransformerLayer<T>::Initialize()
     Context::Instance().GenWorkSpace(get_workspace_size<T>(
         _batch_size, _seq_length, _hidden_size, _heads, _training, _gelu_checkpoint));
 
+#ifndef __HIP_PLATFORM_HCC__
     if (std::is_same<T, __half>::value) cublasSetMathMode(_cublasHandle, CUBLAS_TENSOR_OP_MATH);
+#endif
 }
 
 template <typename T>

From fbddd9316c3686a15e3d805cbececf25feb4aa69 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Wed, 23 Dec 2020 23:36:02 +0000
Subject: [PATCH 31/66] Cooperative Groups workaround for transformer kernels
 extension

---
 csrc/transformer/general_kernels.cu   |  5 ++-
 csrc/transformer/normalize_kernels.cu | 64 ++++++++++++++++++++-------
 csrc/transformer/softmax_kernels.cu   | 16 +++++--
 3 files changed, 64 insertions(+), 21 deletions(-)

diff --git a/csrc/transformer/general_kernels.cu b/csrc/transformer/general_kernels.cu
index 0ce280a702ab..fbb8cd7738d9 100644
--- a/csrc/transformer/general_kernels.cu
+++ b/csrc/transformer/general_kernels.cu
@@ -11,7 +11,10 @@ __global__ void column_sum_reduce(const T* __restrict__ inp,
     __shared__ float tile[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
+    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
+    g.tiled_partition(b, TILE_DIM);
+
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
     int offset = threadIdx.y * width + idx;
diff --git a/csrc/transformer/normalize_kernels.cu b/csrc/transformer/normalize_kernels.cu
index 7345175694bf..7ee835771c27 100755
--- a/csrc/transformer/normalize_kernels.cu
+++ b/csrc/transformer/normalize_kernels.cu
@@ -35,7 +35,9 @@ __global__ void fused_bias_residual_layer_norm(float* vals,
     constexpr int iteration_stride = row_stride / iterations;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
+    //cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
+    g.tiled_partition(b, 32);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -117,7 +119,9 @@ __global__ void fused_bias_residual_layer_norm(__half* vals,
     constexpr int iteration_stride = row_stride / iterations;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
+    //cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
+    g.tiled_partition(b, 32);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -317,7 +321,9 @@ __global__ void fused_bias_residual_layer_norm(float* vals,
     constexpr int iteration_stride = row_stride / iterations;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
+    //cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
+    g.tiled_partition(b, 32);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -397,7 +403,9 @@ __global__ void fused_bias_residual_layer_norm(__half* vals,
     constexpr int iteration_stride = row_stride / iterations;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
+    //cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
+    g.tiled_partition(b, 32);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -711,7 +719,9 @@ __global__ void LayerNormBackward1(const T* __restrict__ out_grad,
     __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
+    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
+    g.tiled_partition(b, TILE_DIM);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
     int offset = threadIdx.y * width + idx;
@@ -778,7 +788,9 @@ __global__ void LayerNormBackward1(const T* __restrict__ out_grad,
     __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
+    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
+    g.tiled_partition(b, TILE_DIM);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
     int offset = threadIdx.y * width + idx;
@@ -843,7 +855,9 @@ __global__ void LayerNormBackward2(const float* out_grad,
     constexpr int iteration_stride = THREADS;  // row_stride / iterations;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -929,7 +943,9 @@ __global__ void LayerNormBackward2(const __half* out_grad,
     constexpr int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1145,7 +1161,9 @@ __global__ void LayerNormBackward2(const float* out_grad,
     constexpr int iteration_stride = THREADS;  // row_stride / iterations;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1229,7 +1247,9 @@ __global__ void LayerNormBackward2(const __half* out_grad,
     constexpr int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1439,7 +1459,9 @@ __global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1,
     __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
+    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
+    g.tiled_partition(b, TILE_DIM);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
     int offset = threadIdx.y * width + idx;
@@ -1501,7 +1523,9 @@ __global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1,
     __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
+    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
+    g.tiled_partition(b, TILE_DIM);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
     int offset = threadIdx.y * width + idx;
@@ -1560,7 +1584,9 @@ __global__ void LayerNormBackward2_fused_add(const float* out_grad1,
     constexpr int iteration_stride = THREADS;  // row_stride / iterations;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1647,7 +1673,9 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
     constexpr int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1868,7 +1896,9 @@ __global__ void LayerNormBackward2_fused_add(const float* out_grad1,
     constexpr int iteration_stride = THREADS;  // row_stride / iterations;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1957,7 +1987,9 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
     constexpr int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
diff --git a/csrc/transformer/softmax_kernels.cu b/csrc/transformer/softmax_kernels.cu
index 8e2b86901609..af668d51352f 100644
--- a/csrc/transformer/softmax_kernels.cu
+++ b/csrc/transformer/softmax_kernels.cu
@@ -19,7 +19,9 @@ __global__ void attn_softmax(float* vals,
     int block_width = blockStride * seq_length;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
+    //cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
+    g.tiled_partition(b, tbSize);
 
     int batch = blockIdx.x;
     int row = blockIdx.y;
@@ -148,7 +150,9 @@ __global__ void attn_softmax(__half* vals,
     int block_width = blockStride * seq_length;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
+    //cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
+    g.tiled_partition(b, tbSize);
 
     int batch = blockIdx.x;
     int row = blockIdx.y;
@@ -436,7 +440,9 @@ __global__ void softmax_backward_kernel(T* out_grad, const T* soft_inp, int seq_
                           : MAX_THREAD_ITERATIONS);
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
+    //cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
+    g.tiled_partition(b, tbSize);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -511,7 +517,9 @@ __global__ void softmax_backward_kernel_v2(T* grad /* input & output*/,
     }
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
 
     for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_xor(sum, i);
 

From 9091b20baa1b6476fdce13f0b6311a254a0b59ea Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Thu, 7 Jan 2021 22:03:38 +0000
Subject: [PATCH 32/66] Update apex commit

---
 third_party/apex | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/apex b/third_party/apex
index 7e099371e7cc..76e4e05408b0 160000
--- a/third_party/apex
+++ b/third_party/apex
@@ -1 +1 @@
-Subproject commit 7e099371e7ccdaf82058d7db9646269f4756a21b
+Subproject commit 76e4e05408b06035c78672a014d92aaad27ec1d1

From 5e6bb85674d9450298ce84dcd19a3c0946ad6377 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Fri, 26 Mar 2021 23:32:01 +0000
Subject: [PATCH 33/66] Integrate op_builder from upstream and update for ROCm

---
 op_builder/__init__.py               |  24 ++
 op_builder/builder.py                | 346 +++++++++++++++++++++++++++
 op_builder/cpu_adam.py               |  90 +++++++
 op_builder/fused_adam.py             |  31 +++
 op_builder/fused_lamb.py             |  31 +++
 op_builder/sparse_attn.py            |  52 ++++
 op_builder/stochastic_transformer.py |  21 ++
 op_builder/transformer.py            |  56 +++++
 op_builder/utils.py                  |  18 ++
 9 files changed, 669 insertions(+)
 create mode 100644 op_builder/__init__.py
 create mode 100644 op_builder/builder.py
 create mode 100644 op_builder/cpu_adam.py
 create mode 100644 op_builder/fused_adam.py
 create mode 100644 op_builder/fused_lamb.py
 create mode 100644 op_builder/sparse_attn.py
 create mode 100644 op_builder/stochastic_transformer.py
 create mode 100644 op_builder/transformer.py
 create mode 100644 op_builder/utils.py

diff --git a/op_builder/__init__.py b/op_builder/__init__.py
new file mode 100644
index 000000000000..01711869371d
--- /dev/null
+++ b/op_builder/__init__.py
@@ -0,0 +1,24 @@
+"""
+Copyright 2020 The Microsoft DeepSpeed Team
+"""
+from .cpu_adam import CPUAdamBuilder
+from .fused_adam import FusedAdamBuilder
+from .fused_lamb import FusedLambBuilder
+from .sparse_attn import SparseAttnBuilder
+from .transformer import TransformerBuilder
+from .stochastic_transformer import StochasticTransformerBuilder
+from .utils import UtilsBuilder
+from .builder import get_default_compute_capatabilities, is_rocm_pytorch
+
+# TODO: infer this list instead of hard coded
+# List of all available ops
+__op_builders__ = [
+    CPUAdamBuilder(),
+    FusedAdamBuilder(),
+    FusedLambBuilder(),
+    SparseAttnBuilder(),
+    TransformerBuilder(),
+    StochasticTransformerBuilder(),
+    UtilsBuilder()
+]
+ALL_OPS = {op.name: op for op in __op_builders__}
diff --git a/op_builder/builder.py b/op_builder/builder.py
new file mode 100644
index 000000000000..0511bdc4c6e4
--- /dev/null
+++ b/op_builder/builder.py
@@ -0,0 +1,346 @@
+"""
+Copyright 2020 The Microsoft DeepSpeed Team
+"""
+import os
+import time
+import torch
+import importlib
+from pathlib import Path
+import subprocess
+from abc import ABC, abstractmethod
+
+YELLOW = '\033[93m'
+END = '\033[0m'
+WARNING = f"{YELLOW} [WARNING] {END}"
+
+DEFAULT_TORCH_EXTENSION_PATH = "/tmp/torch_extensions"
+DEFAULT_COMPUTE_CAPABILITIES = "6.0;6.1;7.0"
+
+is_rocm_pytorch = False
+if torch.__version__ >= '1.5':
+    from torch.utils.cpp_extension import ROCM_HOME
+    is_rocm_pytorch = True if ((torch.version.hip is not None) and (ROCM_HOME is not None)) else False
+
+def installed_cuda_version():
+    import torch.utils.cpp_extension
+    cuda_home = torch.utils.cpp_extension.CUDA_HOME
+    assert cuda_home is not None, "CUDA_HOME does not exist, unable to compile CUDA op(s)"
+    # Ensure there is not a cuda version mismatch between torch and nvcc compiler
+    output = subprocess.check_output([cuda_home + "/bin/nvcc",
+                                      "-V"],
+                                     universal_newlines=True)
+    output_split = output.split()
+    release_idx = output_split.index("release")
+    release = output_split[release_idx + 1].replace(',', '').split(".")
+    # Ignore patch versions, only look at major + minor
+    cuda_major, cuda_minor = release[:2]
+    installed_cuda_version = ".".join(release[:2])
+    return int(cuda_major), int(cuda_minor)
+
+
+def get_default_compute_capatabilities():
+    compute_caps = DEFAULT_COMPUTE_CAPABILITIES
+    import torch.utils.cpp_extension
+    if torch.utils.cpp_extension.CUDA_HOME is not None and installed_cuda_version(
+    )[0] >= 11:
+        if installed_cuda_version()[0] == 11 and installed_cuda_version()[1] == 0:
+            # Special treatment of CUDA 11.0 because compute_86 is not supported.
+            compute_caps += ";8.0"
+        else:
+            compute_caps += ";8.0;8.6"
+    return compute_caps
+
+
+def assert_no_cuda_mismatch():
+    cuda_major, cuda_minor = installed_cuda_version()
+    sys_cuda_version = f'{cuda_major}.{cuda_minor}'
+    torch_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
+    # This is a show-stopping error, should probably not proceed past this
+    if sys_cuda_version != torch_cuda_version:
+        if sys_cuda_version == "11.1" and torch_cuda_version == "11.0":
+            # it works to build against installed cuda-11.1 while torch was built with cuda-11.0
+            return
+        raise Exception(
+            f"Installed CUDA version {sys_cuda_version} does not match the "
+            f"version torch was compiled with {torch.version.cuda}, unable to compile "
+            "cuda/cpp extensions without a matching cuda version.")
+
+
+def assert_torch_info(torch_info):
+    install_torch_version = torch_info['version']
+    install_cuda_version = torch_info['cuda_version']
+
+    current_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
+    current_torch_version = ".".join(torch.__version__.split('.')[:2])
+
+    if install_cuda_version != current_cuda_version or install_torch_version != current_torch_version:
+        raise RuntimeError(
+            "PyTorch and CUDA version mismatch! DeepSpeed ops were compiled and installed "
+            "with a different version than what is being used at runtime. Please re-install "
+            f"DeepSpeed or switch torch versions. DeepSpeed install versions: "
+            f"torch={install_torch_version}, cuda={install_cuda_version}, runtime versions:"
+            f"torch={current_torch_version}, cuda={current_cuda_version}")
+
+
+class OpBuilder(ABC):
+    def __init__(self, name):
+        self.name = name
+        self.jit_mode = False
+
+    @abstractmethod
+    def absolute_name(self):
+        '''
+        Returns absolute build path for cases where the op is pre-installed, e.g., deepspeed.ops.adam.cpu_adam
+        will be installed as something like: deepspeed/ops/adam/cpu_adam.so
+        '''
+        pass
+
+    @abstractmethod
+    def sources(self):
+        '''
+        Returns list of source files for your op, relative to root of deepspeed package (i.e., DeepSpeed/deepspeed)
+        '''
+        pass
+
+    def include_paths(self):
+        '''
+        Returns list of include paths, relative to root of deepspeed package (i.e., DeepSpeed/deepspeed)
+        '''
+        return []
+
+    def nvcc_args(self):
+        '''
+        Returns optional list of compiler flags to forward to nvcc when building CUDA sources
+        '''
+        return []
+
+    def cxx_args(self):
+        '''
+        Returns optional list of compiler flags to forward to the build
+        '''
+        return []
+
+    def is_compatible(self):
+        '''
+        Check if all non-python dependencies are satisfied to build this op
+        '''
+        return True
+
+    def extra_ldflags(self):
+        return []
+
+    def libraries_installed(self, libraries):
+        valid = False
+        check_cmd = 'dpkg -l'
+        for lib in libraries:
+            result = subprocess.Popen(f'dpkg -l {lib}',
+                                      stdout=subprocess.PIPE,
+                                      stderr=subprocess.PIPE,
+                                      shell=True)
+            valid = valid or result.wait() == 0
+        return valid
+
+    def simd_width(self):
+        if not self.command_exists('lscpu'):
+            self.warning(
+                f"{self.name} is attempted to query 'lscpu' to detect the existence "
+                "of AVX instructions. However, 'lscpu' does not appear to exist on "
+                "your system, will fall back to non-vectorized execution.")
+            return ''
+
+        result = subprocess.check_output('lscpu', shell=True)
+        result = result.decode('utf-8').strip().lower()
+        if 'genuineintel' in result:
+            if 'avx512' in result:
+                return '-D__AVX512__'
+            elif 'avx2' in result:
+                return '-D__AVX256__'
+        return ''
+
+    def python_requirements(self):
+        '''
+        Override if op wants to define special dependencies, otherwise will
+        take self.name and load requirements-<op-name>.txt if it exists.
+        '''
+        path = f'requirements/requirements-{self.name}.txt'
+        requirements = []
+        if os.path.isfile(path):
+            with open(path, 'r') as fd:
+                requirements = [r.strip() for r in fd.readlines()]
+        return requirements
+
+    def command_exists(self, cmd):
+        if '|' in cmd:
+            cmds = cmd.split("|")
+        else:
+            cmds = [cmd]
+        valid = False
+        for cmd in cmds:
+            result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True)
+            valid = valid or result.wait() == 0
+
+        if not valid and len(cmds) > 1:
+            print(
+                f"{WARNING} {self.name} requires one of the following commands '{cmds}', but it does not exist!"
+            )
+        elif not valid and len(cmds) == 1:
+            print(
+                f"{WARNING} {self.name} requires the '{cmd}' command, but it does not exist!"
+            )
+        return valid
+
+    def warning(self, msg):
+        print(f"{WARNING} {msg}")
+
+    def deepspeed_src_path(self, code_path):
+        if os.path.isabs(code_path):
+            return code_path
+        else:
+            return os.path.join(Path(__file__).parent.parent.absolute(), code_path)
+
+    def builder(self):
+        from torch.utils.cpp_extension import CppExtension
+        return CppExtension(name=self.absolute_name(),
+                            sources=self.sources(),
+                            include_dirs=self.include_paths(),
+                            extra_compile_args={'cxx': self.cxx_args()},
+                            extra_link_args=self.extra_ldflags())
+
+    def load(self, verbose=True):
+        from ...git_version_info import installed_ops, torch_info
+        if installed_ops[self.name]:
+            # Ensure the op we're about to load was compiled with the same
+            # torch/cuda versions we are currently using at runtime.
+            if isinstance(self, CUDAOpBuilder):
+                assert_torch_info(torch_info)
+
+            return importlib.import_module(self.absolute_name())
+        else:
+            return self.jit_load(verbose)
+
+    def jit_load(self, verbose=True):
+        if not self.is_compatible():
+            raise RuntimeError(
+                f"Unable to JIT load the {self.name} op due to it not being compatible due to hardware/software issue."
+            )
+        try:
+            import ninja
+        except ImportError:
+            raise RuntimeError(
+                f"Unable to JIT load the {self.name} op due to ninja not being installed."
+            )
+
+        if isinstance(self, CUDAOpBuilder) and not is_rocm_pytorch:
+            assert_no_cuda_mismatch()
+
+        self.jit_mode = True
+        from torch.utils.cpp_extension import load
+
+        # Ensure directory exists to prevent race condition in some cases
+        ext_path = os.path.join(
+            os.environ.get('TORCH_EXTENSIONS_DIR',
+                           DEFAULT_TORCH_EXTENSION_PATH),
+            self.name)
+        os.makedirs(ext_path, exist_ok=True)
+
+        start_build = time.time()
+        sources = [self.deepspeed_src_path(path) for path in self.sources()]
+        op_module = load(
+            name=self.name,
+            sources=sources,
+            extra_include_paths=[
+                self.deepspeed_src_path(path) for path in self.include_paths()
+            ],
+            extra_cflags=self.cxx_args(),
+            extra_cuda_cflags=self.nvcc_args(),
+            extra_ldflags=self.extra_ldflags(),
+            verbose=verbose)
+        build_duration = time.time() - start_build
+        if verbose:
+            print(f"Time to load {self.name} op: {build_duration} seconds")
+        return op_module
+
+
+class CUDAOpBuilder(OpBuilder):
+    def compute_capability_args(self, cross_compile_archs=None):
+        """
+        Returns nvcc compute capability compile flags.
+
+        1. `TORCH_CUDA_ARCH_LIST` takes priority over `cross_compile_archs`.
+        2. If neither is set default compute capabilities will be used
+        3. Under `jit_mode` compute capabilities of all visible cards will be used plus PTX
+
+        Format:
+
+        - `TORCH_CUDA_ARCH_LIST` may use ; or whitespace separators. Examples:
+
+        TORCH_CUDA_ARCH_LIST="6.1;7.5;8.6" pip install ...
+        TORCH_CUDA_ARCH_LIST="5.2 6.0 6.1 7.0 7.5 8.0 8.6+PTX" pip install ...
+
+        - `cross_compile_archs` uses ; separator.
+
+        """
+
+        ccs = []
+        if self.jit_mode:
+            # Compile for underlying architectures since we know those at runtime
+            for i in range(torch.cuda.device_count()):
+                CC_MAJOR, CC_MINOR = torch.cuda.get_device_capability(i)
+                cc = f"{CC_MAJOR}.{CC_MINOR}"
+                if cc not in ccs:
+                    ccs.append(cc)
+            ccs = sorted(ccs)
+            ccs[-1] += '+PTX'
+        else:
+            # Cross-compile mode, compile for various architectures
+            # env override takes priority
+            cross_compile_archs_env = os.environ.get('TORCH_CUDA_ARCH_LIST', None)
+            if cross_compile_archs_env is not None:
+                if cross_compile_archs is not None:
+                    print(
+                        f"{WARNING} env var `TORCH_CUDA_ARCH_LIST={cross_compile_archs_env}` overrides `cross_compile_archs={cross_compile_archs}`"
+                    )
+                cross_compile_archs = cross_compile_archs_env.replace(' ', ';')
+            else:
+                if cross_compile_archs is None:
+                    cross_compile_archs = get_default_compute_capatabilities()
+            ccs = cross_compile_archs.split(';')
+
+        args = []
+        for cc in ccs:
+            num = cc[0] + cc[2]
+            args.append(f'-gencode=arch=compute_{num},code=sm_{num}')
+            if cc.endswith('+PTX'):
+                args.append(f'-gencode=arch=compute_{num},code=compute_{num}')
+
+        return args
+
+    def version_dependent_macros(self):
+        # Fix from apex that might be relevant for us as well, related to https://github.com/NVIDIA/apex/issues/456
+        TORCH_MAJOR = int(torch.__version__.split('.')[0])
+        TORCH_MINOR = int(torch.__version__.split('.')[1])
+        version_ge_1_1 = []
+        if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
+            version_ge_1_1 = ['-DVERSION_GE_1_1']
+        version_ge_1_3 = []
+        if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2):
+            version_ge_1_3 = ['-DVERSION_GE_1_3']
+        version_ge_1_5 = []
+        if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4):
+            version_ge_1_5 = ['-DVERSION_GE_1_5']
+        return version_ge_1_1 + version_ge_1_3 + version_ge_1_5
+
+    def is_compatible(self):
+        return super().is_compatible()
+
+    def builder(self):
+        from torch.utils.cpp_extension import CUDAExtension
+        if not is_rocm_pytorch:
+            assert_no_cuda_mismatch()
+        return CUDAExtension(name=self.absolute_name(),
+                             sources=self.sources(),
+                             include_dirs=self.include_paths(),
+                             extra_compile_args={
+                                 'cxx': self.cxx_args(),
+                                 'nvcc': self.nvcc_args()
+                             })
diff --git a/op_builder/cpu_adam.py b/op_builder/cpu_adam.py
new file mode 100644
index 000000000000..adf078c9bc0f
--- /dev/null
+++ b/op_builder/cpu_adam.py
@@ -0,0 +1,90 @@
+"""
+Copyright 2020 The Microsoft DeepSpeed Team
+"""
+import os
+import torch
+import subprocess
+from .builder import CUDAOpBuilder, is_rocm_pytorch
+
+
+class CPUAdamBuilder(CUDAOpBuilder):
+    BUILD_VAR = "DS_BUILD_CPU_ADAM"
+    NAME = "cpu_adam"
+
+    def __init__(self):
+        super().__init__(name=self.NAME)
+
+    def absolute_name(self):
+        return f'deepspeed.ops.adam.{self.NAME}_op'
+
+    def sources(self):
+        return ['csrc/adam/cpu_adam.cpp', 'csrc/adam/custom_cuda_kernel.cu']
+
+    def include_paths(self):
+        if not is_rocm_pytorch:
+            CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")]
+        else:
+            CUDA_INCLUDE = [
+                            os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include"),
+                            os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "rocrand"), 
+                            os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "hiprand"),
+            ]
+        return ['csrc/includes'] + CUDA_INCLUDE
+
+    def simd_width(self):
+        if not self.command_exists('lscpu'):
+            self.warning(
+                "CPUAdam attempted to query 'lscpu' to detect the existence "
+                "of AVX instructions. However, 'lscpu' does not appear to exist on "
+                "your system, will fall back to non-vectorized execution.")
+            return ''
+
+        result = subprocess.check_output('lscpu', shell=True)
+        result = result.decode('utf-8').strip().lower()
+        if 'genuineintel' in result:
+            if 'avx512' in result:
+                return '-D__AVX512__'
+            elif 'avx2' in result:
+                return '-D__AVX256__'
+        return '-D__SCALAR__'
+
+    def cxx_args(self):
+        if not is_rocm_pytorch:
+            CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
+        else:
+            CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.ROCM_HOME, "lib")
+        SIMD_WIDTH = self.simd_width()
+
+        return [
+            '-O3',
+            '-std=c++14',
+            f'-L{CUDA_LIB64}',
+            '-lcudart',
+            '-lcublas',
+            '-g',
+            '-Wno-reorder',
+            '-march=native',
+            '-fopenmp',
+            SIMD_WIDTH
+        ]
+
+    def nvcc_args(self):
+        args = [
+            '-O3',
+            '-std=c++14'
+        ]
+        if is_rocm_pytorch:
+            args += [
+                '-U__HIP_NO_HALF_OPERATORS__',
+                '-U__HIP_NO_HALF_CONVERSIONS__',
+                '-U__HIP_NO_HALF2_OPERATORS__'
+            ]
+        else:
+            args += [
+                '--use_fast_math',
+                '-U__CUDA_NO_HALF_OPERATORS__',
+                '-U__CUDA_NO_HALF_CONVERSIONS__',
+                '-U__CUDA_NO_HALF2_OPERATORS__'
+            ]
+            args += self.compute_capability_args()
+        return args
diff --git a/op_builder/fused_adam.py b/op_builder/fused_adam.py
new file mode 100644
index 000000000000..52dbeff01d64
--- /dev/null
+++ b/op_builder/fused_adam.py
@@ -0,0 +1,31 @@
+"""
+Copyright 2020 The Microsoft DeepSpeed Team
+"""
+import torch
+from .builder import CUDAOpBuilder, is_rocm_pytorch
+
+
+class FusedAdamBuilder(CUDAOpBuilder):
+    BUILD_VAR = "DS_BUILD_FUSED_ADAM"
+    NAME = "fused_adam"
+
+    def __init__(self):
+        super().__init__(name=self.NAME)
+
+    def absolute_name(self):
+        return f'deepspeed.ops.adam.{self.NAME}_op'
+
+    def sources(self):
+        return ['csrc/adam/fused_adam_frontend.cpp', 'csrc/adam/multi_tensor_adam.cu']
+
+    def include_paths(self):
+        return ['csrc/includes', 'csrc/adam']
+
+    def cxx_args(self):
+        return ['-O3'] + self.version_dependent_macros()
+
+    def nvcc_args(self):
+        nvcc_flags=['-O3'] + self.version_dependent_macros()
+        if not is_rocm_pytorch:
+            nvcc_flags.extend(['-lineinfo', '--use_fast_math'] + self.compute_capability_args())
+        return nvcc_flags
diff --git a/op_builder/fused_lamb.py b/op_builder/fused_lamb.py
new file mode 100644
index 000000000000..ccc9730b4e27
--- /dev/null
+++ b/op_builder/fused_lamb.py
@@ -0,0 +1,31 @@
+"""
+Copyright 2020 The Microsoft DeepSpeed Team
+"""
+import torch
+from .builder import CUDAOpBuilder, is_rocm_pytorch
+
+
+class FusedLambBuilder(CUDAOpBuilder):
+    BUILD_VAR = 'DS_BUILD_FUSED_LAMB'
+    NAME = "fused_lamb"
+
+    def __init__(self):
+        super().__init__(name=self.NAME)
+
+    def absolute_name(self):
+        return f'deepspeed.ops.lamb.{self.NAME}_op'
+
+    def sources(self):
+        return ['csrc/lamb/fused_lamb_cuda.cpp', 'csrc/lamb/fused_lamb_cuda_kernel.cu']
+
+    def include_paths(self):
+        return ['csrc/includes']
+
+    def cxx_args(self):
+        return ['-O3'] + self.version_dependent_macros()
+
+    def nvcc_args(self):
+        nvcc_flags=['-O3'] + self.version_dependent_macros()
+        if not is_rocm_pytorch:
+            nvcc_flags.extend(['-lineinfo', '--use_fast_math'] + self.compute_capability_args())
+        return nvcc_flags
diff --git a/op_builder/sparse_attn.py b/op_builder/sparse_attn.py
new file mode 100644
index 000000000000..9a46c2ff3de6
--- /dev/null
+++ b/op_builder/sparse_attn.py
@@ -0,0 +1,52 @@
+"""
+Copyright 2020 The Microsoft DeepSpeed Team
+"""
+import torch
+import warnings
+from .builder import OpBuilder
+
+
+class SparseAttnBuilder(OpBuilder):
+    BUILD_VAR = "DS_BUILD_SPARSE_ATTN"
+    NAME = "sparse_attn"
+
+    def __init__(self):
+        super().__init__(name=self.NAME)
+
+    def absolute_name(self):
+        return f'deepspeed.ops.sparse_attention.{self.NAME}_op'
+
+    def sources(self):
+        return ['csrc/sparse_attention/utils.cpp']
+
+    def cxx_args(self):
+        return ['-O2', '-fopenmp']
+
+    def is_compatible(self):
+        # Check to see if llvm and cmake are installed since they are dependencies
+        required_commands = ['llvm-config|llvm-config-9', 'cmake']
+        command_status = list(map(self.command_exists, required_commands))
+        deps_compatible = all(command_status)
+
+        # torch-cpu will not have a cuda version
+        if torch.version.cuda is None:
+            cuda_compatible = False
+            self.warning(f"{self.NAME} cuda is not available from torch")
+        else:
+            major, minor = torch.version.cuda.split('.')[:2]
+            cuda_compatible = int(major) == 10 and int(minor) >= 1
+            if not cuda_compatible:
+                self.warning(
+                    f"{self.NAME} requires CUDA version 10.1+, does not currently support >=11 or <10.1"
+                )
+
+        TORCH_MAJOR = int(torch.__version__.split('.')[0])
+        TORCH_MINOR = int(torch.__version__.split('.')[1])
+        torch_compatible = TORCH_MAJOR == 1 and TORCH_MINOR >= 5
+        if not torch_compatible:
+            self.warning(
+                f'{self.NAME} requires a torch version >= 1.5 but detected {TORCH_MAJOR}.{TORCH_MINOR}'
+            )
+
+        return super().is_compatible(
+        ) and deps_compatible and torch_compatible and cuda_compatible
diff --git a/op_builder/stochastic_transformer.py b/op_builder/stochastic_transformer.py
new file mode 100644
index 000000000000..b7e2f3845117
--- /dev/null
+++ b/op_builder/stochastic_transformer.py
@@ -0,0 +1,21 @@
+"""
+Copyright 2020 The Microsoft DeepSpeed Team
+"""
+import torch
+from .transformer import TransformerBuilder
+
+
+class StochasticTransformerBuilder(TransformerBuilder):
+    BUILD_VAR = "DS_BUILD_STOCHASTIC_TRANSFORMER"
+    NAME = "stochastic_transformer"
+
+    def __init__(self):
+        super().__init__(name=self.NAME)
+
+    def absolute_name(self):
+        return f'deepspeed.ops.transformer.{self.NAME}_op'
+
+    def nvcc_args(self):
+        args = super().nvcc_args()
+        args.append('-D__STOCHASTIC_MODE__')
+        return args
diff --git a/op_builder/transformer.py b/op_builder/transformer.py
new file mode 100644
index 000000000000..12608e1ba898
--- /dev/null
+++ b/op_builder/transformer.py
@@ -0,0 +1,56 @@
+"""
+Copyright 2020 The Microsoft DeepSpeed Team
+"""
+import torch
+from .builder import CUDAOpBuilder, is_rocm_pytorch
+
+
+class TransformerBuilder(CUDAOpBuilder):
+    BUILD_VAR = "DS_BUILD_TRANSFORMER"
+    NAME = "transformer"
+
+    def __init__(self, name=None):
+        name = self.NAME if name is None else name
+        super().__init__(name=name)
+
+    def absolute_name(self):
+        return f'deepspeed.ops.transformer.{self.NAME}_op'
+
+    def sources(self):
+        return [
+            'csrc/transformer/ds_transformer_cuda.cpp',
+            'csrc/transformer/cublas_wrappers.cu',
+            'csrc/transformer/transform_kernels.cu',
+            'csrc/transformer/gelu_kernels.cu',
+            'csrc/transformer/dropout_kernels.cu',
+            'csrc/transformer/normalize_kernels.cu',
+            'csrc/transformer/softmax_kernels.cu',
+            'csrc/transformer/general_kernels.cu'
+        ]
+
+    def include_paths(self):
+        return ['csrc/includes']
+
+    def nvcc_args(self):
+        args = [
+            '-O3',
+            '-std=c++14',
+        ]
+        if is_rocm_pytorch:
+            args += [
+                '-U__HIP_NO_HALF_OPERATORS__',
+                '-U__HIP_NO_HALF_CONVERSIONS__',
+                '-U__HIP_NO_HALF2_OPERATORS__'
+            ]
+        else:
+            args += [
+                '--use_fast_math',
+                '-U__CUDA_NO_HALF_OPERATORS__',
+                '-U__CUDA_NO_HALF_CONVERSIONS__',
+                '-U__CUDA_NO_HALF2_OPERATORS__'
+            ]
+            args += self.compute_capability_args()
+        return args
+
+    def cxx_args(self):
+        return ['-O3', '-std=c++14', '-g', '-Wno-reorder']
diff --git a/op_builder/utils.py b/op_builder/utils.py
new file mode 100644
index 000000000000..02d4daa41680
--- /dev/null
+++ b/op_builder/utils.py
@@ -0,0 +1,18 @@
+"""
+Copyright 2020 The Microsoft DeepSpeed Team
+"""
+from .builder import OpBuilder
+
+
+class UtilsBuilder(OpBuilder):
+    BUILD_VAR = "DS_BUILD_UTILS"
+    NAME = "utils"
+
+    def __init__(self):
+        super().__init__(name=self.NAME)
+
+    def absolute_name(self):
+        return f'deepspeed.ops.{self.NAME}_op'
+
+    def sources(self):
+        return ['csrc/utils/flatten_unflatten.cpp']

From 67ed124fc9492fa0b38c34a5ee1b8012bcd70c92 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Sat, 27 Mar 2021 00:05:02 +0000
Subject: [PATCH 34/66] Update Dockerfile.rocm

---
 docker/Dockerfile.rocm | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 2578d98f0749..d111e966d66f 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -148,7 +148,7 @@ RUN cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
 # https://stackoverflow.com/a/53926898
 ##############################################################################
 RUN rm -rf /usr/lib/python3/dist-packages/yaml && \
-    rm -rf /usr/lib/python3/dist-packages/PyYAML-*
+        rm -rf /usr/lib/python3/dist-packages/PyYAML-*
 
 ##############################################################################
 ## Add deepspeed user
@@ -166,9 +166,8 @@ RUN rm -rf /usr/lib/python3/dist-packages/yaml && \
 ##############################################################################
 RUN git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
 RUN cd ${STAGE_DIR}/DeepSpeed && \
-    git checkout . && \
-    git checkout master && \
-    ./install.sh --third_party_only --allow_sudo && \
-    DS_BUILD_CUDA=0 DS_BUILD_LAMB=1 ./install.sh --allow_sudo
+        git checkout . && \
+        git checkout master && \
+        DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 ./install.sh --allow_sudo
 RUN rm -rf ${STAGE_DIR}/DeepSpeed
 RUN cd ~ && python -c "import deepspeed; print(deepspeed.__version__)"

From c4fe42741713b404955fb815ffbe57bf7f71380e Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Sat, 27 Mar 2021 00:10:04 +0000
Subject: [PATCH 35/66] Temporary hacks to workaround: 1) setup.py issues on
 ROCm wrt. absolute hipified paths. Proper fix is in PR:
 https://github.com/pytorch/pytorch/pull/54801 and 2) import issues with
 sparse_attention because of the workaround for 1)

---
 deepspeed/ops/__init__.py | 4 +++-
 setup.py                  | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/deepspeed/ops/__init__.py b/deepspeed/ops/__init__.py
index e6fd81fb5a13..525700b91e62 100755
--- a/deepspeed/ops/__init__.py
+++ b/deepspeed/ops/__init__.py
@@ -1,6 +1,8 @@
 from . import adam
 from . import lamb
-from . import sparse_attention
+from ..git_version_info_installed import installed_ops as __installed_ops__
+if __installed_ops__['sparse_attn']:
+    from . import sparse_attention
 from . import transformer
 
 from .transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig
diff --git a/setup.py b/setup.py
index a7e175fccc47..d0bf4f750d2d 100755
--- a/setup.py
+++ b/setup.py
@@ -182,7 +182,7 @@ def op_enabled(op_name):
       extras_require=extras_require,
       packages=find_packages(exclude=["docker",
                                       "third_party"]),
-      include_package_data=True,
+#      include_package_data=True, #FIXME
       scripts=[
           'bin/deepspeed',
           'bin/deepspeed.pt',

From 74ebc970916a8113a391769b0427f4f323f2a1a1 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Sat, 27 Mar 2021 06:27:32 +0000
Subject: [PATCH 36/66] torch.version.cuda doesn't exist for ROCm PyTorch

---
 setup.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index d0bf4f750d2d..02c44f6dd764 100755
--- a/setup.py
+++ b/setup.py
@@ -43,7 +43,10 @@ def fetch_requirements(path):
 # If MPI is available add 1bit-adam requirements
 if torch.cuda.is_available():
     if shutil.which('ompi_info') or shutil.which('mpiname'):
-        cupy = f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}"
+        if is_rocm_pytorch:
+            cupy = "cupy"
+        else:
+            cupy = f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}"
         extras_require['1bit_adam'].append(cupy)
 
 # Make an [all] extra that installs all needed dependencies

From 1bb74d03d12cd77dbe88b5386e74410c4b898a0c Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Mon, 29 Mar 2021 15:41:32 +0000
Subject: [PATCH 37/66] Add hip_version

---
 deepspeed/env_report.py       | 4 +++-
 deepspeed/git_version_info.py | 2 +-
 setup.py                      | 6 +++++-
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/deepspeed/env_report.py b/deepspeed/env_report.py
index b14ac4464835..7166104fde95 100644
--- a/deepspeed/env_report.py
+++ b/deepspeed/env_report.py
@@ -85,6 +85,8 @@ def debug_report():
          torch.__version__),
         ("torch cuda version",
          torch.version.cuda),
+        ("torch hip version",
+         torch.version.hip),
         ("nvcc version",
          nvcc_version()),
         ("deepspeed install path",
@@ -93,7 +95,7 @@ def debug_report():
          f"{deepspeed.__version__}, {deepspeed.__git_hash__}, {deepspeed.__git_branch__}"
          ),
         ("deepspeed wheel compiled w.",
-         f"torch {torch_info['version']}, cuda {torch_info['cuda_version']}"),
+         f"torch {torch_info['version']}, cuda {torch_info['cuda_version']}, hip {torch_info['hip_version']}"),
     ]
     print("DeepSpeed general environment info:")
     for name, value in report:
diff --git a/deepspeed/git_version_info.py b/deepspeed/git_version_info.py
index f04982c74f0d..a806475c397b 100644
--- a/deepspeed/git_version_info.py
+++ b/deepspeed/git_version_info.py
@@ -14,4 +14,4 @@
     from .ops.op_builder import ALL_OPS
     installed_ops = dict.fromkeys(ALL_OPS.keys(), False)
     compatible_ops = dict.fromkeys(ALL_OPS.keys(), False)
-    torch_info = {'version': "0.0", "cuda_version": "0.0"}
+    torch_info = {'version': "0.0", "cuda_version": "0.0", "hip_version": "0.0"}
diff --git a/setup.py b/setup.py
index 02c44f6dd764..12f07a70d2b6 100755
--- a/setup.py
+++ b/setup.py
@@ -149,9 +149,13 @@ def op_enabled(op_name):
 torch_version = ".".join([TORCH_MAJOR, TORCH_MINOR])
 # Set cuda_version to 0.0 if cpu-only
 cuda_version = "0.0"
+# Set hip_version to 0.0 if cpu-only
+hip_version = "0.0"
 if torch.version.cuda is not None:
     cuda_version = ".".join(torch.version.cuda.split('.')[:2])
-torch_info = {"version": torch_version, "cuda_version": cuda_version}
+if torch.version.hip is not None:
+    hip_version = ".".join(torch.version.hip.split('.')[:2])
+torch_info = {"version": torch_version, "cuda_version": cuda_version, "hip_version": hip_version}
 
 print(f"version={version_str}, git_hash={git_hash}, git_branch={git_branch}")
 with open('deepspeed/git_version_info_installed.py', 'w') as fd:

From 3d4e19d29710723cabdf6e8b742be332c5256fb3 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Mon, 29 Mar 2021 18:42:28 +0000
Subject: [PATCH 38/66] Check hip version for ROCm builds

---
 op_builder/builder.py | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/op_builder/builder.py b/op_builder/builder.py
index 0511bdc4c6e4..b9a96e5347ae 100644
--- a/op_builder/builder.py
+++ b/op_builder/builder.py
@@ -69,17 +69,31 @@ def assert_no_cuda_mismatch():
 def assert_torch_info(torch_info):
     install_torch_version = torch_info['version']
     install_cuda_version = torch_info['cuda_version']
+    install_hip_version = torch_info['hip_version']
+
+    if not is_rocm_pytorch:
+        current_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
+    else:
+        current_hip_version = ".".join(torch.version.hip.split('.')[:2])
 
-    current_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
     current_torch_version = ".".join(torch.__version__.split('.')[:2])
 
-    if install_cuda_version != current_cuda_version or install_torch_version != current_torch_version:
-        raise RuntimeError(
-            "PyTorch and CUDA version mismatch! DeepSpeed ops were compiled and installed "
-            "with a different version than what is being used at runtime. Please re-install "
-            f"DeepSpeed or switch torch versions. DeepSpeed install versions: "
-            f"torch={install_torch_version}, cuda={install_cuda_version}, runtime versions:"
-            f"torch={current_torch_version}, cuda={current_cuda_version}")
+    if not is_rocm_pytorch:
+        if install_cuda_version != current_cuda_version or install_torch_version != current_torch_version:
+            raise RuntimeError(
+                "PyTorch and CUDA version mismatch! DeepSpeed ops were compiled and installed "
+                "with a different version than what is being used at runtime. Please re-install "
+                f"DeepSpeed or switch torch versions. DeepSpeed install versions: "
+                f"torch={install_torch_version}, cuda={install_cuda_version}, runtime versions:"
+                f"torch={current_torch_version}, cuda={current_cuda_version}")
+    else:
+        if install_hip_version != current_hip_version or install_torch_version != current_torch_version:
+            raise RuntimeError(
+                "PyTorch and HIP version mismatch! DeepSpeed ops were compiled and installed "
+                "with a different version than what is being used at runtime. Please re-install "
+                f"DeepSpeed or switch torch versions. DeepSpeed install versions: "
+                f"torch={install_torch_version}, hip={install_hip_version}, runtime versions:"
+                f"torch={current_torch_version}, hip={current_hip_version}")
 
 
 class OpBuilder(ABC):

From 9939bd73ed222eab6a462a75f1df9db8538284df Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Wed, 31 Mar 2021 19:17:31 +0000
Subject: [PATCH 39/66] Remove unused dir

---
 third_party/apex | 1 -
 1 file changed, 1 deletion(-)
 delete mode 160000 third_party/apex

diff --git a/third_party/apex b/third_party/apex
deleted file mode 160000
index 76e4e05408b0..000000000000
--- a/third_party/apex
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 76e4e05408b06035c78672a014d92aaad27ec1d1

From 99571e50c9454f1de27bf69461a39caf527ed996 Mon Sep 17 00:00:00 2001
From: rraminen <rraminen@amd.com>
Date: Thu, 8 Apr 2021 20:31:33 +0000
Subject: [PATCH 40/66] Skipped the tests with the error, ModuleNotFoundError:
 No module named 'cupy'

---
 tests/unit/test_onebit.py   | 5 +++++
 tests/unit/test_topology.py | 3 ++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/unit/test_onebit.py b/tests/unit/test_onebit.py
index 8e0056be0cff..d1ebb6957be9 100644
--- a/tests/unit/test_onebit.py
+++ b/tests/unit/test_onebit.py
@@ -18,6 +18,7 @@
 
 
 def test_onebitadam_fp16_basic(tmpdir):
+    pytest.skip("Skipped for now as cupy is not available on ROCm")
     config_dict = {
         "train_batch_size": 2,
         "steps_per_print": 1,
@@ -61,6 +62,7 @@ def _test_onebitadam_fp16_basic(args, model, hidden_dim):
 
 
 def test_onebitadam_fp32_basic(tmpdir):
+    pytest.skip("Skipped for now as cupy is not available on ROCm")
     config_dict = {
         "train_batch_size": 2,
         "steps_per_print": 1,
@@ -100,6 +102,7 @@ def _test_onebitadam_fp32_basic(args, model, hidden_dim):
 
 
 def test_onebitadam_exp_avg_mask(tmpdir):
+    pytest.skip("Skipped for now as cupy is not available on ROCm")
     config_dict = {
         "train_batch_size": 2,
         "steps_per_print": 1,
@@ -161,6 +164,7 @@ def _test_onebitadam_exp_avg_mask(args, model, hidden_dim):
 
 
 def test_onebitadam_checkpointing(tmpdir):
+    pytest.skip("Skipped for now as cupy is not available on ROCm")
     config_dict = {
         "train_batch_size": 2,
         "steps_per_print": 1,
@@ -298,6 +302,7 @@ def _test_onebitadam_checkpointing(mask1, mask2, args, model, hidden_dim):
 
 
 def test_compressed_allreduce_basic(tmpdir):
+    pytest.skip("Skipped for now as cupy is not available on ROCm")
     @distributed_test(world_size=[1, 2])
     def _test_compressed_allreduce_basic():
         from deepspeed.runtime.comm.nccl import NcclBackend
diff --git a/tests/unit/test_topology.py b/tests/unit/test_topology.py
index 5dc6d2444b85..5f054cf084fc 100644
--- a/tests/unit/test_topology.py
+++ b/tests/unit/test_topology.py
@@ -183,9 +183,10 @@ def test_grid_pipe_data():
     data_group = grid.dp_group
     assert torch.all(rank_tensor == sum(data_group))
 
-
+@skipIfRocm
 @distributed_test(world_size=4)
 def test_stage_to_global():
+    #pytest.skip("Skipped for now as cupy is not available on ROCm")
     topo = Topo(axes=['pipe', 'data'], dims=[2, 2])
     grid = Grid(topology=topo)
 

From 9d8ad53a6860aa267a544818843353b2d0f6658b Mon Sep 17 00:00:00 2001
From: rraminen <rraminen@amd.com>
Date: Fri, 9 Apr 2021 19:14:14 -0400
Subject: [PATCH 41/66] Updated Dockerfile.rocm

---
 docker/Dockerfile.rocm | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index d111e966d66f..3e850a5238b3 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -51,25 +51,25 @@ RUN cp /etc/ssh/sshd_config ${STAGE_DIR}/sshd_config && \
 ##############################################################################
 # OPENMPI
 ##############################################################################
-ENV OPENMPI_BASEVERSION=4.0
-ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.1
-RUN cd ${STAGE_DIR} && \
-    wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \
-    cd openmpi-${OPENMPI_VERSION} && \
-    ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \
-    make -j"$(nproc)" install && \
-    ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \
-    # Sanity check:
-    test -f /usr/local/mpi/bin/mpic++ && \
-    cd ${STAGE_DIR} && \
-    rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
-ENV PATH=/usr/local/mpi/bin:${PATH} \
-    LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
-# Create a wrapper for OpenMPI to allow running as root by default
-RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
-    echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
-    echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
-    chmod a+x /usr/local/mpi/bin/mpirun
+#ENV OPENMPI_BASEVERSION=4.0
+#ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.1
+#RUN cd ${STAGE_DIR} && \
+#    wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \
+#    cd openmpi-${OPENMPI_VERSION} && \
+#    ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \
+#    make -j"$(nproc)" install && \
+#    ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \
+#    # Sanity check:
+#    test -f /usr/local/mpi/bin/mpic++ && \
+#    cd ${STAGE_DIR} && \
+#    rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
+#ENV PATH=/usr/local/mpi/bin:${PATH} \
+#    LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
+## Create a wrapper for OpenMPI to allow running as root by default
+#RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
+#    echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
+#    echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
+#    chmod a+x /usr/local/mpi/bin/mpirun
 
 ##############################################################################
 # Python

From 529ebcd2cfec2e9765825bd63b71d7f9412d8609 Mon Sep 17 00:00:00 2001
From: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
Date: Mon, 12 Apr 2021 18:51:17 -0500
Subject: [PATCH 42/66] Update skipIfRocm to add customizable reason string
 (#6)

* Update skipIfRocm to add customizable reason string; update skipped unit tests

* Don't skip test_stage_to_global for now
---
 tests/unit/common.py                  | 18 ++++++++++--------
 tests/unit/test_config.py             |  2 +-
 tests/unit/test_cuda_backward.py      |  2 +-
 tests/unit/test_dist.py               |  2 +-
 tests/unit/test_dynamic_loss_scale.py |  2 +-
 tests/unit/test_lr_schedulers.py      |  2 +-
 tests/unit/test_multi_output_model.py |  2 +-
 tests/unit/test_onebit.py             | 12 ++++++------
 tests/unit/test_partition.py          |  2 +-
 tests/unit/test_pipe_module.py        |  2 +-
 tests/unit/test_topology.py           |  4 +---
 11 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/tests/unit/common.py b/tests/unit/common.py
index 1523de85fab6..316fcf227232 100644
--- a/tests/unit/common.py
+++ b/tests/unit/common.py
@@ -16,14 +16,16 @@
 
 TEST_WITH_ROCM = os.getenv('DEEPSPEED_TEST_WITH_ROCM', '0') == '1'
 
-def skipIfRocm(fn):
-    @wraps(fn)
-    def wrapper(*args, **kwargs):
-        if TEST_WITH_ROCM:
-            raise unittest.SkipTest("test doesn't currently work on the ROCm stack")
-        else:
-            fn(*args, **kwargs)
-    return wrapper
+def skipIfRocm(reason="test doesn't currently work on the ROCm stack"):
+    def decorator(fn):
+        @wraps(fn)
+        def wrapper(*args, **kwargs):
+            if TEST_WITH_ROCM:
+                raise unittest.SkipTest(reason)
+            else:
+                fn(*args, **kwargs)
+        return wrapper
+    return decorator
 
 def distributed_test(world_size=2, backend='nccl'):
     """A decorator for executing a function (e.g., a unit test) in a distributed manner.
diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py
index 291ac6cb7009..f8b95ef1d68c 100755
--- a/tests/unit/test_config.py
+++ b/tests/unit/test_config.py
@@ -56,7 +56,7 @@ def _batch_assert(status, ds_config, batch, micro_batch, gas, success):
                          (2,32,8,2,True),
                          (2,33,17,2,False),
                          (2,32,18,1,False)]) # yapf: disable
-@skipIfRocm
+@skipIfRocm()
 def test_batch_config(num_ranks, batch, micro_batch, gas, success):
     @distributed_test(world_size=2)
     def _test_batch_config(num_ranks, batch, micro_batch, gas, success):
diff --git a/tests/unit/test_cuda_backward.py b/tests/unit/test_cuda_backward.py
index f648a79892c5..c66977e9ffc6 100755
--- a/tests/unit/test_cuda_backward.py
+++ b/tests/unit/test_cuda_backward.py
@@ -266,7 +266,7 @@ def run_backward(ds_config, seq_len, atol=1e-2, verbose=False):
                              #(3,128,51,2,24,False,False, 0.1),
                              #(3,128,54,2,24,False,True, 0.2),
                          ]) # yapf: disable
-@skipIfRocm
+@skipIfRocm()
 def test_backward(batch_size,
                   hidden_size,
                   seq_len,
diff --git a/tests/unit/test_dist.py b/tests/unit/test_dist.py
index b3aaf9baa4af..18a74b0a16fd 100644
--- a/tests/unit/test_dist.py
+++ b/tests/unit/test_dist.py
@@ -1,7 +1,7 @@
 import torch
 import torch.distributed as dist
 
-from common import distributed_test, skipIfRocm
+from common import distributed_test
 
 import pytest
 
diff --git a/tests/unit/test_dynamic_loss_scale.py b/tests/unit/test_dynamic_loss_scale.py
index f5811e657340..302de55c36a3 100755
--- a/tests/unit/test_dynamic_loss_scale.py
+++ b/tests/unit/test_dynamic_loss_scale.py
@@ -5,7 +5,7 @@
 import json
 import os
 import numpy as np
-from common import distributed_test, skipIfRocm
+from common import distributed_test
 from simple_model import SimpleModel, args_from_dict
 
 
diff --git a/tests/unit/test_lr_schedulers.py b/tests/unit/test_lr_schedulers.py
index a50b4b71238a..d93ac6f171bb 100755
--- a/tests/unit/test_lr_schedulers.py
+++ b/tests/unit/test_lr_schedulers.py
@@ -4,7 +4,7 @@
 import pytest
 import json
 import os
-from common import distributed_test, skipIfRocm
+from common import distributed_test
 from simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict
 from deepspeed.runtime.lr_schedules import LR_RANGE_TEST, LR_RANGE_TEST_MIN_LR, LR_RANGE_TEST_STEP_RATE, LR_RANGE_TEST_STEP_SIZE, LR_RANGE_TEST_STAIRCASE
 from deepspeed.runtime.lr_schedules import WARMUP_LR, WARMUP_MIN_LR, WARMUP_MAX_LR, WARMUP_NUM_STEPS
diff --git a/tests/unit/test_multi_output_model.py b/tests/unit/test_multi_output_model.py
index 1c8b8b39d779..ccbe7f484e29 100755
--- a/tests/unit/test_multi_output_model.py
+++ b/tests/unit/test_multi_output_model.py
@@ -5,7 +5,7 @@
 from pytest import approx
 import json
 import os
-from common import distributed_test, skipIfRocm
+from common import distributed_test
 from simple_model import args_from_dict
 from multi_output_model import MultiOutputModel, multi_output_dataloader
 
diff --git a/tests/unit/test_onebit.py b/tests/unit/test_onebit.py
index d1ebb6957be9..c4099bc4525f 100644
--- a/tests/unit/test_onebit.py
+++ b/tests/unit/test_onebit.py
@@ -7,7 +7,7 @@
 import os
 import numpy as np
 import time
-from common import distributed_test
+from common import distributed_test, skipIfRocm
 from simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict, create_deepspeed_args
 
 TORCH_MAJOR = int(torch.__version__.split('.')[0])
@@ -17,8 +17,8 @@
                 allow_module_level=True)
 
 
+@skipIfRocm("Skipped for now as cupy is not available on ROCm")
 def test_onebitadam_fp16_basic(tmpdir):
-    pytest.skip("Skipped for now as cupy is not available on ROCm")
     config_dict = {
         "train_batch_size": 2,
         "steps_per_print": 1,
@@ -61,8 +61,8 @@ def _test_onebitadam_fp16_basic(args, model, hidden_dim):
     _test_onebitadam_fp16_basic(args=args, model=model, hidden_dim=hidden_dim)
 
 
+@skipIfRocm("Skipped for now as cupy is not available on ROCm")
 def test_onebitadam_fp32_basic(tmpdir):
-    pytest.skip("Skipped for now as cupy is not available on ROCm")
     config_dict = {
         "train_batch_size": 2,
         "steps_per_print": 1,
@@ -101,8 +101,8 @@ def _test_onebitadam_fp32_basic(args, model, hidden_dim):
     _test_onebitadam_fp32_basic(args=args, model=model, hidden_dim=hidden_dim)
 
 
+@skipIfRocm("Skipped for now as cupy is not available on ROCm")
 def test_onebitadam_exp_avg_mask(tmpdir):
-    pytest.skip("Skipped for now as cupy is not available on ROCm")
     config_dict = {
         "train_batch_size": 2,
         "steps_per_print": 1,
@@ -163,8 +163,8 @@ def _test_onebitadam_exp_avg_mask(args, model, hidden_dim):
     _test_onebitadam_exp_avg_mask(args=args, model=model, hidden_dim=hidden_dim)
 
 
+@skipIfRocm("Skipped for now as cupy is not available on ROCm")
 def test_onebitadam_checkpointing(tmpdir):
-    pytest.skip("Skipped for now as cupy is not available on ROCm")
     config_dict = {
         "train_batch_size": 2,
         "steps_per_print": 1,
@@ -301,8 +301,8 @@ def _test_onebitadam_checkpointing(mask1, mask2, args, model, hidden_dim):
                                    hidden_dim=hidden_dim)
 
 
+@skipIfRocm("Skipped for now as cupy is not available on ROCm")
 def test_compressed_allreduce_basic(tmpdir):
-    pytest.skip("Skipped for now as cupy is not available on ROCm")
     @distributed_test(world_size=[1, 2])
     def _test_compressed_allreduce_basic():
         from deepspeed.runtime.comm.nccl import NcclBackend
diff --git a/tests/unit/test_partition.py b/tests/unit/test_partition.py
index 8919450f4153..7cd264752c6f 100644
--- a/tests/unit/test_partition.py
+++ b/tests/unit/test_partition.py
@@ -8,7 +8,7 @@
 from deepspeed.runtime.utils import prefix_sum_inc
 from deepspeed.runtime.utils import PartitionedTensor
 
-from common import distributed_test, skipIfRocm
+from common import distributed_test
 
 
 @distributed_test(world_size=4)
diff --git a/tests/unit/test_pipe_module.py b/tests/unit/test_pipe_module.py
index 382242a506f8..f7f4b8c1abda 100644
--- a/tests/unit/test_pipe_module.py
+++ b/tests/unit/test_pipe_module.py
@@ -56,7 +56,7 @@ def simple_args(tmpdir):
     return args
 
 
-@skipIfRocm
+@skipIfRocm()
 def test_pipe_module_sequential(sequential_model, simple_args):
     batch_input = torch.randn(1, HIDDEN_DIM)
 
diff --git a/tests/unit/test_topology.py b/tests/unit/test_topology.py
index 5f054cf084fc..e907af06427c 100644
--- a/tests/unit/test_topology.py
+++ b/tests/unit/test_topology.py
@@ -157,7 +157,7 @@ def test_topology_comm_list():
     assert topo.get_axis_comm_lists('jeff') == []
 
 
-@skipIfRocm
+@skipIfRocm()
 @distributed_test(world_size=4)
 def test_grid_pipe_data():
     topo = Topo(axes=['pipe', 'data'], dims=[2, 2])
@@ -183,10 +183,8 @@ def test_grid_pipe_data():
     data_group = grid.dp_group
     assert torch.all(rank_tensor == sum(data_group))
 
-@skipIfRocm
 @distributed_test(world_size=4)
 def test_stage_to_global():
-    #pytest.skip("Skipped for now as cupy is not available on ROCm")
     topo = Topo(axes=['pipe', 'data'], dims=[2, 2])
     grid = Grid(topology=topo)
 

From 37651f3d167c5a0c2df2b869e79aaa6e06c6427b Mon Sep 17 00:00:00 2001
From: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
Date: Tue, 13 Apr 2021 13:42:26 -0500
Subject: [PATCH 43/66] Disable AVX512 for ROCm to enable same build of
 DeepSpeed to work on Intel and AMD CPUs (#7)

---
 op_builder/cpu_adam.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/op_builder/cpu_adam.py b/op_builder/cpu_adam.py
index adf078c9bc0f..3b030f52b6ff 100644
--- a/op_builder/cpu_adam.py
+++ b/op_builder/cpu_adam.py
@@ -42,7 +42,7 @@ def simd_width(self):
         result = subprocess.check_output('lscpu', shell=True)
         result = result.decode('utf-8').strip().lower()
         if 'genuineintel' in result:
-            if 'avx512' in result:
+            if not is_rocm_pytorch and 'avx512' in result:
                 return '-D__AVX512__'
             elif 'avx2' in result:
                 return '-D__AVX256__'

From 7be71d322e2e4e33a1dc6b85044e23d4e54b0283 Mon Sep 17 00:00:00 2001
From: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
Date: Sun, 18 Apr 2021 22:50:02 -0500
Subject: [PATCH 44/66] Update headers and include_dirs to enable transformer
 extension (#8)

* Add hiprand and rocrand include paths for transformers extension

* Add patched HIP CG headers to enable transformer extension
---
 .../hip/hcc_detail/hip_cooperative_groups.h   | 362 ++++++++++++++++++
 .../hip_cooperative_groups_helper.h           | 183 +++++++++
 op_builder/transformer.py                     |   6 +-
 3 files changed, 550 insertions(+), 1 deletion(-)
 create mode 100644 csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h
 create mode 100644 csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h

diff --git a/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h b/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h
new file mode 100644
index 000000000000..20e7bb94b8ad
--- /dev/null
+++ b/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h
@@ -0,0 +1,362 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ *  @file  hcc_detail/hip_cooperative_groups.h
+ *
+ *  @brief Device side implementation of `Cooperative Group` feature.
+ *
+ *  Defines new types and device API wrappers related to `Cooperative Group`
+ *  feature, which the programmer can directly use in his kernel(s) in order to
+ *  make use of this feature.
+ */
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
+
+//#if __cplusplus
+#if __cplusplus && defined(__clang__) && defined(__HIP__)
+#include <hip/hcc_detail/hip_cooperative_groups_helper.h>
+#include <hip/hcc_detail/device_functions.h>
+namespace cooperative_groups {
+
+/** \brief The base type of all cooperative group types
+ *
+ *  \details Holds the key properties of a constructed cooperative group type
+ *           object, like the group type, its size, etc
+ */
+/*
+class thread_group {
+ protected:
+  uint32_t _type; // thread_group type
+  uint32_t _size; // total number of threads in the tread_group
+  uint64_t _mask; // Lanemask for coalesced and tiled partitioned group types,
+                  // LSB represents lane 0, and MSB represents lane 63
+
+  // Construct a thread group, and set thread group type and other essential
+  // thread group properties. This generic thread group is directly constructed
+  // only when the group is supposed to contain only the calling the thread
+  // (throurh the API - `this_thread()`), and in all other cases, this thread
+  // group object is a sub-object of some other derived thread group object
+  __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size,
+                                uint64_t mask = (uint64_t)0) {
+    _type = type;
+    _size = size;
+    _mask = mask;
+  }
+
+ public:
+  // Total number of threads in the thread group, and this serves the purpose
+  // for all derived cooperative group types since their `size` is directly
+  // saved during the construction
+  __CG_QUALIFIER__ uint32_t size() const {
+    return _size;
+  }
+  // Rank of the calling thread within [0, size())
+  __CG_QUALIFIER__ uint32_t thread_rank() const;
+  // Is this cooperative group type valid?
+  __CG_QUALIFIER__ bool is_valid() const;
+  // synchronize the threads in the thread group
+  __CG_QUALIFIER__ void sync() const;
+};
+*/
+
+class thread_group {
+ protected:
+    bool _tiled_partition; // this_thread_block() constructor sets to false
+    uint32_t _size;            // this_thread_block() constructor sets to size()
+    uint32_t local_rank;      // this_thread_block() constructor sets to thread_rank()
+    uint32_t _mask;
+    uint32_t _type;
+ public:
+    __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t group_size,
+                                uint64_t mask = (uint64_t)0) {
+    _type = type;
+    _size = group_size;
+    _mask = mask;
+    local_rank = internal::workgroup::thread_rank(); 
+    }
+
+    __CG_QUALIFIER__ void tiled_partition(const thread_group& parent,
+                                               unsigned int tile_size) {
+        if ( (ceil(log2(tile_size)) == floor(log2(tile_size))) || tile_size == 0 ||
+            tile_size > 64 || parent.size() < tile_size)
+            _tiled_partition =  false;
+            //xxx : abort
+        _tiled_partition = true;
+        _size = tile_size;
+        local_rank = parent.thread_rank() % tile_size;
+    }
+    __CG_QUALIFIER__ void sync() const; 
+    __CG_QUALIFIER__ uint32_t size() const {
+        return _size;
+    }
+    __CG_QUALIFIER__ uint32_t thread_rank() const; 
+    __CG_QUALIFIER__ float shfl_down(float var, unsigned int delta) const {
+        return (__shfl_down(var, delta, _size));
+    }
+     __CG_QUALIFIER__ float shfl_xor(float var, int mask) const {
+        return (__shfl_xor(var, mask, _size));
+    }
+    __CG_QUALIFIER__ float shfl(float var, unsigned int src_lane) const {
+        return (__shfl(var, src_lane, _size));
+    }
+    __CG_QUALIFIER__ bool is_valid() const;
+
+};
+
+/** \brief The multi-grid cooperative group type
+ *
+ *  \details Represents an inter-device cooperative group type where the
+ *           participating threads within the group spans across multple
+ *           devices, running the (same) kernel on these devices
+ */
+class multi_grid_group : public thread_group {
+  // Only these friend functions are allowed to construct an object of this class
+  // and access its resources
+  friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();
+
+ protected:
+  // Construct mutli-grid thread group (through the API this_multi_grid())
+  explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
+      : thread_group(internal::cg_multi_grid, size) { }
+
+ public:
+  // Number of invocations participating in this multi-grid group. In other
+  // words, the number of GPUs
+	__CG_QUALIFIER__ uint32_t num_grids() {
+    return internal::multi_grid::num_grids();
+  }
+  // Rank of this invocation. In other words, an ID number within the range
+  // [0, num_grids()) of the GPU, this kernel is running on
+	__CG_QUALIFIER__ uint32_t grid_rank() {
+    return internal::multi_grid::grid_rank();
+  }
+  __CG_QUALIFIER__ uint32_t thread_rank() const {
+    return internal::multi_grid::thread_rank();
+  }
+  __CG_QUALIFIER__ bool is_valid() const {
+    return internal::multi_grid::is_valid();
+  }
+  __CG_QUALIFIER__ void sync() const {
+    internal::multi_grid::sync();
+  }
+};
+
+/** \brief User exposed API interface to construct multi-grid cooperative
+ *         group type object - `multi_grid_group`
+ *
+ *  \details User is not allowed to directly construct an object of type
+ *           `multi_grid_group`. Instead, he should construct it through this
+ *           API function
+ */
+__CG_QUALIFIER__ multi_grid_group
+this_multi_grid() {
+  return multi_grid_group(internal::multi_grid::size());
+}
+
+/** \brief The grid cooperative group type
+ *
+ *  \details Represents an inter-workgroup cooperative group type where the
+ *           participating threads within the group spans across multiple
+ *           workgroups running the (same) kernel on the same device
+ */
+class grid_group : public thread_group {
+  // Only these friend functions are allowed to construct an object of this class
+  // and access its resources
+  friend __CG_QUALIFIER__ grid_group this_grid();
+
+ protected:
+  // Construct grid thread group (through the API this_grid())
+  explicit __CG_QUALIFIER__ grid_group(uint32_t size)
+      : thread_group(internal::cg_grid, size) { }
+
+ public:
+  __CG_QUALIFIER__ uint32_t thread_rank() const {
+    return internal::grid::thread_rank();
+  }
+  __CG_QUALIFIER__ bool is_valid() const {
+    return internal::grid::is_valid();
+  }
+  __CG_QUALIFIER__ void sync() const {
+    internal::grid::sync();
+  }
+};
+
+/** \brief User exposed API interface to construct grid cooperative group type
+ *         object - `grid_group`
+ *
+ *  \details User is not allowed to directly construct an object of type
+ *           `multi_grid_group`. Instead, he should construct it through this
+ *           API function
+ */
+__CG_QUALIFIER__ grid_group
+this_grid() {
+  return grid_group(internal::grid::size());
+}
+
+/** \brief The workgroup (thread-block in CUDA terminology) cooperative group
+ *         type
+ *
+ *  \details Represents an intra-workgroup cooperative group type where the
+ *           participating threads within the group are exctly the same threads
+ *           which are participated in the currently executing `workgroup`
+ */
+class thread_block : public thread_group {
+  // Only these friend functions are allowed to construct an object of this
+  // class and access its resources
+  friend __CG_QUALIFIER__ thread_block this_thread_block();
+
+ protected:
+  // Construct a workgroup thread group (through the API this_thread_block())
+  explicit __CG_QUALIFIER__ thread_block(uint32_t size)
+      : thread_group(internal::cg_workgroup, size) { }
+
+ public:
+  // 3-dimensional block index within the grid
+  __CG_QUALIFIER__ dim3 group_index() {
+    return internal::workgroup::group_index();
+  }
+  // 3-dimensional thread index within the block
+  __CG_QUALIFIER__ dim3 thread_index() {
+    return internal::workgroup::thread_index();
+  }
+  __CG_QUALIFIER__ uint32_t thread_rank() const {
+    return internal::workgroup::thread_rank();
+  }
+  __CG_QUALIFIER__ bool is_valid() const {
+    return internal::workgroup::is_valid();
+  }
+  __CG_QUALIFIER__ void sync() const {
+    internal::workgroup::sync();
+  }
+};
+
+/** \brief User exposed API interface to construct workgroup cooperative
+ *         group type object - `thread_block`
+ *
+ *  \details User is not allowed to directly construct an object of type
+ *           `thread_block`. Instead, he should construct it through this API
+ *           function
+ */
+__CG_QUALIFIER__ thread_block
+this_thread_block() {
+  return thread_block(internal::workgroup::size());
+}
+
+/**
+ *  Implemenation of all publicly exposed base class APIs
+ */
+__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
+  switch (this->_type) {
+    case internal::cg_multi_grid: {
+      return (static_cast<const multi_grid_group*>(this)->thread_rank());
+    }
+    case internal::cg_grid: {
+      return (static_cast<const grid_group*>(this)->thread_rank());
+    }
+    case internal::cg_workgroup: {
+      return (static_cast<const thread_block*>(this)->thread_rank());
+    }
+    case internal::cg_coalesced_tile: {
+      return local_rank;
+    }
+    default: {
+      assert(false && "invalid cooperative group type");
+      return -1;
+    }
+  }
+}
+
+__CG_QUALIFIER__ bool thread_group::is_valid() const {
+  switch (this->_type) {
+    case internal::cg_multi_grid: {
+      return (static_cast<const multi_grid_group*>(this)->is_valid());
+    }
+    case internal::cg_grid: {
+      return (static_cast<const grid_group*>(this)->is_valid());
+    }
+    case internal::cg_workgroup: {
+      return (static_cast<const thread_block*>(this)->is_valid());
+    }
+    case internal::cg_coalesced_tile: {
+      return _tiled_partition;
+    }
+    default: {
+      assert(false && "invalid cooperative group type");
+      return false;
+    }
+  }
+}
+
+__CG_QUALIFIER__ void thread_group::sync() const {
+  switch (this->_type) {
+    case internal::cg_multi_grid: {
+      static_cast<const multi_grid_group*>(this)->sync();
+      break;
+    }
+    case internal::cg_grid: {
+      static_cast<const grid_group*>(this)->sync();
+      break;
+    }
+    case internal::cg_workgroup: {
+      static_cast<const thread_block*>(this)->sync();
+      break;
+    }
+    case internal::cg_coalesced_tile: {
+      if (!_tiled_partition) // If in a tiled partition, this is a no-op
+            __syncthreads();
+      break;
+    }
+    default: {
+      assert(false && "invalid cooperative group type");
+    }
+  }
+}
+
+/**
+ *  Implemenation of publicly exposed `wrapper` APIs on top of basic cooperative
+ *  group type APIs
+ */
+template <class CGTy>
+__CG_QUALIFIER__ uint32_t group_size(CGTy const &g) {
+  return g.size();
+}
+
+template <class CGTy>
+__CG_QUALIFIER__ uint32_t thread_rank(CGTy const &g) {
+  return g.thread_rank();
+}
+
+template <class CGTy>
+__CG_QUALIFIER__ bool is_valid(CGTy const &g) {
+  return g.is_valid();
+}
+
+template <class CGTy>
+__CG_QUALIFIER__ void sync(CGTy const &g) {
+  g.sync();
+}
+
+} // namespace cooperative_groups
+
+#endif // __cplusplus
+#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
diff --git a/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h b/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h
new file mode 100644
index 000000000000..7f8e69da11c3
--- /dev/null
+++ b/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h
@@ -0,0 +1,183 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ *  @file  hcc_detail/hip_cooperative_groups_helper.h
+ *
+ *  @brief Device side implementation of cooperative group feature.
+ *
+ *  Defines helper constructs and APIs which aid the types and device API
+ *  wrappers defined within `hcc_detail/hip_cooperative_groups.h`.
+ */
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
+
+#if __cplusplus
+#include <hip/hcc_detail/hip_runtime_api.h>
+#include <hip/hcc_detail/device_functions.h>
+
+#if !defined(__align__)
+#define __align__(x) __attribute__((aligned(x)))
+#endif
+
+#if !defined(__CG_QUALIFIER__)
+#define __CG_QUALIFIER__ __device__ __forceinline__
+#endif
+
+#if !defined(__CG_STATIC_QUALIFIER__)
+#define __CG_STATIC_QUALIFIER__ __device__ static __forceinline__
+#endif
+
+#if !defined(WAVEFRONT_SIZE)
+#define WAVEFRONT_SIZE 64
+#endif
+
+namespace cooperative_groups {
+
+namespace internal {
+
+/** \brief Enums representing different cooperative group types
+ */
+typedef enum {
+  cg_invalid,
+  cg_multi_grid,
+  cg_grid,
+  cg_workgroup,
+  cg_coalesced_tile
+} group_type;
+
+/**
+ *  Functionalities related to multi-grid cooperative group type
+ */
+namespace multi_grid {
+
+__CG_STATIC_QUALIFIER__ uint32_t num_grids() {
+  return (uint32_t)__ockl_multi_grid_num_grids();
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t grid_rank() {
+  return (uint32_t)__ockl_multi_grid_grid_rank();
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t size() {
+  return (uint32_t)__ockl_multi_grid_size();
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
+  return (uint32_t)__ockl_multi_grid_thread_rank();
+}
+
+__CG_STATIC_QUALIFIER__ bool is_valid() {
+  return (bool)__ockl_multi_grid_is_valid();
+}
+
+__CG_STATIC_QUALIFIER__ void sync() {
+  __ockl_multi_grid_sync();
+}
+
+} // namespace multi_grid
+
+/**
+ *  Functionalities related to grid cooperative group type
+ */
+namespace grid {
+
+__CG_STATIC_QUALIFIER__ uint32_t size() {
+  return (uint32_t)((hipBlockDim_z * hipGridDim_z) *
+                    (hipBlockDim_y * hipGridDim_y) *
+                    (hipBlockDim_x * hipGridDim_x));
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
+  // Compute global id of the workgroup to which the current thread belongs to
+  uint32_t blkIdx =
+           (uint32_t)((hipBlockIdx_z * hipGridDim_y * hipGridDim_x) +
+                      (hipBlockIdx_y * hipGridDim_x) +
+                      (hipBlockIdx_x));
+
+  // Compute total number of threads being passed to reach current workgroup
+  // within grid
+  uint32_t num_threads_till_current_workgroup =
+           (uint32_t)(blkIdx * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));
+
+  // Compute thread local rank within current workgroup
+  uint32_t local_thread_rank =
+           (uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) +
+                      (hipThreadIdx_y * hipBlockDim_x) +
+                      (hipThreadIdx_x));
+
+  return (num_threads_till_current_workgroup + local_thread_rank);
+}
+
+__CG_STATIC_QUALIFIER__ bool is_valid() {
+  return (bool)__ockl_grid_is_valid();
+}
+
+__CG_STATIC_QUALIFIER__ void sync() {
+  __ockl_grid_sync();
+}
+
+} // namespace grid
+
+/**
+ *  Functionalities related to `workgroup` (thread_block in CUDA terminology)
+ *  cooperative group type
+ */
+namespace workgroup {
+
+__CG_STATIC_QUALIFIER__ dim3 group_index() {
+  return (dim3((uint32_t)hipBlockIdx_x, (uint32_t)hipBlockIdx_y,
+               (uint32_t)hipBlockIdx_z));
+}
+
+__CG_STATIC_QUALIFIER__ dim3 thread_index() {
+  return (dim3((uint32_t)hipThreadIdx_x, (uint32_t)hipThreadIdx_y,
+               (uint32_t)hipThreadIdx_z));
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t size() {
+  return((uint32_t)(hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
+ return ((uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) +
+                    (hipThreadIdx_y * hipBlockDim_x) +
+                    (hipThreadIdx_x)));
+}
+
+__CG_STATIC_QUALIFIER__ bool is_valid() {
+   //TODO(mahesha) any functionality need to be added here? I believe not
+  return true;
+}
+
+__CG_STATIC_QUALIFIER__ void sync() {
+  __syncthreads();
+}
+
+} // namespace workgroup
+
+} // namespace internal
+
+} // namespace cooperative_groups
+
+#endif // __cplusplus
+#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
diff --git a/op_builder/transformer.py b/op_builder/transformer.py
index 12608e1ba898..3f666e393ed9 100644
--- a/op_builder/transformer.py
+++ b/op_builder/transformer.py
@@ -29,7 +29,11 @@ def sources(self):
         ]
 
     def include_paths(self):
-        return ['csrc/includes']
+        includes = ['csrc/includes']
+        if is_rocm_pytorch:
+            from torch.utils.cpp_extension import ROCM_HOME
+            includes += ['{}/hiprand/include'.format(ROCM_HOME), '{}/rocrand/include'.format(ROCM_HOME)]
+        return includes
 
     def nvcc_args(self):
         args = [

From 1c69737e1a8a8ae5ed9d295937458d54a65f4702 Mon Sep 17 00:00:00 2001
From: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
Date: Sun, 18 Apr 2021 23:22:45 -0500
Subject: [PATCH 45/66] Add patched CG headers to rocm install path (#9)

---
 docker/Dockerfile.rocm | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 3e850a5238b3..7b80c3facbd4 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -168,6 +168,8 @@ RUN git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed.git ${STAGE_DIR}
 RUN cd ${STAGE_DIR}/DeepSpeed && \
         git checkout . && \
         git checkout master && \
-        DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 ./install.sh --allow_sudo
+        cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h /opt/rocm/include/hip/hcc_detail/hip_cooperative_groups.h && \
+        cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h /opt/rocm/include/hip/hcc_detail/hip_cooperative_groups_helper.h && \
+        DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_TRANSFORMER=1 DS_BUILD_STOCHASTIC_TRANSFORMER=1 DS_BUILD_UTILS=1 ./install.sh --allow_sudo
 RUN rm -rf ${STAGE_DIR}/DeepSpeed
 RUN cd ~ && python -c "import deepspeed; print(deepspeed.__version__)"

From ac4f8d571640973298be1c03d44895453b40ef92 Mon Sep 17 00:00:00 2001
From: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
Date: Mon, 19 Apr 2021 01:35:34 -0500
Subject: [PATCH 46/66] Update DeepSpeedExamples commit (#10)

---
 DeepSpeedExamples | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DeepSpeedExamples b/DeepSpeedExamples
index 5e63c68085ad..ea3bdc2525e2 160000
--- a/DeepSpeedExamples
+++ b/DeepSpeedExamples
@@ -1 +1 @@
-Subproject commit 5e63c68085adab099a78f57bc0fa88664f540fba
+Subproject commit ea3bdc2525e210f116a89d5d9f5833705df28a62

From 14204ab9a47c8e593ee9133c9e945ebe61ebf569 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Wed, 21 Apr 2021 22:34:52 +0000
Subject: [PATCH 47/66] Update DeepSpeedExamples commit

---
 DeepSpeedExamples | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DeepSpeedExamples b/DeepSpeedExamples
index ea3bdc2525e2..53b28ad553a9 160000
--- a/DeepSpeedExamples
+++ b/DeepSpeedExamples
@@ -1 +1 @@
-Subproject commit ea3bdc2525e210f116a89d5d9f5833705df28a62
+Subproject commit 53b28ad553a99108e7c4a2cc5cce5628ad1692dd

From 827ebfbecd1ed887d0ac4f768500e3c066cf6580 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Thu, 22 Apr 2021 19:41:27 +0000
Subject: [PATCH 48/66] Update DeepSpeedExamples commit

---
 DeepSpeedExamples | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DeepSpeedExamples b/DeepSpeedExamples
index 53b28ad553a9..51d5f03867a6 160000
--- a/DeepSpeedExamples
+++ b/DeepSpeedExamples
@@ -1 +1 @@
-Subproject commit 53b28ad553a99108e7c4a2cc5cce5628ad1692dd
+Subproject commit 51d5f03867a693d9e58ecc8567299bc530024948

From 3f2657f78ff467272c7342b85f7be3ae97d6a203 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Tue, 11 May 2021 18:24:49 +0000
Subject: [PATCH 49/66] Add Github Actions ifu.yml

---
 .github/workflows/ifu.yml | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 .github/workflows/ifu.yml

diff --git a/.github/workflows/ifu.yml b/.github/workflows/ifu.yml
new file mode 100644
index 000000000000..db6f2445dc7d
--- /dev/null
+++ b/.github/workflows/ifu.yml
@@ -0,0 +1,37 @@
+name: IntegrateFromUpstream
+on:
+#  schedule:
+#    # verified via crontab.guru website. “At 06:55 on Monday.”
+#    - cron: '55 6 * * 1'
+  workflow_dispatch:
+    inputs:
+      message:
+        description: 'Reason for manual trigger'
+        required: false
+        default: 'refresh master'
+jobs:
+  IntegrateFromUpstream:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Get Current Date
+        id: date
+        run: echo "::set-output name=date::$(date +'%Y-%m-%d')"
+      - name: Fetch and Merge
+        run: |
+          echo "Reason for trigger: ${{ github.event.inputs.message }}"
+          echo "Actor for trigger: ${{ github.actor }}"
+          git config user.name github-actions
+          git config user.email github-actions@github.com
+          git remote add upstream https://github.com/microsoft/DeepSpeed
+          git fetch upstream master
+          git merge upstream/master 
+      - name: Create Pull Request
+        uses: peter-evans/create-pull-request@v3
+        with:
+#          token: ${{ secrets.PAT }}
+          branch: IFU-master-${{ steps.date.outputs.date }}
+          title: IFU-master-${{ steps.date.outputs.date }}
+          assignees: rraminen

From 9b41aa7e46a9ca9d5880ec69e57a8d2b05a0cc85 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Wed, 12 May 2021 21:53:22 +0000
Subject: [PATCH 50/66] Update ifu.yml to ignore DeepSpeedExamples

---
 .github/workflows/ifu.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/ifu.yml b/.github/workflows/ifu.yml
index db6f2445dc7d..2b0f5a538e75 100644
--- a/.github/workflows/ifu.yml
+++ b/.github/workflows/ifu.yml
@@ -28,6 +28,8 @@ jobs:
           git remote add upstream https://github.com/microsoft/DeepSpeed
           git fetch upstream master
           git merge upstream/master 
+          # Since we use our own fork of DeepSpeedExamples, ignore theirs
+          git checkout HEAD DeepSpeedExamples
       - name: Create Pull Request
         uses: peter-evans/create-pull-request@v3
         with:

From 2066405283c5d13d2114cd288f8c65e7c5ca009a Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Wed, 12 May 2021 23:04:22 +0000
Subject: [PATCH 51/66] Update DeepSpeedExamples commit

---
 DeepSpeedExamples | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DeepSpeedExamples b/DeepSpeedExamples
index e035305b4925..9524d99d1908 160000
--- a/DeepSpeedExamples
+++ b/DeepSpeedExamples
@@ -1 +1 @@
-Subproject commit e035305b4925d70d9b7c8385ce96270987e36658
+Subproject commit 9524d99d190808e4014a76b9d877dfdbac385237

From e827515a8baeb92a8b7d43e1a3b3a6284695bf99 Mon Sep 17 00:00:00 2001
From: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
Date: Mon, 17 May 2021 15:51:03 -0500
Subject: [PATCH 52/66] Use branch name in PR title/branch name

---
 .github/workflows/ifu.yml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ifu.yml b/.github/workflows/ifu.yml
index 2b0f5a538e75..298f6f4d244e 100644
--- a/.github/workflows/ifu.yml
+++ b/.github/workflows/ifu.yml
@@ -30,10 +30,14 @@ jobs:
           git merge upstream/master 
           # Since we use our own fork of DeepSpeedExamples, ignore theirs
           git checkout HEAD DeepSpeedExamples
+      - name: Extract branch name
+        shell: bash
+        run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})"
+        id: extract_branch
       - name: Create Pull Request
         uses: peter-evans/create-pull-request@v3
         with:
 #          token: ${{ secrets.PAT }}
-          branch: IFU-master-${{ steps.date.outputs.date }}
-          title: IFU-master-${{ steps.date.outputs.date }}
+          branch: IFU-${{ steps.extract_branch.outputs.branch }}-${{ steps.date.outputs.date }}
+          title: IFU-${{ steps.extract_branch.outputs.branch }}-${{ steps.date.outputs.date }}
           assignees: rraminen

From 4c7a25248ea9ec6170698ca0f9f3cf4a181b4e24 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Fri, 21 May 2021 22:54:12 +0000
Subject: [PATCH 53/66] Add email functionality

---
 .github/workflows/ifu.yml | 32 ++++++++++++++++++++++++++------
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/ifu.yml b/.github/workflows/ifu.yml
index 298f6f4d244e..82c13c6a12a3 100644
--- a/.github/workflows/ifu.yml
+++ b/.github/workflows/ifu.yml
@@ -8,7 +8,7 @@ on:
       message:
         description: 'Reason for manual trigger'
         required: false
-        default: 'refresh master'
+        default: 'refresh branch'
 jobs:
   IntegrateFromUpstream:
     runs-on: ubuntu-latest
@@ -19,7 +19,12 @@ jobs:
       - name: Get Current Date
         id: date
         run: echo "::set-output name=date::$(date +'%Y-%m-%d')"
+      - name: Extract branch name
+        id: extract_branch
+        shell: bash
+        run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})"
       - name: Fetch and Merge
+        id: fetch_and_merge
         run: |
           echo "Reason for trigger: ${{ github.event.inputs.message }}"
           echo "Actor for trigger: ${{ github.actor }}"
@@ -30,14 +35,29 @@ jobs:
           git merge upstream/master 
           # Since we use our own fork of DeepSpeedExamples, ignore theirs
           git checkout HEAD DeepSpeedExamples
-      - name: Extract branch name
-        shell: bash
-        run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})"
-        id: extract_branch
       - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v3
+        id: create_pull_request
+        uses: jithunnair-amd/create-pull-request@v3
         with:
 #          token: ${{ secrets.PAT }}
           branch: IFU-${{ steps.extract_branch.outputs.branch }}-${{ steps.date.outputs.date }}
           title: IFU-${{ steps.extract_branch.outputs.branch }}-${{ steps.date.outputs.date }}
           assignees: rraminen
+          reviewers: jithunnair-amd
+          delete-branch: true
+      - name: Send email
+        uses: jithunnair-amd/action-send-mail@v3.1.0
+        if: always()
+        with:
+          server_address: smtp.gmail.com
+          server_port: 465
+          secure: true
+          username: ${{ secrets.GMAIL_USERNAME }}
+          password: ${{ secrets.GMAIL_PASSWORD }}
+          subject: IFU to ${{ steps.extract_branch.outputs.branch }} branch of ${{ github.repository }}
+          to: Jithun.Nair@amd.com, RamyaSai.Ramineni@amd.com
+          from: ${{ secrets.GMAIL_USERNAME }}
+          html_body: |
+            <b>Fetch and Merge</b>: ${{ steps.fetch_and_merge.outcome }} <br/>
+            <b>Create Pull Request</b>: ${{ steps.create_pull_request.outcome }} <br/>
+            <b>Pull request</b>: <a href="${{ steps.create_pull_request.outputs.pull-request-url }}">${{ steps.create_pull_request.outputs.pull-request-url }}</a> <br/>

From 5de081ed8a0aca39254353985798246fa9d368c4 Mon Sep 17 00:00:00 2001
From: rraminen <rraminen@amd.com>
Date: Fri, 4 Jun 2021 18:12:59 +0000
Subject: [PATCH 54/66] Pointed DeepSpeedExamples to latest commit after IFU

---
 DeepSpeedExamples | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DeepSpeedExamples b/DeepSpeedExamples
index 177d5398f5e6..36846da89d5b 160000
--- a/DeepSpeedExamples
+++ b/DeepSpeedExamples
@@ -1 +1 @@
-Subproject commit 177d5398f5e63df7969086def7fc5d113bdeb5fe
+Subproject commit 36846da89d5be7e13465f95be7074b4ccd5898cd

From d29666505a7b0a9357a419e50cb2474fd559e98e Mon Sep 17 00:00:00 2001
From: rraminen <rraminen@amd.com>
Date: Wed, 23 Jun 2021 18:54:15 +0000
Subject: [PATCH 55/66] Revert "Add patched CG headers to rocm install path
 (#9)"

This reverts commit 1c69737e1a8a8ae5ed9d295937458d54a65f4702.
---
 docker/Dockerfile.rocm | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 7b80c3facbd4..3e850a5238b3 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -168,8 +168,6 @@ RUN git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed.git ${STAGE_DIR}
 RUN cd ${STAGE_DIR}/DeepSpeed && \
         git checkout . && \
         git checkout master && \
-        cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h /opt/rocm/include/hip/hcc_detail/hip_cooperative_groups.h && \
-        cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h /opt/rocm/include/hip/hcc_detail/hip_cooperative_groups_helper.h && \
-        DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_TRANSFORMER=1 DS_BUILD_STOCHASTIC_TRANSFORMER=1 DS_BUILD_UTILS=1 ./install.sh --allow_sudo
+        DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 ./install.sh --allow_sudo
 RUN rm -rf ${STAGE_DIR}/DeepSpeed
 RUN cd ~ && python -c "import deepspeed; print(deepspeed.__version__)"

From f50fa7b98254adb7d205255180c27265579df52c Mon Sep 17 00:00:00 2001
From: rraminen <rraminen@amd.com>
Date: Wed, 23 Jun 2021 18:54:40 +0000
Subject: [PATCH 56/66] Revert "Update headers and include_dirs to enable
 transformer extension (#8)"

This reverts commit 7be71d322e2e4e33a1dc6b85044e23d4e54b0283.
---
 .../hip/hcc_detail/hip_cooperative_groups.h   | 362 ------------------
 .../hip_cooperative_groups_helper.h           | 183 ---------
 op_builder/transformer.py                     |   6 +-
 3 files changed, 1 insertion(+), 550 deletions(-)
 delete mode 100644 csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h
 delete mode 100644 csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h

diff --git a/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h b/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h
deleted file mode 100644
index 20e7bb94b8ad..000000000000
--- a/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h
+++ /dev/null
@@ -1,362 +0,0 @@
-/*
-Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-/**
- *  @file  hcc_detail/hip_cooperative_groups.h
- *
- *  @brief Device side implementation of `Cooperative Group` feature.
- *
- *  Defines new types and device API wrappers related to `Cooperative Group`
- *  feature, which the programmer can directly use in his kernel(s) in order to
- *  make use of this feature.
- */
-#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
-#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
-
-//#if __cplusplus
-#if __cplusplus && defined(__clang__) && defined(__HIP__)
-#include <hip/hcc_detail/hip_cooperative_groups_helper.h>
-#include <hip/hcc_detail/device_functions.h>
-namespace cooperative_groups {
-
-/** \brief The base type of all cooperative group types
- *
- *  \details Holds the key properties of a constructed cooperative group type
- *           object, like the group type, its size, etc
- */
-/*
-class thread_group {
- protected:
-  uint32_t _type; // thread_group type
-  uint32_t _size; // total number of threads in the tread_group
-  uint64_t _mask; // Lanemask for coalesced and tiled partitioned group types,
-                  // LSB represents lane 0, and MSB represents lane 63
-
-  // Construct a thread group, and set thread group type and other essential
-  // thread group properties. This generic thread group is directly constructed
-  // only when the group is supposed to contain only the calling the thread
-  // (throurh the API - `this_thread()`), and in all other cases, this thread
-  // group object is a sub-object of some other derived thread group object
-  __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size,
-                                uint64_t mask = (uint64_t)0) {
-    _type = type;
-    _size = size;
-    _mask = mask;
-  }
-
- public:
-  // Total number of threads in the thread group, and this serves the purpose
-  // for all derived cooperative group types since their `size` is directly
-  // saved during the construction
-  __CG_QUALIFIER__ uint32_t size() const {
-    return _size;
-  }
-  // Rank of the calling thread within [0, size())
-  __CG_QUALIFIER__ uint32_t thread_rank() const;
-  // Is this cooperative group type valid?
-  __CG_QUALIFIER__ bool is_valid() const;
-  // synchronize the threads in the thread group
-  __CG_QUALIFIER__ void sync() const;
-};
-*/
-
-class thread_group {
- protected:
-    bool _tiled_partition; // this_thread_block() constructor sets to false
-    uint32_t _size;            // this_thread_block() constructor sets to size()
-    uint32_t local_rank;      // this_thread_block() constructor sets to thread_rank()
-    uint32_t _mask;
-    uint32_t _type;
- public:
-    __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t group_size,
-                                uint64_t mask = (uint64_t)0) {
-    _type = type;
-    _size = group_size;
-    _mask = mask;
-    local_rank = internal::workgroup::thread_rank(); 
-    }
-
-    __CG_QUALIFIER__ void tiled_partition(const thread_group& parent,
-                                               unsigned int tile_size) {
-        if ( (ceil(log2(tile_size)) == floor(log2(tile_size))) || tile_size == 0 ||
-            tile_size > 64 || parent.size() < tile_size)
-            _tiled_partition =  false;
-            //xxx : abort
-        _tiled_partition = true;
-        _size = tile_size;
-        local_rank = parent.thread_rank() % tile_size;
-    }
-    __CG_QUALIFIER__ void sync() const; 
-    __CG_QUALIFIER__ uint32_t size() const {
-        return _size;
-    }
-    __CG_QUALIFIER__ uint32_t thread_rank() const; 
-    __CG_QUALIFIER__ float shfl_down(float var, unsigned int delta) const {
-        return (__shfl_down(var, delta, _size));
-    }
-     __CG_QUALIFIER__ float shfl_xor(float var, int mask) const {
-        return (__shfl_xor(var, mask, _size));
-    }
-    __CG_QUALIFIER__ float shfl(float var, unsigned int src_lane) const {
-        return (__shfl(var, src_lane, _size));
-    }
-    __CG_QUALIFIER__ bool is_valid() const;
-
-};
-
-/** \brief The multi-grid cooperative group type
- *
- *  \details Represents an inter-device cooperative group type where the
- *           participating threads within the group spans across multple
- *           devices, running the (same) kernel on these devices
- */
-class multi_grid_group : public thread_group {
-  // Only these friend functions are allowed to construct an object of this class
-  // and access its resources
-  friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();
-
- protected:
-  // Construct mutli-grid thread group (through the API this_multi_grid())
-  explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
-      : thread_group(internal::cg_multi_grid, size) { }
-
- public:
-  // Number of invocations participating in this multi-grid group. In other
-  // words, the number of GPUs
-	__CG_QUALIFIER__ uint32_t num_grids() {
-    return internal::multi_grid::num_grids();
-  }
-  // Rank of this invocation. In other words, an ID number within the range
-  // [0, num_grids()) of the GPU, this kernel is running on
-	__CG_QUALIFIER__ uint32_t grid_rank() {
-    return internal::multi_grid::grid_rank();
-  }
-  __CG_QUALIFIER__ uint32_t thread_rank() const {
-    return internal::multi_grid::thread_rank();
-  }
-  __CG_QUALIFIER__ bool is_valid() const {
-    return internal::multi_grid::is_valid();
-  }
-  __CG_QUALIFIER__ void sync() const {
-    internal::multi_grid::sync();
-  }
-};
-
-/** \brief User exposed API interface to construct multi-grid cooperative
- *         group type object - `multi_grid_group`
- *
- *  \details User is not allowed to directly construct an object of type
- *           `multi_grid_group`. Instead, he should construct it through this
- *           API function
- */
-__CG_QUALIFIER__ multi_grid_group
-this_multi_grid() {
-  return multi_grid_group(internal::multi_grid::size());
-}
-
-/** \brief The grid cooperative group type
- *
- *  \details Represents an inter-workgroup cooperative group type where the
- *           participating threads within the group spans across multiple
- *           workgroups running the (same) kernel on the same device
- */
-class grid_group : public thread_group {
-  // Only these friend functions are allowed to construct an object of this class
-  // and access its resources
-  friend __CG_QUALIFIER__ grid_group this_grid();
-
- protected:
-  // Construct grid thread group (through the API this_grid())
-  explicit __CG_QUALIFIER__ grid_group(uint32_t size)
-      : thread_group(internal::cg_grid, size) { }
-
- public:
-  __CG_QUALIFIER__ uint32_t thread_rank() const {
-    return internal::grid::thread_rank();
-  }
-  __CG_QUALIFIER__ bool is_valid() const {
-    return internal::grid::is_valid();
-  }
-  __CG_QUALIFIER__ void sync() const {
-    internal::grid::sync();
-  }
-};
-
-/** \brief User exposed API interface to construct grid cooperative group type
- *         object - `grid_group`
- *
- *  \details User is not allowed to directly construct an object of type
- *           `multi_grid_group`. Instead, he should construct it through this
- *           API function
- */
-__CG_QUALIFIER__ grid_group
-this_grid() {
-  return grid_group(internal::grid::size());
-}
-
-/** \brief The workgroup (thread-block in CUDA terminology) cooperative group
- *         type
- *
- *  \details Represents an intra-workgroup cooperative group type where the
- *           participating threads within the group are exctly the same threads
- *           which are participated in the currently executing `workgroup`
- */
-class thread_block : public thread_group {
-  // Only these friend functions are allowed to construct an object of this
-  // class and access its resources
-  friend __CG_QUALIFIER__ thread_block this_thread_block();
-
- protected:
-  // Construct a workgroup thread group (through the API this_thread_block())
-  explicit __CG_QUALIFIER__ thread_block(uint32_t size)
-      : thread_group(internal::cg_workgroup, size) { }
-
- public:
-  // 3-dimensional block index within the grid
-  __CG_QUALIFIER__ dim3 group_index() {
-    return internal::workgroup::group_index();
-  }
-  // 3-dimensional thread index within the block
-  __CG_QUALIFIER__ dim3 thread_index() {
-    return internal::workgroup::thread_index();
-  }
-  __CG_QUALIFIER__ uint32_t thread_rank() const {
-    return internal::workgroup::thread_rank();
-  }
-  __CG_QUALIFIER__ bool is_valid() const {
-    return internal::workgroup::is_valid();
-  }
-  __CG_QUALIFIER__ void sync() const {
-    internal::workgroup::sync();
-  }
-};
-
-/** \brief User exposed API interface to construct workgroup cooperative
- *         group type object - `thread_block`
- *
- *  \details User is not allowed to directly construct an object of type
- *           `thread_block`. Instead, he should construct it through this API
- *           function
- */
-__CG_QUALIFIER__ thread_block
-this_thread_block() {
-  return thread_block(internal::workgroup::size());
-}
-
-/**
- *  Implemenation of all publicly exposed base class APIs
- */
-__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
-  switch (this->_type) {
-    case internal::cg_multi_grid: {
-      return (static_cast<const multi_grid_group*>(this)->thread_rank());
-    }
-    case internal::cg_grid: {
-      return (static_cast<const grid_group*>(this)->thread_rank());
-    }
-    case internal::cg_workgroup: {
-      return (static_cast<const thread_block*>(this)->thread_rank());
-    }
-    case internal::cg_coalesced_tile: {
-      return local_rank;
-    }
-    default: {
-      assert(false && "invalid cooperative group type");
-      return -1;
-    }
-  }
-}
-
-__CG_QUALIFIER__ bool thread_group::is_valid() const {
-  switch (this->_type) {
-    case internal::cg_multi_grid: {
-      return (static_cast<const multi_grid_group*>(this)->is_valid());
-    }
-    case internal::cg_grid: {
-      return (static_cast<const grid_group*>(this)->is_valid());
-    }
-    case internal::cg_workgroup: {
-      return (static_cast<const thread_block*>(this)->is_valid());
-    }
-    case internal::cg_coalesced_tile: {
-      return _tiled_partition;
-    }
-    default: {
-      assert(false && "invalid cooperative group type");
-      return false;
-    }
-  }
-}
-
-__CG_QUALIFIER__ void thread_group::sync() const {
-  switch (this->_type) {
-    case internal::cg_multi_grid: {
-      static_cast<const multi_grid_group*>(this)->sync();
-      break;
-    }
-    case internal::cg_grid: {
-      static_cast<const grid_group*>(this)->sync();
-      break;
-    }
-    case internal::cg_workgroup: {
-      static_cast<const thread_block*>(this)->sync();
-      break;
-    }
-    case internal::cg_coalesced_tile: {
-      if (!_tiled_partition) // If in a tiled partition, this is a no-op
-            __syncthreads();
-      break;
-    }
-    default: {
-      assert(false && "invalid cooperative group type");
-    }
-  }
-}
-
-/**
- *  Implemenation of publicly exposed `wrapper` APIs on top of basic cooperative
- *  group type APIs
- */
-template <class CGTy>
-__CG_QUALIFIER__ uint32_t group_size(CGTy const &g) {
-  return g.size();
-}
-
-template <class CGTy>
-__CG_QUALIFIER__ uint32_t thread_rank(CGTy const &g) {
-  return g.thread_rank();
-}
-
-template <class CGTy>
-__CG_QUALIFIER__ bool is_valid(CGTy const &g) {
-  return g.is_valid();
-}
-
-template <class CGTy>
-__CG_QUALIFIER__ void sync(CGTy const &g) {
-  g.sync();
-}
-
-} // namespace cooperative_groups
-
-#endif // __cplusplus
-#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
diff --git a/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h b/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h
deleted file mode 100644
index 7f8e69da11c3..000000000000
--- a/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
-Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-/**
- *  @file  hcc_detail/hip_cooperative_groups_helper.h
- *
- *  @brief Device side implementation of cooperative group feature.
- *
- *  Defines helper constructs and APIs which aid the types and device API
- *  wrappers defined within `hcc_detail/hip_cooperative_groups.h`.
- */
-#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
-#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
-
-#if __cplusplus
-#include <hip/hcc_detail/hip_runtime_api.h>
-#include <hip/hcc_detail/device_functions.h>
-
-#if !defined(__align__)
-#define __align__(x) __attribute__((aligned(x)))
-#endif
-
-#if !defined(__CG_QUALIFIER__)
-#define __CG_QUALIFIER__ __device__ __forceinline__
-#endif
-
-#if !defined(__CG_STATIC_QUALIFIER__)
-#define __CG_STATIC_QUALIFIER__ __device__ static __forceinline__
-#endif
-
-#if !defined(WAVEFRONT_SIZE)
-#define WAVEFRONT_SIZE 64
-#endif
-
-namespace cooperative_groups {
-
-namespace internal {
-
-/** \brief Enums representing different cooperative group types
- */
-typedef enum {
-  cg_invalid,
-  cg_multi_grid,
-  cg_grid,
-  cg_workgroup,
-  cg_coalesced_tile
-} group_type;
-
-/**
- *  Functionalities related to multi-grid cooperative group type
- */
-namespace multi_grid {
-
-__CG_STATIC_QUALIFIER__ uint32_t num_grids() {
-  return (uint32_t)__ockl_multi_grid_num_grids();
-}
-
-__CG_STATIC_QUALIFIER__ uint32_t grid_rank() {
-  return (uint32_t)__ockl_multi_grid_grid_rank();
-}
-
-__CG_STATIC_QUALIFIER__ uint32_t size() {
-  return (uint32_t)__ockl_multi_grid_size();
-}
-
-__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
-  return (uint32_t)__ockl_multi_grid_thread_rank();
-}
-
-__CG_STATIC_QUALIFIER__ bool is_valid() {
-  return (bool)__ockl_multi_grid_is_valid();
-}
-
-__CG_STATIC_QUALIFIER__ void sync() {
-  __ockl_multi_grid_sync();
-}
-
-} // namespace multi_grid
-
-/**
- *  Functionalities related to grid cooperative group type
- */
-namespace grid {
-
-__CG_STATIC_QUALIFIER__ uint32_t size() {
-  return (uint32_t)((hipBlockDim_z * hipGridDim_z) *
-                    (hipBlockDim_y * hipGridDim_y) *
-                    (hipBlockDim_x * hipGridDim_x));
-}
-
-__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
-  // Compute global id of the workgroup to which the current thread belongs to
-  uint32_t blkIdx =
-           (uint32_t)((hipBlockIdx_z * hipGridDim_y * hipGridDim_x) +
-                      (hipBlockIdx_y * hipGridDim_x) +
-                      (hipBlockIdx_x));
-
-  // Compute total number of threads being passed to reach current workgroup
-  // within grid
-  uint32_t num_threads_till_current_workgroup =
-           (uint32_t)(blkIdx * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));
-
-  // Compute thread local rank within current workgroup
-  uint32_t local_thread_rank =
-           (uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) +
-                      (hipThreadIdx_y * hipBlockDim_x) +
-                      (hipThreadIdx_x));
-
-  return (num_threads_till_current_workgroup + local_thread_rank);
-}
-
-__CG_STATIC_QUALIFIER__ bool is_valid() {
-  return (bool)__ockl_grid_is_valid();
-}
-
-__CG_STATIC_QUALIFIER__ void sync() {
-  __ockl_grid_sync();
-}
-
-} // namespace grid
-
-/**
- *  Functionalities related to `workgroup` (thread_block in CUDA terminology)
- *  cooperative group type
- */
-namespace workgroup {
-
-__CG_STATIC_QUALIFIER__ dim3 group_index() {
-  return (dim3((uint32_t)hipBlockIdx_x, (uint32_t)hipBlockIdx_y,
-               (uint32_t)hipBlockIdx_z));
-}
-
-__CG_STATIC_QUALIFIER__ dim3 thread_index() {
-  return (dim3((uint32_t)hipThreadIdx_x, (uint32_t)hipThreadIdx_y,
-               (uint32_t)hipThreadIdx_z));
-}
-
-__CG_STATIC_QUALIFIER__ uint32_t size() {
-  return((uint32_t)(hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));
-}
-
-__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
- return ((uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) +
-                    (hipThreadIdx_y * hipBlockDim_x) +
-                    (hipThreadIdx_x)));
-}
-
-__CG_STATIC_QUALIFIER__ bool is_valid() {
-   //TODO(mahesha) any functionality need to be added here? I believe not
-  return true;
-}
-
-__CG_STATIC_QUALIFIER__ void sync() {
-  __syncthreads();
-}
-
-} // namespace workgroup
-
-} // namespace internal
-
-} // namespace cooperative_groups
-
-#endif // __cplusplus
-#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
diff --git a/op_builder/transformer.py b/op_builder/transformer.py
index 606d0be255ef..234fb616f0b3 100644
--- a/op_builder/transformer.py
+++ b/op_builder/transformer.py
@@ -29,11 +29,7 @@ def sources(self):
         ]
 
     def include_paths(self):
-        includes = ['csrc/includes']
-        if is_rocm_pytorch:
-            from torch.utils.cpp_extension import ROCM_HOME
-            includes += ['{}/hiprand/include'.format(ROCM_HOME), '{}/rocrand/include'.format(ROCM_HOME)]
-        return includes
+        return ['csrc/includes']
 
     def nvcc_args(self):
         args = [

From 2585f2918b831d68a3460bc1b47aad827a47d3d8 Mon Sep 17 00:00:00 2001
From: rraminen <rraminen@amd.com>
Date: Wed, 23 Jun 2021 19:04:37 +0000
Subject: [PATCH 57/66] Added back the required code from the commits,
 1c69737e1a8a8ae5ed9d295937458d54a65f4702 and
 7be71d322e2e4e33a1dc6b85044e23d4e54b0283

---
 docker/Dockerfile.rocm    | 2 +-
 op_builder/transformer.py | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 3e850a5238b3..5cbb0be580eb 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -168,6 +168,6 @@ RUN git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed.git ${STAGE_DIR}
 RUN cd ${STAGE_DIR}/DeepSpeed && \
         git checkout . && \
         git checkout master && \
-        DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 ./install.sh --allow_sudo
+        DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_TRANSFORMER=1 DS_BUILD_STOCHASTIC_TRANSFORMER=1 DS_BUILD_UTILS=1 ./install.sh --allow_sudo
 RUN rm -rf ${STAGE_DIR}/DeepSpeed
 RUN cd ~ && python -c "import deepspeed; print(deepspeed.__version__)"
diff --git a/op_builder/transformer.py b/op_builder/transformer.py
index 234fb616f0b3..606d0be255ef 100644
--- a/op_builder/transformer.py
+++ b/op_builder/transformer.py
@@ -29,7 +29,11 @@ def sources(self):
         ]
 
     def include_paths(self):
-        return ['csrc/includes']
+        includes = ['csrc/includes']
+        if is_rocm_pytorch:
+            from torch.utils.cpp_extension import ROCM_HOME
+            includes += ['{}/hiprand/include'.format(ROCM_HOME), '{}/rocrand/include'.format(ROCM_HOME)]
+        return includes
 
     def nvcc_args(self):
         args = [

From 0be96458a329b5df77d98e43d85f614b89fb388d Mon Sep 17 00:00:00 2001
From: rraminen <rraminen@amd.com>
Date: Wed, 23 Jun 2021 19:51:43 +0000
Subject: [PATCH 58/66] Revert "Cooperative Groups workaround for transformer
 kernels extension"

This reverts commit fbddd9316c3686a15e3d805cbececf25feb4aa69.
---
 csrc/transformer/general_kernels.cu   |  5 +--
 csrc/transformer/normalize_kernels.cu | 64 +++++++--------------------
 csrc/transformer/softmax_kernels.cu   | 16 ++-----
 3 files changed, 21 insertions(+), 64 deletions(-)

diff --git a/csrc/transformer/general_kernels.cu b/csrc/transformer/general_kernels.cu
index 180e93ce4dde..7d318773f354 100644
--- a/csrc/transformer/general_kernels.cu
+++ b/csrc/transformer/general_kernels.cu
@@ -11,10 +11,7 @@ __global__ void column_sum_reduce(const T* __restrict__ inp,
     __shared__ float tile[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
-    g.tiled_partition(b, TILE_DIM);
-
+    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
 
diff --git a/csrc/transformer/normalize_kernels.cu b/csrc/transformer/normalize_kernels.cu
index c69c47ebf1c8..366e93724638 100644
--- a/csrc/transformer/normalize_kernels.cu
+++ b/csrc/transformer/normalize_kernels.cu
@@ -28,9 +28,7 @@ __global__ void fused_bias_residual_layer_norm(float* vals,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -128,9 +126,7 @@ __global__ void fused_bias_residual_layer_norm(__half* vals,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
-    g.tiled_partition(b, 32);
+    cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -318,9 +314,7 @@ __global__ void fused_bias_residual_layer_norm(float* vals,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
-    g.tiled_partition(b, 32);
+    cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -416,9 +410,7 @@ __global__ void fused_bias_residual_layer_norm(__half* vals,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
-    g.tiled_partition(b, 32);
+    cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -626,9 +618,7 @@ __global__ void LayerNormBackward1(const T* __restrict__ out_grad,
     __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
-    g.tiled_partition(b, TILE_DIM);
+    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
     int offset = threadIdx.y * width + idx;
@@ -695,9 +685,7 @@ __global__ void LayerNormBackward1(const T* __restrict__ out_grad,
     __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
-    g.tiled_partition(b, TILE_DIM);
+    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
     int offset = threadIdx.y * width + idx;
@@ -762,9 +750,7 @@ __global__ void LayerNormBackward2(const float* out_grad,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -864,9 +850,7 @@ __global__ void LayerNormBackward2(const __half* out_grad,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1081,9 +1065,7 @@ __global__ void LayerNormBackward2(const float* out_grad,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1178,9 +1160,7 @@ __global__ void LayerNormBackward2(const __half* out_grad,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1382,9 +1362,7 @@ __global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1,
     __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
-    g.tiled_partition(b, TILE_DIM);
+    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
     int offset = threadIdx.y * width + idx;
@@ -1446,9 +1424,7 @@ __global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1,
     __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
-    g.tiled_partition(b, TILE_DIM);
+    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
     int offset = threadIdx.y * width + idx;
@@ -1507,9 +1483,7 @@ __global__ void LayerNormBackward2_fused_add(const float* out_grad1,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1613,9 +1587,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1833,9 +1805,7 @@ __global__ void LayerNormBackward2_fused_add(const float* out_grad1,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1938,9 +1908,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
diff --git a/csrc/transformer/softmax_kernels.cu b/csrc/transformer/softmax_kernels.cu
index 18398148f559..be776b0c074d 100644
--- a/csrc/transformer/softmax_kernels.cu
+++ b/csrc/transformer/softmax_kernels.cu
@@ -20,9 +20,7 @@ __global__ void attn_softmax(float* vals,
     int block_width = blockStride * seq_length;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
-    g.tiled_partition(b, tbSize);
+    cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
 
     int batch = blockIdx.x;
     int row = blockIdx.y;
@@ -153,9 +151,7 @@ __global__ void attn_softmax(__half* vals,
     int block_width = blockStride * seq_length;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
-    g.tiled_partition(b, tbSize);
+    cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
 
     int batch = blockIdx.x;
     int row = blockIdx.y;
@@ -449,9 +445,7 @@ __global__ void softmax_backward_kernel(T* out_grad, const T* soft_inp, int seq_
                           : MAX_THREAD_ITERATIONS);
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
-    g.tiled_partition(b, tbSize);
+    cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -526,9 +520,7 @@ __global__ void softmax_backward_kernel_v2(T* grad /* input & output*/,
     }
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_xor(sum, i);
 

From f428da58c96b46afcc68952c19c078e7d4e3e738 Mon Sep 17 00:00:00 2001
From: rraminen <rraminen@amd.com>
Date: Wed, 23 Jun 2021 21:29:39 +0000
Subject: [PATCH 59/66] Added defined(__HIP_PLATFORM_HCC__) to kernels code

---
 csrc/lamb/fused_lamb_cuda_kernel.cu           |  2 +-
 csrc/quantization/quantizer.cu                |  8 ++++----
 csrc/transformer/gelu_kernels.cu              |  6 +++---
 csrc/transformer/inference/csrc/dequantize.cu |  2 +-
 csrc/transformer/inference/csrc/gelu.cu       |  6 +++---
 csrc/transformer/inference/csrc/normalize.cu  |  4 ++--
 csrc/transformer/inference/csrc/softmax.cu    |  2 +-
 csrc/transformer/normalize_kernels.cu         |  4 ++--
 csrc/transformer/softmax_kernels.cu           |  2 +-
 csrc/transformer/transform_kernels.cu         | 10 +++++-----
 10 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/csrc/lamb/fused_lamb_cuda_kernel.cu b/csrc/lamb/fused_lamb_cuda_kernel.cu
index e12b2c8585b4..1a8cd6071f91 100644
--- a/csrc/lamb/fused_lamb_cuda_kernel.cu
+++ b/csrc/lamb/fused_lamb_cuda_kernel.cu
@@ -122,7 +122,7 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
     cg::sync(cta);
 #endif
 
-#if (__CUDA_ARCH__ >= 300)
+#if (__CUDA_ARCH__ >= 300) || defined(__HIP_PLATFORM_HCC__)
     if (tid < 32) {
         cg::coalesced_group active = cg::coalesced_threads();
 
diff --git a/csrc/quantization/quantizer.cu b/csrc/quantization/quantizer.cu
index c48ae38969e3..f79c3ecb1e12 100644
--- a/csrc/quantization/quantizer.cu
+++ b/csrc/quantization/quantizer.cu
@@ -5,7 +5,7 @@ namespace cg = cooperative_groups;
 
 __global__ void qunatize_kernel(__half* vals, int group_size, int num_bits)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@@ -206,7 +206,7 @@ __global__ void sr_qunatize_kernel(__half* vals,
                                    int num_bits,
                                    std::pair<uint64_t, uint64_t> seed)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@@ -484,7 +484,7 @@ template void launch_sr_qunatize_kernel(__half* vals,
 
 __global__ void qunatize_kernel_asym(__half* vals, int group_size, int num_bits)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@@ -729,7 +729,7 @@ __global__ void sr_qunatize_kernel_asym(__half* vals,
                                         int num_bits,
                                         std::pair<uint64_t, uint64_t> seed)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
diff --git a/csrc/transformer/gelu_kernels.cu b/csrc/transformer/gelu_kernels.cu
index 12048006266e..dbb8828ce977 100644
--- a/csrc/transformer/gelu_kernels.cu
+++ b/csrc/transformer/gelu_kernels.cu
@@ -60,7 +60,7 @@ __global__ void gelu_kernel(const float* input, float* vals, int row_stride, int
 
 __global__ void gelu_kernel(const __half* input, __half* vals, int row_stride, int iterations)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     int row = blockIdx.x;
     int id = threadIdx.x;
     int loop_stride = blockDim.x;
@@ -131,7 +131,7 @@ __global__ void fused_bias_gelu(const __half* input,
                                 int row_stride,
                                 int iterations)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     int row = blockIdx.x;
     int id = threadIdx.x;
     int loop_stride = blockDim.x;
@@ -214,7 +214,7 @@ __global__ void d_gelu_func(__half* d_output,
                             int row_stride,
                             int iterations)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     int row = blockIdx.x;
     int id = threadIdx.x;
     int loop_stride = blockDim.x;
diff --git a/csrc/transformer/inference/csrc/dequantize.cu b/csrc/transformer/inference/csrc/dequantize.cu
index ddf7a958822a..43d50f00c058 100644
--- a/csrc/transformer/inference/csrc/dequantize.cu
+++ b/csrc/transformer/inference/csrc/dequantize.cu
@@ -46,7 +46,7 @@ __global__ void dequantize_kernel(__half* output,
                                   unsigned groups,
                                   unsigned merge_count)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     unsigned merge_hidden = hidden_dim >> merge_count;
     unsigned quantization_stride = (merge_hidden * output_size) / groups;
diff --git a/csrc/transformer/inference/csrc/gelu.cu b/csrc/transformer/inference/csrc/gelu.cu
index fc3faacc54e8..1737855f614d 100755
--- a/csrc/transformer/inference/csrc/gelu.cu
+++ b/csrc/transformer/inference/csrc/gelu.cu
@@ -39,7 +39,7 @@ __global__ void fused_bias_gelu(__half* input,
                                 int total_count,
                                 int intermediate_size)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     float2* input_cast = reinterpret_cast<float2*>(input);
     const float2* bias_cast = reinterpret_cast<const float2*>(bias);
@@ -117,7 +117,7 @@ __global__ void fused_bias_add(float* input, const float* bias, int total_count,
 
 __global__ void fused_bias_add(__half* input, const __half* bias, int total_count, int hidden_size)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     float2* input_cast = reinterpret_cast<float2*>(input);
     const float2* bias_cast = reinterpret_cast<const float2*>(bias);
@@ -195,7 +195,7 @@ __global__ void fused_bias_residual(__half* input,
                                     int total_count,
                                     int intermediate_size)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     float2* input_cast = reinterpret_cast<float2*>(input);
     const float2* residual_cast = reinterpret_cast<const float2*>(residual);
diff --git a/csrc/transformer/inference/csrc/normalize.cu b/csrc/transformer/inference/csrc/normalize.cu
index ecd73154f37f..dc0f6be01144 100755
--- a/csrc/transformer/inference/csrc/normalize.cu
+++ b/csrc/transformer/inference/csrc/normalize.cu
@@ -85,7 +85,7 @@ __global__ void fused_bias_residual_layer_norm(__half* output,
                                                float epsilon,
                                                int row_stride)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     int iteration_stride = blockDim.x;
     int iterations = row_stride / iteration_stride;
 
@@ -287,7 +287,7 @@ __global__ void fused_residual_layer_norm(__half* norm,
                                           int row_stride,
                                           bool preLN)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     int iteration_stride = blockDim.x;
 
     cg::thread_block b = cg::this_thread_block();
diff --git a/csrc/transformer/inference/csrc/softmax.cu b/csrc/transformer/inference/csrc/softmax.cu
index cee509965106..b347945df636 100644
--- a/csrc/transformer/inference/csrc/softmax.cu
+++ b/csrc/transformer/inference/csrc/softmax.cu
@@ -37,7 +37,7 @@ __global__ void attn_softmax_v2(__half* vals,
                                 int num_seq,
                                 float scale)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
diff --git a/csrc/transformer/normalize_kernels.cu b/csrc/transformer/normalize_kernels.cu
index 366e93724638..c9bc4a46ee5e 100644
--- a/csrc/transformer/normalize_kernels.cu
+++ b/csrc/transformer/normalize_kernels.cu
@@ -121,7 +121,7 @@ __global__ void fused_bias_residual_layer_norm(__half* vals,
                                                __half* means,
                                                int row_stride)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     int iteration_stride = blockDim.x;
     int iterations = row_stride / iteration_stride;
 
@@ -404,7 +404,7 @@ __global__ void fused_bias_residual_layer_norm(__half* vals,
                                                __half* vars,
                                                int row_stride)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     int iteration_stride = blockDim.x;
     int iterations = row_stride / iteration_stride;
diff --git a/csrc/transformer/softmax_kernels.cu b/csrc/transformer/softmax_kernels.cu
index be776b0c074d..a4d84c37dd3b 100644
--- a/csrc/transformer/softmax_kernels.cu
+++ b/csrc/transformer/softmax_kernels.cu
@@ -142,7 +142,7 @@ __global__ void attn_softmax(__half* vals,
                              int seq_length,
                              int iterations)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     __shared__ float partialSum[MAX_WARP_NUM];
 
     int warp_num = blockDim.x >> 5;
diff --git a/csrc/transformer/transform_kernels.cu b/csrc/transformer/transform_kernels.cu
index 7d8a27eeeb43..b68d70f67ae1 100755
--- a/csrc/transformer/transform_kernels.cu
+++ b/csrc/transformer/transform_kernels.cu
@@ -96,7 +96,7 @@ __global__ void transform_0213<__half>(__half* output,
                                        int heads,
                                        int head_ext)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     int d0_stride = hidden_dim * seq_length;
     int d1_stride = hidden_dim;
@@ -219,7 +219,7 @@ __global__ void bias_add_transform_0213<__half>(__half* output,
                                                 int heads,
                                                 int head_ext)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     int d0_stride = hidden_dim * seq_length;
     int d1_stride = hidden_dim;
@@ -289,7 +289,7 @@ __global__ void bias_add_transform_0213_v2(__half* output,
                                            int seq_length,
                                            int heads)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     __shared__ float4 in_data[3072];
 
     int d0_stride = hidden_dim * seq_length;
@@ -451,7 +451,7 @@ __global__ void transform4d_0213<__half>(__half* out,
                                          int hidden_dim,
                                          int head_ext)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     int d0_stride = hidden_dim * (seq_length / head_ext);
     int d1_stride = hidden_dim;
@@ -487,7 +487,7 @@ __global__ void transform4d_0213_v2(__half* out,
                                     int seq_length,
                                     int hidden_dim)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     __shared__ float4 in_data[3072];
 
     int d0_stride = hidden_dim * seq_length;

From ed2ee34e35e82ab27497859e2c36537fa2f43152 Mon Sep 17 00:00:00 2001
From: rraminen <rraminen@amd.com>
Date: Wed, 23 Jun 2021 21:29:45 +0000
Subject: [PATCH 60/66] Revert "Enable cooperative groups for ROCm"

This reverts commit 077638dabbd8a6183758f88352ba69fad53bf98a.
---
 csrc/lamb/fused_lamb_cuda_kernel.cu | 44 -----------------------------
 1 file changed, 44 deletions(-)

diff --git a/csrc/lamb/fused_lamb_cuda_kernel.cu b/csrc/lamb/fused_lamb_cuda_kernel.cu
index 1a8cd6071f91..f74746f93752 100644
--- a/csrc/lamb/fused_lamb_cuda_kernel.cu
+++ b/csrc/lamb/fused_lamb_cuda_kernel.cu
@@ -14,11 +14,7 @@
 #include <iostream>
 
 //#include <helper_functions.h>
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-#include <hip/hip_cooperative_groups.h>
-#else
 #include <cooperative_groups.h>
-#endif
 #include <cuda_runtime_api.h>
 #include <stdio.h>
 
@@ -82,11 +78,7 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
     T a_sum = s_a[tid];
     T b_sum = s_b[tid];
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     // do reduction in shared mem
     if ((blockSize >= 512) && (tid < 256)) {
@@ -94,33 +86,21 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
         s_b[tid] = b_sum = b_sum + s_b[tid + 256];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 256) && (tid < 128)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 128];
         s_b[tid] = b_sum = b_sum + s_b[tid + 128];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 128) && (tid < 64)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 64];
         s_b[tid] = b_sum = b_sum + s_b[tid + 64];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
 #if (__CUDA_ARCH__ >= 300) || defined(__HIP_PLATFORM_HCC__)
     if (tid < 32) {
@@ -144,66 +124,42 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
         s_b[tid] = b_sum = b_sum + s_b[tid + 32];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 32) && (tid < 16)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 16];
         s_b[tid] = b_sum = b_sum + s_b[tid + 16];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 16) && (tid < 8)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 8];
         s_b[tid] = b_sum = b_sum + s_b[tid + 8];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 8) && (tid < 4)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 4];
         s_b[tid] = b_sum = b_sum + s_b[tid + 4];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 4) && (tid < 2)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 2];
         s_b[tid] = b_sum = b_sum + s_b[tid + 2];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 2) && (tid < 1)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 1];
         s_b[tid] = b_sum = b_sum + s_b[tid + 1];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
 #endif
 

From 742fd6486d02b84332ac3a8c72c52a22c03661fa Mon Sep 17 00:00:00 2001
From: rraminen <rraminen@amd.com>
Date: Wed, 23 Jun 2021 21:31:59 +0000
Subject: [PATCH 61/66] Enable cooperative groups for ROCm

---
 csrc/lamb/fused_lamb_cuda_kernel.cu | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/csrc/lamb/fused_lamb_cuda_kernel.cu b/csrc/lamb/fused_lamb_cuda_kernel.cu
index f74746f93752..a6c610ada499 100644
--- a/csrc/lamb/fused_lamb_cuda_kernel.cu
+++ b/csrc/lamb/fused_lamb_cuda_kernel.cu
@@ -14,7 +14,11 @@
 #include <iostream>
 
 //#include <helper_functions.h>
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+#include <hip/hip_cooperative_groups.h>
+#else
 #include <cooperative_groups.h>
+#endif
 #include <cuda_runtime_api.h>
 #include <stdio.h>
 

From 1d20b14ab5da92fdd97e440ba1bceb38852b6892 Mon Sep 17 00:00:00 2001
From: rraminen <rraminen@amd.com>
Date: Thu, 24 Jun 2021 20:46:01 +0000
Subject: [PATCH 62/66] Added CuPy installation from source

---
 docker/Dockerfile.rocm | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 7b80c3facbd4..bd7cc4a242fa 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -124,7 +124,7 @@ RUN pip install psutil \
                 numpy \
                 sklearn \
                 scikit-learn \
-                mpi4py
+                mpi4py 
 
 ##############################################################################
 ## SSH daemon port inside container cannot conflict with host OS port
@@ -150,6 +150,15 @@ RUN cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
 RUN rm -rf /usr/lib/python3/dist-packages/yaml && \
         rm -rf /usr/lib/python3/dist-packages/PyYAML-*
 
+##############################################################################
+## CuPy installation
+###############################################################################
+RUN git clone https://github.com/ROCmSoftwarePlatform/cupy ${STAGE_DIR}/cupy
+RUN cd ${STAGE_DIR}/cupy && \
+        git submodule update --init && \
+        CUPY_INSTALL_USE_HIP=1 ROCM_HOME=/opt/rocm pip install -e . --no-cache-dir -vvvv
+RUN rm -rf ${STAGE_DIR}/cupy
+
 ##############################################################################
 ## Add deepspeed user
 ###############################################################################

From f6c79ae3857860dc4797483f9ad6da735419d650 Mon Sep 17 00:00:00 2001
From: rraminen <rraminen@amd.com>
Date: Thu, 24 Jun 2021 20:47:33 +0000
Subject: [PATCH 63/66] Added h5py installation

---
 docker/Dockerfile.rocm | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index bd7cc4a242fa..c4383a0eed5d 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -124,7 +124,8 @@ RUN pip install psutil \
                 numpy \
                 sklearn \
                 scikit-learn \
-                mpi4py 
+                mpi4py \
+                h5py 
 
 ##############################################################################
 ## SSH daemon port inside container cannot conflict with host OS port

From 81b744ef24a92ae2eb463458a075e512ac03918e Mon Sep 17 00:00:00 2001
From: rraminen <rraminen@amd.com>
Date: Mon, 28 Jun 2021 23:02:35 +0000
Subject: [PATCH 64/66] hip cooperative groups functionality for
 coalesced_group in fused_lamb_cuda_kernel.cu is not implemented yet

---
 csrc/lamb/fused_lamb_cuda_kernel.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/lamb/fused_lamb_cuda_kernel.cu b/csrc/lamb/fused_lamb_cuda_kernel.cu
index a6c610ada499..10a17e98a13d 100644
--- a/csrc/lamb/fused_lamb_cuda_kernel.cu
+++ b/csrc/lamb/fused_lamb_cuda_kernel.cu
@@ -106,7 +106,7 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
 
     cg::sync(cta);
 
-#if (__CUDA_ARCH__ >= 300) || defined(__HIP_PLATFORM_HCC__)
+#if (__CUDA_ARCH__ >= 300)
     if (tid < 32) {
         cg::coalesced_group active = cg::coalesced_threads();
 

From 5b0fac74999ca0fc5fd4e10ecd6bd614eea0343c Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Tue, 29 Jun 2021 18:59:28 +0000
Subject: [PATCH 65/66] Revert "Merge pull request #21 from
 rraminen/PR_Revert_HIP_Cooperative_Groups_Workaround"

This reverts commit bf2979b5fe04fa70f5e8ca23d6af76349ccb68b9, reversing
changes made to 0cf33069528e92716e96c1c9664e83219eb72c4a.
---
 .../hip/hcc_detail/hip_cooperative_groups.h   | 362 ++++++++++++++++++
 .../hip_cooperative_groups_helper.h           | 183 +++++++++
 csrc/lamb/fused_lamb_cuda_kernel.cu           |  40 ++
 csrc/quantization/quantizer.cu                |   8 +-
 csrc/transformer/gelu_kernels.cu              |   6 +-
 csrc/transformer/general_kernels.cu           |   5 +-
 csrc/transformer/inference/csrc/dequantize.cu |   2 +-
 csrc/transformer/inference/csrc/gelu.cu       |   6 +-
 csrc/transformer/inference/csrc/normalize.cu  |   4 +-
 csrc/transformer/inference/csrc/softmax.cu    |   2 +-
 csrc/transformer/normalize_kernels.cu         |  68 +++-
 csrc/transformer/softmax_kernels.cu           |  18 +-
 csrc/transformer/transform_kernels.cu         |  10 +-
 docker/Dockerfile.rocm                        |   2 +
 14 files changed, 673 insertions(+), 43 deletions(-)
 create mode 100644 csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h
 create mode 100644 csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h

diff --git a/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h b/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h
new file mode 100644
index 000000000000..20e7bb94b8ad
--- /dev/null
+++ b/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h
@@ -0,0 +1,362 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ *  @file  hcc_detail/hip_cooperative_groups.h
+ *
+ *  @brief Device side implementation of `Cooperative Group` feature.
+ *
+ *  Defines new types and device API wrappers related to `Cooperative Group`
+ *  feature, which the programmer can directly use in his kernel(s) in order to
+ *  make use of this feature.
+ */
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
+
+//#if __cplusplus
+#if __cplusplus && defined(__clang__) && defined(__HIP__)
+#include <hip/hcc_detail/hip_cooperative_groups_helper.h>
+#include <hip/hcc_detail/device_functions.h>
+namespace cooperative_groups {
+
+/** \brief The base type of all cooperative group types
+ *
+ *  \details Holds the key properties of a constructed cooperative group type
+ *           object, like the group type, its size, etc
+ */
+/*
+class thread_group {
+ protected:
+  uint32_t _type; // thread_group type
+  uint32_t _size; // total number of threads in the tread_group
+  uint64_t _mask; // Lanemask for coalesced and tiled partitioned group types,
+                  // LSB represents lane 0, and MSB represents lane 63
+
+  // Construct a thread group, and set thread group type and other essential
+  // thread group properties. This generic thread group is directly constructed
+  // only when the group is supposed to contain only the calling the thread
+  // (throurh the API - `this_thread()`), and in all other cases, this thread
+  // group object is a sub-object of some other derived thread group object
+  __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size,
+                                uint64_t mask = (uint64_t)0) {
+    _type = type;
+    _size = size;
+    _mask = mask;
+  }
+
+ public:
+  // Total number of threads in the thread group, and this serves the purpose
+  // for all derived cooperative group types since their `size` is directly
+  // saved during the construction
+  __CG_QUALIFIER__ uint32_t size() const {
+    return _size;
+  }
+  // Rank of the calling thread within [0, size())
+  __CG_QUALIFIER__ uint32_t thread_rank() const;
+  // Is this cooperative group type valid?
+  __CG_QUALIFIER__ bool is_valid() const;
+  // synchronize the threads in the thread group
+  __CG_QUALIFIER__ void sync() const;
+};
+*/
+
+class thread_group {
+ protected:
+    bool _tiled_partition; // this_thread_block() constructor sets to false
+    uint32_t _size;            // this_thread_block() constructor sets to size()
+    uint32_t local_rank;      // this_thread_block() constructor sets to thread_rank()
+    uint32_t _mask;
+    uint32_t _type;
+ public:
+    __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t group_size,
+                                uint64_t mask = (uint64_t)0) {
+    _type = type;
+    _size = group_size;
+    _mask = mask;
+    local_rank = internal::workgroup::thread_rank(); 
+    }
+
+    __CG_QUALIFIER__ void tiled_partition(const thread_group& parent,
+                                               unsigned int tile_size) {
+        if ( (ceil(log2(tile_size)) == floor(log2(tile_size))) || tile_size == 0 ||
+            tile_size > 64 || parent.size() < tile_size)
+            _tiled_partition =  false;
+            //xxx : abort
+        _tiled_partition = true;
+        _size = tile_size;
+        local_rank = parent.thread_rank() % tile_size;
+    }
+    __CG_QUALIFIER__ void sync() const; 
+    __CG_QUALIFIER__ uint32_t size() const {
+        return _size;
+    }
+    __CG_QUALIFIER__ uint32_t thread_rank() const; 
+    __CG_QUALIFIER__ float shfl_down(float var, unsigned int delta) const {
+        return (__shfl_down(var, delta, _size));
+    }
+     __CG_QUALIFIER__ float shfl_xor(float var, int mask) const {
+        return (__shfl_xor(var, mask, _size));
+    }
+    __CG_QUALIFIER__ float shfl(float var, unsigned int src_lane) const {
+        return (__shfl(var, src_lane, _size));
+    }
+    __CG_QUALIFIER__ bool is_valid() const;
+
+};
+
+/** \brief The multi-grid cooperative group type
+ *
+ *  \details Represents an inter-device cooperative group type where the
+ *           participating threads within the group spans across multple
+ *           devices, running the (same) kernel on these devices
+ */
+class multi_grid_group : public thread_group {
+  // Only these friend functions are allowed to construct an object of this class
+  // and access its resources
+  friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();
+
+ protected:
+  // Construct mutli-grid thread group (through the API this_multi_grid())
+  explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
+      : thread_group(internal::cg_multi_grid, size) { }
+
+ public:
+  // Number of invocations participating in this multi-grid group. In other
+  // words, the number of GPUs
+	__CG_QUALIFIER__ uint32_t num_grids() {
+    return internal::multi_grid::num_grids();
+  }
+  // Rank of this invocation. In other words, an ID number within the range
+  // [0, num_grids()) of the GPU, this kernel is running on
+	__CG_QUALIFIER__ uint32_t grid_rank() {
+    return internal::multi_grid::grid_rank();
+  }
+  __CG_QUALIFIER__ uint32_t thread_rank() const {
+    return internal::multi_grid::thread_rank();
+  }
+  __CG_QUALIFIER__ bool is_valid() const {
+    return internal::multi_grid::is_valid();
+  }
+  __CG_QUALIFIER__ void sync() const {
+    internal::multi_grid::sync();
+  }
+};
+
+/** \brief User exposed API interface to construct multi-grid cooperative
+ *         group type object - `multi_grid_group`
+ *
+ *  \details User is not allowed to directly construct an object of type
+ *           `multi_grid_group`. Instead, he should construct it through this
+ *           API function
+ */
+__CG_QUALIFIER__ multi_grid_group
+this_multi_grid() {
+  return multi_grid_group(internal::multi_grid::size());
+}
+
+/** \brief The grid cooperative group type
+ *
+ *  \details Represents an inter-workgroup cooperative group type where the
+ *           participating threads within the group spans across multiple
+ *           workgroups running the (same) kernel on the same device
+ */
+class grid_group : public thread_group {
+  // Only these friend functions are allowed to construct an object of this class
+  // and access its resources
+  friend __CG_QUALIFIER__ grid_group this_grid();
+
+ protected:
+  // Construct grid thread group (through the API this_grid())
+  explicit __CG_QUALIFIER__ grid_group(uint32_t size)
+      : thread_group(internal::cg_grid, size) { }
+
+ public:
+  __CG_QUALIFIER__ uint32_t thread_rank() const {
+    return internal::grid::thread_rank();
+  }
+  __CG_QUALIFIER__ bool is_valid() const {
+    return internal::grid::is_valid();
+  }
+  __CG_QUALIFIER__ void sync() const {
+    internal::grid::sync();
+  }
+};
+
+/** \brief User exposed API interface to construct grid cooperative group type
+ *         object - `grid_group`
+ *
+ *  \details User is not allowed to directly construct an object of type
+ *           `multi_grid_group`. Instead, he should construct it through this
+ *           API function
+ */
+__CG_QUALIFIER__ grid_group
+this_grid() {
+  return grid_group(internal::grid::size());
+}
+
+/** \brief The workgroup (thread-block in CUDA terminology) cooperative group
+ *         type
+ *
+ *  \details Represents an intra-workgroup cooperative group type where the
+ *           participating threads within the group are exctly the same threads
+ *           which are participated in the currently executing `workgroup`
+ */
+class thread_block : public thread_group {
+  // Only these friend functions are allowed to construct an object of this
+  // class and access its resources
+  friend __CG_QUALIFIER__ thread_block this_thread_block();
+
+ protected:
+  // Construct a workgroup thread group (through the API this_thread_block())
+  explicit __CG_QUALIFIER__ thread_block(uint32_t size)
+      : thread_group(internal::cg_workgroup, size) { }
+
+ public:
+  // 3-dimensional block index within the grid
+  __CG_QUALIFIER__ dim3 group_index() {
+    return internal::workgroup::group_index();
+  }
+  // 3-dimensional thread index within the block
+  __CG_QUALIFIER__ dim3 thread_index() {
+    return internal::workgroup::thread_index();
+  }
+  __CG_QUALIFIER__ uint32_t thread_rank() const {
+    return internal::workgroup::thread_rank();
+  }
+  __CG_QUALIFIER__ bool is_valid() const {
+    return internal::workgroup::is_valid();
+  }
+  __CG_QUALIFIER__ void sync() const {
+    internal::workgroup::sync();
+  }
+};
+
+/** \brief User exposed API interface to construct workgroup cooperative
+ *         group type object - `thread_block`
+ *
+ *  \details User is not allowed to directly construct an object of type
+ *           `thread_block`. Instead, he should construct it through this API
+ *           function
+ */
+__CG_QUALIFIER__ thread_block
+this_thread_block() {
+  return thread_block(internal::workgroup::size());
+}
+
+/**
+ *  Implemenation of all publicly exposed base class APIs
+ */
+__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
+  switch (this->_type) {
+    case internal::cg_multi_grid: {
+      return (static_cast<const multi_grid_group*>(this)->thread_rank());
+    }
+    case internal::cg_grid: {
+      return (static_cast<const grid_group*>(this)->thread_rank());
+    }
+    case internal::cg_workgroup: {
+      return (static_cast<const thread_block*>(this)->thread_rank());
+    }
+    case internal::cg_coalesced_tile: {
+      return local_rank;
+    }
+    default: {
+      assert(false && "invalid cooperative group type");
+      return -1;
+    }
+  }
+}
+
+__CG_QUALIFIER__ bool thread_group::is_valid() const {
+  switch (this->_type) {
+    case internal::cg_multi_grid: {
+      return (static_cast<const multi_grid_group*>(this)->is_valid());
+    }
+    case internal::cg_grid: {
+      return (static_cast<const grid_group*>(this)->is_valid());
+    }
+    case internal::cg_workgroup: {
+      return (static_cast<const thread_block*>(this)->is_valid());
+    }
+    case internal::cg_coalesced_tile: {
+      return _tiled_partition;
+    }
+    default: {
+      assert(false && "invalid cooperative group type");
+      return false;
+    }
+  }
+}
+
+__CG_QUALIFIER__ void thread_group::sync() const {
+  switch (this->_type) {
+    case internal::cg_multi_grid: {
+      static_cast<const multi_grid_group*>(this)->sync();
+      break;
+    }
+    case internal::cg_grid: {
+      static_cast<const grid_group*>(this)->sync();
+      break;
+    }
+    case internal::cg_workgroup: {
+      static_cast<const thread_block*>(this)->sync();
+      break;
+    }
+    case internal::cg_coalesced_tile: {
+      if (!_tiled_partition) // If in a tiled partition, this is a no-op
+            __syncthreads();
+      break;
+    }
+    default: {
+      assert(false && "invalid cooperative group type");
+    }
+  }
+}
+
+/**
+ *  Implemenation of publicly exposed `wrapper` APIs on top of basic cooperative
+ *  group type APIs
+ */
+template <class CGTy>
+__CG_QUALIFIER__ uint32_t group_size(CGTy const &g) {
+  return g.size();
+}
+
+template <class CGTy>
+__CG_QUALIFIER__ uint32_t thread_rank(CGTy const &g) {
+  return g.thread_rank();
+}
+
+template <class CGTy>
+__CG_QUALIFIER__ bool is_valid(CGTy const &g) {
+  return g.is_valid();
+}
+
+template <class CGTy>
+__CG_QUALIFIER__ void sync(CGTy const &g) {
+  g.sync();
+}
+
+} // namespace cooperative_groups
+
+#endif // __cplusplus
+#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
diff --git a/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h b/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h
new file mode 100644
index 000000000000..7f8e69da11c3
--- /dev/null
+++ b/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h
@@ -0,0 +1,183 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ *  @file  hcc_detail/hip_cooperative_groups_helper.h
+ *
+ *  @brief Device side implementation of cooperative group feature.
+ *
+ *  Defines helper constructs and APIs which aid the types and device API
+ *  wrappers defined within `hcc_detail/hip_cooperative_groups.h`.
+ */
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
+
+#if __cplusplus
+#include <hip/hcc_detail/hip_runtime_api.h>
+#include <hip/hcc_detail/device_functions.h>
+
+#if !defined(__align__)
+#define __align__(x) __attribute__((aligned(x)))
+#endif
+
+#if !defined(__CG_QUALIFIER__)
+#define __CG_QUALIFIER__ __device__ __forceinline__
+#endif
+
+#if !defined(__CG_STATIC_QUALIFIER__)
+#define __CG_STATIC_QUALIFIER__ __device__ static __forceinline__
+#endif
+
+#if !defined(WAVEFRONT_SIZE)
+#define WAVEFRONT_SIZE 64
+#endif
+
+namespace cooperative_groups {
+
+namespace internal {
+
+/** \brief Enums representing different cooperative group types
+ */
+typedef enum {
+  cg_invalid,
+  cg_multi_grid,
+  cg_grid,
+  cg_workgroup,
+  cg_coalesced_tile
+} group_type;
+
+/**
+ *  Functionalities related to multi-grid cooperative group type
+ */
+namespace multi_grid {
+
+__CG_STATIC_QUALIFIER__ uint32_t num_grids() {
+  return (uint32_t)__ockl_multi_grid_num_grids();
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t grid_rank() {
+  return (uint32_t)__ockl_multi_grid_grid_rank();
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t size() {
+  return (uint32_t)__ockl_multi_grid_size();
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
+  return (uint32_t)__ockl_multi_grid_thread_rank();
+}
+
+__CG_STATIC_QUALIFIER__ bool is_valid() {
+  return (bool)__ockl_multi_grid_is_valid();
+}
+
+__CG_STATIC_QUALIFIER__ void sync() {
+  __ockl_multi_grid_sync();
+}
+
+} // namespace multi_grid
+
+/**
+ *  Functionalities related to grid cooperative group type
+ */
+namespace grid {
+
+__CG_STATIC_QUALIFIER__ uint32_t size() {
+  return (uint32_t)((hipBlockDim_z * hipGridDim_z) *
+                    (hipBlockDim_y * hipGridDim_y) *
+                    (hipBlockDim_x * hipGridDim_x));
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
+  // Compute global id of the workgroup to which the current thread belongs to
+  uint32_t blkIdx =
+           (uint32_t)((hipBlockIdx_z * hipGridDim_y * hipGridDim_x) +
+                      (hipBlockIdx_y * hipGridDim_x) +
+                      (hipBlockIdx_x));
+
+  // Compute total number of threads being passed to reach current workgroup
+  // within grid
+  uint32_t num_threads_till_current_workgroup =
+           (uint32_t)(blkIdx * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));
+
+  // Compute thread local rank within current workgroup
+  uint32_t local_thread_rank =
+           (uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) +
+                      (hipThreadIdx_y * hipBlockDim_x) +
+                      (hipThreadIdx_x));
+
+  return (num_threads_till_current_workgroup + local_thread_rank);
+}
+
+__CG_STATIC_QUALIFIER__ bool is_valid() {
+  return (bool)__ockl_grid_is_valid();
+}
+
+__CG_STATIC_QUALIFIER__ void sync() {
+  __ockl_grid_sync();
+}
+
+} // namespace grid
+
+/**
+ *  Functionalities related to `workgroup` (thread_block in CUDA terminology)
+ *  cooperative group type
+ */
+namespace workgroup {
+
+__CG_STATIC_QUALIFIER__ dim3 group_index() {
+  return (dim3((uint32_t)hipBlockIdx_x, (uint32_t)hipBlockIdx_y,
+               (uint32_t)hipBlockIdx_z));
+}
+
+__CG_STATIC_QUALIFIER__ dim3 thread_index() {
+  return (dim3((uint32_t)hipThreadIdx_x, (uint32_t)hipThreadIdx_y,
+               (uint32_t)hipThreadIdx_z));
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t size() {
+  return((uint32_t)(hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
+ return ((uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) +
+                    (hipThreadIdx_y * hipBlockDim_x) +
+                    (hipThreadIdx_x)));
+}
+
+__CG_STATIC_QUALIFIER__ bool is_valid() {
+   //TODO(mahesha) any functionality need to be added here? I believe not
+  return true;
+}
+
+__CG_STATIC_QUALIFIER__ void sync() {
+  __syncthreads();
+}
+
+} // namespace workgroup
+
+} // namespace internal
+
+} // namespace cooperative_groups
+
+#endif // __cplusplus
+#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
diff --git a/csrc/lamb/fused_lamb_cuda_kernel.cu b/csrc/lamb/fused_lamb_cuda_kernel.cu
index 10a17e98a13d..e12b2c8585b4 100644
--- a/csrc/lamb/fused_lamb_cuda_kernel.cu
+++ b/csrc/lamb/fused_lamb_cuda_kernel.cu
@@ -82,7 +82,11 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
     T a_sum = s_a[tid];
     T b_sum = s_b[tid];
 
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
     cg::sync(cta);
+#endif
 
     // do reduction in shared mem
     if ((blockSize >= 512) && (tid < 256)) {
@@ -90,21 +94,33 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
         s_b[tid] = b_sum = b_sum + s_b[tid + 256];
     }
 
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
     cg::sync(cta);
+#endif
 
     if ((blockSize >= 256) && (tid < 128)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 128];
         s_b[tid] = b_sum = b_sum + s_b[tid + 128];
     }
 
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
     cg::sync(cta);
+#endif
 
     if ((blockSize >= 128) && (tid < 64)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 64];
         s_b[tid] = b_sum = b_sum + s_b[tid + 64];
     }
 
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
     cg::sync(cta);
+#endif
 
 #if (__CUDA_ARCH__ >= 300)
     if (tid < 32) {
@@ -128,42 +144,66 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
         s_b[tid] = b_sum = b_sum + s_b[tid + 32];
     }
 
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
     cg::sync(cta);
+#endif
 
     if ((blockSize >= 32) && (tid < 16)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 16];
         s_b[tid] = b_sum = b_sum + s_b[tid + 16];
     }
 
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
     cg::sync(cta);
+#endif
 
     if ((blockSize >= 16) && (tid < 8)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 8];
         s_b[tid] = b_sum = b_sum + s_b[tid + 8];
     }
 
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
     cg::sync(cta);
+#endif
 
     if ((blockSize >= 8) && (tid < 4)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 4];
         s_b[tid] = b_sum = b_sum + s_b[tid + 4];
     }
 
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
     cg::sync(cta);
+#endif
 
     if ((blockSize >= 4) && (tid < 2)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 2];
         s_b[tid] = b_sum = b_sum + s_b[tid + 2];
     }
 
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
     cg::sync(cta);
+#endif
 
     if ((blockSize >= 2) && (tid < 1)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 1];
         s_b[tid] = b_sum = b_sum + s_b[tid + 1];
     }
 
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
     cg::sync(cta);
+#endif
 
 #endif
 
diff --git a/csrc/quantization/quantizer.cu b/csrc/quantization/quantizer.cu
index f79c3ecb1e12..c48ae38969e3 100644
--- a/csrc/quantization/quantizer.cu
+++ b/csrc/quantization/quantizer.cu
@@ -5,7 +5,7 @@ namespace cg = cooperative_groups;
 
 __global__ void qunatize_kernel(__half* vals, int group_size, int num_bits)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700
 
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@@ -206,7 +206,7 @@ __global__ void sr_qunatize_kernel(__half* vals,
                                    int num_bits,
                                    std::pair<uint64_t, uint64_t> seed)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700
 
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@@ -484,7 +484,7 @@ template void launch_sr_qunatize_kernel(__half* vals,
 
 __global__ void qunatize_kernel_asym(__half* vals, int group_size, int num_bits)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700
 
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@@ -729,7 +729,7 @@ __global__ void sr_qunatize_kernel_asym(__half* vals,
                                         int num_bits,
                                         std::pair<uint64_t, uint64_t> seed)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700
 
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
diff --git a/csrc/transformer/gelu_kernels.cu b/csrc/transformer/gelu_kernels.cu
index dbb8828ce977..12048006266e 100644
--- a/csrc/transformer/gelu_kernels.cu
+++ b/csrc/transformer/gelu_kernels.cu
@@ -60,7 +60,7 @@ __global__ void gelu_kernel(const float* input, float* vals, int row_stride, int
 
 __global__ void gelu_kernel(const __half* input, __half* vals, int row_stride, int iterations)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700
     int row = blockIdx.x;
     int id = threadIdx.x;
     int loop_stride = blockDim.x;
@@ -131,7 +131,7 @@ __global__ void fused_bias_gelu(const __half* input,
                                 int row_stride,
                                 int iterations)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700
     int row = blockIdx.x;
     int id = threadIdx.x;
     int loop_stride = blockDim.x;
@@ -214,7 +214,7 @@ __global__ void d_gelu_func(__half* d_output,
                             int row_stride,
                             int iterations)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700
     int row = blockIdx.x;
     int id = threadIdx.x;
     int loop_stride = blockDim.x;
diff --git a/csrc/transformer/general_kernels.cu b/csrc/transformer/general_kernels.cu
index 7d318773f354..180e93ce4dde 100644
--- a/csrc/transformer/general_kernels.cu
+++ b/csrc/transformer/general_kernels.cu
@@ -11,7 +11,10 @@ __global__ void column_sum_reduce(const T* __restrict__ inp,
     __shared__ float tile[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
+    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
+    g.tiled_partition(b, TILE_DIM);
+
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
 
diff --git a/csrc/transformer/inference/csrc/dequantize.cu b/csrc/transformer/inference/csrc/dequantize.cu
index 43d50f00c058..ddf7a958822a 100644
--- a/csrc/transformer/inference/csrc/dequantize.cu
+++ b/csrc/transformer/inference/csrc/dequantize.cu
@@ -46,7 +46,7 @@ __global__ void dequantize_kernel(__half* output,
                                   unsigned groups,
                                   unsigned merge_count)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700
 
     unsigned merge_hidden = hidden_dim >> merge_count;
     unsigned quantization_stride = (merge_hidden * output_size) / groups;
diff --git a/csrc/transformer/inference/csrc/gelu.cu b/csrc/transformer/inference/csrc/gelu.cu
index 1737855f614d..fc3faacc54e8 100755
--- a/csrc/transformer/inference/csrc/gelu.cu
+++ b/csrc/transformer/inference/csrc/gelu.cu
@@ -39,7 +39,7 @@ __global__ void fused_bias_gelu(__half* input,
                                 int total_count,
                                 int intermediate_size)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700
 
     float2* input_cast = reinterpret_cast<float2*>(input);
     const float2* bias_cast = reinterpret_cast<const float2*>(bias);
@@ -117,7 +117,7 @@ __global__ void fused_bias_add(float* input, const float* bias, int total_count,
 
 __global__ void fused_bias_add(__half* input, const __half* bias, int total_count, int hidden_size)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700
 
     float2* input_cast = reinterpret_cast<float2*>(input);
     const float2* bias_cast = reinterpret_cast<const float2*>(bias);
@@ -195,7 +195,7 @@ __global__ void fused_bias_residual(__half* input,
                                     int total_count,
                                     int intermediate_size)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700
 
     float2* input_cast = reinterpret_cast<float2*>(input);
     const float2* residual_cast = reinterpret_cast<const float2*>(residual);
diff --git a/csrc/transformer/inference/csrc/normalize.cu b/csrc/transformer/inference/csrc/normalize.cu
index dc0f6be01144..ecd73154f37f 100755
--- a/csrc/transformer/inference/csrc/normalize.cu
+++ b/csrc/transformer/inference/csrc/normalize.cu
@@ -85,7 +85,7 @@ __global__ void fused_bias_residual_layer_norm(__half* output,
                                                float epsilon,
                                                int row_stride)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700
     int iteration_stride = blockDim.x;
     int iterations = row_stride / iteration_stride;
 
@@ -287,7 +287,7 @@ __global__ void fused_residual_layer_norm(__half* norm,
                                           int row_stride,
                                           bool preLN)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700
     int iteration_stride = blockDim.x;
 
     cg::thread_block b = cg::this_thread_block();
diff --git a/csrc/transformer/inference/csrc/softmax.cu b/csrc/transformer/inference/csrc/softmax.cu
index b347945df636..cee509965106 100644
--- a/csrc/transformer/inference/csrc/softmax.cu
+++ b/csrc/transformer/inference/csrc/softmax.cu
@@ -37,7 +37,7 @@ __global__ void attn_softmax_v2(__half* vals,
                                 int num_seq,
                                 float scale)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700
 
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
diff --git a/csrc/transformer/normalize_kernels.cu b/csrc/transformer/normalize_kernels.cu
index c9bc4a46ee5e..c69c47ebf1c8 100644
--- a/csrc/transformer/normalize_kernels.cu
+++ b/csrc/transformer/normalize_kernels.cu
@@ -28,7 +28,9 @@ __global__ void fused_bias_residual_layer_norm(float* vals,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -121,12 +123,14 @@ __global__ void fused_bias_residual_layer_norm(__half* vals,
                                                __half* means,
                                                int row_stride)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700
     int iteration_stride = blockDim.x;
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
+    //cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
+    g.tiled_partition(b, 32);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -314,7 +318,9 @@ __global__ void fused_bias_residual_layer_norm(float* vals,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
+    //cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
+    g.tiled_partition(b, 32);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -404,13 +410,15 @@ __global__ void fused_bias_residual_layer_norm(__half* vals,
                                                __half* vars,
                                                int row_stride)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700
 
     int iteration_stride = blockDim.x;
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
+    //cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
+    g.tiled_partition(b, 32);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -618,7 +626,9 @@ __global__ void LayerNormBackward1(const T* __restrict__ out_grad,
     __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
+    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
+    g.tiled_partition(b, TILE_DIM);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
     int offset = threadIdx.y * width + idx;
@@ -685,7 +695,9 @@ __global__ void LayerNormBackward1(const T* __restrict__ out_grad,
     __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
+    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
+    g.tiled_partition(b, TILE_DIM);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
     int offset = threadIdx.y * width + idx;
@@ -750,7 +762,9 @@ __global__ void LayerNormBackward2(const float* out_grad,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -850,7 +864,9 @@ __global__ void LayerNormBackward2(const __half* out_grad,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1065,7 +1081,9 @@ __global__ void LayerNormBackward2(const float* out_grad,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1160,7 +1178,9 @@ __global__ void LayerNormBackward2(const __half* out_grad,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1362,7 +1382,9 @@ __global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1,
     __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
+    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
+    g.tiled_partition(b, TILE_DIM);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
     int offset = threadIdx.y * width + idx;
@@ -1424,7 +1446,9 @@ __global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1,
     __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
+    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
+    g.tiled_partition(b, TILE_DIM);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
     int offset = threadIdx.y * width + idx;
@@ -1483,7 +1507,9 @@ __global__ void LayerNormBackward2_fused_add(const float* out_grad1,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1587,7 +1613,9 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1805,7 +1833,9 @@ __global__ void LayerNormBackward2_fused_add(const float* out_grad1,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1908,7 +1938,9 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
diff --git a/csrc/transformer/softmax_kernels.cu b/csrc/transformer/softmax_kernels.cu
index a4d84c37dd3b..18398148f559 100644
--- a/csrc/transformer/softmax_kernels.cu
+++ b/csrc/transformer/softmax_kernels.cu
@@ -20,7 +20,9 @@ __global__ void attn_softmax(float* vals,
     int block_width = blockStride * seq_length;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
+    //cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
+    g.tiled_partition(b, tbSize);
 
     int batch = blockIdx.x;
     int row = blockIdx.y;
@@ -142,7 +144,7 @@ __global__ void attn_softmax(__half* vals,
                              int seq_length,
                              int iterations)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700
     __shared__ float partialSum[MAX_WARP_NUM];
 
     int warp_num = blockDim.x >> 5;
@@ -151,7 +153,9 @@ __global__ void attn_softmax(__half* vals,
     int block_width = blockStride * seq_length;
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
+    //cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
+    g.tiled_partition(b, tbSize);
 
     int batch = blockIdx.x;
     int row = blockIdx.y;
@@ -445,7 +449,9 @@ __global__ void softmax_backward_kernel(T* out_grad, const T* soft_inp, int seq_
                           : MAX_THREAD_ITERATIONS);
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
+    //cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
+    g.tiled_partition(b, tbSize);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -520,7 +526,9 @@ __global__ void softmax_backward_kernel_v2(T* grad /* input & output*/,
     }
 
     cg::thread_block b = cg::this_thread_block();
-    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
 
     for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_xor(sum, i);
 
diff --git a/csrc/transformer/transform_kernels.cu b/csrc/transformer/transform_kernels.cu
index b68d70f67ae1..7d8a27eeeb43 100755
--- a/csrc/transformer/transform_kernels.cu
+++ b/csrc/transformer/transform_kernels.cu
@@ -96,7 +96,7 @@ __global__ void transform_0213<__half>(__half* output,
                                        int heads,
                                        int head_ext)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700
 
     int d0_stride = hidden_dim * seq_length;
     int d1_stride = hidden_dim;
@@ -219,7 +219,7 @@ __global__ void bias_add_transform_0213<__half>(__half* output,
                                                 int heads,
                                                 int head_ext)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700
 
     int d0_stride = hidden_dim * seq_length;
     int d1_stride = hidden_dim;
@@ -289,7 +289,7 @@ __global__ void bias_add_transform_0213_v2(__half* output,
                                            int seq_length,
                                            int heads)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700
     __shared__ float4 in_data[3072];
 
     int d0_stride = hidden_dim * seq_length;
@@ -451,7 +451,7 @@ __global__ void transform4d_0213<__half>(__half* out,
                                          int hidden_dim,
                                          int head_ext)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700
 
     int d0_stride = hidden_dim * (seq_length / head_ext);
     int d1_stride = hidden_dim;
@@ -487,7 +487,7 @@ __global__ void transform4d_0213_v2(__half* out,
                                     int seq_length,
                                     int hidden_dim)
 {
-#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
+#if __CUDA_ARCH__ >= 700
     __shared__ float4 in_data[3072];
 
     int d0_stride = hidden_dim * seq_length;
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 95f9233ff21e..c4383a0eed5d 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -178,6 +178,8 @@ RUN git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed.git ${STAGE_DIR}
 RUN cd ${STAGE_DIR}/DeepSpeed && \
         git checkout . && \
         git checkout master && \
+        cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h /opt/rocm/include/hip/hcc_detail/hip_cooperative_groups.h && \
+        cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h /opt/rocm/include/hip/hcc_detail/hip_cooperative_groups_helper.h && \
         DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_TRANSFORMER=1 DS_BUILD_STOCHASTIC_TRANSFORMER=1 DS_BUILD_UTILS=1 ./install.sh --allow_sudo
 RUN rm -rf ${STAGE_DIR}/DeepSpeed
 RUN cd ~ && python -c "import deepspeed; print(deepspeed.__version__)"

From 909f9bfab1a2a827bb669483ff9acc9ff3f76a4f Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Tue, 29 Jun 2021 19:36:56 +0000
Subject: [PATCH 66/66] Re-Merge pull request #21 from
 rraminen/PR_Revert_HIP_Cooperative_Groups_Workaround

This reverts commit 5b0fac74999ca0fc5fd4e10ecd6bd614eea0343c.
---
 .../hip/hcc_detail/hip_cooperative_groups.h   | 362 ------------------
 .../hip_cooperative_groups_helper.h           | 183 ---------
 csrc/lamb/fused_lamb_cuda_kernel.cu           |  40 --
 csrc/quantization/quantizer.cu                |   8 +-
 csrc/transformer/gelu_kernels.cu              |   6 +-
 csrc/transformer/general_kernels.cu           |   5 +-
 csrc/transformer/inference/csrc/dequantize.cu |   2 +-
 csrc/transformer/inference/csrc/gelu.cu       |   6 +-
 csrc/transformer/inference/csrc/normalize.cu  |   4 +-
 csrc/transformer/inference/csrc/softmax.cu    |   2 +-
 csrc/transformer/normalize_kernels.cu         |  68 +---
 csrc/transformer/softmax_kernels.cu           |  18 +-
 csrc/transformer/transform_kernels.cu         |  10 +-
 docker/Dockerfile.rocm                        |   2 -
 14 files changed, 43 insertions(+), 673 deletions(-)
 delete mode 100644 csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h
 delete mode 100644 csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h

diff --git a/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h b/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h
deleted file mode 100644
index 20e7bb94b8ad..000000000000
--- a/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h
+++ /dev/null
@@ -1,362 +0,0 @@
-/*
-Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-/**
- *  @file  hcc_detail/hip_cooperative_groups.h
- *
- *  @brief Device side implementation of `Cooperative Group` feature.
- *
- *  Defines new types and device API wrappers related to `Cooperative Group`
- *  feature, which the programmer can directly use in his kernel(s) in order to
- *  make use of this feature.
- */
-#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
-#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
-
-//#if __cplusplus
-#if __cplusplus && defined(__clang__) && defined(__HIP__)
-#include <hip/hcc_detail/hip_cooperative_groups_helper.h>
-#include <hip/hcc_detail/device_functions.h>
-namespace cooperative_groups {
-
-/** \brief The base type of all cooperative group types
- *
- *  \details Holds the key properties of a constructed cooperative group type
- *           object, like the group type, its size, etc
- */
-/*
-class thread_group {
- protected:
-  uint32_t _type; // thread_group type
-  uint32_t _size; // total number of threads in the tread_group
-  uint64_t _mask; // Lanemask for coalesced and tiled partitioned group types,
-                  // LSB represents lane 0, and MSB represents lane 63
-
-  // Construct a thread group, and set thread group type and other essential
-  // thread group properties. This generic thread group is directly constructed
-  // only when the group is supposed to contain only the calling the thread
-  // (throurh the API - `this_thread()`), and in all other cases, this thread
-  // group object is a sub-object of some other derived thread group object
-  __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size,
-                                uint64_t mask = (uint64_t)0) {
-    _type = type;
-    _size = size;
-    _mask = mask;
-  }
-
- public:
-  // Total number of threads in the thread group, and this serves the purpose
-  // for all derived cooperative group types since their `size` is directly
-  // saved during the construction
-  __CG_QUALIFIER__ uint32_t size() const {
-    return _size;
-  }
-  // Rank of the calling thread within [0, size())
-  __CG_QUALIFIER__ uint32_t thread_rank() const;
-  // Is this cooperative group type valid?
-  __CG_QUALIFIER__ bool is_valid() const;
-  // synchronize the threads in the thread group
-  __CG_QUALIFIER__ void sync() const;
-};
-*/
-
-class thread_group {
- protected:
-    bool _tiled_partition; // this_thread_block() constructor sets to false
-    uint32_t _size;            // this_thread_block() constructor sets to size()
-    uint32_t local_rank;      // this_thread_block() constructor sets to thread_rank()
-    uint32_t _mask;
-    uint32_t _type;
- public:
-    __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t group_size,
-                                uint64_t mask = (uint64_t)0) {
-    _type = type;
-    _size = group_size;
-    _mask = mask;
-    local_rank = internal::workgroup::thread_rank(); 
-    }
-
-    __CG_QUALIFIER__ void tiled_partition(const thread_group& parent,
-                                               unsigned int tile_size) {
-        if ( (ceil(log2(tile_size)) == floor(log2(tile_size))) || tile_size == 0 ||
-            tile_size > 64 || parent.size() < tile_size)
-            _tiled_partition =  false;
-            //xxx : abort
-        _tiled_partition = true;
-        _size = tile_size;
-        local_rank = parent.thread_rank() % tile_size;
-    }
-    __CG_QUALIFIER__ void sync() const; 
-    __CG_QUALIFIER__ uint32_t size() const {
-        return _size;
-    }
-    __CG_QUALIFIER__ uint32_t thread_rank() const; 
-    __CG_QUALIFIER__ float shfl_down(float var, unsigned int delta) const {
-        return (__shfl_down(var, delta, _size));
-    }
-     __CG_QUALIFIER__ float shfl_xor(float var, int mask) const {
-        return (__shfl_xor(var, mask, _size));
-    }
-    __CG_QUALIFIER__ float shfl(float var, unsigned int src_lane) const {
-        return (__shfl(var, src_lane, _size));
-    }
-    __CG_QUALIFIER__ bool is_valid() const;
-
-};
-
-/** \brief The multi-grid cooperative group type
- *
- *  \details Represents an inter-device cooperative group type where the
- *           participating threads within the group spans across multple
- *           devices, running the (same) kernel on these devices
- */
-class multi_grid_group : public thread_group {
-  // Only these friend functions are allowed to construct an object of this class
-  // and access its resources
-  friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();
-
- protected:
-  // Construct mutli-grid thread group (through the API this_multi_grid())
-  explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
-      : thread_group(internal::cg_multi_grid, size) { }
-
- public:
-  // Number of invocations participating in this multi-grid group. In other
-  // words, the number of GPUs
-	__CG_QUALIFIER__ uint32_t num_grids() {
-    return internal::multi_grid::num_grids();
-  }
-  // Rank of this invocation. In other words, an ID number within the range
-  // [0, num_grids()) of the GPU, this kernel is running on
-	__CG_QUALIFIER__ uint32_t grid_rank() {
-    return internal::multi_grid::grid_rank();
-  }
-  __CG_QUALIFIER__ uint32_t thread_rank() const {
-    return internal::multi_grid::thread_rank();
-  }
-  __CG_QUALIFIER__ bool is_valid() const {
-    return internal::multi_grid::is_valid();
-  }
-  __CG_QUALIFIER__ void sync() const {
-    internal::multi_grid::sync();
-  }
-};
-
-/** \brief User exposed API interface to construct multi-grid cooperative
- *         group type object - `multi_grid_group`
- *
- *  \details User is not allowed to directly construct an object of type
- *           `multi_grid_group`. Instead, he should construct it through this
- *           API function
- */
-__CG_QUALIFIER__ multi_grid_group
-this_multi_grid() {
-  return multi_grid_group(internal::multi_grid::size());
-}
-
-/** \brief The grid cooperative group type
- *
- *  \details Represents an inter-workgroup cooperative group type where the
- *           participating threads within the group spans across multiple
- *           workgroups running the (same) kernel on the same device
- */
-class grid_group : public thread_group {
-  // Only these friend functions are allowed to construct an object of this class
-  // and access its resources
-  friend __CG_QUALIFIER__ grid_group this_grid();
-
- protected:
-  // Construct grid thread group (through the API this_grid())
-  explicit __CG_QUALIFIER__ grid_group(uint32_t size)
-      : thread_group(internal::cg_grid, size) { }
-
- public:
-  __CG_QUALIFIER__ uint32_t thread_rank() const {
-    return internal::grid::thread_rank();
-  }
-  __CG_QUALIFIER__ bool is_valid() const {
-    return internal::grid::is_valid();
-  }
-  __CG_QUALIFIER__ void sync() const {
-    internal::grid::sync();
-  }
-};
-
-/** \brief User exposed API interface to construct grid cooperative group type
- *         object - `grid_group`
- *
- *  \details User is not allowed to directly construct an object of type
- *           `multi_grid_group`. Instead, he should construct it through this
- *           API function
- */
-__CG_QUALIFIER__ grid_group
-this_grid() {
-  return grid_group(internal::grid::size());
-}
-
-/** \brief The workgroup (thread-block in CUDA terminology) cooperative group
- *         type
- *
- *  \details Represents an intra-workgroup cooperative group type where the
- *           participating threads within the group are exctly the same threads
- *           which are participated in the currently executing `workgroup`
- */
-class thread_block : public thread_group {
-  // Only these friend functions are allowed to construct an object of this
-  // class and access its resources
-  friend __CG_QUALIFIER__ thread_block this_thread_block();
-
- protected:
-  // Construct a workgroup thread group (through the API this_thread_block())
-  explicit __CG_QUALIFIER__ thread_block(uint32_t size)
-      : thread_group(internal::cg_workgroup, size) { }
-
- public:
-  // 3-dimensional block index within the grid
-  __CG_QUALIFIER__ dim3 group_index() {
-    return internal::workgroup::group_index();
-  }
-  // 3-dimensional thread index within the block
-  __CG_QUALIFIER__ dim3 thread_index() {
-    return internal::workgroup::thread_index();
-  }
-  __CG_QUALIFIER__ uint32_t thread_rank() const {
-    return internal::workgroup::thread_rank();
-  }
-  __CG_QUALIFIER__ bool is_valid() const {
-    return internal::workgroup::is_valid();
-  }
-  __CG_QUALIFIER__ void sync() const {
-    internal::workgroup::sync();
-  }
-};
-
-/** \brief User exposed API interface to construct workgroup cooperative
- *         group type object - `thread_block`
- *
- *  \details User is not allowed to directly construct an object of type
- *           `thread_block`. Instead, he should construct it through this API
- *           function
- */
-__CG_QUALIFIER__ thread_block
-this_thread_block() {
-  return thread_block(internal::workgroup::size());
-}
-
-/**
- *  Implemenation of all publicly exposed base class APIs
- */
-__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
-  switch (this->_type) {
-    case internal::cg_multi_grid: {
-      return (static_cast<const multi_grid_group*>(this)->thread_rank());
-    }
-    case internal::cg_grid: {
-      return (static_cast<const grid_group*>(this)->thread_rank());
-    }
-    case internal::cg_workgroup: {
-      return (static_cast<const thread_block*>(this)->thread_rank());
-    }
-    case internal::cg_coalesced_tile: {
-      return local_rank;
-    }
-    default: {
-      assert(false && "invalid cooperative group type");
-      return -1;
-    }
-  }
-}
-
-__CG_QUALIFIER__ bool thread_group::is_valid() const {
-  switch (this->_type) {
-    case internal::cg_multi_grid: {
-      return (static_cast<const multi_grid_group*>(this)->is_valid());
-    }
-    case internal::cg_grid: {
-      return (static_cast<const grid_group*>(this)->is_valid());
-    }
-    case internal::cg_workgroup: {
-      return (static_cast<const thread_block*>(this)->is_valid());
-    }
-    case internal::cg_coalesced_tile: {
-      return _tiled_partition;
-    }
-    default: {
-      assert(false && "invalid cooperative group type");
-      return false;
-    }
-  }
-}
-
-__CG_QUALIFIER__ void thread_group::sync() const {
-  switch (this->_type) {
-    case internal::cg_multi_grid: {
-      static_cast<const multi_grid_group*>(this)->sync();
-      break;
-    }
-    case internal::cg_grid: {
-      static_cast<const grid_group*>(this)->sync();
-      break;
-    }
-    case internal::cg_workgroup: {
-      static_cast<const thread_block*>(this)->sync();
-      break;
-    }
-    case internal::cg_coalesced_tile: {
-      if (!_tiled_partition) // If in a tiled partition, this is a no-op
-            __syncthreads();
-      break;
-    }
-    default: {
-      assert(false && "invalid cooperative group type");
-    }
-  }
-}
-
-/**
- *  Implemenation of publicly exposed `wrapper` APIs on top of basic cooperative
- *  group type APIs
- */
-template <class CGTy>
-__CG_QUALIFIER__ uint32_t group_size(CGTy const &g) {
-  return g.size();
-}
-
-template <class CGTy>
-__CG_QUALIFIER__ uint32_t thread_rank(CGTy const &g) {
-  return g.thread_rank();
-}
-
-template <class CGTy>
-__CG_QUALIFIER__ bool is_valid(CGTy const &g) {
-  return g.is_valid();
-}
-
-template <class CGTy>
-__CG_QUALIFIER__ void sync(CGTy const &g) {
-  g.sync();
-}
-
-} // namespace cooperative_groups
-
-#endif // __cplusplus
-#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
diff --git a/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h b/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h
deleted file mode 100644
index 7f8e69da11c3..000000000000
--- a/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
-Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-/**
- *  @file  hcc_detail/hip_cooperative_groups_helper.h
- *
- *  @brief Device side implementation of cooperative group feature.
- *
- *  Defines helper constructs and APIs which aid the types and device API
- *  wrappers defined within `hcc_detail/hip_cooperative_groups.h`.
- */
-#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
-#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
-
-#if __cplusplus
-#include <hip/hcc_detail/hip_runtime_api.h>
-#include <hip/hcc_detail/device_functions.h>
-
-#if !defined(__align__)
-#define __align__(x) __attribute__((aligned(x)))
-#endif
-
-#if !defined(__CG_QUALIFIER__)
-#define __CG_QUALIFIER__ __device__ __forceinline__
-#endif
-
-#if !defined(__CG_STATIC_QUALIFIER__)
-#define __CG_STATIC_QUALIFIER__ __device__ static __forceinline__
-#endif
-
-#if !defined(WAVEFRONT_SIZE)
-#define WAVEFRONT_SIZE 64
-#endif
-
-namespace cooperative_groups {
-
-namespace internal {
-
-/** \brief Enums representing different cooperative group types
- */
-typedef enum {
-  cg_invalid,
-  cg_multi_grid,
-  cg_grid,
-  cg_workgroup,
-  cg_coalesced_tile
-} group_type;
-
-/**
- *  Functionalities related to multi-grid cooperative group type
- */
-namespace multi_grid {
-
-__CG_STATIC_QUALIFIER__ uint32_t num_grids() {
-  return (uint32_t)__ockl_multi_grid_num_grids();
-}
-
-__CG_STATIC_QUALIFIER__ uint32_t grid_rank() {
-  return (uint32_t)__ockl_multi_grid_grid_rank();
-}
-
-__CG_STATIC_QUALIFIER__ uint32_t size() {
-  return (uint32_t)__ockl_multi_grid_size();
-}
-
-__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
-  return (uint32_t)__ockl_multi_grid_thread_rank();
-}
-
-__CG_STATIC_QUALIFIER__ bool is_valid() {
-  return (bool)__ockl_multi_grid_is_valid();
-}
-
-__CG_STATIC_QUALIFIER__ void sync() {
-  __ockl_multi_grid_sync();
-}
-
-} // namespace multi_grid
-
-/**
- *  Functionalities related to grid cooperative group type
- */
-namespace grid {
-
-__CG_STATIC_QUALIFIER__ uint32_t size() {
-  return (uint32_t)((hipBlockDim_z * hipGridDim_z) *
-                    (hipBlockDim_y * hipGridDim_y) *
-                    (hipBlockDim_x * hipGridDim_x));
-}
-
-__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
-  // Compute global id of the workgroup to which the current thread belongs to
-  uint32_t blkIdx =
-           (uint32_t)((hipBlockIdx_z * hipGridDim_y * hipGridDim_x) +
-                      (hipBlockIdx_y * hipGridDim_x) +
-                      (hipBlockIdx_x));
-
-  // Compute total number of threads being passed to reach current workgroup
-  // within grid
-  uint32_t num_threads_till_current_workgroup =
-           (uint32_t)(blkIdx * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));
-
-  // Compute thread local rank within current workgroup
-  uint32_t local_thread_rank =
-           (uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) +
-                      (hipThreadIdx_y * hipBlockDim_x) +
-                      (hipThreadIdx_x));
-
-  return (num_threads_till_current_workgroup + local_thread_rank);
-}
-
-__CG_STATIC_QUALIFIER__ bool is_valid() {
-  return (bool)__ockl_grid_is_valid();
-}
-
-__CG_STATIC_QUALIFIER__ void sync() {
-  __ockl_grid_sync();
-}
-
-} // namespace grid
-
-/**
- *  Functionalities related to `workgroup` (thread_block in CUDA terminology)
- *  cooperative group type
- */
-namespace workgroup {
-
-__CG_STATIC_QUALIFIER__ dim3 group_index() {
-  return (dim3((uint32_t)hipBlockIdx_x, (uint32_t)hipBlockIdx_y,
-               (uint32_t)hipBlockIdx_z));
-}
-
-__CG_STATIC_QUALIFIER__ dim3 thread_index() {
-  return (dim3((uint32_t)hipThreadIdx_x, (uint32_t)hipThreadIdx_y,
-               (uint32_t)hipThreadIdx_z));
-}
-
-__CG_STATIC_QUALIFIER__ uint32_t size() {
-  return((uint32_t)(hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));
-}
-
-__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
- return ((uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) +
-                    (hipThreadIdx_y * hipBlockDim_x) +
-                    (hipThreadIdx_x)));
-}
-
-__CG_STATIC_QUALIFIER__ bool is_valid() {
-   //TODO(mahesha) any functionality need to be added here? I believe not
-  return true;
-}
-
-__CG_STATIC_QUALIFIER__ void sync() {
-  __syncthreads();
-}
-
-} // namespace workgroup
-
-} // namespace internal
-
-} // namespace cooperative_groups
-
-#endif // __cplusplus
-#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
diff --git a/csrc/lamb/fused_lamb_cuda_kernel.cu b/csrc/lamb/fused_lamb_cuda_kernel.cu
index e12b2c8585b4..10a17e98a13d 100644
--- a/csrc/lamb/fused_lamb_cuda_kernel.cu
+++ b/csrc/lamb/fused_lamb_cuda_kernel.cu
@@ -82,11 +82,7 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
     T a_sum = s_a[tid];
     T b_sum = s_b[tid];
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     // do reduction in shared mem
     if ((blockSize >= 512) && (tid < 256)) {
@@ -94,33 +90,21 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
         s_b[tid] = b_sum = b_sum + s_b[tid + 256];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 256) && (tid < 128)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 128];
         s_b[tid] = b_sum = b_sum + s_b[tid + 128];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 128) && (tid < 64)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 64];
         s_b[tid] = b_sum = b_sum + s_b[tid + 64];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
 #if (__CUDA_ARCH__ >= 300)
     if (tid < 32) {
@@ -144,66 +128,42 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
         s_b[tid] = b_sum = b_sum + s_b[tid + 32];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 32) && (tid < 16)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 16];
         s_b[tid] = b_sum = b_sum + s_b[tid + 16];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 16) && (tid < 8)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 8];
         s_b[tid] = b_sum = b_sum + s_b[tid + 8];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 8) && (tid < 4)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 4];
         s_b[tid] = b_sum = b_sum + s_b[tid + 4];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 4) && (tid < 2)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 2];
         s_b[tid] = b_sum = b_sum + s_b[tid + 2];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 2) && (tid < 1)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 1];
         s_b[tid] = b_sum = b_sum + s_b[tid + 1];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
 #endif
 
diff --git a/csrc/quantization/quantizer.cu b/csrc/quantization/quantizer.cu
index c48ae38969e3..f79c3ecb1e12 100644
--- a/csrc/quantization/quantizer.cu
+++ b/csrc/quantization/quantizer.cu
@@ -5,7 +5,7 @@ namespace cg = cooperative_groups;
 
 __global__ void qunatize_kernel(__half* vals, int group_size, int num_bits)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@@ -206,7 +206,7 @@ __global__ void sr_qunatize_kernel(__half* vals,
                                    int num_bits,
                                    std::pair<uint64_t, uint64_t> seed)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@@ -484,7 +484,7 @@ template void launch_sr_qunatize_kernel(__half* vals,
 
 __global__ void qunatize_kernel_asym(__half* vals, int group_size, int num_bits)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@@ -729,7 +729,7 @@ __global__ void sr_qunatize_kernel_asym(__half* vals,
                                         int num_bits,
                                         std::pair<uint64_t, uint64_t> seed)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
diff --git a/csrc/transformer/gelu_kernels.cu b/csrc/transformer/gelu_kernels.cu
index 12048006266e..dbb8828ce977 100644
--- a/csrc/transformer/gelu_kernels.cu
+++ b/csrc/transformer/gelu_kernels.cu
@@ -60,7 +60,7 @@ __global__ void gelu_kernel(const float* input, float* vals, int row_stride, int
 
 __global__ void gelu_kernel(const __half* input, __half* vals, int row_stride, int iterations)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     int row = blockIdx.x;
     int id = threadIdx.x;
     int loop_stride = blockDim.x;
@@ -131,7 +131,7 @@ __global__ void fused_bias_gelu(const __half* input,
                                 int row_stride,
                                 int iterations)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     int row = blockIdx.x;
     int id = threadIdx.x;
     int loop_stride = blockDim.x;
@@ -214,7 +214,7 @@ __global__ void d_gelu_func(__half* d_output,
                             int row_stride,
                             int iterations)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     int row = blockIdx.x;
     int id = threadIdx.x;
     int loop_stride = blockDim.x;
diff --git a/csrc/transformer/general_kernels.cu b/csrc/transformer/general_kernels.cu
index 180e93ce4dde..7d318773f354 100644
--- a/csrc/transformer/general_kernels.cu
+++ b/csrc/transformer/general_kernels.cu
@@ -11,10 +11,7 @@ __global__ void column_sum_reduce(const T* __restrict__ inp,
     __shared__ float tile[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
-    g.tiled_partition(b, TILE_DIM);
-
+    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
 
diff --git a/csrc/transformer/inference/csrc/dequantize.cu b/csrc/transformer/inference/csrc/dequantize.cu
index ddf7a958822a..43d50f00c058 100644
--- a/csrc/transformer/inference/csrc/dequantize.cu
+++ b/csrc/transformer/inference/csrc/dequantize.cu
@@ -46,7 +46,7 @@ __global__ void dequantize_kernel(__half* output,
                                   unsigned groups,
                                   unsigned merge_count)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     unsigned merge_hidden = hidden_dim >> merge_count;
     unsigned quantization_stride = (merge_hidden * output_size) / groups;
diff --git a/csrc/transformer/inference/csrc/gelu.cu b/csrc/transformer/inference/csrc/gelu.cu
index fc3faacc54e8..1737855f614d 100755
--- a/csrc/transformer/inference/csrc/gelu.cu
+++ b/csrc/transformer/inference/csrc/gelu.cu
@@ -39,7 +39,7 @@ __global__ void fused_bias_gelu(__half* input,
                                 int total_count,
                                 int intermediate_size)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     float2* input_cast = reinterpret_cast<float2*>(input);
     const float2* bias_cast = reinterpret_cast<const float2*>(bias);
@@ -117,7 +117,7 @@ __global__ void fused_bias_add(float* input, const float* bias, int total_count,
 
 __global__ void fused_bias_add(__half* input, const __half* bias, int total_count, int hidden_size)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     float2* input_cast = reinterpret_cast<float2*>(input);
     const float2* bias_cast = reinterpret_cast<const float2*>(bias);
@@ -195,7 +195,7 @@ __global__ void fused_bias_residual(__half* input,
                                     int total_count,
                                     int intermediate_size)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     float2* input_cast = reinterpret_cast<float2*>(input);
     const float2* residual_cast = reinterpret_cast<const float2*>(residual);
diff --git a/csrc/transformer/inference/csrc/normalize.cu b/csrc/transformer/inference/csrc/normalize.cu
index ecd73154f37f..dc0f6be01144 100755
--- a/csrc/transformer/inference/csrc/normalize.cu
+++ b/csrc/transformer/inference/csrc/normalize.cu
@@ -85,7 +85,7 @@ __global__ void fused_bias_residual_layer_norm(__half* output,
                                                float epsilon,
                                                int row_stride)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     int iteration_stride = blockDim.x;
     int iterations = row_stride / iteration_stride;
 
@@ -287,7 +287,7 @@ __global__ void fused_residual_layer_norm(__half* norm,
                                           int row_stride,
                                           bool preLN)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     int iteration_stride = blockDim.x;
 
     cg::thread_block b = cg::this_thread_block();
diff --git a/csrc/transformer/inference/csrc/softmax.cu b/csrc/transformer/inference/csrc/softmax.cu
index cee509965106..b347945df636 100644
--- a/csrc/transformer/inference/csrc/softmax.cu
+++ b/csrc/transformer/inference/csrc/softmax.cu
@@ -37,7 +37,7 @@ __global__ void attn_softmax_v2(__half* vals,
                                 int num_seq,
                                 float scale)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
diff --git a/csrc/transformer/normalize_kernels.cu b/csrc/transformer/normalize_kernels.cu
index c69c47ebf1c8..c9bc4a46ee5e 100644
--- a/csrc/transformer/normalize_kernels.cu
+++ b/csrc/transformer/normalize_kernels.cu
@@ -28,9 +28,7 @@ __global__ void fused_bias_residual_layer_norm(float* vals,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -123,14 +121,12 @@ __global__ void fused_bias_residual_layer_norm(__half* vals,
                                                __half* means,
                                                int row_stride)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     int iteration_stride = blockDim.x;
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
-    g.tiled_partition(b, 32);
+    cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -318,9 +314,7 @@ __global__ void fused_bias_residual_layer_norm(float* vals,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
-    g.tiled_partition(b, 32);
+    cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -410,15 +404,13 @@ __global__ void fused_bias_residual_layer_norm(__half* vals,
                                                __half* vars,
                                                int row_stride)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     int iteration_stride = blockDim.x;
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
-    g.tiled_partition(b, 32);
+    cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -626,9 +618,7 @@ __global__ void LayerNormBackward1(const T* __restrict__ out_grad,
     __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
-    g.tiled_partition(b, TILE_DIM);
+    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
     int offset = threadIdx.y * width + idx;
@@ -695,9 +685,7 @@ __global__ void LayerNormBackward1(const T* __restrict__ out_grad,
     __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
-    g.tiled_partition(b, TILE_DIM);
+    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
     int offset = threadIdx.y * width + idx;
@@ -762,9 +750,7 @@ __global__ void LayerNormBackward2(const float* out_grad,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -864,9 +850,7 @@ __global__ void LayerNormBackward2(const __half* out_grad,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1081,9 +1065,7 @@ __global__ void LayerNormBackward2(const float* out_grad,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1178,9 +1160,7 @@ __global__ void LayerNormBackward2(const __half* out_grad,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1382,9 +1362,7 @@ __global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1,
     __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
-    g.tiled_partition(b, TILE_DIM);
+    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
     int offset = threadIdx.y * width + idx;
@@ -1446,9 +1424,7 @@ __global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1,
     __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
-    g.tiled_partition(b, TILE_DIM);
+    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
     int offset = threadIdx.y * width + idx;
@@ -1507,9 +1483,7 @@ __global__ void LayerNormBackward2_fused_add(const float* out_grad1,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1613,9 +1587,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1833,9 +1805,7 @@ __global__ void LayerNormBackward2_fused_add(const float* out_grad1,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1938,9 +1908,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
diff --git a/csrc/transformer/softmax_kernels.cu b/csrc/transformer/softmax_kernels.cu
index 18398148f559..a4d84c37dd3b 100644
--- a/csrc/transformer/softmax_kernels.cu
+++ b/csrc/transformer/softmax_kernels.cu
@@ -20,9 +20,7 @@ __global__ void attn_softmax(float* vals,
     int block_width = blockStride * seq_length;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
-    g.tiled_partition(b, tbSize);
+    cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
 
     int batch = blockIdx.x;
     int row = blockIdx.y;
@@ -144,7 +142,7 @@ __global__ void attn_softmax(__half* vals,
                              int seq_length,
                              int iterations)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     __shared__ float partialSum[MAX_WARP_NUM];
 
     int warp_num = blockDim.x >> 5;
@@ -153,9 +151,7 @@ __global__ void attn_softmax(__half* vals,
     int block_width = blockStride * seq_length;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
-    g.tiled_partition(b, tbSize);
+    cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
 
     int batch = blockIdx.x;
     int row = blockIdx.y;
@@ -449,9 +445,7 @@ __global__ void softmax_backward_kernel(T* out_grad, const T* soft_inp, int seq_
                           : MAX_THREAD_ITERATIONS);
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
-    g.tiled_partition(b, tbSize);
+    cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -526,9 +520,7 @@ __global__ void softmax_backward_kernel_v2(T* grad /* input & output*/,
     }
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_xor(sum, i);
 
diff --git a/csrc/transformer/transform_kernels.cu b/csrc/transformer/transform_kernels.cu
index 7d8a27eeeb43..b68d70f67ae1 100755
--- a/csrc/transformer/transform_kernels.cu
+++ b/csrc/transformer/transform_kernels.cu
@@ -96,7 +96,7 @@ __global__ void transform_0213<__half>(__half* output,
                                        int heads,
                                        int head_ext)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     int d0_stride = hidden_dim * seq_length;
     int d1_stride = hidden_dim;
@@ -219,7 +219,7 @@ __global__ void bias_add_transform_0213<__half>(__half* output,
                                                 int heads,
                                                 int head_ext)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     int d0_stride = hidden_dim * seq_length;
     int d1_stride = hidden_dim;
@@ -289,7 +289,7 @@ __global__ void bias_add_transform_0213_v2(__half* output,
                                            int seq_length,
                                            int heads)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     __shared__ float4 in_data[3072];
 
     int d0_stride = hidden_dim * seq_length;
@@ -451,7 +451,7 @@ __global__ void transform4d_0213<__half>(__half* out,
                                          int hidden_dim,
                                          int head_ext)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     int d0_stride = hidden_dim * (seq_length / head_ext);
     int d1_stride = hidden_dim;
@@ -487,7 +487,7 @@ __global__ void transform4d_0213_v2(__half* out,
                                     int seq_length,
                                     int hidden_dim)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     __shared__ float4 in_data[3072];
 
     int d0_stride = hidden_dim * seq_length;
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index c4383a0eed5d..95f9233ff21e 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -178,8 +178,6 @@ RUN git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed.git ${STAGE_DIR}
 RUN cd ${STAGE_DIR}/DeepSpeed && \
         git checkout . && \
         git checkout master && \
-        cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h /opt/rocm/include/hip/hcc_detail/hip_cooperative_groups.h && \
-        cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h /opt/rocm/include/hip/hcc_detail/hip_cooperative_groups_helper.h && \
         DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_TRANSFORMER=1 DS_BUILD_STOCHASTIC_TRANSFORMER=1 DS_BUILD_UTILS=1 ./install.sh --allow_sudo
 RUN rm -rf ${STAGE_DIR}/DeepSpeed
 RUN cd ~ && python -c "import deepspeed; print(deepspeed.__version__)"