[DON'T MERGE] Revert hip cooperative groups workaround [REDUX] #22

Draft · wants to merge 79 commits into base: master

79 commits
513f861
1. Disable third_party_install so flow doesn't automatically build AP…
jithunnair-amd May 5, 2020
ed421e9
Update setup.py to hipify before building extension
jithunnair-amd May 6, 2020
e82fa34
Cooperative groups is not supported by HIP yet, so replace with worka…
jithunnair-amd May 6, 2020
bdb8421
Use ROCm APEX
jithunnair-amd May 8, 2020
7c0e6ac
Update ROCm APEX commit to get FusedLayerNorm and hipification fixes
jithunnair-amd May 8, 2020
23200d4
Update requirements to use tensorflow-rocm package instead of tensorf…
jithunnair-amd May 8, 2020
c10bdcb
Use DeepSpeedExamples fork
jithunnair-amd May 8, 2020
43212b3
Typo
jithunnair-amd May 8, 2020
54ad8a5
Use changes_for_rocm_build branch for jithunnair-amd fork of DeepSpee…
jithunnair-amd May 16, 2020
453d501
Update ROCm APEX commit
jithunnair-amd May 28, 2020
4454bc2
Update DeepSpeedExamples commit
jithunnair-amd May 29, 2020
db28f75
Update ROCm Apex commit
jithunnair-amd Jun 25, 2020
30f661e
Merge branch upstream into master
jithunnair-amd Sep 15, 2020
077638d
Enable cooperative groups for ROCm
jithunnair-amd Sep 15, 2020
66c135e
Update setup.py to build lamb extension for ROCm
jithunnair-amd Sep 15, 2020
9379918
Do not install torch and torchvision for ROCm using pip
jithunnair-amd Sep 15, 2020
b5866a6
Use ROCm fork of DeepSpeedExamples
jithunnair-amd Sep 16, 2020
9c624c2
Update DeepSpeedExamples commit to use ROCm fork master branch
jithunnair-amd Sep 16, 2020
ab6aca1
Update DeepSpeedExamples commit
jithunnair-amd Sep 26, 2020
884f08e
ROCm PyTorch can be installed in the user local area in some cases
jithunnair-amd Sep 26, 2020
17febe5
Remove requirements.txt since upstream moved it to requirements folder
jithunnair-amd Sep 29, 2020
46d64e2
Add Dockerfile for ROCm
jithunnair-amd Sep 30, 2020
c2d4cc0
Add skips for unit tests that fail on ROCm. Current status: 72 passed…
jithunnair-amd Sep 30, 2020
9f0c80d
Enable CPU adam extension for ROCm
jithunnair-amd Oct 19, 2020
cb3f83a
Install requirements as appropriate for ROCm
jithunnair-amd Oct 27, 2020
617027f
Skip additional unit tests that fail on CI (but not locally)
jithunnair-amd Oct 28, 2020
a508e62
Do not skip unit tests which pass with latest PyTorch
jithunnair-amd Nov 3, 2020
3dd5e2d
Modify include files to build CPU Adam extension
jithunnair-amd Nov 3, 2020
77cd5c3
Update setup.py for latest hipify
jithunnair-amd Dec 16, 2020
7f9bbeb
Update CPU Adam header files to remove ifdefing unnecessary with late…
jithunnair-amd Dec 16, 2020
ea71005
Hipified transformer kernel extensions
jithunnair-amd Dec 23, 2020
fbddd93
Cooperative Groups workaround for transformer kernels extension
jithunnair-amd Dec 23, 2020
9091b20
Update apex commit
jithunnair-amd Jan 7, 2021
3edda06
Merge from upstream; resolve conflicts; checkout 'theirs' for tests/u…
jithunnair-amd Mar 26, 2021
5e6bb85
Integrate op_builder from upstream and update for ROCm
jithunnair-amd Mar 26, 2021
67ed124
Update Dockerfile.rocm
jithunnair-amd Mar 27, 2021
c4fe427
Temporary hacks to workaround: 1) setup.py issues on ROCm wrt. absolu…
jithunnair-amd Mar 27, 2021
74ebc97
torch.version.cuda doesn't exist for ROCm PyTorch
jithunnair-amd Mar 27, 2021
1bb74d0
Add hip_version
jithunnair-amd Mar 29, 2021
3d4e19d
Check hip version for ROCm builds
jithunnair-amd Mar 29, 2021
9939bd7
Remove unused dir
jithunnair-amd Mar 31, 2021
99571e5
Skipped the tests with the error,
rraminen Apr 8, 2021
9d8ad53
Updated Dockerfile.rocm
rraminen Apr 9, 2021
e323eab
Merge pull request #5 from ROCmSoftwarePlatform/Dockerfile.rocm_PR
jithunnair-amd Apr 10, 2021
529ebcd
Update skipIfRocm to add customizable reason string (#6)
jithunnair-amd Apr 12, 2021
37651f3
Disable AVX512 for ROCm to enable same build of DeepSpeed to work on …
jithunnair-amd Apr 13, 2021
7be71d3
Update headers and include_dirs to enable transformer extension (#8)
jithunnair-amd Apr 19, 2021
1c69737
Add patched CG headers to rocm install path (#9)
jithunnair-amd Apr 19, 2021
ac4f8d5
Update DeepSpeedExamples commit (#10)
jithunnair-amd Apr 19, 2021
14204ab
Update DeepSpeedExamples commit
jithunnair-amd Apr 21, 2021
827ebfb
Update DeepSpeedExamples commit
jithunnair-amd Apr 22, 2021
2f77a87
v0.3.15 IFU
rraminen Apr 28, 2021
0d06e02
Merge pull request #12 from rraminen/IFU
jeffdaily Apr 28, 2021
3f2657f
Add Github Actions ifu.yml
jithunnair-amd May 11, 2021
9b41aa7
Update ifu.yml to ignore DeepSpeedExamples
jithunnair-amd May 12, 2021
497f5a1
Merge remote-tracking branch 'upstream/master' into IFU-master-2021-0…
jithunnair-amd May 12, 2021
b1563d6
Merge pull request #13 from ROCmSoftwarePlatform/IFU-master-2021-05-12
jithunnair-amd May 12, 2021
2066405
Update DeepSpeedExamples commit
jithunnair-amd May 12, 2021
0a87051
Merge remote-tracking branch 'upstream/master'
invalid-email-address May 17, 2021
e827515
Use branch name in PR title/branch name
jithunnair-amd May 17, 2021
ae10359
Merge pull request #14 from ROCmSoftwarePlatform/IFU-master-2021-05-17
jithunnair-amd May 17, 2021
4c7a252
Add email functionality
jithunnair-amd May 21, 2021
7b900de
IFU-master-2021-05-27
rraminen May 28, 2021
5de081e
Pointed DeepSpeedExamples to latest commit after IFU
rraminen Jun 4, 2021
1850f88
Merge pull request #17 from rraminen/IFU_5_27
jithunnair-amd Jun 4, 2021
d296665
Revert "Add patched CG headers to rocm install path (#9)"
rraminen Jun 23, 2021
f50fa7b
Revert "Update headers and include_dirs to enable transformer extensi…
rraminen Jun 23, 2021
2585f29
Added back the required code from the commits, 1c69737e1a8a8ae5ed9d29…
rraminen Jun 23, 2021
0be9645
Revert "Cooperative Groups workaround for transformer kernels extension"
rraminen Jun 23, 2021
f428da5
Added defined(__HIP_PLATFORM_HCC__) to kernels code
rraminen Jun 23, 2021
ed2ee34
Revert "Enable cooperative groups for ROCm"
rraminen Jun 23, 2021
742fd64
Enable cooperative groups for ROCm
rraminen Jun 23, 2021
1d20b14
Added CuPy installation from source
rraminen Jun 24, 2021
f6c79ae
Added h5py installation
rraminen Jun 24, 2021
0cf3306
Merge pull request #20 from rraminen/PR_Update_Dockerfile
jithunnair-amd Jun 28, 2021
81b744e
hip cooperative groups functionality for coalesced_group in fused_lam…
rraminen Jun 28, 2021
bf2979b
Merge pull request #21 from rraminen/PR_Revert_HIP_Cooperative_Groups…
jithunnair-amd Jun 29, 2021
5b0fac7
Revert "Merge pull request #21 from rraminen/PR_Revert_HIP_Cooperativ…
jithunnair-amd Jun 29, 2021
909f9bf
Re-Merge pull request #21 from rraminen/PR_Revert_HIP_Cooperative_Gro…
jithunnair-amd Jun 29, 2021
63 changes: 63 additions & 0 deletions .github/workflows/ifu.yml
@@ -0,0 +1,63 @@
name: IntegrateFromUpstream
on:
# schedule:
# # verified via crontab.guru website. “At 06:55 on Monday.”
# - cron: '55 6 * * 1'
workflow_dispatch:
inputs:
message:
description: 'Reason for manual trigger'
required: false
default: 'refresh branch'
jobs:
IntegrateFromUpstream:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Get Current Date
id: date
run: echo "::set-output name=date::$(date +'%Y-%m-%d')"
- name: Extract branch name
id: extract_branch
shell: bash
run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})"
- name: Fetch and Merge
id: fetch_and_merge
run: |
echo "Reason for trigger: ${{ github.event.inputs.message }}"
echo "Actor for trigger: ${{ github.actor }}"
git config user.name github-actions
git config user.email [email protected]
git remote add upstream https://github.com/microsoft/DeepSpeed
git fetch upstream master
git merge upstream/master
# Since we use our own fork of DeepSpeedExamples, ignore theirs
git checkout HEAD DeepSpeedExamples
- name: Create Pull Request
id: create_pull_request
uses: jithunnair-amd/create-pull-request@v3
with:
# token: ${{ secrets.PAT }}
branch: IFU-${{ steps.extract_branch.outputs.branch }}-${{ steps.date.outputs.date }}
title: IFU-${{ steps.extract_branch.outputs.branch }}-${{ steps.date.outputs.date }}
assignees: rraminen
reviewers: jithunnair-amd
delete-branch: true
- name: Send email
uses: jithunnair-amd/[email protected]
if: always()
with:
server_address: smtp.gmail.com
server_port: 465
secure: true
username: ${{ secrets.GMAIL_USERNAME }}
password: ${{ secrets.GMAIL_PASSWORD }}
subject: IFU to ${{ steps.extract_branch.outputs.branch }} branch of ${{ github.repository }}
to: [email protected], [email protected]
from: ${{ secrets.GMAIL_USERNAME }}
html_body: |
<b>Fetch and Merge</b>: ${{ steps.fetch_and_merge.outcome }} <br/>
<b>Create Pull Request</b>: ${{ steps.create_pull_request.outcome }} <br/>
<b>Pull request</b>: <a href="${{ steps.create_pull_request.outputs.pull-request-url }}">${{ steps.create_pull_request.outputs.pull-request-url }}</a> <br/>
3 changes: 1 addition & 2 deletions .gitmodules
@@ -1,4 +1,3 @@
[submodule "DeepSpeedExamples"]
path = DeepSpeedExamples
url = https://github.com/microsoft/DeepSpeedExamples
branch = master
url = https://github.com/ROCmSoftwarePlatform/DeepSpeedExamples.git
2 changes: 1 addition & 1 deletion DeepSpeedExamples
18 changes: 18 additions & 0 deletions csrc/includes/cublas_wrappers.h
@@ -5,7 +5,9 @@
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#ifndef __HIP_PLATFORM_HCC__
#include <mma.h>
#endif
#include <stdio.h>

int cublas_gemm_ex(cublasHandle_t handle,
@@ -19,7 +21,11 @@ int cublas_gemm_ex(cublasHandle_t handle,
const float* A,
const float* B,
float* C,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif

int cublas_gemm_ex(cublasHandle_t handle,
cublasOperation_t transa,
@@ -32,7 +38,11 @@ int cublas_gemm_ex(cublasHandle_t handle,
const __half* A,
const __half* B,
__half* C,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif

int cublas_strided_batched_gemm(cublasHandle_t handle,
int m,
@@ -49,7 +59,11 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
int stride_B,
int stride_C,
int batch,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif

int cublas_strided_batched_gemm(cublasHandle_t handle,
int m,
@@ -66,4 +80,8 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
int stride_B,
int stride_C,
int batch,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
4 changes: 4 additions & 0 deletions csrc/includes/custom_cuda_layers.h
@@ -5,7 +5,11 @@
#include <stdio.h>
#include <stdlib.h>

#ifdef __HIP_PLATFORM_HCC__
#include <hip/hip_cooperative_groups.h>
#else
#include <cooperative_groups.h>
#endif
#include <curand_kernel.h>

#include "context.h"
12 changes: 12 additions & 0 deletions csrc/includes/feed_forward.h
@@ -43,7 +43,11 @@ class FeedForward {
weights,
input_ptr,
out,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[0]));
#else
cublasGemmAlgo_t(config_.gemm_algos[0]));
#endif
}
void Backward(int bsz,
const T* out_grad,
@@ -68,7 +72,11 @@
input_ptr,
out_grad,
weights_grad,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[1]));
#else
cublasGemmAlgo_t(config_.gemm_algos[1]));
#endif

cublas_gemm_ex(_cublasHandle,
CUBLAS_OP_N,
@@ -81,7 +89,11 @@
weights,
out_grad,
inp_grad_out,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[2]));
#else
cublasGemmAlgo_t(config_.gemm_algos[2]));
#endif

launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz, config_.outputSize, stream);
}
36 changes: 36 additions & 0 deletions csrc/includes/gemm_test.h
@@ -2,7 +2,9 @@
#pragma once

#include <cuda_fp16.h>
#ifndef __HIP_PLATFORM_HCC__
#include <cuda_profiler_api.h>
#endif
#include <array>
#include <cstdio>
#include <cstdlib>
@@ -58,7 +60,11 @@ class GemmTest {
B,
A,
C,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});

int algo_bw1 = Run(loops, [=](int algo) {
@@ -73,7 +79,11 @@
A,
C,
B,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});

int algo_bw2 = Run(loops, [=](int algo) {
@@ -88,7 +98,11 @@
B,
C,
A,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});

return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
@@ -100,8 +114,13 @@ class GemmTest {
float fast_latency = (std::numeric_limits<float>::max)();
int fast_algo = 0;

#ifdef __HIP_PLATFORM_HCC__
for (int algo = (int)rocblas_gemm_algo_standard;
algo <= (int)rocblas_gemm_algo_standard;
#else
for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
algo++) {
int warm_up = 5;
for (int i = 0; i < warm_up; ++i) f(algo);
@@ -186,7 +205,11 @@ class StridedGemmTest {
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});

int algo_bw1 = Run(loops, [=](int algo) {
@@ -216,7 +239,11 @@
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});

int algo_bw2 = Run(loops, [=](int algo) {
@@ -243,7 +270,11 @@
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});

return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
@@ -255,8 +286,13 @@ class StridedGemmTest {
float fast_latency = (std::numeric_limits<float>::max)();
int fast_algo = 0;

#ifdef __HIP_PLATFORM_HCC__
for (int algo = (int)rocblas_gemm_algo_standard;
algo <= (int)rocblas_gemm_algo_standard;
#else
for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
algo++) {
int warm_up = 5;
for (int i = 0; i < warm_up; ++i) f(algo);
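
Assembled for readability, the algorithm-search loops above reduce to the following shape (a sketch, not the literal code, since the surrounding Run() plumbing is elided in this diff): on ROCm the sweep collapses to the single generic rocBLAS algorithm, while on CUDA it still times the tensor-op algorithms ALGO0 through ALGO15.

```cpp
#ifdef __HIP_PLATFORM_HCC__
const int algo_begin = (int)rocblas_gemm_algo_standard;
const int algo_end   = (int)rocblas_gemm_algo_standard;    // one candidate
#else
const int algo_begin = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
const int algo_end   = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;  // sweep 0..15
#endif
for (int algo = algo_begin; algo <= algo_end; algo++) {
    // warm up, then time f(algo) and keep the fastest, as Run() does above
}
```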
4 changes: 4 additions & 0 deletions csrc/includes/general_kernels.h
@@ -3,7 +3,11 @@
#include <stdio.h>
#include <stdlib.h>

#ifdef __HIP_PLATFORM_HCC__
#include <hip/hip_cooperative_groups.h>
#else
#include <cooperative_groups.h>
#endif
#include <curand_kernel.h>

#include "context.h"
16 changes: 16 additions & 0 deletions csrc/includes/strided_batch_gemm.h
@@ -72,7 +72,11 @@ class StridedBatchGemm {
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(_config.gemm_algos[0]));
#else
cublasGemmAlgo_t(_config.gemm_algos[0]));
#endif
}

void ForwardPlusSave(T* output, const T* _buffer_a, const T* _buffer_b, cublasHandle_t handle)
@@ -96,7 +100,11 @@
stride_b,
stride_c,
_config.batch_size,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(_config.gemm_algos[0]));
#else
cublasGemmAlgo_t(_config.gemm_algos[0]));
#endif

k_buf = _buffer_a;
q_buf = _buffer_b;
@@ -136,7 +144,11 @@
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(_config.gemm_algos[1]));
#else
cublasGemmAlgo_t(_config.gemm_algos[1]));
#endif

// A need to transpose.
cublasOperation_t op_a = (_config.op_A == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
@@ -161,7 +173,11 @@
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(_config.gemm_algos[2]));
#else
cublasGemmAlgo_t(_config.gemm_algos[2]));
#endif
}

inline int GetN() const { return _config.k; }
4 changes: 4 additions & 0 deletions csrc/lamb/fused_lamb_cuda_kernel.cu
@@ -14,7 +14,11 @@
#include <iostream>

//#include <helper_functions.h>
#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
#include <hip/hip_cooperative_groups.h>
#else
#include <cooperative_groups.h>
#endif
#include <cuda_runtime_api.h>
#include <stdio.h>

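
A note on the version gate above, assuming the common HIP convention that HIP_VERSION encodes HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR: the check `HIP_VERSION > 305` then reads as "newer than HIP/ROCm 3.5", so older ROCm installs fall back to the CUDA-style cooperative-groups header while newer ones use the native HIP one. A sketch of the same gate with that reading spelled out (the DS_HAVE_* macro is hypothetical):

```cpp
// Assumption: HIP_VERSION == HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR,
// so "HIP_VERSION > 305" means a HIP release newer than 3.5.
#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
#define DS_HAVE_HIP_COOPERATIVE_GROUPS 1   // use hip/hip_cooperative_groups.h
#else
#define DS_HAVE_HIP_COOPERATIVE_GROUPS 0   // fall back to cooperative_groups.h
#endif
```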
8 changes: 4 additions & 4 deletions csrc/quantization/quantizer.cu
@@ -5,7 +5,7 @@ namespace cg = cooperative_groups;

__global__ void qunatize_kernel(__half* vals, int group_size, int num_bits)
{
#if __CUDA_ARCH__ >= 700
#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)

cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@@ -206,7 +206,7 @@ __global__ void sr_qunatize_kernel(__half* vals,
int num_bits,
std::pair<uint64_t, uint64_t> seed)
{
#if __CUDA_ARCH__ >= 700
#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)

cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@@ -484,7 +484,7 @@ template void launch_sr_qunatize_kernel(__half* vals,

__global__ void qunatize_kernel_asym(__half* vals, int group_size, int num_bits)
{
#if __CUDA_ARCH__ >= 700
#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)

cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@@ -729,7 +729,7 @@ __global__ void sr_qunatize_kernel_asym(__half* vals,
int num_bits,
std::pair<uint64_t, uint64_t> seed)
{
#if __CUDA_ARCH__ >= 700
#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)

cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
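
To make the guard change concrete, below is a minimal, self-contained kernel in the same style as the quantization kernels in this file. It is a sketch, not code from this PR, and it assumes the HIP cooperative-groups implementation supports 32-wide thread_block_tile partitions, which is exactly the area the workarounds in this PR's history dealt with. The original `#if __CUDA_ARCH__ >= 700` gate presumably targets Volta-class GPUs for the half-precision paths; on ROCm, `__CUDA_ARCH__` is generally not defined, so without the added `|| defined(__HIP_PLATFORM_HCC__)` clause the kernel bodies would compile to empty stubs.

```cpp
#ifdef __HIP_PLATFORM_HCC__
#include <hip/hip_fp16.h>
#include <hip/hip_cooperative_groups.h>
#else
#include <cuda_fp16.h>
#include <cooperative_groups.h>
#endif

namespace cg = cooperative_groups;

// Warp-level sum of 32 half values (launch with one 32-thread block),
// written with the same guard as the quantization kernels above.
__global__ void warp_sum_sketch(const __half* in, float* out)
{
#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
    cg::thread_block b = cg::this_thread_block();
    cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);

    float v = __half2float(in[b.thread_rank()]);
    // Tree reduction over the 32-wide tile using cooperative-groups shuffles.
    for (int i = g.size() / 2; i > 0; i /= 2) v += g.shfl_down(v, i);
    if (g.thread_rank() == 0) *out = v;
#endif
}
```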