Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

rocblas -> hipblas changes for ROCm #5401

Merged
merged 9 commits into from
May 17, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions csrc/includes/cublas_wrappers.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <rocblas/rocblas.h>
#endif
#include <stdio.h>
#include <torch/version.h>

int cublas_gemm_ex(cublasHandle_t handle,
cublasOperation_t transa,
Expand All @@ -29,7 +30,8 @@ int cublas_gemm_ex(cublasHandle_t handle,
const float* A,
const float* B,
float* C,
#ifdef __HIP_PLATFORM_AMD__
// TODO HIP: Remove backward compatibility for torch<=2.0 in future
#if defined(__HIP_PLATFORM_AMD__) && TORCH_VERSION_MAJOR <= 2 && TORCH_VERSION_MINOR <= 0
loadams marked this conversation as resolved.
Show resolved Hide resolved
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
Expand All @@ -46,7 +48,7 @@ int cublas_gemm_ex(cublasHandle_t handle,
const __half* A,
const __half* B,
__half* C,
#ifdef __HIP_PLATFORM_AMD__
#if defined(__HIP_PLATFORM_AMD__) && TORCH_VERSION_MAJOR <= 2 && TORCH_VERSION_MINOR <= 0
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
Expand All @@ -67,7 +69,7 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
int stride_B,
int stride_C,
int batch,
#ifdef __HIP_PLATFORM_AMD__
#if defined(__HIP_PLATFORM_AMD__) && TORCH_VERSION_MAJOR <= 2 && TORCH_VERSION_MINOR <= 0
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
Expand All @@ -88,7 +90,7 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
int stride_B,
int stride_C,
int batch,
#ifdef __HIP_PLATFORM_AMD__
#if defined(__HIP_PLATFORM_AMD__) && TORCH_VERSION_MAJOR <= 2 && TORCH_VERSION_MINOR <= 0
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
Expand Down
7 changes: 4 additions & 3 deletions csrc/includes/feed_forward.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ class FeedForward {
weights,
input_ptr,
out,
#ifdef __HIP_PLATFORM_AMD__
// TODO HIP: Remove backward compatibility for torch<=2.0 in future
#if defined(__HIP_PLATFORM_AMD__) && TORCH_VERSION_MAJOR <= 2 && TORCH_VERSION_MINOR <= 0
rocblas_gemm_algo(config_.gemm_algos[0]));
#else
cublasGemmAlgo_t(config_.gemm_algos[0]));
Expand Down Expand Up @@ -77,7 +78,7 @@ class FeedForward {
input_ptr,
out_grad,
weights_grad,
#ifdef __HIP_PLATFORM_AMD__
#if defined(__HIP_PLATFORM_AMD__) && TORCH_VERSION_MAJOR <= 2 && TORCH_VERSION_MINOR <= 0
rocblas_gemm_algo(config_.gemm_algos[1]));
#else
cublasGemmAlgo_t(config_.gemm_algos[1]));
Expand All @@ -94,7 +95,7 @@ class FeedForward {
weights,
out_grad,
inp_grad_out,
#ifdef __HIP_PLATFORM_AMD__
#if defined(__HIP_PLATFORM_AMD__) && TORCH_VERSION_MAJOR <= 2 && TORCH_VERSION_MINOR <= 0
rocblas_gemm_algo(config_.gemm_algos[2]));
#else
cublasGemmAlgo_t(config_.gemm_algos[2]));
Expand Down
24 changes: 16 additions & 8 deletions csrc/includes/gemm_test.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ class GemmTest {
B,
A,
C,
#ifdef __HIP_PLATFORM_AMD__
// TODO HIP: Remove backward compatibility for torch<=2.0 in future
#if defined(__HIP_PLATFORM_AMD__) && TORCH_VERSION_MAJOR <= 2 && TORCH_VERSION_MINOR <= 0
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
Expand All @@ -86,7 +87,7 @@ class GemmTest {
A,
C,
B,
#ifdef __HIP_PLATFORM_AMD__
#if defined(__HIP_PLATFORM_AMD__) && TORCH_VERSION_MAJOR <= 2 && TORCH_VERSION_MINOR <= 0
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
Expand All @@ -105,7 +106,7 @@ class GemmTest {
B,
C,
A,
#ifdef __HIP_PLATFORM_AMD__
#if defined(__HIP_PLATFORM_AMD__) && TORCH_VERSION_MAJOR <= 2 && TORCH_VERSION_MINOR <= 0
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
Expand All @@ -121,8 +122,10 @@ class GemmTest {
float fast_latency = (std::numeric_limits<float>::max)();
int fast_algo = 0;

#ifdef __HIP_PLATFORM_AMD__
#if defined(__HIP_PLATFORM_AMD__) && TORCH_VERSION_MAJOR <= 2 && TORCH_VERSION_MINOR <= 0
for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
#elif defined(__HIP_PLATFORM_AMD__)
for (int algo = (int)HIPBLAS_GEMM_DEFAULT; algo <= (int)HIPBLAS_GEMM_DEFAULT;
#else
for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
Expand Down Expand Up @@ -211,7 +214,7 @@ class StridedGemmTest {
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_AMD__
#if defined(__HIP_PLATFORM_AMD__) && TORCH_VERSION_MAJOR <= 2 && TORCH_VERSION_MINOR <= 0
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
Expand Down Expand Up @@ -245,7 +248,7 @@ class StridedGemmTest {
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_AMD__
#if defined(__HIP_PLATFORM_AMD__) && TORCH_VERSION_MAJOR <= 2 && TORCH_VERSION_MINOR <= 0
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
Expand Down Expand Up @@ -276,7 +279,7 @@ class StridedGemmTest {
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_AMD__
#if defined(__HIP_PLATFORM_AMD__) && TORCH_VERSION_MAJOR <= 2 && TORCH_VERSION_MINOR <= 0
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
Expand All @@ -292,11 +295,16 @@ class StridedGemmTest {
float fast_latency = (std::numeric_limits<float>::max)();
int fast_algo = 0;

#ifdef __HIP_PLATFORM_AMD__
#if defined(__HIP_PLATFORM_AMD__) && TORCH_VERSION_MAJOR <= 2 && TORCH_VERSION_MINOR <= 0
for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
#else
#ifdef __HIP_PLATFORM_AMD__
for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
algo <= (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
#else
for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
#endif
algo++) {
int warm_up = 5;
Expand Down
9 changes: 5 additions & 4 deletions csrc/includes/strided_batch_gemm.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,8 @@ class StridedBatchGemm {
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_AMD__
// TODO HIP: Remove backward compatibility for torch<=2.0 in future
#if defined(__HIP_PLATFORM_AMD__) && TORCH_VERSION_MAJOR <= 2 && TORCH_VERSION_MINOR <= 0
rocblas_gemm_algo(_config.gemm_algos[0]));
#else
cublasGemmAlgo_t(_config.gemm_algos[0]));
Expand Down Expand Up @@ -105,7 +106,7 @@ class StridedBatchGemm {
stride_b,
stride_c,
_config.batch_size,
#ifdef __HIP_PLATFORM_AMD__
#if defined(__HIP_PLATFORM_AMD__) && TORCH_VERSION_MAJOR <= 2 && TORCH_VERSION_MINOR <= 0
rocblas_gemm_algo(_config.gemm_algos[0]));
#else
cublasGemmAlgo_t(_config.gemm_algos[0]));
Expand Down Expand Up @@ -149,7 +150,7 @@ class StridedBatchGemm {
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_AMD__
#if defined(__HIP_PLATFORM_AMD__) && TORCH_VERSION_MAJOR <= 2 && TORCH_VERSION_MINOR <= 0
rocblas_gemm_algo(_config.gemm_algos[1]));
#else
cublasGemmAlgo_t(_config.gemm_algos[1]));
Expand Down Expand Up @@ -178,7 +179,7 @@ class StridedBatchGemm {
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_AMD__
#if defined(__HIP_PLATFORM_AMD__) && TORCH_VERSION_MAJOR <= 2 && TORCH_VERSION_MINOR <= 0
rocblas_gemm_algo(_config.gemm_algos[2]));
#else
cublasGemmAlgo_t(_config.gemm_algos[2]));
Expand Down
Loading
Loading