-
Notifications
You must be signed in to change notification settings - Fork 0
/
simple_dgemm.cu
177 lines (138 loc) · 5.2 KB
/
simple_dgemm.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
// Low level matrix multiplication on GPU using CUDA with CURAND and CUBLAS
// C(m,n) = A(m,k) * B(k,n)
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <cublas_v2.h>
#include <curand.h>
#include <cmath>
// Randomization helpers
// adapted from https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/rocm-3.0/clients/include/rocblas_init.hpp#L42
// Fill a column-major nr_rows_A x nr_cols_A matrix with deterministic
// sin(index) values so runs are reproducible without a device RNG.
// Fixes vs. original: (1) compute in double throughout — the old code
// narrowed the linear index to float before sin(), losing precision once
// the index exceeded 2^24; (2) iterate column-outer so writes into the
// column-major buffer are sequential.
void fill_sin(double *A, size_t nr_rows_A, size_t nr_cols_A){
    for(size_t j = 0; j < nr_cols_A; ++j)
        for(size_t i = 0; i < nr_rows_A; ++i)
            A[i + j * nr_rows_A] = sin(double(i + j * nr_rows_A));
}
// Fill a column-major nr_rows_A x nr_cols_A matrix with deterministic
// cos(index) values (companion to fill_sin; see its header for rationale).
// Fixes vs. original: full double precision (no float narrowing of the
// index) and column-outer iteration for sequential writes.
void fill_cos(double *A, size_t nr_rows_A, size_t nr_cols_A){
    for(size_t j = 0; j < nr_cols_A; ++j)
        for(size_t i = 0; i < nr_rows_A; ++i)
            A[i + j * nr_rows_A] = cos(double(i + j * nr_rows_A));
}
#ifdef CUBLAS_API_H_
// Map a cublasStatus_t to its enumerator name for diagnostics.
// Only compiled when cublas_v2.h has been included (it defines
// CUBLAS_API_H_). Added the NOT_SUPPORTED and LICENSE_ERROR statuses,
// which the original switch dropped to "<unknown>".
static const char *_cudaGetErrorEnum(cublasStatus_t error)
{
    switch (error)
    {
        case CUBLAS_STATUS_SUCCESS:
            return "CUBLAS_STATUS_SUCCESS";
        case CUBLAS_STATUS_NOT_INITIALIZED:
            return "CUBLAS_STATUS_NOT_INITIALIZED";
        case CUBLAS_STATUS_ALLOC_FAILED:
            return "CUBLAS_STATUS_ALLOC_FAILED";
        case CUBLAS_STATUS_INVALID_VALUE:
            return "CUBLAS_STATUS_INVALID_VALUE";
        case CUBLAS_STATUS_ARCH_MISMATCH:
            return "CUBLAS_STATUS_ARCH_MISMATCH";
        case CUBLAS_STATUS_MAPPING_ERROR:
            return "CUBLAS_STATUS_MAPPING_ERROR";
        case CUBLAS_STATUS_EXECUTION_FAILED:
            return "CUBLAS_STATUS_EXECUTION_FAILED";
        case CUBLAS_STATUS_INTERNAL_ERROR:
            return "CUBLAS_STATUS_INTERNAL_ERROR";
        case CUBLAS_STATUS_NOT_SUPPORTED:
            return "CUBLAS_STATUS_NOT_SUPPORTED";
        case CUBLAS_STATUS_LICENSE_ERROR:
            return "CUBLAS_STATUS_LICENSE_ERROR";
    }
    return "<unknown>";
}
#endif
// Multiply the arrays A and B on GPU and save the result in C
// C(m,n) = A(m,k) * B(k,n)
// Issues one column-major DGEMM (alpha=1, beta=0, no transposes) on the
// stream previously bound to `handle` via cublasSetStream. The call is
// asynchronous: the caller must synchronize that stream before reading C.
// On failure the cuBLAS status name is logged to stdout; the function does
// not abort or propagate the error.
void gpu_blas_mmul( cublasHandle_t handle, const double *A, const double *B, double *C, const int m, const int k, const int n) {
// Leading dimensions for column-major, non-transposed operands:
// A is m x k (lda = m), B is k x n (ldb = k), C is m x n (ldc = m).
int lda=m,ldb=k,ldc=m;
const double alf = 1;
const double bet = 0;
// cuBLAS takes alpha/beta by pointer (host pointer mode is the default).
const double *alpha = &alf;
const double *beta = &bet;
// Do the actual multiplication
cublasStatus_t err = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
if (err != CUBLAS_STATUS_SUCCESS)
std::cout << "Error: " << _cudaGetErrorEnum(err) << std::endl;
}
// Print a column-major matrix A(nr_rows_A, nr_cols_A) to stdout, one matrix
// row per output line (space after each element), then a trailing blank line.
void print_matrix(const double *A, int nr_rows_A, int nr_cols_A) {
    for (int row = 0; row < nr_rows_A; ++row) {
        for (int col = 0; col < nr_cols_A; ++col)
            std::cout << A[col * nr_rows_A + row] << " ";
        std::cout << std::endl;
    }
    std::cout << std::endl;
}
// Run `reps` back-to-back DGEMMs of two value x value square matrices on
// CUDA device `device`, using one dedicated stream and a reused cuBLAS
// handle. Host inputs are filled deterministically (sin/cos of the index)
// so runs are reproducible. Returns 0 on success, 1 if stream creation
// fails; all other CUDA/cuBLAS errors are logged by the callees.
int def(int value, int reps, int device) {
    cudaSetDevice(device);

    cudaStream_t computeStream;
    cudaError_t result = cudaStreamCreate(&computeStream);
    // BUG FIX: the original stored this status but never inspected it.
    if (result != cudaSuccess) {
        std::cout << "Error: " << cudaGetErrorString(result) << std::endl;
        return 1;
    }

    // For simplicity all three matrices are square, value x value.
    size_t n = (size_t)value;
    size_t bytes = n * n * sizeof(double);

    // Allocate 3 arrays on CPU.
    double *h_A = (double *)malloc(bytes);
    double *h_B = (double *)malloc(bytes);
    double *h_C = (double *)malloc(bytes);

    // Allocate 3 arrays on GPU.
    double *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, bytes);
    cudaMalloc(&d_B, bytes);
    cudaMalloc(&d_C, bytes);

    // Deterministic host-side initialization, then async upload on our stream.
    fill_sin(h_A, n, n);
    fill_cos(h_B, n, n);
    cudaMemcpyAsync(d_A, h_A, bytes, cudaMemcpyHostToDevice, computeStream);
    cudaMemcpyAsync(d_B, h_B, bytes, cudaMemcpyHostToDevice, computeStream);
    std::cout << "A =" << std::endl;
    std::cout << "B =" << std::endl;

    // One cuBLAS handle reused across all reps, bound to our stream so the
    // GEMMs are ordered after the uploads.
    cublasHandle_t handle;
    cublasCreate(&handle);
    cublasSetStream(handle, computeStream);

    for (int i = 0; i < reps; i++) {
        // Multiply A and B on GPU; sync each rep so iterations don't queue up.
        gpu_blas_mmul(handle, d_A, d_B, d_C, value, value, value);
        cudaStreamSynchronize(computeStream);
    }
    cublasDestroy(handle);

    // Copy the result back to host memory.
    cudaMemcpyAsync(h_C, d_C, bytes, cudaMemcpyDeviceToHost, computeStream);
    // BUG FIX: the original freed h_C/d_C with this async copy still
    // potentially in flight; wait for the stream before tearing anything down.
    cudaStreamSynchronize(computeStream);
    std::cout << "C =" << std::endl;
    // print_matrix(h_C, n, n);

    // Free GPU memory, the stream, then host memory.
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    result = cudaStreamDestroy(computeStream);
    if (result != cudaSuccess)
        std::cout << "Error: " << cudaGetErrorString(result) << std::endl;
    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}
// Entry point: mul <dim> <reps> <target-device num>
// Parses and validates the three integer arguments, then runs the benchmark.
int main(int argc, char* argv[]){
    if (argc != 4){
        std::cout << "Usage: mul <dim> <reps> <target-device num>" << std::endl;
        exit(-1);
    }
    int dim = atoi(argv[1]);
    int reps = atoi(argv[2]);
    int device = atoi(argv[3]);
    // BUG FIX: atoi() returns 0 on non-numeric input, which previously
    // produced a silent 0x0 run on device 0; reject degenerate values.
    if (dim <= 0 || reps <= 0 || device < 0){
        std::cout << "Error: <dim> and <reps> must be positive integers and "
                     "<target-device num> must be >= 0" << std::endl;
        exit(-1);
    }
    def(dim, reps, device);
    return 0;
}