-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMatMul_v1.cu
87 lines (73 loc) · 2.78 KB
/
MatMul_v1.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include "Common.cuh"
#include "MatMulOnCPU.h"
using namespace std;
const int Row = 1024;
const int Col = 1024;
const int Blocksize = 32;
// Naive matrix-multiply kernel: one thread per output element.
// Expects a 2D launch covering a width x width output; the tail guard below
// makes any ceil-div grid configuration correct, not just exact multiples.
// With row-major storage this computes P[j][i] = sum_k M[k][i] * N[j][k];
// the M operand is read down a column, which is coalesced across threads
// in x (adjacent i -> adjacent addresses).
__global__ void myMatMulOnGPU(const float* __restrict__ M,
                              const float* __restrict__ N,
                              float* __restrict__ P, int width)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;  // output column
    int j = threadIdx.y + blockDim.y * blockIdx.y;  // output row
    // Bounds guard: grids rarely divide the data evenly; without this,
    // out-of-range threads would read/write past the matrices.
    if (i >= width || j >= width)
        return;
    float sum = 0.0f;  // float literal — avoid silent double promotion
    for (int k = 0; k < width; k++)
        sum += M[k * width + i] * N[j * width + k];
    P[j * width + i] = sum;
}
// Host driver: builds two Row x Col matrices, multiplies them on the CPU
// (reference) and on the GPU, validates the GPU result, and reports the
// GPU kernel time measured with CUDA events.
int main()
{
    const size_t bytes = sizeof(float) * Row * Col;

    // Host buffers: A and B are inputs, C receives the GPU result,
    // C_ref holds the CPU reference used for validation.
    float* A = (float*)malloc(bytes);
    float* B = (float*)malloc(bytes);
    float* C = (float*)malloc(bytes);
    float* C_ref = (float*)malloc(bytes);
    if (!A || !B || !C || !C_ref)  // malloc can fail for 4 x 4 MB buffers
    {
        printf("Error: host allocation failed!\n");
        free(A); free(B); free(C); free(C_ref);
        exit(-1);
    }
    mySetMatValue(A, Row, Col);
    mySetMatValue(B, Row, Col);
    myMatMulOnCPU(A, B, C_ref, Col);

    // Allocate device memory and upload the inputs.
    float* d_dataA, * d_dataB, * d_dataC;
    myCudaDetermineGPU();
    CHECK(cudaMalloc((void**)&d_dataA, bytes));
    CHECK(cudaMalloc((void**)&d_dataB, bytes));
    CHECK(cudaMalloc((void**)&d_dataC, bytes));
    CHECK(cudaMemcpy(d_dataA, A, bytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_dataB, B, bytes, cudaMemcpyHostToDevice));

    // Timing via CUDA events recorded on the default stream.
    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));

    // Launch configuration: ceil-div so any Row/Col works with the guard
    // inside the kernel.
    dim3 threadPerBlock(Blocksize, Blocksize);
    dim3 blockNumber((Col + threadPerBlock.x - 1) / threadPerBlock.x,
                     (Row + threadPerBlock.y - 1) / threadPerBlock.y);
    printf("Block(%d,%d) Grid(%d,%d).\n", threadPerBlock.x, threadPerBlock.y,
           blockNumber.x, blockNumber.y);

    // Warm up: absorb one-time costs (module load, clocks) before timing.
    for (int i = 0; i < 10; i++)
        myMatMulOnGPU << <blockNumber, threadPerBlock >> > (d_dataA, d_dataB, d_dataC, Col);
    CHECK(cudaGetLastError());

    // Timed run.
    CHECK(cudaEventRecord(start, 0));
    myMatMulOnGPU << <blockNumber, threadPerBlock >> > (d_dataA, d_dataB, d_dataC, Col);
    CHECK(cudaGetLastError());
    CHECK(cudaEventRecord(stop, 0));
    // Explicitly wait for the stop event; cudaEventElapsedTime returns
    // cudaErrorNotReady if queried before the event has completed.
    CHECK(cudaEventSynchronize(stop));

    // Download and validate against the CPU reference.
    CHECK(cudaMemcpy(C, d_dataC, bytes, cudaMemcpyDeviceToHost));
    if (myMatCmp(C, C_ref, Row * Col))
    {
        printf("Error: Wrong result!\n");
        // Release everything before bailing out so the error path does
        // not leak host or device memory.
        free(A); free(B); free(C); free(C_ref);
        cudaFree(d_dataA); cudaFree(d_dataB); cudaFree(d_dataC);
        cudaEventDestroy(start); cudaEventDestroy(stop);
        exit(-1);
    }

    float elapsedTime;
    CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("Total time: %fms\n", elapsedTime);

    // Free all resources (the original leaked C_ref).
    free(A);
    free(B);
    free(C);
    free(C_ref);
    CHECK(cudaFree(d_dataA));
    CHECK(cudaFree(d_dataB));
    CHECK(cudaFree(d_dataC));
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));
    return 0;
}