matrix_multiplication.cu
#include <stdio.h>
#include <time.h>
// Set 2-D matrix size: DSIZE*DSIZE
const int DSIZE = 256;
const int a = 1;
const int b = 2;
// error checking macro
#define cudaCheckErrors() \
  do { \
    cudaError_t __err = cudaGetLastError(); \
    if (__err != cudaSuccess) { \
      fprintf(stderr, "Error: %s at %s:%d \n", \
              cudaGetErrorString(__err), __FILE__, __LINE__); \
      fprintf(stderr, "*** FAILED - ABORTING***\n"); \
      exit(1); \
    } \
  } while (0)
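// Note: cudaGetLastError() reports launch/configuration errors immediately, but
// errors from kernel execution only surface once the device has synchronized
// (e.g. via cudaDeviceSynchronize() or a blocking cudaMemcpy).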
// Check if matrix multiplication was correct
int check_result(const int *C){
  for (int i = 0; i < DSIZE*DSIZE; i++) {
    if (C[i] != a*b*DSIZE) {
      printf("Error : Index %d was %d instead of %d\n", i, C[i], a*b*DSIZE);
      return -1;
    }
  }
  printf("Matrix multiplication was correct!\n");
  return 0;
}
// Function that runs on the CPU
void matrix_mult_cpu(const int *A, const int *B, int *C, int N) {
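  // Straightforward triple-loop multiply; matrices are stored row-major,
  // so element (row, col) of an N x N matrix lives at index col + N*row.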
  for(int i=0; i<N; ++i)
  {
    for(int j=0; j<N; ++j)
    {
      int sum = 0;
      for(int k=0; k<N; ++k)
        sum += A[k+N*i] * B[j+k*N];
      C[j+N*i] = sum;
    }
  }
}
// CUDA kernel that runs on the GPU
__global__ void matrix_mult_gpu(const int *A, const int *B, int *C, int N) {
  // Express the matrix index in 2-D as a function of the threadIdx and the blockIdx
  int idx = threadIdx.x + blockDim.x * blockIdx.x; // column index
  int idy = threadIdx.y + blockDim.y * blockIdx.y; // row index
  if(idx<N && idy<N){
    int sum = 0;
    // Dot product of row idy of A with column idx of B (same row-major layout as the CPU version)
    for(int k=0; k<N; k++)
      sum += A[k+N*idy] * B[idx+k*N];
    C[idx+N*idy] = sum;
  }
}
int main() {
  // Variables used to measure time
  clock_t t0, t1, t2, t3;
  double t_cpu = 0.0;
  double t_gpu = 0.0;
  // Create the device and host pointers
  int *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
  // Fill in the host pointers
  h_A = new int[DSIZE*DSIZE];
  h_B = new int[DSIZE*DSIZE];
  h_C = new int[DSIZE*DSIZE];
  for (int i = 0; i < DSIZE*DSIZE; i++){
    h_A[i] = a;
    h_B[i] = b;
    h_C[i] = 0;
  }
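  // Since A is filled with a and B with b, every element of A*B equals a*b*DSIZE,
  // which is exactly the value check_result() expects.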
  // Measure time on CPU : Start timing
  t0 = clock();
  // Call the CPU function
  matrix_mult_cpu(h_A, h_B, h_C, DSIZE);
  // Calculate & print CPU time
  t1 = clock();
  t_cpu = ((double)(t1-t0))/CLOCKS_PER_SEC;
  printf("CPU took %f seconds\n", t_cpu);
  // Check if matrix multiplication on CPU was correct
  check_result(h_C);
  // Re-initialize the host pointer that holds the result
  for (int i = 0; i < DSIZE*DSIZE; i++)
    h_C[i] = 0;
  // Measure time on GPU : Start timing
  t2 = clock();
  // Allocate device memory
  cudaMalloc(&d_A, DSIZE*DSIZE*sizeof(int));
  cudaMalloc(&d_B, DSIZE*DSIZE*sizeof(int));
  cudaMalloc(&d_C, DSIZE*DSIZE*sizeof(int));
  // Check memory allocation for errors
  cudaCheckErrors();
  // Copy the matrices to the GPU
  cudaMemcpy(d_A, h_A, DSIZE*DSIZE*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_B, h_B, DSIZE*DSIZE*sizeof(int), cudaMemcpyHostToDevice);
  // Check memory copy for errors
  cudaCheckErrors();
  // Define the number of threads per block
  int blockDim = 32;
  // dim3: native CUDA type used to specify dimensions (up to 3 arguments)
  dim3 block(blockDim, blockDim);
  // Define the number of blocks in the grid
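  // Round up so the grid covers every element even if DSIZE is not a multiple of blockDim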
  dim3 grid((DSIZE+block.x-1)/block.x, (DSIZE+block.y-1)/block.y);
  // Launch kernel
  matrix_mult_gpu<<<grid, block>>>(d_A, d_B, d_C, DSIZE);
  // Check kernel launch for errors
  cudaCheckErrors();
  // Copy results back to host (this blocks until the kernel has finished)
  cudaMemcpy(h_C, d_C, DSIZE*DSIZE*sizeof(int), cudaMemcpyDeviceToHost);
  cudaCheckErrors();
  // Calculate & print GPU time
  t3 = clock();
  t_gpu = ((double)(t3-t2))/CLOCKS_PER_SEC;
  printf("GPU took %f seconds\n", t_gpu);
  // Check if matrix multiplication on GPU was correct
  check_result(h_C);
  // Free the allocated memory
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);
  delete[] h_A;
  delete[] h_B;
  delete[] h_C;
  return 0;
}
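// Example build and run (assumes the CUDA toolkit's nvcc compiler is on the PATH;
// the output name "matrix_mult" is arbitrary):
//   nvcc -o matrix_mult matrix_multiplication.cu
//   ./matrix_mult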