-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmatmul_thread_row.cu
94 lines (79 loc) · 2.8 KB
/
matmul_thread_row.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
// Matrix multiplication kernel where each thread produces one output matrix row.
// Assumptions: inputs are square matrices.
// Note: Exercise 1.a. in Chapter 3 of the PMPP book.
#include <stdio.h>
#include <cstdlib>
#include <cassert>
__global__ void matMulThreadRow(int *a, int *b, int *c, int N) {
// Every thread computes one row in the output matrix.
// Calculate global row of each thread
int row = blockIdx.y * blockDim.y + threadIdx.y;
// Boundary check
if (row < N) {
// Each thread computes one row in the output matrix `c` and
// thus traverses the columns (col=0, ..., N-1) of that row.
for(int col = 0; col < N; col++) {
// For the computation of each element, the thread
// traverses one row in `a` and one column in `b`.
int tmp = 0;
for (int k = 0; k < N; k++) {
tmp += a[row * N + k] * b[k * N + col];
}
c[row * N + col] = tmp; // Write element to output matrix
}
}
}
// Initialise a square matrix with random numbers between 0-100
void init_matrix(int *m, int N)
{
for (int i = 0; i < N * N; i++) {
m[i] = rand() % 100;
}
}
// Verify the result of the GPU kernel with a CPU calculation
void verify_result(int *a, int *b, int *c, int N)
{
int tmp;
// For every row
for(int i = 0; i < N; i++) {
// For every column
for(int j = 0; j < N; j++) {
// For every element in the row-col pair ij
tmp = 0;
for(int k = 0; k < N; k++) {
tmp += a[i * N + k] * b[k * N + j];
}
// Check each element
assert(tmp == c[i * N + j]);
}
}
}
int main() {
// Set square matrix dimensions (2^10 x 2^10 here)
int N = 1 << 10;
size_t bytes = N * N * sizeof(int);
// Allocate host memory
int *a, *b, *c;
// Allocate unified memory
cudaMallocManaged(&a, bytes);
cudaMallocManaged(&b, bytes);
cudaMallocManaged(&c, bytes);
// Initialise input matrices
init_matrix(a, N);
init_matrix(b, N);
// Since each thread computes one row of the output `c`, we
// only need `N` threads in total along the y-dimension
int threadsPerBlockDim = 16;
int blocksPerGridDim = (N + threadsPerBlockDim - 1) / threadsPerBlockDim;
dim3 dimBlock(1, threadsPerBlockDim);
dim3 dimGrid(1, blocksPerGridDim);
// Invoke kernel
matMulThreadRow<<<dimGrid, dimBlock>>>(a, b, c, N);
// As we are not doing a Memcpy (a synchronizing operation) due to using `cudaMallocManaged`,
// we need to synchronize another way:
cudaDeviceSynchronize();
// Verify the result
verify_result(a, b, c, N);
printf("Success! Computed kernel where each thread produces one output matrix row.\n");
return 0;
}