
Commit

improved 10_matMul
xwuupb committed Apr 15, 2020
1 parent 8ada85b commit a554810
Showing 11 changed files with 549 additions and 250 deletions.
70 changes: 42 additions & 28 deletions 08_distThreads/src/gpuThreads.c
@@ -4,6 +4,8 @@
*
* This source file contains function definition for organizing GPU threads.
*
* thread_limit for the teams construct is omitted for clarity.
*
* @author Xin Wu (PC²)
* @date 12.03.2020
* @copyright CC BY-SA 2.0
@@ -304,6 +306,17 @@ void gpuThreads(int i)
* 5. Nested loop with collapse(3).
* 6. It features coalesced GPU memory access and good performance.
* 7. +10 to each unrolled thread is used to label the 2x irow-loop unrolling.
*
* Caveat: the collapsed iterations execute in an unspecified order, so the
* code must not rely on any particular ordering, especially for the
* innermost loop.
*
* OpenMP API Specification: Version 5.0 November 2018
*
* https://www.openmp.org/spec-html/5.0/openmpsu44.html
*
* If a collapse clause is specified with a parameter value greater than 1, then
* the iterations of the associated loops to which the clause applies are
* collapsed into one larger iteration space with *unspecified ordering*.
*
*/
ncol = 6;
nrow = 12;
@@ -320,16 +333,15 @@
default(none) shared(ncol, nrow, wblk, lteams, nthrds, league)
for (int icol = 0; icol < ncol; ++icol) {
for (int iblk = 0; iblk < nrow / wblk; ++iblk) {
for (int irow = iblk * wblk;
irow < iblk * wblk + nthrds; ++irow) {
league[icol * nrow + irow ].itd = omp_get_thread_num();
league[icol * nrow + irow ].ntd = omp_get_num_threads();
league[icol * nrow + irow ].itm = omp_get_team_num();
league[icol * nrow + irow ].ltm = omp_get_num_teams();
league[icol * nrow + irow + nthrds].itd = omp_get_thread_num() + 10;
league[icol * nrow + irow + nthrds].ntd = omp_get_num_threads();
league[icol * nrow + irow + nthrds].itm = omp_get_team_num();
league[icol * nrow + irow + nthrds].ltm = omp_get_num_teams();
for (int irow = 0; irow < nthrds; ++irow) {
league[icol * nrow + iblk * wblk + irow ].itd = omp_get_thread_num();
league[icol * nrow + iblk * wblk + irow ].ntd = omp_get_num_threads();
league[icol * nrow + iblk * wblk + irow ].itm = omp_get_team_num();
league[icol * nrow + iblk * wblk + irow ].ltm = omp_get_num_teams();
league[icol * nrow + iblk * wblk + irow + nthrds].itd = omp_get_thread_num() + 10;
league[icol * nrow + iblk * wblk + irow + nthrds].ntd = omp_get_num_threads();
league[icol * nrow + iblk * wblk + irow + nthrds].itm = omp_get_team_num();
league[icol * nrow + iblk * wblk + irow + nthrds].ltm = omp_get_num_teams();
}
}
}
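
The combined construct enclosing this loop nest is collapsed out of the hunk above. The following self-contained sketch (names, sizes, and clauses are illustrative, not copied from gpuThreads.c) shows the same rectangular collapse(3) pattern: no inner loop bound depends on an outer loop variable, and each iteration derives its array index solely from (icol, iblk, irow), so the unspecified ordering of the collapsed iteration space cannot change the result.

/* sketch_collapse3.c: illustrative stand-in for the pattern above */
#include <stdio.h>
#include <omp.h>

int main(void)
{
  enum { NCOL = 6, NROW = 12, WBLK = 4, NTHRDS = 2 };
  int itd[NCOL * NROW] = {0};

  /* rectangular loop nest: the bounds of iblk and irow are loop invariant */
  #pragma omp target teams distribute parallel for collapse(3) \
          map(tofrom: itd[0:NCOL * NROW])
  for (int icol = 0; icol < NCOL; ++icol)
    for (int iblk = 0; iblk < NROW / WBLK; ++iblk)
      for (int irow = 0; irow < NTHRDS; ++irow)
        /* index computed from the loop variables only */
        itd[icol * NROW + iblk * WBLK + irow] = omp_get_thread_num();

  printf("itd[0] = %d\n", itd[0]);
  return 0;
}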
@@ -344,6 +356,9 @@
* 5. Nested loop with collapse(3).
* 6. +10 to each unrolled team is used to label the 2x icol-loop unrolling.
* 7. +10 to each unrolled thread is used to label the 2x irow-loop unrolling.
*
* Assigning more work to each thread (here via 2x unrolling of both the
* icol- and the irow-loop) is one approach to achieving high performance.
*
*/
ncol = 6;
nrow = 12;
@@ -360,24 +375,23 @@
default(none) shared(ncol, nrow, wblk, lteams, nthrds, league)
for (int icol = 0; icol < ncol; icol += 2) {
for (int iblk = 0; iblk < nrow / wblk; ++iblk) {
for (int irow = iblk * wblk;
irow < iblk * wblk + nthrds; ++irow) {
league[ icol * nrow + irow ].itd = omp_get_thread_num();
league[ icol * nrow + irow ].ntd = omp_get_num_threads();
league[ icol * nrow + irow ].itm = omp_get_team_num();
league[ icol * nrow + irow ].ltm = omp_get_num_teams();
league[ icol * nrow + irow + nthrds].itd = omp_get_thread_num() + 10;
league[ icol * nrow + irow + nthrds].ntd = omp_get_num_threads();
league[ icol * nrow + irow + nthrds].itm = omp_get_team_num();
league[ icol * nrow + irow + nthrds].ltm = omp_get_num_teams();
league[(icol + 1) * nrow + irow ].itd = omp_get_thread_num();
league[(icol + 1) * nrow + irow ].ntd = omp_get_num_threads();
league[(icol + 1) * nrow + irow ].itm = omp_get_team_num() + 10;
league[(icol + 1) * nrow + irow ].ltm = omp_get_num_teams();
league[(icol + 1) * nrow + irow + nthrds].itd = omp_get_thread_num() + 10;
league[(icol + 1) * nrow + irow + nthrds].ntd = omp_get_num_threads();
league[(icol + 1) * nrow + irow + nthrds].itm = omp_get_team_num() + 10;
league[(icol + 1) * nrow + irow + nthrds].ltm = omp_get_num_teams();
for (int irow = 0; irow < nthrds; ++irow) {
league[ icol * nrow + iblk * wblk + irow ].itd = omp_get_thread_num();
league[ icol * nrow + iblk * wblk + irow ].ntd = omp_get_num_threads();
league[ icol * nrow + iblk * wblk + irow ].itm = omp_get_team_num();
league[ icol * nrow + iblk * wblk + irow ].ltm = omp_get_num_teams();
league[ icol * nrow + iblk * wblk + irow + nthrds].itd = omp_get_thread_num() + 10;
league[ icol * nrow + iblk * wblk + irow + nthrds].ntd = omp_get_num_threads();
league[ icol * nrow + iblk * wblk + irow + nthrds].itm = omp_get_team_num();
league[ icol * nrow + iblk * wblk + irow + nthrds].ltm = omp_get_num_teams();
league[(icol + 1) * nrow + iblk * wblk + irow ].itd = omp_get_thread_num();
league[(icol + 1) * nrow + iblk * wblk + irow ].ntd = omp_get_num_threads();
league[(icol + 1) * nrow + iblk * wblk + irow ].itm = omp_get_team_num() + 10;
league[(icol + 1) * nrow + iblk * wblk + irow ].ltm = omp_get_num_teams();
league[(icol + 1) * nrow + iblk * wblk + irow + nthrds].itd = omp_get_thread_num() + 10;
league[(icol + 1) * nrow + iblk * wblk + irow + nthrds].ntd = omp_get_num_threads();
league[(icol + 1) * nrow + iblk * wblk + irow + nthrds].itm = omp_get_team_num() + 10;
league[(icol + 1) * nrow + iblk * wblk + irow + nthrds].ltm = omp_get_num_teams();
}
}
}
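
The "more work for each thread" idea is easiest to see on a smaller kernel. A deliberately simplified sketch (the saxpy2 helper is hypothetical and not part of this repository): each thread updates two elements per iteration, so only half as many iterations are needed while both accesses remain coalesced.

#include <omp.h>

/* sketch only: 2x unrolled vector update, assuming n is even */
void saxpy2(int n, float a, const float *x, float *y)
{
  #pragma omp target teams distribute parallel for \
          map(to: x[0:n]) map(tofrom: y[0:n])
  for (int i = 0; i < n / 2; ++i) {
    y[i        ] += a * x[i        ];  /* first half  */
    y[i + n / 2] += a * x[i + n / 2];  /* second half */
  }
}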
51 changes: 31 additions & 20 deletions 09_matAdd/src/matAddAB.c
@@ -163,10 +163,11 @@ for (int i = 0; i < n; ++i) {
default(none) shared(a, b, n)
for (int j = 0; j < n; ++j) {
for (int iblk = 0; iblk < n / NTHRDS9; ++iblk) {
for (int i = iblk * NTHRDS9;
i < iblk * NTHRDS9 + NTHRDS8; ++i) {
a[j * n + i ] += b[j * n + i ];
a[j * n + i + NTHRDS8] += b[j * n + i + NTHRDS8];
for (int i = 0; i < NTHRDS8; ++i) {
a[j * n + iblk * NTHRDS9 + i ] +=
b[j * n + iblk * NTHRDS9 + i ];
a[j * n + iblk * NTHRDS9 + i + NTHRDS8] +=
b[j * n + iblk * NTHRDS9 + i + NTHRDS8];
} /* end i-loop */
} /* end iblk-loop */
} /* end j-loop */
@@ -194,12 +195,15 @@ for (int i = iblk * NTHRDS9;
default(none) shared(a, b, n, halfn)
for (int j = 0; j < n; ++j) {
for (int iblk = 0; iblk < n / NTHRDS9; ++iblk) {
for (int i = iblk * NTHRDS8;
i < iblk * NTHRDS8 + NTHRDS7; ++i) {
a[j * n + i ] += b[j * n + i ];
a[j * n + i + NTHRDS7] += b[j * n + i + NTHRDS7];
a[j * n + i + halfn ] += b[j * n + i + halfn ];
a[j * n + i + halfn + NTHRDS7] += b[j * n + i + halfn + NTHRDS7];
for (int i = 0; i < NTHRDS7; ++i) {
a[j * n + iblk * NTHRDS8 + i ] +=
b[j * n + iblk * NTHRDS8 + i ];
a[j * n + iblk * NTHRDS8 + i + NTHRDS7] +=
b[j * n + iblk * NTHRDS8 + i + NTHRDS7];
a[j * n + iblk * NTHRDS8 + i + halfn ] +=
b[j * n + iblk * NTHRDS8 + i + halfn ];
a[j * n + iblk * NTHRDS8 + i + halfn + NTHRDS7] +=
b[j * n + iblk * NTHRDS8 + i + halfn + NTHRDS7];
} /* end i-loop */
} /* end iblk-loop */
} /* end j-loop */
@@ -228,16 +232,23 @@ for (int i = iblk * NTHRDS8;
default(none) shared(a, b, n, halfn)
for (int j = 0; j < halfn; ++j) {
for (int iblk = 0; iblk < n / NTHRDS9; ++iblk) {
for (int i = iblk * NTHRDS8;
i < iblk * NTHRDS8 + NTHRDS7; ++i) {
a[ j * n + i ] += b[ j * n + i ];
a[ j * n + i + NTHRDS7] += b[ j * n + i + NTHRDS7];
a[ j * n + i + halfn ] += b[ j * n + i + halfn ];
a[ j * n + i + halfn + NTHRDS7] += b[ j * n + i + halfn + NTHRDS7];
a[(j + halfn) * n + i ] += b[(j + halfn) * n + i ];
a[(j + halfn) * n + i + NTHRDS7] += b[(j + halfn) * n + i + NTHRDS7];
a[(j + halfn) * n + i + halfn ] += b[(j + halfn) * n + i + halfn ];
a[(j + halfn) * n + i + halfn + NTHRDS7] += b[(j + halfn) * n + i + halfn + NTHRDS7];
for (int i = 0; i < NTHRDS7; ++i) {
a[ j * n + iblk * NTHRDS8 + i ] +=
b[ j * n + iblk * NTHRDS8 + i ];
a[ j * n + iblk * NTHRDS8 + i + NTHRDS7] +=
b[ j * n + iblk * NTHRDS8 + i + NTHRDS7];
a[ j * n + iblk * NTHRDS8 + i + halfn ] +=
b[ j * n + iblk * NTHRDS8 + i + halfn ];
a[ j * n + iblk * NTHRDS8 + i + halfn + NTHRDS7] +=
b[ j * n + iblk * NTHRDS8 + i + halfn + NTHRDS7];
a[(j + halfn) * n + iblk * NTHRDS8 + i ] +=
b[(j + halfn) * n + iblk * NTHRDS8 + i ];
a[(j + halfn) * n + iblk * NTHRDS8 + i + NTHRDS7] +=
b[(j + halfn) * n + iblk * NTHRDS8 + i + NTHRDS7];
a[(j + halfn) * n + iblk * NTHRDS8 + i + halfn ] +=
b[(j + halfn) * n + iblk * NTHRDS8 + i + halfn ];
a[(j + halfn) * n + iblk * NTHRDS8 + i + halfn + NTHRDS7] +=
b[(j + halfn) * n + iblk * NTHRDS8 + i + halfn + NTHRDS7];
} /* end i-loop */
} /* end iblk-loop */
} /* end j-loop */
11 changes: 0 additions & 11 deletions 09_matAdd/tests/matAdd_real_00.sh.5329174.out

This file was deleted.

11 changes: 11 additions & 0 deletions 09_matAdd/tests/matAdd_real_00.sh.5422037.out
@@ -0,0 +1,11 @@
hallo from gpu011
matrix dim: 4096 x 4096
time averaged over 64 loops
matAddAB (0) : 2.9 GB/s 94.2 GB/s maxabserr = 0.0
matAddAB (1) : 2.8 GB/s 36.3 GB/s maxabserr = 0.0
matAddAB (2) : 2.9 GB/s 49.7 GB/s maxabserr = 0.0
matAddAB (3) : 3.0 GB/s 193.5 GB/s maxabserr = 0.0
matAddAB (4) : 3.2 GB/s 194.4 GB/s maxabserr = 0.0
matAddAB (5) : 3.3 GB/s 197.9 GB/s maxabserr = 0.0
matAddAB (6) : 3.3 GB/s 195.7 GB/s maxabserr = 0.0
matAddAB (7) : 3.1 GB/s 201.1 GB/s maxabserr = 0.0
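
One plausible reading of the GB/s columns (the exact convention used by the benchmark is an assumption here, not taken from the source): matAddAB touches three n x n float matrices per call (read a, read b, write a), so the effective bandwidth is that data volume divided by the averaged runtime.

/* sketch only: effective-bandwidth arithmetic under the assumption above */
#include <stdio.h>

int main(void)
{
  const double n     = 4096.0;
  const double bytes = 3.0 * n * n * sizeof(float); /* read a, read b, write a */
  const double t     = 1.0e-3;                      /* placeholder runtime [s] */
  printf("%.1f GB/s\n", bytes / t / 1.0e9);
  return 0;
}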
23 changes: 17 additions & 6 deletions 10_matMul/README.md
@@ -12,6 +12,8 @@ numerical results are also verified.

* Column-major storage is assumed throughout the entire code!

* `i` and `j` are indices for row and column, respectively.

* For testing, the dimensions of all matrices are assumed to be 4096 x 4096.

* The following table only summarizes the most important points. For more
@@ -29,16 +31,25 @@ numerical results are also verified.
| 4 | jik-loop, 2^9 threads * 2^f teams, collapse(2), |
| | 4x k-loop unrolling |
| 5 | jik-loop, 2^7 threads * 2^f teams, collapse(3), |
| | 4x i-loop unrolling (2x + 2x), |
| | 4x i-loop unrolling (stride of 2^7 rows), |
| | 4x k-loop unrolling, |
| | rb: 4x data reuse |
| 6 | jik-loop, 2^7 threads * 2^e teams, collapse(3), |
| | 2x j-loop unrolling, |
| | 4x i-loop unrolling (2x + 2x), |
| 6 | jik-loop, 2^7 threads * 2^d teams, collapse(3), |
| | 4x j-loop unrolling (stride of 1 col ), |
| | 4x i-loop unrolling (stride of 2^7 rows), |
| | 4x k-loop unrolling, |
| | ra: 4x data reuse, |
| | rb: 4x data reuse, |
| | register blocking |
| 7 | based on (2), jik-loop, 2^8 threads * 2^g teams, collapse(2) |
| 8 | based on (7), jik-loop, 2^8 threads * 2^g teams, collapse(2), |
| | GPU shared memory for data re-use, 16x k-loop unrolling, |
| | shared memory blocking |
| 9 | based on (5), jik-loop, 2^7 threads * 2^f teams, collapse(2), |
| | 4x i-loop unrolling (stride of n/4 rows), |
| | 4x k-loop unrolling, |
| | ra: 2x data reuse, |
| | rb: 4x data reuse |
| 7 | cublasSgemm in CUBLAS |
| 10 | cublasSgemm in CUBLAS |
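
As a rough sketch of the baseline jik-loop entries in the table (the function name is hypothetical; see matMulAB.c for the actual implementations), using the column-major indexing stated above:

```c
/* sketch only: jik-loop, collapse(2) over (j, i), column-major storage */
void matmul_jik(int n, const float *a, const float *b, float *c)
{
  #pragma omp target teams distribute parallel for collapse(2) \
          map(to: a[0:n*n], b[0:n*n]) map(tofrom: c[0:n*n])
  for (int j = 0; j < n; ++j) {
    for (int i = 0; i < n; ++i) {
      float sum = 0.0f;
      for (int k = 0; k < n; ++k) {
        sum += a[k * n + i] * b[j * n + k]; /* C(i,j) += A(i,k) * B(k,j) */
      }
      c[j * n + i] = sum;
    }
  }
}
```

Consecutive values of `i` map to consecutive threads, so the accesses to `a` and `c` are coalesced; the unrolling and blocking variants in the table refine this baseline.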

# Build

4 changes: 2 additions & 2 deletions 10_matMul/configure.ac
@@ -39,9 +39,9 @@ LDFLAGS+="-L${CUDALIB} -L${MKLLIB}"
#
AC_PROG_CC([clang gcc])
AS_IF([test "${CC}" = gcc],
[CFLAGS="-Wall -fopenmp -foffload=nvptx-none $CFLAGS"])
[CFLAGS="-Wall -O2 -fopenmp -foffload=nvptx-none $CFLAGS"])
AS_IF([test "${CC}" = clang],
[CFLAGS="-Wall -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda \
[CFLAGS="-Wall -Werror -O2 -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
-Xopenmp-target -march=sm_61 $CFLAGS"])
##############################################################################80
#
21 changes: 15 additions & 6 deletions 10_matMul/docs/UserManual.md
@@ -29,16 +29,25 @@ numerical results are also verified.
| 4 | jik-loop, 2^9 threads * 2^f teams, collapse(2), |
| | 4x k-loop unrolling |
| 5 | jik-loop, 2^7 threads * 2^f teams, collapse(3), |
| | 4x i-loop unrolling (2x + 2x), |
| | 4x i-loop unrolling (stride of 2^7 rows), |
| | 4x k-loop unrolling, |
| | rb: 4x data reuse |
| 6 | jik-loop, 2^7 threads * 2^e teams, collapse(3), |
| | 2x j-loop unrolling, |
| | 4x i-loop unrolling (2x + 2x), |
| 6 | jik-loop, 2^7 threads * 2^d teams, collapse(3), |
| | 4x j-loop unrolling (stride of 1 col ), |
| | 4x i-loop unrolling (stride of 2^7 rows), |
| | 4x k-loop unrolling, |
| | ra: 4x data reuse, |
| | rb: 4x data reuse, |
| | register blocking |
| 7 | based on (2), jik-loop, 2^8 threads * 2^g teams, collapse(2) |
| 8 | based on (7), jik-loop, 2^8 threads * 2^g teams, collapse(2), |
| | GPU shared memory for data re-use, 16x k-loop unrolling, |
| | shared memory blocking |
| 9 | based on (5), jik-loop, 2^7 threads * 2^f teams, collapse(2), |
| | 4x i-loop unrolling (stride of n/4 rows), |
| | 4x k-loop unrolling, |
| | ra: 2x data reuse, |
| | rb: 4x data reuse |
| 7 | cublasSgemm in CUBLAS |
| 10 | cublasSgemm in CUBLAS |
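
The register-blocking entry (6) can be sketched as follows; this is simplified to a 2 x 2 block per thread with unit strides (the actual code unrolls 4x, striding `i` by 2^7 rows and `j` by 1 column, as listed above), and the function name is hypothetical:

```c
/* sketch only: each thread accumulates a 2 x 2 block of c in registers */
void matmul_regblk(int n, const float *a, const float *b, float *c)
{
  /* column-major storage; assumes n is even */
  #pragma omp target teams distribute parallel for collapse(2) \
          map(to: a[0:n*n], b[0:n*n]) map(tofrom: c[0:n*n])
  for (int j = 0; j < n; j += 2) {
    for (int i = 0; i < n; i += 2) {
      float c00 = 0.0f, c10 = 0.0f, c01 = 0.0f, c11 = 0.0f;
      for (int k = 0; k < n; ++k) {
        float a0 = a[k * n + i    ];    /* ra: reused for both columns */
        float a1 = a[k * n + i + 1];
        float b0 = b[ j      * n + k];  /* rb: reused for both rows    */
        float b1 = b[(j + 1) * n + k];
        c00 += a0 * b0;  c10 += a1 * b0;
        c01 += a0 * b1;  c11 += a1 * b1;
      }
      c[ j      * n + i    ] = c00;
      c[ j      * n + i + 1] = c10;
      c[(j + 1) * n + i    ] = c01;
      c[(j + 1) * n + i + 1] = c11;
    }
  }
}
```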

# Usage

25 changes: 19 additions & 6 deletions 10_matMul/src/matMul.c
@@ -75,7 +75,7 @@ int main(int argc, char *argv[])
/*
* matMul on accl
*/
for (ial = 0; ial < 9; ++ial) {
for (ial = 0; ial < 11; ++ial) {
/*
* See matMulAB.c for details:
*
@@ -96,15 +96,28 @@
* 4x k-loop unrolling
*
* 5: jik-loop, 2^7 threads * 2^f teams, collapse(3),
* 4x i-loop unrolling (2x + 2x),
* 4x i-loop unrolling (stride of 2^7 rows),
* 4x k-loop unrolling,
* rb: 4x data reuse
*
* 6: jik-loop, 2^7 threads * 2^e teams, collapse(3),
* 2x j-loop unrolling,
* 4x i-loop unrolling (2x + 2x),
* 6: jik-loop, 2^7 threads * 2^d teams, collapse(3),
* 4x j-loop unrolling (stride of 1 col ),
* 4x i-loop unrolling (stride of 2^7 rows),
* 4x k-loop unrolling,
* rb: 4x data reuse,
* ra: 4x data reuse,
* register blocking
*
* 7: based on (2), jik-loop, 2^8 threads * 2^g teams, collapse(2)
*
* 8: based on (7), jik-loop, 2^8 threads * 2^g teams, collapse(2)
* GPU shared memory for data re-use,
* 16x k-loop unrolling,
* shared memory blocking
*
* 9: based on (5), jik-loop, 2^7 threads * 2^f teams, collapse(2),
* 4x i-loop unrolling (stride of n/4 rows),
* 4x k-loop unrolling,
* ra: 2x data reuse,
* rb: 4x data reuse
*
* otherwise: cublasSgemm in CUBLAS
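
For reference, the CUBLAS fallback named above reduces to a single cublasSgemm call. A minimal sketch, assuming n x n column-major buffers d_a, d_b, d_c already resident on the device (allocation, transfers, and error handling omitted; see matMulAB.c for the real call site):

#include <cublas_v2.h>

/* sketch only: C = A * B in single precision via CUBLAS */
void matmul_cublas(cublasHandle_t handle, int n,
                   const float *d_a, const float *d_b, float *d_c)
{
  const float alpha = 1.0f, beta = 0.0f;
  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
              n, n, n, &alpha, d_a, n, d_b, n, &beta, d_c, n);
}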
