From a5548103832bd0b6d91101462843167c817854de Mon Sep 17 00:00:00 2001 From: xwuupb Date: Wed, 15 Apr 2020 14:53:40 +0200 Subject: [PATCH] improved 10_matMul --- 08_distThreads/src/gpuThreads.c | 70 ++- 09_matAdd/src/matAddAB.c | 51 +- 09_matAdd/tests/matAdd_real_00.sh.5329174.out | 11 - 09_matAdd/tests/matAdd_real_00.sh.5422037.out | 11 + 10_matMul/README.md | 23 +- 10_matMul/configure.ac | 4 +- 10_matMul/docs/UserManual.md | 21 +- 10_matMul/src/matMul.c | 25 +- 10_matMul/src/matMulAB.c | 557 +++++++++++++----- 10_matMul/tests/matMul_real_00.sh.5323229.out | 12 - 10_matMul/tests/matMul_real_00.sh.5422034.out | 14 + 11 files changed, 549 insertions(+), 250 deletions(-) delete mode 100644 09_matAdd/tests/matAdd_real_00.sh.5329174.out create mode 100644 09_matAdd/tests/matAdd_real_00.sh.5422037.out delete mode 100644 10_matMul/tests/matMul_real_00.sh.5323229.out create mode 100644 10_matMul/tests/matMul_real_00.sh.5422034.out diff --git a/08_distThreads/src/gpuThreads.c b/08_distThreads/src/gpuThreads.c index 3e1971b..bdeefa5 100644 --- a/08_distThreads/src/gpuThreads.c +++ b/08_distThreads/src/gpuThreads.c @@ -4,6 +4,8 @@ * * This source file contains function definition for organizing GPU threads. * + * thread_limit for the teams construct is omitted for clarity. + * * @author Xin Wu (PC²) * @data 12.03.2020 * @copyright CC BY-SA 2.0 @@ -304,6 +306,17 @@ void gpuThreads(int i) * 5. Nested loop with collapse(3). * 6. It features coalesced GPU memory access and good performance. * 7. +10 to each unrolled thread is used to label the 2x irow-loop unrolling. + * + * Caveat: especially for the innermost loop + * + * OpenMP API Specification: Version 5.0 November 2018 + * + * https://www.openmp.org/spec-html/5.0/openmpsu44.html + * + * If a collapse clause is specified with a parameter value greater than 1, then + * the iterations of the associated loops to which the clause applies are + * collapsed into one larger iteration space with *unspecified ordering*. + * */ ncol = 6; nrow =12; @@ -320,16 +333,15 @@ void gpuThreads(int i) default(none) shared(ncol, nrow, wblk, lteams, nthrds, league) for (int icol = 0; icol < ncol; ++icol) { for (int iblk = 0; iblk < nrow / wblk; ++iblk) { - for (int irow = iblk * wblk; - irow < iblk * wblk + nthrds; ++irow) { - league[icol * nrow + irow ].itd = omp_get_thread_num(); - league[icol * nrow + irow ].ntd = omp_get_num_threads(); - league[icol * nrow + irow ].itm = omp_get_team_num(); - league[icol * nrow + irow ].ltm = omp_get_num_teams(); - league[icol * nrow + irow + nthrds].itd = omp_get_thread_num() + 10; - league[icol * nrow + irow + nthrds].ntd = omp_get_num_threads(); - league[icol * nrow + irow + nthrds].itm = omp_get_team_num(); - league[icol * nrow + irow + nthrds].ltm = omp_get_num_teams(); + for (int irow = 0; irow < nthrds; ++irow) { +league[icol * nrow + iblk * wblk + irow ].itd = omp_get_thread_num(); +league[icol * nrow + iblk * wblk + irow ].ntd = omp_get_num_threads(); +league[icol * nrow + iblk * wblk + irow ].itm = omp_get_team_num(); +league[icol * nrow + iblk * wblk + irow ].ltm = omp_get_num_teams(); +league[icol * nrow + iblk * wblk + irow + nthrds].itd = omp_get_thread_num() + 10; +league[icol * nrow + iblk * wblk + irow + nthrds].ntd = omp_get_num_threads(); +league[icol * nrow + iblk * wblk + irow + nthrds].itm = omp_get_team_num(); +league[icol * nrow + iblk * wblk + irow + nthrds].ltm = omp_get_num_teams(); } } } @@ -344,6 +356,9 @@ void gpuThreads(int i) * 5. Nested loop with collapse(3). * 6. 
+10 to each unrolled team is used to label the 2x icol-loop unrolling. * 7. +10 to each unrolled thread is used to label the 2x irow-loop unrolling. + * + * More work for each thread is an approach to achieve high performance. + * */ ncol = 6; nrow =12; @@ -360,24 +375,23 @@ void gpuThreads(int i) default(none) shared(ncol, nrow, wblk, lteams, nthrds, league) for (int icol = 0; icol < ncol; icol += 2) { for (int iblk = 0; iblk < nrow / wblk; ++iblk) { - for (int irow = iblk * wblk; - irow < iblk * wblk + nthrds; ++irow) { - league[ icol * nrow + irow ].itd = omp_get_thread_num(); - league[ icol * nrow + irow ].ntd = omp_get_num_threads(); - league[ icol * nrow + irow ].itm = omp_get_team_num(); - league[ icol * nrow + irow ].ltm = omp_get_num_teams(); - league[ icol * nrow + irow + nthrds].itd = omp_get_thread_num() + 10; - league[ icol * nrow + irow + nthrds].ntd = omp_get_num_threads(); - league[ icol * nrow + irow + nthrds].itm = omp_get_team_num(); - league[ icol * nrow + irow + nthrds].ltm = omp_get_num_teams(); - league[(icol + 1) * nrow + irow ].itd = omp_get_thread_num(); - league[(icol + 1) * nrow + irow ].ntd = omp_get_num_threads(); - league[(icol + 1) * nrow + irow ].itm = omp_get_team_num() + 10; - league[(icol + 1) * nrow + irow ].ltm = omp_get_num_teams(); - league[(icol + 1) * nrow + irow + nthrds].itd = omp_get_thread_num() + 10; - league[(icol + 1) * nrow + irow + nthrds].ntd = omp_get_num_threads(); - league[(icol + 1) * nrow + irow + nthrds].itm = omp_get_team_num() + 10; - league[(icol + 1) * nrow + irow + nthrds].ltm = omp_get_num_teams(); + for (int irow = 0; irow < nthrds; ++irow) { +league[ icol * nrow + iblk * wblk + irow ].itd = omp_get_thread_num(); +league[ icol * nrow + iblk * wblk + irow ].ntd = omp_get_num_threads(); +league[ icol * nrow + iblk * wblk + irow ].itm = omp_get_team_num(); +league[ icol * nrow + iblk * wblk + irow ].ltm = omp_get_num_teams(); +league[ icol * nrow + iblk * wblk + irow + nthrds].itd = omp_get_thread_num() + 10; +league[ icol * nrow + iblk * wblk + irow + nthrds].ntd = omp_get_num_threads(); +league[ icol * nrow + iblk * wblk + irow + nthrds].itm = omp_get_team_num(); +league[ icol * nrow + iblk * wblk + irow + nthrds].ltm = omp_get_num_teams(); +league[(icol + 1) * nrow + iblk * wblk + irow ].itd = omp_get_thread_num(); +league[(icol + 1) * nrow + iblk * wblk + irow ].ntd = omp_get_num_threads(); +league[(icol + 1) * nrow + iblk * wblk + irow ].itm = omp_get_team_num() + 10; +league[(icol + 1) * nrow + iblk * wblk + irow ].ltm = omp_get_num_teams(); +league[(icol + 1) * nrow + iblk * wblk + irow + nthrds].itd = omp_get_thread_num() + 10; +league[(icol + 1) * nrow + iblk * wblk + irow + nthrds].ntd = omp_get_num_threads(); +league[(icol + 1) * nrow + iblk * wblk + irow + nthrds].itm = omp_get_team_num() + 10; +league[(icol + 1) * nrow + iblk * wblk + irow + nthrds].ltm = omp_get_num_teams(); } } } diff --git a/09_matAdd/src/matAddAB.c b/09_matAdd/src/matAddAB.c index d580f7c..6cec8be 100644 --- a/09_matAdd/src/matAddAB.c +++ b/09_matAdd/src/matAddAB.c @@ -163,10 +163,11 @@ for (int i = 0; i < n; ++i) { default(none) shared(a, b, n) for (int j = 0; j < n; ++j) { for (int iblk = 0; iblk < n / NTHRDS9; ++iblk) { -for (int i = iblk * NTHRDS9; - i < iblk * NTHRDS9 + NTHRDS8; ++i) { - a[j * n + i ] += b[j * n + i ]; - a[j * n + i + NTHRDS8] += b[j * n + i + NTHRDS8]; +for (int i = 0; i < NTHRDS8; ++i) { + a[j * n + iblk * NTHRDS9 + i ] += + b[j * n + iblk * NTHRDS9 + i ]; + a[j * n + iblk * NTHRDS9 + i + NTHRDS8] += + b[j * n + iblk * 
NTHRDS9 + i + NTHRDS8]; } /* end i-loop */ } /* end iblk-loop */ } /* end j-loop */ @@ -194,12 +195,15 @@ for (int i = iblk * NTHRDS9; default(none) shared(a, b, n, halfn) for (int j = 0; j < n; ++j) { for (int iblk = 0; iblk < n / NTHRDS9; ++iblk) { -for (int i = iblk * NTHRDS8; - i < iblk * NTHRDS8 + NTHRDS7; ++i) { - a[j * n + i ] += b[j * n + i ]; - a[j * n + i + NTHRDS7] += b[j * n + i + NTHRDS7]; - a[j * n + i + halfn ] += b[j * n + i + halfn ]; - a[j * n + i + halfn + NTHRDS7] += b[j * n + i + halfn + NTHRDS7]; +for (int i = 0; i < NTHRDS7; ++i) { + a[j * n + iblk * NTHRDS8 + i ] += + b[j * n + iblk * NTHRDS8 + i ]; + a[j * n + iblk * NTHRDS8 + i + NTHRDS7] += + b[j * n + iblk * NTHRDS8 + i + NTHRDS7]; + a[j * n + iblk * NTHRDS8 + i + halfn ] += + b[j * n + iblk * NTHRDS8 + i + halfn ]; + a[j * n + iblk * NTHRDS8 + i + halfn + NTHRDS7] += + b[j * n + iblk * NTHRDS8 + i + halfn + NTHRDS7]; } /* end i-loop */ } /* end iblk-loop */ } /* end j-loop */ @@ -228,16 +232,23 @@ for (int i = iblk * NTHRDS8; default(none) shared(a, b, n, halfn) for (int j = 0; j < halfn; ++j) { for (int iblk = 0; iblk < n / NTHRDS9; ++iblk) { -for (int i = iblk * NTHRDS8; - i < iblk * NTHRDS8 + NTHRDS7; ++i) { - a[ j * n + i ] += b[ j * n + i ]; - a[ j * n + i + NTHRDS7] += b[ j * n + i + NTHRDS7]; - a[ j * n + i + halfn ] += b[ j * n + i + halfn ]; - a[ j * n + i + halfn + NTHRDS7] += b[ j * n + i + halfn + NTHRDS7]; - a[(j + halfn) * n + i ] += b[(j + halfn) * n + i ]; - a[(j + halfn) * n + i + NTHRDS7] += b[(j + halfn) * n + i + NTHRDS7]; - a[(j + halfn) * n + i + halfn ] += b[(j + halfn) * n + i + halfn ]; - a[(j + halfn) * n + i + halfn + NTHRDS7] += b[(j + halfn) * n + i + halfn + NTHRDS7]; +for (int i = 0; i < NTHRDS7; ++i) { + a[ j * n + iblk * NTHRDS8 + i ] += + b[ j * n + iblk * NTHRDS8 + i ]; + a[ j * n + iblk * NTHRDS8 + i + NTHRDS7] += + b[ j * n + iblk * NTHRDS8 + i + NTHRDS7]; + a[ j * n + iblk * NTHRDS8 + i + halfn ] += + b[ j * n + iblk * NTHRDS8 + i + halfn ]; + a[ j * n + iblk * NTHRDS8 + i + halfn + NTHRDS7] += + b[ j * n + iblk * NTHRDS8 + i + halfn + NTHRDS7]; + a[(j + halfn) * n + iblk * NTHRDS8 + i ] += + b[(j + halfn) * n + iblk * NTHRDS8 + i ]; + a[(j + halfn) * n + iblk * NTHRDS8 + i + NTHRDS7] += + b[(j + halfn) * n + iblk * NTHRDS8 + i + NTHRDS7]; + a[(j + halfn) * n + iblk * NTHRDS8 + i + halfn ] += + b[(j + halfn) * n + iblk * NTHRDS8 + i + halfn ]; + a[(j + halfn) * n + iblk * NTHRDS8 + i + halfn + NTHRDS7] += + b[(j + halfn) * n + iblk * NTHRDS8 + i + halfn + NTHRDS7]; } /* end i-loop */ } /* end iblk-loop */ } /* end j-loop */ diff --git a/09_matAdd/tests/matAdd_real_00.sh.5329174.out b/09_matAdd/tests/matAdd_real_00.sh.5329174.out deleted file mode 100644 index d3dd40f..0000000 --- a/09_matAdd/tests/matAdd_real_00.sh.5329174.out +++ /dev/null @@ -1,11 +0,0 @@ -hallo from gpu029 -matrix dim: 4096 x 4096 -time averaged over 64 loops -matAddAB (0) : 2.0 GB/s 86.6 GB/s maxabserr = 0.0 -matAddAB (1) : 2.0 GB/s 35.8 GB/s maxabserr = 0.0 -matAddAB (2) : 2.0 GB/s 49.1 GB/s maxabserr = 0.0 -matAddAB (3) : 2.1 GB/s 179.9 GB/s maxabserr = 0.0 -matAddAB (4) : 2.1 GB/s 180.1 GB/s maxabserr = 0.0 -matAddAB (5) : 2.0 GB/s 180.8 GB/s maxabserr = 0.0 -matAddAB (6) : 2.1 GB/s 184.1 GB/s maxabserr = 0.0 -matAddAB (7) : 2.0 GB/s 180.8 GB/s maxabserr = 0.0 diff --git a/09_matAdd/tests/matAdd_real_00.sh.5422037.out b/09_matAdd/tests/matAdd_real_00.sh.5422037.out new file mode 100644 index 0000000..dde9ad9 --- /dev/null +++ b/09_matAdd/tests/matAdd_real_00.sh.5422037.out @@ -0,0 +1,11 @@ +hallo 
from gpu011 +matrix dim: 4096 x 4096 +time averaged over 64 loops +matAddAB (0) : 2.9 GB/s 94.2 GB/s maxabserr = 0.0 +matAddAB (1) : 2.8 GB/s 36.3 GB/s maxabserr = 0.0 +matAddAB (2) : 2.9 GB/s 49.7 GB/s maxabserr = 0.0 +matAddAB (3) : 3.0 GB/s 193.5 GB/s maxabserr = 0.0 +matAddAB (4) : 3.2 GB/s 194.4 GB/s maxabserr = 0.0 +matAddAB (5) : 3.3 GB/s 197.9 GB/s maxabserr = 0.0 +matAddAB (6) : 3.3 GB/s 195.7 GB/s maxabserr = 0.0 +matAddAB (7) : 3.1 GB/s 201.1 GB/s maxabserr = 0.0 diff --git a/10_matMul/README.md b/10_matMul/README.md index 82cdc60..131cd3e 100644 --- a/10_matMul/README.md +++ b/10_matMul/README.md @@ -12,6 +12,8 @@ numerical results are also verified. * Column-major is assumed thru the entire code! +* `i` and `j` are indices for row and column, respectively. + * For testing the dimension of all matrices are assumed to be 4096 x 4096. * The following table only summarizes the most important points. For more @@ -29,16 +31,25 @@ numerical results are also verified. | 4 | jik-loop, 2^9 threads * 2^f teams, collapse(2), | | | 4x k-loop unrolling | | 5 | jik-loop, 2^7 threads * 2^f teams, collapse(3), | -| | 4x i-loop unrolling (2x + 2x), | +| | 4x i-loop unrolling (stride of 2^7 rows), | | | 4x k-loop unrolling, | | | rb: 4x data reuse | -| 6 | jik-loop, 2^7 threads * 2^e teams, collapse(3), | -| | 2x j-loop unrolling, | -| | 4x i-loop unrolling (2x + 2x), | +| 6 | jik-loop, 2^7 threads * 2^d teams, collapse(3), | +| | 4x j-loop unrolling (stride of 1 col ), | +| | 4x i-loop unrolling (stride of 2^7 rows), | +| | 4x k-loop unrolling, | +| | ra: 4x data reuse, | +| | rb: 4x data reuse, | +| | register blocking | +| 7 | based on (2), jik-loop, 2^8 threads * 2^g teams, collapse(2) | +| 8 | based on (7), jik-loop, 2^8 threads * 2^g teams, collapse(2), | +| | GPU shared memory for data re-use, 16x k-loop unrolling, | +| | shared memory blocking | +| 9 | based on (5), jik-loop, 2^7 threads * 2^f teams, collapse(2), | +| | 4x i-loop unrolling (stride of n/4 rows), | | | 4x k-loop unrolling, | -| | ra: 2x data reuse, | | | rb: 4x data reuse | -| 7 | cublasSgemm in CUBLAS | +| 10 | cublasSgemm in CUBLAS | # Build diff --git a/10_matMul/configure.ac b/10_matMul/configure.ac index 9b264e7..dcdb628 100644 --- a/10_matMul/configure.ac +++ b/10_matMul/configure.ac @@ -39,9 +39,9 @@ LDFLAGS+="-L${CUDALIB} -L${MKLLIB}" # AC_PROG_CC([clang gcc]) AS_IF([test "${CC}" = gcc], - [CFLAGS="-Wall -fopenmp -foffload=nvptx-none $CFLAGS"]) + [CFLAGS="-Wall -O2 -fopenmp -foffload=nvptx-none $CFLAGS"]) AS_IF([test "${CC}" = clang], - [CFLAGS="-Wall -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda \ + [CFLAGS="-Wall -Werror -O2 -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ -Xopenmp-target -march=sm_61 $CFLAGS"]) ##############################################################################80 # diff --git a/10_matMul/docs/UserManual.md b/10_matMul/docs/UserManual.md index ca6ac34..21f78fb 100644 --- a/10_matMul/docs/UserManual.md +++ b/10_matMul/docs/UserManual.md @@ -29,16 +29,25 @@ numerical results are also verified. 
| 4 | jik-loop, 2^9 threads * 2^f teams, collapse(2), | | | 4x k-loop unrolling | | 5 | jik-loop, 2^7 threads * 2^f teams, collapse(3), | -| | 4x i-loop unrolling (2x + 2x), | +| | 4x i-loop unrolling (stride of 2^7 rows), | | | 4x k-loop unrolling, | | | rb: 4x data reuse | -| 6 | jik-loop, 2^7 threads * 2^e teams, collapse(3), | -| | 2x j-loop unrolling, | -| | 4x i-loop unrolling (2x + 2x), | +| 6 | jik-loop, 2^7 threads * 2^d teams, collapse(3), | +| | 4x j-loop unrolling (stride of 1 col ), | +| | 4x i-loop unrolling (stride of 2^7 rows), | +| | 4x k-loop unrolling, | +| | ra: 4x data reuse, | +| | rb: 4x data reuse, | +| | register blocking | +| 7 | based on (2), jik-loop, 2^8 threads * 2^g teams, collapse(2) | +| 8 | based on (7), jik-loop, 2^8 threads * 2^g teams, collapse(2), | +| | GPU shared memory for data re-use, 16x k-loop unrolling, | +| | shared memory blocking | +| 9 | based on (5), jik-loop, 2^7 threads * 2^f teams, collapse(2), | +| | 4x i-loop unrolling (stride of n/4 rows), | | | 4x k-loop unrolling, | -| | ra: 2x data reuse, | | | rb: 4x data reuse | -| 7 | cublasSgemm in CUBLAS | +| 10 | cublasSgemm in CUBLAS | # Usage diff --git a/10_matMul/src/matMul.c b/10_matMul/src/matMul.c index b5e458a..526f00c 100644 --- a/10_matMul/src/matMul.c +++ b/10_matMul/src/matMul.c @@ -75,7 +75,7 @@ int main(int argc, char *argv[]) /* * matMul on accl */ - for (ial = 0; ial < 9; ++ial) { + for (ial = 0; ial < 11; ++ial) { /* * See matMulAB.c for details: * @@ -96,15 +96,28 @@ int main(int argc, char *argv[]) * 4x k-loop unrolling * * 5: jik-loop, 2^7 threads * 2^f teams, collapse(3), - * 4x i-loop unrolling (2x + 2x), + * 4x i-loop unrolling (stride of 2^7 rows), * 4x k-loop unrolling, * rb: 4x data reuse * - * 6: jik-loop, 2^7 threads * 2^e teams, collapse(3), - * 2x j-loop unrolling, - * 4x i-loop unrolling (2x + 2x), + * 6: jik-loop, 2^7 threads * 2^d teams, collapse(3), + * 4x j-loop unrolling (stride of 1 col ), + * 4x i-loop unrolling (stride of 2^7 rows), + * 4x k-loop unrolling, + * rb: 4x data reuse, + * ra: 4x data reuse, + * register blocking + * + * 7: based on (2), jik-loop, 2^8 threads * 2^g teams, collapse(2) + * + * 8: based on (7), jik-loop, 2^8 threads * 2^g teams, collapse(2) + * GPU shared memory for data re-use, + * 16x k-loop unrolling, + * shared memory blocking + * + * 9: based on (5), jik-loop, 2^7 threads * 2^f teams, collapse(2), + * 4x i-loop unrolling (stride of n/4 rows), * 4x k-loop unrolling, - * ra: 2x data reuse, * rb: 4x data reuse * * otherwise: cublasSgemm in CUBLAS diff --git a/10_matMul/src/matMulAB.c b/10_matMul/src/matMulAB.c index de63847..21cf6a8 100644 --- a/10_matMul/src/matMulAB.c +++ b/10_matMul/src/matMulAB.c @@ -11,10 +11,6 @@ * @copyright CC BY-SA 2.0 */ -#ifdef __cplusplus -extern "C" { -#endif - #include #include #include @@ -25,13 +21,18 @@ extern "C" { #include "cublas_v2.h" #include "matMulAB.h" -#define NTHRDS7 (1 << 0x7) -#define NTHRDS8 (1 << 0x8) -#define NTHRDS9 (1 << 0x9) +#define NTHRDS7 (1 << 0x7) /* 2^{7} */ +#define NTHRDS8 (1 << 0x8) /* 2^{8} */ +#define NTHRDS9 (1 << 0x9) /* 2^{9} */ + +#define LTEAMSD (1 << 0xD) /* 2^{13} */ +#define LTEAMSE (1 << 0xE) /* 2^{14} */ +#define LTEAMSF (1 << 0xF) /* 2^{15} */ +#define LTEAMSG (1 << 020) /* 2^{16} */ -#define LTEAMSD (1 << 0xD) -#define LTEAMSE (1 << 0xE) -#define LTEAMSF (1 << 0xF) +#define NPITCH (1024) +#define BLKROW (512) +#define BLKDIM (16) double wtcalc; @@ -48,7 +49,6 @@ void matMulAB_accl(float *a, *b_dev = NULL, *c_dev = NULL; struct timespec rt[2]; - int halfn 
= n / 2; switch (ial) { case 0: @@ -63,7 +63,7 @@ void matMulAB_accl(float *a, map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) { clock_gettime(CLOCK_REALTIME, rt + 0); -#pragma omp target teams device(0) num_teams(LTEAMSF) \ +#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ default(none) shared(a, b, c, n) #pragma omp distribute parallel for num_threads(NTHRDS9) \ @@ -94,7 +94,7 @@ for (int i = 0; i < n; ++i) { /* sequential */ map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) { clock_gettime(CLOCK_REALTIME, rt + 0); -#pragma omp target teams device(0) num_teams(LTEAMSF) \ +#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ default(none) shared(a, b, c, n) #pragma omp distribute parallel for num_threads(NTHRDS9) \ @@ -123,7 +123,7 @@ for (int k = 0; k < n; ++k) { /* sequential */ map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) { clock_gettime(CLOCK_REALTIME, rt + 0); -#pragma omp target teams device(0) num_teams(LTEAMSF) \ +#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ default(none) shared(a, b, c, n) #pragma omp distribute parallel for num_threads(NTHRDS9) \ @@ -154,7 +154,7 @@ for (int i = 0; i < n; ++i) { /* parallel */ map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) { clock_gettime(CLOCK_REALTIME, rt + 0); -#pragma omp target teams device(0) num_teams(LTEAMSF) \ +#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ default(none) shared(a, b, c, n) #pragma omp distribute parallel for num_threads(NTHRDS9) \ @@ -177,12 +177,18 @@ for (int k = 0; k < n; ++k) { /* parallel */ * - jik-loop * - 2^9 threads per team and 2^15 teams * - 4x k-loop unrolling + * + * good: more work for one thread per iteration. + * bad : one thread must read b 4 times in k-loop. + * all threads in a team do the same read of b (waste of instructions). + * tips: each thread reads the corresponding element in b and + * saves it in shared memory. 
*/ #pragma omp target data device(0) \ map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) { clock_gettime(CLOCK_REALTIME, rt + 0); -#pragma omp target teams device(0) num_teams(LTEAMSF) \ +#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ default(none) shared(a, b, c, n) #pragma omp distribute parallel for num_threads(NTHRDS9) \ @@ -193,15 +199,10 @@ for (int i = 0; i < n; ++i) { float rc; rc = c[j * n + i]; for (int k = 0; k < n; k += 4) { /* 4x unrolling */ - float rb0, rb1, rb2, rb3; - rb0 = b[j * n + k ]; - rb1 = b[j * n + k + 1]; - rb2 = b[j * n + k + 2]; - rb3 = b[j * n + k + 3]; - rc += a[ k * n + i] * rb0 - + a[(k + 1) * n + i] * rb1 - + a[(k + 2) * n + i] * rb2 - + a[(k + 3) * n + i] * rb3; + rc += a[ k * n + i] * b[j * n + k ]; + rc += a[(k + 1) * n + i] * b[j * n + k + 1]; + rc += a[(k + 2) * n + i] * b[j * n + k + 2]; + rc += a[(k + 3) * n + i] * b[j * n + k + 3]; } c[j * n + i] = rc; } /* end i-loop */ @@ -214,58 +215,61 @@ for (int i = 0; i < n; ++i) { * - jik-loop * - 2^7 threads per team and 2^15 teams * - collapse(3) - * - 4x i-loop unrolling - * * 2x by number of threads - * * 2x by half of rows + * - 4x i-loop unrolling (stride of 2^7 rows) * - 4x k-loop unrolling - * - rb: 4x data reuse + * - rb: 4x data re-use + * + * The integer calculation of matrix indices looks ugly. But considering the GPU + * hardware architecture, e.g. many separate INT32 units, these calculations are + * much faster than accessing GPU global memory and save the precious registers. + * */ #pragma omp target data device(0) \ - map(to:n, halfn, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) + map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) { clock_gettime(CLOCK_REALTIME, rt + 0); -#pragma omp target teams device(0) num_teams(LTEAMSF) \ - map(to:n, halfn, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ - default(none) shared(a, b, c, n, halfn) +#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS7) \ + map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ + default(none) shared(a, b, c, n) #pragma omp distribute parallel for num_threads(NTHRDS7) \ dist_schedule(static, NTHRDS7) collapse(3) \ - default(none) shared(a, b, c, n, halfn) + default(none) shared(a, b, c, n) for (int j = 0; j < n; ++j) { -for (int iblk = 0; iblk < n / NTHRDS9; ++iblk) { /* 4x unrolling */ -for (int i = iblk * NTHRDS8; - i < iblk * NTHRDS8 + NTHRDS7; ++i) { +for (int iblk = 0; iblk < n / BLKROW; ++iblk) { +for (int i = 0; i < NTHRDS7; ++i) { /* 4x unrolling */ float rc0, rc1, rc2, rc3; - rc0 = c[j * n + i ]; - rc1 = c[j * n + i + NTHRDS7 ]; - rc2 = c[j * n + i + halfn]; - rc3 = c[j * n + i + NTHRDS7 + halfn]; + rc0 = c[j * n + iblk * BLKROW + i ]; + rc1 = c[j * n + iblk * BLKROW + i + NTHRDS7 ]; + rc2 = c[j * n + iblk * BLKROW + i + NTHRDS7 * 2]; + rc3 = c[j * n + iblk * BLKROW + i + NTHRDS7 * 3]; for (int k = 0; k < n; k += 4) { /* 4x unrolling */ + /* register for b: 4x k-loop */ float rb0, rb1, rb2, rb3; - rb0 = b[j * n + k ]; - rb1 = b[j * n + k + 1]; - rb2 = b[j * n + k + 2]; - rb3 = b[j * n + k + 3]; - rc0+= a[ k * n + i ] * rb0 - + a[(k + 1) * n + i ] * rb1 - + a[(k + 2) * n + i ] * rb2 - + a[(k + 3) * n + i ] * rb3; - rc1+= a[ k * n + i + NTHRDS7 ] * rb0 - + a[(k + 1) * n + i + NTHRDS7 ] * rb1 - + a[(k + 2) * n + i + NTHRDS7 ] * rb2 - + a[(k + 3) * n + i + NTHRDS7 ] * rb3; - rc2+= a[ k * n + i + halfn] * rb0 - + a[(k + 1) * n + i + halfn] * rb1 - + a[(k + 2) * n + i + halfn] * rb2 - + a[(k 
+ 3) * n + i + halfn] * rb3; - rc3+= a[ k * n + i + NTHRDS7 + halfn] * rb0 - + a[(k + 1) * n + i + NTHRDS7 + halfn] * rb1 - + a[(k + 2) * n + i + NTHRDS7 + halfn] * rb2 - + a[(k + 3) * n + i + NTHRDS7 + halfn] * rb3; + rb0 = b[j * n + k ]; + rb1 = b[j * n + k + 1]; + rb2 = b[j * n + k + 2]; + rb3 = b[j * n + k + 3]; + rc0 += a[ k * n + iblk * BLKROW + i ] * rb0; + rc0 += a[(k + 1) * n + iblk * BLKROW + i ] * rb1; + rc0 += a[(k + 2) * n + iblk * BLKROW + i ] * rb2; + rc0 += a[(k + 3) * n + iblk * BLKROW + i ] * rb3; + rc1 += a[ k * n + iblk * BLKROW + i + NTHRDS7 ] * rb0; + rc1 += a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 ] * rb1; + rc1 += a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 ] * rb2; + rc1 += a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 ] * rb3; + rc2 += a[ k * n + iblk * BLKROW + i + NTHRDS7 * 2] * rb0; + rc2 += a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 * 2] * rb1; + rc2 += a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 * 2] * rb2; + rc2 += a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 * 2] * rb3; + rc3 += a[ k * n + iblk * BLKROW + i + NTHRDS7 * 3] * rb0; + rc3 += a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 * 3] * rb1; + rc3 += a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 * 3] * rb2; + rc3 += a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 * 3] * rb3; } - c[j * n + i ] = rc0; - c[j * n + i + NTHRDS7 ] = rc1; - c[j * n + i + halfn] = rc2; - c[j * n + i + NTHRDS7 + halfn] = rc3; + c[j * n + iblk * BLKROW + i ] = rc0; + c[j * n + iblk * BLKROW + i + NTHRDS7 ] = rc1; + c[j * n + iblk * BLKROW + i + NTHRDS7 * 2] = rc2; + c[j * n + iblk * BLKROW + i + NTHRDS7 * 3] = rc3; } /* end i-loop */ } /* end iblk-loop */ } /* end j-loop */ @@ -275,118 +279,357 @@ for (int i = iblk * NTHRDS8; case 6: /* * - jik-loop - * - 2^7 threads per team and 2^14 teams + * - 2^7 threads per team and 2^13 teams * - collapse(3) - * - 2x j-loop unrolling by half of cols - * - 4x i-loop unrolling - * * 2x by number of threads - * * 2x by half of rows + * - 4x j-loop unrolling (stride of 1 col ) + * - 4x i-loop unrolling (stride of 2^7 rows) * - 4x k-loop unrolling + * - rb: 4x data re-use + * - ra: 4x data re-use + * - register blocking */ #pragma omp target data device(0) \ - map(to:n, halfn, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) + map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) { clock_gettime(CLOCK_REALTIME, rt + 0); -#pragma omp target teams device(0) num_teams(LTEAMSE) \ - map(to:n, halfn, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ - default(none) shared(a, b, c, n, halfn) +#pragma omp target teams device(0) num_teams(LTEAMSD) thread_limit(NTHRDS7) \ + map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ + default(none) shared(a, b, c, n) #pragma omp distribute parallel for num_threads(NTHRDS7) \ dist_schedule(static, NTHRDS7) collapse(3) \ - default(none) shared(a, b, c, n, halfn) -for (int j = 0; j < halfn; ++j) { /* 2x unrolling */ -for (int iblk = 0; iblk < n / NTHRDS9; ++iblk) { /* 4x unrolling */ -for (int i = iblk * NTHRDS8; - i < iblk * NTHRDS8 + NTHRDS7; ++i) { - /* register for c: 2x j-loop * 4x i-loop */ + default(none) shared(a, b, c, n) +for (int j = 0; j < n; j += 4) { /* 4x unrolling */ +for (int iblk = 0; iblk < n / BLKROW; ++iblk) { +for (int i = 0; i < NTHRDS7; ++i) { /* 4x unrolling */ + /* register for c: 4x j-loop * 4x i-loop */ float rc0, rc1, rc2, rc3, - rc4, rc5, rc6, rc7; - rc0 = c[ j * n + i ]; - rc1 = c[ j * n + i + NTHRDS7 ]; - rc2 = c[ j * n + i + halfn]; - rc3 = c[ j * n + i + NTHRDS7 + halfn]; - rc4 = c[(j + halfn) * n + i ]; - rc5 = c[(j + halfn) * n + i 
+ NTHRDS7 ]; - rc6 = c[(j + halfn) * n + i + halfn]; - rc7 = c[(j + halfn) * n + i + NTHRDS7 + halfn]; + rc4, rc5, rc6, rc7, + rc8, rc9, rca, rcb, + rcc, rcd, rce, rcf; + rc0 = c[ j * n + iblk * BLKROW + i ]; + rc1 = c[ j * n + iblk * BLKROW + i + NTHRDS7 ]; + rc2 = c[ j * n + iblk * BLKROW + i + NTHRDS7 * 2]; + rc3 = c[ j * n + iblk * BLKROW + i + NTHRDS7 * 3]; + rc4 = c[(j + 1) * n + iblk * BLKROW + i ]; + rc5 = c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 ]; + rc6 = c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 * 2]; + rc7 = c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 * 3]; + rc8 = c[(j + 2) * n + iblk * BLKROW + i ]; + rc9 = c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 ]; + rca = c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 * 2]; + rcb = c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 * 3]; + rcc = c[(j + 3) * n + iblk * BLKROW + i ]; + rcd = c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 ]; + rce = c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 * 2]; + rcf = c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 * 3]; for (int k = 0; k < n; k += 4) { /* 4x unrolling */ - /* register for b: 2x j-loop * 4x k-loop */ + /* register for b: 4x j-loop * 4x k-loop */ float rb0, rb1, rb2, rb3, - rb4, rb5, rb6, rb7; + rb4, rb5, rb6, rb7, + rb8, rb9, rba, rbb, + rbc, rbd, rbe, rbf; + rb0 = b[ j * n + k ]; + rb1 = b[ j * n + k + 1]; + rb2 = b[ j * n + k + 2]; + rb3 = b[ j * n + k + 3]; + rb4 = b[(j + 1) * n + k ]; + rb5 = b[(j + 1) * n + k + 1]; + rb6 = b[(j + 1) * n + k + 2]; + rb7 = b[(j + 1) * n + k + 3]; + rb8 = b[(j + 2) * n + k ]; + rb9 = b[(j + 2) * n + k + 1]; + rba = b[(j + 2) * n + k + 2]; + rbb = b[(j + 2) * n + k + 3]; + rbc = b[(j + 3) * n + k ]; + rbd = b[(j + 3) * n + k + 1]; + rbe = b[(j + 3) * n + k + 2]; + rbf = b[(j + 3) * n + k + 3]; /* register for a: 4x i-loop * 4x k-loop */ float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, raa, rab, rac, rad, rae, raf; - rb0 = b[ j * n + k ]; - rb1 = b[ j * n + k + 1]; - rb2 = b[ j * n + k + 2]; - rb3 = b[ j * n + k + 3]; - rb4 = b[(j + halfn) * n + k ]; - rb5 = b[(j + halfn) * n + k + 1]; - rb6 = b[(j + halfn) * n + k + 2]; - rb7 = b[(j + halfn) * n + k + 3]; - ra0 = a[ k * n + i ]; - ra1 = a[(k + 1) * n + i ]; - ra2 = a[(k + 2) * n + i ]; - ra3 = a[(k + 3) * n + i ]; - ra4 = a[ k * n + i + NTHRDS7 ]; - ra5 = a[(k + 1) * n + i + NTHRDS7 ]; - ra6 = a[(k + 2) * n + i + NTHRDS7 ]; - ra7 = a[(k + 3) * n + i + NTHRDS7 ]; - ra8 = a[ k * n + i + halfn]; - ra9 = a[(k + 1) * n + i + halfn]; - raa = a[(k + 2) * n + i + halfn]; - rab = a[(k + 3) * n + i + halfn]; - rac = a[ k * n + i + NTHRDS7 + halfn]; - rad = a[(k + 1) * n + i + NTHRDS7 + halfn]; - rae = a[(k + 2) * n + i + NTHRDS7 + halfn]; - raf = a[(k + 3) * n + i + NTHRDS7 + halfn]; + ra0 = a[ k * n + iblk * BLKROW + i ]; + ra1 = a[ k * n + iblk * BLKROW + i + NTHRDS7 ]; + ra2 = a[ k * n + iblk * BLKROW + i + NTHRDS7 * 2]; + ra3 = a[ k * n + iblk * BLKROW + i + NTHRDS7 * 3]; + ra4 = a[(k + 1) * n + iblk * BLKROW + i ]; + ra5 = a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 ]; + ra6 = a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 * 2]; + ra7 = a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 * 3]; + ra8 = a[(k + 2) * n + iblk * BLKROW + i ]; + ra9 = a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 ]; + raa = a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 * 2]; + rab = a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 * 3]; + rac = a[(k + 3) * n + iblk * BLKROW + i ]; + rad = a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 ]; + rae = a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 * 2]; + raf = a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 * 3]; /* 
* register blocking */ - rc0+= ra0 * rb0 - + ra1 * rb1 - + ra2 * rb2 - + ra3 * rb3; - rc1+= ra4 * rb0 - + ra5 * rb1 - + ra6 * rb2 - + ra7 * rb3; - rc2+= ra8 * rb0 - + ra9 * rb1 - + raa * rb2 - + rab * rb3; - rc3+= rac * rb0 - + rad * rb1 - + rae * rb2 - + raf * rb3; - rc4+= ra0 * rb4 - + ra1 * rb5 - + ra2 * rb6 - + ra3 * rb7; - rc5+= ra4 * rb4 - + ra5 * rb5 - + ra6 * rb6 - + ra7 * rb7; - rc6+= ra8 * rb4 - + ra9 * rb5 - + raa * rb6 - + rab * rb7; - rc7+= rac * rb4 - + rad * rb5 - + rae * rb6 - + raf * rb7; + // col 1 of c: + rc0 += ra0 * rb0; + rc0 += ra4 * rb1; + rc0 += ra8 * rb2; + rc0 += rac * rb3; + rc1 += ra1 * rb0; + rc1 += ra5 * rb1; + rc1 += ra9 * rb2; + rc1 += rad * rb3; + rc2 += ra2 * rb0; + rc2 += ra6 * rb1; + rc2 += raa * rb2; + rc2 += rae * rb3; + rc3 += ra3 * rb0; + rc3 += ra7 * rb1; + rc3 += rab * rb2; + rc3 += raf * rb3; + // col 2 of c: + rc4 += ra0 * rb4; + rc4 += ra4 * rb5; + rc4 += ra8 * rb6; + rc4 += rac * rb7; + rc5 += ra1 * rb4; + rc5 += ra5 * rb5; + rc5 += ra9 * rb6; + rc5 += rad * rb7; + rc6 += ra2 * rb4; + rc6 += ra6 * rb5; + rc6 += raa * rb6; + rc6 += rae * rb7; + rc7 += ra3 * rb4; + rc7 += ra7 * rb5; + rc7 += rab * rb6; + rc7 += raf * rb7; + // col 3 of c: + rc8 += ra0 * rb8; + rc8 += ra4 * rb9; + rc8 += ra8 * rba; + rc8 += rac * rbb; + rc9 += ra1 * rb8; + rc9 += ra5 * rb9; + rc9 += ra9 * rba; + rc9 += rad * rbb; + rca += ra2 * rb8; + rca += ra6 * rb9; + rca += raa * rba; + rca += rae * rbb; + rcb += ra3 * rb8; + rcb += ra7 * rb9; + rcb += rab * rba; + rcb += raf * rbb; + // col 4 of c: + rcc += ra0 * rbc; + rcc += ra4 * rbd; + rcc += ra8 * rbe; + rcc += rac * rbf; + rcd += ra1 * rbc; + rcd += ra5 * rbd; + rcd += ra9 * rbe; + rcd += rad * rbf; + rce += ra2 * rbc; + rce += ra6 * rbd; + rce += raa * rbe; + rce += rae * rbf; + rcf += ra3 * rbc; + rcf += ra7 * rbd; + rcf += rab * rbe; + rcf += raf * rbf; } - c[ j * n + i ] = rc0; - c[ j * n + i + NTHRDS7 ] = rc1; - c[ j * n + i + halfn] = rc2; - c[ j * n + i + NTHRDS7 + halfn] = rc3; - c[(j + halfn) * n + i ] = rc4; - c[(j + halfn) * n + i + NTHRDS7 ] = rc5; - c[(j + halfn) * n + i + halfn] = rc6; - c[(j + halfn) * n + i + NTHRDS7 + halfn] = rc7; + c[ j * n + iblk * BLKROW + i ] = rc0; + c[ j * n + iblk * BLKROW + i + NTHRDS7 ] = rc1; + c[ j * n + iblk * BLKROW + i + NTHRDS7 * 2] = rc2; + c[ j * n + iblk * BLKROW + i + NTHRDS7 * 3] = rc3; + c[(j + 1) * n + iblk * BLKROW + i ] = rc4; + c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 ] = rc5; + c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 * 2] = rc6; + c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 * 3] = rc7; + c[(j + 2) * n + iblk * BLKROW + i ] = rc8; + c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 ] = rc9; + c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 * 2] = rca; + c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 * 3] = rcb; + c[(j + 3) * n + iblk * BLKROW + i ] = rcc; + c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 ] = rcd; + c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 * 2] = rce; + c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 * 3] = rcf; } /* end i-loop */ } /* end iblk-loop */ +} /* end j-loop */ + clock_gettime(CLOCK_REALTIME, rt + 1); +} + break; + case 7: +/* + * - based on case 2 + * - jik-loop + * - 2^8 threads per team and 2^16 teams + * - collapse(2) + * - no race condition + */ +#pragma omp target data device(0) \ + map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) +{ + clock_gettime(CLOCK_REALTIME, rt + 0); +#pragma omp target teams device(0) num_teams(LTEAMSG) thread_limit(NTHRDS8) \ + map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ + default(none) 
shared(a, b, c, n) +#pragma omp distribute parallel for num_threads(NTHRDS8) \ + dist_schedule(static, NTHRDS8) collapse(2) \ + default(none) shared(a, b, c, n) +for (int j = 0; j < n; ++j) { /* parallel */ +for (int i = 0; i < n; ++i) { /* parallel */ + float rc; + rc = c[j * n + i]; + for (int k = 0; k < n; ++k) { /* sequential */ + rc += a[k * n + i] * b[j * n + k]; + } + c[j * n + i] = rc; +} /* end i-loop */ +} /* end j-loop */ + clock_gettime(CLOCK_REALTIME, rt + 1); +} + break; + case 8: +/* + * - based on case 7 + * - jik-loop + * - 2^8 threads per team and 2^16 teams + * - collapse(2) + * - GPU shared memory for data re-use + * - 16x k-loop unrolling + */ +#pragma omp target data device(0) \ + map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) +{ + clock_gettime(CLOCK_REALTIME, rt + 0); +#pragma omp target teams device(0) num_teams(LTEAMSG) thread_limit(NTHRDS8) \ + map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ + default(none) shared(a, b, c, n) +{ + // GPU shared memory for each team + /* + * I have tested the bank conflict-free version, but it gives worse results, + * e.g. ~ 290 GFLOPS (20 GFLOPS less than the bank conflict version). + * I cannot explain ... + * + float ashm[BLKDIM][BLKDIM + 1], + bshm[BLKDIM][BLKDIM + 1]; + */ + float ashm[BLKDIM][BLKDIM], + bshm[BLKDIM][BLKDIM]; +#pragma omp distribute dist_schedule(static, 1) collapse(2) +for (int j = 0; j < n / BLKDIM; ++j) { +for (int i = 0; i < n / BLKDIM; ++i) { +#pragma omp parallel num_threads(NTHRDS8) \ + default(none) shared(a, b, c, n, ashm, bshm, i, j) +{ + /* + * The code here resembles CUDA. + */ + int td = omp_get_thread_num(); + // de-linearize the thread number + int it, // thread number along the row + jt; // thread number along the col + it = td % BLKDIM; + jt = td / BLKDIM; + int ib, // row at the beginning of block + jb; // col at the beginning of block + ib = i * BLKDIM; + jb = j * BLKDIM; + int ii, // the real row + jj; // the real col + ii = ib + it; + jj = jb + jt; + float rc = c[jj * n + ii]; // c in register + /* + * the k blocks + */ + for (int k = 0; k < n / BLKDIM; ++k) { + // read the global data to shared memory + ashm[jt][it] = a[(k * 16 + jt) * n + ii]; + bshm[jt][it] = b[jj * n + (k * 16 + it)]; +#pragma omp barrier + // shared memory blocking and 16x k-loop unrolling + rc += ashm[0x0][it] * bshm[jt][0x0]; + rc += ashm[0x1][it] * bshm[jt][0x1]; + rc += ashm[0x2][it] * bshm[jt][0x2]; + rc += ashm[0x3][it] * bshm[jt][0x3]; + rc += ashm[0x4][it] * bshm[jt][0x4]; + rc += ashm[0x5][it] * bshm[jt][0x5]; + rc += ashm[0x6][it] * bshm[jt][0x6]; + rc += ashm[0x7][it] * bshm[jt][0x7]; + rc += ashm[0x8][it] * bshm[jt][0x8]; + rc += ashm[0x9][it] * bshm[jt][0x9]; + rc += ashm[0xa][it] * bshm[jt][0xa]; + rc += ashm[0xb][it] * bshm[jt][0xb]; + rc += ashm[0xc][it] * bshm[jt][0xc]; + rc += ashm[0xd][it] * bshm[jt][0xd]; + rc += ashm[0xe][it] * bshm[jt][0xe]; + rc += ashm[0xf][it] * bshm[jt][0xf]; +#pragma omp barrier + } /* end k-loop */ + c[jj * n + ii] =rc; +} /* end omp parallel */ +} /* end i-loop */ +} /* end j-loop */ +} /* end omp target teams */ + clock_gettime(CLOCK_REALTIME, rt + 1); +} + break; + case 9: +/* + * - based on case 5 + * - only diffs are listed here: + * * collapse(2) + * * 4x i-loop unrolling (stride of n/4 rows) + */ +#pragma omp target data device(0) \ + map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) +{ + clock_gettime(CLOCK_REALTIME, rt + 0); +#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS7) \ + map(to:n, a[0:n * n], b[0:n * 
n]) map(tofrom:c[0:n * n]) \ + default(none) shared(a, b, c, n) +#pragma omp distribute parallel for num_threads(NTHRDS7) \ + dist_schedule(static, NTHRDS7) collapse(2) \ + default(none) shared(a, b, c, n) +for (int j = 0; j < n; ++j) { +for (int i = 0; i < NPITCH; ++i) { /* 4x unrolling */ + float rc0, rc1, rc2, rc3; + rc0 = c[j * n + i ]; + rc1 = c[j * n + i + NPITCH ]; + rc2 = c[j * n + i + NPITCH * 2]; + rc3 = c[j * n + i + NPITCH * 3]; + for (int k = 0; k < n; k += 4) { /* 4x unrolling */ + /* register for b: 4x k-loop */ + float rb0, rb1, rb2, rb3; + rb0 = b[j * n + k ]; + rb1 = b[j * n + k + 1]; + rb2 = b[j * n + k + 2]; + rb3 = b[j * n + k + 3]; + rc0 += a[ k * n + i ] * rb0; + rc0 += a[(k + 1) * n + i ] * rb1; + rc0 += a[(k + 2) * n + i ] * rb2; + rc0 += a[(k + 3) * n + i ] * rb3; + rc1 += a[ k * n + i + NPITCH ] * rb0; + rc1 += a[(k + 1) * n + i + NPITCH ] * rb1; + rc1 += a[(k + 2) * n + i + NPITCH ] * rb2; + rc1 += a[(k + 3) * n + i + NPITCH ] * rb3; + rc2 += a[ k * n + i + NPITCH * 2] * rb0; + rc2 += a[(k + 1) * n + i + NPITCH * 2] * rb1; + rc2 += a[(k + 2) * n + i + NPITCH * 2] * rb2; + rc2 += a[(k + 3) * n + i + NPITCH * 2] * rb3; + rc3 += a[ k * n + i + NPITCH * 3] * rb0; + rc3 += a[(k + 1) * n + i + NPITCH * 3] * rb1; + rc3 += a[(k + 2) * n + i + NPITCH * 3] * rb2; + rc3 += a[(k + 3) * n + i + NPITCH * 3] * rb3; + } + c[j * n + i ] = rc0; + c[j * n + i + NPITCH ] = rc1; + c[j * n + i + NPITCH * 2] = rc2; + c[j * n + i + NPITCH * 3] = rc3; +} /* end i-loop */ } /* end j-loop */ clock_gettime(CLOCK_REALTIME, rt + 1); } @@ -445,7 +688,3 @@ for (int i = iblk * NTHRDS8; wtcalc += (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); } } - -#ifdef __cplusplus -} -#endif diff --git a/10_matMul/tests/matMul_real_00.sh.5323229.out b/10_matMul/tests/matMul_real_00.sh.5323229.out deleted file mode 100644 index d61e8b7..0000000 --- a/10_matMul/tests/matMul_real_00.sh.5323229.out +++ /dev/null @@ -1,12 +0,0 @@ -hallo from gpu013 -matrix dim: 4096 x 4096 -time averaged over 16 loops -matMulAB (0) : 24.9 GFLOPS 25.4 GFLOPS maxabserr = 0.0 -matMulAB (1) : 10.0 GFLOPS 10.0 GFLOPS maxabserr = 0.0 -matMulAB (2) : 193.6 GFLOPS 229.2 GFLOPS maxabserr = 0.0 -matMulAB (3) : 5.0 GFLOPS 5.1 GFLOPS maxabserr = 1017.8 -matMulAB (4) : 182.2 GFLOPS 213.7 GFLOPS maxabserr = 0.0 -matMulAB (5) : 370.7 GFLOPS 529.6 GFLOPS maxabserr = 0.0 -matMulAB (6) : 540.0 GFLOPS 962.0 GFLOPS maxabserr = 0.0 -matMulAB (7) : 1028.2 GFLOPS 10463.3 GFLOPS maxabserr = 0.0 -matMulAB (8) : 1007.7 GFLOPS 10468.4 GFLOPS maxabserr = 0.0 diff --git a/10_matMul/tests/matMul_real_00.sh.5422034.out b/10_matMul/tests/matMul_real_00.sh.5422034.out new file mode 100644 index 0000000..7f94772 --- /dev/null +++ b/10_matMul/tests/matMul_real_00.sh.5422034.out @@ -0,0 +1,14 @@ +hallo from gpu011 +matrix dim: 4096 x 4096 +time averaged over 16 loops +matMulAB (0) : 24.7 GFLOPS 25.3 GFLOPS maxabserr = 0.0 +matMulAB (1) : 9.8 GFLOPS 9.9 GFLOPS maxabserr = 0.0 +matMulAB (2) : 179.0 GFLOPS 220.7 GFLOPS maxabserr = 0.0 +matMulAB (3) : 5.1 GFLOPS 5.1 GFLOPS maxabserr = 1028.8 +matMulAB (4) : 171.2 GFLOPS 209.0 GFLOPS maxabserr = 0.0 +matMulAB (5) : 342.0 GFLOPS 533.1 GFLOPS maxabserr = 0.0 +matMulAB (6) : 634.9 GFLOPS 1717.8 GFLOPS maxabserr = 0.0 +matMulAB (7) : 217.8 GFLOPS 282.3 GFLOPS maxabserr = 0.0 +matMulAB (8) : 236.0 GFLOPS 313.6 GFLOPS maxabserr = 0.0 +matMulAB (9) : 341.6 GFLOPS 534.8 GFLOPS maxabserr = 0.0 +matMulAB (10) : 988.8 GFLOPS 10645.0 GFLOPS maxabserr = 0.0
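
Appendix (not part of the patch): a minimal, self-contained sketch of the shared-memory blocking pattern that case 8 of 10_matMul/src/matMulAB.c describes. It is illustrative only, not a drop-in replacement for the patch code: the function name matMulTiledSketch and the TILE macro are invented here, column-major storage and n divisible by TILE are assumed, correctness relies on each team's parallel region actually getting TILE*TILE threads, and whether the team-local arrays really land in GPU shared memory is up to the compiler.

#include <omp.h>

#define TILE 16

/* c := c + a * b for column-major n x n matrices, n divisible by TILE */
void matMulTiledSketch(const float *a, const float *b, float *c, int n)
{
#pragma omp target teams thread_limit(TILE * TILE) \
  map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n])
  {
    /* team-local tiles; a GPU compiler is expected (not guaranteed)
     * to place these in shared memory, as in case 8 of the patch */
    float ashm[TILE][TILE],
          bshm[TILE][TILE];
#pragma omp distribute dist_schedule(static, 1) collapse(2)
    for (int jb = 0; jb < n / TILE; ++jb) {
    for (int ib = 0; ib < n / TILE; ++ib) {
      /* one team computes one TILE x TILE block of c;
       * assumes the parallel region runs with TILE * TILE threads */
#pragma omp parallel num_threads(TILE * TILE) \
  default(none) shared(a, b, c, n, ashm, bshm, ib, jb)
      {
        int td = omp_get_thread_num();
        int it = td % TILE,          /* row index within the block */
            jt = td / TILE;          /* col index within the block */
        int ii = ib * TILE + it,     /* global row */
            jj = jb * TILE + jt;     /* global col */
        float rc = c[jj * n + ii];   /* accumulate c(ii,jj) in a register */
        for (int kb = 0; kb < n / TILE; ++kb) {
          /* each thread stages one element of a tile of a and of b */
          ashm[jt][it] = a[(kb * TILE + jt) * n + ii];
          bshm[jt][it] = b[jj * n + kb * TILE + it];
#pragma omp barrier
          for (int k = 0; k < TILE; ++k)
            rc += ashm[k][it] * bshm[jt][k];
#pragma omp barrier
        }
        c[jj * n + ii] = rc;
      } /* end parallel */
    } /* end ib-loop */
    } /* end jb-loop */
  } /* end target teams */
}

The point of the pattern is data reuse: each TILE x TILE tile of a and b is read from global memory once per team and then reused TILE times from fast team-local storage, which is why case 8 outperforms the purely register-blocked variants on memory traffic. The patch itself notes that the padded, bank-conflict-free layout ([BLKDIM][BLKDIM + 1]) was tried and performed slightly worse on the tested hardware.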