
Commit

improved 10_matMul
xwuupb committed Apr 15, 2020
1 parent 8ada85b commit a554810
Showing 11 changed files with 549 additions and 250 deletions.
70 changes: 42 additions & 28 deletions 08_distThreads/src/gpuThreads.c
@@ -4,6 +4,8 @@
*
* This source file contains function definition for organizing GPU threads.
*
* thread_limit for the teams construct is omitted for clarity.
*
* @author Xin Wu (PC²)
* @date 12.03.2020
* @copyright CC BY-SA 2.0
@@ -304,6 +306,17 @@ void gpuThreads(int i)
* 5. Nested loop with collapse(3).
* 6. It features coalesced GPU memory access and good performance.
* 7. +10 to each unrolled thread is used to label the 2x irow-loop unrolling.
*
* Caveat: the collapsed iterations execute in an unspecified order, so the
* code must not rely on any particular ordering, especially for the
* innermost loop.
*
* OpenMP API Specification: Version 5.0 November 2018
*
* https://www.openmp.org/spec-html/5.0/openmpsu44.html
*
* If a collapse clause is specified with a parameter value greater than 1, then
* the iterations of the associated loops to which the clause applies are
* collapsed into one larger iteration space with *unspecified ordering*.
*
*/
ncol = 6;
nrow = 12;
@@ -320,16 +333,15 @@
default(none) shared(ncol, nrow, wblk, lteams, nthrds, league)
for (int icol = 0; icol < ncol; ++icol) {
for (int iblk = 0; iblk < nrow / wblk; ++iblk) {
for (int irow = iblk * wblk;
irow < iblk * wblk + nthrds; ++irow) {
league[icol * nrow + irow ].itd = omp_get_thread_num();
league[icol * nrow + irow ].ntd = omp_get_num_threads();
league[icol * nrow + irow ].itm = omp_get_team_num();
league[icol * nrow + irow ].ltm = omp_get_num_teams();
league[icol * nrow + irow + nthrds].itd = omp_get_thread_num() + 10;
league[icol * nrow + irow + nthrds].ntd = omp_get_num_threads();
league[icol * nrow + irow + nthrds].itm = omp_get_team_num();
league[icol * nrow + irow + nthrds].ltm = omp_get_num_teams();
for (int irow = 0; irow < nthrds; ++irow) {
league[icol * nrow + iblk * wblk + irow ].itd = omp_get_thread_num();
league[icol * nrow + iblk * wblk + irow ].ntd = omp_get_num_threads();
league[icol * nrow + iblk * wblk + irow ].itm = omp_get_team_num();
league[icol * nrow + iblk * wblk + irow ].ltm = omp_get_num_teams();
league[icol * nrow + iblk * wblk + irow + nthrds].itd = omp_get_thread_num() + 10;
league[icol * nrow + iblk * wblk + irow + nthrds].ntd = omp_get_num_threads();
league[icol * nrow + iblk * wblk + irow + nthrds].itm = omp_get_team_num();
league[icol * nrow + iblk * wblk + irow + nthrds].ltm = omp_get_num_teams();
}
}
}
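
The combined construct enclosing this loop nest is collapsed out of the hunk above. The following self-contained sketch (names, sizes, and clauses are illustrative, not copied from gpuThreads.c) shows the same rectangular collapse(3) pattern: no inner loop bound depends on an outer loop variable, and each iteration derives its array index solely from (icol, iblk, irow), so the unspecified ordering of the collapsed iteration space cannot change the result.

/* sketch_collapse3.c: illustrative stand-in for the pattern above */
#include <stdio.h>
#include <omp.h>

int main(void)
{
  enum { NCOL = 6, NROW = 12, WBLK = 4, NTHRDS = 2 };
  int itd[NCOL * NROW] = {0};

  /* rectangular loop nest: the bounds of iblk and irow are loop invariant */
  #pragma omp target teams distribute parallel for collapse(3) \
          map(tofrom: itd[0:NCOL * NROW])
  for (int icol = 0; icol < NCOL; ++icol)
    for (int iblk = 0; iblk < NROW / WBLK; ++iblk)
      for (int irow = 0; irow < NTHRDS; ++irow)
        /* index computed from the loop variables only */
        itd[icol * NROW + iblk * WBLK + irow] = omp_get_thread_num();

  printf("itd[0] = %d\n", itd[0]);
  return 0;
}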
@@ -344,6 +356,9 @@
* 5. Nested loop with collapse(3).
* 6. +10 to each unrolled team is used to label the 2x icol-loop unrolling.
* 7. +10 to each unrolled thread is used to label the 2x irow-loop unrolling.
*
* Assigning more work to each thread (here via 2x unrolling of both the
* icol- and the irow-loop) is one approach to achieving high performance.
*
*/
ncol = 6;
nrow = 12;
@@ -360,24 +375,23 @@
default(none) shared(ncol, nrow, wblk, lteams, nthrds, league)
for (int icol = 0; icol < ncol; icol += 2) {
for (int iblk = 0; iblk < nrow / wblk; ++iblk) {
for (int irow = iblk * wblk;
irow < iblk * wblk + nthrds; ++irow) {
league[ icol * nrow + irow ].itd = omp_get_thread_num();
league[ icol * nrow + irow ].ntd = omp_get_num_threads();
league[ icol * nrow + irow ].itm = omp_get_team_num();
league[ icol * nrow + irow ].ltm = omp_get_num_teams();
league[ icol * nrow + irow + nthrds].itd = omp_get_thread_num() + 10;
league[ icol * nrow + irow + nthrds].ntd = omp_get_num_threads();
league[ icol * nrow + irow + nthrds].itm = omp_get_team_num();
league[ icol * nrow + irow + nthrds].ltm = omp_get_num_teams();
league[(icol + 1) * nrow + irow ].itd = omp_get_thread_num();
league[(icol + 1) * nrow + irow ].ntd = omp_get_num_threads();
league[(icol + 1) * nrow + irow ].itm = omp_get_team_num() + 10;
league[(icol + 1) * nrow + irow ].ltm = omp_get_num_teams();
league[(icol + 1) * nrow + irow + nthrds].itd = omp_get_thread_num() + 10;
league[(icol + 1) * nrow + irow + nthrds].ntd = omp_get_num_threads();
league[(icol + 1) * nrow + irow + nthrds].itm = omp_get_team_num() + 10;
league[(icol + 1) * nrow + irow + nthrds].ltm = omp_get_num_teams();
for (int irow = 0; irow < nthrds; ++irow) {
league[ icol * nrow + iblk * wblk + irow ].itd = omp_get_thread_num();
league[ icol * nrow + iblk * wblk + irow ].ntd = omp_get_num_threads();
league[ icol * nrow + iblk * wblk + irow ].itm = omp_get_team_num();
league[ icol * nrow + iblk * wblk + irow ].ltm = omp_get_num_teams();
league[ icol * nrow + iblk * wblk + irow + nthrds].itd = omp_get_thread_num() + 10;
league[ icol * nrow + iblk * wblk + irow + nthrds].ntd = omp_get_num_threads();
league[ icol * nrow + iblk * wblk + irow + nthrds].itm = omp_get_team_num();
league[ icol * nrow + iblk * wblk + irow + nthrds].ltm = omp_get_num_teams();
league[(icol + 1) * nrow + iblk * wblk + irow ].itd = omp_get_thread_num();
league[(icol + 1) * nrow + iblk * wblk + irow ].ntd = omp_get_num_threads();
league[(icol + 1) * nrow + iblk * wblk + irow ].itm = omp_get_team_num() + 10;
league[(icol + 1) * nrow + iblk * wblk + irow ].ltm = omp_get_num_teams();
league[(icol + 1) * nrow + iblk * wblk + irow + nthrds].itd = omp_get_thread_num() + 10;
league[(icol + 1) * nrow + iblk * wblk + irow + nthrds].ntd = omp_get_num_threads();
league[(icol + 1) * nrow + iblk * wblk + irow + nthrds].itm = omp_get_team_num() + 10;
league[(icol + 1) * nrow + iblk * wblk + irow + nthrds].ltm = omp_get_num_teams();
}
}
}
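
The "more work for each thread" idea is easiest to see on a smaller kernel. A deliberately simplified sketch (the saxpy2 helper is hypothetical and not part of this repository): each thread updates two elements per iteration, so only half as many iterations are needed while both accesses remain coalesced.

#include <omp.h>

/* sketch only: 2x unrolled vector update, assuming n is even */
void saxpy2(int n, float a, const float *x, float *y)
{
  #pragma omp target teams distribute parallel for \
          map(to: x[0:n]) map(tofrom: y[0:n])
  for (int i = 0; i < n / 2; ++i) {
    y[i        ] += a * x[i        ];  /* first half  */
    y[i + n / 2] += a * x[i + n / 2];  /* second half */
  }
}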
51 changes: 31 additions & 20 deletions 09_matAdd/src/matAddAB.c
@@ -163,10 +163,11 @@ for (int i = 0; i < n; ++i) {
default(none) shared(a, b, n)
for (int j = 0; j < n; ++j) {
for (int iblk = 0; iblk < n / NTHRDS9; ++iblk) {
for (int i = iblk * NTHRDS9;
i < iblk * NTHRDS9 + NTHRDS8; ++i) {
a[j * n + i ] += b[j * n + i ];
a[j * n + i + NTHRDS8] += b[j * n + i + NTHRDS8];
for (int i = 0; i < NTHRDS8; ++i) {
a[j * n + iblk * NTHRDS9 + i ] +=
b[j * n + iblk * NTHRDS9 + i ];
a[j * n + iblk * NTHRDS9 + i + NTHRDS8] +=
b[j * n + iblk * NTHRDS9 + i + NTHRDS8];
} /* end i-loop */
} /* end iblk-loop */
} /* end j-loop */
@@ -194,12 +195,15 @@ for (int i = iblk * NTHRDS9;
default(none) shared(a, b, n, halfn)
for (int j = 0; j < n; ++j) {
for (int iblk = 0; iblk < n / NTHRDS9; ++iblk) {
for (int i = iblk * NTHRDS8;
i < iblk * NTHRDS8 + NTHRDS7; ++i) {
a[j * n + i ] += b[j * n + i ];
a[j * n + i + NTHRDS7] += b[j * n + i + NTHRDS7];
a[j * n + i + halfn ] += b[j * n + i + halfn ];
a[j * n + i + halfn + NTHRDS7] += b[j * n + i + halfn + NTHRDS7];
for (int i = 0; i < NTHRDS7; ++i) {
a[j * n + iblk * NTHRDS8 + i ] +=
b[j * n + iblk * NTHRDS8 + i ];
a[j * n + iblk * NTHRDS8 + i + NTHRDS7] +=
b[j * n + iblk * NTHRDS8 + i + NTHRDS7];
a[j * n + iblk * NTHRDS8 + i + halfn ] +=
b[j * n + iblk * NTHRDS8 + i + halfn ];
a[j * n + iblk * NTHRDS8 + i + halfn + NTHRDS7] +=
b[j * n + iblk * NTHRDS8 + i + halfn + NTHRDS7];
} /* end i-loop */
} /* end iblk-loop */
} /* end j-loop */
@@ -228,16 +232,23 @@ for (int i = iblk * NTHRDS8;
default(none) shared(a, b, n, halfn)
for (int j = 0; j < halfn; ++j) {
for (int iblk = 0; iblk < n / NTHRDS9; ++iblk) {
for (int i = iblk * NTHRDS8;
i < iblk * NTHRDS8 + NTHRDS7; ++i) {
a[ j * n + i ] += b[ j * n + i ];
a[ j * n + i + NTHRDS7] += b[ j * n + i + NTHRDS7];
a[ j * n + i + halfn ] += b[ j * n + i + halfn ];
a[ j * n + i + halfn + NTHRDS7] += b[ j * n + i + halfn + NTHRDS7];
a[(j + halfn) * n + i ] += b[(j + halfn) * n + i ];
a[(j + halfn) * n + i + NTHRDS7] += b[(j + halfn) * n + i + NTHRDS7];
a[(j + halfn) * n + i + halfn ] += b[(j + halfn) * n + i + halfn ];
a[(j + halfn) * n + i + halfn + NTHRDS7] += b[(j + halfn) * n + i + halfn + NTHRDS7];
for (int i = 0; i < NTHRDS7; ++i) {
a[ j * n + iblk * NTHRDS8 + i ] +=
b[ j * n + iblk * NTHRDS8 + i ];
a[ j * n + iblk * NTHRDS8 + i + NTHRDS7] +=
b[ j * n + iblk * NTHRDS8 + i + NTHRDS7];
a[ j * n + iblk * NTHRDS8 + i + halfn ] +=
b[ j * n + iblk * NTHRDS8 + i + halfn ];
a[ j * n + iblk * NTHRDS8 + i + halfn + NTHRDS7] +=
b[ j * n + iblk * NTHRDS8 + i + halfn + NTHRDS7];
a[(j + halfn) * n + iblk * NTHRDS8 + i ] +=
b[(j + halfn) * n + iblk * NTHRDS8 + i ];
a[(j + halfn) * n + iblk * NTHRDS8 + i + NTHRDS7] +=
b[(j + halfn) * n + iblk * NTHRDS8 + i + NTHRDS7];
a[(j + halfn) * n + iblk * NTHRDS8 + i + halfn ] +=
b[(j + halfn) * n + iblk * NTHRDS8 + i + halfn ];
a[(j + halfn) * n + iblk * NTHRDS8 + i + halfn + NTHRDS7] +=
b[(j + halfn) * n + iblk * NTHRDS8 + i + halfn + NTHRDS7];
} /* end i-loop */
} /* end iblk-loop */
} /* end j-loop */
11 changes: 0 additions & 11 deletions 09_matAdd/tests/matAdd_real_00.sh.5329174.out

This file was deleted.

11 changes: 11 additions & 0 deletions 09_matAdd/tests/matAdd_real_00.sh.5422037.out
@@ -0,0 +1,11 @@
hallo from gpu011
matrix dim: 4096 x 4096
time averaged over 64 loops
matAddAB (0) : 2.9 GB/s 94.2 GB/s maxabserr = 0.0
matAddAB (1) : 2.8 GB/s 36.3 GB/s maxabserr = 0.0
matAddAB (2) : 2.9 GB/s 49.7 GB/s maxabserr = 0.0
matAddAB (3) : 3.0 GB/s 193.5 GB/s maxabserr = 0.0
matAddAB (4) : 3.2 GB/s 194.4 GB/s maxabserr = 0.0
matAddAB (5) : 3.3 GB/s 197.9 GB/s maxabserr = 0.0
matAddAB (6) : 3.3 GB/s 195.7 GB/s maxabserr = 0.0
matAddAB (7) : 3.1 GB/s 201.1 GB/s maxabserr = 0.0
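
One plausible reading of the GB/s columns (the exact convention used by the benchmark is an assumption here, not taken from the source): matAddAB touches three n x n float matrices per call (read a, read b, write a), so the effective bandwidth is that data volume divided by the averaged runtime.

/* sketch only: effective-bandwidth arithmetic under the assumption above */
#include <stdio.h>

int main(void)
{
  const double n     = 4096.0;
  const double bytes = 3.0 * n * n * sizeof(float); /* read a, read b, write a */
  const double t     = 1.0e-3;                      /* placeholder runtime [s] */
  printf("%.1f GB/s\n", bytes / t / 1.0e9);
  return 0;
}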
23 changes: 17 additions & 6 deletions 10_matMul/README.md
@@ -12,6 +12,8 @@ numerical results are also verified.

* Column-major storage is assumed throughout the entire code!

* `i` and `j` are indices for row and column, respectively.

* For testing, the dimensions of all matrices are assumed to be 4096 x 4096.

* The following table only summarizes the most important points. For more
@@ -29,16 +31,25 @@ numerical results are also verified.
| 4 | jik-loop, 2^9 threads * 2^f teams, collapse(2), |
| | 4x k-loop unrolling |
| 5 | jik-loop, 2^7 threads * 2^f teams, collapse(3), |
| | 4x i-loop unrolling (2x + 2x), |
| | 4x i-loop unrolling (stride of 2^7 rows), |
| | 4x k-loop unrolling, |
| | rb: 4x data reuse |
| 6 | jik-loop, 2^7 threads * 2^e teams, collapse(3), |
| | 2x j-loop unrolling, |
| | 4x i-loop unrolling (2x + 2x), |
| 6 | jik-loop, 2^7 threads * 2^d teams, collapse(3), |
| | 4x j-loop unrolling (stride of 1 col ), |
| | 4x i-loop unrolling (stride of 2^7 rows), |
| | 4x k-loop unrolling, |
| | ra: 4x data reuse, |
| | rb: 4x data reuse, |
| | register blocking |
| 7 | based on (2), jik-loop, 2^8 threads * 2^g teams, collapse(2) |
| 8 | based on (7), jik-loop, 2^8 threads * 2^g teams, collapse(2), |
| | GPU shared memory for data re-use, 16x k-loop unrolling, |
| | shared memory blocking |
| 9 | based on (5), jik-loop, 2^7 threads * 2^f teams, collapse(2), |
| | 4x i-loop unrolling (stride of n/4 rows), |
| | 4x k-loop unrolling, |
| | ra: 2x data reuse, |
| | rb: 4x data reuse |
| 7 | cublasSgemm in CUBLAS |
| 10 | cublasSgemm in CUBLAS |
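
As a rough sketch of the baseline jik-loop entries in the table (the function name is hypothetical; see matMulAB.c for the actual implementations), using the column-major indexing stated above:

```c
/* sketch only: jik-loop, collapse(2) over (j, i), column-major storage */
void matmul_jik(int n, const float *a, const float *b, float *c)
{
  #pragma omp target teams distribute parallel for collapse(2) \
          map(to: a[0:n*n], b[0:n*n]) map(tofrom: c[0:n*n])
  for (int j = 0; j < n; ++j) {
    for (int i = 0; i < n; ++i) {
      float sum = 0.0f;
      for (int k = 0; k < n; ++k) {
        sum += a[k * n + i] * b[j * n + k]; /* C(i,j) += A(i,k) * B(k,j) */
      }
      c[j * n + i] = sum;
    }
  }
}
```

Consecutive values of `i` map to consecutive threads, so the accesses to `a` and `c` are coalesced; the unrolling and blocking variants in the table refine this baseline.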

# Build

4 changes: 2 additions & 2 deletions 10_matMul/configure.ac
@@ -39,9 +39,9 @@ LDFLAGS+="-L${CUDALIB} -L${MKLLIB}"
#
AC_PROG_CC([clang gcc])
AS_IF([test "${CC}" = gcc],
[CFLAGS="-Wall -fopenmp -foffload=nvptx-none $CFLAGS"])
[CFLAGS="-Wall -O2 -fopenmp -foffload=nvptx-none $CFLAGS"])
AS_IF([test "${CC}" = clang],
[CFLAGS="-Wall -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda \
[CFLAGS="-Wall -Werror -O2 -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
-Xopenmp-target -march=sm_61 $CFLAGS"])
##############################################################################80
#
21 changes: 15 additions & 6 deletions 10_matMul/docs/UserManual.md
@@ -29,16 +29,25 @@ numerical results are also verified.
| 4 | jik-loop, 2^9 threads * 2^f teams, collapse(2), |
| | 4x k-loop unrolling |
| 5 | jik-loop, 2^7 threads * 2^f teams, collapse(3), |
| | 4x i-loop unrolling (2x + 2x), |
| | 4x i-loop unrolling (stride of 2^7 rows), |
| | 4x k-loop unrolling, |
| | rb: 4x data reuse |
| 6 | jik-loop, 2^7 threads * 2^e teams, collapse(3), |
| | 2x j-loop unrolling, |
| | 4x i-loop unrolling (2x + 2x), |
| 6 | jik-loop, 2^7 threads * 2^d teams, collapse(3), |
| | 4x j-loop unrolling (stride of 1 col ), |
| | 4x i-loop unrolling (stride of 2^7 rows), |
| | 4x k-loop unrolling, |
| | ra: 4x data reuse, |
| | rb: 4x data reuse, |
| | register blocking |
| 7 | based on (2), jik-loop, 2^8 threads * 2^g teams, collapse(2) |
| 8 | based on (7), jik-loop, 2^8 threads * 2^g teams, collapse(2), |
| | GPU shared memory for data re-use, 16x k-loop unrolling, |
| | shared memory blocking |
| 9 | based on (5), jik-loop, 2^7 threads * 2^f teams, collapse(2), |
| | 4x i-loop unrolling (stride of n/4 rows), |
| | 4x k-loop unrolling, |
| | ra: 2x data reuse, |
| | rb: 4x data reuse |
| 7 | cublasSgemm in CUBLAS |
| 10 | cublasSgemm in CUBLAS |
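
The register-blocking entry (6) can be sketched as follows; this is simplified to a 2 x 2 block per thread with unit strides (the actual code unrolls 4x, striding `i` by 2^7 rows and `j` by 1 column, as listed above), and the function name is hypothetical:

```c
/* sketch only: each thread accumulates a 2 x 2 block of c in registers */
void matmul_regblk(int n, const float *a, const float *b, float *c)
{
  /* column-major storage; assumes n is even */
  #pragma omp target teams distribute parallel for collapse(2) \
          map(to: a[0:n*n], b[0:n*n]) map(tofrom: c[0:n*n])
  for (int j = 0; j < n; j += 2) {
    for (int i = 0; i < n; i += 2) {
      float c00 = 0.0f, c10 = 0.0f, c01 = 0.0f, c11 = 0.0f;
      for (int k = 0; k < n; ++k) {
        float a0 = a[k * n + i    ];    /* ra: reused for both columns */
        float a1 = a[k * n + i + 1];
        float b0 = b[ j      * n + k];  /* rb: reused for both rows    */
        float b1 = b[(j + 1) * n + k];
        c00 += a0 * b0;  c10 += a1 * b0;
        c01 += a0 * b1;  c11 += a1 * b1;
      }
      c[ j      * n + i    ] = c00;
      c[ j      * n + i + 1] = c10;
      c[(j + 1) * n + i    ] = c01;
      c[(j + 1) * n + i + 1] = c11;
    }
  }
}
```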

# Usage

25 changes: 19 additions & 6 deletions 10_matMul/src/matMul.c
@@ -75,7 +75,7 @@ int main(int argc, char *argv[])
/*
* matMul on accl
*/
for (ial = 0; ial < 9; ++ial) {
for (ial = 0; ial < 11; ++ial) {
/*
* See matMulAB.c for details:
*
@@ -96,15 +96,28 @@
* 4x k-loop unrolling
*
* 5: jik-loop, 2^7 threads * 2^f teams, collapse(3),
* 4x i-loop unrolling (2x + 2x),
* 4x i-loop unrolling (stride of 2^7 rows),
* 4x k-loop unrolling,
* rb: 4x data reuse
*
* 6: jik-loop, 2^7 threads * 2^e teams, collapse(3),
* 2x j-loop unrolling,
* 4x i-loop unrolling (2x + 2x),
* 6: jik-loop, 2^7 threads * 2^d teams, collapse(3),
* 4x j-loop unrolling (stride of 1 col ),
* 4x i-loop unrolling (stride of 2^7 rows),
* 4x k-loop unrolling,
* rb: 4x data reuse,
* ra: 4x data reuse,
* register blocking
*
* 7: based on (2), jik-loop, 2^8 threads * 2^g teams, collapse(2)
*
* 8: based on (7), jik-loop, 2^8 threads * 2^g teams, collapse(2)
* GPU shared memory for data re-use,
* 16x k-loop unrolling,
* shared memory blocking
*
* 9: based on (5), jik-loop, 2^7 threads * 2^f teams, collapse(2),
* 4x i-loop unrolling (stride of n/4 rows),
* 4x k-loop unrolling,
* ra: 2x data reuse,
* rb: 4x data reuse
*
* otherwise: cublasSgemm in CUBLAS
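
For reference, the CUBLAS fallback named above reduces to a single cublasSgemm call. A minimal sketch, assuming n x n column-major buffers d_a, d_b, d_c already resident on the device (allocation, transfers, and error handling omitted; see matMulAB.c for the real call site):

#include <cublas_v2.h>

/* sketch only: C = A * B in single precision via CUBLAS */
void matmul_cublas(cublasHandle_t handle, int n,
                   const float *d_a, const float *d_b, float *d_c)
{
  const float alpha = 1.0f, beta = 0.0f;
  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
              n, n, n, &alpha, d_a, n, d_b, n, &beta, d_c, n);
}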
