From a5548103832bd0b6d91101462843167c817854de Mon Sep 17 00:00:00 2001 From: xwuupb Date: Wed, 15 Apr 2020 14:53:40 +0200 Subject: [PATCH] improved 10_matMul --- 08_distThreads/src/gpuThreads.c | 70 ++- 09_matAdd/src/matAddAB.c | 51 +- 09_matAdd/tests/matAdd_real_00.sh.5329174.out | 11 - 09_matAdd/tests/matAdd_real_00.sh.5422037.out | 11 + 10_matMul/README.md | 23 +- 10_matMul/configure.ac | 4 +- 10_matMul/docs/UserManual.md | 21 +- 10_matMul/src/matMul.c | 25 +- 10_matMul/src/matMulAB.c | 557 +++++++++++++----- 10_matMul/tests/matMul_real_00.sh.5323229.out | 12 - 10_matMul/tests/matMul_real_00.sh.5422034.out | 14 + 11 files changed, 549 insertions(+), 250 deletions(-) delete mode 100644 09_matAdd/tests/matAdd_real_00.sh.5329174.out create mode 100644 09_matAdd/tests/matAdd_real_00.sh.5422037.out delete mode 100644 10_matMul/tests/matMul_real_00.sh.5323229.out create mode 100644 10_matMul/tests/matMul_real_00.sh.5422034.out diff --git a/08_distThreads/src/gpuThreads.c b/08_distThreads/src/gpuThreads.c index 3e1971b..bdeefa5 100644 --- a/08_distThreads/src/gpuThreads.c +++ b/08_distThreads/src/gpuThreads.c @@ -4,6 +4,8 @@ * * This source file contains function definition for organizing GPU threads. * + * thread_limit for the teams construct is omitted for clarity. + * * @author Xin Wu (PC²) * @data 12.03.2020 * @copyright CC BY-SA 2.0 @@ -304,6 +306,17 @@ void gpuThreads(int i) * 5. Nested loop with collapse(3). * 6. It features coalesced GPU memory access and good performance. * 7. +10 to each unrolled thread is used to label the 2x irow-loop unrolling. + * + * Caveat: especially for the innermost loop + * + * OpenMP API Specification: Version 5.0 November 2018 + * + * https://www.openmp.org/spec-html/5.0/openmpsu44.html + * + * If a collapse clause is specified with a parameter value greater than 1, then + * the iterations of the associated loops to which the clause applies are + * collapsed into one larger iteration space with *unspecified ordering*. + * */ ncol = 6; nrow =12; @@ -320,16 +333,15 @@ void gpuThreads(int i) default(none) shared(ncol, nrow, wblk, lteams, nthrds, league) for (int icol = 0; icol < ncol; ++icol) { for (int iblk = 0; iblk < nrow / wblk; ++iblk) { - for (int irow = iblk * wblk; - irow < iblk * wblk + nthrds; ++irow) { - league[icol * nrow + irow ].itd = omp_get_thread_num(); - league[icol * nrow + irow ].ntd = omp_get_num_threads(); - league[icol * nrow + irow ].itm = omp_get_team_num(); - league[icol * nrow + irow ].ltm = omp_get_num_teams(); - league[icol * nrow + irow + nthrds].itd = omp_get_thread_num() + 10; - league[icol * nrow + irow + nthrds].ntd = omp_get_num_threads(); - league[icol * nrow + irow + nthrds].itm = omp_get_team_num(); - league[icol * nrow + irow + nthrds].ltm = omp_get_num_teams(); + for (int irow = 0; irow < nthrds; ++irow) { +league[icol * nrow + iblk * wblk + irow ].itd = omp_get_thread_num(); +league[icol * nrow + iblk * wblk + irow ].ntd = omp_get_num_threads(); +league[icol * nrow + iblk * wblk + irow ].itm = omp_get_team_num(); +league[icol * nrow + iblk * wblk + irow ].ltm = omp_get_num_teams(); +league[icol * nrow + iblk * wblk + irow + nthrds].itd = omp_get_thread_num() + 10; +league[icol * nrow + iblk * wblk + irow + nthrds].ntd = omp_get_num_threads(); +league[icol * nrow + iblk * wblk + irow + nthrds].itm = omp_get_team_num(); +league[icol * nrow + iblk * wblk + irow + nthrds].ltm = omp_get_num_teams(); } } } @@ -344,6 +356,9 @@ void gpuThreads(int i) * 5. Nested loop with collapse(3). * 6. 
+10 to each unrolled team is used to label the 2x icol-loop unrolling. * 7. +10 to each unrolled thread is used to label the 2x irow-loop unrolling. + * + * More work for each thread is an approach to achieve high performance. + * */ ncol = 6; nrow =12; @@ -360,24 +375,23 @@ void gpuThreads(int i) default(none) shared(ncol, nrow, wblk, lteams, nthrds, league) for (int icol = 0; icol < ncol; icol += 2) { for (int iblk = 0; iblk < nrow / wblk; ++iblk) { - for (int irow = iblk * wblk; - irow < iblk * wblk + nthrds; ++irow) { - league[ icol * nrow + irow ].itd = omp_get_thread_num(); - league[ icol * nrow + irow ].ntd = omp_get_num_threads(); - league[ icol * nrow + irow ].itm = omp_get_team_num(); - league[ icol * nrow + irow ].ltm = omp_get_num_teams(); - league[ icol * nrow + irow + nthrds].itd = omp_get_thread_num() + 10; - league[ icol * nrow + irow + nthrds].ntd = omp_get_num_threads(); - league[ icol * nrow + irow + nthrds].itm = omp_get_team_num(); - league[ icol * nrow + irow + nthrds].ltm = omp_get_num_teams(); - league[(icol + 1) * nrow + irow ].itd = omp_get_thread_num(); - league[(icol + 1) * nrow + irow ].ntd = omp_get_num_threads(); - league[(icol + 1) * nrow + irow ].itm = omp_get_team_num() + 10; - league[(icol + 1) * nrow + irow ].ltm = omp_get_num_teams(); - league[(icol + 1) * nrow + irow + nthrds].itd = omp_get_thread_num() + 10; - league[(icol + 1) * nrow + irow + nthrds].ntd = omp_get_num_threads(); - league[(icol + 1) * nrow + irow + nthrds].itm = omp_get_team_num() + 10; - league[(icol + 1) * nrow + irow + nthrds].ltm = omp_get_num_teams(); + for (int irow = 0; irow < nthrds; ++irow) { +league[ icol * nrow + iblk * wblk + irow ].itd = omp_get_thread_num(); +league[ icol * nrow + iblk * wblk + irow ].ntd = omp_get_num_threads(); +league[ icol * nrow + iblk * wblk + irow ].itm = omp_get_team_num(); +league[ icol * nrow + iblk * wblk + irow ].ltm = omp_get_num_teams(); +league[ icol * nrow + iblk * wblk + irow + nthrds].itd = omp_get_thread_num() + 10; +league[ icol * nrow + iblk * wblk + irow + nthrds].ntd = omp_get_num_threads(); +league[ icol * nrow + iblk * wblk + irow + nthrds].itm = omp_get_team_num(); +league[ icol * nrow + iblk * wblk + irow + nthrds].ltm = omp_get_num_teams(); +league[(icol + 1) * nrow + iblk * wblk + irow ].itd = omp_get_thread_num(); +league[(icol + 1) * nrow + iblk * wblk + irow ].ntd = omp_get_num_threads(); +league[(icol + 1) * nrow + iblk * wblk + irow ].itm = omp_get_team_num() + 10; +league[(icol + 1) * nrow + iblk * wblk + irow ].ltm = omp_get_num_teams(); +league[(icol + 1) * nrow + iblk * wblk + irow + nthrds].itd = omp_get_thread_num() + 10; +league[(icol + 1) * nrow + iblk * wblk + irow + nthrds].ntd = omp_get_num_threads(); +league[(icol + 1) * nrow + iblk * wblk + irow + nthrds].itm = omp_get_team_num() + 10; +league[(icol + 1) * nrow + iblk * wblk + irow + nthrds].ltm = omp_get_num_teams(); } } } diff --git a/09_matAdd/src/matAddAB.c b/09_matAdd/src/matAddAB.c index d580f7c..6cec8be 100644 --- a/09_matAdd/src/matAddAB.c +++ b/09_matAdd/src/matAddAB.c @@ -163,10 +163,11 @@ for (int i = 0; i < n; ++i) { default(none) shared(a, b, n) for (int j = 0; j < n; ++j) { for (int iblk = 0; iblk < n / NTHRDS9; ++iblk) { -for (int i = iblk * NTHRDS9; - i < iblk * NTHRDS9 + NTHRDS8; ++i) { - a[j * n + i ] += b[j * n + i ]; - a[j * n + i + NTHRDS8] += b[j * n + i + NTHRDS8]; +for (int i = 0; i < NTHRDS8; ++i) { + a[j * n + iblk * NTHRDS9 + i ] += + b[j * n + iblk * NTHRDS9 + i ]; + a[j * n + iblk * NTHRDS9 + i + NTHRDS8] += + b[j * n + iblk * 
NTHRDS9 + i + NTHRDS8]; } /* end i-loop */ } /* end iblk-loop */ } /* end j-loop */ @@ -194,12 +195,15 @@ for (int i = iblk * NTHRDS9; default(none) shared(a, b, n, halfn) for (int j = 0; j < n; ++j) { for (int iblk = 0; iblk < n / NTHRDS9; ++iblk) { -for (int i = iblk * NTHRDS8; - i < iblk * NTHRDS8 + NTHRDS7; ++i) { - a[j * n + i ] += b[j * n + i ]; - a[j * n + i + NTHRDS7] += b[j * n + i + NTHRDS7]; - a[j * n + i + halfn ] += b[j * n + i + halfn ]; - a[j * n + i + halfn + NTHRDS7] += b[j * n + i + halfn + NTHRDS7]; +for (int i = 0; i < NTHRDS7; ++i) { + a[j * n + iblk * NTHRDS8 + i ] += + b[j * n + iblk * NTHRDS8 + i ]; + a[j * n + iblk * NTHRDS8 + i + NTHRDS7] += + b[j * n + iblk * NTHRDS8 + i + NTHRDS7]; + a[j * n + iblk * NTHRDS8 + i + halfn ] += + b[j * n + iblk * NTHRDS8 + i + halfn ]; + a[j * n + iblk * NTHRDS8 + i + halfn + NTHRDS7] += + b[j * n + iblk * NTHRDS8 + i + halfn + NTHRDS7]; } /* end i-loop */ } /* end iblk-loop */ } /* end j-loop */ @@ -228,16 +232,23 @@ for (int i = iblk * NTHRDS8; default(none) shared(a, b, n, halfn) for (int j = 0; j < halfn; ++j) { for (int iblk = 0; iblk < n / NTHRDS9; ++iblk) { -for (int i = iblk * NTHRDS8; - i < iblk * NTHRDS8 + NTHRDS7; ++i) { - a[ j * n + i ] += b[ j * n + i ]; - a[ j * n + i + NTHRDS7] += b[ j * n + i + NTHRDS7]; - a[ j * n + i + halfn ] += b[ j * n + i + halfn ]; - a[ j * n + i + halfn + NTHRDS7] += b[ j * n + i + halfn + NTHRDS7]; - a[(j + halfn) * n + i ] += b[(j + halfn) * n + i ]; - a[(j + halfn) * n + i + NTHRDS7] += b[(j + halfn) * n + i + NTHRDS7]; - a[(j + halfn) * n + i + halfn ] += b[(j + halfn) * n + i + halfn ]; - a[(j + halfn) * n + i + halfn + NTHRDS7] += b[(j + halfn) * n + i + halfn + NTHRDS7]; +for (int i = 0; i < NTHRDS7; ++i) { + a[ j * n + iblk * NTHRDS8 + i ] += + b[ j * n + iblk * NTHRDS8 + i ]; + a[ j * n + iblk * NTHRDS8 + i + NTHRDS7] += + b[ j * n + iblk * NTHRDS8 + i + NTHRDS7]; + a[ j * n + iblk * NTHRDS8 + i + halfn ] += + b[ j * n + iblk * NTHRDS8 + i + halfn ]; + a[ j * n + iblk * NTHRDS8 + i + halfn + NTHRDS7] += + b[ j * n + iblk * NTHRDS8 + i + halfn + NTHRDS7]; + a[(j + halfn) * n + iblk * NTHRDS8 + i ] += + b[(j + halfn) * n + iblk * NTHRDS8 + i ]; + a[(j + halfn) * n + iblk * NTHRDS8 + i + NTHRDS7] += + b[(j + halfn) * n + iblk * NTHRDS8 + i + NTHRDS7]; + a[(j + halfn) * n + iblk * NTHRDS8 + i + halfn ] += + b[(j + halfn) * n + iblk * NTHRDS8 + i + halfn ]; + a[(j + halfn) * n + iblk * NTHRDS8 + i + halfn + NTHRDS7] += + b[(j + halfn) * n + iblk * NTHRDS8 + i + halfn + NTHRDS7]; } /* end i-loop */ } /* end iblk-loop */ } /* end j-loop */ diff --git a/09_matAdd/tests/matAdd_real_00.sh.5329174.out b/09_matAdd/tests/matAdd_real_00.sh.5329174.out deleted file mode 100644 index d3dd40f..0000000 --- a/09_matAdd/tests/matAdd_real_00.sh.5329174.out +++ /dev/null @@ -1,11 +0,0 @@ -hallo from gpu029 -matrix dim: 4096 x 4096 -time averaged over 64 loops -matAddAB (0) : 2.0 GB/s 86.6 GB/s maxabserr = 0.0 -matAddAB (1) : 2.0 GB/s 35.8 GB/s maxabserr = 0.0 -matAddAB (2) : 2.0 GB/s 49.1 GB/s maxabserr = 0.0 -matAddAB (3) : 2.1 GB/s 179.9 GB/s maxabserr = 0.0 -matAddAB (4) : 2.1 GB/s 180.1 GB/s maxabserr = 0.0 -matAddAB (5) : 2.0 GB/s 180.8 GB/s maxabserr = 0.0 -matAddAB (6) : 2.1 GB/s 184.1 GB/s maxabserr = 0.0 -matAddAB (7) : 2.0 GB/s 180.8 GB/s maxabserr = 0.0 diff --git a/09_matAdd/tests/matAdd_real_00.sh.5422037.out b/09_matAdd/tests/matAdd_real_00.sh.5422037.out new file mode 100644 index 0000000..dde9ad9 --- /dev/null +++ b/09_matAdd/tests/matAdd_real_00.sh.5422037.out @@ -0,0 +1,11 @@ +hallo 
from gpu011 +matrix dim: 4096 x 4096 +time averaged over 64 loops +matAddAB (0) : 2.9 GB/s 94.2 GB/s maxabserr = 0.0 +matAddAB (1) : 2.8 GB/s 36.3 GB/s maxabserr = 0.0 +matAddAB (2) : 2.9 GB/s 49.7 GB/s maxabserr = 0.0 +matAddAB (3) : 3.0 GB/s 193.5 GB/s maxabserr = 0.0 +matAddAB (4) : 3.2 GB/s 194.4 GB/s maxabserr = 0.0 +matAddAB (5) : 3.3 GB/s 197.9 GB/s maxabserr = 0.0 +matAddAB (6) : 3.3 GB/s 195.7 GB/s maxabserr = 0.0 +matAddAB (7) : 3.1 GB/s 201.1 GB/s maxabserr = 0.0 diff --git a/10_matMul/README.md b/10_matMul/README.md index 82cdc60..131cd3e 100644 --- a/10_matMul/README.md +++ b/10_matMul/README.md @@ -12,6 +12,8 @@ numerical results are also verified. * Column-major is assumed thru the entire code! +* `i` and `j` are indices for row and column, respectively. + * For testing the dimension of all matrices are assumed to be 4096 x 4096. * The following table only summarizes the most important points. For more @@ -29,16 +31,25 @@ numerical results are also verified. | 4 | jik-loop, 2^9 threads * 2^f teams, collapse(2), | | | 4x k-loop unrolling | | 5 | jik-loop, 2^7 threads * 2^f teams, collapse(3), | -| | 4x i-loop unrolling (2x + 2x), | +| | 4x i-loop unrolling (stride of 2^7 rows), | | | 4x k-loop unrolling, | | | rb: 4x data reuse | -| 6 | jik-loop, 2^7 threads * 2^e teams, collapse(3), | -| | 2x j-loop unrolling, | -| | 4x i-loop unrolling (2x + 2x), | +| 6 | jik-loop, 2^7 threads * 2^d teams, collapse(3), | +| | 4x j-loop unrolling (stride of 1 col ), | +| | 4x i-loop unrolling (stride of 2^7 rows), | +| | 4x k-loop unrolling, | +| | ra: 4x data reuse, | +| | rb: 4x data reuse, | +| | register blocking | +| 7 | based on (2), jik-loop, 2^8 threads * 2^g teams, collapse(2) | +| 8 | based on (7), jik-loop, 2^8 threads * 2^g teams, collapse(2), | +| | GPU shared memory for data re-use, 16x k-loop unrolling, | +| | shared memory blocking | +| 9 | based on (5), jik-loop, 2^7 threads * 2^f teams, collapse(2), | +| | 4x i-loop unrolling (stride of n/4 rows), | | | 4x k-loop unrolling, | -| | ra: 2x data reuse, | | | rb: 4x data reuse | -| 7 | cublasSgemm in CUBLAS | +| 10 | cublasSgemm in CUBLAS | # Build diff --git a/10_matMul/configure.ac b/10_matMul/configure.ac index 9b264e7..dcdb628 100644 --- a/10_matMul/configure.ac +++ b/10_matMul/configure.ac @@ -39,9 +39,9 @@ LDFLAGS+="-L${CUDALIB} -L${MKLLIB}" # AC_PROG_CC([clang gcc]) AS_IF([test "${CC}" = gcc], - [CFLAGS="-Wall -fopenmp -foffload=nvptx-none $CFLAGS"]) + [CFLAGS="-Wall -O2 -fopenmp -foffload=nvptx-none $CFLAGS"]) AS_IF([test "${CC}" = clang], - [CFLAGS="-Wall -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda \ + [CFLAGS="-Wall -Werror -O2 -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ -Xopenmp-target -march=sm_61 $CFLAGS"]) ##############################################################################80 # diff --git a/10_matMul/docs/UserManual.md b/10_matMul/docs/UserManual.md index ca6ac34..21f78fb 100644 --- a/10_matMul/docs/UserManual.md +++ b/10_matMul/docs/UserManual.md @@ -29,16 +29,25 @@ numerical results are also verified. 
| 4 | jik-loop, 2^9 threads * 2^f teams, collapse(2), | | | 4x k-loop unrolling | | 5 | jik-loop, 2^7 threads * 2^f teams, collapse(3), | -| | 4x i-loop unrolling (2x + 2x), | +| | 4x i-loop unrolling (stride of 2^7 rows), | | | 4x k-loop unrolling, | | | rb: 4x data reuse | -| 6 | jik-loop, 2^7 threads * 2^e teams, collapse(3), | -| | 2x j-loop unrolling, | -| | 4x i-loop unrolling (2x + 2x), | +| 6 | jik-loop, 2^7 threads * 2^d teams, collapse(3), | +| | 4x j-loop unrolling (stride of 1 col ), | +| | 4x i-loop unrolling (stride of 2^7 rows), | +| | 4x k-loop unrolling, | +| | ra: 4x data reuse, | +| | rb: 4x data reuse, | +| | register blocking | +| 7 | based on (2), jik-loop, 2^8 threads * 2^g teams, collapse(2) | +| 8 | based on (7), jik-loop, 2^8 threads * 2^g teams, collapse(2), | +| | GPU shared memory for data re-use, 16x k-loop unrolling, | +| | shared memory blocking | +| 9 | based on (5), jik-loop, 2^7 threads * 2^f teams, collapse(2), | +| | 4x i-loop unrolling (stride of n/4 rows), | | | 4x k-loop unrolling, | -| | ra: 2x data reuse, | | | rb: 4x data reuse | -| 7 | cublasSgemm in CUBLAS | +| 10 | cublasSgemm in CUBLAS | # Usage diff --git a/10_matMul/src/matMul.c b/10_matMul/src/matMul.c index b5e458a..526f00c 100644 --- a/10_matMul/src/matMul.c +++ b/10_matMul/src/matMul.c @@ -75,7 +75,7 @@ int main(int argc, char *argv[]) /* * matMul on accl */ - for (ial = 0; ial < 9; ++ial) { + for (ial = 0; ial < 11; ++ial) { /* * See matMulAB.c for details: * @@ -96,15 +96,28 @@ int main(int argc, char *argv[]) * 4x k-loop unrolling * * 5: jik-loop, 2^7 threads * 2^f teams, collapse(3), - * 4x i-loop unrolling (2x + 2x), + * 4x i-loop unrolling (stride of 2^7 rows), * 4x k-loop unrolling, * rb: 4x data reuse * - * 6: jik-loop, 2^7 threads * 2^e teams, collapse(3), - * 2x j-loop unrolling, - * 4x i-loop unrolling (2x + 2x), + * 6: jik-loop, 2^7 threads * 2^d teams, collapse(3), + * 4x j-loop unrolling (stride of 1 col ), + * 4x i-loop unrolling (stride of 2^7 rows), + * 4x k-loop unrolling, + * rb: 4x data reuse, + * ra: 4x data reuse, + * register blocking + * + * 7: based on (2), jik-loop, 2^8 threads * 2^g teams, collapse(2) + * + * 8: based on (7), jik-loop, 2^8 threads * 2^g teams, collapse(2) + * GPU shared memory for data re-use, + * 16x k-loop unrolling, + * shared memory blocking + * + * 9: based on (5), jik-loop, 2^7 threads * 2^f teams, collapse(2), + * 4x i-loop unrolling (stride of n/4 rows), * 4x k-loop unrolling, - * ra: 2x data reuse, * rb: 4x data reuse * * otherwise: cublasSgemm in CUBLAS diff --git a/10_matMul/src/matMulAB.c b/10_matMul/src/matMulAB.c index de63847..21cf6a8 100644 --- a/10_matMul/src/matMulAB.c +++ b/10_matMul/src/matMulAB.c @@ -11,10 +11,6 @@ * @copyright CC BY-SA 2.0 */ -#ifdef __cplusplus -extern "C" { -#endif - #include #include #include @@ -25,13 +21,18 @@ extern "C" { #include "cublas_v2.h" #include "matMulAB.h" -#define NTHRDS7 (1 << 0x7) -#define NTHRDS8 (1 << 0x8) -#define NTHRDS9 (1 << 0x9) +#define NTHRDS7 (1 << 0x7) /* 2^{7} */ +#define NTHRDS8 (1 << 0x8) /* 2^{8} */ +#define NTHRDS9 (1 << 0x9) /* 2^{9} */ + +#define LTEAMSD (1 << 0xD) /* 2^{13} */ +#define LTEAMSE (1 << 0xE) /* 2^{14} */ +#define LTEAMSF (1 << 0xF) /* 2^{15} */ +#define LTEAMSG (1 << 020) /* 2^{16} */ -#define LTEAMSD (1 << 0xD) -#define LTEAMSE (1 << 0xE) -#define LTEAMSF (1 << 0xF) +#define NPITCH (1024) +#define BLKROW (512) +#define BLKDIM (16) double wtcalc; @@ -48,7 +49,6 @@ void matMulAB_accl(float *a, *b_dev = NULL, *c_dev = NULL; struct timespec rt[2]; - int halfn 
= n / 2; switch (ial) { case 0: @@ -63,7 +63,7 @@ void matMulAB_accl(float *a, map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) { clock_gettime(CLOCK_REALTIME, rt + 0); -#pragma omp target teams device(0) num_teams(LTEAMSF) \ +#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ default(none) shared(a, b, c, n) #pragma omp distribute parallel for num_threads(NTHRDS9) \ @@ -94,7 +94,7 @@ for (int i = 0; i < n; ++i) { /* sequential */ map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) { clock_gettime(CLOCK_REALTIME, rt + 0); -#pragma omp target teams device(0) num_teams(LTEAMSF) \ +#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ default(none) shared(a, b, c, n) #pragma omp distribute parallel for num_threads(NTHRDS9) \ @@ -123,7 +123,7 @@ for (int k = 0; k < n; ++k) { /* sequential */ map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) { clock_gettime(CLOCK_REALTIME, rt + 0); -#pragma omp target teams device(0) num_teams(LTEAMSF) \ +#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ default(none) shared(a, b, c, n) #pragma omp distribute parallel for num_threads(NTHRDS9) \ @@ -154,7 +154,7 @@ for (int i = 0; i < n; ++i) { /* parallel */ map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) { clock_gettime(CLOCK_REALTIME, rt + 0); -#pragma omp target teams device(0) num_teams(LTEAMSF) \ +#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ default(none) shared(a, b, c, n) #pragma omp distribute parallel for num_threads(NTHRDS9) \ @@ -177,12 +177,18 @@ for (int k = 0; k < n; ++k) { /* parallel */ * - jik-loop * - 2^9 threads per team and 2^15 teams * - 4x k-loop unrolling + * + * good: more work for one thread per iteration. + * bad : one thread must read b 4 times in k-loop. + * all threads in a team do the same read of b (waste of instructions). + * tips: each thread reads the corresponding element in b and + * saves it in shared memory. 
*/ #pragma omp target data device(0) \ map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) { clock_gettime(CLOCK_REALTIME, rt + 0); -#pragma omp target teams device(0) num_teams(LTEAMSF) \ +#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ default(none) shared(a, b, c, n) #pragma omp distribute parallel for num_threads(NTHRDS9) \ @@ -193,15 +199,10 @@ for (int i = 0; i < n; ++i) { float rc; rc = c[j * n + i]; for (int k = 0; k < n; k += 4) { /* 4x unrolling */ - float rb0, rb1, rb2, rb3; - rb0 = b[j * n + k ]; - rb1 = b[j * n + k + 1]; - rb2 = b[j * n + k + 2]; - rb3 = b[j * n + k + 3]; - rc += a[ k * n + i] * rb0 - + a[(k + 1) * n + i] * rb1 - + a[(k + 2) * n + i] * rb2 - + a[(k + 3) * n + i] * rb3; + rc += a[ k * n + i] * b[j * n + k ]; + rc += a[(k + 1) * n + i] * b[j * n + k + 1]; + rc += a[(k + 2) * n + i] * b[j * n + k + 2]; + rc += a[(k + 3) * n + i] * b[j * n + k + 3]; } c[j * n + i] = rc; } /* end i-loop */ @@ -214,58 +215,61 @@ for (int i = 0; i < n; ++i) { * - jik-loop * - 2^7 threads per team and 2^15 teams * - collapse(3) - * - 4x i-loop unrolling - * * 2x by number of threads - * * 2x by half of rows + * - 4x i-loop unrolling (stride of 2^7 rows) * - 4x k-loop unrolling - * - rb: 4x data reuse + * - rb: 4x data re-use + * + * The integer calculation of matrix indices looks ugly. But considering the GPU + * hardware architecture, e.g. many separate INT32 units, these calculations are + * much faster than accessing GPU global memory and save the precious registers. + * */ #pragma omp target data device(0) \ - map(to:n, halfn, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) + map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) { clock_gettime(CLOCK_REALTIME, rt + 0); -#pragma omp target teams device(0) num_teams(LTEAMSF) \ - map(to:n, halfn, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ - default(none) shared(a, b, c, n, halfn) +#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS7) \ + map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ + default(none) shared(a, b, c, n) #pragma omp distribute parallel for num_threads(NTHRDS7) \ dist_schedule(static, NTHRDS7) collapse(3) \ - default(none) shared(a, b, c, n, halfn) + default(none) shared(a, b, c, n) for (int j = 0; j < n; ++j) { -for (int iblk = 0; iblk < n / NTHRDS9; ++iblk) { /* 4x unrolling */ -for (int i = iblk * NTHRDS8; - i < iblk * NTHRDS8 + NTHRDS7; ++i) { +for (int iblk = 0; iblk < n / BLKROW; ++iblk) { +for (int i = 0; i < NTHRDS7; ++i) { /* 4x unrolling */ float rc0, rc1, rc2, rc3; - rc0 = c[j * n + i ]; - rc1 = c[j * n + i + NTHRDS7 ]; - rc2 = c[j * n + i + halfn]; - rc3 = c[j * n + i + NTHRDS7 + halfn]; + rc0 = c[j * n + iblk * BLKROW + i ]; + rc1 = c[j * n + iblk * BLKROW + i + NTHRDS7 ]; + rc2 = c[j * n + iblk * BLKROW + i + NTHRDS7 * 2]; + rc3 = c[j * n + iblk * BLKROW + i + NTHRDS7 * 3]; for (int k = 0; k < n; k += 4) { /* 4x unrolling */ + /* register for b: 4x k-loop */ float rb0, rb1, rb2, rb3; - rb0 = b[j * n + k ]; - rb1 = b[j * n + k + 1]; - rb2 = b[j * n + k + 2]; - rb3 = b[j * n + k + 3]; - rc0+= a[ k * n + i ] * rb0 - + a[(k + 1) * n + i ] * rb1 - + a[(k + 2) * n + i ] * rb2 - + a[(k + 3) * n + i ] * rb3; - rc1+= a[ k * n + i + NTHRDS7 ] * rb0 - + a[(k + 1) * n + i + NTHRDS7 ] * rb1 - + a[(k + 2) * n + i + NTHRDS7 ] * rb2 - + a[(k + 3) * n + i + NTHRDS7 ] * rb3; - rc2+= a[ k * n + i + halfn] * rb0 - + a[(k + 1) * n + i + halfn] * rb1 - + a[(k + 2) * n + i + halfn] * rb2 - + a[(k 
+ 3) * n + i + halfn] * rb3; - rc3+= a[ k * n + i + NTHRDS7 + halfn] * rb0 - + a[(k + 1) * n + i + NTHRDS7 + halfn] * rb1 - + a[(k + 2) * n + i + NTHRDS7 + halfn] * rb2 - + a[(k + 3) * n + i + NTHRDS7 + halfn] * rb3; + rb0 = b[j * n + k ]; + rb1 = b[j * n + k + 1]; + rb2 = b[j * n + k + 2]; + rb3 = b[j * n + k + 3]; + rc0 += a[ k * n + iblk * BLKROW + i ] * rb0; + rc0 += a[(k + 1) * n + iblk * BLKROW + i ] * rb1; + rc0 += a[(k + 2) * n + iblk * BLKROW + i ] * rb2; + rc0 += a[(k + 3) * n + iblk * BLKROW + i ] * rb3; + rc1 += a[ k * n + iblk * BLKROW + i + NTHRDS7 ] * rb0; + rc1 += a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 ] * rb1; + rc1 += a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 ] * rb2; + rc1 += a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 ] * rb3; + rc2 += a[ k * n + iblk * BLKROW + i + NTHRDS7 * 2] * rb0; + rc2 += a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 * 2] * rb1; + rc2 += a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 * 2] * rb2; + rc2 += a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 * 2] * rb3; + rc3 += a[ k * n + iblk * BLKROW + i + NTHRDS7 * 3] * rb0; + rc3 += a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 * 3] * rb1; + rc3 += a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 * 3] * rb2; + rc3 += a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 * 3] * rb3; } - c[j * n + i ] = rc0; - c[j * n + i + NTHRDS7 ] = rc1; - c[j * n + i + halfn] = rc2; - c[j * n + i + NTHRDS7 + halfn] = rc3; + c[j * n + iblk * BLKROW + i ] = rc0; + c[j * n + iblk * BLKROW + i + NTHRDS7 ] = rc1; + c[j * n + iblk * BLKROW + i + NTHRDS7 * 2] = rc2; + c[j * n + iblk * BLKROW + i + NTHRDS7 * 3] = rc3; } /* end i-loop */ } /* end iblk-loop */ } /* end j-loop */ @@ -275,118 +279,357 @@ for (int i = iblk * NTHRDS8; case 6: /* * - jik-loop - * - 2^7 threads per team and 2^14 teams + * - 2^7 threads per team and 2^13 teams * - collapse(3) - * - 2x j-loop unrolling by half of cols - * - 4x i-loop unrolling - * * 2x by number of threads - * * 2x by half of rows + * - 4x j-loop unrolling (stride of 1 col ) + * - 4x i-loop unrolling (stride of 2^7 rows) * - 4x k-loop unrolling + * - rb: 4x data re-use + * - ra: 4x data re-use + * - register blocking */ #pragma omp target data device(0) \ - map(to:n, halfn, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) + map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) { clock_gettime(CLOCK_REALTIME, rt + 0); -#pragma omp target teams device(0) num_teams(LTEAMSE) \ - map(to:n, halfn, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ - default(none) shared(a, b, c, n, halfn) +#pragma omp target teams device(0) num_teams(LTEAMSD) thread_limit(NTHRDS7) \ + map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ + default(none) shared(a, b, c, n) #pragma omp distribute parallel for num_threads(NTHRDS7) \ dist_schedule(static, NTHRDS7) collapse(3) \ - default(none) shared(a, b, c, n, halfn) -for (int j = 0; j < halfn; ++j) { /* 2x unrolling */ -for (int iblk = 0; iblk < n / NTHRDS9; ++iblk) { /* 4x unrolling */ -for (int i = iblk * NTHRDS8; - i < iblk * NTHRDS8 + NTHRDS7; ++i) { - /* register for c: 2x j-loop * 4x i-loop */ + default(none) shared(a, b, c, n) +for (int j = 0; j < n; j += 4) { /* 4x unrolling */ +for (int iblk = 0; iblk < n / BLKROW; ++iblk) { +for (int i = 0; i < NTHRDS7; ++i) { /* 4x unrolling */ + /* register for c: 4x j-loop * 4x i-loop */ float rc0, rc1, rc2, rc3, - rc4, rc5, rc6, rc7; - rc0 = c[ j * n + i ]; - rc1 = c[ j * n + i + NTHRDS7 ]; - rc2 = c[ j * n + i + halfn]; - rc3 = c[ j * n + i + NTHRDS7 + halfn]; - rc4 = c[(j + halfn) * n + i ]; - rc5 = c[(j + halfn) * n + i 
+ NTHRDS7 ]; - rc6 = c[(j + halfn) * n + i + halfn]; - rc7 = c[(j + halfn) * n + i + NTHRDS7 + halfn]; + rc4, rc5, rc6, rc7, + rc8, rc9, rca, rcb, + rcc, rcd, rce, rcf; + rc0 = c[ j * n + iblk * BLKROW + i ]; + rc1 = c[ j * n + iblk * BLKROW + i + NTHRDS7 ]; + rc2 = c[ j * n + iblk * BLKROW + i + NTHRDS7 * 2]; + rc3 = c[ j * n + iblk * BLKROW + i + NTHRDS7 * 3]; + rc4 = c[(j + 1) * n + iblk * BLKROW + i ]; + rc5 = c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 ]; + rc6 = c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 * 2]; + rc7 = c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 * 3]; + rc8 = c[(j + 2) * n + iblk * BLKROW + i ]; + rc9 = c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 ]; + rca = c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 * 2]; + rcb = c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 * 3]; + rcc = c[(j + 3) * n + iblk * BLKROW + i ]; + rcd = c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 ]; + rce = c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 * 2]; + rcf = c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 * 3]; for (int k = 0; k < n; k += 4) { /* 4x unrolling */ - /* register for b: 2x j-loop * 4x k-loop */ + /* register for b: 4x j-loop * 4x k-loop */ float rb0, rb1, rb2, rb3, - rb4, rb5, rb6, rb7; + rb4, rb5, rb6, rb7, + rb8, rb9, rba, rbb, + rbc, rbd, rbe, rbf; + rb0 = b[ j * n + k ]; + rb1 = b[ j * n + k + 1]; + rb2 = b[ j * n + k + 2]; + rb3 = b[ j * n + k + 3]; + rb4 = b[(j + 1) * n + k ]; + rb5 = b[(j + 1) * n + k + 1]; + rb6 = b[(j + 1) * n + k + 2]; + rb7 = b[(j + 1) * n + k + 3]; + rb8 = b[(j + 2) * n + k ]; + rb9 = b[(j + 2) * n + k + 1]; + rba = b[(j + 2) * n + k + 2]; + rbb = b[(j + 2) * n + k + 3]; + rbc = b[(j + 3) * n + k ]; + rbd = b[(j + 3) * n + k + 1]; + rbe = b[(j + 3) * n + k + 2]; + rbf = b[(j + 3) * n + k + 3]; /* register for a: 4x i-loop * 4x k-loop */ float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, raa, rab, rac, rad, rae, raf; - rb0 = b[ j * n + k ]; - rb1 = b[ j * n + k + 1]; - rb2 = b[ j * n + k + 2]; - rb3 = b[ j * n + k + 3]; - rb4 = b[(j + halfn) * n + k ]; - rb5 = b[(j + halfn) * n + k + 1]; - rb6 = b[(j + halfn) * n + k + 2]; - rb7 = b[(j + halfn) * n + k + 3]; - ra0 = a[ k * n + i ]; - ra1 = a[(k + 1) * n + i ]; - ra2 = a[(k + 2) * n + i ]; - ra3 = a[(k + 3) * n + i ]; - ra4 = a[ k * n + i + NTHRDS7 ]; - ra5 = a[(k + 1) * n + i + NTHRDS7 ]; - ra6 = a[(k + 2) * n + i + NTHRDS7 ]; - ra7 = a[(k + 3) * n + i + NTHRDS7 ]; - ra8 = a[ k * n + i + halfn]; - ra9 = a[(k + 1) * n + i + halfn]; - raa = a[(k + 2) * n + i + halfn]; - rab = a[(k + 3) * n + i + halfn]; - rac = a[ k * n + i + NTHRDS7 + halfn]; - rad = a[(k + 1) * n + i + NTHRDS7 + halfn]; - rae = a[(k + 2) * n + i + NTHRDS7 + halfn]; - raf = a[(k + 3) * n + i + NTHRDS7 + halfn]; + ra0 = a[ k * n + iblk * BLKROW + i ]; + ra1 = a[ k * n + iblk * BLKROW + i + NTHRDS7 ]; + ra2 = a[ k * n + iblk * BLKROW + i + NTHRDS7 * 2]; + ra3 = a[ k * n + iblk * BLKROW + i + NTHRDS7 * 3]; + ra4 = a[(k + 1) * n + iblk * BLKROW + i ]; + ra5 = a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 ]; + ra6 = a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 * 2]; + ra7 = a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 * 3]; + ra8 = a[(k + 2) * n + iblk * BLKROW + i ]; + ra9 = a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 ]; + raa = a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 * 2]; + rab = a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 * 3]; + rac = a[(k + 3) * n + iblk * BLKROW + i ]; + rad = a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 ]; + rae = a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 * 2]; + raf = a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 * 3]; /* 
* register blocking */ - rc0+= ra0 * rb0 - + ra1 * rb1 - + ra2 * rb2 - + ra3 * rb3; - rc1+= ra4 * rb0 - + ra5 * rb1 - + ra6 * rb2 - + ra7 * rb3; - rc2+= ra8 * rb0 - + ra9 * rb1 - + raa * rb2 - + rab * rb3; - rc3+= rac * rb0 - + rad * rb1 - + rae * rb2 - + raf * rb3; - rc4+= ra0 * rb4 - + ra1 * rb5 - + ra2 * rb6 - + ra3 * rb7; - rc5+= ra4 * rb4 - + ra5 * rb5 - + ra6 * rb6 - + ra7 * rb7; - rc6+= ra8 * rb4 - + ra9 * rb5 - + raa * rb6 - + rab * rb7; - rc7+= rac * rb4 - + rad * rb5 - + rae * rb6 - + raf * rb7; + // col 1 of c: + rc0 += ra0 * rb0; + rc0 += ra4 * rb1; + rc0 += ra8 * rb2; + rc0 += rac * rb3; + rc1 += ra1 * rb0; + rc1 += ra5 * rb1; + rc1 += ra9 * rb2; + rc1 += rad * rb3; + rc2 += ra2 * rb0; + rc2 += ra6 * rb1; + rc2 += raa * rb2; + rc2 += rae * rb3; + rc3 += ra3 * rb0; + rc3 += ra7 * rb1; + rc3 += rab * rb2; + rc3 += raf * rb3; + // col 2 of c: + rc4 += ra0 * rb4; + rc4 += ra4 * rb5; + rc4 += ra8 * rb6; + rc4 += rac * rb7; + rc5 += ra1 * rb4; + rc5 += ra5 * rb5; + rc5 += ra9 * rb6; + rc5 += rad * rb7; + rc6 += ra2 * rb4; + rc6 += ra6 * rb5; + rc6 += raa * rb6; + rc6 += rae * rb7; + rc7 += ra3 * rb4; + rc7 += ra7 * rb5; + rc7 += rab * rb6; + rc7 += raf * rb7; + // col 3 of c: + rc8 += ra0 * rb8; + rc8 += ra4 * rb9; + rc8 += ra8 * rba; + rc8 += rac * rbb; + rc9 += ra1 * rb8; + rc9 += ra5 * rb9; + rc9 += ra9 * rba; + rc9 += rad * rbb; + rca += ra2 * rb8; + rca += ra6 * rb9; + rca += raa * rba; + rca += rae * rbb; + rcb += ra3 * rb8; + rcb += ra7 * rb9; + rcb += rab * rba; + rcb += raf * rbb; + // col 4 of c: + rcc += ra0 * rbc; + rcc += ra4 * rbd; + rcc += ra8 * rbe; + rcc += rac * rbf; + rcd += ra1 * rbc; + rcd += ra5 * rbd; + rcd += ra9 * rbe; + rcd += rad * rbf; + rce += ra2 * rbc; + rce += ra6 * rbd; + rce += raa * rbe; + rce += rae * rbf; + rcf += ra3 * rbc; + rcf += ra7 * rbd; + rcf += rab * rbe; + rcf += raf * rbf; } - c[ j * n + i ] = rc0; - c[ j * n + i + NTHRDS7 ] = rc1; - c[ j * n + i + halfn] = rc2; - c[ j * n + i + NTHRDS7 + halfn] = rc3; - c[(j + halfn) * n + i ] = rc4; - c[(j + halfn) * n + i + NTHRDS7 ] = rc5; - c[(j + halfn) * n + i + halfn] = rc6; - c[(j + halfn) * n + i + NTHRDS7 + halfn] = rc7; + c[ j * n + iblk * BLKROW + i ] = rc0; + c[ j * n + iblk * BLKROW + i + NTHRDS7 ] = rc1; + c[ j * n + iblk * BLKROW + i + NTHRDS7 * 2] = rc2; + c[ j * n + iblk * BLKROW + i + NTHRDS7 * 3] = rc3; + c[(j + 1) * n + iblk * BLKROW + i ] = rc4; + c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 ] = rc5; + c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 * 2] = rc6; + c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 * 3] = rc7; + c[(j + 2) * n + iblk * BLKROW + i ] = rc8; + c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 ] = rc9; + c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 * 2] = rca; + c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 * 3] = rcb; + c[(j + 3) * n + iblk * BLKROW + i ] = rcc; + c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 ] = rcd; + c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 * 2] = rce; + c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 * 3] = rcf; } /* end i-loop */ } /* end iblk-loop */ +} /* end j-loop */ + clock_gettime(CLOCK_REALTIME, rt + 1); +} + break; + case 7: +/* + * - based on case 2 + * - jik-loop + * - 2^8 threads per team and 2^16 teams + * - collapse(2) + * - no race condition + */ +#pragma omp target data device(0) \ + map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) +{ + clock_gettime(CLOCK_REALTIME, rt + 0); +#pragma omp target teams device(0) num_teams(LTEAMSG) thread_limit(NTHRDS8) \ + map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ + default(none) 
shared(a, b, c, n) +#pragma omp distribute parallel for num_threads(NTHRDS8) \ + dist_schedule(static, NTHRDS8) collapse(2) \ + default(none) shared(a, b, c, n) +for (int j = 0; j < n; ++j) { /* parallel */ +for (int i = 0; i < n; ++i) { /* parallel */ + float rc; + rc = c[j * n + i]; + for (int k = 0; k < n; ++k) { /* sequential */ + rc += a[k * n + i] * b[j * n + k]; + } + c[j * n + i] = rc; +} /* end i-loop */ +} /* end j-loop */ + clock_gettime(CLOCK_REALTIME, rt + 1); +} + break; + case 8: +/* + * - based on case 7 + * - jik-loop + * - 2^8 threads per team and 2^16 teams + * - collapse(2) + * - GPU shared memory for data re-use + * - 16x k-loop unrolling + */ +#pragma omp target data device(0) \ + map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) +{ + clock_gettime(CLOCK_REALTIME, rt + 0); +#pragma omp target teams device(0) num_teams(LTEAMSG) thread_limit(NTHRDS8) \ + map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ + default(none) shared(a, b, c, n) +{ + // GPU shared memory for each team + /* + * I have tested the bank conflict-free version, but it gives worse results, + * e.g. ~ 290 GFLOPS (20 GFLOPS less than the bank conflict version). + * I cannot explain ... + * + float ashm[BLKDIM][BLKDIM + 1], + bshm[BLKDIM][BLKDIM + 1]; + */ + float ashm[BLKDIM][BLKDIM], + bshm[BLKDIM][BLKDIM]; +#pragma omp distribute dist_schedule(static, 1) collapse(2) +for (int j = 0; j < n / BLKDIM; ++j) { +for (int i = 0; i < n / BLKDIM; ++i) { +#pragma omp parallel num_threads(NTHRDS8) \ + default(none) shared(a, b, c, n, ashm, bshm, i, j) +{ + /* + * The code here resembles CUDA. + */ + int td = omp_get_thread_num(); + // de-linearize the thread number + int it, // thread number along the row + jt; // thread number along the col + it = td % BLKDIM; + jt = td / BLKDIM; + int ib, // row at the beginning of block + jb; // col at the beginning of block + ib = i * BLKDIM; + jb = j * BLKDIM; + int ii, // the real row + jj; // the real col + ii = ib + it; + jj = jb + jt; + float rc = c[jj * n + ii]; // c in register + /* + * the k blocks + */ + for (int k = 0; k < n / BLKDIM; ++k) { + // read the global data to shared memory + ashm[jt][it] = a[(k * 16 + jt) * n + ii]; + bshm[jt][it] = b[jj * n + (k * 16 + it)]; +#pragma omp barrier + // shared memory blocking and 16x k-loop unrolling + rc += ashm[0x0][it] * bshm[jt][0x0]; + rc += ashm[0x1][it] * bshm[jt][0x1]; + rc += ashm[0x2][it] * bshm[jt][0x2]; + rc += ashm[0x3][it] * bshm[jt][0x3]; + rc += ashm[0x4][it] * bshm[jt][0x4]; + rc += ashm[0x5][it] * bshm[jt][0x5]; + rc += ashm[0x6][it] * bshm[jt][0x6]; + rc += ashm[0x7][it] * bshm[jt][0x7]; + rc += ashm[0x8][it] * bshm[jt][0x8]; + rc += ashm[0x9][it] * bshm[jt][0x9]; + rc += ashm[0xa][it] * bshm[jt][0xa]; + rc += ashm[0xb][it] * bshm[jt][0xb]; + rc += ashm[0xc][it] * bshm[jt][0xc]; + rc += ashm[0xd][it] * bshm[jt][0xd]; + rc += ashm[0xe][it] * bshm[jt][0xe]; + rc += ashm[0xf][it] * bshm[jt][0xf]; +#pragma omp barrier + } /* end k-loop */ + c[jj * n + ii] =rc; +} /* end omp parallel */ +} /* end i-loop */ +} /* end j-loop */ +} /* end omp target teams */ + clock_gettime(CLOCK_REALTIME, rt + 1); +} + break; + case 9: +/* + * - based on case 5 + * - only diffs are listed here: + * * collapse(2) + * * 4x i-loop unrolling (stride of n/4 rows) + */ +#pragma omp target data device(0) \ + map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) +{ + clock_gettime(CLOCK_REALTIME, rt + 0); +#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS7) \ + map(to:n, a[0:n * n], b[0:n * 
n]) map(tofrom:c[0:n * n]) \ + default(none) shared(a, b, c, n) +#pragma omp distribute parallel for num_threads(NTHRDS7) \ + dist_schedule(static, NTHRDS7) collapse(2) \ + default(none) shared(a, b, c, n) +for (int j = 0; j < n; ++j) { +for (int i = 0; i < NPITCH; ++i) { /* 4x unrolling */ + float rc0, rc1, rc2, rc3; + rc0 = c[j * n + i ]; + rc1 = c[j * n + i + NPITCH ]; + rc2 = c[j * n + i + NPITCH * 2]; + rc3 = c[j * n + i + NPITCH * 3]; + for (int k = 0; k < n; k += 4) { /* 4x unrolling */ + /* register for b: 4x k-loop */ + float rb0, rb1, rb2, rb3; + rb0 = b[j * n + k ]; + rb1 = b[j * n + k + 1]; + rb2 = b[j * n + k + 2]; + rb3 = b[j * n + k + 3]; + rc0 += a[ k * n + i ] * rb0; + rc0 += a[(k + 1) * n + i ] * rb1; + rc0 += a[(k + 2) * n + i ] * rb2; + rc0 += a[(k + 3) * n + i ] * rb3; + rc1 += a[ k * n + i + NPITCH ] * rb0; + rc1 += a[(k + 1) * n + i + NPITCH ] * rb1; + rc1 += a[(k + 2) * n + i + NPITCH ] * rb2; + rc1 += a[(k + 3) * n + i + NPITCH ] * rb3; + rc2 += a[ k * n + i + NPITCH * 2] * rb0; + rc2 += a[(k + 1) * n + i + NPITCH * 2] * rb1; + rc2 += a[(k + 2) * n + i + NPITCH * 2] * rb2; + rc2 += a[(k + 3) * n + i + NPITCH * 2] * rb3; + rc3 += a[ k * n + i + NPITCH * 3] * rb0; + rc3 += a[(k + 1) * n + i + NPITCH * 3] * rb1; + rc3 += a[(k + 2) * n + i + NPITCH * 3] * rb2; + rc3 += a[(k + 3) * n + i + NPITCH * 3] * rb3; + } + c[j * n + i ] = rc0; + c[j * n + i + NPITCH ] = rc1; + c[j * n + i + NPITCH * 2] = rc2; + c[j * n + i + NPITCH * 3] = rc3; +} /* end i-loop */ } /* end j-loop */ clock_gettime(CLOCK_REALTIME, rt + 1); } @@ -445,7 +688,3 @@ for (int i = iblk * NTHRDS8; wtcalc += (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); } } - -#ifdef __cplusplus -} -#endif diff --git a/10_matMul/tests/matMul_real_00.sh.5323229.out b/10_matMul/tests/matMul_real_00.sh.5323229.out deleted file mode 100644 index d61e8b7..0000000 --- a/10_matMul/tests/matMul_real_00.sh.5323229.out +++ /dev/null @@ -1,12 +0,0 @@ -hallo from gpu013 -matrix dim: 4096 x 4096 -time averaged over 16 loops -matMulAB (0) : 24.9 GFLOPS 25.4 GFLOPS maxabserr = 0.0 -matMulAB (1) : 10.0 GFLOPS 10.0 GFLOPS maxabserr = 0.0 -matMulAB (2) : 193.6 GFLOPS 229.2 GFLOPS maxabserr = 0.0 -matMulAB (3) : 5.0 GFLOPS 5.1 GFLOPS maxabserr = 1017.8 -matMulAB (4) : 182.2 GFLOPS 213.7 GFLOPS maxabserr = 0.0 -matMulAB (5) : 370.7 GFLOPS 529.6 GFLOPS maxabserr = 0.0 -matMulAB (6) : 540.0 GFLOPS 962.0 GFLOPS maxabserr = 0.0 -matMulAB (7) : 1028.2 GFLOPS 10463.3 GFLOPS maxabserr = 0.0 -matMulAB (8) : 1007.7 GFLOPS 10468.4 GFLOPS maxabserr = 0.0 diff --git a/10_matMul/tests/matMul_real_00.sh.5422034.out b/10_matMul/tests/matMul_real_00.sh.5422034.out new file mode 100644 index 0000000..7f94772 --- /dev/null +++ b/10_matMul/tests/matMul_real_00.sh.5422034.out @@ -0,0 +1,14 @@ +hallo from gpu011 +matrix dim: 4096 x 4096 +time averaged over 16 loops +matMulAB (0) : 24.7 GFLOPS 25.3 GFLOPS maxabserr = 0.0 +matMulAB (1) : 9.8 GFLOPS 9.9 GFLOPS maxabserr = 0.0 +matMulAB (2) : 179.0 GFLOPS 220.7 GFLOPS maxabserr = 0.0 +matMulAB (3) : 5.1 GFLOPS 5.1 GFLOPS maxabserr = 1028.8 +matMulAB (4) : 171.2 GFLOPS 209.0 GFLOPS maxabserr = 0.0 +matMulAB (5) : 342.0 GFLOPS 533.1 GFLOPS maxabserr = 0.0 +matMulAB (6) : 634.9 GFLOPS 1717.8 GFLOPS maxabserr = 0.0 +matMulAB (7) : 217.8 GFLOPS 282.3 GFLOPS maxabserr = 0.0 +matMulAB (8) : 236.0 GFLOPS 313.6 GFLOPS maxabserr = 0.0 +matMulAB (9) : 341.6 GFLOPS 534.8 GFLOPS maxabserr = 0.0 +matMulAB (10) : 988.8 GFLOPS 10645.0 GFLOPS maxabserr = 0.0
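
Appendix (not part of the patch): a minimal, self-contained sketch of the shared-memory blocking pattern that case 8 of 10_matMul/src/matMulAB.c describes. It is illustrative only, not a drop-in replacement for the patch code: the function name matMulTiledSketch and the TILE macro are invented here, column-major storage and n divisible by TILE are assumed, correctness relies on each team's parallel region actually getting TILE*TILE threads, and whether the team-local arrays really land in GPU shared memory is up to the compiler.

#include <omp.h>

#define TILE 16

/* c := c + a * b for column-major n x n matrices, n divisible by TILE */
void matMulTiledSketch(const float *a, const float *b, float *c, int n)
{
#pragma omp target teams thread_limit(TILE * TILE) \
  map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n])
  {
    /* team-local tiles; a GPU compiler is expected (not guaranteed)
     * to place these in shared memory, as in case 8 of the patch */
    float ashm[TILE][TILE],
          bshm[TILE][TILE];
#pragma omp distribute dist_schedule(static, 1) collapse(2)
    for (int jb = 0; jb < n / TILE; ++jb) {
    for (int ib = 0; ib < n / TILE; ++ib) {
      /* one team computes one TILE x TILE block of c;
       * assumes the parallel region runs with TILE * TILE threads */
#pragma omp parallel num_threads(TILE * TILE) \
  default(none) shared(a, b, c, n, ashm, bshm, ib, jb)
      {
        int td = omp_get_thread_num();
        int it = td % TILE,          /* row index within the block */
            jt = td / TILE;          /* col index within the block */
        int ii = ib * TILE + it,     /* global row */
            jj = jb * TILE + jt;     /* global col */
        float rc = c[jj * n + ii];   /* accumulate c(ii,jj) in a register */
        for (int kb = 0; kb < n / TILE; ++kb) {
          /* each thread stages one element of a tile of a and of b */
          ashm[jt][it] = a[(kb * TILE + jt) * n + ii];
          bshm[jt][it] = b[jj * n + kb * TILE + it];
#pragma omp barrier
          for (int k = 0; k < TILE; ++k)
            rc += ashm[k][it] * bshm[jt][k];
#pragma omp barrier
        }
        c[jj * n + ii] = rc;
      } /* end parallel */
    } /* end ib-loop */
    } /* end jb-loop */
  } /* end target teams */
}

The point of the pattern is data reuse: each TILE x TILE tile of a and b is read from global memory once per team and then reused TILE times from fast team-local storage, which is why case 8 outperforms the purely register-blocked variants on memory traffic. The patch itself notes that the padded, bank-conflict-free layout ([BLKDIM][BLKDIM + 1]) was tried and performed slightly worse on the tested hardware.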