From 94bd6720f191d744be886f6b3dce45a5bfc88da2 Mon Sep 17 00:00:00 2001
From: xwuupb <xinwu@mail.uni-paderborn.de>
Date: Wed, 15 Apr 2020 17:06:56 +0200
Subject: [PATCH] improved 09_matAdd

---
 09_matAdd/README.md               |  11 +--
 09_matAdd/configure.ac            |   2 +-
 09_matAdd/docs/UserManual.md      |  11 +--
 09_matAdd/src/matAdd.c            |   8 +-
 09_matAdd/src/matAddAB.c          | 129 +++++++++++++-----------------
 09_matAdd/tests/matAdd_real_00.sh |   3 +-
 10_matMul/src/matMulAB.c          |  53 ++++++------
 7 files changed, 101 insertions(+), 116 deletions(-)

diff --git a/09_matAdd/README.md b/09_matAdd/README.md
index 76077ce..ccd58e7 100644
--- a/09_matAdd/README.md
+++ b/09_matAdd/README.md
@@ -28,11 +28,12 @@ the numerical results are also verified.
 |  3  | ji-loop, 2^9 threads * 2^f teams, collapse(2),                         |
 |     | coalesced memory access                                                |
 |  4  | ji-loop, 2^8 threads * 2^f teams, collapse(3),                         |
-|     | 2x i-loop unrolling                                                    |
-|  5  | ji-loop, 2^7 threads * 2^f teams, collapse(3),                         |
-|     | 4x i-loop unrolling                                                    |
-|  6  | ji-loop, 2^7 threads * 2^e teams, collapse(3),                         |
-|     | 4x i-loop unrolling, 2x j-loop unrolling                               |
+|     | 2x i-loop unrolling (stride of 2^8 rows)                               |
+|  5  | ji-loop, 2^8 threads * 2^f teams, collapse(2),                         |
+|     | 2x i-loop unrolling (stride of n/2 rows)                               |
+|  6  | ji-loop, 2^8 threads * 2^e teams, collapse(3),                         |
+|     | 2x i-loop unrolling (stride of 2^8 rows),                              |
+|     | 2x j-loop unrolling (stride of 1   col )                               |
 |  7  | cublasSaxpy in CUBLAS                                                  |
 
 # Build
diff --git a/09_matAdd/configure.ac b/09_matAdd/configure.ac
index bcca20e..b321799 100644
--- a/09_matAdd/configure.ac
+++ b/09_matAdd/configure.ac
@@ -41,7 +41,7 @@ AC_PROG_CC([clang gcc])
 AS_IF([test "${CC}" = gcc],
   [CFLAGS="-Wall -O2 -fopenmp -foffload=nvptx-none $CFLAGS"])
 AS_IF([test "${CC}" = clang],
-  [CFLAGS="-Wall -O2 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda \
+  [CFLAGS="-Wall -Werror -O2 -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
            -Xopenmp-target -march=sm_61 $CFLAGS"])
 ##############################################################################80
 #
diff --git a/09_matAdd/docs/UserManual.md b/09_matAdd/docs/UserManual.md
index a392a83..3d6753b 100644
--- a/09_matAdd/docs/UserManual.md
+++ b/09_matAdd/docs/UserManual.md
@@ -28,11 +28,12 @@ the numerical results are also verified.
 |  3  | ji-loop, 2^9 threads * 2^f teams, collapse(2),                         |
 |     | coalesced memory access                                                |
 |  4  | ji-loop, 2^8 threads * 2^f teams, collapse(3),                         |
-|     | 2x i-loop unrolling                                                    |
-|  5  | ji-loop, 2^7 threads * 2^f teams, collapse(3),                         |
-|     | 4x i-loop unrolling                                                    |
-|  6  | ji-loop, 2^7 threads * 2^e teams, collapse(3),                         |
-|     | 4x i-loop unrolling, 2x j-loop unrolling                               |
+|     | 2x i-loop unrolling (stride of 2^8 rows)                               |
+|  5  | ji-loop, 2^8 threads * 2^f teams, collapse(2),                         |
+|     | 2x i-loop unrolling (stride of n/2 rows)                               |
+|  6  | ji-loop, 2^8 threads * 2^e teams, collapse(3),                         |
+|     | 2x i-loop unrolling (stride of 2^8 rows),                              |
+|     | 2x j-loop unrolling (stride of 1   col )                               |
 |  7  | cublasSaxpy in CUBLAS                                                  |
 
 # Usage
diff --git a/09_matAdd/src/matAdd.c b/09_matAdd/src/matAdd.c
index 382c536..c7b515a 100644
--- a/09_matAdd/src/matAdd.c
+++ b/09_matAdd/src/matAdd.c
@@ -93,11 +93,11 @@ int main(int argc, char *argv[])
      * 4: ji-loop, 2^8 threads * 2^f teams, collapse(3),
      *    2x i-loop unrolling
      *
-     * 5: ji-loop, 2^7 threads * 2^f teams, collapse(3),
-     *    4x i-loop unrolling
+     * 5: ji-loop, 2^8 threads * 2^f teams, collapse(2),
+     *    2x i-loop unrolling
      *
-     * 6: ji-loop, 2^7 threads * 2^e teams, collapse(3),
-     *    4x i-loop unrolling, 2x j-loop unrolling
+     * 6: ji-loop, 2^8 threads * 2^e teams, collapse(3),
+     *    2x i-loop unrolling, 2x j-loop unrolling
      *
      * otherwise: cublasSaxpy in CUBLAS
      */
diff --git a/09_matAdd/src/matAddAB.c b/09_matAdd/src/matAddAB.c
index 6cec8be..fa1cce0 100644
--- a/09_matAdd/src/matAddAB.c
+++ b/09_matAdd/src/matAddAB.c
@@ -21,13 +21,15 @@
 #include "cublas_v2.h"
 #include "matAddAB.h"
 
-#define NTHRDS7 (1 << 0x7)
-#define NTHRDS8 (1 << 0x8)
-#define NTHRDS9 (1 << 0x9)
+#define NTHRDS7 (1 << 0x7) /* 2^{7}  */
+#define NTHRDS8 (1 << 0x8) /* 2^{8}  */
+#define NTHRDS9 (1 << 0x9) /* 2^{9}  */
 
-#define LTEAMSD (1 << 0xD)
-#define LTEAMSE (1 << 0xE)
-#define LTEAMSF (1 << 0xF)
+#define LTEAMSD (1 << 0xD) /* 2^{13} */
+#define LTEAMSE (1 << 0xE) /* 2^{14} */
+#define LTEAMSF (1 << 0xF) /* 2^{15} */
+
+#define BLKROW  (512) /* 2x number of threads in each team */
 
 double wtcalc;
 
@@ -41,7 +43,6 @@ void matAddAB_accl(float *a,
         *a_dev = NULL,
         *b_dev = NULL;
   struct timespec rt[2];
-  int halfn = n / 2;
 
   switch (ial) {
     case 0:
@@ -54,7 +55,7 @@ void matAddAB_accl(float *a,
   map(to:n, b[0:n * n]) map(tofrom:a[0:n * n])
 {
   clock_gettime(CLOCK_REALTIME, rt + 0);
-#pragma omp target teams device(0) num_teams(LTEAMSF) \
+#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \
   map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \
   default(none) shared(a, b, n)
 #pragma omp distribute parallel for num_threads(NTHRDS9) \
@@ -79,7 +80,7 @@ for (int j = 0; j < n; ++j) { /* sequential */
   map(to:n, b[0:n * n]) map(tofrom:a[0:n * n])
 {
   clock_gettime(CLOCK_REALTIME, rt + 0);
-#pragma omp target teams device(0) num_teams(LTEAMSF) \
+#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \
   map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \
   default(none) shared(a, b, n)
 #pragma omp distribute parallel for num_threads(NTHRDS9) \
@@ -105,7 +106,7 @@ for (int i = 0; i < n; ++i) { /* sequential */
   map(to:n, b[0:n * n]) map(tofrom:a[0:n * n])
 {
   clock_gettime(CLOCK_REALTIME, rt + 0);
-#pragma omp target teams device(0) num_teams(LTEAMSF) \
+#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \
   map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \
   default(none) shared(a, b, n)
 #pragma omp distribute parallel for num_threads(NTHRDS9) \
@@ -130,7 +131,7 @@ for (int j = 0; j < n; ++j) {
   map(to:n, b[0:n * n]) map(tofrom:a[0:n * n])
 {
   clock_gettime(CLOCK_REALTIME, rt + 0);
-#pragma omp target teams device(0) num_teams(LTEAMSF) \
+#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \
   map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \
   default(none) shared(a, b, n)
 #pragma omp distribute parallel for num_threads(NTHRDS9) \
@@ -149,25 +150,25 @@ for (int i = 0; i < n; ++i) {
  * - ji-loop
  * - 2^8 threads per team and 2^f teams
  * - collapse(3)
- * - 2x i-loop unrolling by number of threads
+ * - 2x i-loop unrolling (stride of 2^8 rows)
  */
 #pragma omp target data  device(0) \
   map(to:n, b[0:n * n]) map(tofrom:a[0:n * n])
 {
   clock_gettime(CLOCK_REALTIME, rt + 0);
-#pragma omp target teams device(0) num_teams(LTEAMSF) \
+#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS8) \
   map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \
   default(none) shared(a, b, n)
 #pragma omp distribute parallel for num_threads(NTHRDS8) \
   dist_schedule(static, NTHRDS8) collapse(3) \
   default(none) shared(a, b, n)
 for (int j = 0; j < n; ++j) {
-for (int iblk = 0; iblk < n / NTHRDS9; ++iblk) {
-for (int i = 0; i < NTHRDS8; ++i) {
-  a[j * n + iblk * NTHRDS9 + i          ] +=
-  b[j * n + iblk * NTHRDS9 + i          ];
-  a[j * n + iblk * NTHRDS9 + i + NTHRDS8] +=
-  b[j * n + iblk * NTHRDS9 + i + NTHRDS8];
+for (int iblk = 0; iblk < n / BLKROW; ++iblk) {
+for (int i = 0; i < NTHRDS8; ++i) { /* 2x unrolling */
+  a[j * n + iblk * BLKROW + i          ] +=
+  b[j * n + iblk * BLKROW + i          ];
+  a[j * n + iblk * BLKROW + i + NTHRDS8] +=
+  b[j * n + iblk * BLKROW + i + NTHRDS8];
 } /* end i-loop */
 } /* end iblk-loop */
 } /* end j-loop */
@@ -177,35 +178,27 @@ for (int i = 0; i < NTHRDS8; ++i) {
     case 5:
 /*
  * - ji-loop
- * - 2^7 threads per team and 2^f teams
- * - collapse(3)
- * - 4x i-loop unrolling
- *      * 2x by number of threads
- *      * 2x by half of rows
+ * - 2^8 threads per team and 2^f teams
+ * - collapse(2)
+ * - 2x i-loop unrolling (stride of n/2 rows)
  */
 #pragma omp target data  device(0) \
-  map(to:n, halfn, b[0:n * n]) map(tofrom:a[0:n * n])
+  map(to:n, b[0:n * n]) map(tofrom:a[0:n * n])
 {
   clock_gettime(CLOCK_REALTIME, rt + 0);
-#pragma omp target teams device(0) num_teams(LTEAMSF) \
-  map(to:n, halfn, b[0:n * n]) map(tofrom:a[0:n * n]) \
-  default(none) shared(a, b, n, halfn)
-#pragma omp distribute parallel for num_threads(NTHRDS7) \
-  dist_schedule(static, NTHRDS7) collapse(3) \
-  default(none) shared(a, b, n, halfn)
+#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS8) \
+  map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \
+  default(none) shared(a, b, n)
+#pragma omp distribute parallel for num_threads(NTHRDS8) \
+  dist_schedule(static, NTHRDS8) collapse(2) \
+  default(none) shared(a, b, n)
 for (int j = 0; j < n; ++j) {
-for (int iblk = 0; iblk < n / NTHRDS9; ++iblk) {
-for (int i = 0; i < NTHRDS7; ++i) {
-  a[j * n + iblk * NTHRDS8 + i                   ] +=
-  b[j * n + iblk * NTHRDS8 + i                   ];
-  a[j * n + iblk * NTHRDS8 + i          + NTHRDS7] +=
-  b[j * n + iblk * NTHRDS8 + i          + NTHRDS7];
-  a[j * n + iblk * NTHRDS8 + i + halfn           ] +=
-  b[j * n + iblk * NTHRDS8 + i + halfn           ];
-  a[j * n + iblk * NTHRDS8 + i + halfn  + NTHRDS7] +=
-  b[j * n + iblk * NTHRDS8 + i + halfn  + NTHRDS7];
+for (int i = 0; i < (n >> 1); ++i) { /* 2x unrolling */
+  a[j * n + i           ] +=
+  b[j * n + i           ];
+  a[j * n + i + (n >> 1)] +=
+  b[j * n + i + (n >> 1)];
 } /* end i-loop */
-} /* end iblk-loop */
 } /* end j-loop */
   clock_gettime(CLOCK_REALTIME, rt + 1);
 }
@@ -213,42 +206,32 @@ for (int i = 0; i < NTHRDS7; ++i) {
     case 6:
 /*
  * - ji-loop
- * - 2^7 threads per team and 2^e teams
+ * - 2^8 threads per team and 2^14 teams
  * - collapse(3)
- * - 4x i-loop unrolling
- *      * 2x by number of threads
- *      * 2x by half of rows
- * - 2x j-loop unrolling
+ * - 2x j-loop unrolling (stride of 1   col )
+ * - 2x i-loop unrolling (stride of 2^8 rows)
  */
 #pragma omp target data  device(0) \
-  map(to:n, halfn, b[0:n * n]) map(tofrom:a[0:n * n])
+  map(to:n, b[0:n * n]) map(tofrom:a[0:n * n])
 {
   clock_gettime(CLOCK_REALTIME, rt + 0);
-#pragma omp target teams device(0) num_teams(LTEAMSE) \
-  map(to:n, halfn, b[0:n * n]) map(tofrom:a[0:n * n]) \
-  default(none) shared(a, b, n, halfn)
-#pragma omp distribute parallel for num_threads(NTHRDS7) \
-  dist_schedule(static, NTHRDS7) collapse(3) \
-  default(none) shared(a, b, n, halfn)
-for (int j = 0; j < halfn; ++j) {
-for (int iblk = 0; iblk < n / NTHRDS9; ++iblk) {
-for (int i = 0; i < NTHRDS7; ++i) {
-  a[ j          * n + iblk * NTHRDS8 + i                   ] +=
-  b[ j          * n + iblk * NTHRDS8 + i                   ];
-  a[ j          * n + iblk * NTHRDS8 + i          + NTHRDS7] +=
-  b[ j          * n + iblk * NTHRDS8 + i          + NTHRDS7];
-  a[ j          * n + iblk * NTHRDS8 + i + halfn           ] +=
-  b[ j          * n + iblk * NTHRDS8 + i + halfn           ];
-  a[ j          * n + iblk * NTHRDS8 + i + halfn  + NTHRDS7] +=
-  b[ j          * n + iblk * NTHRDS8 + i + halfn  + NTHRDS7];
-  a[(j + halfn) * n + iblk * NTHRDS8 + i                   ] +=
-  b[(j + halfn) * n + iblk * NTHRDS8 + i                   ];
-  a[(j + halfn) * n + iblk * NTHRDS8 + i          + NTHRDS7] +=
-  b[(j + halfn) * n + iblk * NTHRDS8 + i          + NTHRDS7];
-  a[(j + halfn) * n + iblk * NTHRDS8 + i + halfn           ] +=
-  b[(j + halfn) * n + iblk * NTHRDS8 + i + halfn           ];
-  a[(j + halfn) * n + iblk * NTHRDS8 + i + halfn  + NTHRDS7] +=
-  b[(j + halfn) * n + iblk * NTHRDS8 + i + halfn  + NTHRDS7];
+#pragma omp target teams device(0) num_teams(LTEAMSE) thread_limit(NTHRDS8) \
+  map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \
+  default(none) shared(a, b, n)
+#pragma omp distribute parallel for num_threads(NTHRDS8) \
+  dist_schedule(static, NTHRDS8) collapse(3) \
+  default(none) shared(a, b, n)
+for (int j = 0; j < n; j += 2) { /* 2x unrolling */
+for (int iblk = 0; iblk < n / BLKROW; ++iblk) {
+for (int i = 0; i < NTHRDS8; ++i) { /* 2x unrolling */
+  a[ j      * n + iblk * BLKROW + i          ] +=
+  b[ j      * n + iblk * BLKROW + i          ];
+  a[ j      * n + iblk * BLKROW + i + NTHRDS8] +=
+  b[ j      * n + iblk * BLKROW + i + NTHRDS8];
+  a[(j + 1) * n + iblk * BLKROW + i          ] +=
+  b[(j + 1) * n + iblk * BLKROW + i          ];
+  a[(j + 1) * n + iblk * BLKROW + i + NTHRDS8] +=
+  b[(j + 1) * n + iblk * BLKROW + i + NTHRDS8];
 } /* end i-loop */
 } /* end iblk-loop */
 } /* end j-loop */
diff --git a/09_matAdd/tests/matAdd_real_00.sh b/09_matAdd/tests/matAdd_real_00.sh
index a2037d7..9be3087 100755
--- a/09_matAdd/tests/matAdd_real_00.sh
+++ b/09_matAdd/tests/matAdd_real_00.sh
@@ -1,8 +1,9 @@
 #!/bin/bash
-#CCS -N matMul
+#CCS -N matAdd
 #CCS -t 600m
 #CCS -g pc2-mitarbeiter
 #CCS --res=rset=1:gtx1080=1,place=:excl
 
 echo "hallo from $(hostname)"
 ../src/matAdd $((2**12))
+../src/matAdd $((2**13))
diff --git a/10_matMul/src/matMulAB.c b/10_matMul/src/matMulAB.c
index 21cf6a8..22590e5 100644
--- a/10_matMul/src/matMulAB.c
+++ b/10_matMul/src/matMulAB.c
@@ -30,8 +30,7 @@
 #define LTEAMSF (1 << 0xF) /* 2^{15} */
 #define LTEAMSG (1 << 020) /* 2^{16} */
 
-#define NPITCH  (1024)
-#define BLKROW  (512)
+#define BLKROW  (512) /* 4x number of threads in each team */
 #define BLKDIM  (16)
 
 double wtcalc;
@@ -595,12 +594,12 @@ for (int i = 0; i < n / BLKDIM; ++i) {
   dist_schedule(static, NTHRDS7) collapse(2) \
   default(none) shared(a, b, c, n)
 for (int j = 0; j < n; ++j) {
-for (int i = 0; i < NPITCH; ++i) { /* 4x unrolling */
+for (int i = 0; i < (n >> 2); ++i) { /* 4x unrolling */
   float rc0, rc1, rc2, rc3;
-  rc0 = c[j * n + i             ];
-  rc1 = c[j * n + i + NPITCH    ];
-  rc2 = c[j * n + i + NPITCH * 2];
-  rc3 = c[j * n + i + NPITCH * 3];
+  rc0 = c[j * n + i               ];
+  rc1 = c[j * n + i + (n >> 2)    ];
+  rc2 = c[j * n + i + (n >> 2) * 2];
+  rc3 = c[j * n + i + (n >> 2) * 3];
   for (int k = 0; k < n; k += 4) { /* 4x unrolling */
     /* register for b: 4x k-loop */
     float rb0, rb1, rb2, rb3;
@@ -608,27 +607,27 @@ for (int i = 0; i < NPITCH; ++i) { /* 4x unrolling */
     rb1  = b[j * n + k + 1];
     rb2  = b[j * n + k + 2];
     rb3  = b[j * n + k + 3];
-    rc0 += a[ k      * n + i             ] * rb0;
-    rc0 += a[(k + 1) * n + i             ] * rb1;
-    rc0 += a[(k + 2) * n + i             ] * rb2;
-    rc0 += a[(k + 3) * n + i             ] * rb3;
-    rc1 += a[ k      * n + i + NPITCH    ] * rb0;
-    rc1 += a[(k + 1) * n + i + NPITCH    ] * rb1;
-    rc1 += a[(k + 2) * n + i + NPITCH    ] * rb2;
-    rc1 += a[(k + 3) * n + i + NPITCH    ] * rb3;
-    rc2 += a[ k      * n + i + NPITCH * 2] * rb0;
-    rc2 += a[(k + 1) * n + i + NPITCH * 2] * rb1;
-    rc2 += a[(k + 2) * n + i + NPITCH * 2] * rb2;
-    rc2 += a[(k + 3) * n + i + NPITCH * 2] * rb3;
-    rc3 += a[ k      * n + i + NPITCH * 3] * rb0;
-    rc3 += a[(k + 1) * n + i + NPITCH * 3] * rb1;
-    rc3 += a[(k + 2) * n + i + NPITCH * 3] * rb2;
-    rc3 += a[(k + 3) * n + i + NPITCH * 3] * rb3;
+    rc0 += a[ k      * n + i               ] * rb0;
+    rc0 += a[(k + 1) * n + i               ] * rb1;
+    rc0 += a[(k + 2) * n + i               ] * rb2;
+    rc0 += a[(k + 3) * n + i               ] * rb3;
+    rc1 += a[ k      * n + i + (n >> 2)    ] * rb0;
+    rc1 += a[(k + 1) * n + i + (n >> 2)    ] * rb1;
+    rc1 += a[(k + 2) * n + i + (n >> 2)    ] * rb2;
+    rc1 += a[(k + 3) * n + i + (n >> 2)    ] * rb3;
+    rc2 += a[ k      * n + i + (n >> 2) * 2] * rb0;
+    rc2 += a[(k + 1) * n + i + (n >> 2) * 2] * rb1;
+    rc2 += a[(k + 2) * n + i + (n >> 2) * 2] * rb2;
+    rc2 += a[(k + 3) * n + i + (n >> 2) * 2] * rb3;
+    rc3 += a[ k      * n + i + (n >> 2) * 3] * rb0;
+    rc3 += a[(k + 1) * n + i + (n >> 2) * 3] * rb1;
+    rc3 += a[(k + 2) * n + i + (n >> 2) * 3] * rb2;
+    rc3 += a[(k + 3) * n + i + (n >> 2) * 3] * rb3;
   }
-  c[j * n + i             ] = rc0;
-  c[j * n + i + NPITCH    ] = rc1;
-  c[j * n + i + NPITCH * 2] = rc2;
-  c[j * n + i + NPITCH * 3] = rc3;
+  c[j * n + i               ] = rc0;
+  c[j * n + i + (n >> 2)    ] = rc1;
+  c[j * n + i + (n >> 2) * 2] = rc2;
+  c[j * n + i + (n >> 2) * 3] = rc3;
 } /* end i-loop */
 } /* end j-loop */
   clock_gettime(CLOCK_REALTIME, rt + 1);