From 1b2b8c3900ec4c4d72396969911e5778dbefcd07 Mon Sep 17 00:00:00 2001
From: xwuupb <xinwu@mail.uni-paderborn.de>
Date: Mon, 6 Apr 2020 13:21:31 +0200
Subject: [PATCH] 05_saxpy: use n = 2^26, add manually scheduled unrolled kernel

---
 05_saxpy/README.md                          | 18 ++++---
 05_saxpy/docs/UserManual.md                 | 18 ++++---
 05_saxpy/src/asaxpy.c                       | 56 +++++++++++++++++----
 05_saxpy/src/saxpy.c                        | 19 +++----
 05_saxpy/tests/saxpy_real_00.sh.5383621.out | 12 -----
 05_saxpy/tests/saxpy_real_00.sh.5385100.out | 13 +++++
 6 files changed, 88 insertions(+), 48 deletions(-)
 delete mode 100644 05_saxpy/tests/saxpy_real_00.sh.5383621.out
 create mode 100644 05_saxpy/tests/saxpy_real_00.sh.5385100.out
diff --git a/05_saxpy/README.md b/05_saxpy/README.md
index f4d543e..eec437e 100644
--- a/05_saxpy/README.md
+++ b/05_saxpy/README.md
@@ -17,7 +17,7 @@ where:
 
 * `a` is a scalar.
 * `x` and `y` are single-precision vectors each with n elements.
-* For testing n is assumed to be $2^{22}$.
+* For testing, n is assumed to be $2^{26}$.
 * The following table only summarizes the most important points. For more
   details on the ial-th implementation see comments in `hsaxpy.c` (on host)
   and `asaxpy.c` (on accelerator).
@@ -33,13 +33,15 @@ where:
 
 | ial |  Remarks                                                               |
 |:---:|------------------------------------------------------------------------|
-|  0  | <<<             1,   1>>>, TOO SLOW! not tested                        |
-|  1  | <<<             1, 128>>>                                              |
-|  2  | <<<           128,   1>>>                                              |
-|  3  | <<<           128, 128>>>                                              |
-|  4  | <<<n /        128, 128>>>                                              |
-|  5  | <<<n / (128 * 16), 128>>>, 16x loop unrolling                          |
-|  6  | cublasSaxpy in CUBLAS                                                  |
+|  0  | <<<2^0 , 2^0 >>>, TOO SLOW! not tested                                 |
+|  1  | <<<2^0 , 2^7 >>>, auto   scheduling                                    |
+|  2  | <<<2^7 , 2^0 >>>, auto   scheduling                                    |
+|  3  | <<<2^7 , 2^7 >>>, auto   scheduling                                    |
+|  4  | <<<2^16, 2^10>>>, manual scheduling                                    |
+|  5  | <<<2^15, 2^7 >>>, manual scheduling, 16x loop unrolling                |
+|     | (2^15*2^7*16==2^26)                                                    |
+|  6  | <<<2^12, 2^7 >>>, auto   scheduling, 16x loop unrolling                |
+|  7  | cublasSaxpy in CUBLAS                                                  |
 
 # Build
 
diff --git a/05_saxpy/docs/UserManual.md b/05_saxpy/docs/UserManual.md
index c989f62..b9a68fc 100644
--- a/05_saxpy/docs/UserManual.md
+++ b/05_saxpy/docs/UserManual.md
@@ -17,7 +17,7 @@ where:
 
 * `a` is a scalar.
 * `x` and `y` are single-precision vectors each with n elements.
-* For testing n is assumed to be $2^{22}$.
+* For testing, n is assumed to be $2^{26}$.
 * The following table only summarizes the most important points. For more
   details on the ial-th implementation see comments in `hsaxpy.c` (on host)
   and `asaxpy.c` (on accelerator).
@@ -33,13 +33,15 @@ where:
 
 | ial |  Remarks                                                               |
 |:---:|------------------------------------------------------------------------|
-|  0  | <<<             1,   1>>>, TOO SLOW! not tested                        |
-|  1  | <<<             1, 128>>>                                              |
-|  2  | <<<           128,   1>>>                                              |
-|  3  | <<<           128, 128>>>                                              |
-|  4  | <<<n /        128, 128>>>                                              |
-|  5  | <<<n / (128 * 16), 128>>>, 16x loop unrolling                          |
-|  6  | cublasSaxpy in CUBLAS                                                  |
+|  0  | <<<2^0 , 2^0 >>>, TOO SLOW! not tested                                 |
+|  1  | <<<2^0 , 2^7 >>>, auto   scheduling                                    |
+|  2  | <<<2^7 , 2^0 >>>, auto   scheduling                                    |
+|  3  | <<<2^7 , 2^7 >>>, auto   scheduling                                    |
+|  4  | <<<2^16, 2^10>>>, manual scheduling                                    |
+|  5  | <<<2^15, 2^7 >>>, manual scheduling, 16x loop unrolling                |
+|     | (2^15*2^7*16==2^26)                                                    |
+|  6  | <<<2^12, 2^7 >>>, auto   scheduling, 16x loop unrolling                |
+|  7  | cublasSaxpy in CUBLAS                                                  |
 
 # Usage
 
diff --git a/05_saxpy/src/asaxpy.c b/05_saxpy/src/asaxpy.c
index 9c53b80..0a05ccf 100644
--- a/05_saxpy/src/asaxpy.c
+++ b/05_saxpy/src/asaxpy.c
@@ -48,7 +48,7 @@ void asaxpy(const int n,
   switch (ial) {
     case 0:
 /*
- * - <<<1, 1>>>
+ * - <<<2^0 , 2^0 >>>, TOO SLOW! not tested
  */
 #pragma omp target data  device(0) \
   map(to:a, n, x[0:n]) map(tofrom:y[0:n])
@@ -68,7 +68,7 @@ for (int i = 0; i < n; ++i) {
       break;
     case 1:
 /*
- * - <<<1, 128>>>
+ * - <<<2^0 , 2^7 >>>, auto   scheduling
  */
 #pragma omp target data  device(0) \
   map(to:a, n, x[0:n]) map(tofrom:y[0:n])
@@ -88,7 +88,7 @@ for (int i = 0; i < n; ++i) {
       break;
     case 2:
 /*
- * - <<<128, 1>>>
+ * - <<<2^7 , 2^0 >>>, auto   scheduling
  */
 #pragma omp target data  device(0) \
   map(to:a, n, x[0:n]) map(tofrom:y[0:n])
@@ -108,7 +108,7 @@ for (int i = 0; i < n; ++i) {
       break;
     case 3:
 /*
- * - <<<128, 128>>>
+ * - <<<2^7 , 2^7 >>>, auto   scheduling
  */
 #pragma omp target data  device(0) \
   map(to:a, n, x[0:n]) map(tofrom:y[0:n])
@@ -128,17 +128,17 @@ for (int i = 0; i < n; ++i) {
       break;
     case 4:
 /*
- * - <<<n / 128, 128>>>
+ * - <<<2^16, 2^10>>>, manual scheduling
  */
 #pragma omp target data  device(0) \
   map(to:a, n, x[0:n]) map(tofrom:y[0:n])
 {
   clock_gettime(CLOCK_REALTIME, rt + 0);
-#pragma omp target teams device(0) num_teams((1 << 15)) \
+#pragma omp target teams device(0) num_teams(65536) \
   map(to:a, n, x[0:n]) map(tofrom:y[0:n]) \
   default(none) shared(a, n, x, y)
-#pragma omp distribute parallel for num_threads(128) \
-  dist_schedule(static, 128) \
+#pragma omp distribute parallel for num_threads(1024) \
+  dist_schedule(static, 1024) \
   default(none) shared(a, n, x, y)
 for (int i = 0; i < n; ++i) {
   y[i] = a * x[i] + y[i];
@@ -148,14 +148,48 @@ for (int i = 0; i < n; ++i) {
       break;
     case 5:
 /*
- * - <<<n / (128 * 16), 128>>>
- * - 16x loop-unrolling
+ * - <<<2^15, 2^7 >>>, manual scheduling, 16x loop unrolling (2^15*2^7*16==2^26)
+ */
+#pragma omp target data  device(0) \
+  map(to:a, m, x[0:n]) map(tofrom:y[0:n])
+{
+  clock_gettime(CLOCK_REALTIME, rt + 0);
+#pragma omp target teams device(0) num_teams(32768) \
+  map(to:a, m, x[0:n]) map(tofrom:y[0:n]) \
+  default(none) shared(a, m, x, y)
+#pragma omp distribute parallel for num_threads(128) \
+  dist_schedule(static, 128) \
+  default(none) shared(a, m, x, y)
+for (int i = 0; i < m; ++i) {
+  y[i          ] = a * x[i          ] + y[i          ];
+  y[i +       m] = a * x[i +       m] + y[i +       m];
+  y[i + 0x2 * m] = a * x[i + 0x2 * m] + y[i + 0x2 * m];
+  y[i + 0x3 * m] = a * x[i + 0x3 * m] + y[i + 0x3 * m];
+  y[i + 0x4 * m] = a * x[i + 0x4 * m] + y[i + 0x4 * m];
+  y[i + 0x5 * m] = a * x[i + 0x5 * m] + y[i + 0x5 * m];
+  y[i + 0x6 * m] = a * x[i + 0x6 * m] + y[i + 0x6 * m];
+  y[i + 0x7 * m] = a * x[i + 0x7 * m] + y[i + 0x7 * m];
+  y[i + 0x8 * m] = a * x[i + 0x8 * m] + y[i + 0x8 * m];
+  y[i + 0x9 * m] = a * x[i + 0x9 * m] + y[i + 0x9 * m];
+  y[i + 0xa * m] = a * x[i + 0xa * m] + y[i + 0xa * m];
+  y[i + 0xb * m] = a * x[i + 0xb * m] + y[i + 0xb * m];
+  y[i + 0xc * m] = a * x[i + 0xc * m] + y[i + 0xc * m];
+  y[i + 0xd * m] = a * x[i + 0xd * m] + y[i + 0xd * m];
+  y[i + 0xe * m] = a * x[i + 0xe * m] + y[i + 0xe * m];
+  y[i + 0xf * m] = a * x[i + 0xf * m] + y[i + 0xf * m];
+}
+  clock_gettime(CLOCK_REALTIME, rt + 1);
+}
+      break;
+    case 6:
+/*
+ * - <<<2^12, 2^7 >>>, auto   scheduling, 16x loop unrolling
  */
 #pragma omp target data  device(0) \
   map(to:a, m, x[0:n]) map(tofrom:y[0:n])
 {
   clock_gettime(CLOCK_REALTIME, rt + 0);
-#pragma omp target teams device(0) num_teams(2048) \
+#pragma omp target teams device(0) num_teams(4096) \
   map(to:a, m, x[0:n]) map(tofrom:y[0:n]) \
   default(none) shared(a, m, x, y)
 #pragma omp distribute parallel for num_threads(128) \
diff --git a/05_saxpy/src/saxpy.c b/05_saxpy/src/saxpy.c
index 2ff3d25..c184fc6 100644
--- a/05_saxpy/src/saxpy.c
+++ b/05_saxpy/src/saxpy.c
@@ -34,7 +34,7 @@
 #include "check1ns.h"
 #include "wtcalc.h"
 
-#define TWO22 (1 << 22)
+#define TWO26 (1 << 26)
 #define NLUP  (32)
 
 /**
@@ -69,7 +69,7 @@ int main(int argc, char *argv[])
   /*
    * preparation
    */
-  n      = TWO22;
+  n      = TWO26;
   nbytes = sizeof(float) * n;
   iret   = 0;
   if (NULL == (x     = (float *) mkl_malloc(nbytes, (16 * 256)))) iret = -1;
@@ -131,18 +131,19 @@ int main(int argc, char *argv[])
   /*
    * saxpy on accl
    */
-  for (ial = 1; ial < 7; ++ial) {
+  for (ial = 1; ial < 8; ++ial) {
     /*
      * See asaxpy.c for details:
      *
      * ial:
      *
-     * 0: <<<             1,   1>>>, TOO SLOW! not tested
-     * 1: <<<             1, 128>>>
-     * 2: <<<           128,   1>>>
-     * 3: <<<           128, 128>>>
-     * 4: <<<n /        128, 128>>>
-     * 5: <<<n / (128 * 16), 128>>>, 16x loop unrolling
+     * 0: <<<2^0 , 2^0 >>>, TOO SLOW! not tested
+     * 1: <<<2^0 , 2^7 >>>, auto   scheduling
+     * 2: <<<2^7 , 2^0 >>>, auto   scheduling
+     * 3: <<<2^7 , 2^7 >>>, auto   scheduling
+     * 4: <<<2^16, 2^10>>>, manual scheduling
+     * 5: <<<2^15, 2^7 >>>, manual scheduling, 16x loop unrolling (2^15*2^7*16==2^26)
+     * 6: <<<2^12, 2^7 >>>, auto   scheduling, 16x loop unrolling
      * otherwise: cublasSaxpy in CUBLAS
      */
     memcpy(yaccl, y, nbytes);
diff --git a/05_saxpy/tests/saxpy_real_00.sh.5383621.out b/05_saxpy/tests/saxpy_real_00.sh.5383621.out
deleted file mode 100644
index 2904da0..0000000
--- a/05_saxpy/tests/saxpy_real_00.sh.5383621.out
+++ /dev/null
@@ -1,12 +0,0 @@
-hallo from gpu011
-The system supports 1 ns time resolution
-total size of x and y is      32.0 MB
-tests are averaged over 32 loops
-saxpy on host (0) :   43431.2 MB/s   43438.4 MB/s maxabserr =       0.0
-saxpy on host (1) :  446622.5 MB/s  447142.8 MB/s maxabserr =       0.0
-saxpy on accl (1) :      75.2 MB/s      78.6 MB/s maxabserr =       0.0
-saxpy on accl (2) :      75.7 MB/s      79.0 MB/s maxabserr =       0.0
-saxpy on accl (3) :    1945.7 MB/s    6071.8 MB/s maxabserr =       0.0
-saxpy on accl (4) :    1689.3 MB/s    4074.8 MB/s maxabserr =       0.0
-saxpy on accl (5) :    2644.2 MB/s   30438.5 MB/s maxabserr =       0.0
-saxpy on accl (6) :    2762.5 MB/s  127182.3 MB/s maxabserr =       0.0
diff --git a/05_saxpy/tests/saxpy_real_00.sh.5385100.out b/05_saxpy/tests/saxpy_real_00.sh.5385100.out
new file mode 100644
index 0000000..5d21ff8
--- /dev/null
+++ b/05_saxpy/tests/saxpy_real_00.sh.5385100.out
@@ -0,0 +1,13 @@
+hallo from gpu026
+The system supports 1 ns time resolution
+total size of x and y is     512.0 MB
+tests are averaged over 32 loops
+saxpy on host (0) :   41473.2 MB/s   41474.3 MB/s maxabserr =       0.0
+saxpy on host (1) :   66632.2 MB/s   66635.3 MB/s maxabserr =       0.0
+saxpy on accl (1) :      76.0 MB/s      78.7 MB/s maxabserr =       0.0
+saxpy on accl (2) :      76.3 MB/s      79.0 MB/s maxabserr =       0.0
+saxpy on accl (3) :    1635.3 MB/s    6457.5 MB/s maxabserr =       0.0
+saxpy on accl (4) :    1417.9 MB/s    3956.9 MB/s maxabserr =       0.0
+saxpy on accl (5) :    1954.3 MB/s   45380.7 MB/s maxabserr =       0.0
+saxpy on accl (6) :    1938.4 MB/s   67113.8 MB/s maxabserr =       0.0
+saxpy on accl (7) :    1984.3 MB/s  282336.0 MB/s maxabserr =       0.0