From 652bb41956884bca93edec4c78058ad4ce288f2f Mon Sep 17 00:00:00 2001 From: xwuupb Date: Sun, 5 Apr 2020 21:35:30 +0200 Subject: [PATCH] finish 05_saxpy --- 05_saxpy/README.md | 43 ++-- 05_saxpy/configure.ac | 44 +++- 05_saxpy/docs/UserManual.md | 41 ++-- 05_saxpy/src/Makefile.am | 4 +- 05_saxpy/src/asaxpy.c | 228 +++++++++++++++++--- 05_saxpy/src/asaxpy.h | 27 +-- 05_saxpy/src/hsaxpy.c | 45 ++-- 05_saxpy/src/hsaxpy.h | 26 +-- 05_saxpy/src/saxpy.c | 176 +++++++++------ 05_saxpy/src/wtcalc.c | 21 ++ 05_saxpy/src/wtcalc.h | 30 +++ 05_saxpy/tests/saxpy_real_00.sh | 4 +- 05_saxpy/tests/saxpy_real_00.sh.5383621.out | 12 ++ 13 files changed, 541 insertions(+), 160 deletions(-) create mode 100644 05_saxpy/src/wtcalc.c create mode 100644 05_saxpy/src/wtcalc.h create mode 100644 05_saxpy/tests/saxpy_real_00.sh.5383621.out diff --git a/05_saxpy/README.md b/05_saxpy/README.md index ac59e67..f4d543e 100644 --- a/05_saxpy/README.md +++ b/05_saxpy/README.md @@ -1,35 +1,50 @@ --- title: saxpy author: Xin Wu (PC²) -date: 10.01.2020 +date: 05.04.2020 --- # Introduction -`saxpy` performs the `axpy` operation on host as well as accelerator and then -compares the FLOPS performance. +`saxpy` performs the `saxpy` operation on host as well as accelerator. +The performance (in MB/s) for different implementations is also compared. -The `axpy` operation is defined as: +The `saxpy` operation is defined as: $$ y := a * x + y $$ where: * `a` is a scalar. -* `x` and `y` are vectors each with n elements. - -The initial value of `a` and elements of `x[]` and `y[]` are specially designed, -so that the floating-point calculations on host and accelerator can be compared -_exactly_. - -Please note that only _one GPU thread_ is used for the `axpy` calculation on -accelerator in this version. This can be verified by uncomment the `CFLAGS` line -in `configure.ac`. +* `x` and `y` are single-precision vectors each with n elements. +* For testing, n is assumed to be $2^{22}$. +* The following tables only summarize the most important points. For more + details on the `ial`-th implementation, see the comments in `hsaxpy.c` (on host) + and `asaxpy.c` (on accelerator). + + - on host + +| ial | Remarks | +|:---:|------------------------------------------------------------------------| +| 0 | naive implementation | +| 1 | saxpy in MKL | + + - on accl + +| ial | Remarks | +|:---:|------------------------------------------------------------------------| +| 0 | <<< 1, 1>>>, TOO SLOW! not tested | +| 1 | <<< 1, 128>>> | +| 2 | <<< 128, 1>>> | +| 3 | <<< 128, 128>>> | +| 4 | <<<2^15, 128>>> | +| 5 | <<<2048, 128>>>, 16x loop unrolling | +| 6 | cublasSaxpy in CUBLAS | # Build ```bash -autoreconf -i; ./configure; make; make check; sudo make install; +autoreconf -i; ./configure; make; make check; ``` `make check` has been tested on OCuLUS (with OpenCCS) and P53s (without OpenCCS).
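As a quick illustration of the `<<<teams, threads>>>` notation used in the tables above, the following minimal sketch shows how such a launch configuration is expressed with OpenMP target offloading. It is written in the spirit of the `ial = 4` variant in `asaxpy.c` (2^15 teams of 128 threads, one element per thread), but the function and driver below are illustrative and not part of this patch.

```c
/*
 * Minimal sketch (not from this patch): saxpy offloaded with OpenMP,
 * launched roughly as <<<2^15, 128>>>, i.e. (1 << 15) teams of 128 threads.
 * Falls back to host execution if OpenMP offloading is unavailable.
 */
#include <stdio.h>
#include <stdlib.h>

static void saxpy_sketch(int n, float a, const float *x, float *y)
{
#pragma omp target teams distribute parallel for \
            num_teams(1 << 15) num_threads(128) \
            map(to: x[0:n]) map(tofrom: y[0:n])
  for (int i = 0; i < n; ++i) {
    y[i] = a * x[i] + y[i];
  }
}

int main(void)
{
  int    n = 1 << 22;                  /* same vector length as in the tests */
  float  a = 2.0f;
  float *x = malloc(sizeof(*x) * n);
  float *y = malloc(sizeof(*y) * n);

  if (x == NULL || y == NULL) return EXIT_FAILURE;
  for (int i = 0; i < n; ++i) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }
  saxpy_sketch(n, a, x, y);
  printf("y[0] = %.1f (expected 4.0)\n", y[0]);
  free(x);
  free(y);
  return EXIT_SUCCESS;
}
```

With gcc this would be compiled along the lines of `gcc -Wall -fopenmp -foffload=nvptx-none saxpy_sketch.c`, matching the flags that `configure.ac` sets up in this patch; the exact offloading flags are system-dependent.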
diff --git a/05_saxpy/configure.ac b/05_saxpy/configure.ac index aae1231..2075c61 100644 --- a/05_saxpy/configure.ac +++ b/05_saxpy/configure.ac @@ -18,18 +18,31 @@ if test -z "${CUDALIB}"; then fi ##############################################################################80 # +# check MKL +# +##############################################################################80 +AC_ARG_VAR([MKLINC], [The PATH wherein mkl.h can be found]) +if test -z "${MKLINC}"; then + AC_SUBST([MKLINC], [${MKLROOT}/include]) +fi +AC_ARG_VAR([MKLLIB], [The PATH wherein MKL library can be found]) +if test -z "${MKLLIB}"; then + AC_SUBST([MKLLIB], [${MKLROOT}/lib/intel64]) +fi +##############################################################################80 +# # check C compiler # ##############################################################################80 +CFLAGS+="-I${CUDAINC} -I${MKLINC}" +LDFLAGS+="-L${CUDALIB} -L${MKLLIB}" +# AC_PROG_CC([clang gcc]) -##CFLAGS+=" -DDEBUG" AS_IF([test "${CC}" = gcc], - [CFLAGS="-Wall -fopenmp -foffload=nvptx-none -I${CUDAINC} $CFLAGS" - LDFLAGS="-L${CUDALIB} $LDFLAGS"]) + [CFLAGS="-Wall -fopenmp -foffload=nvptx-none $CFLAGS"]) AS_IF([test "${CC}" = clang], - [CFLAGS="-Wall -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -I${CUDAINC} \ - -Xopenmp-target -march=sm_61 $CFLAGS" - LDFLAGS="-L${CUDALIB} $LDFLAGS"]) + [CFLAGS="-Wall -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda \ + -Xopenmp-target -march=sm_61 $CFLAGS"]) ##############################################################################80 # # check archiver @@ -44,6 +57,10 @@ AM_PROG_AR ##############################################################################80 AC_CHECK_HEADER([cuda_runtime.h], [], [AC_MSG_ERROR([cuda_runtime.h required, but not found])], []) +AC_CHECK_HEADER([cublas_v2.h], [], + [AC_MSG_ERROR([cublas_v2.h required, but not found])], []) +AC_CHECK_HEADER([mkl.h], [], + [AC_MSG_ERROR([mkl.h required, but not found])], []) ##############################################################################80 # # check libraries @@ -51,6 +68,21 @@ AC_CHECK_HEADER([cuda_runtime.h], [], ##############################################################################80 AC_CHECK_LIB([cudart], [cudaSetDevice], [], [AC_MSG_ERROR([libcudart required, but not found])], []) +AC_CHECK_LIB([cublas], [cublasSaxpy], [], + [AC_MSG_ERROR([libcublas required, but not found])], []) +AC_CHECK_LIB([pthread], [pthread_create], [], + [AC_MSG_ERROR([libpthread required, but not found])], []) +AC_CHECK_LIB([iomp5], [omp_set_num_threads], [], + [AC_MSG_ERROR([libiomp5 required, but not found])], []) +AC_CHECK_LIB([mkl_core], [mkl_blas_xsaxpy], [], + [AC_MSG_ERROR([libmkl_core required, but not found])], + [-lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lm]) +AC_CHECK_LIB([mkl_intel_thread], [mkl_blas_saxpy], [], + [AC_MSG_ERROR([libmkl_intel_thread required, but not found])], + [-lmkl_intel_lp64 -lmkl_core -liomp5 -lm]) +AC_CHECK_LIB([mkl_intel_lp64], [saxpy], [], + [AC_MSG_ERROR([libmkl_intel_lp64 required, but not found])], + [-lmkl_intel_thread -lmkl_core -liomp5 -lm]) ##############################################################################80 # # check Doxygen diff --git a/05_saxpy/docs/UserManual.md b/05_saxpy/docs/UserManual.md index 9056e10..c989f62 100644 --- a/05_saxpy/docs/UserManual.md +++ b/05_saxpy/docs/UserManual.md @@ -1,30 +1,45 @@ --- title: saxpy author: Xin Wu (PC²) -date: 10.01.2020 +date: 05.04.2020 --- # Introduction -`saxpy` performs the `axpy` operation on host as well as accelerator and then -compares 
the FLOPS performance. +`saxpy` performs the `saxpy` operation on host as well as accelerator. +The performance (in MB/s) for different implementations is also compared. -The `axpy` operation is defined as: +The `saxpy` operation is defined as: $$ y := a * x + y $$ where: * `a` is a scalar. -* `x` and `y` are vectors each with n elements. - -The initial value of `a` and elements of `x[]` and `y[]` are specially designed, -so that the floating-point calculations on host and accelerator can be compared -_exactly_. - -Please note that only _one GPU thread_ is used for the `axpy` calculation on -accelerator in this version. This can be verified by uncomment the `CFLAGS` line -in `configure.ac`. +* `x` and `y` are single-precision vectors each with n elements. +* For testing, n is assumed to be $2^{22}$. +* The following tables only summarize the most important points. For more + details on the `ial`-th implementation, see the comments in `hsaxpy.c` (on host) + and `asaxpy.c` (on accelerator). + + - on host + +| ial | Remarks | +|:---:|------------------------------------------------------------------------| +| 0 | naive implementation | +| 1 | saxpy in MKL | + + - on accl + +| ial | Remarks | +|:---:|------------------------------------------------------------------------| +| 0 | <<< 1, 1>>>, TOO SLOW! not tested | +| 1 | <<< 1, 128>>> | +| 2 | <<< 128, 1>>> | +| 3 | <<< 128, 128>>> | +| 4 | <<<2^15, 128>>> | +| 5 | <<<2048, 128>>>, 16x loop unrolling | +| 6 | cublasSaxpy in CUBLAS | # Usage diff --git a/05_saxpy/src/Makefile.am b/05_saxpy/src/Makefile.am index 2ef9113..d3c2053 100644 --- a/05_saxpy/src/Makefile.am +++ b/05_saxpy/src/Makefile.am @@ -5,4 +5,6 @@ saxpy_SOURCES = saxpy.c \ hsaxpy.h \ hsaxpy.c \ asaxpy.h \ - asaxpy.c + asaxpy.c \ + wtcalc.h \ + wtcalc.c diff --git a/05_saxpy/src/asaxpy.c b/05_saxpy/src/asaxpy.c index 3d630e9..9c53b80 100644 --- a/05_saxpy/src/asaxpy.c +++ b/05_saxpy/src/asaxpy.c @@ -1,9 +1,8 @@ /** * @file asaxpy.c - * @brief Function definition for performing the \c axpy operation on - * accelerator. + * @brief Function definition for performing the \c saxpy operation on accelerator. * - * This source file contains function definition for the \c axpy operation, + * This source file contains function definition for the \c saxpy operation, * which is defined as: * * y := a * x + y * * where: * * - a is a scalar. - * - x and y are vectors each with n elements. + * - x and y are single-precision vectors each with n elements.
* * @author Xin Wu (PC²) - * @date 09.01.2020 + * @date 05.04.2020 * @copyright CC BY-SA 2.0 */ @@ -22,40 +21,217 @@ extern "C" { #endif -#ifdef DEBUG #include -#endif +#include +#include #ifdef _OPENMP #include #endif +#include +#include "cublas_v2.h" +#include "wtcalc.h" #include "asaxpy.h" void asaxpy(const int n, const float a, const float *x, - float *y) + float *y, + const int ial) { - int i = 0; -#ifdef DEBUG - int nth; -#endif + cublasHandle_t handle; + float alfa = a, + *x_dev = NULL, + *y_dev = NULL; + struct timespec rt[2]; + int m = (n >> 4); -#ifdef DEBUG -#pragma omp target map(n, a, x[0:n], y[0:n], nth) private(i) -#else -#pragma omp target map(n, a, x[0:n], y[0:n] ) private(i) -#endif + switch (ial) { + case 0: +/* + * - <<<1, 1>>> + */ +#pragma omp target data device(0) \ + map(to:a, n, x[0:n]) map(tofrom:y[0:n]) { -#ifdef DEBUG - nth = omp_get_num_threads(); -#endif - for (i = 0; i < n; i++) { - y[i] = a * x[i] + y[i]; - } + clock_gettime(CLOCK_REALTIME, rt + 0); +#pragma omp target teams device(0) num_teams(1) \ + map(to:a, n, x[0:n]) map(tofrom:y[0:n]) \ + default(none) shared(a, n, x, y) +#pragma omp distribute parallel for num_threads(1) \ + dist_schedule(static, 1) \ + default(none) shared(a, n, x, y) +for (int i = 0; i < n; ++i) { + y[i] = a * x[i] + y[i]; } -#ifdef DEBUG -printf("DEBUG: number of threads on accelerator: %d\n", nth); -#endif + clock_gettime(CLOCK_REALTIME, rt + 1); +} + break; + case 1: +/* + * - <<<1, 128>>> + */ +#pragma omp target data device(0) \ + map(to:a, n, x[0:n]) map(tofrom:y[0:n]) +{ + clock_gettime(CLOCK_REALTIME, rt + 0); +#pragma omp target teams device(0) num_teams(1) \ + map(to:a, n, x[0:n]) map(tofrom:y[0:n]) \ + default(none) shared(a, n, x, y) +#pragma omp distribute parallel for num_threads(128) \ + dist_schedule(static, 128) \ + default(none) shared(a, n, x, y) +for (int i = 0; i < n; ++i) { + y[i] = a * x[i] + y[i]; +} + clock_gettime(CLOCK_REALTIME, rt + 1); +} + break; + case 2: +/* + * - <<<128, 1>>> + */ +#pragma omp target data device(0) \ + map(to:a, n, x[0:n]) map(tofrom:y[0:n]) +{ + clock_gettime(CLOCK_REALTIME, rt + 0); +#pragma omp target teams device(0) num_teams(128) \ + map(to:a, n, x[0:n]) map(tofrom:y[0:n]) \ + default(none) shared(a, n, x, y) +#pragma omp distribute parallel for num_threads(1) \ + dist_schedule(static, 1) \ + default(none) shared(a, n, x, y) +for (int i = 0; i < n; ++i) { + y[i] = a * x[i] + y[i]; +} + clock_gettime(CLOCK_REALTIME, rt + 1); +} + break; + case 3: +/* + * - <<<128, 128>>> + */ +#pragma omp target data device(0) \ + map(to:a, n, x[0:n]) map(tofrom:y[0:n]) +{ + clock_gettime(CLOCK_REALTIME, rt + 0); +#pragma omp target teams device(0) num_teams(128) \ + map(to:a, n, x[0:n]) map(tofrom:y[0:n]) \ + default(none) shared(a, n, x, y) +#pragma omp distribute parallel for num_threads(128) \ + dist_schedule(static, 128) \ + default(none) shared(a, n, x, y) +for (int i = 0; i < n; ++i) { + y[i] = a * x[i] + y[i]; +} + clock_gettime(CLOCK_REALTIME, rt + 1); +} + break; + case 4: +/* + * - <<>> + */ +#pragma omp target data device(0) \ + map(to:a, n, x[0:n]) map(tofrom:y[0:n]) +{ + clock_gettime(CLOCK_REALTIME, rt + 0); +#pragma omp target teams device(0) num_teams((1 << 15)) \ + map(to:a, n, x[0:n]) map(tofrom:y[0:n]) \ + default(none) shared(a, n, x, y) +#pragma omp distribute parallel for num_threads(128) \ + dist_schedule(static, 128) \ + default(none) shared(a, n, x, y) +for (int i = 0; i < n; ++i) { + y[i] = a * x[i] + y[i]; +} + clock_gettime(CLOCK_REALTIME, rt + 1); +} + break; + case 
5: +/* + * - <<>> + * - 16x loop-unrolling + */ +#pragma omp target data device(0) \ + map(to:a, m, x[0:n]) map(tofrom:y[0:n]) +{ + clock_gettime(CLOCK_REALTIME, rt + 0); +#pragma omp target teams device(0) num_teams(2048) \ + map(to:a, m, x[0:n]) map(tofrom:y[0:n]) \ + default(none) shared(a, m, x, y) +#pragma omp distribute parallel for num_threads(128) \ + dist_schedule(static, 128) \ + default(none) shared(a, m, x, y) +for (int i = 0; i < m; ++i) { + y[i ] = a * x[i ] + y[i ]; + y[i + m] = a * x[i + m] + y[i + m]; + y[i + 0x2 * m] = a * x[i + 0x2 * m] + y[i + 0x2 * m]; + y[i + 0x3 * m] = a * x[i + 0x3 * m] + y[i + 0x3 * m]; + y[i + 0x4 * m] = a * x[i + 0x4 * m] + y[i + 0x4 * m]; + y[i + 0x5 * m] = a * x[i + 0x5 * m] + y[i + 0x5 * m]; + y[i + 0x6 * m] = a * x[i + 0x6 * m] + y[i + 0x6 * m]; + y[i + 0x7 * m] = a * x[i + 0x7 * m] + y[i + 0x7 * m]; + y[i + 0x8 * m] = a * x[i + 0x8 * m] + y[i + 0x8 * m]; + y[i + 0x9 * m] = a * x[i + 0x9 * m] + y[i + 0x9 * m]; + y[i + 0xa * m] = a * x[i + 0xa * m] + y[i + 0xa * m]; + y[i + 0xb * m] = a * x[i + 0xb * m] + y[i + 0xb * m]; + y[i + 0xc * m] = a * x[i + 0xc * m] + y[i + 0xc * m]; + y[i + 0xd * m] = a * x[i + 0xd * m] + y[i + 0xd * m]; + y[i + 0xe * m] = a * x[i + 0xe * m] + y[i + 0xe * m]; + y[i + 0xf * m] = a * x[i + 0xf * m] + y[i + 0xf * m]; +} + clock_gettime(CLOCK_REALTIME, rt + 1); +} + break; + default: +/* + * cublasSaxpy in CUBLAS + */ + if (CUBLAS_STATUS_SUCCESS != cublasCreate(&handle)) { + printf("error: initialization (CUBLAS)\n"); + cublasDestroy(handle); + exit(EXIT_FAILURE); + } + if (cudaSuccess != cudaMalloc((void **) &x_dev, sizeof(*x) * n) || + cudaSuccess != cudaMalloc((void **) &y_dev, sizeof(*y) * n)) { + printf("error: memory allocation (CUDA)\n"); + cudaFree(x_dev); cudaFree(y_dev); + cublasDestroy(handle); + exit(EXIT_FAILURE); + } + if (CUBLAS_STATUS_SUCCESS != cublasSetVector(n, sizeof(*x), x, 1, x_dev, 1) || + CUBLAS_STATUS_SUCCESS != cublasSetVector(n, sizeof(*y), y, 1, y_dev, 1)) { + printf("error: host --> accl (CUBLAS)\n"); + cudaFree(x_dev); cudaFree(y_dev); + cublasDestroy(handle); + exit(EXIT_FAILURE); + } + clock_gettime(CLOCK_REALTIME, rt + 0); + if (CUBLAS_STATUS_SUCCESS != cublasSaxpy(handle, n, &alfa, x_dev, 1, y_dev, 1)) { + printf("error: cublasSaxpy (CUBLAS)\n"); + cudaFree(x_dev); cudaFree(y_dev); + cublasDestroy(handle); + exit(EXIT_FAILURE); + } + if (cudaSuccess != cudaDeviceSynchronize()) { + printf("error: device synchronization (CUDA)\n"); + cudaFree(x_dev); cudaFree(y_dev); + cublasDestroy(handle); + exit(EXIT_FAILURE); + } + clock_gettime(CLOCK_REALTIME, rt + 1); + if (CUBLAS_STATUS_SUCCESS != cublasGetVector(n, sizeof(*y), y_dev, 1, y, 1)) { + printf("error: accl --> host (CUBLAS)\n"); + cudaFree(x_dev); cudaFree(y_dev); + cublasDestroy(handle); + exit(EXIT_FAILURE); + } + cudaFree(x_dev); cudaFree(y_dev); + cublasDestroy(handle); + break; + } /* end switch (ial) */ + if (wtcalc >= 0.0) { + wtcalc += (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); + } } #ifdef __cplusplus diff --git a/05_saxpy/src/asaxpy.h b/05_saxpy/src/asaxpy.h index 2a24d39..1ebdb9c 100644 --- a/05_saxpy/src/asaxpy.h +++ b/05_saxpy/src/asaxpy.h @@ -1,9 +1,8 @@ /** * @file asaxpy.h - * @brief Function prototype for performing the \c axpy operation on - * accelerator. + * @brief Function prototype for performing the \c saxpy operation on accelerator. 
* - * This header file contains function prototype for the \c axpy operation, + * This header file contains function prototype for the \c saxpy operation, * which is defined as: * * y := a * x + y @@ -11,10 +10,10 @@ * where: * * - a is a scalar. - * - x and y are vectors each with n elements. + * - x and y are single-precision vectors each with n elements. * * @author Xin Wu (PC²) - * @date 09.01.2020 + * @date 05.04.2020 * @copyright CC BY-SA 2.0 */ @@ -28,23 +27,25 @@ extern "C" { void asaxpy(const int n, const float a, const float *x, - float *y); + float *y, + const int ial); /**< - * @brief Performs the \c axpy operation on accelerator. + * @brief Performs the \c saxpy operation on accelerator. * - * The \c axpy operation is defined as: + * The \c saxpy operation is defined as: * * y := a * x + y * * where: * * - a is a scalar. - * - x and y are vectors each with n elements. + * - x and y are single-precision vectors each with n elements. * - * @param n The number of elements in \p x and \p y. - * @param a The scalar for multiplication. - * @param x The vector \p x in \c axpy. - * @param y The vector \p y in \c axpy. + * @param n The number of elements in \p x and \p y. + * @param a The scalar for multiplication. + * @param x The vector \p x in \c saxpy. + * @param y The vector \p y in \c saxpy. + * @param ial The ial-th implementation. * * @return \c void. */ diff --git a/05_saxpy/src/hsaxpy.c b/05_saxpy/src/hsaxpy.c index 3514a12..22bab18 100644 --- a/05_saxpy/src/hsaxpy.c +++ b/05_saxpy/src/hsaxpy.c @@ -1,8 +1,8 @@ /** * @file hsaxpy.c - * @brief Function definition for performing the \c axpy operation on host. + * @brief Function definition for performing the \c saxpy operation on host. * - * This source file contains function definition for the \c axpy operation, + * This source file contains function definition for the \c saxpy operation, * which is defined as: * * y := a * x + y @@ -10,10 +10,10 @@ * where: * * - a is a scalar. - * - x and y are vectors each with n elements. + * - x and y are single-precision vectors each with n elements. * * @author Xin Wu (PC²) - * @date 09.01.2020 + * @date 05.04.2020 * @copyright CC BY-SA 2.0 */ @@ -21,26 +21,47 @@ extern "C" { #endif +#include #ifdef _OPENMP #include #endif +#include "mkl.h" +#include "wtcalc.h" #include "hsaxpy.h" void hsaxpy(const int n, const float a, const float *x, - float *y) + float *y, + const int ial) { - int i = 0; + struct timespec rt[2]; -#pragma omp parallel \ - default(none) shared(n, a, x, y) private(i) -{ -#pragma omp for simd schedule(simd:static) - for (i = 0; i < n; i++) { + switch (ial) { + case 0: +/* + * - naive implementation + */ +clock_gettime(CLOCK_REALTIME, rt + 0); +#pragma omp parallel for simd schedule(simd:static) \ + default(none) shared(a, n, x, y) + for (int i = 0; i < n; i++) { y[i] = a * x[i] + y[i]; } -} +clock_gettime(CLOCK_REALTIME, rt + 1); + break; + default: +/* + * - saxpy in MKL + */ +clock_gettime(CLOCK_REALTIME, rt + 0); +cblas_saxpy(n, a, x, 1, y, 1); +clock_gettime(CLOCK_REALTIME, rt + 1); + break; + } /* end switch (ial) */ + if (wtcalc >= 0.0) { + wtcalc += (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); + } } #ifdef __cplusplus diff --git a/05_saxpy/src/hsaxpy.h b/05_saxpy/src/hsaxpy.h index 8a7fe73..f6dce4f 100644 --- a/05_saxpy/src/hsaxpy.h +++ b/05_saxpy/src/hsaxpy.h @@ -1,8 +1,8 @@ /** * @file hsaxpy.h - * @brief Function prototype for performing the \c axpy operation on host. 
+ * @brief Function prototype for performing the \c saxpy operation on host. * - * This header file contains function prototype for the \c axpy operation, + * This header file contains function prototype for the \c saxpy operation, * which is defined as: * * y := a * x + y @@ -10,10 +10,10 @@ * where: * * - a is a scalar. - * - x and y are vectors each with n elements. + * - x and y are single-precision vectors each with n elements. * * @author Xin Wu (PC²) - * @date 09.01.2020 + * @date 05.04.2020 * @copyright CC BY-SA 2.0 */ @@ -27,23 +27,25 @@ extern "C" { void hsaxpy(const int n, const float a, const float *x, - float *y); + float *y, + const int ial); /**< - * @brief Performs the \c axpy operation on host. + * @brief Performs the \c saxpy operation on host. * - * The \c axpy operation is defined as: + * The \c saxpy operation is defined as: * * y := a * x + y * * where: * * - a is a scalar. - * - x and y are vectors each with n elements. + * - x and y are single-precision vectors each with n elements. * - * @param n The number of elements in \p x and \p y. - * @param a The scalar for multiplication. - * @param x The vector \p x in \c axpy. - * @param y The vector \p y in \c axpy. + * @param n The number of elements in \p x and \p y. + * @param a The scalar for multiplication. + * @param x The vector \p x in \c saxpy. + * @param y The vector \p y in \c saxpy. + * @param ial The ial-th implementation. * * @return \c void. */ diff --git a/05_saxpy/src/saxpy.c b/05_saxpy/src/saxpy.c index 089ad34..2ff3d25 100644 --- a/05_saxpy/src/saxpy.c +++ b/05_saxpy/src/saxpy.c @@ -4,56 +4,53 @@ * @mainpage saxpy * * @author Xin Wu (PC²) - * @date 09.01.2020 + * @date 05.04.2020 * @copyright CC BY-SA 2.0 * - * saxpy performs the \c axpy operation on host as well as accelerator and then - * compares the FLOPS performance. + * saxpy performs the \c saxpy operation on host as well as accelerator. + * The performance (in MB/s) for different implementations is also compared. * - * The \c axpy operation is defined as: + * The \c saxpy operation is defined as: * * y := a * x + y * * where: * * - a is a scalar. - * - x and y are vectors each with n elements. - * - * The initial value of \c a and elements of \c x[] and \c y[] are specially - * designed, so that the floating-point calculations on host and accelerator can - * be compared \e exactly. - * - * Please note that only one GPU thread is used for the \c axpy - * calculation on accelerator in this version. This can be verified by uncomment - * the \c CFLAGS line in \c configure.ac. + * - x and y are single-precision vectors each with n elements. */ -#include +#include #include #include +#include #include #ifdef _OPENMP #include #endif +#include "mkl.h" #include "hsaxpy.h" #include "asaxpy.h" #include "check1ns.h" +#include "wtcalc.h" -#define TWO02 (1 << 2) -#define TWO04 (1 << 4) -#define TWO08 (1 << 8) -#define TWO27 (1 << 27) +#define TWO22 (1 << 22) +#define NLUP (32) /** * @brief Main entry point for saxpy. 
*/ int main(int argc, char *argv[]) { - int i, n = TWO27, - iret = 0; - float a = 101.0f / TWO02, - *x, *y, - *z; + int i, n, + iret, + ial; + size_t nbytes; + float a = 2.0f, + *x, *y, + *yhost, + *yaccl, + maxabserr; struct timespec rt[2]; double wt; // walltime @@ -70,55 +67,112 @@ int main(int argc, char *argv[]) exit(EXIT_FAILURE); } /* - * prepare x, y, and z - * - * y := a * x + y (on host) - * z := a * x + z (on accel) + * preparation */ - if (NULL == (x = (float *) malloc(sizeof(*x) * n))) { - printf("error: memory allocation for 'x'\n"); - iret = -1; - } - if (NULL == (y = (float *) malloc(sizeof(*y) * n))) { - printf("error: memory allocation for 'y'\n"); - iret = -1; - } - if (NULL == (z = (float *) malloc(sizeof(*z) * n))) { - printf("error: memory allocation for 'z'\n"); - iret = -1; - } + n = TWO22; + nbytes = sizeof(float) * n; + iret = 0; + if (NULL == (x = (float *) mkl_malloc(nbytes, (16 * 256)))) iret = -1; + if (NULL == (y = (float *) mkl_malloc(nbytes, (16 * 256)))) iret = -1; + if (NULL == (yhost = (float *) mkl_malloc(nbytes, (16 * 256)))) iret = -1; + if (NULL == (yaccl = (float *) mkl_malloc(nbytes, (16 * 256)))) iret = -1; if (0 != iret) { - free(x); - free(y); - free(z); + printf("error: memory allocation\n"); + mkl_free(x); mkl_free(y); + mkl_free(yhost); mkl_free(yaccl); exit(EXIT_FAILURE); } - for (i = 0; i < n; i++) { - x[i] = rand() % TWO04 / (float) TWO02; - y[i] = z[i] = rand() % TWO08 / (float) TWO04; +#pragma omp parallel for default(none) \ + shared(a, x, y, yhost, yaccl, n) private(i) + for (i = 0; i < n; ++i) { + x[i] = rand() % 32 / 32.0f; + y[i] = rand() % 32 / 32.0f; + yhost[i] = a * x[i] + y[i]; // yhost will be used as reference value + yaccl[i] = 0.0f; } + printf("total size of x and y is %9.1f MB\n", 2.0 * nbytes / (1 << 20)); + printf("tests are averaged over %2d loops\n", NLUP); /* * saxpy on host */ - clock_gettime(CLOCK_REALTIME, rt + 0); - hsaxpy(n, a, x, y); - clock_gettime(CLOCK_REALTIME, rt + 1); - wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); - printf("saxpy on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt)); + for (ial = 0; ial < 2; ++ial) { + /* + * See hsaxpy.c for details: + * + * ial: + * + * 0: naive implementation + * otherwise: saxpy in MKL + */ + memcpy(yaccl, y, nbytes); + wtcalc = -1.0; + // skip 1st run for timing + hsaxpy(n, a, x, yaccl, ial); + // check yaccl + maxabserr = -1.0f; + for (i = 0; i < n; ++i) { + maxabserr = fabsf(yaccl[i] - yhost[i]) > maxabserr? + fabsf(yaccl[i] - yhost[i]) : maxabserr; + } + // skip 2nd run for timing + hsaxpy(n, a, x, yaccl, ial); + // timing : start + wtcalc = 0.0; + clock_gettime(CLOCK_REALTIME, rt + 0); + for (int ilup = 0; ilup < NLUP; ++ilup) { + hsaxpy(n, a, x, yaccl, ial); + } + clock_gettime(CLOCK_REALTIME, rt + 1); + wt=(rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); + printf("saxpy on host (%d) : %9.1f MB/s %9.1f MB/s maxabserr = %9.1f\n", + ial, NLUP * 3.0 * nbytes / ((1 << 20) * wt), + NLUP * 3.0 * nbytes / ((1 << 20) * wtcalc), maxabserr); + } /* - * saxpy on accel + * saxpy on accl */ - clock_gettime(CLOCK_REALTIME, rt + 0); - asaxpy(n, a, x, z); - clock_gettime(CLOCK_REALTIME, rt + 1); - wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); - printf("saxpy on accel: %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt)); + for (ial = 1; ial < 7; ++ial) { + /* + * See asaxpy.c for details: + * + * ial: + * + * 0: <<< 1, 1>>>, TOO SLOW! 
not tested + * 1: <<< 1, 128>>> + * 2: <<< 128, 1>>> + * 3: <<< 128, 128>>> + * 4: <<>> + * 5: <<>>, 16x loop unrolling + * otherwise: cublasSaxpy in CUBLAS + */ + memcpy(yaccl, y, nbytes); + wtcalc = -1.0; + // skip 1st run for timing + asaxpy(n, a, x, yaccl, ial); + // check yaccl + maxabserr = -1.0f; + for (i = 0; i < n; ++i) { + maxabserr = fabsf(yaccl[i] - yhost[i]) > maxabserr? + fabsf(yaccl[i] - yhost[i]) : maxabserr; + } + // skip 2nd run for timing + asaxpy(n, a, x, yaccl, ial); + // timing : start + wtcalc = 0.0; + clock_gettime(CLOCK_REALTIME, rt + 0); + for (int ilup = 0; ilup < NLUP; ++ilup) { + asaxpy(n, a, x, yaccl, ial); + } + clock_gettime(CLOCK_REALTIME, rt + 1); + wt=(rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); + printf("saxpy on accl (%d) : %9.1f MB/s %9.1f MB/s maxabserr = %9.1f\n", + ial, NLUP * 3.0 * nbytes / ((1 << 20) * wt), + NLUP * 3.0 * nbytes / ((1 << 20) * wtcalc), maxabserr); + } /* - * check whether y[] == z[] _exactly_ + * release memory */ - for (i = 0; i < n; i++) { - iret = *(int *) (y + i) ^ *(int *) (z + i); - assert(iret == 0); - } + mkl_free(x); mkl_free(y); + mkl_free(yhost); mkl_free(yaccl); return 0; } diff --git a/05_saxpy/src/wtcalc.c b/05_saxpy/src/wtcalc.c new file mode 100644 index 0000000..bb00340 --- /dev/null +++ b/05_saxpy/src/wtcalc.c @@ -0,0 +1,21 @@ +/** + * @file wtcalc.c + * + * @brief Global variable for walltime of the calculation kernel. + * + * @author Xin Wu (PC²) + * @date 05.04.2020 + * @copyright CC BY-SA 2.0 + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "wtcalc.h" + +double wtcalc; + +#ifdef __cplusplus +} +#endif diff --git a/05_saxpy/src/wtcalc.h b/05_saxpy/src/wtcalc.h new file mode 100644 index 0000000..483951e --- /dev/null +++ b/05_saxpy/src/wtcalc.h @@ -0,0 +1,30 @@ +/** + * @file wtcalc.h + * + * @brief Global variable for walltime of the calculation kernel. + * + * @author Xin Wu (PC²) + * @date 05.04.2020 + * @copyright CC BY-SA 2.0 + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef WATCALC_H +#define WATCALC_H + +/* + * wtcalc: walltime for the calculation kernel + * + * - wtcalc < 0.0: reset and disable the timer + * - wtcalc == 0.0: enable the timer + */ +extern double wtcalc; + +#endif + +#ifdef __cplusplus +} +#endif diff --git a/05_saxpy/tests/saxpy_real_00.sh b/05_saxpy/tests/saxpy_real_00.sh index a39dd00..6255fd4 100755 --- a/05_saxpy/tests/saxpy_real_00.sh +++ b/05_saxpy/tests/saxpy_real_00.sh @@ -1,8 +1,8 @@ #!/bin/bash #CCS -N saxpy -#CCS -t 10m +#CCS -t 600m #CCS -g pc2-mitarbeiter -#CCS --res=rset=1:ncpus=1:mem=4g:vmem=8g:tesla=1 +#CCS --res=rset=1:gtx1080=1,place=:excl echo "hallo from $(hostname)" ../src/saxpy diff --git a/05_saxpy/tests/saxpy_real_00.sh.5383621.out b/05_saxpy/tests/saxpy_real_00.sh.5383621.out new file mode 100644 index 0000000..2904da0 --- /dev/null +++ b/05_saxpy/tests/saxpy_real_00.sh.5383621.out @@ -0,0 +1,12 @@ +hallo from gpu011 +The system supports 1 ns time resolution +total size of x and y is 32.0 MB +tests are averaged over 32 loops +saxpy on host (0) : 43431.2 MB/s 43438.4 MB/s maxabserr = 0.0 +saxpy on host (1) : 446622.5 MB/s 447142.8 MB/s maxabserr = 0.0 +saxpy on accl (1) : 75.2 MB/s 78.6 MB/s maxabserr = 0.0 +saxpy on accl (2) : 75.7 MB/s 79.0 MB/s maxabserr = 0.0 +saxpy on accl (3) : 1945.7 MB/s 6071.8 MB/s maxabserr = 0.0 +saxpy on accl (4) : 1689.3 MB/s 4074.8 MB/s maxabserr = 0.0 +saxpy on accl (5) : 2644.2 MB/s 30438.5 MB/s maxabserr = 0.0 +saxpy on accl (6) : 2762.5 MB/s 127182.3 MB/s maxabserr = 0.0
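For reference, the MB/s figures in this sample output follow the formula used in `saxpy.c`: each `saxpy` call moves about 3 * nbytes of data (read `x`, read `y`, write `y`), the timed region repeats the call NLUP times, and the result is divided by the wall time and by 2^20. The two columns apply the same formula to different timers: the first uses the total wall time around the NLUP calls (for the accelerator variants this includes host-device data transfer), the second uses `wtcalc`, which accumulates only the time spent in the compute kernel itself. The helper below is a small sketch of that calculation; the function name and the sample timing are illustrative, not taken from the patch.

```c
/*
 * Sketch of the bandwidth figure reported by saxpy.c:
 * traffic per call = 3 * nbytes (read x, read y, write y),
 * repeated nlup times, divided by the wall time in seconds and by 2^20.
 */
#include <stdio.h>

static double saxpy_mbps(int nlup, size_t nbytes, double wt_seconds)
{
  return nlup * 3.0 * nbytes / ((1 << 20) * wt_seconds);
}

int main(void)
{
  size_t nbytes = sizeof(float) * (1 << 22);   /* 16 MiB per vector */
  /* 32 loops finishing in 0.012 s (an illustrative timing, not measured) */
  printf("%9.1f MB/s\n", saxpy_mbps(32, nbytes, 0.012));
  return 0;
}
```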