diff --git a/05_saxpy/README.md b/05_saxpy/README.md index ac59e67..f4d543e 100644 --- a/05_saxpy/README.md +++ b/05_saxpy/README.md @@ -1,35 +1,50 @@ --- title: saxpy author: Xin Wu (PC²) -date: 10.01.2020 +date: 05.04.2020 --- # Introduction -`saxpy` performs the `axpy` operation on host as well as accelerator and then -compares the FLOPS performance. +`saxpy` performs the `saxpy` operation on host as well as accelerator. +The performance (in MB/s) for different implementations is also compared. -The `axpy` operation is defined as: +The `saxpy` operation is defined as: $$ y := a * x + y $$ where: * `a` is a scalar. -* `x` and `y` are vectors each with n elements. - -The initial value of `a` and elements of `x[]` and `y[]` are specially designed, -so that the floating-point calculations on host and accelerator can be compared -_exactly_. - -Please note that only _one GPU thread_ is used for the `axpy` calculation on -accelerator in this version. This can be verified by uncomment the `CFLAGS` line -in `configure.ac`. +* `x` and `y` are single-precision vectors each with n elements. +* For testing n is assumed to be $2^{22}$. +* The following table only summarizes the most important points. For more + details on the ial-th implementation see comments in `hsaxpy.c` (on host) + and `asaxpy.c` (on accelerator). + + - on host + +| ial | Remarks | +|:---:|------------------------------------------------------------------------| +| 0 | naive implementation | +| 1 | saxpy in MKL | + + - on accl + +| ial | Remarks | +|:---:|------------------------------------------------------------------------| +| 0 | <<< 1, 1>>>, TOO SLOW! not tested | +| 1 | <<< 1, 128>>> | +| 2 | <<< 128, 1>>> | +| 3 | <<< 128, 128>>> | +| 4 | <<>> | +| 5 | <<>>, 16x loop unrolling | +| 6 | cublasSaxpy in CUBLAS | # Build ```bash -autoreconf -i; ./configure; make; make check; sudo make install; +autoreconf -i; ./configure; make; make check; ``` `make check` has been tested on OCuLUS (with OpenCCS) and P53s (without OpenCCS). diff --git a/05_saxpy/configure.ac b/05_saxpy/configure.ac index aae1231..2075c61 100644 --- a/05_saxpy/configure.ac +++ b/05_saxpy/configure.ac @@ -18,18 +18,31 @@ if test -z "${CUDALIB}"; then fi ##############################################################################80 # +# check MKL +# +##############################################################################80 +AC_ARG_VAR([MKLINC], [The PATH wherein mkl.h can be found]) +if test -z "${MKLINC}"; then + AC_SUBST([MKLINC], [${MKLROOT}/include]) +fi +AC_ARG_VAR([MKLLIB], [The PATH wherein MKL library can be found]) +if test -z "${MKLLIB}"; then + AC_SUBST([MKLLIB], [${MKLROOT}/lib/intel64]) +fi +##############################################################################80 +# # check C compiler # ##############################################################################80 +CFLAGS+="-I${CUDAINC} -I${MKLINC}" +LDFLAGS+="-L${CUDALIB} -L${MKLLIB}" +# AC_PROG_CC([clang gcc]) -##CFLAGS+=" -DDEBUG" AS_IF([test "${CC}" = gcc], - [CFLAGS="-Wall -fopenmp -foffload=nvptx-none -I${CUDAINC} $CFLAGS" - LDFLAGS="-L${CUDALIB} $LDFLAGS"]) + [CFLAGS="-Wall -fopenmp -foffload=nvptx-none $CFLAGS"]) AS_IF([test "${CC}" = clang], - [CFLAGS="-Wall -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -I${CUDAINC} \ - -Xopenmp-target -march=sm_61 $CFLAGS" - LDFLAGS="-L${CUDALIB} $LDFLAGS"]) + [CFLAGS="-Wall -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda \ + -Xopenmp-target -march=sm_61 $CFLAGS"]) ##############################################################################80 # # check archiver @@ -44,6 +57,10 @@ AM_PROG_AR ##############################################################################80 AC_CHECK_HEADER([cuda_runtime.h], [], [AC_MSG_ERROR([cuda_runtime.h required, but not found])], []) +AC_CHECK_HEADER([cublas_v2.h], [], + [AC_MSG_ERROR([cublas_v2.h required, but not found])], []) +AC_CHECK_HEADER([mkl.h], [], + [AC_MSG_ERROR([mkl.h required, but not found])], []) ##############################################################################80 # # check libraries @@ -51,6 +68,21 @@ AC_CHECK_HEADER([cuda_runtime.h], [], ##############################################################################80 AC_CHECK_LIB([cudart], [cudaSetDevice], [], [AC_MSG_ERROR([libcudart required, but not found])], []) +AC_CHECK_LIB([cublas], [cublasSaxpy], [], + [AC_MSG_ERROR([libcublas required, but not found])], []) +AC_CHECK_LIB([pthread], [pthread_create], [], + [AC_MSG_ERROR([libpthread required, but not found])], []) +AC_CHECK_LIB([iomp5], [omp_set_num_threads], [], + [AC_MSG_ERROR([libiomp5 required, but not found])], []) +AC_CHECK_LIB([mkl_core], [mkl_blas_xsaxpy], [], + [AC_MSG_ERROR([libmkl_core required, but not found])], + [-lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lm]) +AC_CHECK_LIB([mkl_intel_thread], [mkl_blas_saxpy], [], + [AC_MSG_ERROR([libmkl_intel_thread required, but not found])], + [-lmkl_intel_lp64 -lmkl_core -liomp5 -lm]) +AC_CHECK_LIB([mkl_intel_lp64], [saxpy], [], + [AC_MSG_ERROR([libmkl_intel_lp64 required, but not found])], + [-lmkl_intel_thread -lmkl_core -liomp5 -lm]) ##############################################################################80 # # check Doxygen diff --git a/05_saxpy/docs/UserManual.md b/05_saxpy/docs/UserManual.md index 9056e10..c989f62 100644 --- a/05_saxpy/docs/UserManual.md +++ b/05_saxpy/docs/UserManual.md @@ -1,30 +1,45 @@ --- title: saxpy author: Xin Wu (PC²) -date: 10.01.2020 +date: 05.04.2020 --- # Introduction -`saxpy` performs the `axpy` operation on host as well as accelerator and then -compares the FLOPS performance. +`saxpy` performs the `saxpy` operation on host as well as accelerator. +The performance (in MB/s) for different implementations is also compared. -The `axpy` operation is defined as: +The `saxpy` operation is defined as: $$ y := a * x + y $$ where: * `a` is a scalar. -* `x` and `y` are vectors each with n elements. - -The initial value of `a` and elements of `x[]` and `y[]` are specially designed, -so that the floating-point calculations on host and accelerator can be compared -_exactly_. - -Please note that only _one GPU thread_ is used for the `axpy` calculation on -accelerator in this version. This can be verified by uncomment the `CFLAGS` line -in `configure.ac`. +* `x` and `y` are single-precision vectors each with n elements. +* For testing n is assumed to be $2^{22}$. +* The following table only summarizes the most important points. For more + details on the ial-th implementation see comments in `hsaxpy.c` (on host) + and `asaxpy.c` (on accelerator). + + - on host + +| ial | Remarks | +|:---:|------------------------------------------------------------------------| +| 0 | naive implementation | +| 1 | saxpy in MKL | + + - on accl + +| ial | Remarks | +|:---:|------------------------------------------------------------------------| +| 0 | <<< 1, 1>>>, TOO SLOW! not tested | +| 1 | <<< 1, 128>>> | +| 2 | <<< 128, 1>>> | +| 3 | <<< 128, 128>>> | +| 4 | <<>> | +| 5 | <<>>, 16x loop unrolling | +| 6 | cublasSaxpy in CUBLAS | # Usage diff --git a/05_saxpy/src/Makefile.am b/05_saxpy/src/Makefile.am index 2ef9113..d3c2053 100644 --- a/05_saxpy/src/Makefile.am +++ b/05_saxpy/src/Makefile.am @@ -5,4 +5,6 @@ saxpy_SOURCES = saxpy.c \ hsaxpy.h \ hsaxpy.c \ asaxpy.h \ - asaxpy.c + asaxpy.c \ + wtcalc.h \ + wtcalc.c diff --git a/05_saxpy/src/asaxpy.c b/05_saxpy/src/asaxpy.c index 3d630e9..9c53b80 100644 --- a/05_saxpy/src/asaxpy.c +++ b/05_saxpy/src/asaxpy.c @@ -1,9 +1,8 @@ /** * @file asaxpy.c - * @brief Function definition for performing the \c axpy operation on - * accelerator. + * @brief Function definition for performing the \c saxpy operation on accelerator. * - * This source file contains function definition for the \c axpy operation, + * This source file contains function definition for the \c saxpy operation, * which is defined as: * * y := a * x + y @@ -11,10 +10,10 @@ * where: * * - a is a scalar. - * - x and y are vectors each with n elements. + * - x and y are single-precision vectors each with n elements. * * @author Xin Wu (PC²) - * @date 09.01.2020 + * @date 05.04.2020 * @copyright CC BY-SA 2.0 */ @@ -22,40 +21,217 @@ extern "C" { #endif -#ifdef DEBUG #include -#endif +#include +#include #ifdef _OPENMP #include #endif +#include +#include "cublas_v2.h" +#include "wtcalc.h" #include "asaxpy.h" void asaxpy(const int n, const float a, const float *x, - float *y) + float *y, + const int ial) { - int i = 0; -#ifdef DEBUG - int nth; -#endif + cublasHandle_t handle; + float alfa = a, + *x_dev = NULL, + *y_dev = NULL; + struct timespec rt[2]; + int m = (n >> 4); -#ifdef DEBUG -#pragma omp target map(n, a, x[0:n], y[0:n], nth) private(i) -#else -#pragma omp target map(n, a, x[0:n], y[0:n] ) private(i) -#endif + switch (ial) { + case 0: +/* + * - <<<1, 1>>> + */ +#pragma omp target data device(0) \ + map(to:a, n, x[0:n]) map(tofrom:y[0:n]) { -#ifdef DEBUG - nth = omp_get_num_threads(); -#endif - for (i = 0; i < n; i++) { - y[i] = a * x[i] + y[i]; - } + clock_gettime(CLOCK_REALTIME, rt + 0); +#pragma omp target teams device(0) num_teams(1) \ + map(to:a, n, x[0:n]) map(tofrom:y[0:n]) \ + default(none) shared(a, n, x, y) +#pragma omp distribute parallel for num_threads(1) \ + dist_schedule(static, 1) \ + default(none) shared(a, n, x, y) +for (int i = 0; i < n; ++i) { + y[i] = a * x[i] + y[i]; } -#ifdef DEBUG -printf("DEBUG: number of threads on accelerator: %d\n", nth); -#endif + clock_gettime(CLOCK_REALTIME, rt + 1); +} + break; + case 1: +/* + * - <<<1, 128>>> + */ +#pragma omp target data device(0) \ + map(to:a, n, x[0:n]) map(tofrom:y[0:n]) +{ + clock_gettime(CLOCK_REALTIME, rt + 0); +#pragma omp target teams device(0) num_teams(1) \ + map(to:a, n, x[0:n]) map(tofrom:y[0:n]) \ + default(none) shared(a, n, x, y) +#pragma omp distribute parallel for num_threads(128) \ + dist_schedule(static, 128) \ + default(none) shared(a, n, x, y) +for (int i = 0; i < n; ++i) { + y[i] = a * x[i] + y[i]; +} + clock_gettime(CLOCK_REALTIME, rt + 1); +} + break; + case 2: +/* + * - <<<128, 1>>> + */ +#pragma omp target data device(0) \ + map(to:a, n, x[0:n]) map(tofrom:y[0:n]) +{ + clock_gettime(CLOCK_REALTIME, rt + 0); +#pragma omp target teams device(0) num_teams(128) \ + map(to:a, n, x[0:n]) map(tofrom:y[0:n]) \ + default(none) shared(a, n, x, y) +#pragma omp distribute parallel for num_threads(1) \ + dist_schedule(static, 1) \ + default(none) shared(a, n, x, y) +for (int i = 0; i < n; ++i) { + y[i] = a * x[i] + y[i]; +} + clock_gettime(CLOCK_REALTIME, rt + 1); +} + break; + case 3: +/* + * - <<<128, 128>>> + */ +#pragma omp target data device(0) \ + map(to:a, n, x[0:n]) map(tofrom:y[0:n]) +{ + clock_gettime(CLOCK_REALTIME, rt + 0); +#pragma omp target teams device(0) num_teams(128) \ + map(to:a, n, x[0:n]) map(tofrom:y[0:n]) \ + default(none) shared(a, n, x, y) +#pragma omp distribute parallel for num_threads(128) \ + dist_schedule(static, 128) \ + default(none) shared(a, n, x, y) +for (int i = 0; i < n; ++i) { + y[i] = a * x[i] + y[i]; +} + clock_gettime(CLOCK_REALTIME, rt + 1); +} + break; + case 4: +/* + * - <<>> + */ +#pragma omp target data device(0) \ + map(to:a, n, x[0:n]) map(tofrom:y[0:n]) +{ + clock_gettime(CLOCK_REALTIME, rt + 0); +#pragma omp target teams device(0) num_teams((1 << 15)) \ + map(to:a, n, x[0:n]) map(tofrom:y[0:n]) \ + default(none) shared(a, n, x, y) +#pragma omp distribute parallel for num_threads(128) \ + dist_schedule(static, 128) \ + default(none) shared(a, n, x, y) +for (int i = 0; i < n; ++i) { + y[i] = a * x[i] + y[i]; +} + clock_gettime(CLOCK_REALTIME, rt + 1); +} + break; + case 5: +/* + * - <<>> + * - 16x loop-unrolling + */ +#pragma omp target data device(0) \ + map(to:a, m, x[0:n]) map(tofrom:y[0:n]) +{ + clock_gettime(CLOCK_REALTIME, rt + 0); +#pragma omp target teams device(0) num_teams(2048) \ + map(to:a, m, x[0:n]) map(tofrom:y[0:n]) \ + default(none) shared(a, m, x, y) +#pragma omp distribute parallel for num_threads(128) \ + dist_schedule(static, 128) \ + default(none) shared(a, m, x, y) +for (int i = 0; i < m; ++i) { + y[i ] = a * x[i ] + y[i ]; + y[i + m] = a * x[i + m] + y[i + m]; + y[i + 0x2 * m] = a * x[i + 0x2 * m] + y[i + 0x2 * m]; + y[i + 0x3 * m] = a * x[i + 0x3 * m] + y[i + 0x3 * m]; + y[i + 0x4 * m] = a * x[i + 0x4 * m] + y[i + 0x4 * m]; + y[i + 0x5 * m] = a * x[i + 0x5 * m] + y[i + 0x5 * m]; + y[i + 0x6 * m] = a * x[i + 0x6 * m] + y[i + 0x6 * m]; + y[i + 0x7 * m] = a * x[i + 0x7 * m] + y[i + 0x7 * m]; + y[i + 0x8 * m] = a * x[i + 0x8 * m] + y[i + 0x8 * m]; + y[i + 0x9 * m] = a * x[i + 0x9 * m] + y[i + 0x9 * m]; + y[i + 0xa * m] = a * x[i + 0xa * m] + y[i + 0xa * m]; + y[i + 0xb * m] = a * x[i + 0xb * m] + y[i + 0xb * m]; + y[i + 0xc * m] = a * x[i + 0xc * m] + y[i + 0xc * m]; + y[i + 0xd * m] = a * x[i + 0xd * m] + y[i + 0xd * m]; + y[i + 0xe * m] = a * x[i + 0xe * m] + y[i + 0xe * m]; + y[i + 0xf * m] = a * x[i + 0xf * m] + y[i + 0xf * m]; +} + clock_gettime(CLOCK_REALTIME, rt + 1); +} + break; + default: +/* + * cublasSaxpy in CUBLAS + */ + if (CUBLAS_STATUS_SUCCESS != cublasCreate(&handle)) { + printf("error: initialization (CUBLAS)\n"); + cublasDestroy(handle); + exit(EXIT_FAILURE); + } + if (cudaSuccess != cudaMalloc((void **) &x_dev, sizeof(*x) * n) || + cudaSuccess != cudaMalloc((void **) &y_dev, sizeof(*y) * n)) { + printf("error: memory allocation (CUDA)\n"); + cudaFree(x_dev); cudaFree(y_dev); + cublasDestroy(handle); + exit(EXIT_FAILURE); + } + if (CUBLAS_STATUS_SUCCESS != cublasSetVector(n, sizeof(*x), x, 1, x_dev, 1) || + CUBLAS_STATUS_SUCCESS != cublasSetVector(n, sizeof(*y), y, 1, y_dev, 1)) { + printf("error: host --> accl (CUBLAS)\n"); + cudaFree(x_dev); cudaFree(y_dev); + cublasDestroy(handle); + exit(EXIT_FAILURE); + } + clock_gettime(CLOCK_REALTIME, rt + 0); + if (CUBLAS_STATUS_SUCCESS != cublasSaxpy(handle, n, &alfa, x_dev, 1, y_dev, 1)) { + printf("error: cublasSaxpy (CUBLAS)\n"); + cudaFree(x_dev); cudaFree(y_dev); + cublasDestroy(handle); + exit(EXIT_FAILURE); + } + if (cudaSuccess != cudaDeviceSynchronize()) { + printf("error: device synchronization (CUDA)\n"); + cudaFree(x_dev); cudaFree(y_dev); + cublasDestroy(handle); + exit(EXIT_FAILURE); + } + clock_gettime(CLOCK_REALTIME, rt + 1); + if (CUBLAS_STATUS_SUCCESS != cublasGetVector(n, sizeof(*y), y_dev, 1, y, 1)) { + printf("error: accl --> host (CUBLAS)\n"); + cudaFree(x_dev); cudaFree(y_dev); + cublasDestroy(handle); + exit(EXIT_FAILURE); + } + cudaFree(x_dev); cudaFree(y_dev); + cublasDestroy(handle); + break; + } /* end switch (ial) */ + if (wtcalc >= 0.0) { + wtcalc += (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); + } } #ifdef __cplusplus diff --git a/05_saxpy/src/asaxpy.h b/05_saxpy/src/asaxpy.h index 2a24d39..1ebdb9c 100644 --- a/05_saxpy/src/asaxpy.h +++ b/05_saxpy/src/asaxpy.h @@ -1,9 +1,8 @@ /** * @file asaxpy.h - * @brief Function prototype for performing the \c axpy operation on - * accelerator. + * @brief Function prototype for performing the \c saxpy operation on accelerator. * - * This header file contains function prototype for the \c axpy operation, + * This header file contains function prototype for the \c saxpy operation, * which is defined as: * * y := a * x + y @@ -11,10 +10,10 @@ * where: * * - a is a scalar. - * - x and y are vectors each with n elements. + * - x and y are single-precision vectors each with n elements. * * @author Xin Wu (PC²) - * @date 09.01.2020 + * @date 05.04.2020 * @copyright CC BY-SA 2.0 */ @@ -28,23 +27,25 @@ extern "C" { void asaxpy(const int n, const float a, const float *x, - float *y); + float *y, + const int ial); /**< - * @brief Performs the \c axpy operation on accelerator. + * @brief Performs the \c saxpy operation on accelerator. * - * The \c axpy operation is defined as: + * The \c saxpy operation is defined as: * * y := a * x + y * * where: * * - a is a scalar. - * - x and y are vectors each with n elements. + * - x and y are single-precision vectors each with n elements. * - * @param n The number of elements in \p x and \p y. - * @param a The scalar for multiplication. - * @param x The vector \p x in \c axpy. - * @param y The vector \p y in \c axpy. + * @param n The number of elements in \p x and \p y. + * @param a The scalar for multiplication. + * @param x The vector \p x in \c saxpy. + * @param y The vector \p y in \c saxpy. + * @param ial The ial-th implementation. * * @return \c void. */ diff --git a/05_saxpy/src/hsaxpy.c b/05_saxpy/src/hsaxpy.c index 3514a12..22bab18 100644 --- a/05_saxpy/src/hsaxpy.c +++ b/05_saxpy/src/hsaxpy.c @@ -1,8 +1,8 @@ /** * @file hsaxpy.c - * @brief Function definition for performing the \c axpy operation on host. + * @brief Function definition for performing the \c saxpy operation on host. * - * This source file contains function definition for the \c axpy operation, + * This source file contains function definition for the \c saxpy operation, * which is defined as: * * y := a * x + y @@ -10,10 +10,10 @@ * where: * * - a is a scalar. - * - x and y are vectors each with n elements. + * - x and y are single-precision vectors each with n elements. * * @author Xin Wu (PC²) - * @date 09.01.2020 + * @date 05.04.2020 * @copyright CC BY-SA 2.0 */ @@ -21,26 +21,47 @@ extern "C" { #endif +#include #ifdef _OPENMP #include #endif +#include "mkl.h" +#include "wtcalc.h" #include "hsaxpy.h" void hsaxpy(const int n, const float a, const float *x, - float *y) + float *y, + const int ial) { - int i = 0; + struct timespec rt[2]; -#pragma omp parallel \ - default(none) shared(n, a, x, y) private(i) -{ -#pragma omp for simd schedule(simd:static) - for (i = 0; i < n; i++) { + switch (ial) { + case 0: +/* + * - naive implementation + */ +clock_gettime(CLOCK_REALTIME, rt + 0); +#pragma omp parallel for simd schedule(simd:static) \ + default(none) shared(a, n, x, y) + for (int i = 0; i < n; i++) { y[i] = a * x[i] + y[i]; } -} +clock_gettime(CLOCK_REALTIME, rt + 1); + break; + default: +/* + * - saxpy in MKL + */ +clock_gettime(CLOCK_REALTIME, rt + 0); +cblas_saxpy(n, a, x, 1, y, 1); +clock_gettime(CLOCK_REALTIME, rt + 1); + break; + } /* end switch (ial) */ + if (wtcalc >= 0.0) { + wtcalc += (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); + } } #ifdef __cplusplus diff --git a/05_saxpy/src/hsaxpy.h b/05_saxpy/src/hsaxpy.h index 8a7fe73..f6dce4f 100644 --- a/05_saxpy/src/hsaxpy.h +++ b/05_saxpy/src/hsaxpy.h @@ -1,8 +1,8 @@ /** * @file hsaxpy.h - * @brief Function prototype for performing the \c axpy operation on host. + * @brief Function prototype for performing the \c saxpy operation on host. * - * This header file contains function prototype for the \c axpy operation, + * This header file contains function prototype for the \c saxpy operation, * which is defined as: * * y := a * x + y @@ -10,10 +10,10 @@ * where: * * - a is a scalar. - * - x and y are vectors each with n elements. + * - x and y are single-precision vectors each with n elements. * * @author Xin Wu (PC²) - * @date 09.01.2020 + * @date 05.04.2020 * @copyright CC BY-SA 2.0 */ @@ -27,23 +27,25 @@ extern "C" { void hsaxpy(const int n, const float a, const float *x, - float *y); + float *y, + const int ial); /**< - * @brief Performs the \c axpy operation on host. + * @brief Performs the \c saxpy operation on host. * - * The \c axpy operation is defined as: + * The \c saxpy operation is defined as: * * y := a * x + y * * where: * * - a is a scalar. - * - x and y are vectors each with n elements. + * - x and y are single-precision vectors each with n elements. * - * @param n The number of elements in \p x and \p y. - * @param a The scalar for multiplication. - * @param x The vector \p x in \c axpy. - * @param y The vector \p y in \c axpy. + * @param n The number of elements in \p x and \p y. + * @param a The scalar for multiplication. + * @param x The vector \p x in \c saxpy. + * @param y The vector \p y in \c saxpy. + * @param ial The ial-th implementation. * * @return \c void. */ diff --git a/05_saxpy/src/saxpy.c b/05_saxpy/src/saxpy.c index 089ad34..2ff3d25 100644 --- a/05_saxpy/src/saxpy.c +++ b/05_saxpy/src/saxpy.c @@ -4,56 +4,53 @@ * @mainpage saxpy * * @author Xin Wu (PC²) - * @date 09.01.2020 + * @date 05.04.2020 * @copyright CC BY-SA 2.0 * - * saxpy performs the \c axpy operation on host as well as accelerator and then - * compares the FLOPS performance. + * saxpy performs the \c saxpy operation on host as well as accelerator. + * The performance (in MB/s) for different implementations is also compared. * - * The \c axpy operation is defined as: + * The \c saxpy operation is defined as: * * y := a * x + y * * where: * * - a is a scalar. - * - x and y are vectors each with n elements. - * - * The initial value of \c a and elements of \c x[] and \c y[] are specially - * designed, so that the floating-point calculations on host and accelerator can - * be compared \e exactly. - * - * Please note that only one GPU thread is used for the \c axpy - * calculation on accelerator in this version. This can be verified by uncomment - * the \c CFLAGS line in \c configure.ac. + * - x and y are single-precision vectors each with n elements. */ -#include +#include #include #include +#include #include #ifdef _OPENMP #include #endif +#include "mkl.h" #include "hsaxpy.h" #include "asaxpy.h" #include "check1ns.h" +#include "wtcalc.h" -#define TWO02 (1 << 2) -#define TWO04 (1 << 4) -#define TWO08 (1 << 8) -#define TWO27 (1 << 27) +#define TWO22 (1 << 22) +#define NLUP (32) /** * @brief Main entry point for saxpy. */ int main(int argc, char *argv[]) { - int i, n = TWO27, - iret = 0; - float a = 101.0f / TWO02, - *x, *y, - *z; + int i, n, + iret, + ial; + size_t nbytes; + float a = 2.0f, + *x, *y, + *yhost, + *yaccl, + maxabserr; struct timespec rt[2]; double wt; // walltime @@ -70,55 +67,112 @@ int main(int argc, char *argv[]) exit(EXIT_FAILURE); } /* - * prepare x, y, and z - * - * y := a * x + y (on host) - * z := a * x + z (on accel) + * preparation */ - if (NULL == (x = (float *) malloc(sizeof(*x) * n))) { - printf("error: memory allocation for 'x'\n"); - iret = -1; - } - if (NULL == (y = (float *) malloc(sizeof(*y) * n))) { - printf("error: memory allocation for 'y'\n"); - iret = -1; - } - if (NULL == (z = (float *) malloc(sizeof(*z) * n))) { - printf("error: memory allocation for 'z'\n"); - iret = -1; - } + n = TWO22; + nbytes = sizeof(float) * n; + iret = 0; + if (NULL == (x = (float *) mkl_malloc(nbytes, (16 * 256)))) iret = -1; + if (NULL == (y = (float *) mkl_malloc(nbytes, (16 * 256)))) iret = -1; + if (NULL == (yhost = (float *) mkl_malloc(nbytes, (16 * 256)))) iret = -1; + if (NULL == (yaccl = (float *) mkl_malloc(nbytes, (16 * 256)))) iret = -1; if (0 != iret) { - free(x); - free(y); - free(z); + printf("error: memory allocation\n"); + mkl_free(x); mkl_free(y); + mkl_free(yhost); mkl_free(yaccl); exit(EXIT_FAILURE); } - for (i = 0; i < n; i++) { - x[i] = rand() % TWO04 / (float) TWO02; - y[i] = z[i] = rand() % TWO08 / (float) TWO04; +#pragma omp parallel for default(none) \ + shared(a, x, y, yhost, yaccl, n) private(i) + for (i = 0; i < n; ++i) { + x[i] = rand() % 32 / 32.0f; + y[i] = rand() % 32 / 32.0f; + yhost[i] = a * x[i] + y[i]; // yhost will be used as reference value + yaccl[i] = 0.0f; } + printf("total size of x and y is %9.1f MB\n", 2.0 * nbytes / (1 << 20)); + printf("tests are averaged over %2d loops\n", NLUP); /* * saxpy on host */ - clock_gettime(CLOCK_REALTIME, rt + 0); - hsaxpy(n, a, x, y); - clock_gettime(CLOCK_REALTIME, rt + 1); - wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); - printf("saxpy on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt)); + for (ial = 0; ial < 2; ++ial) { + /* + * See hsaxpy.c for details: + * + * ial: + * + * 0: naive implementation + * otherwise: saxpy in MKL + */ + memcpy(yaccl, y, nbytes); + wtcalc = -1.0; + // skip 1st run for timing + hsaxpy(n, a, x, yaccl, ial); + // check yaccl + maxabserr = -1.0f; + for (i = 0; i < n; ++i) { + maxabserr = fabsf(yaccl[i] - yhost[i]) > maxabserr? + fabsf(yaccl[i] - yhost[i]) : maxabserr; + } + // skip 2nd run for timing + hsaxpy(n, a, x, yaccl, ial); + // timing : start + wtcalc = 0.0; + clock_gettime(CLOCK_REALTIME, rt + 0); + for (int ilup = 0; ilup < NLUP; ++ilup) { + hsaxpy(n, a, x, yaccl, ial); + } + clock_gettime(CLOCK_REALTIME, rt + 1); + wt=(rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); + printf("saxpy on host (%d) : %9.1f MB/s %9.1f MB/s maxabserr = %9.1f\n", + ial, NLUP * 3.0 * nbytes / ((1 << 20) * wt), + NLUP * 3.0 * nbytes / ((1 << 20) * wtcalc), maxabserr); + } /* - * saxpy on accel + * saxpy on accl */ - clock_gettime(CLOCK_REALTIME, rt + 0); - asaxpy(n, a, x, z); - clock_gettime(CLOCK_REALTIME, rt + 1); - wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); - printf("saxpy on accel: %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt)); + for (ial = 1; ial < 7; ++ial) { + /* + * See asaxpy.c for details: + * + * ial: + * + * 0: <<< 1, 1>>>, TOO SLOW! not tested + * 1: <<< 1, 128>>> + * 2: <<< 128, 1>>> + * 3: <<< 128, 128>>> + * 4: <<>> + * 5: <<>>, 16x loop unrolling + * otherwise: cublasSaxpy in CUBLAS + */ + memcpy(yaccl, y, nbytes); + wtcalc = -1.0; + // skip 1st run for timing + asaxpy(n, a, x, yaccl, ial); + // check yaccl + maxabserr = -1.0f; + for (i = 0; i < n; ++i) { + maxabserr = fabsf(yaccl[i] - yhost[i]) > maxabserr? + fabsf(yaccl[i] - yhost[i]) : maxabserr; + } + // skip 2nd run for timing + asaxpy(n, a, x, yaccl, ial); + // timing : start + wtcalc = 0.0; + clock_gettime(CLOCK_REALTIME, rt + 0); + for (int ilup = 0; ilup < NLUP; ++ilup) { + asaxpy(n, a, x, yaccl, ial); + } + clock_gettime(CLOCK_REALTIME, rt + 1); + wt=(rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); + printf("saxpy on accl (%d) : %9.1f MB/s %9.1f MB/s maxabserr = %9.1f\n", + ial, NLUP * 3.0 * nbytes / ((1 << 20) * wt), + NLUP * 3.0 * nbytes / ((1 << 20) * wtcalc), maxabserr); + } /* - * check whether y[] == z[] _exactly_ + * release memory */ - for (i = 0; i < n; i++) { - iret = *(int *) (y + i) ^ *(int *) (z + i); - assert(iret == 0); - } + mkl_free(x); mkl_free(y); + mkl_free(yhost); mkl_free(yaccl); return 0; } diff --git a/05_saxpy/src/wtcalc.c b/05_saxpy/src/wtcalc.c new file mode 100644 index 0000000..bb00340 --- /dev/null +++ b/05_saxpy/src/wtcalc.c @@ -0,0 +1,21 @@ +/** + * @file wtcalc.c + * + * @brief Global variable for walltime of the calculation kernel. + * + * @author Xin Wu (PC²) + * @date 05.04.2020 + * @copyright CC BY-SA 2.0 + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "wtcalc.h" + +double wtcalc; + +#ifdef __cplusplus +} +#endif diff --git a/05_saxpy/src/wtcalc.h b/05_saxpy/src/wtcalc.h new file mode 100644 index 0000000..483951e --- /dev/null +++ b/05_saxpy/src/wtcalc.h @@ -0,0 +1,30 @@ +/** + * @file wtcalc.h + * + * @brief Global variable for walltime of the calculation kernel. + * + * @author Xin Wu (PC²) + * @date 05.04.2020 + * @copyright CC BY-SA 2.0 + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef WATCALC_H +#define WATCALC_H + +/* + * wtcalc: walltime for the calculation kernel + * + * - wtcalc < 0.0: reset and disable the timer + * - wtcalc == 0.0: enable the timer + */ +extern double wtcalc; + +#endif + +#ifdef __cplusplus +} +#endif diff --git a/05_saxpy/tests/saxpy_real_00.sh b/05_saxpy/tests/saxpy_real_00.sh index a39dd00..6255fd4 100755 --- a/05_saxpy/tests/saxpy_real_00.sh +++ b/05_saxpy/tests/saxpy_real_00.sh @@ -1,8 +1,8 @@ #!/bin/bash #CCS -N saxpy -#CCS -t 10m +#CCS -t 600m #CCS -g pc2-mitarbeiter -#CCS --res=rset=1:ncpus=1:mem=4g:vmem=8g:tesla=1 +#CCS --res=rset=1:gtx1080=1,place=:excl echo "hallo from $(hostname)" ../src/saxpy diff --git a/05_saxpy/tests/saxpy_real_00.sh.5383621.out b/05_saxpy/tests/saxpy_real_00.sh.5383621.out new file mode 100644 index 0000000..2904da0 --- /dev/null +++ b/05_saxpy/tests/saxpy_real_00.sh.5383621.out @@ -0,0 +1,12 @@ +hallo from gpu011 +The system supports 1 ns time resolution +total size of x and y is 32.0 MB +tests are averaged over 32 loops +saxpy on host (0) : 43431.2 MB/s 43438.4 MB/s maxabserr = 0.0 +saxpy on host (1) : 446622.5 MB/s 447142.8 MB/s maxabserr = 0.0 +saxpy on accl (1) : 75.2 MB/s 78.6 MB/s maxabserr = 0.0 +saxpy on accl (2) : 75.7 MB/s 79.0 MB/s maxabserr = 0.0 +saxpy on accl (3) : 1945.7 MB/s 6071.8 MB/s maxabserr = 0.0 +saxpy on accl (4) : 1689.3 MB/s 4074.8 MB/s maxabserr = 0.0 +saxpy on accl (5) : 2644.2 MB/s 30438.5 MB/s maxabserr = 0.0 +saxpy on accl (6) : 2762.5 MB/s 127182.3 MB/s maxabserr = 0.0