From 1b2b8c3900ec4c4d72396969911e5778dbefcd07 Mon Sep 17 00:00:00 2001 From: xwuupb Date: Mon, 6 Apr 2020 13:21:31 +0200 Subject: [PATCH] improve 05_saxpy --- 05_saxpy/README.md | 18 ++++--- 05_saxpy/docs/UserManual.md | 18 ++++--- 05_saxpy/src/asaxpy.c | 56 +++++++++++++++++---- 05_saxpy/src/saxpy.c | 19 +++---- 05_saxpy/tests/saxpy_real_00.sh.5383621.out | 12 ----- 05_saxpy/tests/saxpy_real_00.sh.5385100.out | 13 +++++ 6 files changed, 88 insertions(+), 48 deletions(-) delete mode 100644 05_saxpy/tests/saxpy_real_00.sh.5383621.out create mode 100644 05_saxpy/tests/saxpy_real_00.sh.5385100.out diff --git a/05_saxpy/README.md b/05_saxpy/README.md index f4d543e..eec437e 100644 --- a/05_saxpy/README.md +++ b/05_saxpy/README.md @@ -17,7 +17,7 @@ where: * `a` is a scalar. * `x` and `y` are single-precision vectors each with n elements. -* For testing n is assumed to be $2^{22}$. +* For testing n is assumed to be $2^{26}$. * The following table only summarizes the most important points. For more details on the ial-th implementation see comments in `hsaxpy.c` (on host) and `asaxpy.c` (on accelerator). @@ -33,13 +33,15 @@ where: | ial | Remarks | |:---:|------------------------------------------------------------------------| -| 0 | <<< 1, 1>>>, TOO SLOW! not tested | -| 1 | <<< 1, 128>>> | -| 2 | <<< 128, 1>>> | -| 3 | <<< 128, 128>>> | -| 4 | <<>> | -| 5 | <<>>, 16x loop unrolling | -| 6 | cublasSaxpy in CUBLAS | +| 0 | <<<2^0 , 2^0 >>>, TOO SLOW! not tested | +| 1 | <<<2^0 , 2^7 >>>, auto scheduling | +| 2 | <<<2^7 , 2^0 >>>, auto scheduling | +| 3 | <<<2^7 , 2^7 >>>, auto scheduling | +| 4 | <<<2^16, 2^10>>>, manual scheduling | +| 5 | <<<2^15, 2^7 >>>, manual scheduling, 16x loop unrolling | +| | (2^15*2^7*16==2^26) | +| 6 | <<<2^12, 2^7 >>>, auto scheduling, 16x loop unrolling | +| 7 | cublasSaxpy in CUBLAS | # Build diff --git a/05_saxpy/docs/UserManual.md b/05_saxpy/docs/UserManual.md index c989f62..b9a68fc 100644 --- a/05_saxpy/docs/UserManual.md +++ b/05_saxpy/docs/UserManual.md @@ -17,7 +17,7 @@ where: * `a` is a scalar. * `x` and `y` are single-precision vectors each with n elements. -* For testing n is assumed to be $2^{22}$. +* For testing n is assumed to be $2^{26}$. * The following table only summarizes the most important points. For more details on the ial-th implementation see comments in `hsaxpy.c` (on host) and `asaxpy.c` (on accelerator). @@ -33,13 +33,15 @@ where: | ial | Remarks | |:---:|------------------------------------------------------------------------| -| 0 | <<< 1, 1>>>, TOO SLOW! not tested | -| 1 | <<< 1, 128>>> | -| 2 | <<< 128, 1>>> | -| 3 | <<< 128, 128>>> | -| 4 | <<>> | -| 5 | <<>>, 16x loop unrolling | -| 6 | cublasSaxpy in CUBLAS | +| 0 | <<<2^0 , 2^0 >>>, TOO SLOW! not tested | +| 1 | <<<2^0 , 2^7 >>>, auto scheduling | +| 2 | <<<2^7 , 2^0 >>>, auto scheduling | +| 3 | <<<2^7 , 2^7 >>>, auto scheduling | +| 4 | <<<2^16, 2^10>>>, manual scheduling | +| 5 | <<<2^15, 2^7 >>>, manual scheduling, 16x loop unrolling | +| | (2^15*2^7*16==2^26) | +| 6 | <<<2^12, 2^7 >>>, auto scheduling, 16x loop unrolling | +| 7 | cublasSaxpy in CUBLAS | # Usage diff --git a/05_saxpy/src/asaxpy.c b/05_saxpy/src/asaxpy.c index 9c53b80..0a05ccf 100644 --- a/05_saxpy/src/asaxpy.c +++ b/05_saxpy/src/asaxpy.c @@ -48,7 +48,7 @@ void asaxpy(const int n, switch (ial) { case 0: /* - * - <<<1, 1>>> + * - <<<2^0 , 2^0 >>>, TOO SLOW! not tested */ #pragma omp target data device(0) \ map(to:a, n, x[0:n]) map(tofrom:y[0:n]) @@ -68,7 +68,7 @@ for (int i = 0; i < n; ++i) { break; case 1: /* - * - <<<1, 128>>> + * - <<<2^0 , 2^7 >>>, auto scheduling */ #pragma omp target data device(0) \ map(to:a, n, x[0:n]) map(tofrom:y[0:n]) @@ -88,7 +88,7 @@ for (int i = 0; i < n; ++i) { break; case 2: /* - * - <<<128, 1>>> + * - <<<2^7 , 2^0 >>>, auto scheduling */ #pragma omp target data device(0) \ map(to:a, n, x[0:n]) map(tofrom:y[0:n]) @@ -108,7 +108,7 @@ for (int i = 0; i < n; ++i) { break; case 3: /* - * - <<<128, 128>>> + * - <<<2^7 , 2^7 >>>, auto scheduling */ #pragma omp target data device(0) \ map(to:a, n, x[0:n]) map(tofrom:y[0:n]) @@ -128,17 +128,17 @@ for (int i = 0; i < n; ++i) { break; case 4: /* - * - <<>> + * - <<<2^16, 2^10>>>, manual scheduling */ #pragma omp target data device(0) \ map(to:a, n, x[0:n]) map(tofrom:y[0:n]) { clock_gettime(CLOCK_REALTIME, rt + 0); -#pragma omp target teams device(0) num_teams((1 << 15)) \ +#pragma omp target teams device(0) num_teams(65536) \ map(to:a, n, x[0:n]) map(tofrom:y[0:n]) \ default(none) shared(a, n, x, y) -#pragma omp distribute parallel for num_threads(128) \ - dist_schedule(static, 128) \ +#pragma omp distribute parallel for num_threads(1024) \ + dist_schedule(static, 1024) \ default(none) shared(a, n, x, y) for (int i = 0; i < n; ++i) { y[i] = a * x[i] + y[i]; @@ -148,14 +148,48 @@ for (int i = 0; i < n; ++i) { break; case 5: /* - * - <<>> - * - 16x loop-unrolling + * - <<<2^15, 2^7 >>>, manual scheduling, 16x loop unrolling (2^15*2^7*16==2^26) + */ +#pragma omp target data device(0) \ + map(to:a, m, x[0:n]) map(tofrom:y[0:n]) +{ + clock_gettime(CLOCK_REALTIME, rt + 0); +#pragma omp target teams device(0) num_teams(32768) \ + map(to:a, m, x[0:n]) map(tofrom:y[0:n]) \ + default(none) shared(a, m, x, y) +#pragma omp distribute parallel for num_threads(128) \ + dist_schedule(static, 128) \ + default(none) shared(a, m, x, y) +for (int i = 0; i < m; ++i) { + y[i ] = a * x[i ] + y[i ]; + y[i + m] = a * x[i + m] + y[i + m]; + y[i + 0x2 * m] = a * x[i + 0x2 * m] + y[i + 0x2 * m]; + y[i + 0x3 * m] = a * x[i + 0x3 * m] + y[i + 0x3 * m]; + y[i + 0x4 * m] = a * x[i + 0x4 * m] + y[i + 0x4 * m]; + y[i + 0x5 * m] = a * x[i + 0x5 * m] + y[i + 0x5 * m]; + y[i + 0x6 * m] = a * x[i + 0x6 * m] + y[i + 0x6 * m]; + y[i + 0x7 * m] = a * x[i + 0x7 * m] + y[i + 0x7 * m]; + y[i + 0x8 * m] = a * x[i + 0x8 * m] + y[i + 0x8 * m]; + y[i + 0x9 * m] = a * x[i + 0x9 * m] + y[i + 0x9 * m]; + y[i + 0xa * m] = a * x[i + 0xa * m] + y[i + 0xa * m]; + y[i + 0xb * m] = a * x[i + 0xb * m] + y[i + 0xb * m]; + y[i + 0xc * m] = a * x[i + 0xc * m] + y[i + 0xc * m]; + y[i + 0xd * m] = a * x[i + 0xd * m] + y[i + 0xd * m]; + y[i + 0xe * m] = a * x[i + 0xe * m] + y[i + 0xe * m]; + y[i + 0xf * m] = a * x[i + 0xf * m] + y[i + 0xf * m]; +} + clock_gettime(CLOCK_REALTIME, rt + 1); +} + break; + case 6: +/* + * - <<<2^12, 2^7 >>>, auto scheduling, 16x loop unrolling */ #pragma omp target data device(0) \ map(to:a, m, x[0:n]) map(tofrom:y[0:n]) { clock_gettime(CLOCK_REALTIME, rt + 0); -#pragma omp target teams device(0) num_teams(2048) \ +#pragma omp target teams device(0) num_teams(4096) \ map(to:a, m, x[0:n]) map(tofrom:y[0:n]) \ default(none) shared(a, m, x, y) #pragma omp distribute parallel for num_threads(128) \ diff --git a/05_saxpy/src/saxpy.c b/05_saxpy/src/saxpy.c index 2ff3d25..c184fc6 100644 --- a/05_saxpy/src/saxpy.c +++ b/05_saxpy/src/saxpy.c @@ -34,7 +34,7 @@ #include "check1ns.h" #include "wtcalc.h" -#define TWO22 (1 << 22) +#define TWO26 (1 << 26) #define NLUP (32) /** @@ -69,7 +69,7 @@ int main(int argc, char *argv[]) /* * preparation */ - n = TWO22; + n = TWO26; nbytes = sizeof(float) * n; iret = 0; if (NULL == (x = (float *) mkl_malloc(nbytes, (16 * 256)))) iret = -1; @@ -131,18 +131,19 @@ int main(int argc, char *argv[]) /* * saxpy on accl */ - for (ial = 1; ial < 7; ++ial) { + for (ial = 1; ial < 8; ++ial) { /* * See asaxpy.c for details: * * ial: * - * 0: <<< 1, 1>>>, TOO SLOW! not tested - * 1: <<< 1, 128>>> - * 2: <<< 128, 1>>> - * 3: <<< 128, 128>>> - * 4: <<>> - * 5: <<>>, 16x loop unrolling + * 0: <<<2^0 , 2^0 >>>, TOO SLOW! not tested + * 1: <<<2^0 , 2^7 >>>, auto scheduling + * 2: <<<2^7 , 2^0 >>>, auto scheduling + * 3: <<<2^7 , 2^7 >>>, auto scheduling + * 4: <<<2^16, 2^10>>>, manual scheduling + * 5: <<<2^15, 2^7 >>>, manual scheduling, 16x loop unrolling (2^15*2^7*16==2^26) + * 6: <<<2^12, 2^7 >>>, auto scheduling, 16x loop unrolling * otherwise: cublasSaxpy in CUBLAS */ memcpy(yaccl, y, nbytes); diff --git a/05_saxpy/tests/saxpy_real_00.sh.5383621.out b/05_saxpy/tests/saxpy_real_00.sh.5383621.out deleted file mode 100644 index 2904da0..0000000 --- a/05_saxpy/tests/saxpy_real_00.sh.5383621.out +++ /dev/null @@ -1,12 +0,0 @@ -hallo from gpu011 -The system supports 1 ns time resolution -total size of x and y is 32.0 MB -tests are averaged over 32 loops -saxpy on host (0) : 43431.2 MB/s 43438.4 MB/s maxabserr = 0.0 -saxpy on host (1) : 446622.5 MB/s 447142.8 MB/s maxabserr = 0.0 -saxpy on accl (1) : 75.2 MB/s 78.6 MB/s maxabserr = 0.0 -saxpy on accl (2) : 75.7 MB/s 79.0 MB/s maxabserr = 0.0 -saxpy on accl (3) : 1945.7 MB/s 6071.8 MB/s maxabserr = 0.0 -saxpy on accl (4) : 1689.3 MB/s 4074.8 MB/s maxabserr = 0.0 -saxpy on accl (5) : 2644.2 MB/s 30438.5 MB/s maxabserr = 0.0 -saxpy on accl (6) : 2762.5 MB/s 127182.3 MB/s maxabserr = 0.0 diff --git a/05_saxpy/tests/saxpy_real_00.sh.5385100.out b/05_saxpy/tests/saxpy_real_00.sh.5385100.out new file mode 100644 index 0000000..5d21ff8 --- /dev/null +++ b/05_saxpy/tests/saxpy_real_00.sh.5385100.out @@ -0,0 +1,13 @@ +hallo from gpu026 +The system supports 1 ns time resolution +total size of x and y is 512.0 MB +tests are averaged over 32 loops +saxpy on host (0) : 41473.2 MB/s 41474.3 MB/s maxabserr = 0.0 +saxpy on host (1) : 66632.2 MB/s 66635.3 MB/s maxabserr = 0.0 +saxpy on accl (1) : 76.0 MB/s 78.7 MB/s maxabserr = 0.0 +saxpy on accl (2) : 76.3 MB/s 79.0 MB/s maxabserr = 0.0 +saxpy on accl (3) : 1635.3 MB/s 6457.5 MB/s maxabserr = 0.0 +saxpy on accl (4) : 1417.9 MB/s 3956.9 MB/s maxabserr = 0.0 +saxpy on accl (5) : 1954.3 MB/s 45380.7 MB/s maxabserr = 0.0 +saxpy on accl (6) : 1938.4 MB/s 67113.8 MB/s maxabserr = 0.0 +saxpy on accl (7) : 1984.3 MB/s 282336.0 MB/s maxabserr = 0.0