Skip to content

Commit

Permalink
improve 05_saxpy
Browse files Browse the repository at this point in the history
  • Loading branch information
xwuupb committed Apr 6, 2020
1 parent 19c425d commit 1b2b8c3
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 48 deletions.
18 changes: 10 additions & 8 deletions 05_saxpy/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ where:

* `a` is a scalar.
* `x` and `y` are single-precision vectors each with n elements.
* For testing n is assumed to be $2^{22}$.
* For testing n is assumed to be $2^{26}$.
* The following table summarizes only the most important points. For more
details on the implementation selected by each `ial` value, see the comments
in `hsaxpy.c` (on host) and `asaxpy.c` (on accelerator).
Expand All @@ -33,13 +33,15 @@ where:

| ial | Remarks |
|:---:|------------------------------------------------------------------------|
| 0 | <<< 1, 1>>>, TOO SLOW! not tested |
| 1 | <<< 1, 128>>> |
| 2 | <<< 128, 1>>> |
| 3 | <<< 128, 128>>> |
| 4 | <<<n / 128, 128>>> |
| 5 | <<<n / (128 * 16), 128>>>, 16x loop unrolling |
| 6 | cublasSaxpy in CUBLAS |
| 0 | <<<2^0 , 2^0 >>>, TOO SLOW! not tested |
| 1 | <<<2^0 , 2^7 >>>, auto scheduling |
| 2 | <<<2^7 , 2^0 >>>, auto scheduling |
| 3 | <<<2^7 , 2^7 >>>, auto scheduling |
| 4 | <<<2^16, 2^10>>>, manual scheduling |
| 5 | <<<2^15, 2^7 >>>, manual scheduling, 16x loop unrolling |
| | (2^15*2^7*16==2^26) |
| 6 | <<<2^12, 2^7 >>>, auto scheduling, 16x loop unrolling |
| 7 | cublasSaxpy in CUBLAS |

# Build

Expand Down
18 changes: 10 additions & 8 deletions 05_saxpy/docs/UserManual.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ where:

* `a` is a scalar.
* `x` and `y` are single-precision vectors each with n elements.
* For testing n is assumed to be $2^{22}$.
* For testing n is assumed to be $2^{26}$.
* The following table summarizes only the most important points. For more
details on the implementation selected by each `ial` value, see the comments
in `hsaxpy.c` (on host) and `asaxpy.c` (on accelerator).
Expand All @@ -33,13 +33,15 @@ where:

| ial | Remarks |
|:---:|------------------------------------------------------------------------|
| 0 | <<< 1, 1>>>, TOO SLOW! not tested |
| 1 | <<< 1, 128>>> |
| 2 | <<< 128, 1>>> |
| 3 | <<< 128, 128>>> |
| 4 | <<<n / 128, 128>>> |
| 5 | <<<n / (128 * 16), 128>>>, 16x loop unrolling |
| 6 | cublasSaxpy in CUBLAS |
| 0 | <<<2^0 , 2^0 >>>, TOO SLOW! not tested |
| 1 | <<<2^0 , 2^7 >>>, auto scheduling |
| 2 | <<<2^7 , 2^0 >>>, auto scheduling |
| 3 | <<<2^7 , 2^7 >>>, auto scheduling |
| 4 | <<<2^16, 2^10>>>, manual scheduling |
| 5 | <<<2^15, 2^7 >>>, manual scheduling, 16x loop unrolling |
| | (2^15*2^7*16==2^26) |
| 6 | <<<2^12, 2^7 >>>, auto scheduling, 16x loop unrolling |
| 7 | cublasSaxpy in CUBLAS |

# Usage

Expand Down
56 changes: 45 additions & 11 deletions 05_saxpy/src/asaxpy.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ void asaxpy(const int n,
switch (ial) {
case 0:
/*
* - <<<1, 1>>>
* - <<<2^0 , 2^0 >>>, TOO SLOW! not tested
*/
#pragma omp target data device(0) \
map(to:a, n, x[0:n]) map(tofrom:y[0:n])
Expand All @@ -68,7 +68,7 @@ for (int i = 0; i < n; ++i) {
break;
case 1:
/*
* - <<<1, 128>>>
* - <<<2^0 , 2^7 >>>, auto scheduling
*/
#pragma omp target data device(0) \
map(to:a, n, x[0:n]) map(tofrom:y[0:n])
Expand All @@ -88,7 +88,7 @@ for (int i = 0; i < n; ++i) {
break;
case 2:
/*
* - <<<128, 1>>>
* - <<<2^7 , 2^0 >>>, auto scheduling
*/
#pragma omp target data device(0) \
map(to:a, n, x[0:n]) map(tofrom:y[0:n])
Expand All @@ -108,7 +108,7 @@ for (int i = 0; i < n; ++i) {
break;
case 3:
/*
* - <<<128, 128>>>
* - <<<2^7 , 2^7 >>>, auto scheduling
*/
#pragma omp target data device(0) \
map(to:a, n, x[0:n]) map(tofrom:y[0:n])
Expand All @@ -128,17 +128,17 @@ for (int i = 0; i < n; ++i) {
break;
case 4:
/*
* - <<<n / 128, 128>>>
* - <<<2^16, 2^10>>>, manual scheduling
*/
#pragma omp target data device(0) \
map(to:a, n, x[0:n]) map(tofrom:y[0:n])
{
clock_gettime(CLOCK_REALTIME, rt + 0);
#pragma omp target teams device(0) num_teams((1 << 15)) \
#pragma omp target teams device(0) num_teams(65536) \
map(to:a, n, x[0:n]) map(tofrom:y[0:n]) \
default(none) shared(a, n, x, y)
#pragma omp distribute parallel for num_threads(128) \
dist_schedule(static, 128) \
#pragma omp distribute parallel for num_threads(1024) \
dist_schedule(static, 1024) \
default(none) shared(a, n, x, y)
for (int i = 0; i < n; ++i) {
y[i] = a * x[i] + y[i];
Expand All @@ -148,14 +148,48 @@ for (int i = 0; i < n; ++i) {
break;
case 5:
/*
* - <<<n / (128 * 16), 128>>>
* - 16x loop-unrolling
* - <<<2^15, 2^7 >>>, manual scheduling, 16x loop unrolling (2^15*2^7*16==2^26)
*/
#pragma omp target data device(0) \
map(to:a, m, x[0:n]) map(tofrom:y[0:n])
{
clock_gettime(CLOCK_REALTIME, rt + 0);
#pragma omp target teams device(0) num_teams(32768) \
map(to:a, m, x[0:n]) map(tofrom:y[0:n]) \
default(none) shared(a, m, x, y)
#pragma omp distribute parallel for num_threads(128) \
dist_schedule(static, 128) \
default(none) shared(a, m, x, y)
for (int i = 0; i < m; ++i) {
y[i ] = a * x[i ] + y[i ];
y[i + m] = a * x[i + m] + y[i + m];
y[i + 0x2 * m] = a * x[i + 0x2 * m] + y[i + 0x2 * m];
y[i + 0x3 * m] = a * x[i + 0x3 * m] + y[i + 0x3 * m];
y[i + 0x4 * m] = a * x[i + 0x4 * m] + y[i + 0x4 * m];
y[i + 0x5 * m] = a * x[i + 0x5 * m] + y[i + 0x5 * m];
y[i + 0x6 * m] = a * x[i + 0x6 * m] + y[i + 0x6 * m];
y[i + 0x7 * m] = a * x[i + 0x7 * m] + y[i + 0x7 * m];
y[i + 0x8 * m] = a * x[i + 0x8 * m] + y[i + 0x8 * m];
y[i + 0x9 * m] = a * x[i + 0x9 * m] + y[i + 0x9 * m];
y[i + 0xa * m] = a * x[i + 0xa * m] + y[i + 0xa * m];
y[i + 0xb * m] = a * x[i + 0xb * m] + y[i + 0xb * m];
y[i + 0xc * m] = a * x[i + 0xc * m] + y[i + 0xc * m];
y[i + 0xd * m] = a * x[i + 0xd * m] + y[i + 0xd * m];
y[i + 0xe * m] = a * x[i + 0xe * m] + y[i + 0xe * m];
y[i + 0xf * m] = a * x[i + 0xf * m] + y[i + 0xf * m];
}
clock_gettime(CLOCK_REALTIME, rt + 1);
}
break;
case 6:
/*
* - <<<2^12, 2^7 >>>, auto scheduling, 16x loop unrolling
*/
#pragma omp target data device(0) \
map(to:a, m, x[0:n]) map(tofrom:y[0:n])
{
clock_gettime(CLOCK_REALTIME, rt + 0);
#pragma omp target teams device(0) num_teams(2048) \
#pragma omp target teams device(0) num_teams(4096) \
map(to:a, m, x[0:n]) map(tofrom:y[0:n]) \
default(none) shared(a, m, x, y)
#pragma omp distribute parallel for num_threads(128) \
Expand Down
19 changes: 10 additions & 9 deletions 05_saxpy/src/saxpy.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
#include "check1ns.h"
#include "wtcalc.h"

#define TWO22 (1 << 22)
#define TWO26 (1 << 26)
#define NLUP (32)

/**
Expand Down Expand Up @@ -69,7 +69,7 @@ int main(int argc, char *argv[])
/*
* preparation
*/
n = TWO22;
n = TWO26;
nbytes = sizeof(float) * n;
iret = 0;
if (NULL == (x = (float *) mkl_malloc(nbytes, (16 * 256)))) iret = -1;
Expand Down Expand Up @@ -131,18 +131,19 @@ int main(int argc, char *argv[])
/*
* saxpy on accl
*/
for (ial = 1; ial < 7; ++ial) {
for (ial = 1; ial < 8; ++ial) {
/*
* See asaxpy.c for details:
*
* ial:
*
* 0: <<< 1, 1>>>, TOO SLOW! not tested
* 1: <<< 1, 128>>>
* 2: <<< 128, 1>>>
* 3: <<< 128, 128>>>
* 4: <<<n / 128, 128>>>
* 5: <<<n / (128 * 16), 128>>>, 16x loop unrolling
* 0: <<<2^0 , 2^0 >>>, TOO SLOW! not tested
* 1: <<<2^0 , 2^7 >>>, auto scheduling
* 2: <<<2^7 , 2^0 >>>, auto scheduling
* 3: <<<2^7 , 2^7 >>>, auto scheduling
* 4: <<<2^16, 2^10>>>, manual scheduling
* 5: <<<2^15, 2^7 >>>, manual scheduling, 16x loop unrolling (2^15*2^7*16==2^26)
* 6: <<<2^12, 2^7 >>>, auto scheduling, 16x loop unrolling
* otherwise: cublasSaxpy in CUBLAS
*/
memcpy(yaccl, y, nbytes);
Expand Down
12 changes: 0 additions & 12 deletions 05_saxpy/tests/saxpy_real_00.sh.5383621.out

This file was deleted.

13 changes: 13 additions & 0 deletions 05_saxpy/tests/saxpy_real_00.sh.5385100.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
hallo from gpu026
The system supports 1 ns time resolution
total size of x and y is 512.0 MB
tests are averaged over 32 loops
saxpy on host (0) : 41473.2 MB/s 41474.3 MB/s maxabserr = 0.0
saxpy on host (1) : 66632.2 MB/s 66635.3 MB/s maxabserr = 0.0
saxpy on accl (1) : 76.0 MB/s 78.7 MB/s maxabserr = 0.0
saxpy on accl (2) : 76.3 MB/s 79.0 MB/s maxabserr = 0.0
saxpy on accl (3) : 1635.3 MB/s 6457.5 MB/s maxabserr = 0.0
saxpy on accl (4) : 1417.9 MB/s 3956.9 MB/s maxabserr = 0.0
saxpy on accl (5) : 1954.3 MB/s 45380.7 MB/s maxabserr = 0.0
saxpy on accl (6) : 1938.4 MB/s 67113.8 MB/s maxabserr = 0.0
saxpy on accl (7) : 1984.3 MB/s 282336.0 MB/s maxabserr = 0.0

0 comments on commit 1b2b8c3

Please sign in to comment.