diff --git a/09_matAdd/README.md b/09_matAdd/README.md
index 76077ce..ccd58e7 100644
--- a/09_matAdd/README.md
+++ b/09_matAdd/README.md
@@ -28,11 +28,12 @@ the numerical results are also verified.
 | 3 | ji-loop, 2^9 threads * 2^f teams, collapse(2), |
 |   | coalesced memory access                        |
 | 4 | ji-loop, 2^8 threads * 2^f teams, collapse(3), |
-|   | 2x i-loop unrolling                            |
-| 5 | ji-loop, 2^7 threads * 2^f teams, collapse(3), |
-|   | 4x i-loop unrolling                            |
-| 6 | ji-loop, 2^7 threads * 2^e teams, collapse(3), |
-|   | 4x i-loop unrolling, 2x j-loop unrolling       |
+|   | 2x i-loop unrolling (stride of 2^8 rows)       |
+| 5 | ji-loop, 2^8 threads * 2^f teams, collapse(2), |
+|   | 2x i-loop unrolling (stride of n/2 rows)       |
+| 6 | ji-loop, 2^8 threads * 2^e teams, collapse(3), |
+|   | 2x i-loop unrolling (stride of 2^8 rows),      |
+|   | 2x j-loop unrolling (stride of 1 col )         |
 | 7 | cublasSaxpy in CUBLAS                          |
 
 # Build
diff --git a/09_matAdd/configure.ac b/09_matAdd/configure.ac
index bcca20e..b321799 100644
--- a/09_matAdd/configure.ac
+++ b/09_matAdd/configure.ac
@@ -41,7 +41,7 @@ AC_PROG_CC([clang gcc])
 AS_IF([test "${CC}" = gcc],
       [CFLAGS="-Wall -O2 -fopenmp -foffload=nvptx-none $CFLAGS"])
 AS_IF([test "${CC}" = clang],
-      [CFLAGS="-Wall -O2 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda \
+      [CFLAGS="-Wall -Werror -O2 -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
               -Xopenmp-target -march=sm_61 $CFLAGS"])
 ##############################################################################80
 #
diff --git a/09_matAdd/docs/UserManual.md b/09_matAdd/docs/UserManual.md
index a392a83..3d6753b 100644
--- a/09_matAdd/docs/UserManual.md
+++ b/09_matAdd/docs/UserManual.md
@@ -28,11 +28,12 @@ the numerical results are also verified.
 | 3 | ji-loop, 2^9 threads * 2^f teams, collapse(2), |
 |   | coalesced memory access                        |
 | 4 | ji-loop, 2^8 threads * 2^f teams, collapse(3), |
-|   | 2x i-loop unrolling                            |
-| 5 | ji-loop, 2^7 threads * 2^f teams, collapse(3), |
-|   | 4x i-loop unrolling                            |
-| 6 | ji-loop, 2^7 threads * 2^e teams, collapse(3), |
-|   | 4x i-loop unrolling, 2x j-loop unrolling       |
+|   | 2x i-loop unrolling (stride of 2^8 rows)       |
+| 5 | ji-loop, 2^8 threads * 2^f teams, collapse(2), |
+|   | 2x i-loop unrolling (stride of n/2 rows)       |
+| 6 | ji-loop, 2^8 threads * 2^e teams, collapse(3), |
+|   | 2x i-loop unrolling (stride of 2^8 rows),      |
+|   | 2x j-loop unrolling (stride of 1 col )         |
 | 7 | cublasSaxpy in CUBLAS                          |
 
 # Usage
diff --git a/09_matAdd/src/matAdd.c b/09_matAdd/src/matAdd.c
index 382c536..c7b515a 100644
--- a/09_matAdd/src/matAdd.c
+++ b/09_matAdd/src/matAdd.c
@@ -93,11 +93,11 @@ int main(int argc, char *argv[])
  * 4: ji-loop, 2^8 threads * 2^f teams, collapse(3),
  *    2x i-loop unrolling
  *
- * 5: ji-loop, 2^7 threads * 2^f teams, collapse(3),
- *    4x i-loop unrolling
+ * 5: ji-loop, 2^8 threads * 2^f teams, collapse(2),
+ *    2x i-loop unrolling
  *
- * 6: ji-loop, 2^7 threads * 2^e teams, collapse(3),
- *    4x i-loop unrolling, 2x j-loop unrolling
+ * 6: ji-loop, 2^8 threads * 2^e teams, collapse(3),
+ *    2x i-loop unrolling, 2x j-loop unrolling
  *
  * otherwise: cublasSaxpy in CUBLAS
  */
diff --git a/09_matAdd/src/matAddAB.c b/09_matAdd/src/matAddAB.c
index 6cec8be..fa1cce0 100644
--- a/09_matAdd/src/matAddAB.c
+++ b/09_matAdd/src/matAddAB.c
@@ -21,13 +21,15 @@
 #include "cublas_v2.h"
 #include "matAddAB.h"
 
-#define NTHRDS7 (1 << 0x7)
-#define NTHRDS8 (1 << 0x8)
-#define NTHRDS9 (1 << 0x9)
+#define NTHRDS7 (1 << 0x7) /* 2^{7} */
+#define NTHRDS8 (1 << 0x8) /* 2^{8} */
+#define NTHRDS9 (1 << 0x9) /* 2^{9} */
 
-#define LTEAMSD (1 << 0xD)
-#define LTEAMSE (1 << 0xE)
-#define LTEAMSF (1 << 0xF)
+#define LTEAMSD (1 << 0xD) /* 2^{13} */
+#define LTEAMSE (1 << 0xE) /* 2^{14} */
+#define LTEAMSF (1 << 0xF) /* 2^{15} */
+
+#define BLKROW (512) /* 2x number of threads in each team */
 
 double wtcalc;
 
@@ -41,7 +43,6 @@ void matAddAB_accl(float *a,
         *a_dev = NULL,
         *b_dev = NULL;
   struct timespec rt[2];
-  int halfn = n / 2;
 
   switch (ial) {
   case 0:
@@ -54,7 +55,7 @@
     map(to:n, b[0:n * n]) map(tofrom:a[0:n * n])
   {
     clock_gettime(CLOCK_REALTIME, rt + 0);
-#pragma omp target teams device(0) num_teams(LTEAMSF) \
+#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \
     map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \
     default(none) shared(a, b, n)
 #pragma omp distribute parallel for num_threads(NTHRDS9) \
@@ -79,7 +80,7 @@ for (int j = 0; j < n; ++j) { /* sequential */
     map(to:n, b[0:n * n]) map(tofrom:a[0:n * n])
   {
     clock_gettime(CLOCK_REALTIME, rt + 0);
-#pragma omp target teams device(0) num_teams(LTEAMSF) \
+#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \
     map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \
     default(none) shared(a, b, n)
 #pragma omp distribute parallel for num_threads(NTHRDS9) \
@@ -105,7 +106,7 @@ for (int i = 0; i < n; ++i) { /* sequential */
     map(to:n, b[0:n * n]) map(tofrom:a[0:n * n])
   {
     clock_gettime(CLOCK_REALTIME, rt + 0);
-#pragma omp target teams device(0) num_teams(LTEAMSF) \
+#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \
     map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \
     default(none) shared(a, b, n)
 #pragma omp distribute parallel for num_threads(NTHRDS9) \
@@ -130,7 +131,7 @@ for (int j = 0; j < n; ++j) {
     map(to:n, b[0:n * n]) map(tofrom:a[0:n * n])
   {
     clock_gettime(CLOCK_REALTIME, rt + 0);
-#pragma omp target teams device(0) num_teams(LTEAMSF) \
+#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \
     map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \
     default(none) shared(a, b, n)
 #pragma omp distribute parallel for num_threads(NTHRDS9) \
@@ -149,25 +150,25 @@ for (int i = 0; i < n; ++i) {
    * - ji-loop
    * - 2^8 threads per team and 2^f teams
    * - collapse(3)
-   * - 2x i-loop unrolling by number of threads
+   * - 2x i-loop unrolling (stride of 2^8 rows)
    */
 #pragma omp target data device(0) \
     map(to:n, b[0:n * n]) map(tofrom:a[0:n * n])
   {
     clock_gettime(CLOCK_REALTIME, rt + 0);
-#pragma omp target teams device(0) num_teams(LTEAMSF) \
+#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS8) \
     map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \
     default(none) shared(a, b, n)
 #pragma omp distribute parallel for num_threads(NTHRDS8) \
     dist_schedule(static, NTHRDS8) collapse(3) \
     default(none) shared(a, b, n)
 for (int j = 0; j < n; ++j) {
-for (int iblk = 0; iblk < n / NTHRDS9; ++iblk) {
-for (int i = 0; i < NTHRDS8; ++i) {
-  a[j * n + iblk * NTHRDS9 + i          ] +=
-  b[j * n + iblk * NTHRDS9 + i          ];
-  a[j * n + iblk * NTHRDS9 + i + NTHRDS8] +=
-  b[j * n + iblk * NTHRDS9 + i + NTHRDS8];
+for (int iblk = 0; iblk < n / BLKROW; ++iblk) {
+for (int i = 0; i < NTHRDS8; ++i) { /* 2x unrolling */
+  a[j * n + iblk * BLKROW + i          ] +=
+  b[j * n + iblk * BLKROW + i          ];
+  a[j * n + iblk * BLKROW + i + NTHRDS8] +=
+  b[j * n + iblk * BLKROW + i + NTHRDS8];
 } /* end i-loop */
 } /* end iblk-loop */
 } /* end j-loop */
@@ -177,35 +178,27 @@
   case 5:
   /*
    * - ji-loop
-   * - 2^7 threads per team and 2^f teams
-   * - collapse(3)
-   * - 4x i-loop unrolling
-   *   * 2x by number of threads
-   *   * 2x by half of rows
+   * - 2^8 threads per team and 2^f teams
+   * - collapse(2)
+   * - 2x i-loop unrolling (stride of n/2 rows)
    */
 #pragma omp target data device(0) \
-    map(to:n, halfn, b[0:n * n]) map(tofrom:a[0:n * n])
+    map(to:n, b[0:n * n]) map(tofrom:a[0:n * n])
   {
     clock_gettime(CLOCK_REALTIME, rt + 0);
-#pragma omp target teams device(0) num_teams(LTEAMSF) \
-    map(to:n, halfn, b[0:n * n]) map(tofrom:a[0:n * n]) \
-    default(none) shared(a, b, n, halfn)
-#pragma omp distribute parallel for num_threads(NTHRDS7) \
-    dist_schedule(static, NTHRDS7) collapse(3) \
-    default(none) shared(a, b, n, halfn)
+#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS8) \
+    map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \
+    default(none) shared(a, b, n)
+#pragma omp distribute parallel for num_threads(NTHRDS8) \
+    dist_schedule(static, NTHRDS8) collapse(2) \
+    default(none) shared(a, b, n)
 for (int j = 0; j < n; ++j) {
-for (int iblk = 0; iblk < n / NTHRDS9; ++iblk) {
-for (int i = 0; i < NTHRDS7; ++i) {
-  a[j * n + iblk * NTHRDS8 + i                  ] +=
-  b[j * n + iblk * NTHRDS8 + i                  ];
-  a[j * n + iblk * NTHRDS8 + i + NTHRDS7        ] +=
-  b[j * n + iblk * NTHRDS8 + i + NTHRDS7        ];
-  a[j * n + iblk * NTHRDS8 + i + halfn          ] +=
-  b[j * n + iblk * NTHRDS8 + i + halfn          ];
-  a[j * n + iblk * NTHRDS8 + i + halfn + NTHRDS7] +=
-  b[j * n + iblk * NTHRDS8 + i + halfn + NTHRDS7];
+for (int i = 0; i < (n >> 1); ++i) { /* 2x unrolling */
+  a[j * n + i           ] +=
+  b[j * n + i           ];
+  a[j * n + i + (n >> 1)] +=
+  b[j * n + i + (n >> 1)];
 } /* end i-loop */
-} /* end iblk-loop */
 } /* end j-loop */
     clock_gettime(CLOCK_REALTIME, rt + 1);
   }
@@ -213,42 +206,32 @@ for (int i = 0; i < NTHRDS7; ++i) {
   case 6:
   /*
    * - ji-loop
-   * - 2^7 threads per team and 2^e teams
+   * - 2^8 threads per team and 2^14 teams
    * - collapse(3)
-   * - 4x i-loop unrolling
-   *   * 2x by number of threads
-   *   * 2x by half of rows
-   * - 2x j-loop unrolling
+   * - 2x j-loop unrolling (stride of 1 col )
+   * - 2x i-loop unrolling (stride of 2^8 rows)
    */
 #pragma omp target data device(0) \
-    map(to:n, halfn, b[0:n * n]) map(tofrom:a[0:n * n])
+    map(to:n, b[0:n * n]) map(tofrom:a[0:n * n])
   {
     clock_gettime(CLOCK_REALTIME, rt + 0);
-#pragma omp target teams device(0) num_teams(LTEAMSE) \
-    map(to:n, halfn, b[0:n * n]) map(tofrom:a[0:n * n]) \
-    default(none) shared(a, b, n, halfn)
-#pragma omp distribute parallel for num_threads(NTHRDS7) \
-    dist_schedule(static, NTHRDS7) collapse(3) \
-    default(none) shared(a, b, n, halfn)
-for (int j = 0; j < halfn; ++j) {
-for (int iblk = 0; iblk < n / NTHRDS9; ++iblk) {
-for (int i = 0; i < NTHRDS7; ++i) {
-  a[ j * n + iblk * NTHRDS8 + i                  ] +=
-  b[ j * n + iblk * NTHRDS8 + i                  ];
-  a[ j * n + iblk * NTHRDS8 + i + NTHRDS7        ] +=
-  b[ j * n + iblk * NTHRDS8 + i + NTHRDS7        ];
-  a[ j * n + iblk * NTHRDS8 + i + halfn          ] +=
-  b[ j * n + iblk * NTHRDS8 + i + halfn          ];
-  a[ j * n + iblk * NTHRDS8 + i + halfn + NTHRDS7] +=
-  b[ j * n + iblk * NTHRDS8 + i + halfn + NTHRDS7];
-  a[(j + halfn) * n + iblk * NTHRDS8 + i                  ] +=
-  b[(j + halfn) * n + iblk * NTHRDS8 + i                  ];
-  a[(j + halfn) * n + iblk * NTHRDS8 + i + NTHRDS7        ] +=
-  b[(j + halfn) * n + iblk * NTHRDS8 + i + NTHRDS7        ];
-  a[(j + halfn) * n + iblk * NTHRDS8 + i + halfn          ] +=
-  b[(j + halfn) * n + iblk * NTHRDS8 + i + halfn          ];
-  a[(j + halfn) * n + iblk * NTHRDS8 + i + halfn + NTHRDS7] +=
-  b[(j + halfn) * n + iblk * NTHRDS8 + i + halfn + NTHRDS7];
+#pragma omp target teams device(0) num_teams(LTEAMSE) thread_limit(NTHRDS8) \
+    map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \
+    default(none) shared(a, b, n)
+#pragma omp distribute parallel for num_threads(NTHRDS8) \
+    dist_schedule(static, NTHRDS8) collapse(3) \
+    default(none) shared(a, b, n)
+for (int j = 0; j < n; j += 2) { /* 2x unrolling */
+for (int iblk = 0; iblk < n / BLKROW; ++iblk) {
+for (int i = 0; i < NTHRDS8; ++i) { /* 2x unrolling */
+  a[ j * n + iblk * BLKROW + i          ] +=
+  b[ j * n + iblk * BLKROW + i          ];
+  a[ j * n + iblk * BLKROW + i + NTHRDS8] +=
+  b[ j * n + iblk * BLKROW + i + NTHRDS8];
+  a[(j + 1) * n + iblk * BLKROW + i          ] +=
+  b[(j + 1) * n + iblk * BLKROW + i          ];
+  a[(j + 1) * n + iblk * BLKROW + i + NTHRDS8] +=
+  b[(j + 1) * n + iblk * BLKROW + i + NTHRDS8];
 } /* end i-loop */
 } /* end iblk-loop */
 } /* end j-loop */
diff --git a/09_matAdd/tests/matAdd_real_00.sh b/09_matAdd/tests/matAdd_real_00.sh
index a2037d7..9be3087 100755
--- a/09_matAdd/tests/matAdd_real_00.sh
+++ b/09_matAdd/tests/matAdd_real_00.sh
@@ -1,8 +1,9 @@
 #!/bin/bash
-#CCS -N matMul
+#CCS -N matAdd
 #CCS -t 600m
 #CCS -g pc2-mitarbeiter
 #CCS --res=rset=1:gtx1080=1,place=:excl
 
 echo "hallo from $(hostname)"
 ../src/matAdd $((2**12))
+../src/matAdd $((2**13))
diff --git a/10_matMul/src/matMulAB.c b/10_matMul/src/matMulAB.c
index 21cf6a8..22590e5 100644
--- a/10_matMul/src/matMulAB.c
+++ b/10_matMul/src/matMulAB.c
@@ -30,8 +30,7 @@
 #define LTEAMSF (1 << 0xF) /* 2^{15} */
 #define LTEAMSG (1 << 020) /* 2^{16} */
 
-#define NPITCH (1024)
-#define BLKROW (512)
+#define BLKROW (512) /* 4x number of threads in each team */
 #define BLKDIM (16)
 
 double wtcalc;
@@ -595,12 +594,12 @@ for (int i = 0; i < n / BLKDIM; ++i) {
     dist_schedule(static, NTHRDS7) collapse(2) \
     default(none) shared(a, b, c, n)
 for (int j = 0; j < n; ++j) {
-for (int i = 0; i < NPITCH; ++i) { /* 4x unrolling */
+for (int i = 0; i < (n >> 2); ++i) { /* 4x unrolling */
   float rc0, rc1, rc2, rc3;
-  rc0 = c[j * n + i             ];
-  rc1 = c[j * n + i + NPITCH    ];
-  rc2 = c[j * n + i + NPITCH * 2];
-  rc3 = c[j * n + i + NPITCH * 3];
+  rc0 = c[j * n + i               ];
+  rc1 = c[j * n + i + (n >> 2)    ];
+  rc2 = c[j * n + i + (n >> 2) * 2];
+  rc3 = c[j * n + i + (n >> 2) * 3];
   for (int k = 0; k < n; k += 4) { /* 4x unrolling */
   /* register for b: 4x k-loop */
   float rb0, rb1, rb2, rb3;
@@ -608,27 +607,27 @@ for (int i = 0; i < NPITCH; ++i) { /* 4x unrolling */
   rb1 = b[j * n + k + 1];
   rb2 = b[j * n + k + 2];
   rb3 = b[j * n + k + 3];
-  rc0 += a[ k      * n + i             ] * rb0;
-  rc0 += a[(k + 1) * n + i             ] * rb1;
-  rc0 += a[(k + 2) * n + i             ] * rb2;
-  rc0 += a[(k + 3) * n + i             ] * rb3;
-  rc1 += a[ k      * n + i + NPITCH    ] * rb0;
-  rc1 += a[(k + 1) * n + i + NPITCH    ] * rb1;
-  rc1 += a[(k + 2) * n + i + NPITCH    ] * rb2;
-  rc1 += a[(k + 3) * n + i + NPITCH    ] * rb3;
-  rc2 += a[ k      * n + i + NPITCH * 2] * rb0;
-  rc2 += a[(k + 1) * n + i + NPITCH * 2] * rb1;
-  rc2 += a[(k + 2) * n + i + NPITCH * 2] * rb2;
-  rc2 += a[(k + 3) * n + i + NPITCH * 2] * rb3;
-  rc3 += a[ k      * n + i + NPITCH * 3] * rb0;
-  rc3 += a[(k + 1) * n + i + NPITCH * 3] * rb1;
-  rc3 += a[(k + 2) * n + i + NPITCH * 3] * rb2;
-  rc3 += a[(k + 3) * n + i + NPITCH * 3] * rb3;
+  rc0 += a[ k      * n + i               ] * rb0;
+  rc0 += a[(k + 1) * n + i               ] * rb1;
+  rc0 += a[(k + 2) * n + i               ] * rb2;
+  rc0 += a[(k + 3) * n + i               ] * rb3;
+  rc1 += a[ k      * n + i + (n >> 2)    ] * rb0;
+  rc1 += a[(k + 1) * n + i + (n >> 2)    ] * rb1;
+  rc1 += a[(k + 2) * n + i + (n >> 2)    ] * rb2;
+  rc1 += a[(k + 3) * n + i + (n >> 2)    ] * rb3;
+  rc2 += a[ k      * n + i + (n >> 2) * 2] * rb0;
+  rc2 += a[(k + 1) * n + i + (n >> 2) * 2] * rb1;
+  rc2 += a[(k + 2) * n + i + (n >> 2) * 2] * rb2;
+  rc2 += a[(k + 3) * n + i + (n >> 2) * 2] * rb3;
+  rc3 += a[ k      * n + i + (n >> 2) * 3] * rb0;
+  rc3 += a[(k + 1) * n + i + (n >> 2) * 3] * rb1;
+  rc3 += a[(k + 2) * n + i + (n >> 2) * 3] * rb2;
+  rc3 += a[(k + 3) * n + i + (n >> 2) * 3] * rb3;
   }
-  c[j * n + i             ] = rc0;
-  c[j * n + i + NPITCH    ] = rc1;
-  c[j * n + i + NPITCH * 2] = rc2;
-  c[j * n + i + NPITCH * 3] = rc3;
+  c[j * n + i               ] = rc0;
+  c[j * n + i + (n >> 2)    ] = rc1;
+  c[j * n + i + (n >> 2) * 2] = rc2;
+  c[j * n + i + (n >> 2) * 3] = rc3;
 } /* end i-loop */
 } /* end j-loop */
     clock_gettime(CLOCK_REALTIME, rt + 1);