Skip to content

Commit

Permalink
improved 09_matAdd
Browse files Browse the repository at this point in the history
  • Loading branch information
xwuupb committed Apr 15, 2020
1 parent a554810 commit 94bd672
Show file tree
Hide file tree
Showing 7 changed files with 101 additions and 116 deletions.
11 changes: 6 additions & 5 deletions 09_matAdd/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,12 @@ the numerical results are also verified.
| 3 | ji-loop, 2^9 threads * 2^f teams, collapse(2), |
| | coalesced memory access |
| 4 | ji-loop, 2^8 threads * 2^f teams, collapse(3), |
| | 2x i-loop unrolling |
| 5 | ji-loop, 2^7 threads * 2^f teams, collapse(3), |
| | 4x i-loop unrolling |
| 6 | ji-loop, 2^7 threads * 2^e teams, collapse(3), |
| | 4x i-loop unrolling, 2x j-loop unrolling |
| | 2x i-loop unrolling (stride of 2^8 rows) |
| 5 | ji-loop, 2^8 threads * 2^f teams, collapse(2), |
| | 2x i-loop unrolling (stride of n/2 rows) |
| 6 | ji-loop, 2^8 threads * 2^e teams, collapse(3), |
| | 2x i-loop unrolling (stride of 2^8 rows), |
| | 2x j-loop unrolling (stride of 1 col ) |
| 7 | cublasSaxpy in CUBLAS |

# Build
Expand Down
2 changes: 1 addition & 1 deletion 09_matAdd/configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ AC_PROG_CC([clang gcc])
AS_IF([test "${CC}" = gcc],
[CFLAGS="-Wall -O2 -fopenmp -foffload=nvptx-none $CFLAGS"])
AS_IF([test "${CC}" = clang],
[CFLAGS="-Wall -O2 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda \
[CFLAGS="-Wall -Werror -O2 -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
-Xopenmp-target -march=sm_61 $CFLAGS"])
##############################################################################80
#
Expand Down
11 changes: 6 additions & 5 deletions 09_matAdd/docs/UserManual.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,12 @@ the numerical results are also verified.
| 3 | ji-loop, 2^9 threads * 2^f teams, collapse(2), |
| | coalesced memory access |
| 4 | ji-loop, 2^8 threads * 2^f teams, collapse(3), |
| | 2x i-loop unrolling |
| 5 | ji-loop, 2^7 threads * 2^f teams, collapse(3), |
| | 4x i-loop unrolling |
| 6 | ji-loop, 2^7 threads * 2^e teams, collapse(3), |
| | 4x i-loop unrolling, 2x j-loop unrolling |
| | 2x i-loop unrolling (stride of 2^8 rows) |
| 5 | ji-loop, 2^8 threads * 2^f teams, collapse(2), |
| | 2x i-loop unrolling (stride of n/2 rows) |
| 6 | ji-loop, 2^8 threads * 2^e teams, collapse(3), |
| | 2x i-loop unrolling (stride of 2^8 rows), |
| | 2x j-loop unrolling (stride of 1 col ) |
| 7 | cublasSaxpy in CUBLAS |

# Usage
Expand Down
8 changes: 4 additions & 4 deletions 09_matAdd/src/matAdd.c
Original file line number Diff line number Diff line change
Expand Up @@ -93,11 +93,11 @@ int main(int argc, char *argv[])
* 4: ji-loop, 2^8 threads * 2^f teams, collapse(3),
* 2x i-loop unrolling
*
* 5: ji-loop, 2^7 threads * 2^f teams, collapse(3),
* 4x i-loop unrolling
* 5: ji-loop, 2^8 threads * 2^f teams, collapse(2),
* 2x i-loop unrolling
*
* 6: ji-loop, 2^7 threads * 2^e teams, collapse(3),
* 4x i-loop unrolling, 2x j-loop unrolling
* 6: ji-loop, 2^8 threads * 2^e teams, collapse(3),
* 2x i-loop unrolling, 2x j-loop unrolling
*
* otherwise: cublasSaxpy in CUBLAS
*/
Expand Down
129 changes: 56 additions & 73 deletions 09_matAdd/src/matAddAB.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,15 @@
#include "cublas_v2.h"
#include "matAddAB.h"

#define NTHRDS7 (1 << 0x7)
#define NTHRDS8 (1 << 0x8)
#define NTHRDS9 (1 << 0x9)
#define NTHRDS7 (1 << 0x7) /* 2^{7} */
#define NTHRDS8 (1 << 0x8) /* 2^{8} */
#define NTHRDS9 (1 << 0x9) /* 2^{9} */

#define LTEAMSD (1 << 0xD)
#define LTEAMSE (1 << 0xE)
#define LTEAMSF (1 << 0xF)
#define LTEAMSD (1 << 0xD) /* 2^{13} */
#define LTEAMSE (1 << 0xE) /* 2^{14} */
#define LTEAMSF (1 << 0xF) /* 2^{15} */

#define BLKROW (512) /* 2x number of threads in each team */

double wtcalc;

Expand All @@ -41,7 +43,6 @@ void matAddAB_accl(float *a,
*a_dev = NULL,
*b_dev = NULL;
struct timespec rt[2];
int halfn = n / 2;

switch (ial) {
case 0:
Expand All @@ -54,7 +55,7 @@ void matAddAB_accl(float *a,
map(to:n, b[0:n * n]) map(tofrom:a[0:n * n])
{
clock_gettime(CLOCK_REALTIME, rt + 0);
#pragma omp target teams device(0) num_teams(LTEAMSF) \
#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \
map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \
default(none) shared(a, b, n)
#pragma omp distribute parallel for num_threads(NTHRDS9) \
Expand All @@ -79,7 +80,7 @@ for (int j = 0; j < n; ++j) { /* sequential */
map(to:n, b[0:n * n]) map(tofrom:a[0:n * n])
{
clock_gettime(CLOCK_REALTIME, rt + 0);
#pragma omp target teams device(0) num_teams(LTEAMSF) \
#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \
map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \
default(none) shared(a, b, n)
#pragma omp distribute parallel for num_threads(NTHRDS9) \
Expand All @@ -105,7 +106,7 @@ for (int i = 0; i < n; ++i) { /* sequential */
map(to:n, b[0:n * n]) map(tofrom:a[0:n * n])
{
clock_gettime(CLOCK_REALTIME, rt + 0);
#pragma omp target teams device(0) num_teams(LTEAMSF) \
#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \
map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \
default(none) shared(a, b, n)
#pragma omp distribute parallel for num_threads(NTHRDS9) \
Expand All @@ -130,7 +131,7 @@ for (int j = 0; j < n; ++j) {
map(to:n, b[0:n * n]) map(tofrom:a[0:n * n])
{
clock_gettime(CLOCK_REALTIME, rt + 0);
#pragma omp target teams device(0) num_teams(LTEAMSF) \
#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \
map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \
default(none) shared(a, b, n)
#pragma omp distribute parallel for num_threads(NTHRDS9) \
Expand All @@ -149,25 +150,25 @@ for (int i = 0; i < n; ++i) {
* - ji-loop
* - 2^8 threads per team and 2^f teams
* - collapse(3)
* - 2x i-loop unrolling by number of threads
* - 2x i-loop unrolling (stride of 2^8 rows)
*/
#pragma omp target data device(0) \
map(to:n, b[0:n * n]) map(tofrom:a[0:n * n])
{
clock_gettime(CLOCK_REALTIME, rt + 0);
#pragma omp target teams device(0) num_teams(LTEAMSF) \
#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS8) \
map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \
default(none) shared(a, b, n)
#pragma omp distribute parallel for num_threads(NTHRDS8) \
dist_schedule(static, NTHRDS8) collapse(3) \
default(none) shared(a, b, n)
for (int j = 0; j < n; ++j) {
for (int iblk = 0; iblk < n / NTHRDS9; ++iblk) {
for (int i = 0; i < NTHRDS8; ++i) {
a[j * n + iblk * NTHRDS9 + i ] +=
b[j * n + iblk * NTHRDS9 + i ];
a[j * n + iblk * NTHRDS9 + i + NTHRDS8] +=
b[j * n + iblk * NTHRDS9 + i + NTHRDS8];
for (int iblk = 0; iblk < n / BLKROW; ++iblk) {
for (int i = 0; i < NTHRDS8; ++i) { /* 2x unrolling */
a[j * n + iblk * BLKROW + i ] +=
b[j * n + iblk * BLKROW + i ];
a[j * n + iblk * BLKROW + i + NTHRDS8] +=
b[j * n + iblk * BLKROW + i + NTHRDS8];
} /* end i-loop */
} /* end iblk-loop */
} /* end j-loop */
Expand All @@ -177,78 +178,60 @@ for (int i = 0; i < NTHRDS8; ++i) {
case 5:
/*
* - ji-loop
* - 2^7 threads per team and 2^f teams
* - collapse(3)
* - 4x i-loop unrolling
* * 2x by number of threads
* * 2x by half of rows
* - 2^8 threads per team and 2^f teams
* - collapse(2)
* - 2x i-loop unrolling (stride of n/2 rows)
*/
#pragma omp target data device(0) \
map(to:n, halfn, b[0:n * n]) map(tofrom:a[0:n * n])
map(to:n, b[0:n * n]) map(tofrom:a[0:n * n])
{
clock_gettime(CLOCK_REALTIME, rt + 0);
#pragma omp target teams device(0) num_teams(LTEAMSF) \
map(to:n, halfn, b[0:n * n]) map(tofrom:a[0:n * n]) \
default(none) shared(a, b, n, halfn)
#pragma omp distribute parallel for num_threads(NTHRDS7) \
dist_schedule(static, NTHRDS7) collapse(3) \
default(none) shared(a, b, n, halfn)
#pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS8) \
map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \
default(none) shared(a, b, n)
#pragma omp distribute parallel for num_threads(NTHRDS8) \
dist_schedule(static, NTHRDS8) collapse(2) \
default(none) shared(a, b, n)
for (int j = 0; j < n; ++j) {
for (int iblk = 0; iblk < n / NTHRDS9; ++iblk) {
for (int i = 0; i < NTHRDS7; ++i) {
a[j * n + iblk * NTHRDS8 + i ] +=
b[j * n + iblk * NTHRDS8 + i ];
a[j * n + iblk * NTHRDS8 + i + NTHRDS7] +=
b[j * n + iblk * NTHRDS8 + i + NTHRDS7];
a[j * n + iblk * NTHRDS8 + i + halfn ] +=
b[j * n + iblk * NTHRDS8 + i + halfn ];
a[j * n + iblk * NTHRDS8 + i + halfn + NTHRDS7] +=
b[j * n + iblk * NTHRDS8 + i + halfn + NTHRDS7];
for (int i = 0; i < (n >> 1); ++i) { /* 2x unrolling */
a[j * n + i ] +=
b[j * n + i ];
a[j * n + i + (n >> 1)] +=
b[j * n + i + (n >> 1)];
} /* end i-loop */
} /* end iblk-loop */
} /* end j-loop */
clock_gettime(CLOCK_REALTIME, rt + 1);
}
break;
case 6:
/*
* - ji-loop
* - 2^7 threads per team and 2^e teams
* - 2^8 threads per team and 2^14 teams
* - collapse(3)
* - 4x i-loop unrolling
* * 2x by number of threads
* * 2x by half of rows
* - 2x j-loop unrolling
* - 2x j-loop unrolling (stride of 1 col )
* - 2x i-loop unrolling (stride of 2^8 rows)
*/
#pragma omp target data device(0) \
map(to:n, halfn, b[0:n * n]) map(tofrom:a[0:n * n])
map(to:n, b[0:n * n]) map(tofrom:a[0:n * n])
{
clock_gettime(CLOCK_REALTIME, rt + 0);
#pragma omp target teams device(0) num_teams(LTEAMSE) \
map(to:n, halfn, b[0:n * n]) map(tofrom:a[0:n * n]) \
default(none) shared(a, b, n, halfn)
#pragma omp distribute parallel for num_threads(NTHRDS7) \
dist_schedule(static, NTHRDS7) collapse(3) \
default(none) shared(a, b, n, halfn)
for (int j = 0; j < halfn; ++j) {
for (int iblk = 0; iblk < n / NTHRDS9; ++iblk) {
for (int i = 0; i < NTHRDS7; ++i) {
a[ j * n + iblk * NTHRDS8 + i ] +=
b[ j * n + iblk * NTHRDS8 + i ];
a[ j * n + iblk * NTHRDS8 + i + NTHRDS7] +=
b[ j * n + iblk * NTHRDS8 + i + NTHRDS7];
a[ j * n + iblk * NTHRDS8 + i + halfn ] +=
b[ j * n + iblk * NTHRDS8 + i + halfn ];
a[ j * n + iblk * NTHRDS8 + i + halfn + NTHRDS7] +=
b[ j * n + iblk * NTHRDS8 + i + halfn + NTHRDS7];
a[(j + halfn) * n + iblk * NTHRDS8 + i ] +=
b[(j + halfn) * n + iblk * NTHRDS8 + i ];
a[(j + halfn) * n + iblk * NTHRDS8 + i + NTHRDS7] +=
b[(j + halfn) * n + iblk * NTHRDS8 + i + NTHRDS7];
a[(j + halfn) * n + iblk * NTHRDS8 + i + halfn ] +=
b[(j + halfn) * n + iblk * NTHRDS8 + i + halfn ];
a[(j + halfn) * n + iblk * NTHRDS8 + i + halfn + NTHRDS7] +=
b[(j + halfn) * n + iblk * NTHRDS8 + i + halfn + NTHRDS7];
#pragma omp target teams device(0) num_teams(LTEAMSE) thread_limit(NTHRDS8) \
map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \
default(none) shared(a, b, n)
#pragma omp distribute parallel for num_threads(NTHRDS8) \
dist_schedule(static, NTHRDS8) collapse(3) \
default(none) shared(a, b, n)
for (int j = 0; j < n; j += 2) { /* 2x unrolling */
for (int iblk = 0; iblk < n / BLKROW; ++iblk) {
for (int i = 0; i < NTHRDS8; ++i) { /* 2x unrolling */
a[ j * n + iblk * BLKROW + i ] +=
b[ j * n + iblk * BLKROW + i ];
a[ j * n + iblk * BLKROW + i + NTHRDS8] +=
b[ j * n + iblk * BLKROW + i + NTHRDS8];
a[(j + 1) * n + iblk * BLKROW + i ] +=
b[(j + 1) * n + iblk * BLKROW + i ];
a[(j + 1) * n + iblk * BLKROW + i + NTHRDS8] +=
b[(j + 1) * n + iblk * BLKROW + i + NTHRDS8];
} /* end i-loop */
} /* end iblk-loop */
} /* end j-loop */
Expand Down
3 changes: 2 additions & 1 deletion 09_matAdd/tests/matAdd_real_00.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#!/bin/bash
#CCS -N matMul
#CCS -N matAdd
#CCS -t 600m
#CCS -g pc2-mitarbeiter
#CCS --res=rset=1:gtx1080=1,place=:excl

echo "hallo from $(hostname)"
../src/matAdd $((2**12))
../src/matAdd $((2**13))
53 changes: 26 additions & 27 deletions 10_matMul/src/matMulAB.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@
#define LTEAMSF (1 << 0xF) /* 2^{15} */
#define LTEAMSG (1 << 020) /* 2^{16} */

#define NPITCH (1024)
#define BLKROW (512)
#define BLKROW (512) /* 4x number of threads in each team */
#define BLKDIM (16)

double wtcalc;
Expand Down Expand Up @@ -595,40 +594,40 @@ for (int i = 0; i < n / BLKDIM; ++i) {
dist_schedule(static, NTHRDS7) collapse(2) \
default(none) shared(a, b, c, n)
for (int j = 0; j < n; ++j) {
for (int i = 0; i < NPITCH; ++i) { /* 4x unrolling */
for (int i = 0; i < (n >> 2); ++i) { /* 4x unrolling */
float rc0, rc1, rc2, rc3;
rc0 = c[j * n + i ];
rc1 = c[j * n + i + NPITCH ];
rc2 = c[j * n + i + NPITCH * 2];
rc3 = c[j * n + i + NPITCH * 3];
rc0 = c[j * n + i ];
rc1 = c[j * n + i + (n >> 2) ];
rc2 = c[j * n + i + (n >> 2) * 2];
rc3 = c[j * n + i + (n >> 2) * 3];
for (int k = 0; k < n; k += 4) { /* 4x unrolling */
/* register for b: 4x k-loop */
float rb0, rb1, rb2, rb3;
rb0 = b[j * n + k ];
rb1 = b[j * n + k + 1];
rb2 = b[j * n + k + 2];
rb3 = b[j * n + k + 3];
rc0 += a[ k * n + i ] * rb0;
rc0 += a[(k + 1) * n + i ] * rb1;
rc0 += a[(k + 2) * n + i ] * rb2;
rc0 += a[(k + 3) * n + i ] * rb3;
rc1 += a[ k * n + i + NPITCH ] * rb0;
rc1 += a[(k + 1) * n + i + NPITCH ] * rb1;
rc1 += a[(k + 2) * n + i + NPITCH ] * rb2;
rc1 += a[(k + 3) * n + i + NPITCH ] * rb3;
rc2 += a[ k * n + i + NPITCH * 2] * rb0;
rc2 += a[(k + 1) * n + i + NPITCH * 2] * rb1;
rc2 += a[(k + 2) * n + i + NPITCH * 2] * rb2;
rc2 += a[(k + 3) * n + i + NPITCH * 2] * rb3;
rc3 += a[ k * n + i + NPITCH * 3] * rb0;
rc3 += a[(k + 1) * n + i + NPITCH * 3] * rb1;
rc3 += a[(k + 2) * n + i + NPITCH * 3] * rb2;
rc3 += a[(k + 3) * n + i + NPITCH * 3] * rb3;
rc0 += a[ k * n + i ] * rb0;
rc0 += a[(k + 1) * n + i ] * rb1;
rc0 += a[(k + 2) * n + i ] * rb2;
rc0 += a[(k + 3) * n + i ] * rb3;
rc1 += a[ k * n + i + (n >> 2) ] * rb0;
rc1 += a[(k + 1) * n + i + (n >> 2) ] * rb1;
rc1 += a[(k + 2) * n + i + (n >> 2) ] * rb2;
rc1 += a[(k + 3) * n + i + (n >> 2) ] * rb3;
rc2 += a[ k * n + i + (n >> 2) * 2] * rb0;
rc2 += a[(k + 1) * n + i + (n >> 2) * 2] * rb1;
rc2 += a[(k + 2) * n + i + (n >> 2) * 2] * rb2;
rc2 += a[(k + 3) * n + i + (n >> 2) * 2] * rb3;
rc3 += a[ k * n + i + (n >> 2) * 3] * rb0;
rc3 += a[(k + 1) * n + i + (n >> 2) * 3] * rb1;
rc3 += a[(k + 2) * n + i + (n >> 2) * 3] * rb2;
rc3 += a[(k + 3) * n + i + (n >> 2) * 3] * rb3;
}
c[j * n + i ] = rc0;
c[j * n + i + NPITCH ] = rc1;
c[j * n + i + NPITCH * 2] = rc2;
c[j * n + i + NPITCH * 3] = rc3;
c[j * n + i ] = rc0;
c[j * n + i + (n >> 2) ] = rc1;
c[j * n + i + (n >> 2) * 2] = rc2;
c[j * n + i + (n >> 2) * 3] = rc3;
} /* end i-loop */
} /* end j-loop */
clock_gettime(CLOCK_REALTIME, rt + 1);
Expand Down

0 comments on commit 94bd672

Please sign in to comment.