improve 05_saxpy, 09_matAdd and 10_matMul

pc2 · Apr 15, 2020 · a0885f6 · a0885f6
1 parent 94bd672
commit a0885f6
Show file tree

Hide file tree

Showing 12 changed files with 94 additions and 52 deletions.
diff --git a/05_saxpy/README.md b/05_saxpy/README.md
@@ -41,7 +41,8 @@ where:
 |  5  | <<<2^15, 2^7 >>>, manual scheduling, 16x loop unrolling                |
 |     | (2^15*2^7*16==2^26)                                                    |
 |  6  | <<<2^12, 2^7 >>>, auto   scheduling, 16x loop unrolling                |
-|  7  | cublasSaxpy in CUBLAS                                                  |
+|  7  | de-linearizing the vector slightly outperforms CUBLAS                  |
+|  8  | cublasSaxpy in CUBLAS                                                  |
 
 # Build
 

diff --git a/05_saxpy/docs/UserManual.md b/05_saxpy/docs/UserManual.md
@@ -41,7 +41,8 @@ where:
 |  5  | <<<2^15, 2^7 >>>, manual scheduling, 16x loop unrolling                |
 |     | (2^15*2^7*16==2^26)                                                    |
 |  6  | <<<2^12, 2^7 >>>, auto   scheduling, 16x loop unrolling                |
-|  7  | cublasSaxpy in CUBLAS                                                  |
+|  7  | de-linearizing the vector slightly outperforms CUBLAS                  |
+|  8  | cublasSaxpy in CUBLAS                                                  |
 
 # Usage
 

diff --git a/05_saxpy/src/asaxpy.c b/05_saxpy/src/asaxpy.c
@@ -210,6 +210,32 @@ for (int i = 0; i < m; ++i) {
   y[i + 0xf * m] = a * x[i + 0xf * m] + y[i + 0xf * m];
 }
   clock_gettime(CLOCK_REALTIME, rt + 1);
+}
+      break;
+    case 7:
+/*
+ * - <<<2^16, 2^9>>>:
+ *     * de-linearize the vector (convert the vector to matrix)
+ *     * collapse the ji-loop
+ *     * 2x i-loop unrolling
+ */
+#pragma omp target data  device(0) \
+  map(to:a, x[0:n]) map(tofrom:y[0:n])
+{
+  clock_gettime(CLOCK_REALTIME, rt + 0);
+#pragma omp target teams device(0) num_teams(65536) thread_limit(512) \
+  map(to:a, x[0:n]) map(tofrom:y[0:n]) \
+  default(none) shared(a, x, y)
+#pragma omp distribute parallel for num_threads(512) \
+  dist_schedule(static, 512) collapse(2) \
+  default(none) shared(a, x, y)
+for (int j = 0; j < 65536; ++j) {
+  for (int i = 0; i < 512; ++i) { /* 2x i-loop unrolling */
+    y[j * 1024 + i      ] += a * x[j * 1024 + i      ];
+    y[j * 1024 + i + 512] += a * x[j * 1024 + i + 512];
+  }
+}
+  clock_gettime(CLOCK_REALTIME, rt + 1);
 }
       break;
     default:

diff --git a/05_saxpy/src/saxpy.c b/05_saxpy/src/saxpy.c
@@ -131,7 +131,7 @@ int main(int argc, char *argv[])
   /*
    * saxpy on accl
    */
-  for (ial = 1; ial < 8; ++ial) {
+  for (ial = 1; ial < 9; ++ial) {
     /*
      * See asaxpy.c for details:
      *
@@ -144,6 +144,7 @@ int main(int argc, char *argv[])
      * 4: <<<2^16, 2^10>>>, manual scheduling
      * 5: <<<2^15, 2^7 >>>, manual scheduling, 16x loop unrolling (2^15*2^7*16==2^26)
      * 6: <<<2^12, 2^7 >>>, auto   scheduling, 16x loop unrolling
+     * 7: de-linearize the vector and then collapse the ji-loop.
      * otherwise: cublasSaxpy in CUBLAS
      */
     memcpy(yaccl, y, nbytes);

diff --git a/05_saxpy/tests/saxpy_real_00.sh.5385100.out b/05_saxpy/tests/saxpy_real_00.sh.5385100.out
diff --git a/05_saxpy/tests/saxpy_real_00.sh.5422320.out b/05_saxpy/tests/saxpy_real_00.sh.5422320.out
@@ -0,0 +1,14 @@
+hallo from gpu029
+The system supports 1 ns time resolution
+total size of x and y is     512.0 MB
+tests are averaged over 32 loops
+saxpy on host (0) :   65092.0 MB/s   65093.6 MB/s maxabserr =       0.0
+saxpy on host (1) :   70769.4 MB/s   70772.0 MB/s maxabserr =       0.0
+saxpy on accl (1) :    1400.7 MB/s    4648.9 MB/s maxabserr =       0.0
+saxpy on accl (2) :    1371.6 MB/s    4653.9 MB/s maxabserr =       0.0
+saxpy on accl (3) :    2046.7 MB/s  227586.6 MB/s maxabserr =       0.0
+saxpy on accl (4) :    2062.4 MB/s  224540.3 MB/s maxabserr =       0.0
+saxpy on accl (5) :    2073.9 MB/s  276659.5 MB/s maxabserr =       0.0
+saxpy on accl (6) :    2045.0 MB/s  271431.4 MB/s maxabserr =       0.0
+saxpy on accl (7) :    2025.2 MB/s  280631.7 MB/s maxabserr =       0.0
+saxpy on accl (8) :    2025.7 MB/s  279577.4 MB/s maxabserr =       0.0
diff --git a/09_matAdd/tests/matAdd_real_00.sh.5329336.out b/09_matAdd/tests/matAdd_real_00.sh.5329336.out
diff --git a/09_matAdd/tests/matAdd_real_00.sh.5422037.out b/09_matAdd/tests/matAdd_real_00.sh.5422037.out
diff --git a/09_matAdd/tests/matAdd_real_00.sh.5422334.out b/09_matAdd/tests/matAdd_real_00.sh.5422334.out
@@ -0,0 +1,21 @@
+hallo from gpu028
+matrix dim: 4096 x 4096
+time averaged over 64 loops
+matAddAB (0) :       1.9 GB/s      86.6 GB/s maxabserr =       0.0
+matAddAB (1) :       1.6 GB/s      35.7 GB/s maxabserr =       0.0
+matAddAB (2) :       1.6 GB/s      48.1 GB/s maxabserr =       0.0
+matAddAB (3) :       1.7 GB/s     166.6 GB/s maxabserr =       0.0
+matAddAB (4) :       2.0 GB/s     183.3 GB/s maxabserr =       0.0
+matAddAB (5) :       1.9 GB/s     183.7 GB/s maxabserr =       0.0
+matAddAB (6) :       1.9 GB/s     185.3 GB/s maxabserr =       0.0
+matAddAB (7) :       1.8 GB/s     185.4 GB/s maxabserr =       0.0
+matrix dim: 8192 x 8192
+time averaged over 64 loops
+matAddAB (0) :       1.9 GB/s     172.2 GB/s maxabserr =       0.0
+matAddAB (1) :       1.9 GB/s      34.0 GB/s maxabserr =       0.0
+matAddAB (2) :       1.6 GB/s       8.4 GB/s maxabserr =       0.0
+matAddAB (3) :       1.9 GB/s     265.8 GB/s maxabserr =       0.0
+matAddAB (4) :       1.9 GB/s     265.4 GB/s maxabserr =       0.0
+matAddAB (5) :       1.9 GB/s     265.8 GB/s maxabserr =       0.0
+matAddAB (6) :       1.9 GB/s     264.9 GB/s maxabserr =       0.0
+matAddAB (7) :       1.9 GB/s     269.0 GB/s maxabserr =       0.0
diff --git a/10_matMul/tests/matMul_real_00.sh.5422034.out b/10_matMul/tests/matMul_real_00.sh.5422034.out
diff --git a/10_matMul/tests/matMul_real_00.sh.5422392.out b/10_matMul/tests/matMul_real_00.sh.5422392.out
@@ -0,0 +1,14 @@
+hallo from gpu028
+matrix dim: 4096 x 4096
+time averaged over 16 loops
+matMulAB (0) :      24.9 GFLOPS      25.5 GFLOPS maxabserr =       0.0
+matMulAB (1) :       9.8 GFLOPS       9.9 GFLOPS maxabserr =       0.0
+matMulAB (2) :     184.5 GFLOPS     228.9 GFLOPS maxabserr =       0.0
+matMulAB (3) :       5.0 GFLOPS       5.1 GFLOPS maxabserr =    1018.4
+matMulAB (4) :     176.1 GFLOPS     216.2 GFLOPS maxabserr =       0.0
+matMulAB (5) :     340.9 GFLOPS     531.9 GFLOPS maxabserr =       0.0
+matMulAB (6) :     610.3 GFLOPS    1708.9 GFLOPS maxabserr =       0.0
+matMulAB (7) :     218.9 GFLOPS     284.6 GFLOPS maxabserr =       0.0
+matMulAB (8) :     233.8 GFLOPS     310.4 GFLOPS maxabserr =       0.0
+matMulAB (9) :     254.5 GFLOPS     348.1 GFLOPS maxabserr =       0.0
+matMulAB (10) :     931.6 GFLOPS   10126.1 GFLOPS maxabserr =       0.0
diff --git a/README.md b/README.md
@@ -8,6 +8,19 @@ Paderborn University. The sub-directories are generally organized as:
 * docs: documentation
 * tests: some tests
 
+Some highlights of the code in this repository:
+
+* Our `saxpy`, implemented with OpenMP GPU-offloading, performs as well as
+  `cublasSaxpy` in CUBLAS. See `case 7` in `05_saxpy/src/asaxpy.c` for
+  details.
+
+* The use of GPU shared memory has not been standardized in the OpenMP API
+  Specification (Version 5.0, Nov. 2018). To optimize the performance of matrix
+  multiplication with OpenMP GPU-offloading, i) `case 6` in
+  `10_matMul/src/matMulAB.c` implements a register-blocking algorithm and
+  ii) `case 8` in the same source file implements a common GPU-based tiled
+  algorithm that blocks the local shared memory in a very tricky manner; the
+  resulting OpenMP code resembles CUDA.
+
 # List of Projects
 
 * 00_build_OpenMP_offload