Merge branch 'pcie-implementation' into 'master'

Prepare PCIe-MPI communication benchmarks for master. See merge request pc2/HPCC_FPGA!56
pc2 · Oct 6, 2021 · ff640b0 · ff640b0
2 parents 03f7eda + f00682f
commit ff640b0
Show file tree

Hide file tree

Showing 90 changed files with 5,160 additions and 604 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -9,7 +9,7 @@ default:
   tags:
     - jacamar
   before_script:
-    - module load intelFPGA_pro/20.4.0_max bittware_520n/20.4.0_max intel devel/CMake/3.15.3-GCCcore-8.3.0
+    - module load intelFPGA_pro/20.4.0 bittware_520n/20.4.0_max intel devel/CMake/3.15.3-GCCcore-8.3.0
 
 ###
 #
@@ -38,7 +38,50 @@ build:STREAM:
       - scripts/**/*
       - cmake/**/*
       - .gitlab-ci.yml
-
+
+build:STREAM_HP:  # STREAM build configured with DATA_TYPE=half
+  stage: build
+  script:  # recreate an empty build directory, configure with CMake, then build all targets
+    - rm -rf build
+    - mkdir -p build
+    - cd build
+    - cmake ../STREAM -DDATA_TYPE=half -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=32 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0
+    - make -j 40 all
+  artifacts:  # files handed to jobs that list this job under dependencies
+    paths:
+      - build/bin/stream_kernels_single_emulate.aocx
+      - build/bin/stream_kernels_emulate.aocx
+      - build/bin/STREAM_FPGA_intel
+      - build/bin/STREAM_FPGA_test_intel
+  only:
+    changes:  # create the job only when one of these paths changed
+      - STREAM/**/*
+      - shared/**/*
+      - scripts/**/*
+      - cmake/**/*
+      - .gitlab-ci.yml
+
+build:STREAM_DP:  # STREAM build configured with DATA_TYPE=double
+  stage: build
+  script:  # recreate an empty build directory, configure with CMake, then build all targets
+    - rm -rf build
+    - mkdir -p build
+    - cd build
+    - cmake ../STREAM -DDATA_TYPE=double -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=8 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0
+    - make -j 40 all
+  artifacts:  # files handed to jobs that list this job under dependencies
+    paths:
+      - build/bin/stream_kernels_single_emulate.aocx
+      - build/bin/stream_kernels_emulate.aocx
+      - build/bin/STREAM_FPGA_intel
+      - build/bin/STREAM_FPGA_test_intel
+  only:
+    changes:  # create the job only when one of these paths changed
+      - STREAM/**/*
+      - shared/**/*
+      - scripts/**/*
+      - cmake/**/*
+      - .gitlab-ci.yml
 
 build:RandomAccess:
   stage: build
@@ -72,8 +115,11 @@ build:PTRANS:
     - make -j 40 all
   artifacts:
     paths:
-      - build/bin/transpose_diagonal_emulate.aocx
-      - build/bin/transpose_diagonal_c2_emulate.aocx
+      - build/bin/transpose_DIAG_IEC_emulate.aocx
+      - build/bin/transpose_PQ_IEC_emulate.aocx
+      - build/bin/transpose_PQ_PCIE_emulate.aocx
+      - build/bin/transpose_DIAG_PCIE_emulate.aocx
+      - build/bin/transpose_c2_DIAG_IEC_emulate.aocx
       - build/bin/Transpose_intel
       - build/bin/Transpose_test_intel
   only:
@@ -90,11 +136,12 @@ build:LINPACK:
     - rm -rf build
     - mkdir -p build
     - cd build
-    - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3
+    - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DUSE_PCIE_MPI_COMMUNICATION=Yes
     - make -j 40 all
   artifacts:
     paths:
-      - build/bin/hpl_torus_emulate.aocx
+      - build/bin/hpl_torus_PCIE_emulate.aocx
+      - build/bin/hpl_torus_IEC_emulate.aocx
       - build/bin/Linpack_intel
       - build/bin/Linpack_test_intel
   only:
@@ -147,6 +194,27 @@ build:GEMM_HP_REP2:
       - cmake/**/*
       - .gitlab-ci.yml
 
+build:GEMM_DP_REP2:  # GEMM build configured with DATA_TYPE=double and NUM_REPLICATIONS=2
+  stage: build
+  script:  # recreate an empty build directory, configure with CMake, then build all targets
+    - rm -rf build
+    - mkdir -p build
+    - cd build
+    - cmake ../GEMM -DDATA_TYPE=double -DNUM_REPLICATIONS=2 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32
+    - make -j 40 all
+  artifacts:  # files handed to jobs that list this job under dependencies
+    paths:
+      - build/bin/gemm_base_emulate.aocx
+      - build/bin/GEMM_intel
+      - build/bin/GEMM_test_intel
+  only:
+    changes:  # create the job only when one of these paths changed
+      - GEMM/**/*
+      - shared/**/*
+      - scripts/**/*
+      - cmake/**/*
+      - .gitlab-ci.yml
+
 build:FFT:
   stage: build
   script:
@@ -199,11 +267,7 @@ build:b_eff:
     - make -j 40 all
   artifacts:
     paths:
-      - build/bin/communication_bw520n_emulate.aocx
-      - build/bin/communication_bw520n_combined_loops_emulate.aocx
-      - build/bin/communication_bw520n_disable_pipelining_emulate.aocx
-      - build/bin/Network_intel
-      - build/bin/Network_test_intel
+      - build/bin/*
   only:
     changes:
       - b_eff/**/*
@@ -223,7 +287,7 @@ test:STREAM:
   script:
     - cd build
     - cmake ../STREAM -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0
-    - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test
+    - make CTEST_OUTPUT_ON_FAILURE=1 test
   dependencies:
     - build:STREAM
   artifacts:
@@ -238,13 +302,57 @@ test:STREAM:
       - cmake/**/*
       - .gitlab-ci.yml
   needs: ["build:STREAM"]
+
+test:STREAM_HP:  # runs "make test" for the STREAM configuration with DATA_TYPE=half
+  stage: test
+  script:  # configure with the same CMake options as build:STREAM_HP, then run the tests
+    - cd build
+    - cmake ../STREAM -DDATA_TYPE=half -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=32 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0
+    - make CTEST_OUTPUT_ON_FAILURE=1 test
+  dependencies:  # fetch the artifacts of the matching build job
+    - build:STREAM_HP
+  artifacts:
+    when: on_failure  # upload the test log only if the job fails
+    paths:
+      - build/Testing/Temporary/LastTest.log
+  only:
+    changes:  # create the job only when one of these paths changed
+      - STREAM/**/*
+      - shared/**/*
+      - scripts/**/*
+      - cmake/**/*
+      - .gitlab-ci.yml
+  needs: ["build:STREAM_HP"]
+  # Allow failure because: The intel emulator does not seem to support half precision kernel arguments (CL_INVALID_ARG_SIZE)
+  allow_failure: true
+
+test:STREAM_DP:  # runs "make test" for the STREAM configuration with DATA_TYPE=double
+  stage: test
+  script:  # configure with the same CMake options as build:STREAM_DP, then run the tests
+    - cd build
+    - cmake ../STREAM -DDATA_TYPE=double -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=8 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0
+    - make CTEST_OUTPUT_ON_FAILURE=1 test
+  dependencies:  # fetch the artifacts of the matching build job
+    - build:STREAM_DP
+  artifacts:
+    when: on_failure  # upload the test log only if the job fails
+    paths:
+      - build/Testing/Temporary/LastTest.log
+  only:
+    changes:  # create the job only when one of these paths changed
+      - STREAM/**/*
+      - shared/**/*
+      - scripts/**/*
+      - cmake/**/*
+      - .gitlab-ci.yml
+  needs: ["build:STREAM_DP"]
 
 test:RandomAccess:
   stage: test
   script:
     - cd build
     - cmake ../RandomAccess -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0
-    - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test
+    - make CTEST_OUTPUT_ON_FAILURE=1 test
   dependencies:
     - build:RandomAccess
   artifacts:
@@ -275,7 +383,7 @@ test:PTRANS:
     - ln -s kernel_output_ch1 kernel_input_ch0
     - ln -s kernel_output_ch3 kernel_input_ch2
     - cd ..
-    - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test
+    - make CTEST_OUTPUT_ON_FAILURE=1 test
   dependencies:
     - build:PTRANS
   artifacts:
@@ -296,7 +404,7 @@ test:LINPACK:
   script:
     - cd build
     - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0  -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3
-    - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test
+    - make CTEST_OUTPUT_ON_FAILURE=1 test
   dependencies:
     - build:LINPACK
   artifacts:
@@ -317,7 +425,7 @@ test:GEMM:
   script:
     - cd build
     - cmake ../GEMM -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32
-    - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test
+    - make CTEST_OUTPUT_ON_FAILURE=1 test
   dependencies:
     - build:GEMM
   artifacts:
@@ -338,7 +446,7 @@ test:GEMM_HP_REP2:
   script:
     - cd build
     - cmake ../GEMM -DNUM_REPLICATIONS=2 -DATA_TYPE=half -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32
-    - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test
+    - make CTEST_OUTPUT_ON_FAILURE=1 test
   dependencies:
     - build:GEMM_HP_REP2
   artifacts:
@@ -353,13 +461,36 @@ test:GEMM_HP_REP2:
       - cmake/**/*
       - .gitlab-ci.yml
   needs: ["build:GEMM_HP_REP2"]
+  # Allow failure because: The intel emulator does not seem to support half precision kernel arguments (CL_INVALID_ARG_SIZE)
+  allow_failure: true
+
+test:GEMM_DP_REP2:  # runs "make test" for the GEMM configuration built by build:GEMM_DP_REP2
+  stage: test
+  script:  # options must be spelled -D<VAR>=<value>; "-DATA_TYPE" would define ATA_TYPE, not DATA_TYPE
+    - cd build
+    - cmake ../GEMM -DNUM_REPLICATIONS=2 -DDATA_TYPE=double -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32
+    - make CTEST_OUTPUT_ON_FAILURE=1 test
+  dependencies:  # fetch the artifacts of the matching build job
+    - build:GEMM_DP_REP2
+  artifacts:
+    when: on_failure  # upload the test log only if the job fails
+    paths:
+      - build/Testing/Temporary/LastTest.log
+  only:
+    changes:  # create the job only when one of these paths changed
+      - GEMM/**/*
+      - shared/**/*
+      - scripts/**/*
+      - cmake/**/*
+      - .gitlab-ci.yml
+  needs: ["build:GEMM_DP_REP2"]
 
 test:FFT:
   stage: test
   script:
     - cd build
     - cmake ../FFT -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0
-    - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test
+    - make CTEST_OUTPUT_ON_FAILURE=1 test
   dependencies:
     - build:FFT
   artifacts:
@@ -380,7 +511,7 @@ test:FFT_small:
   script:
     - cd build
     - cmake ../FFT -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOG_FFT_SIZE=4 -DNUM_REPLICATIONS=2
-    - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test
+    - make CTEST_OUTPUT_ON_FAILURE=1 test
   dependencies:
     - build:FFT_small
   artifacts:
@@ -411,7 +542,7 @@ test:b_eff:
     - ln -s kernel_output_ch1 kernel_input_ch0
     - ln -s kernel_output_ch3 kernel_input_ch2
     - cd ..
-    - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test
+    - make CTEST_OUTPUT_ON_FAILURE=1 test
   dependencies:
     - build:b_eff
   artifacts:

diff --git a/FFT/src/device/fft1d_float_8.cl b/FFT/src/device/fft1d_float_8.cl
@@ -109,29 +109,31 @@ void fetch/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]["i
 
   // for iter iterations and one additional iteration to empty the last buffer
   for(unsigned k = 0; k < (iter + 1) * (N / POINTS); k++){ 
+
+    if (k < iter * ( N / POINTS)) {
 
-    float2 read_chunk[POINTS];
-
-    // Read the next 8 values from global memory
-    // in the last iteration just read garbage, but the data will not be forwarded over the pipes.
-    // This allows the use of memory bursts here.
-    // Also the data is shifted  every N/POINTS/POINTS iterations
-    __attribute__((opencl_unroll_hint(POINTS)))
-    for(int j = 0; j < POINTS; j++){
-      // Shift the data depending on the total FFT size
-      // Shifts every new chunk by one. If N/POINTS is a multiple of POINTS, the shifting is reduced to prevent mappings to the same bank.
-      unsigned shift = ((LOGN - LOGPOINTS - LOGPOINTS > 0) ? (k & (N/POINTS - 1)) >> (LOGN - LOGPOINTS - LOGPOINTS) : (k & (N/POINTS - 1)));
-      unsigned final_buffer_pos = (j + shift) & (POINTS - 1);
-      read_chunk[final_buffer_pos] = src[(k << LOGPOINTS) + j];
-    }
+      float2 read_chunk[POINTS];
 
-    // Write the shifted data into the memory buffer
-    __attribute__((opencl_unroll_hint(POINTS)))
-    for(int j = 0; j < POINTS; j++){
-      unsigned local_i = k & (2 * N/POINTS - 1);
-      buf[local_i][j] = read_chunk[j];
-    }
+      // Read the next 8 values from global memory
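+      // The enclosing if skips this read in the additional buffer-emptying iteration, so nothing is fetched from src for k >= iter * (N / POINTS).
+      // NOTE(review): the former unconditional read was said to allow memory bursts; confirm that still holds with the guard.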
+      // Also the data is shifted  every N/POINTS/POINTS iterations
+      __attribute__((opencl_unroll_hint(POINTS)))
+      for(int j = 0; j < POINTS; j++){
+        // Shift the data depending on the total FFT size
+        // Shifts every new chunk by one. If N/POINTS is a multiple of POINTS, the shifting is reduced to prevent mappings to the same bank.
+        unsigned shift = ((LOGN - LOGPOINTS - LOGPOINTS > 0) ? (k & (N/POINTS - 1)) >> (LOGN - LOGPOINTS - LOGPOINTS) : (k & (N/POINTS - 1)));
+        unsigned final_buffer_pos = (j + shift) & (POINTS - 1);
+        read_chunk[final_buffer_pos] = src[(k << LOGPOINTS) + j];
+      }
 
+      // Write the shifted data into the memory buffer
+      __attribute__((opencl_unroll_hint(POINTS)))
+      for(int j = 0; j < POINTS; j++){
+        unsigned local_i = k & (2 * N/POINTS - 1);
+        buf[local_i][j] = read_chunk[j];
+      }
+    }
     if (k >= ( N / POINTS)) {
       float2x8 buf2x8;
 

diff --git a/FFT/src/host/execution_default.cpp b/FFT/src/host/execution_default.cpp
@@ -29,7 +29,6 @@ SOFTWARE.
 #include <chrono>
 
 /* External library headers */
-#include "CL/cl.hpp"
 #ifdef INTEL_FPGA
 #ifdef USE_HBM
 // CL_HETEROGENEOUS_INTELFPGA is defined here 

diff --git a/GEMM/Readme.md b/GEMM/Readme.md
@@ -21,6 +21,8 @@ If available, the benchmark will use `sgemm_` to validate the calculation instea
 For matrix sizes above 1000x1000 we recommend using such a library to speed up the benchmark execution. 
 Using such a library will not change the performance result of the benchmark but might affect the reported error of the calculation.
 
+For half precision support, the IEEE 754-based half-precision floating-point library by Christian Rau is used and a copy is provided with this code. 
+
 ## Build
 
 CMake is used as the build system.
@@ -53,7 +55,7 @@ Next to the common configuration options given in the [README](../README.md) of
 
 Name             | Default     | Description                          |
 ---------------- |-------------|--------------------------------------|
- `DATA_TYPE`     | float (also supported: half, double)      | Data type used for calculation       |
+ `DATA_TYPE`     | float (also supported: half, double)      | Data type used for calculation. *Note: Currently, half precision does not work on Intel FPGAs because half-precision values cannot be passed by value as kernel arguments.*  |
 `DEFAULT_MATRIX_SIZE` | 8      | The default size of the quadratic matrices in blocks |
 `BLOCK_SIZE`    | 512          | Block size used by the kernel for calculation |
 `GEMM_SIZE`    | 8             | Block size of the fully unrolled matrix multiplication in registers |

diff --git a/LINPACK/CHANGELOG b/LINPACK/CHANGELOG
@@ -2,11 +2,17 @@
 
 This file contains all changes made to the source code for each release.
 
+## 2.3
+#### Changed:
+- Refactored the code to support different execution kernels and data distributions
+#### Added:
+- FPGA kernel with communication via PCIe and MPI 
+
 ## 2.2
 
 #### Added:
 - LU facotrization kernel w/o pivoting in quadratic torus
-- Distributed calculation of GEL on CPU nodes and validation
+- Distributed calculation of GESL on CPU nodes and validation
 
 ## 2.1