Skip to content

Commit

Permalink
Merge branch 'pcie-implementation' into 'master'
Browse files Browse the repository at this point in the history
Prepare PCIe-MPI communication benchmarks for master

See merge request pc2/HPCC_FPGA!56
  • Loading branch information
Mellich committed Oct 6, 2021
2 parents 03f7eda + f00682f commit ff640b0
Show file tree
Hide file tree
Showing 90 changed files with 5,160 additions and 604 deletions.
171 changes: 151 additions & 20 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ default:
tags:
- jacamar
before_script:
- module load intelFPGA_pro/20.4.0_max bittware_520n/20.4.0_max intel devel/CMake/3.15.3-GCCcore-8.3.0
- module load intelFPGA_pro/20.4.0 bittware_520n/20.4.0_max intel devel/CMake/3.15.3-GCCcore-8.3.0

###
#
Expand Down Expand Up @@ -38,7 +38,50 @@ build:STREAM:
- scripts/**/*
- cmake/**/*
- .gitlab-ci.yml


# Build job for the STREAM benchmark configured with DATA_TYPE=half.
# Runs in the build stage and publishes the built files for test:STREAM_HP.
build:STREAM_HP:
stage: build
script:
# Recreate the build directory from scratch before configuring.
- rm -rf build
- mkdir -p build
- cd build
# Configure STREAM for half precision with a single vector lane and a
# global memory unroll factor of 32.
- cmake ../STREAM -DDATA_TYPE=half -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=32 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0
- make -j 40 all
# Kernel files (*_emulate.aocx) and host/test executables kept as job artifacts.
artifacts:
paths:
- build/bin/stream_kernels_single_emulate.aocx
- build/bin/stream_kernels_emulate.aocx
- build/bin/STREAM_FPGA_intel
- build/bin/STREAM_FPGA_test_intel
# Only run the job when one of the listed paths changed.
only:
changes:
- STREAM/**/*
- shared/**/*
- scripts/**/*
- cmake/**/*
- .gitlab-ci.yml

# Build job for the STREAM benchmark configured with DATA_TYPE=double.
# Runs in the build stage and publishes the built files for test:STREAM_DP.
build:STREAM_DP:
stage: build
script:
# Recreate the build directory from scratch before configuring.
- rm -rf build
- mkdir -p build
- cd build
# Configure STREAM for double precision with a single vector lane and a
# global memory unroll factor of 8.
- cmake ../STREAM -DDATA_TYPE=double -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=8 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0
- make -j 40 all
# Kernel files (*_emulate.aocx) and host/test executables kept as job artifacts.
artifacts:
paths:
- build/bin/stream_kernels_single_emulate.aocx
- build/bin/stream_kernels_emulate.aocx
- build/bin/STREAM_FPGA_intel
- build/bin/STREAM_FPGA_test_intel
# Only run the job when one of the listed paths changed.
only:
changes:
- STREAM/**/*
- shared/**/*
- scripts/**/*
- cmake/**/*
- .gitlab-ci.yml

build:RandomAccess:
stage: build
Expand Down Expand Up @@ -72,8 +115,11 @@ build:PTRANS:
- make -j 40 all
artifacts:
paths:
- build/bin/transpose_diagonal_emulate.aocx
- build/bin/transpose_diagonal_c2_emulate.aocx
- build/bin/transpose_DIAG_IEC_emulate.aocx
- build/bin/transpose_PQ_IEC_emulate.aocx
- build/bin/transpose_PQ_PCIE_emulate.aocx
- build/bin/transpose_DIAG_PCIE_emulate.aocx
- build/bin/transpose_c2_DIAG_IEC_emulate.aocx
- build/bin/Transpose_intel
- build/bin/Transpose_test_intel
only:
Expand All @@ -90,11 +136,12 @@ build:LINPACK:
- rm -rf build
- mkdir -p build
- cd build
- cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3
- cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DUSE_PCIE_MPI_COMMUNICATION=Yes
- make -j 40 all
artifacts:
paths:
- build/bin/hpl_torus_emulate.aocx
- build/bin/hpl_torus_PCIE_emulate.aocx
- build/bin/hpl_torus_IEC_emulate.aocx
- build/bin/Linpack_intel
- build/bin/Linpack_test_intel
only:
Expand Down Expand Up @@ -147,6 +194,27 @@ build:GEMM_HP_REP2:
- cmake/**/*
- .gitlab-ci.yml

# Build job for the GEMM benchmark configured with DATA_TYPE=double and
# two kernel replications. Publishes the built files for test:GEMM_DP_REP2.
build:GEMM_DP_REP2:
stage: build
script:
# Recreate the build directory from scratch before configuring.
- rm -rf build
- mkdir -p build
- cd build
# Configure GEMM for double precision, NUM_REPLICATIONS=2 and BLOCK_SIZE=32.
- cmake ../GEMM -DDATA_TYPE=double -DNUM_REPLICATIONS=2 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32
- make -j 40 all
# Kernel file (*_emulate.aocx) and host/test executables kept as job artifacts.
artifacts:
paths:
- build/bin/gemm_base_emulate.aocx
- build/bin/GEMM_intel
- build/bin/GEMM_test_intel
# Only run the job when one of the listed paths changed.
only:
changes:
- GEMM/**/*
- shared/**/*
- scripts/**/*
- cmake/**/*
- .gitlab-ci.yml

build:FFT:
stage: build
script:
Expand Down Expand Up @@ -199,11 +267,7 @@ build:b_eff:
- make -j 40 all
artifacts:
paths:
- build/bin/communication_bw520n_emulate.aocx
- build/bin/communication_bw520n_combined_loops_emulate.aocx
- build/bin/communication_bw520n_disable_pipelining_emulate.aocx
- build/bin/Network_intel
- build/bin/Network_test_intel
- build/bin/*
only:
changes:
- b_eff/**/*
Expand All @@ -223,7 +287,7 @@ test:STREAM:
script:
- cd build
- cmake ../STREAM -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0
- make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test
- make CTEST_OUTPUT_ON_FAILURE=1 test
dependencies:
- build:STREAM
artifacts:
Expand All @@ -238,13 +302,57 @@ test:STREAM:
- cmake/**/*
- .gitlab-ci.yml
needs: ["build:STREAM"]

# Test job for the half-precision STREAM build.
# Re-runs CMake with the same options as build:STREAM_HP and then runs the
# "test" make target on top of that job's artifacts.
test:STREAM_HP:
stage: test
script:
- cd build
- cmake ../STREAM -DDATA_TYPE=half -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=32 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0
# CTEST_OUTPUT_ON_FAILURE=1 is passed so the output of failing tests is shown.
- make CTEST_OUTPUT_ON_FAILURE=1 test
# Fetch the artifacts of the matching build job.
dependencies:
- build:STREAM_HP
# Keep the CTest log only if the job fails.
artifacts:
when: on_failure
paths:
- build/Testing/Temporary/LastTest.log
# Only run the job when one of the listed paths changed.
only:
changes:
- STREAM/**/*
- shared/**/*
- scripts/**/*
- cmake/**/*
- .gitlab-ci.yml
# Start as soon as the matching build job has finished.
needs: ["build:STREAM_HP"]
# Allow failure because: The intel emulator does not seem to support half precision kernel arguments (CL_INVALID_ARG_SIZE)
allow_failure: true

# Test job for the double-precision STREAM build.
# Re-runs CMake with the same options as build:STREAM_DP and then runs the
# "test" make target on top of that job's artifacts.
test:STREAM_DP:
stage: test
script:
- cd build
- cmake ../STREAM -DDATA_TYPE=double -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=8 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0
# CTEST_OUTPUT_ON_FAILURE=1 is passed so the output of failing tests is shown.
- make CTEST_OUTPUT_ON_FAILURE=1 test
# Fetch the artifacts of the matching build job.
dependencies:
- build:STREAM_DP
# Keep the CTest log only if the job fails.
artifacts:
when: on_failure
paths:
- build/Testing/Temporary/LastTest.log
# Only run the job when one of the listed paths changed.
only:
changes:
- STREAM/**/*
- shared/**/*
- scripts/**/*
- cmake/**/*
- .gitlab-ci.yml
# Start as soon as the matching build job has finished.
needs: ["build:STREAM_DP"]

test:RandomAccess:
stage: test
script:
- cd build
- cmake ../RandomAccess -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0
- make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test
- make CTEST_OUTPUT_ON_FAILURE=1 test
dependencies:
- build:RandomAccess
artifacts:
Expand Down Expand Up @@ -275,7 +383,7 @@ test:PTRANS:
- ln -s kernel_output_ch1 kernel_input_ch0
- ln -s kernel_output_ch3 kernel_input_ch2
- cd ..
- make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test
- make CTEST_OUTPUT_ON_FAILURE=1 test
dependencies:
- build:PTRANS
artifacts:
Expand All @@ -296,7 +404,7 @@ test:LINPACK:
script:
- cd build
- cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3
- make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test
- make CTEST_OUTPUT_ON_FAILURE=1 test
dependencies:
- build:LINPACK
artifacts:
Expand All @@ -317,7 +425,7 @@ test:GEMM:
script:
- cd build
- cmake ../GEMM -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32
- make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test
- make CTEST_OUTPUT_ON_FAILURE=1 test
dependencies:
- build:GEMM
artifacts:
Expand All @@ -338,7 +446,7 @@ test:GEMM_HP_REP2:
script:
- cd build
- cmake ../GEMM -DNUM_REPLICATIONS=2 -DATA_TYPE=half -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32
- make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test
- make CTEST_OUTPUT_ON_FAILURE=1 test
dependencies:
- build:GEMM_HP_REP2
artifacts:
Expand All @@ -353,13 +461,36 @@ test:GEMM_HP_REP2:
- cmake/**/*
- .gitlab-ci.yml
needs: ["build:GEMM_HP_REP2"]
# Allow failure because: The intel emulator does not seem to support half precision kernel arguments (CL_INVALID_ARG_SIZE)
allow_failure: true

# Test job for the double-precision GEMM build with two kernel replications.
# Re-runs CMake with the same options as build:GEMM_DP_REP2 and then runs the
# "test" make target on top of that job's artifacts.
test:GEMM_DP_REP2:
stage: test
script:
- cd build
# The option must be spelled -DDATA_TYPE (i.e. -D DATA_TYPE=double); a single
# "D" would define an unrelated variable ATA_TYPE and leave DATA_TYPE at its
# default, so the test would not match the double-precision build.
- cmake ../GEMM -DNUM_REPLICATIONS=2 -DDATA_TYPE=double -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32
# CTEST_OUTPUT_ON_FAILURE=1 is passed so the output of failing tests is shown.
- make CTEST_OUTPUT_ON_FAILURE=1 test
# Fetch the artifacts of the matching build job.
dependencies:
- build:GEMM_DP_REP2
# Keep the CTest log only if the job fails.
artifacts:
when: on_failure
paths:
- build/Testing/Temporary/LastTest.log
# Only run the job when one of the listed paths changed.
only:
changes:
- GEMM/**/*
- shared/**/*
- scripts/**/*
- cmake/**/*
- .gitlab-ci.yml
# Start as soon as the matching build job has finished.
needs: ["build:GEMM_DP_REP2"]

test:FFT:
stage: test
script:
- cd build
- cmake ../FFT -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0
- make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test
- make CTEST_OUTPUT_ON_FAILURE=1 test
dependencies:
- build:FFT
artifacts:
Expand All @@ -380,7 +511,7 @@ test:FFT_small:
script:
- cd build
- cmake ../FFT -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOG_FFT_SIZE=4 -DNUM_REPLICATIONS=2
- make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test
- make CTEST_OUTPUT_ON_FAILURE=1 test
dependencies:
- build:FFT_small
artifacts:
Expand Down Expand Up @@ -411,7 +542,7 @@ test:b_eff:
- ln -s kernel_output_ch1 kernel_input_ch0
- ln -s kernel_output_ch3 kernel_input_ch2
- cd ..
- make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test
- make CTEST_OUTPUT_ON_FAILURE=1 test
dependencies:
- build:b_eff
artifacts:
Expand Down
42 changes: 22 additions & 20 deletions FFT/src/device/fft1d_float_8.cl
Original file line number Diff line number Diff line change
Expand Up @@ -109,29 +109,31 @@ void fetch/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]["i

// for iter iterations and one additional iteration to empty the last buffer
for(unsigned k = 0; k < (iter + 1) * (N / POINTS); k++){

if (k < iter * ( N / POINTS)) {

float2 read_chunk[POINTS];

// Read the next 8 values from global memory
// in the last iteration just read garbage, but the data will not be forwarded over the pipes.
// This allows the use of memory bursts here.
// Also the data is shifted every N/POINTS/POINTS iterations
__attribute__((opencl_unroll_hint(POINTS)))
for(int j = 0; j < POINTS; j++){
// Shift the data depending on the total FFT size
// Shifts every new chunk by one. If N/POINTS is a multiple of POINTS, the shifting is reduced to prevent mappings to the same bank.
unsigned shift = ((LOGN - LOGPOINTS - LOGPOINTS > 0) ? (k & (N/POINTS - 1)) >> (LOGN - LOGPOINTS - LOGPOINTS) : (k & (N/POINTS - 1)));
unsigned final_buffer_pos = (j + shift) & (POINTS - 1);
read_chunk[final_buffer_pos] = src[(k << LOGPOINTS) + j];
}
float2 read_chunk[POINTS];

// Write the shifted data into the memory buffer
__attribute__((opencl_unroll_hint(POINTS)))
for(int j = 0; j < POINTS; j++){
unsigned local_i = k & (2 * N/POINTS - 1);
buf[local_i][j] = read_chunk[j];
}
// Read the next 8 values from global memory
// in the last iteration just read garbage, but the data will not be forwarded over the pipes.
// This allows the use of memory bursts here.
// Also the data is shifted every N/POINTS/POINTS iterations
__attribute__((opencl_unroll_hint(POINTS)))
for(int j = 0; j < POINTS; j++){
// Shift the data depending on the total FFT size
// Shifts every new chunk by one. If N/POINTS is a multiple of POINTS, the shifting is reduced to prevent mappings to the same bank.
unsigned shift = ((LOGN - LOGPOINTS - LOGPOINTS > 0) ? (k & (N/POINTS - 1)) >> (LOGN - LOGPOINTS - LOGPOINTS) : (k & (N/POINTS - 1)));
unsigned final_buffer_pos = (j + shift) & (POINTS - 1);
read_chunk[final_buffer_pos] = src[(k << LOGPOINTS) + j];
}

// Write the shifted data into the memory buffer
__attribute__((opencl_unroll_hint(POINTS)))
for(int j = 0; j < POINTS; j++){
unsigned local_i = k & (2 * N/POINTS - 1);
buf[local_i][j] = read_chunk[j];
}
}
if (k >= ( N / POINTS)) {
float2x8 buf2x8;

Expand Down
1 change: 0 additions & 1 deletion FFT/src/host/execution_default.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ SOFTWARE.
#include <chrono>

/* External library headers */
#include "CL/cl.hpp"
#ifdef INTEL_FPGA
#ifdef USE_HBM
// CL_HETEROGENEOUS_INTELFPGA is defined here
Expand Down
4 changes: 3 additions & 1 deletion GEMM/Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ If available, the benchmark will use `sgemm_` to validate the calculation instea
For matrix sizes above 1000x1000 we recommend using such a library to speed up the benchmark execution.
Using such a library will not change the performance result of the benchmark but might affect the reported error of the calculation.

For half precision support, the IEEE 754-based half-precision floating-point library by Christian Rau is used and a copy is provided with this code.

## Build

CMake is used as the build system.
Expand Down Expand Up @@ -53,7 +55,7 @@ Next to the common configuration options given in the [README](../README.md) of

Name | Default | Description |
---------------- |-------------|--------------------------------------|
`DATA_TYPE` | float (also supported: half, double) | Data type used for calculation |
`DATA_TYPE` | float (also supported: half, double) | Data type used for calculation. *Note: Currently, half precision does not work on Intel FPGAs because half-precision values cannot be passed by value as kernel arguments.* |
`DEFAULT_MATRIX_SIZE` | 8 | The default size of the quadratic matrices in blocks |
`BLOCK_SIZE` | 512 | Block size used by the kernel for calculation |
`GEMM_SIZE` | 8 | Block size of the fully unrolled matrix multiplication in registers |
Expand Down
8 changes: 7 additions & 1 deletion LINPACK/CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,17 @@

This file contains all changes made to the source code for each release.

## 2.3
#### Changed:
- Refactored the code to support different execution kernels and data distributions
#### Added:
- FPGA kernel with communication via PCIe and MPI

## 2.2

#### Added:
- LU factorization kernel w/o pivoting in quadratic torus
- Distributed calculation of GEL on CPU nodes and validation
- Distributed calculation of GESL on CPU nodes and validation

## 2.1

Expand Down
Loading

0 comments on commit ff640b0

Please sign in to comment.