-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 1d8dead
Showing
205 changed files
with
18,320 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
*/.DS_Store | ||
.DS_Store | ||
cmake-* | ||
.vscode | ||
*._* | ||
build-* | ||
.idea |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
[submodule "extern/cxxopts"] | ||
path = extern/cxxopts | ||
url = https://github.com/jarro2783/cxxopts.git | ||
[submodule "extern/hlslib"] | ||
path = extern/hlslib | ||
url = https://github.com/definelicht/hlslib.git | ||
[submodule "extern/googletest"] | ||
path = extern/googletest | ||
url = https://github.com/google/googletest.git |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
cmake-* | ||
.DS_Store | ||
build-* | ||
.idea |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# Changelog | ||
|
||
This file contains all changes made to the source code for each release. | ||
|
||
## 1.0 | ||
|
||
#### Added: | ||
- Host code and OpenCL kernel from Intel FPGA SDK AOC examples | ||
- Execution result for the Bittware 520N board with brief performance model |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
cmake_minimum_required(VERSION 2.8.12)
project(fFFT)

# Version string that is written into parameters.h.
set(VERSION 1.0)

# Host side defaults. They are cache entries, so they can be overridden on the
# cmake command line with -D<NAME>=<value>.
set(DEFAULT_REPETITIONS 10 CACHE STRING "Default number of repetitions")
set(DEFAULT_ITERATIONS 100 CACHE STRING "Default number of iterations that is done with a single kernel execution")
set(HOST_DATA_TYPE cl_float CACHE STRING "Data type used by the host code. Should match the data type of the used FFT")
set(DEFAULT_DEVICE -1 CACHE STRING "Index of the default device to use")
set(DEFAULT_PLATFORM -1 CACHE STRING "Index of the default platform to use")
set(FPGA_BOARD_NAME p520_hpc_sg280l CACHE STRING "Name of the target FPGA board")

# The AOC flags are stored as one space separated string in the cache.
# separate_arguments() converts that string into a CMake list so that every
# flag is passed as its own argument by the custom commands in src/device.
set(AOC_FLAGS "-fpc -fp-relaxed" CACHE STRING "Used flags for the AOC compiler")
separate_arguments(AOC_FLAGS)

# Kernel related parameters that are also written into parameters.h.
set(FFT_KERNEL_NAME fft1d CACHE STRING "Name of the kernel that is used for calculation")
set(FETCH_KERNEL_NAME fetch CACHE STRING "Name of the kernel that is used to fetch data from global memory")
set(LOG_FFT_SIZE 12 CACHE STRING "Log2 of the used FFT size")
set(FFT_UNROLL 8 CACHE STRING "Amount of global memory unrolling of the kernel. Will be used by the host to calculate NDRange sizes")

# Make the CMake modules shipped with the hlslib submodule available to
# find_package() below.
list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/../extern/hlslib/cmake)

# All executables are placed in <build dir>/bin. src/device uses the same
# variable as destination for the kernel files.
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)

# Generate parameters.h from its template. The template only contains @NAME@
# placeholders, so @ONLY is used to keep configure_file() from touching
# anything that looks like a ${...} reference.
configure_file(
        "${CMAKE_SOURCE_DIR}/src/common/parameters.h.in"
        "${CMAKE_BINARY_DIR}/src/common/parameters.h"
        @ONLY
)

# Directory of the generated parameters.h.
include_directories(${CMAKE_BINARY_DIR}/src/common)

find_package(IntelFPGAOpenCL REQUIRED)

add_subdirectory(src/device)
add_subdirectory(src/host)
add_subdirectory(tests)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
MIT License | ||
|
||
Copyright (c) 2020 pc2 | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
# FFT Benchmark for FPGA | ||
|
||
This repository contains the FFT Benchmark for FPGA and its OpenCL kernels. | ||
Currently only the Intel FPGA SDK for OpenCL utility is supported. | ||
|
||
It is based on the FFT benchmark of the [HPC Challenge Benchmark](https://icl.utk.edu/hpcc/) suite. | ||
The FFT1D reference implementation is used for the kernel code. | ||
|
||
## Dependencies | ||
|
||
The benchmark comes with the following requirements for building and running: | ||
|
||
- CMake 2.8 | ||
- GCC 4.9 | ||
- Intel OpenCL FPGA SDK 19.3 | ||
|
||
It also contains submodules that will be automatically updated when running cmake: | ||
|
||
- cxxopts: A header only library to parse command line parameters | ||
- googletest: A C++ test framework | ||
|
||
## Build | ||
|
||
CMake is used as the build system. | ||
The targets below can be used to build the benchmark and its kernels: | ||
|
||
| Target | Description | | ||
| -------- | ---------------------------------------------- | | ||
| fFFT | Builds the host application | | ||
| Google_Tests_run| Compile the tests and its dependencies | | ||
|
||
Moreover, there are additional targets to generate kernel reports and bitstreams. | ||
The provided kernel is optimized for Stratix 10 with 512bit LSUs. | ||
The kernel targets are: | ||
|
||
| Target | Description | | ||
| -------- | ---------------------------------------------- | | ||
| fft1d_float_8 | Synthesizes the kernel (takes several hours!) | | ||
| fft1d_float_8_report | Create an HTML report for the kernel | | ||
| fft1d_float_8_emulate | Create an emulation kernel | | ||
|
||
|
||
You can build for example the host application by running | ||
|
||
mkdir build && cd build | ||
cmake .. | ||
make fFFT | ||
|
||
You will find all executables and kernel files in the `bin` | ||
folder of your build directory. | ||
You should always specify a target with make to reduce the build time! | ||
You might want to specify predefined parameters before build: | ||
|
||
Name | Default | Description | | ||
---------------- |-------------|--------------------------------------| | ||
`DEFAULT_DEVICE` | -1 | Index of the default device (-1 = ask) | | ||
`DEFAULT_PLATFORM`| -1 | Index of the default platform (-1 = ask) | | ||
`FPGA_BOARD_NAME`| p520_hpc_sg280l | Name of the target board | | ||
`DEFAULT_REPETITIONS`| 10 | Number of times the kernel will be executed | | ||
`DEFAULT_ITERATIONS`| 100 | Default number of iterations that is done with a single kernel execution| | ||
`LOG_FFT_SIZE` | 12 | Log2 of the FFT Size that has to be used i.e. 3 leads to a FFT Size of 2^3=8| | ||
`AOC_FLAGS`| `-fpc -fp-relaxed` | Additional AOC compiler flags that are used for kernel compilation | | ||
|
||
Moreover the environment variable `INTELFPGAOCLSDKROOT` has to be set to the root | ||
of the Intel FPGA SDK installation. | ||
|
||
Additionally it is possible to set the used compiler and other build tools | ||
in the `CMakeCache.txt` located in the build directory after running cmake. | ||
|
||
|
||
|
||
## Execution | ||
|
||
For execution of the benchmark run: | ||
|
||
./fFFT -f path_to_kernel.aocx | ||
|
||
For more information on available input parameters run | ||
|
||
./fFFT -h | ||
|
||
To execute the unit and integration tests run | ||
|
||
./Google_Tests_run | ||
|
||
in the `bin` folder within the build directory. | ||
It will run an emulation of the kernel and execute some functionality tests. | ||
|
||
## Output Interpretation | ||
|
||
The benchmark will print the following two tables to standard output after execution: | ||
|
||
res. error mach. eps | ||
2.67000e-01 1.19209e-07 | ||
|
||
avg best | ||
Time in s: 7.56801e-03 7.07241e-03 | ||
GFLOPS: 3.24735e-02 3.47491e-02 | ||
The first table contains the maximum residual error of the calculation and the | ||
machine epsilon that was used to calculate the residual error. | ||
The benchmark will perform a FFT with the FPGA kernel on random input data. | ||
In a second step the resulting data will be used as input for an iFFT using a CPU | ||
reference implementation in double precision. | ||
The residual error is then calculated with: | ||
|
||
![res=\frac{||x-x'||}{\epsilon*ld(n)}](https://latex.codecogs.com/gif.latex?res=\frac{||x-x'||}{\epsilon*ld(n)}) | ||
|
||
where `x` is the input data of the FFT, `x'` the resulting data from the iFFT, epsilon the machine epsilon and `n` the FFT size. | ||
|
||
In the second table the measured execution times and calculated FLOPs are given. | ||
It gives the average and best for both. | ||
The time gives the averaged execution time for a single FFT in case of a batched execution (an execution with more than one iteration). | ||
They are also used to calculate the FLOPs. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
# Performance Evaluation | ||
|
||
## Performance Model | ||
|
||
FFT1d kernel modelled here can be found in the Intel OpenCL Design Samples. | ||
The design follows the radix 2<sup>2</sup> FFT architecture, which consists of the following: | ||
|
||
1. ld(N) radix-2 butterflies | ||
2. trivial rotations at every even stage | ||
3. non-trivial rotations at every odd stage. This is the twiddle factor multiplication computed after the stage's butterfly. | ||
4. shuffling using shift registers | ||
|
||
The FFT is fully pipelined and the FFT step is unrolled over all ld(N) stages. | ||
Hence the performance is limited by the global memory to feed the pipeline with data. | ||
We will focus on modeling the fetch kernel that is loading the data from memory. | ||
The kernel pipeline can be expressed with the following equation: | ||
|
||
![t_{mempipeline}=\frac{\frac{s_{FFT}}{s_{bus}}}{f}](https://latex.codecogs.com/gif.latex?t_{mempipeline}=\frac{\frac{s_{FFT}}{s_{bus}}}{f}) | ||
|
||
where ![s_{FFT}](https://latex.codecogs.com/gif.latex?s_{FFT}) is the number of bytes needed to load from global memory for the FFT i.e. 4096 * 8B for a 4096 FFT with single precision complex values. | ||
![s_{bus}](https://latex.codecogs.com/gif.latex?s_{bus}) the bus width of the global memory in bytes. | ||
![f](https://latex.codecogs.com/gif.latex?f) is the kernel frequency. | ||
Moreover latency will be added to this operation for every DRAM page that has to be activated: | ||
|
||
![t_{memoverhead}=\frac{s_{FFT}}{s_{page}}*\(t_{RCD}+t_{RP}\)](https://latex.codecogs.com/gif.latex?t_{memoverhead}=\frac{s_{FFT}}{s_{page}}*\(t_{RCD}+t_{RP}\)) | ||
|
||
where ![s_{page}](https://latex.codecogs.com/gif.latex?s_{page}) is the size of a DRAM page in bytes. | ||
![t_{RCD}](https://latex.codecogs.com/gif.latex?t_{RCD}) and ![t_{RP}](https://latex.codecogs.com/gif.latex?t_{RP}) are the | ||
row address to column address delay and the row precharge time. | ||
|
||
So the total time for the memory accesses for the calculation of a single FFT is: | ||
|
||
![t_{mem}=t_{mempipeline}+t_{memoverhead}](https://latex.codecogs.com/gif.latex?t_{mem}=t_{mempipeline}+t_{memoverhead}) | ||
|
||
This model does not consider latencies of the calculation pipeline or of the memory but it holds for batched calculations where these latencies are hidden. | ||
If memory interleaving is used, t_memoverhead is also hidden by the access to subsequent memory banks. | ||
|
||
## Synthesis Results | ||
|
||
The kernel was synthesized with the following configuration for the Bittware 520N board: | ||
|
||
Name | Default | Description | | ||
---------------- |-------------|--------------------------------------| | ||
`DEFAULT_DEVICE` | -1 | Index of the default device (-1 = ask) | | ||
`DEFAULT_PLATFORM`| -1 | Index of the default platform (-1 = ask) | | ||
`FPGA_BOARD_NAME`| p520_hpc_sg280l | Name of the target board | | ||
`DEFAULT_REPETITIONS`| 10 | Number of times the kernel will be executed | | ||
`DEFAULT_ITERATIONS`| 5000 | Default number of iterations that is done with a single kernel execution| | ||
`LOG_FFT_SIZE` | 12 | Log2 of the FFT Size that has to be used i.e. 3 leads to a FFT Size of 2^3=8| | ||
`AOC_FLAGS`| `-fpc -fp-relaxed` | Additional AOC compiler flags that are used for kernel compilation | | ||
|
||
The used tool versions: | ||
|
||
Tool | Version | | ||
---------------- |---------| | ||
Intel OpenCL SDK | 19.4.0 | | ||
BSP | 19.2.0 | | ||
GCC | 8.3.0 | | ||
|
||
The resulting output: | ||
|
||
------------------------------------------------------------- | ||
Implementation of the FFT benchmark proposed in the HPCC benchmark suite for FPGA. | ||
Version: 1.0 | ||
------------------------------------------------------------- | ||
Summary: | ||
FFT Size: 4096 | ||
Data Size: 5000 * FFT Size * sizeof(cl_float) = 8.19200e+07 Byte | ||
Repetitions: 10 | ||
Kernel file: fft1d_float_8.aocx | ||
Device: p520_hpc_sg280l : BittWare Stratix 10 OpenCL platform (aclbitt_s10_pcie0) | ||
------------------------------------------------------------- | ||
Start benchmark using the given configuration. | ||
------------------------------------------------------------- | ||
res. error mach. eps | ||
3.17324e-01 1.19209e-07 | ||
|
||
avg best | ||
Time in s: 1.81336e-06 1.81170e-06 | ||
GFLOPS: 1.35528e+02 1.35652e+02 | ||
So the FFT implementation achieved 135.7 GFLOPs with a kernel frequency of 297.5MHz. | ||
The kernel uses memory interleaving so the model simplifies to: | ||
|
||
![t_{mem}=\frac{\frac{4096}{8}}{297.5MHz}=1.72\mu&space;s](https://latex.codecogs.com/gif.latex?t_{mem}=\frac{\frac{4096}{8}}{297.5MHz}=1.72\mu&space;s) | ||
|
||
which shows a 5.2% difference to the measurement that resulted in 1.81µs. | ||
The difference may be caused by the latencies of the global memory and the calculation pipeline. | ||
Also the store of the FFT result may interfere with the load operations since they use the same memory banks. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
#ifndef SRC_COMMON_PARAMETERS_H_ | ||
#define SRC_COMMON_PARAMETERS_H_ | ||
|
||
/** | ||
* Host specific parameters | ||
* | ||
* Every @NAME@ placeholder in this header is meant to be replaced by the value | ||
* of the CMake variable NAME when the project's CMakeLists.txt runs | ||
* configure_file() on the parameters.h.in template. | ||
* | ||
* VERSION: version string of the benchmark | ||
* DEFAULT_REPETITIONS: default number of repetitions | ||
* DEFAULT_ITERATIONS: default number of iterations done with a single kernel execution | ||
* DEFAULT_PLATFORM: index of the default platform to use | ||
* DEFAULT_DEVICE: index of the default device to use | ||
* HOST_DATA_TYPE: data type used by the host code, should match the data type of the used FFT | ||
* FFT_KERNEL_NAME: name of the kernel that is used for calculation | ||
* FETCH_KERNEL_NAME: name of the kernel that is used to fetch data from global memory | ||
*/ | ||
#define VERSION "@VERSION@" | ||
#define DEFAULT_REPETITIONS @DEFAULT_REPETITIONS@ | ||
#define DEFAULT_ITERATIONS @DEFAULT_ITERATIONS@ | ||
#define DEFAULT_PLATFORM @DEFAULT_PLATFORM@ | ||
#define DEFAULT_DEVICE @DEFAULT_DEVICE@ | ||
#define HOST_DATA_TYPE @HOST_DATA_TYPE@ | ||
#define FFT_KERNEL_NAME "@FFT_KERNEL_NAME@" | ||
#define FETCH_KERNEL_NAME "@FETCH_KERNEL_NAME@" | ||
|
||
/** | ||
* Kernel Parameters | ||
* | ||
* LOG_FFT_SIZE: log2 of the used FFT size | ||
* FFT_UNROLL: amount of global memory unrolling of the kernel, used by the host to calculate NDRange sizes | ||
*/ | ||
#define LOG_FFT_SIZE @LOG_FFT_SIZE@ | ||
#define FFT_UNROLL @FFT_UNROLL@ | ||
|
||
/** | ||
Output separator | ||
A line of dashes terminated by a newline character. | ||
*/ | ||
#define HLINE "-------------------------------------------------------------\n" | ||
|
||
#endif // SRC_COMMON_PARAMETERS_H_ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
|
||
# Include path handed to aoc: the "common" directory next to this directory
# in the build tree.
set(AOC_INCLUDES "-I${CMAKE_CURRENT_BINARY_DIR}/../common")

# generate_kernel_targets(<kernel_file_name>...)
#
# Creates the build rules for every kernel given by its file name without the
# .cl extension. The kernel source is expected in the current source
# directory. For a kernel <name> three custom targets are created:
#
#   <name>          runs aoc for the board FPGA_BOARD_NAME and writes <name>.aocx
#   <name>_report   runs aoc with -rtl -report and writes <name>_report
#   <name>_emulate  runs aoc for the emulator and writes <name>_emulate.aocx
#
# All outputs are written to EXECUTABLE_OUTPUT_PATH. AOC_INCLUDES and AOC_FLAGS
# are added to every aoc call.
function(generate_kernel_targets)
    # Generated header that is listed as a dependency of every kernel rule.
    set(parameters_h ${CMAKE_BINARY_DIR}/src/common/parameters.h)

    foreach (kernel_file_name ${ARGN})
        set(source_f ${CMAKE_CURRENT_SOURCE_DIR}/${kernel_file_name}.cl)
        set(report_f ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}_report)
        set(bitstream_emulate_f ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}_emulate.aocx)
        set(bitstream_f ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}.aocx)

        # The source file and the generated header are listed as DEPENDS of
        # every rule. Without them an already existing output is never rebuilt
        # after the kernel code or the configuration changed.
        add_custom_command(OUTPUT ${bitstream_emulate_f}
                COMMAND ${IntelFPGAOpenCL_AOC} ${source_f} ${AOC_INCLUDES} ${AOC_FLAGS} -legacy-emulator -march=emulator
                -o ${bitstream_emulate_f}
                DEPENDS ${source_f} ${parameters_h}
                VERBATIM
                )
        add_custom_command(OUTPUT ${bitstream_f}
                COMMAND ${IntelFPGAOpenCL_AOC} ${source_f} ${AOC_INCLUDES} ${AOC_FLAGS} -board=${FPGA_BOARD_NAME}
                -o ${bitstream_f}
                DEPENDS ${source_f} ${parameters_h}
                VERBATIM
                )
        add_custom_command(OUTPUT ${report_f}
                COMMAND ${IntelFPGAOpenCL_AOC} ${source_f} ${AOC_INCLUDES} ${AOC_FLAGS} -rtl -report -board=${FPGA_BOARD_NAME}
                -o ${report_f}
                DEPENDS ${source_f} ${parameters_h}
                VERBATIM
                )

        # Named targets that request the outputs of the rules above.
        add_custom_target(${kernel_file_name}_report
                DEPENDS ${report_f} ${source_f} ${parameters_h}
                SOURCES ${source_f} ${parameters_h})
        add_custom_target(${kernel_file_name}
                DEPENDS ${bitstream_f} ${source_f} ${parameters_h}
                SOURCES ${source_f} ${parameters_h})
        add_custom_target(${kernel_file_name}_emulate
                DEPENDS ${bitstream_emulate_f} ${source_f} ${parameters_h}
                SOURCES ${source_f} ${parameters_h})
    endforeach ()
endfunction()

generate_kernel_targets(fft1d_float_8)
Oops, something went wrong.