Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
Marius Meyer authored and Mellich committed Apr 22, 2020
0 parents commit 1d8dead
Show file tree
Hide file tree
Showing 205 changed files with 18,320 additions and 0 deletions.
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
*/.DS_Store
.DS_Store
cmake-*
.vscode
*._*
build-*
.idea
9 changes: 9 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[submodule "extern/cxxopts"]
path = extern/cxxopts
url = https://github.com/jarro2783/cxxopts.git
[submodule "extern/hlslib"]
path = extern/hlslib
url = https://github.com/definelicht/hlslib.git
[submodule "extern/googletest"]
path = extern/googletest
url = https://github.com/google/googletest.git
4 changes: 4 additions & 0 deletions FFT/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
cmake-*
.DS_Store
build-*
.idea
9 changes: 9 additions & 0 deletions FFT/CHANGELOG
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Changelog

This file contains all changes made to the source code for each release.

## 1.0

#### Added:
- Host code and OpenCL kernel from Intel FPGA SDK AOC examples
- Execution result for the Bittware 520N board with brief performance model
36 changes: 36 additions & 0 deletions FFT/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Top-level build script of the FFT benchmark for FPGA.
#
# It declares the user-tunable cache parameters, generates parameters.h from
# its template and pulls in the device (kernel), host and test sub-builds.
#
# All paths are expressed relative to PROJECT_SOURCE_DIR / PROJECT_BINARY_DIR
# instead of CMAKE_SOURCE_DIR / CMAKE_BINARY_DIR, so the configuration also
# resolves correctly when this directory is added to a larger build with
# add_subdirectory(). For a stand-alone build both spellings are identical.
cmake_minimum_required(VERSION 2.8.12)
project(fFFT)

# Benchmark version; substituted into parameters.h as the VERSION string.
set(VERSION 1.0)

# ---- Host defaults (override with -D<NAME>=<value> on the cmake command line)
set(DEFAULT_REPETITIONS 10 CACHE STRING "Default number of repetitions")
set(DEFAULT_ITERATIONS 100 CACHE STRING "Default number of iterations that is done with a single kernel execution")
set(HOST_DATA_TYPE cl_float CACHE STRING "Data type used by the host code. Should match the data type of the used FFT")
set(DEFAULT_DEVICE -1 CACHE STRING "Index of the default device to use")
set(DEFAULT_PLATFORM -1 CACHE STRING "Index of the default platform to use")
set(FPGA_BOARD_NAME p520_hpc_sg280l CACHE STRING "Name of the target FPGA board")

# ---- Kernel compiler flags
set(AOC_FLAGS "-fpc -fp-relaxed" CACHE STRING "Used flags for the AOC compiler")
# Split the space separated flag string into a CMake list so that every flag
# is handed to the compiler as a separate argument in the custom commands.
separate_arguments(AOC_FLAGS)

# ---- Kernel parameters
set(FFT_KERNEL_NAME fft1d CACHE STRING "Name of the kernel that is used for calculation")
set(FETCH_KERNEL_NAME fetch CACHE STRING "Name of the kernel that is used to fetch data from global memory")
set(LOG_FFT_SIZE 12 CACHE STRING "Log2 of the used FFT size")
set(FFT_UNROLL 8 CACHE STRING "Amount of global memory unrolling of the kernel. Will be used by the host to calculate NDRange sizes")

# Make the find modules shipped with the hlslib submodule available
# (needed for find_package(IntelFPGAOpenCL) below).
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/../extern/hlslib/cmake")

# Executables and kernel files are collected in <build dir>/bin.
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)

# Generate the parameter header shared by host and device code.
# @ONLY restricts substitution to @VAR@ placeholders, which is the only form
# the template uses; a stray ${...} in the header would be left untouched.
configure_file(
    "${PROJECT_SOURCE_DIR}/src/common/parameters.h.in"
    "${PROJECT_BINARY_DIR}/src/common/parameters.h"
    @ONLY
)

# Directory scope on purpose: the targets that consume the generated header
# are defined in the subdirectories added below.
include_directories(${PROJECT_BINARY_DIR}/src/common)

find_package(IntelFPGAOpenCL REQUIRED)

add_subdirectory(src/device)
add_subdirectory(src/host)
add_subdirectory(tests)

21 changes: 21 additions & 0 deletions FFT/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020 pc2

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
114 changes: 114 additions & 0 deletions FFT/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# FFT Benchmark for FPGA

This repository contains the FFT Benchmark for FPGA and its OpenCL kernels.
Currently only the Intel FPGA SDK for OpenCL utility is supported.

It is based on the FFT benchmark of the [HPC Challenge Benchmark](https://icl.utk.edu/hpcc/) suite.
The FFT1D reference implementation is used for the kernel code.

## Dependencies

The benchmark comes with the following requirements for building and running:

- CMake 2.8.12
- GCC 4.9
- Intel OpenCL FPGA SDK 19.3

It also contains submodules that will be automatically updated when running cmake:

- cxxopts: A header only library to parse command line parameters
- googletest: A C++ test framework

## Build

CMake is used as the build system.
The targets below can be used to build the benchmark and its kernels:

| Target | Description |
| -------- | ---------------------------------------------- |
| fFFT | Builds the host application |
| Google_Tests_run| Compile the tests and their dependencies |

Moreover, there are additional targets to generate kernel reports and bitstreams.
The provided kernel is optimized for Stratix 10 with 512bit LSUs.
The kernel targets are:

| Target | Description |
| -------- | ---------------------------------------------- |
| fft1d_float_8 | Synthesizes the kernel (takes several hours!) |
| fft1d_float_8_report | Create an HTML report for the kernel |
| fft1d_float_8_emulate | Create an emulation kernel |


For example, you can build the host application by running:

mkdir build && cd build
cmake ..
make fFFT

You will find all executables and kernel files in the `bin`
folder of your build directory.
You should always specify a target with make to reduce the build time!
You might want to specify predefined parameters before build:

Name | Default | Description |
---------------- |-------------|--------------------------------------|
`DEFAULT_DEVICE` | -1 | Index of the default device (-1 = ask) |
`DEFAULT_PLATFORM`| -1 | Index of the default platform (-1 = ask) |
`FPGA_BOARD_NAME`| p520_hpc_sg280l | Name of the target board |
`DEFAULT_REPETITIONS`| 10 | Number of times the kernel will be executed |
`DEFAULT_ITERATIONS`| 100 | Default number of iterations that is done with a single kernel execution|
`LOG_FFT_SIZE` | 12 | Log2 of the FFT Size that has to be used i.e. 3 leads to a FFT Size of 2^3=8|
`AOC_FLAGS`| `-fpc -fp-relaxed` | Additional AOC compiler flags that are used for kernel compilation |

Moreover the environment variable `INTELFPGAOCLSDKROOT` has to be set to the root
of the Intel FPGA SDK installation.

Additionally it is possible to set the used compiler and other build tools
in the `CMakeCache.txt` located in the build directory after running cmake.



## Execution

For execution of the benchmark run:

./fFFT -f path_to_kernel.aocx

For more information on available input parameters run

./fFFT -h

To execute the unit and integration tests run

./Google_Tests_run

in the `bin` folder within the build directory.
It will run an emulation of the kernel and execute some functionality tests.

## Output Interpretation

The benchmark will print the following two tables to standard output after execution:

res. error mach. eps
2.67000e-01 1.19209e-07

avg best
Time in s: 7.56801e-03 7.07241e-03
GFLOPS: 3.24735e-02 3.47491e-02
The first table contains the maximum residual error of the calculation and the
machine epsilon that was used to calculate the residual error.
The benchmark will perform an FFT with the FPGA kernel on random input data.
In a second step the resulting data will be used as input for an iFFT using a CPU
reference implementation in double precision.
The residual error is then calculated with:

![res=\frac{||x-x'||}{\epsilon*ld(n)}](https://latex.codecogs.com/gif.latex?res=\frac{||x-x'||}{\epsilon*ld(n)})

where `x` is the input data of the FFT, `x'` the resulting data from the iFFT, epsilon the machine epsilon and `n` the FFT size.

In the second table the measured execution times and calculated FLOPs are given.
It gives the average and best values for both.
The time gives the averaged execution time for a single FFT in case of a batched execution (an execution with more than one iteration).
They are also used to calculate the FLOPs.
90 changes: 90 additions & 0 deletions FFT/performance/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# Performance Evaluation

## Performance Model

The FFT1d kernel modelled here can be found in the Intel OpenCL Design Samples.
The design follows the radix 2<sup>2</sup> FFT architecture, which consists of the following:

1. ld(N) radix-2 butterflies
2. trivial rotations at every even stage
3. non-trivial rotations at every odd stage. This is the twiddle factor multiplication computed after the stage's butterfly.
4. shuffling using shift registers

The FFT is fully pipelined and the FFT step is unrolled over all ld(N) stages.
Hence the performance is limited by the global memory to feed the pipeline with data.
We will focus on modeling the fetch kernel that is loading the data from memory.
The kernel pipeline can be expressed with the following equation:

![t_{mempipeline}=\frac{\frac{s_{FFT}}{s_{bus}}}{f}](https://latex.codecogs.com/gif.latex?t_{mempipeline}=\frac{\frac{s_{FFT}}{s_{bus}}}{f})

where ![s_{FFT}](https://latex.codecogs.com/gif.latex?s_{FFT}) is the number of bytes that have to be loaded from global memory for the FFT, i.e. 4096 * 8B for a 4096-point FFT with single precision complex values.
![s_{bus}](https://latex.codecogs.com/gif.latex?s_{bus}) the bus width of the global memory in bytes.
![f](https://latex.codecogs.com/gif.latex?f) is the kernel frequency.
Moreover latency will be added to this operation for every DRAM page that has to be activated:

![t_{memoverhead}=\frac{s_{FFT}}{s_{page}}*\(t_{RCD}+t_{RP}\)](https://latex.codecogs.com/gif.latex?t_{memoverhead}=\frac{s_{FFT}}{s_{page}}*\(t_{RCD}+t_{RP}\))

where ![s_{page}](https://latex.codecogs.com/gif.latex?s_{page}) is the size of a DRAM page in bytes.
![t_{RCD}](https://latex.codecogs.com/gif.latex?t_{RCD}) and ![t_{RP}](https://latex.codecogs.com/gif.latex?t_{RP}) are the
row address to column address delay and the row precharge time.

So the total time for the memory accesses for the calculation of a single FFT is:

![t_{mem}=t_{mempipeline}+t_{memoverhead}](https://latex.codecogs.com/gif.latex?t_{mem}=t_{mempipeline}+t_{memoverhead})

This model does not consider latencies of the calculation pipeline or of the memory but it holds for batched calculations where these latencies are hidden.
If memory interleaving is used, t_memoverhead is also hidden by the access to subsequent memory banks.

## Synthesis Results

The kernel was synthesized with the following configuration for the Bittware 520N board:

Name | Value | Description |
---------------- |-------------|--------------------------------------|
`DEFAULT_DEVICE` | -1 | Index of the default device (-1 = ask) |
`DEFAULT_PLATFORM`| -1 | Index of the default platform (-1 = ask) |
`FPGA_BOARD_NAME`| p520_hpc_sg280l | Name of the target board |
`DEFAULT_REPETITIONS`| 10 | Number of times the kernel will be executed |
`DEFAULT_ITERATIONS`| 5000 | Default number of iterations that is done with a single kernel execution|
`LOG_FFT_SIZE` | 12 | Log2 of the FFT Size that has to be used i.e. 3 leads to a FFT Size of 2^3=8|
`AOC_FLAGS`| `-fpc -fp-relaxed` | Additional AOC compiler flags that are used for kernel compilation |

The used tool versions:

Tool | Version |
---------------- |---------|
Intel OpenCL SDK | 19.4.0 |
BSP | 19.2.0 |
GCC | 8.3.0 |

The resulting output:

-------------------------------------------------------------
Implementation of the FFT benchmark proposed in the HPCC benchmark suite for FPGA.
Version: 1.0
-------------------------------------------------------------
Summary:
FFT Size: 4096
Data Size: 5000 * FFT Size * sizeof(cl_float) = 8.19200e+07 Byte
Repetitions: 10
Kernel file: fft1d_float_8.aocx
Device: p520_hpc_sg280l : BittWare Stratix 10 OpenCL platform (aclbitt_s10_pcie0)
-------------------------------------------------------------
Start benchmark using the given configuration.
-------------------------------------------------------------
res. error mach. eps
3.17324e-01 1.19209e-07

avg best
Time in s: 1.81336e-06 1.81170e-06
GFLOPS: 1.35528e+02 1.35652e+02
So the FFT implementation achieved 135.7 GFLOPs with a kernel frequency of 297.5MHz.
The kernel uses memory interleaving so the model simplifies to:

![t_{mem}=\frac{\frac{4096}{8}}{297.5MHz}=1.72\mu&space;s](https://latex.codecogs.com/gif.latex?t_{mem}=\frac{\frac{4096}{8}}{297.5MHz}=1.72\mu&space;s)

which shows a 5.2% difference to the measurement that resulted in 1.81µs.
The difference may be caused by the latencies of the global memory and the calculation pipeline.
Also the store of the FFT result may interfere with the load operations since they use the same memory banks.

27 changes: 27 additions & 0 deletions FFT/src/common/parameters.h.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#ifndef SRC_COMMON_PARAMETERS_H_
#define SRC_COMMON_PARAMETERS_H_

/**
* Host specific parameters
*/
#define VERSION "@VERSION@"
#define DEFAULT_REPETITIONS @DEFAULT_REPETITIONS@
#define DEFAULT_ITERATIONS @DEFAULT_ITERATIONS@
#define DEFAULT_PLATFORM @DEFAULT_PLATFORM@
#define DEFAULT_DEVICE @DEFAULT_DEVICE@
#define HOST_DATA_TYPE @HOST_DATA_TYPE@
#define FFT_KERNEL_NAME "@FFT_KERNEL_NAME@"
#define FETCH_KERNEL_NAME "@FETCH_KERNEL_NAME@"

/**
* Kernel Parameters
*/
#define LOG_FFT_SIZE @LOG_FFT_SIZE@
#define FFT_UNROLL @FFT_UNROLL@

/**
Output separator
*/
#define HLINE "-------------------------------------------------------------\n"

#endif // SRC_COMMON_PARAMETERS_H_
35 changes: 35 additions & 0 deletions FFT/src/device/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@

# Include path handed to the kernel compiler so that the kernel sources can
# include the generated parameters.h (build tree, sibling "common" directory).
set(AOC_INCLUDES "-I${CMAKE_CURRENT_BINARY_DIR}/../common")

# generate_kernel_targets(<kernel> [<kernel> ...])
#
# For every given kernel name (file name without the .cl extension, located in
# this directory) three custom targets are created:
#
#   <kernel>          full bitstream for FPGA_BOARD_NAME   -> <kernel>.aocx
#   <kernel>_emulate  emulation bitstream                  -> <kernel>_emulate.aocx
#   <kernel>_report   compiler run with -rtl -report       -> <kernel>_report
#
# All outputs are written to EXECUTABLE_OUTPUT_PATH.
#
# Reads from the calling scope: AOC_INCLUDES, AOC_FLAGS, FPGA_BOARD_NAME,
# EXECUTABLE_OUTPUT_PATH and IntelFPGAOpenCL_AOC (presumably provided by
# find_package(IntelFPGAOpenCL) -- confirm against the find module).
function(generate_kernel_targets)
    # The generated header is an input of every kernel compilation.
    set(parameters_h ${PROJECT_BINARY_DIR}/src/common/parameters.h)

    foreach (kernel_file_name ${ARGN})
        set(source_f ${CMAKE_CURRENT_SOURCE_DIR}/${kernel_file_name}.cl)
        set(report_f ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}_report)
        set(bitstream_emulate_f ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}_emulate.aocx)
        set(bitstream_f ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}.aocx)

        # Every rule lists the kernel source and the generated header in
        # DEPENDS. Without them an already existing output would never be
        # regenerated after the kernel or a build parameter changed.
        add_custom_command(OUTPUT ${bitstream_emulate_f}
                COMMAND ${IntelFPGAOpenCL_AOC} ${source_f} ${AOC_INCLUDES} ${AOC_FLAGS} -legacy-emulator -march=emulator
                -o ${bitstream_emulate_f}
                DEPENDS ${source_f} ${parameters_h}
                VERBATIM
                )
        add_custom_command(OUTPUT ${bitstream_f}
                COMMAND ${IntelFPGAOpenCL_AOC} ${source_f} ${AOC_INCLUDES} ${AOC_FLAGS} -board=${FPGA_BOARD_NAME}
                -o ${bitstream_f}
                DEPENDS ${source_f} ${parameters_h}
                VERBATIM
                )
        add_custom_command(OUTPUT ${report_f}
                COMMAND ${IntelFPGAOpenCL_AOC} ${source_f} ${AOC_INCLUDES} ${AOC_FLAGS} -rtl -report -board=${FPGA_BOARD_NAME}
                -o ${report_f}
                DEPENDS ${source_f} ${parameters_h}
                VERBATIM
                )

        # User facing targets. SOURCES only makes the files show up in IDEs;
        # the actual input tracking is done by the rules above.
        add_custom_target(${kernel_file_name}_report
                DEPENDS ${report_f}
                SOURCES ${source_f} ${parameters_h})
        add_custom_target(${kernel_file_name}
                DEPENDS ${bitstream_f}
                SOURCES ${source_f} ${parameters_h})
        add_custom_target(${kernel_file_name}_emulate
                DEPENDS ${bitstream_emulate_f}
                SOURCES ${source_f} ${parameters_h})
    endforeach ()
endfunction()

generate_kernel_targets(fft1d_float_8)
Loading

0 comments on commit 1d8dead

Please sign in to comment.