Initial commit

pc2 · Apr 22, 2020 · 1d8dead · 1d8dead
commit 1d8dead
Show file tree

Hide file tree

Showing 205 changed files with 18,320 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,7 @@
+*/.DS_Store
+.DS_Store
+cmake-*
+.vscode
+*._*
+build-*
+.idea
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,9 @@
+[submodule "extern/cxxopts"]
+	path = extern/cxxopts
+	url = https://github.com/jarro2783/cxxopts.git
+[submodule "extern/hlslib"]
+	path = extern/hlslib
+	url = https://github.com/definelicht/hlslib.git
+[submodule "extern/googletest"]
+	path = extern/googletest
+	url = https://github.com/google/googletest.git
diff --git a/FFT/.gitignore b/FFT/.gitignore
@@ -0,0 +1,4 @@
+cmake-*
+.DS_Store
+build-*
+.idea
diff --git a/FFT/CHANGELOG b/FFT/CHANGELOG
@@ -0,0 +1,9 @@
+# Changelog
+
+This file contains all changes made to the source code for each release.
+
+## 1.0
+
+#### Added:
+- Host code and OpenCL kernel from Intel FPGA SDK AOC examples
+- Execution result for the Bittware 520N board with brief performance model
diff --git a/FFT/CMakeLists.txt b/FFT/CMakeLists.txt
@@ -0,0 +1,36 @@
+# Top-level build configuration of the FFT benchmark for FPGA.
+# Defines the configurable benchmark parameters, generates parameters.h from
+# them and adds the device, host and test sub-directories.
+cmake_minimum_required(VERSION 2.8.12)
+project(fFFT)
+
+# Host specific parameters. They are substituted into parameters.h below.
+set(VERSION 1.0)
+set(DEFAULT_REPETITIONS 10 CACHE STRING "Default number of repetitions")
+set(DEFAULT_ITERATIONS 100 CACHE STRING "Default number of iterations that is done with a single kernel execution")
+set(HOST_DATA_TYPE cl_float CACHE STRING "Data type used by the host code. Should match the data type of the used FFT")
+set(DEFAULT_DEVICE -1 CACHE STRING "Index of the default device to use")
+set(DEFAULT_PLATFORM -1 CACHE STRING "Index of the default platform to use")
+set(FPGA_BOARD_NAME p520_hpc_sg280l CACHE STRING "Name of the target FPGA board")
+
+# The cache entry holds the flags as one space separated string.
+# separate_arguments() turns it into a list so every flag is passed to the
+# compiler as its own command line argument.
+set(AOC_FLAGS "-fpc -fp-relaxed" CACHE STRING "Used flags for the AOC compiler")
+separate_arguments(AOC_FLAGS)
+
+
+# Kernel specific parameters. They are substituted into parameters.h below.
+set(FFT_KERNEL_NAME fft1d CACHE STRING "Name of the kernel that is used for calculation")
+set(FETCH_KERNEL_NAME fetch CACHE STRING "Name of the kernel that is used to fetch data from global memory")
+set(LOG_FFT_SIZE 12 CACHE STRING "Log2 of the used FFT size")
+set(FFT_UNROLL 8 CACHE STRING "Amount of global memory unrolling of the kernel. Will be used by the host to calculate NDRange sizes")
+
+# Paths are given relative to this project (PROJECT_*_DIR) instead of the
+# top of the whole build tree (CMAKE_*_DIR), so they stay valid when this
+# directory is added as a sub-project of another build.
+list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/../extern/hlslib/cmake")
+set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
+
+# Generate the parameter header from the values configured above.
+# @ONLY restricts the substitution to the @VAR@ placeholders of the template.
+configure_file(
+        "${PROJECT_SOURCE_DIR}/src/common/parameters.h.in"
+        "${PROJECT_BINARY_DIR}/src/common/parameters.h"
+        @ONLY
+)
+
+# Make the generated parameters.h visible to all targets defined below.
+include_directories(${PROJECT_BINARY_DIR}/src/common)
+
+# NOTE(review): the find module is presumably provided by the hlslib cmake
+# directory appended to CMAKE_MODULE_PATH above - confirm.
+find_package(IntelFPGAOpenCL REQUIRED)
+
+add_subdirectory(src/device)
+add_subdirectory(src/host)
+add_subdirectory(tests)
+
diff --git a/FFT/LICENSE b/FFT/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020 pc2
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/FFT/README.md b/FFT/README.md
@@ -0,0 +1,114 @@
+# FFT Benchmark for FPGA
+
+This repository contains the FFT Benchmark for FPGA and its OpenCL kernels.
+Currently only the Intel FPGA SDK for OpenCL utility is supported.
+
+It is based on the FFT benchmark of the [HPC Challenge Benchmark](https://icl.utk.edu/hpcc/) suite.
+The FFT1D reference implementation is used for the kernel code.
+
+## Dependencies
+
+The benchmark comes with the following requirements for building and running:
+
+- CMake 2.8
+- GCC 4.9
+- Intel OpenCL FPGA SDK 19.3
+
+It also contains submodules that will be automatically updated when running cmake:
+
+- cxxopts: A header only library to parse command line parameters
+- googletest: A C++ test framework
+
+## Build
+
+CMake is used as the build system.
+The targets below can be used to build the benchmark and its kernels:
+
+ |  Target  | Description                                    |
+ | -------- | ---------------------------------------------- |
+ | fFFT     | Builds the host application                    |
+ | Google_Tests_run| Compile the tests and its dependencies  |
+
+ Moreover, there are additional targets to generate kernel reports and bitstreams.
+ The provided kernel is optimized for Stratix 10 with 512bit LSUs.
+ The kernel targets are:
+
+  |  Target  | Description                                    |
+  | -------- | ---------------------------------------------- |
+  | fft1d_float_8          | Synthesizes the kernel (takes several hours!)  |
+  | fft1d_float_8_report   | Create an HTML report for the kernel    |
+  | fft1d_float_8_emulate  | Create an emulation kernel              |
+
+
+ You can build for example the host application by running
+
+    mkdir build && cd build
+    cmake ..
+    make fFFT
+
+You will find all executables and kernel files in the `bin`
+folder of your build directory.
+You should always specify a target with make to reduce the build time!
+You might want to specify predefined parameters before build:
+
+Name             | Default     | Description                          |
+---------------- |-------------|--------------------------------------|
+`DEFAULT_DEVICE` | -1          | Index of the default device (-1 = ask) |
+`DEFAULT_PLATFORM`| -1          | Index of the default platform (-1 = ask) |
+`FPGA_BOARD_NAME`| p520_hpc_sg280l | Name of the target board |
+`DEFAULT_REPETITIONS`| 10          | Number of times the kernel will be executed |
+`DEFAULT_ITERATIONS`| 100          | Default number of iterations that is done with a single kernel execution|
+`LOG_FFT_SIZE`   | 12          | Log2 of the FFT Size that has to be used i.e. 3 leads to a FFT Size of 2^3=8|
+`AOC_FLAGS`| `-fpc -fp-relaxed` | Additional AOC compiler flags that are used for kernel compilation |
+
+Moreover the environment variable `INTELFPGAOCLSDKROOT` has to be set to the root
+of the Intel FPGA SDK installation.
+
+Additionally it is possible to set the used compiler and other build tools 
+in the `CMakeCache.txt` located in the build directory after running cmake.
+
+
+
+## Execution
+
+For execution of the benchmark run:
+
+    ./fFFT -f path_to_kernel.aocx
+
+For more information on available input parameters run
+
+    ./fFFT -h
+
+To execute the unit and integration tests run
+
+    ./Google_Tests_run
+
+in the `bin` folder within the build directory.
+It will run an emulation of the kernel and execute some functionality tests.
+
+## Output Interpretation
+
+The benchmark will print the following two tables to standard output after execution:
+
+       res. error    mach. eps
+      2.67000e-01  1.19209e-07
+
+                           avg         best
+       Time in s:  7.56801e-03  7.07241e-03
+          GFLOPS:  3.24735e-02  3.47491e-02
+          
+The first table contains the maximum residual error of the calculation and the
+machine epsilon that was used to calculate the residual error.
+The benchmark will perform a FFT with the FPGA kernel on random input data.
+In a second step the resulting data will be used as input for an iFFT using a CPU
+reference implementation in double precision.
+The residual error is then calculated with:
+
+![res=\frac{||x-x'||}{\epsilon*ld(n)}](https://latex.codecogs.com/gif.latex?res=\frac{||x-x'||}{\epsilon*ld(n)})
+
+where `x` is the input data of the FFT, `x'` the resulting data from the iFFT, epsilon the machine epsilon and `n` the FFT size.
+
+In the second table the measured execution times and calculated FLOPs are given.
+It gives the average and best for both.
+The time gives the averaged execution time for a single FFT in case of a batched execution (an execution with more than one iteration).
+They are also used to calculate the FLOPs.
diff --git a/FFT/performance/README.md b/FFT/performance/README.md
@@ -0,0 +1,90 @@
+# Performance Evaluation
+
+## Performance Model
+
+FFT1d kernel modelled here can be found in the Intel OpenCL Design Samples. 
+The design follows the radix 2<sup>2</sup> FFT architecture, which consists of the following:
+
+1. ld(N) radix-2 butterflies
+2. trivial rotations at every even stage
+3. non-trivial rotations at every odd stage. This is the twiddle factor multiplication computed after the stage's butterfly.
+4. shuffling using shift registers
+
+The FFT is fully pipelined and the FFT step is unrolled over all ld(N) stages.
+Hence the performance is limited by the global memory to feed the pipeline with data.
+We will focus on modeling the fetch kernel that is loading the data from memory.
+The kernel pipeline can be expressed with the following equation:
+
+![t_{mempipeline}=\frac{\frac{s_{FFT}}{s_{bus}}}{f}](https://latex.codecogs.com/gif.latex?t_{mempipeline}=\frac{\frac{s_{FFT}}{s_{bus}}}{f}) 
+
+where ![s_{FFT}](https://latex.codecogs.com/gif.latex?s_{FFT}) is the number of bytes that have to be loaded from global memory for the FFT, i.e. 4096 * 8B for a 4096 FFT with single precision complex values.
+![s_{bus}](https://latex.codecogs.com/gif.latex?s_{bus}) the bus width of the global memory in bytes.
+![f](https://latex.codecogs.com/gif.latex?f) is the kernel frequency.
+Moreover latency will be added to this operation for every DRAM page that has to be activated:
+
+![t_{memoverhead}=\frac{s_{FFT}}{s_{page}}*\(t_{RCD}+t_{RP}\)](https://latex.codecogs.com/gif.latex?t_{memoverhead}=\frac{s_{FFT}}{s_{page}}*\(t_{RCD}+t_{RP}\))
+
+where ![s_{page}](https://latex.codecogs.com/gif.latex?s_{page}) is the size of a DRAM page in bytes.
+![t_{RCD}](https://latex.codecogs.com/gif.latex?t_{RCD}) and ![t_{RP}](https://latex.codecogs.com/gif.latex?t_{RP}) are the
+row address to column address delay and the row precharge time.
+
+So the total time for the memory accesses for the calculation of a single FFT is:
+
+![t_{mem}=t_{mempipeline}+t_{memoverhead}](https://latex.codecogs.com/gif.latex?t_{mem}=t_{mempipeline}+t_{memoverhead}) 
+
+This model does not consider latencies of the calculation pipeline or of the memory but it holds for batched calculations where these latencies are hidden.
+If memory interleaving is used, t_memoverhead is also hidden by the access to subsequent memory banks.
+
+## Synthesis Results
+
+The kernel was synthesized with the following configuration for the Bittware 520N board:
+
+Name             | Default     | Description                          |
+---------------- |-------------|--------------------------------------|
+`DEFAULT_DEVICE` | -1          | Index of the default device (-1 = ask) |
+`DEFAULT_PLATFORM`| -1          | Index of the default platform (-1 = ask) |
+`FPGA_BOARD_NAME`| p520_hpc_sg280l | Name of the target board |
+`DEFAULT_REPETITIONS`| 10          | Number of times the kernel will be executed |
+`DEFAULT_ITERATIONS`| 5000          | Default number of iterations that is done with a single kernel execution|
+`LOG_FFT_SIZE`   | 12          | Log2 of the FFT Size that has to be used i.e. 3 leads to a FFT Size of 2^3=8|
+`AOC_FLAGS`| `-fpc -fp-relaxed` | Additional AOC compiler flags that are used for kernel compilation |
+
+The used tool versions:
+
+Tool             | Version |
+---------------- |---------|
+Intel OpenCL SDK | 19.4.0  |
+BSP              | 19.2.0  |
+GCC              | 8.3.0   |
+
+The resulting output:
+
+    -------------------------------------------------------------
+    Implementation of the FFT benchmark proposed in the HPCC benchmark suite for FPGA.
+    Version: 1.0
+    -------------------------------------------------------------
+    Summary:
+    FFT Size:            4096
+    Data Size:           5000 * FFT Size * sizeof(cl_float) = 8.19200e+07 Byte
+    Repetitions:         10
+    Kernel file:         fft1d_float_8.aocx
+    Device:              p520_hpc_sg280l : BittWare Stratix 10 OpenCL platform (aclbitt_s10_pcie0)
+    -------------------------------------------------------------
+    Start benchmark using the given configuration.
+    -------------------------------------------------------------
+       res. error    mach. eps
+      3.17324e-01  1.19209e-07
+
+                           avg         best
+       Time in s:  1.81336e-06  1.81170e-06
+          GFLOPS:  1.35528e+02  1.35652e+02
+          
+So the FFT implementation achieved 135.7 GFLOPs with a kernel frequency of 297.5MHz.
+The kernel uses memory interleaving so the model simplifies to:
+
+![t_{mem}=\frac{\frac{4096}{8}}{297.5MHz}=1.72\mu&space;s](https://latex.codecogs.com/gif.latex?t_{mem}=\frac{\frac{4096}{8}}{297.5MHz}=1.72\mu&space;s) 
+
+which shows a 5.2% difference to the measurement that resulted in 1.81µs.
+The difference may be caused by the latencies of the global memory and the calculation pipeline.
+Also the store of the FFT result may interfere with the load operations since they use the same memory banks.
+
diff --git a/FFT/src/common/parameters.h.in b/FFT/src/common/parameters.h.in
@@ -0,0 +1,27 @@
+/**
+ * Template for the parameter header shared by the benchmark sources.
+ * Every @NAME@ placeholder below is meant to be replaced by the build
+ * system with the value of the configuration variable of the same name.
+ */
+#ifndef SRC_COMMON_PARAMETERS_H_
+#define SRC_COMMON_PARAMETERS_H_
+
+/**
+ * Host specific parameters
+ */
+#define VERSION "@VERSION@"
+#define DEFAULT_REPETITIONS @DEFAULT_REPETITIONS@
+#define DEFAULT_ITERATIONS @DEFAULT_ITERATIONS@
+#define DEFAULT_PLATFORM @DEFAULT_PLATFORM@
+#define DEFAULT_DEVICE @DEFAULT_DEVICE@
+#define HOST_DATA_TYPE @HOST_DATA_TYPE@
+#define FFT_KERNEL_NAME "@FFT_KERNEL_NAME@"
+#define FETCH_KERNEL_NAME "@FETCH_KERNEL_NAME@"
+
+/**
+ * Kernel Parameters
+ */
+#define LOG_FFT_SIZE @LOG_FFT_SIZE@
+#define FFT_UNROLL @FFT_UNROLL@
+
+/**
+ * Output separator: a line of dashes terminated by a newline
+ */
+#define HLINE "-------------------------------------------------------------\n"
+
+#endif // SRC_COMMON_PARAMETERS_H_
diff --git a/FFT/src/device/CMakeLists.txt b/FFT/src/device/CMakeLists.txt
@@ -0,0 +1,35 @@
+
+# Include path passed to aoc so that the kernel sources find the generated
+# parameters.h in the common directory of the build tree.
+set(AOC_INCLUDES "-I${CMAKE_CURRENT_BINARY_DIR}/../common")
+
+# generate_kernel_targets(<kernel> [<kernel> ...])
+#
+# Creates three build targets for every given kernel name. <kernel>.cl has to
+# exist in the current source directory:
+#   <kernel>          compiles the kernel for the board FPGA_BOARD_NAME
+#   <kernel>_report   runs aoc with -rtl -report for the board FPGA_BOARD_NAME
+#   <kernel>_emulate  compiles the kernel with -march=emulator
+# All outputs are written to EXECUTABLE_OUTPUT_PATH.
+#
+# Example: generate_kernel_targets(fft1d_float_8)
+function(generate_kernel_targets)
+    # Generated header used by the kernels; a change has to trigger a rebuild.
+    set(parameters_h ${CMAKE_BINARY_DIR}/src/common/parameters.h)
+    foreach (kernel_file_name ${ARGN})
+        set(source_f ${CMAKE_CURRENT_SOURCE_DIR}/${kernel_file_name}.cl)
+        set(report_f ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}_report)
+        set(bitstream_emulate_f ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}_emulate.aocx)
+        set(bitstream_f ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}.aocx)
+        # The file dependencies are attached to the rules that create the
+        # outputs. Otherwise an already existing output would never be
+        # regenerated after the kernel source or parameters.h changed.
+        add_custom_command(OUTPUT ${bitstream_emulate_f}
+                COMMAND ${IntelFPGAOpenCL_AOC} ${source_f} ${AOC_INCLUDES} ${AOC_FLAGS} -legacy-emulator -march=emulator
+                -o ${bitstream_emulate_f}
+                DEPENDS ${source_f} ${parameters_h}
+                VERBATIM
+                )
+        add_custom_command(OUTPUT ${bitstream_f}
+                COMMAND ${IntelFPGAOpenCL_AOC} ${source_f} ${AOC_INCLUDES} ${AOC_FLAGS} -board=${FPGA_BOARD_NAME}
+                -o ${bitstream_f}
+                DEPENDS ${source_f} ${parameters_h}
+                VERBATIM
+                )
+        add_custom_command(OUTPUT ${report_f}
+                COMMAND ${IntelFPGAOpenCL_AOC} ${source_f} ${AOC_INCLUDES} ${AOC_FLAGS} -rtl -report -board=${FPGA_BOARD_NAME}
+                -o ${report_f}
+                DEPENDS ${source_f} ${parameters_h}
+                VERBATIM
+                )
+        # Named targets that request the outputs of the rules above.
+        # SOURCES only lists the files in the target for IDE generators.
+        add_custom_target(${kernel_file_name}_report
+                DEPENDS ${report_f}
+                SOURCES ${source_f} ${parameters_h})
+        add_custom_target(${kernel_file_name}
+                DEPENDS ${bitstream_f}
+                SOURCES ${source_f} ${parameters_h})
+        add_custom_target(${kernel_file_name}_emulate
+                DEPENDS ${bitstream_emulate_f}
+                SOURCES ${source_f} ${parameters_h})
+    endforeach ()
+endfunction()
+
+generate_kernel_targets(fft1d_float_8)