Skip to content

Commit

Permalink
Merge branch 'add-support-s10mxhbm' into 'master'
Browse files Browse the repository at this point in the history
Add support Stratix 10 MX with HBM2

See merge request pc2/HPCC_FPGA!10
  • Loading branch information
Marius Meyer committed May 20, 2020
2 parents c73b039 + 8881ee1 commit 6bfd62f
Show file tree
Hide file tree
Showing 14 changed files with 490 additions and 40 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ All benchmarks come with the following build dependencies:
- CMake >= 3.1
- C++ compiler with C++11 support
- Intel OpenCL FPGA SDK or Xilinx Vitis
- Python 3 with [pandas](https://pandas.pydata.org) installed (for the evaluation scripts)
- Python 3 for code generation and with [pandas](https://pandas.pydata.org) installed for the evaluation scripts

Moreover the host code and the build system use additional libraries included as git submodules:

Expand Down Expand Up @@ -86,6 +86,7 @@ For the Intel compiler these are:
Name | Default | Description |
---------------- |-------------|--------------------------------------|
`AOC_FLAGS`| `-fpc -fp-relaxed -no-interleaving=default` | Additional Intel AOC compiler flags that are used for kernel compilation |
`INTEL_CODE_GENERATION_SETTINGS` | "" | Path to the settings file that will be used as input for the code generator script. It may contain additional variables or functions. |

For the Xilinx compiler it is also necessary to set settings files for the compile and link step of the compiler.
The available options are given in the following table:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@

global_memory_name = "HBM"

def generate_attributes(num_replications, num_global_memory_banks=32):
"""
Generates the kernel attributes for the global memory. They specify in which
global memory the buffer is located. The buffers will be placed using a
round robin scheme using the available global memory banks and the number of
replications that should be generated (e.g. if a global memory contains multiple banks)
@param num_replications Number okernel replications
@param num_global_memory_banks Number of global memory banks that should be used for generation
@return Array of strings that contain the attributes for every kernel
"""
global_memory_names = [ "%s%d" % (global_memory_name, i) for i in range(num_global_memory_banks)]
return [ "__attribute__((buffer_location(\"%s\")))"
% (global_memory_names[i % num_global_memory_banks])
for i in range(num_replications)]
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@

# Set number of available SLRs
# PY_CODE_GEN num_slrs = 3

[connectivity]
nk=accessMemory_0:{TOTAL_KERNEL_NUMBER}
nk=accessMemory_0:$PY_CODE_GEN num_replications$

# slrs
slr=accessMemory_0_{KERNEL_NUMBER}:SLR{KERNEL_NUMBER_DEC}
# Assign kernels to the SLRs
# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
slr=accessMemory_0_$PY_CODE_GEN i+1$:SLR$PY_CODE_GEN i % num_slrs$
# PY_CODE_GEN block_end

# matrix ports
sp=accessMemory_0_{KERNEL_NUMBER}.m_axi_gmem:HBM[{KERNEL_NUMBER_DEC}]
# Assign the kernels to the memory ports
# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
sp=accessMemory_0_$PY_CODE_GEN i+1$.m_axi_gmem:HBM[$PY_CODE_GEN i$]
# PY_CODE_GEN block_end
13 changes: 12 additions & 1 deletion RandomAccess/src/device/random_access_kernels_single.cl
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,15 @@ Constant used to update the pseudo random number
*/
#define POLY 7

/* PY_CODE_GEN
try:
kernel_param_attributes = generate_attributes(num_replications)
except:
kernel_param_attributes = ["" for i in range(num_replications)]
*/

// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]

/*
Kernel, that will update the given data array accoring to a predefined pseudo-
random access scheme. The overall data array might be equally split between
Expand All @@ -39,7 +48,7 @@ to the kernel.
*/
__attribute__((max_global_work_dim(0)))
__kernel
void accessMemory_KERNEL_NUMBER(__global DEVICE_DATA_TYPE_UNSIGNED volatile * restrict data,
void accessMemory_/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]*/ DEVICE_DATA_TYPE_UNSIGNED volatile * restrict data,
const DEVICE_DATA_TYPE_UNSIGNED m,
const DEVICE_DATA_TYPE_UNSIGNED data_chunk,
const uint kernel_number) {
Expand Down Expand Up @@ -109,3 +118,5 @@ void accessMemory_KERNEL_NUMBER(__global DEVICE_DATA_TYPE_UNSIGNED volatile * r
}
}
}

// PY_CODE_GEN block_end
39 changes: 39 additions & 0 deletions STREAM/scripts/build_s10xm_hbm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/bin/bash
#
# Synthesize the STREAM single kernel for the Stratix 10 MX HBM board on Noctua.
# Submit this script to sbatch in this folder!
#
#SBATCH -p fpgasyn
#SBATCH --exclusive

module load intelFPGA_pro/19.4.0
module load intel_s10mx/19.3.0
module load lang/Python/3.7.0-foss-2018b
module load devel/CMake/3.15.3-GCCcore-8.3.0

SCRIPT_PATH=${SLURM_SUBMIT_DIR}

BENCHMARK_DIR=${SCRIPT_PATH}/../

BUILD_DIR_4K=${SCRIPT_PATH}/../../build/synth/STREAM-s10xm_hbm-4k
BUILD_DIR_8K=${SCRIPT_PATH}/../../build/synth/STREAM-s10xm_hbm-8k

mkdir -p ${BUILD_DIR_4K}
cd ${BUILD_DIR_4K}

cmake ${BENCHMARK_DIR} -DDEVICE_BUFFER_SIZE=4096 -DVECTOR_COUNT=8 -DNUM_REPLICATIONS=32 \
-DAOC_FLAGS="-fpc -fp-relaxed -global-ring" \
-DINTEL_CODE_GENERATION_SETTINGS=${BENCHMARK_DIR}/settings/settings.gen.intel.stream_kernels_single.s10mxhbm.py

make stream_kernels_single_intel&

mkdir -p ${BUILD_DIR_8K}
cd ${BUILD_DIR_8K}

cmake ${BENCHMARK_DIR} -DDEVICE_BUFFER_SIZE=8192 -DVECTOR_COUNT=8 -DNUM_REPLICATIONS=32 \
-DAOC_FLAGS="-fpc -fp-relaxed -global-ring" \
-DINTEL_CODE_GENERATION_SETTINGS=${BENCHMARK_DIR}/settings/settings.gen.intel.stream_kernels_single.s10mxhbm.py

make stream_kernels_single_intel&

wait
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@

global_memory_name = "HBM"

def generate_attributes(num_replications, num_global_memory_banks=32):
"""
Generates the kernel attributes for the global memory. They specify in which
global memory the buffer is located. The buffers will be placed using a
round robin scheme using the available global memory banks and the number of
replications that should be generated (e.g. if a global memory contains multiple banks)
@param num_replications Number okernel replications
@param num_global_memory_banks Number of global memory banks that should be used for generation
@return Array of strings that contain the attributes for every kernel
"""
global_memory_names = [ "%s%d" % (global_memory_name, i) for i in range(num_global_memory_banks)]
return [ "__attribute__((buffer_location(\"%s\")))"
% (global_memory_names[i % num_global_memory_banks])
for i in range(num_replications)]
Original file line number Diff line number Diff line change
@@ -1,17 +1,25 @@

# Set number of available SLRs
# PY_CODE_GEN num_slrs = 3

[connectivity]
nk=copy_0:{TOTAL_KERNEL_NUMBER}
nk=scale_0:{TOTAL_KERNEL_NUMBER}
nk=add_0:{TOTAL_KERNEL_NUMBER}
nk=triad_0:{TOTAL_KERNEL_NUMBER}
nk=copy_0:$PY_CODE_GEN num_replications$
nk=scale_0:$PY_CODE_GEN num_replications$
nk=add_0:$PY_CODE_GEN num_replications$
nk=triad_0:$PY_CODE_GEN num_replications$

# slrs
slr=copy_0_{KERNEL_NUMBER}:SLR{KERNEL_NUMBER_DEC}
slr=scale_0_{KERNEL_NUMBER}:SLR{KERNEL_NUMBER_DEC}
slr=add_0_{KERNEL_NUMBER}:SLR{KERNEL_NUMBER_DEC}
slr=triad_0_{KERNEL_NUMBER}:SLR{KERNEL_NUMBER_DEC}
# Assign kernels to the SLRs
# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
slr=copy_0_$PY_CODE_GEN i+1$:SLR$PY_CODE_GEN i % num_slrs$
slr=scale_0_$PY_CODE_GEN i+1$:SLR$PY_CODE_GEN i % num_slrs$
slr=add_0_$PY_CODE_GEN i+1$:SLR$PY_CODE_GEN i % num_slrs$
slr=triad_0_$PY_CODE_GEN i+1$:SLR$PY_CODE_GEN i % num_slrs$
# PY_CODE_GEN block_end

# matrix ports
sp=copy_0_{KERNEL_NUMBER}.m_axi_gmem:HBM[0:2]
sp=scale_0_{KERNEL_NUMBER}.m_axi_gmem:HBM[0:2]
sp=add_0_{KERNEL_NUMBER}.m_axi_gmem:HBM[0:2]
sp=triad_0_{KERNEL_NUMBER}.m_axi_gmem:HBM[0:2]
# Assign the kernels to the memory ports
# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
sp=copy_0_$PY_CODE_GEN i+1$.m_axi_gmem:HBM[$PY_CODE_GEN 2*i$:$PY_CODE_GEN 2*i+1$]
sp=scale_0_$PY_CODE_GEN i+1$.m_axi_gmem:HBM[$PY_CODE_GEN 2*i$:$PY_CODE_GEN 2*i+1$]
sp=add_0_$PY_CODE_GEN i+1$.m_axi_gmem:HBM[$PY_CODE_GEN 2*i$:$PY_CODE_GEN 2*i+1$]
sp=triad_0_$PY_CODE_GEN i+1$.m_axi_gmem:HBM[$PY_CODE_GEN 2*i$:$PY_CODE_GEN 2*i+1$]
# PY_CODE_GEN block_end
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@


# Set number of available SLRs
# PY_CODE_GEN num_slrs = 3

[connectivity]
nk=calc_0:{TOTAL_KERNEL_NUMBER}
nk=calc_0:$PY_CODE_GEN num_replications$

# slrs
slr=calc_0_{KERNEL_NUMBER}:SLR{KERNEL_NUMBER_DEC}
# Assign kernels to the SLRs
# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
slr=calc_0_$PY_CODE_GEN i+1$:SLR$PY_CODE_GEN i % num_slrs$
# PY_CODE_GEN block_end

# matrix ports
sp=calc_0_{KERNEL_NUMBER}.m_axi_gmem:HBM[{KERNEL_NUMBER_DEC}]
# Assign the kernels to the memory ports
# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
sp=calc_0_$PY_CODE_GEN i+1$.m_axi_gmem:HBM[$PY_CODE_GEN i$]
# PY_CODE_GEN block_end
12 changes: 8 additions & 4 deletions STREAM/src/device/stream_kernels.cl
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@ KERNEL_NUMBER will be replaced by the build script with the ID of the current re
*/
#include "parameters.h"

// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]

__kernel
__attribute__((uses_global_work_offset(0)))
void copy_KERNEL_NUMBER(__global const DEVICE_ARRAY_DATA_TYPE * restrict in,
void copy_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in,
__global DEVICE_ARRAY_DATA_TYPE * restrict out,
const uint array_size) {
uint number_elements = array_size / VECTOR_COUNT;
Expand All @@ -20,7 +22,7 @@ void copy_KERNEL_NUMBER(__global const DEVICE_ARRAY_DATA_TYPE * restrict in,

__kernel
__attribute__((uses_global_work_offset(0)))
void add_KERNEL_NUMBER(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1,
void add_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1,
__global const DEVICE_ARRAY_DATA_TYPE * restrict in2,
__global DEVICE_ARRAY_DATA_TYPE * restrict out,
const uint array_size) {
Expand All @@ -33,7 +35,7 @@ void add_KERNEL_NUMBER(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1,

__kernel
__attribute__((uses_global_work_offset(0)))
void scale_KERNEL_NUMBER(__global const DEVICE_ARRAY_DATA_TYPE * restrict in,
void scale_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in,
__global DEVICE_ARRAY_DATA_TYPE * restrict out,
const DEVICE_SCALAR_DATA_TYPE scalar,
const uint array_size) {
Expand All @@ -46,7 +48,7 @@ void scale_KERNEL_NUMBER(__global const DEVICE_ARRAY_DATA_TYPE * restrict in,

__kernel
__attribute__((uses_global_work_offset(0)))
void triad_KERNEL_NUMBER(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1,
void triad_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1,
__global const DEVICE_ARRAY_DATA_TYPE * restrict in2,
__global DEVICE_ARRAY_DATA_TYPE * restrict out,
const DEVICE_SCALAR_DATA_TYPE scalar,
Expand All @@ -57,3 +59,5 @@ void triad_KERNEL_NUMBER(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1,
out[i] = in1[i] + scalar * in2[i];
}
}

// PY_CODE_GEN block_end
23 changes: 17 additions & 6 deletions STREAM/src/device/stream_kernels_single.cl
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,28 @@ KERNEL_NUMBER will be replaced by the build script with the ID of the current re
*/
#include "parameters.h"

/* PY_CODE_GEN
try:
kernel_param_attributes = generate_attributes(num_replications)
except:
kernel_param_attributes = ["" for i in range(num_replications)]
*/
// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
__kernel
__attribute__((uses_global_work_offset(0)))
void calc_KERNEL_NUMBER(__global const DEVICE_ARRAY_DATA_TYPE *restrict in1,
__global const DEVICE_ARRAY_DATA_TYPE *restrict in2,
__global DEVICE_ARRAY_DATA_TYPE *restrict out,
void calc_/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]*/ const DEVICE_ARRAY_DATA_TYPE *restrict in1,
__global /*PY_CODE_GEN kernel_param_attributes[i]*/ const DEVICE_ARRAY_DATA_TYPE *restrict in2,
__global /*PY_CODE_GEN kernel_param_attributes[i]*/ DEVICE_ARRAY_DATA_TYPE *restrict out,
const DEVICE_SCALAR_DATA_TYPE scalar,
const uint array_size,
const uint operation_type) {
#ifndef INNER_LOOP_BUFFERS
DEVICE_ARRAY_DATA_TYPE buffer1[BUFFER_SIZE];
#endif
uint number_elements = array_size / VECTOR_COUNT;
#ifdef INTEL_FPGA
#pragma disable_loop_pipelining
#endif
for(uint i = 0;i<number_elements;i += BUFFER_SIZE){
#ifdef INNER_LOOP_BUFFERS
DEVICE_ARRAY_DATA_TYPE buffer1[BUFFER_SIZE];
Expand All @@ -38,8 +48,9 @@ void calc_KERNEL_NUMBER(__global const DEVICE_ARRAY_DATA_TYPE *restrict in1,
// Calculate result and write back to output array depending on chosen operation type
__attribute__((opencl_unroll_hint(UNROLL_COUNT)))
for (uint k = 0;k<BUFFER_SIZE;k++) {
out[i + k] = buffer1[k];

}
out[i + k] = buffer1[k];
}
}
}

// PY_CODE_GEN block_end
3 changes: 3 additions & 0 deletions cmake/general_benchmark_build_setup.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,12 @@ if (VITIS_FOUND)
endif()
if (INTELFPGAOPENCL_FOUND)
set(AOC_FLAGS "-fpc -fp-relaxed -no-interleaving=default" CACHE STRING "Used flags for the AOC compiler")
set(INTEL_CODE_GENERATION_SETTINGS "" CACHE FILEPATH "Code generation settings file for the intel targets")
separate_arguments(AOC_FLAGS)
endif()

set(CODE_GENERATOR "${CMAKE_SOURCE_DIR}/../scripts/code_generator/generator.py" CACHE FILEPATH "Path to the code generator executable")

# Add subdirectories of the project
add_subdirectory(${CMAKE_SOURCE_DIR}/src/device)
add_subdirectory(${CMAKE_SOURCE_DIR}/src/host)
Expand Down
18 changes: 14 additions & 4 deletions cmake/kernelTargets.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,13 @@ function(generate_kernel_targets_xilinx)
set(bitstream_emulate_f
${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}_emulate.xclbin)
set(bitstream_f ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}.xclbin)
file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/settings)
if (XILINX_GENERATE_LINK_SETTINGS)
set(gen_xilinx_link_settings ${XILINX_LINK_SETTINGS_FILE})
set(xilinx_link_settings ${CMAKE_BINARY_DIR}/settings/settings.link.xilinx.${kernel_file_name}.ini)
else()
set(gen_xilinx_link_settings ${XILINX_LINK_SETTINGS_FILE})
set(xilinx_link_settings ${XILINX_LINK_SETTINGS_FILE})
set(xilinx_link_settings ${CMAKE_BINARY_DIR}/settings/settings.link.xilinx.${kernel_file_name}.ini)
endif()
set(xilinx_report_folder "--report_dir=${EXECUTABLE_OUTPUT_PATH}/xilinx_reports")
file(MAKE_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}_reports)
Expand All @@ -38,14 +39,19 @@ function(generate_kernel_targets_xilinx)
)
if (XILINX_GENERATE_LINK_SETTINGS)
add_custom_command(OUTPUT ${xilinx_link_settings}
COMMAND ${CMAKE_COMMAND} -Dsettings_f=${xilinx_link_settings} -Dbase_file=${gen_xilinx_link_settings} -DNUM_REPLICATIONS=${NUM_REPLICATIONS} -P "${CMAKE_SOURCE_DIR}/../cmake/generateXilinxSettings.cmake"
COMMAND ${CODE_GENERATOR} -o ${xilinx_link_settings} -p num_replications=${NUM_REPLICATIONS} --comment "\"#\"" --comment-ml-start "\"$$\"" --comment-ml-end "\"$$\"" ${gen_xilinx_link_settings}
MAIN_DEPENDENCY ${gen_xilinx_link_settings}
)
else()
add_custom_command(OUTPUT ${xilinx_link_settings}
COMMAND cp ${gen_xilinx_link_settings} ${xilinx_link_settings}
MAIN_DEPENDENCY ${gen_xilinx_link_settings}
)
endif()

if (KERNEL_REPLICATION_ENABLED)
add_custom_command(OUTPUT ${source_f}
COMMAND ${CMAKE_COMMAND} -Dsource_f=${source_f} -Dbase_file=${base_file} -DNUM_REPLICATIONS=1 -P "${CMAKE_SOURCE_DIR}/../cmake/generateKernels.cmake"
COMMAND ${CODE_GENERATOR} -o ${source_f} -p num_replications=1 ${base_file}
MAIN_DEPENDENCY ${base_file}
)
else()
Expand Down Expand Up @@ -104,8 +110,12 @@ function(generate_kernel_targets_intel)
set(bitstream_emulate_f ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}_emulate.aocx)
set(bitstream_f ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}.aocx)
if (KERNEL_REPLICATION_ENABLED)
set(codegen_parameters -p num_replications=${NUM_REPLICATIONS})
if (INTEL_CODE_GENERATION_SETTINGS)
list(APPEND codegen_parameters -p "\"use_file('${INTEL_CODE_GENERATION_SETTINGS}')\"")
endif()
add_custom_command(OUTPUT ${source_f}
COMMAND ${CMAKE_COMMAND} -Dsource_f=${source_f} -Dbase_file=${base_file} -DNUM_REPLICATIONS=${NUM_REPLICATIONS} -P "${CMAKE_SOURCE_DIR}/../cmake/generateKernels.cmake"
COMMAND ${CODE_GENERATOR} -o ${source_f} ${codegen_parameters} ${base_file}
MAIN_DEPENDENCY ${base_file}
)
else()
Expand Down
Loading

0 comments on commit 6bfd62f

Please sign in to comment.