diff --git a/README.md b/README.md index 8bf40c4b..405ac454 100755 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ All benchmarks come with the following build dependencies: - CMake >= 3.1 - C++ compiler with C++11 support - Intel OpenCL FPGA SDK or Xilinx Vitis -- Python 3 with [pandas](https://pandas.pydata.org) installed (for the evaluation scripts) +- Python 3 for code generation and with [pandas](https://pandas.pydata.org) installed for the evaluation scripts Moreover the host code and the build system use additional libraries included as git submodules: @@ -86,6 +86,7 @@ For the Intel compiler these are: Name | Default | Description | ---------------- |-------------|--------------------------------------| `AOC_FLAGS`| `-fpc -fp-relaxed -no-interleaving=default` | Additional Intel AOC compiler flags that are used for kernel compilation | +`INTEL_CODE_GENERATION_SETTINGS` | "" | Path to the settings file that will be used as input for the code generator script. It may contain additional variables or functions. | For the Xilinx compiler it is also necessary to set settings files for the compile and link step of the compiler. The available options are given in the following table: diff --git a/RandomAccess/settings/settings.gen.intel.random_access_kernels_single.s10mxhbm.py b/RandomAccess/settings/settings.gen.intel.random_access_kernels_single.s10mxhbm.py new file mode 100644 index 00000000..030a9d1e --- /dev/null +++ b/RandomAccess/settings/settings.gen.intel.random_access_kernels_single.s10mxhbm.py @@ -0,0 +1,19 @@ + +global_memory_name = "HBM" + +def generate_attributes(num_replications, num_global_memory_banks=32): + """ + Generates the kernel attributes for the global memory. They specify in which + global memory the buffer is located. The buffers will be placed using a + round robin scheme using the available global memory banks and the number of + replications that should be generated (e.g. 
if a global memory contains multiple banks) + + @param num_replications Number of kernel replications + @param num_global_memory_banks Number of global memory banks that should be used for generation + + @return Array of strings that contain the attributes for every kernel + """ + global_memory_names = [ "%s%d" % (global_memory_name, i) for i in range(num_global_memory_banks)] + return [ "__attribute__((buffer_location(\"%s\")))" + % (global_memory_names[i % num_global_memory_banks]) + for i in range(num_replications)] \ No newline at end of file diff --git a/RandomAccess/settings/settings.link.xilinx.random_access_kernels_single.hbm.generator.ini b/RandomAccess/settings/settings.link.xilinx.random_access_kernels_single.hbm.generator.ini index b4a1a4b4..986bab03 100644 --- a/RandomAccess/settings/settings.link.xilinx.random_access_kernels_single.hbm.generator.ini +++ b/RandomAccess/settings/settings.link.xilinx.random_access_kernels_single.hbm.generator.ini @@ -1,8 +1,16 @@ + +# Set number of available SLRs +# PY_CODE_GEN num_slrs = 3 + [connectivity] -nk=accessMemory_0:{TOTAL_KERNEL_NUMBER} +nk=accessMemory_0:$PY_CODE_GEN num_replications$ -# slrs -slr=accessMemory_0_{KERNEL_NUMBER}:SLR{KERNEL_NUMBER_DEC} +# Assign kernels to the SLRs +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +slr=accessMemory_0_$PY_CODE_GEN i+1$:SLR$PY_CODE_GEN i % num_slrs$ +# PY_CODE_GEN block_end -# matrix ports -sp=accessMemory_0_{KERNEL_NUMBER}.m_axi_gmem:HBM[{KERNEL_NUMBER_DEC}] +# Assign the kernels to the memory ports +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=accessMemory_0_$PY_CODE_GEN i+1$.m_axi_gmem:HBM[$PY_CODE_GEN i$] +# PY_CODE_GEN block_end diff --git a/RandomAccess/src/device/random_access_kernels_single.cl b/RandomAccess/src/device/random_access_kernels_single.cl index 13ab9194..341616de 100644 --- a/RandomAccess/src/device/random_access_kernels_single.cl +++ 
b/RandomAccess/src/device/random_access_kernels_single.cl @@ -27,6 +27,15 @@ Constant used to update the pseudo random number */ #define POLY 7 +/* PY_CODE_GEN +try: + kernel_param_attributes = generate_attributes(num_replications) +except: + kernel_param_attributes = ["" for i in range(num_replications)] +*/ + +// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] + /* Kernel, that will update the given data array accoring to a predefined pseudo- random access scheme. The overall data array might be equally split between @@ -39,7 +48,7 @@ to the kernel. */ __attribute__((max_global_work_dim(0))) __kernel -void accessMemory_KERNEL_NUMBER(__global DEVICE_DATA_TYPE_UNSIGNED volatile * restrict data, +void accessMemory_/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]*/ DEVICE_DATA_TYPE_UNSIGNED volatile * restrict data, const DEVICE_DATA_TYPE_UNSIGNED m, const DEVICE_DATA_TYPE_UNSIGNED data_chunk, const uint kernel_number) { @@ -109,3 +118,5 @@ void accessMemory_KERNEL_NUMBER(__global DEVICE_DATA_TYPE_UNSIGNED volatile * r } } } + +// PY_CODE_GEN block_end diff --git a/STREAM/scripts/build_s10xm_hbm.sh b/STREAM/scripts/build_s10xm_hbm.sh new file mode 100755 index 00000000..de4962b3 --- /dev/null +++ b/STREAM/scripts/build_s10xm_hbm.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# +# Synthesize the STREAM single kernel for the Stratix 10 MX HBM board on Noctua. +# Submit this script to sbatch in this folder! 
+# +#SBATCH -p fpgasyn +#SBATCH --exclusive + +module load intelFPGA_pro/19.4.0 +module load intel_s10mx/19.3.0 +module load lang/Python/3.7.0-foss-2018b +module load devel/CMake/3.15.3-GCCcore-8.3.0 + +SCRIPT_PATH=${SLURM_SUBMIT_DIR} + +BENCHMARK_DIR=${SCRIPT_PATH}/../ + +BUILD_DIR_4K=${SCRIPT_PATH}/../../build/synth/STREAM-s10xm_hbm-4k +BUILD_DIR_8K=${SCRIPT_PATH}/../../build/synth/STREAM-s10xm_hbm-8k + +mkdir -p ${BUILD_DIR_4K} +cd ${BUILD_DIR_4K} + +cmake ${BENCHMARK_DIR} -DDEVICE_BUFFER_SIZE=4096 -DVECTOR_COUNT=8 -DNUM_REPLICATIONS=32 \ + -DAOC_FLAGS="-fpc -fp-relaxed -global-ring" \ + -DINTEL_CODE_GENERATION_SETTINGS=${BENCHMARK_DIR}/settings/settings.gen.intel.stream_kernels_single.s10mxhbm.py + +make stream_kernels_single_intel& + +mkdir -p ${BUILD_DIR_8K} +cd ${BUILD_DIR_8K} + +cmake ${BENCHMARK_DIR} -DDEVICE_BUFFER_SIZE=8192 -DVECTOR_COUNT=8 -DNUM_REPLICATIONS=32 \ + -DAOC_FLAGS="-fpc -fp-relaxed -global-ring" \ + -DINTEL_CODE_GENERATION_SETTINGS=${BENCHMARK_DIR}/settings/settings.gen.intel.stream_kernels_single.s10mxhbm.py + +make stream_kernels_single_intel& + +wait diff --git a/STREAM/settings/settings.gen.intel.stream_kernels_single.s10mxhbm.py b/STREAM/settings/settings.gen.intel.stream_kernels_single.s10mxhbm.py new file mode 100644 index 00000000..030a9d1e --- /dev/null +++ b/STREAM/settings/settings.gen.intel.stream_kernels_single.s10mxhbm.py @@ -0,0 +1,19 @@ + +global_memory_name = "HBM" + +def generate_attributes(num_replications, num_global_memory_banks=32): + """ + Generates the kernel attributes for the global memory. They specify in which + global memory the buffer is located. The buffers will be placed using a + round robin scheme using the available global memory banks and the number of + replications that should be generated (e.g. 
if a global memory contains multiple banks) + + @param num_replications Number of kernel replications + @param num_global_memory_banks Number of global memory banks that should be used for generation + + @return Array of strings that contain the attributes for every kernel + """ + global_memory_names = [ "%s%d" % (global_memory_name, i) for i in range(num_global_memory_banks)] + return [ "__attribute__((buffer_location(\"%s\")))" + % (global_memory_names[i % num_global_memory_banks]) + for i in range(num_replications)] \ No newline at end of file diff --git a/STREAM/settings/settings.link.xilinx.stream_kernels.hbm.generator.ini b/STREAM/settings/settings.link.xilinx.stream_kernels.hbm.generator.ini index 30617e7a..bf914f10 100644 --- a/STREAM/settings/settings.link.xilinx.stream_kernels.hbm.generator.ini +++ b/STREAM/settings/settings.link.xilinx.stream_kernels.hbm.generator.ini @@ -1,17 +1,25 @@ + +# Set number of available SLRs +# PY_CODE_GEN num_slrs = 3 + [connectivity] -nk=copy_0:{TOTAL_KERNEL_NUMBER} -nk=scale_0:{TOTAL_KERNEL_NUMBER} -nk=add_0:{TOTAL_KERNEL_NUMBER} -nk=triad_0:{TOTAL_KERNEL_NUMBER} +nk=copy_0:$PY_CODE_GEN num_replications$ +nk=scale_0:$PY_CODE_GEN num_replications$ +nk=add_0:$PY_CODE_GEN num_replications$ +nk=triad_0:$PY_CODE_GEN num_replications$ -# slrs -slr=copy_0_{KERNEL_NUMBER}:SLR{KERNEL_NUMBER_DEC} -slr=scale_0_{KERNEL_NUMBER}:SLR{KERNEL_NUMBER_DEC} -slr=add_0_{KERNEL_NUMBER}:SLR{KERNEL_NUMBER_DEC} -slr=triad_0_{KERNEL_NUMBER}:SLR{KERNEL_NUMBER_DEC} +# Assign kernels to the SLRs +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +slr=copy_0_$PY_CODE_GEN i+1$:SLR$PY_CODE_GEN i % num_slrs$ +slr=scale_0_$PY_CODE_GEN i+1$:SLR$PY_CODE_GEN i % num_slrs$ +slr=add_0_$PY_CODE_GEN i+1$:SLR$PY_CODE_GEN i % num_slrs$ +slr=triad_0_$PY_CODE_GEN i+1$:SLR$PY_CODE_GEN i % num_slrs$ +# PY_CODE_GEN block_end -# matrix ports -sp=copy_0_{KERNEL_NUMBER}.m_axi_gmem:HBM[0:2] 
-sp=scale_0_{KERNEL_NUMBER}.m_axi_gmem:HBM[0:2] -sp=add_0_{KERNEL_NUMBER}.m_axi_gmem:HBM[0:2] -sp=triad_0_{KERNEL_NUMBER}.m_axi_gmem:HBM[0:2] +# Assign the kernels to the memory ports +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=copy_0_$PY_CODE_GEN i+1$.m_axi_gmem:HBM[$PY_CODE_GEN 2*i$:$PY_CODE_GEN 2*i+1$] +sp=scale_0_$PY_CODE_GEN i+1$.m_axi_gmem:HBM[$PY_CODE_GEN 2*i$:$PY_CODE_GEN 2*i+1$] +sp=add_0_$PY_CODE_GEN i+1$.m_axi_gmem:HBM[$PY_CODE_GEN 2*i$:$PY_CODE_GEN 2*i+1$] +sp=triad_0_$PY_CODE_GEN i+1$.m_axi_gmem:HBM[$PY_CODE_GEN 2*i$:$PY_CODE_GEN 2*i+1$] +# PY_CODE_GEN block_end diff --git a/STREAM/settings/settings.link.xilinx.stream_kernels_single.hbm.generator.ini b/STREAM/settings/settings.link.xilinx.stream_kernels_single.hbm.generator.ini index f64070f3..eca40fa3 100644 --- a/STREAM/settings/settings.link.xilinx.stream_kernels_single.hbm.generator.ini +++ b/STREAM/settings/settings.link.xilinx.stream_kernels_single.hbm.generator.ini @@ -1,8 +1,17 @@ + + +# Set number of available SLRs +# PY_CODE_GEN num_slrs = 3 + [connectivity] -nk=calc_0:{TOTAL_KERNEL_NUMBER} +nk=calc_0:$PY_CODE_GEN num_replications$ -# slrs -slr=calc_0_{KERNEL_NUMBER}:SLR{KERNEL_NUMBER_DEC} +# Assign kernels to the SLRs +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +slr=calc_0_$PY_CODE_GEN i+1$:SLR$PY_CODE_GEN i % num_slrs$ +# PY_CODE_GEN block_end -# matrix ports -sp=calc_0_{KERNEL_NUMBER}.m_axi_gmem:HBM[{KERNEL_NUMBER_DEC}] +# Assign the kernels to the memory ports +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=calc_0_$PY_CODE_GEN i+1$.m_axi_gmem:HBM[$PY_CODE_GEN i$] +# PY_CODE_GEN block_end diff --git a/STREAM/src/device/stream_kernels.cl b/STREAM/src/device/stream_kernels.cl index b266127f..cd569727 100644 --- a/STREAM/src/device/stream_kernels.cl +++ b/STREAM/src/device/stream_kernels.cl @@ -6,9 +6,11 @@ KERNEL_NUMBER will be 
replaced by the build script with the ID of the current re */ #include "parameters.h" +// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] + __kernel __attribute__((uses_global_work_offset(0))) -void copy_KERNEL_NUMBER(__global const DEVICE_ARRAY_DATA_TYPE * restrict in, +void copy_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in, __global DEVICE_ARRAY_DATA_TYPE * restrict out, const uint array_size) { uint number_elements = array_size / VECTOR_COUNT; @@ -20,7 +22,7 @@ void copy_KERNEL_NUMBER(__global const DEVICE_ARRAY_DATA_TYPE * restrict in, __kernel __attribute__((uses_global_work_offset(0))) -void add_KERNEL_NUMBER(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1, +void add_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1, __global const DEVICE_ARRAY_DATA_TYPE * restrict in2, __global DEVICE_ARRAY_DATA_TYPE * restrict out, const uint array_size) { @@ -33,7 +35,7 @@ void add_KERNEL_NUMBER(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1, __kernel __attribute__((uses_global_work_offset(0))) -void scale_KERNEL_NUMBER(__global const DEVICE_ARRAY_DATA_TYPE * restrict in, +void scale_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in, __global DEVICE_ARRAY_DATA_TYPE * restrict out, const DEVICE_SCALAR_DATA_TYPE scalar, const uint array_size) { @@ -46,7 +48,7 @@ void scale_KERNEL_NUMBER(__global const DEVICE_ARRAY_DATA_TYPE * restrict in, __kernel __attribute__((uses_global_work_offset(0))) -void triad_KERNEL_NUMBER(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1, +void triad_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1, __global const DEVICE_ARRAY_DATA_TYPE * restrict in2, __global DEVICE_ARRAY_DATA_TYPE * restrict out, const DEVICE_SCALAR_DATA_TYPE scalar, @@ -57,3 +59,5 @@ void triad_KERNEL_NUMBER(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1, out[i] = in1[i] + scalar * in2[i]; } } + +// PY_CODE_GEN block_end diff 
--git a/STREAM/src/device/stream_kernels_single.cl b/STREAM/src/device/stream_kernels_single.cl index 48e1aa30..80754015 100644 --- a/STREAM/src/device/stream_kernels_single.cl +++ b/STREAM/src/device/stream_kernels_single.cl @@ -7,11 +7,18 @@ KERNEL_NUMBER will be replaced by the build script with the ID of the current re */ #include "parameters.h" +/* PY_CODE_GEN +try: + kernel_param_attributes = generate_attributes(num_replications) +except: + kernel_param_attributes = ["" for i in range(num_replications)] +*/ +// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] __kernel __attribute__((uses_global_work_offset(0))) -void calc_KERNEL_NUMBER(__global const DEVICE_ARRAY_DATA_TYPE *restrict in1, - __global const DEVICE_ARRAY_DATA_TYPE *restrict in2, - __global DEVICE_ARRAY_DATA_TYPE *restrict out, +void calc_/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]*/ const DEVICE_ARRAY_DATA_TYPE *restrict in1, + __global /*PY_CODE_GEN kernel_param_attributes[i]*/ const DEVICE_ARRAY_DATA_TYPE *restrict in2, + __global /*PY_CODE_GEN kernel_param_attributes[i]*/ DEVICE_ARRAY_DATA_TYPE *restrict out, const DEVICE_SCALAR_DATA_TYPE scalar, const uint array_size, const uint operation_type) { @@ -19,6 +26,9 @@ void calc_KERNEL_NUMBER(__global const DEVICE_ARRAY_DATA_TYPE *restrict in1, DEVICE_ARRAY_DATA_TYPE buffer1[BUFFER_SIZE]; #endif uint number_elements = array_size / VECTOR_COUNT; +#ifdef INTEL_FPGA +#pragma disable_loop_pipelining +#endif for(uint i = 0;i(.|\n)+?)%s" % (ml_comment_symbol_start, pycodegen_cmd, ml_comment_symbol_end), code_block, flags=0), + re.finditer("%s\\s+(?!block_start\\s+)(?!block_end\\s+)(?P(.)+?)\n" % (pragma_cmd), code_block, flags=0)) + for res_ml in matches: + res_ml_code = res_ml.group(0) + try: + evaluated = str(eval(res_ml.groupdict()["code"], variables)) + code_block = code_block.replace(res_ml_code, evaluated) + logging.debug("Evaluated '%s' to '%s'" % (res_ml.groupdict()["code"], 
evaluated)) + continue + except Exception as e: + logging.debug("Failed to evaluate inline code") + try: + exec(res_ml.groupdict()["code"], globals()) + code_block = code_block.replace(res_ml_code, "") + logging.debug("Executed in global space: '%s'" % res_ml.groupdict()["code"]) + except Exception as e: + logging.warning("Could not execute inline code:\n\tCommand: '''\n%s\n'''\n\tError: %s" % (res_ml.groupdict()["code"], e)) + return code_block + + +def modify_block(code_block, cmd_str, out): + global CODE + CODE = code_block + if cmd_str == "": + cmd_str = "None" + try: + mod_code = eval(cmd_str, {**globals(), **locals()}) + except Exception as e: + logging.error("Block: %s \n %s" % (code_block, e)) + logging.error("Global variables: %s" % globals()) + print( "Block: %s \n %s" % (code_block, e),file=sys.stderr) + exit(1) + if type(mod_code) is list: + mod_code = "".join(mod_code) + elif mod_code is None: + mod_code = "" + elif type(mod_code) is not str: + logging.warning("%s is not a string. Automatic convert to string!" % mod_code) + mod_code = str(mod_code) + return mod_code + #logging.debug("Start parsing of modified sub-block") + #parse_string(mod_code, out) + #logging.debug("Finished parsing of modified sub-block") + + +def parse_string(code_string, out): + try: + code_string = replace(code_string) + for res in re.finditer("%s\\s+block_start\\s+(?P.*)\n(?P(.|\n)+?)%s\\s+block_end\\s*\n" % (pragma_cmd, pragma_cmd), code_string, flags=0): + logging.debug("Found block match!") + d = res.groupdict() + code_block = d["code"] + logging.debug("Modify the block!") + code_block = modify_block(code_block, d["cmd"], out) + code_string = code_string.replace(res.group(0), code_block) + logging.debug("Parsing complete. 
Write result to file.") + output.write(code_string) + except Exception as e: + logging.error("Block: %s \n %s" % (code_string, e)) + logging.error("Global variables: %s" % globals()) + logging.error("Local variables: %s" % locals()) + print( "Error while parsing code block: %s \n %s" % (e),file=sys.stderr) + + +def parse_file(file_name, out): + """ + Opens a single source code file and applies the changes to it. + + The function will output the modified source code into the given output stream. + + @param file_name The path to the source code file relative to the current working directory + @param out Output stream that is used to output the modified source code + """ + try: + with open(file_name) as f: + parse_string(f.read(), out) + except Exception as e: + logging.error("Error when opening and parsing file %s: %s" % (file_name, e)) + print("Error occurred when parsing file. See logs for more details.",file=sys.stderr) + + + + +if __name__=="__main__": + args = parser.parse_args() + if args.output_file: + log_file_name = args.output_file + ".log" + else: + log_file_name = "generator.log" + logging.basicConfig(filename=log_file_name, filemode='w', level=logging.DEBUG) + output = sys.stdout + for p in args.params: + logging.debug("Parse statement: %s" % p) + exec(p, globals()) + if args.output_file: + logging.debug("Use output file: %s" % args.output_file) + output = open(args.output_file, 'w') + comment_symbol = re.escape(args.comment_symbol) + ml_comment_symbol_start = re.escape(args.comment_symbol_ml_start) + ml_comment_symbol_end = re.escape(args.comment_symbol_ml_end) + pragma_cmd = comment_symbol +"\\s*"+ pycodegen_cmd + logging.debug("Use pragma command: %s", pragma_cmd) + logging.debug("Start parsing file: %s" % args.file) + parse_file(args.file, output)