diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1b1812b0..8d0bf4ae 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -9,7 +9,7 @@ default: tags: - jacamar before_script: - - module load intelFPGA_pro/20.4.0_max bittware_520n/20.4.0_max intel devel/CMake/3.15.3-GCCcore-8.3.0 + - module load intelFPGA_pro/20.4.0 bittware_520n/20.4.0_max intel devel/CMake/3.15.3-GCCcore-8.3.0 ### # @@ -38,7 +38,50 @@ build:STREAM: - scripts/**/* - cmake/**/* - .gitlab-ci.yml - + +build:STREAM_HP: + stage: build + script: + - rm -rf build + - mkdir -p build + - cd build + - cmake ../STREAM -DDATA_TYPE=half -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=32 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 + - make -j 40 all + artifacts: + paths: + - build/bin/stream_kernels_single_emulate.aocx + - build/bin/stream_kernels_emulate.aocx + - build/bin/STREAM_FPGA_intel + - build/bin/STREAM_FPGA_test_intel + only: + changes: + - STREAM/**/* + - shared/**/* + - scripts/**/* + - cmake/**/* + - .gitlab-ci.yml + +build:STREAM_DP: + stage: build + script: + - rm -rf build + - mkdir -p build + - cd build + - cmake ../STREAM -DDATA_TYPE=double -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=8 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 + - make -j 40 all + artifacts: + paths: + - build/bin/stream_kernels_single_emulate.aocx + - build/bin/stream_kernels_emulate.aocx + - build/bin/STREAM_FPGA_intel + - build/bin/STREAM_FPGA_test_intel + only: + changes: + - STREAM/**/* + - shared/**/* + - scripts/**/* + - cmake/**/* + - .gitlab-ci.yml build:RandomAccess: stage: build @@ -72,8 +115,11 @@ build:PTRANS: - make -j 40 all artifacts: paths: - - build/bin/transpose_diagonal_emulate.aocx - - build/bin/transpose_diagonal_c2_emulate.aocx + - build/bin/transpose_DIAG_IEC_emulate.aocx + - build/bin/transpose_PQ_IEC_emulate.aocx + - build/bin/transpose_PQ_PCIE_emulate.aocx + - build/bin/transpose_DIAG_PCIE_emulate.aocx + - build/bin/transpose_c2_DIAG_IEC_emulate.aocx - build/bin/Transpose_intel - build/bin/Transpose_test_intel only: @@ 
-90,11 +136,12 @@ build:LINPACK: - rm -rf build - mkdir -p build - cd build - - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 + - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DUSE_PCIE_MPI_COMMUNICATION=Yes - make -j 40 all artifacts: paths: - - build/bin/hpl_torus_emulate.aocx + - build/bin/hpl_torus_PCIE_emulate.aocx + - build/bin/hpl_torus_IEC_emulate.aocx - build/bin/Linpack_intel - build/bin/Linpack_test_intel only: @@ -147,6 +194,27 @@ build:GEMM_HP_REP2: - cmake/**/* - .gitlab-ci.yml +build:GEMM_DP_REP2: + stage: build + script: + - rm -rf build + - mkdir -p build + - cd build + - cmake ../GEMM -DDATA_TYPE=double -DNUM_REPLICATIONS=2 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 + - make -j 40 all + artifacts: + paths: + - build/bin/gemm_base_emulate.aocx + - build/bin/GEMM_intel + - build/bin/GEMM_test_intel + only: + changes: + - GEMM/**/* + - shared/**/* + - scripts/**/* + - cmake/**/* + - .gitlab-ci.yml + build:FFT: stage: build script: @@ -199,11 +267,7 @@ build:b_eff: - make -j 40 all artifacts: paths: - - build/bin/communication_bw520n_emulate.aocx - - build/bin/communication_bw520n_combined_loops_emulate.aocx - - build/bin/communication_bw520n_disable_pipelining_emulate.aocx - - build/bin/Network_intel - - build/bin/Network_test_intel + - build/bin/* only: changes: - b_eff/**/* @@ -223,7 +287,7 @@ test:STREAM: script: - cd build - cmake ../STREAM -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test + - make CTEST_OUTPUT_ON_FAILURE=1 test dependencies: - build:STREAM artifacts: @@ -238,13 +302,57 @@ test:STREAM: - cmake/**/* - .gitlab-ci.yml needs: ["build:STREAM"] + +test:STREAM_HP: + stage: test + script: + - cd build + - cmake ../STREAM -DDATA_TYPE=half -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=32 -DDEFAULT_PLATFORM=0 
-DDEFAULT_DEVICE=0 + - make CTEST_OUTPUT_ON_FAILURE=1 test + dependencies: + - build:STREAM_HP + artifacts: + when: on_failure + paths: + - build/Testing/Temporary/LastTest.log + only: + changes: + - STREAM/**/* + - shared/**/* + - scripts/**/* + - cmake/**/* + - .gitlab-ci.yml + needs: ["build:STREAM_HP"] + # Allow failure because: The intel emulator does not seem to support half precision kernel arguments (CL_INVALID_ARG_SIZE) + allow_failure: true + +test:STREAM_DP: + stage: test + script: + - cd build + - cmake ../STREAM -DDATA_TYPE=double -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=8 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 + - make CTEST_OUTPUT_ON_FAILURE=1 test + dependencies: + - build:STREAM_DP + artifacts: + when: on_failure + paths: + - build/Testing/Temporary/LastTest.log + only: + changes: + - STREAM/**/* + - shared/**/* + - scripts/**/* + - cmake/**/* + - .gitlab-ci.yml + needs: ["build:STREAM_DP"] test:RandomAccess: stage: test script: - cd build - cmake ../RandomAccess -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test + - make CTEST_OUTPUT_ON_FAILURE=1 test dependencies: - build:RandomAccess artifacts: @@ -275,7 +383,7 @@ test:PTRANS: - ln -s kernel_output_ch1 kernel_input_ch0 - ln -s kernel_output_ch3 kernel_input_ch2 - cd .. 
- - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test + - make CTEST_OUTPUT_ON_FAILURE=1 test dependencies: - build:PTRANS artifacts: @@ -296,7 +404,7 @@ test:LINPACK: script: - cd build - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 - - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test + - make CTEST_OUTPUT_ON_FAILURE=1 test dependencies: - build:LINPACK artifacts: @@ -317,7 +425,7 @@ test:GEMM: script: - cd build - cmake ../GEMM -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 - - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test + - make CTEST_OUTPUT_ON_FAILURE=1 test dependencies: - build:GEMM artifacts: @@ -338,7 +446,7 @@ test:GEMM_HP_REP2: script: - cd build - cmake ../GEMM -DNUM_REPLICATIONS=2 -DATA_TYPE=half -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 - - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test + - make CTEST_OUTPUT_ON_FAILURE=1 test dependencies: - build:GEMM_HP_REP2 artifacts: @@ -353,13 +461,36 @@ test:GEMM_HP_REP2: - cmake/**/* - .gitlab-ci.yml needs: ["build:GEMM_HP_REP2"] + # Allow failure because: The intel emulator does not seem to support half precision kernel arguments (CL_INVALID_ARG_SIZE) + allow_failure: true + +test:GEMM_DP_REP2: + stage: test + script: + - cd build + - cmake ../GEMM -DNUM_REPLICATIONS=2 -DATA_TYPE=double -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 + - make CTEST_OUTPUT_ON_FAILURE=1 test + dependencies: + - build:GEMM_DP_REP2 + artifacts: + when: on_failure + paths: + - build/Testing/Temporary/LastTest.log + only: + changes: + - GEMM/**/* + - shared/**/* + - scripts/**/* + - cmake/**/* + - .gitlab-ci.yml + needs: ["build:GEMM_DP_REP2"] test:FFT: stage: test script: - cd build - cmake ../FFT -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test + 
- make CTEST_OUTPUT_ON_FAILURE=1 test dependencies: - build:FFT artifacts: @@ -380,7 +511,7 @@ test:FFT_small: script: - cd build - cmake ../FFT -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOG_FFT_SIZE=4 -DNUM_REPLICATIONS=2 - - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test + - make CTEST_OUTPUT_ON_FAILURE=1 test dependencies: - build:FFT_small artifacts: @@ -411,7 +542,7 @@ test:b_eff: - ln -s kernel_output_ch1 kernel_input_ch0 - ln -s kernel_output_ch3 kernel_input_ch2 - cd .. - - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test + - make CTEST_OUTPUT_ON_FAILURE=1 test dependencies: - build:b_eff artifacts: diff --git a/FFT/src/device/fft1d_float_8.cl b/FFT/src/device/fft1d_float_8.cl index 778bec18..5a411aef 100644 --- a/FFT/src/device/fft1d_float_8.cl +++ b/FFT/src/device/fft1d_float_8.cl @@ -109,29 +109,31 @@ void fetch/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]["i // for iter iterations and one additional iteration to empty the last buffer for(unsigned k = 0; k < (iter + 1) * (N / POINTS); k++){ + + if (k < iter * ( N / POINTS)) { - float2 read_chunk[POINTS]; - - // Read the next 8 values from global memory - // in the last iteration just read garbage, but the data will not be forwarded over the pipes. - // This allows the use of memory bursts here. - // Also the data is shifted every N/POINTS/POINTS iterations - __attribute__((opencl_unroll_hint(POINTS))) - for(int j = 0; j < POINTS; j++){ - // Shift the data depending on the total FFT size - // Shifts every new chunk by one. If N/POINTS is a multiple of POINTS, the shifting is reduced to prevent mappings to the same bank. - unsigned shift = ((LOGN - LOGPOINTS - LOGPOINTS > 0) ? 
(k & (N/POINTS - 1)) >> (LOGN - LOGPOINTS - LOGPOINTS) : (k & (N/POINTS - 1))); - unsigned final_buffer_pos = (j + shift) & (POINTS - 1); - read_chunk[final_buffer_pos] = src[(k << LOGPOINTS) + j]; - } + float2 read_chunk[POINTS]; - // Write the shifted data into the memory buffer - __attribute__((opencl_unroll_hint(POINTS))) - for(int j = 0; j < POINTS; j++){ - unsigned local_i = k & (2 * N/POINTS - 1); - buf[local_i][j] = read_chunk[j]; - } + // Read the next 8 values from global memory + // in the last iteration just read garbage, but the data will not be forwarded over the pipes. + // This allows the use of memory bursts here. + // Also the data is shifted every N/POINTS/POINTS iterations + __attribute__((opencl_unroll_hint(POINTS))) + for(int j = 0; j < POINTS; j++){ + // Shift the data depending on the total FFT size + // Shifts every new chunk by one. If N/POINTS is a multiple of POINTS, the shifting is reduced to prevent mappings to the same bank. + unsigned shift = ((LOGN - LOGPOINTS - LOGPOINTS > 0) ? (k & (N/POINTS - 1)) >> (LOGN - LOGPOINTS - LOGPOINTS) : (k & (N/POINTS - 1))); + unsigned final_buffer_pos = (j + shift) & (POINTS - 1); + read_chunk[final_buffer_pos] = src[(k << LOGPOINTS) + j]; + } + // Write the shifted data into the memory buffer + __attribute__((opencl_unroll_hint(POINTS))) + for(int j = 0; j < POINTS; j++){ + unsigned local_i = k & (2 * N/POINTS - 1); + buf[local_i][j] = read_chunk[j]; + } + } if (k >= ( N / POINTS)) { float2x8 buf2x8; diff --git a/FFT/src/host/execution_default.cpp b/FFT/src/host/execution_default.cpp index bafd18c7..614560ae 100644 --- a/FFT/src/host/execution_default.cpp +++ b/FFT/src/host/execution_default.cpp @@ -29,7 +29,6 @@ SOFTWARE. 
#include /* External library headers */ -#include "CL/cl.hpp" #ifdef INTEL_FPGA #ifdef USE_HBM // CL_HETEROGENEOUS_INTELFPGA is defined here diff --git a/GEMM/Readme.md b/GEMM/Readme.md index ebe13caf..831194bd 100755 --- a/GEMM/Readme.md +++ b/GEMM/Readme.md @@ -21,6 +21,8 @@ If available, the benchmark will use `sgemm_` to validate the calculation instea For matrix sizes above 1000x1000 we recommend using such a library to speed up the benchmark execution. Using such a library will not change the performance result of the benchmark but might affect the reported error of the calculation. +For half precision support, the IEEE 754-based half-precision floating-point library by Christian Rau is used and a copy is provided with this code. + ## Build CMake is used as the build system. @@ -53,7 +55,7 @@ Next to the common configuration options given in the [README](../README.md) of Name | Default | Description | ---------------- |-------------|--------------------------------------| - `DATA_TYPE` | float (also supported: half, double) | Data type used for calculation | + `DATA_TYPE` | float (also supported: half, double) | Data type used for calculation. *Note: Currently, half-precision does not work on Intel FPGAs because they can not be passed as kernel argument per value.* | `DEFAULT_MATRIX_SIZE` | 8 | The default size of the quadratic matrices in blocks | `BLOCK_SIZE` | 512 | Block size used by the kernel for calculation | `GEMM_SIZE` | 8 | Block size of the fully unrolled matrix multiplication in registers | diff --git a/LINPACK/CHANGELOG b/LINPACK/CHANGELOG index cb070b06..3e86dedc 100644 --- a/LINPACK/CHANGELOG +++ b/LINPACK/CHANGELOG @@ -2,11 +2,17 @@ This file contains all changes made to the source code for each release. 
+## 2.3 +#### Changed: +- Refactored the code to support different execution kernels and data distributions +#### Added: +- FPGA kernel with communication via PCIe and MPI + ## 2.2 #### Added: - LU facotrization kernel w/o pivoting in quadratic torus -- Distributed calculation of GEL on CPU nodes and validation +- Distributed calculation of GESL on CPU nodes and validation ## 2.1 diff --git a/LINPACK/CMakeLists.txt b/LINPACK/CMakeLists.txt index bec71e7e..72ae009c 100755 --- a/LINPACK/CMakeLists.txt +++ b/LINPACK/CMakeLists.txt @@ -1,9 +1,9 @@ cmake_minimum_required(VERSION 3.1) -project(LINPACK VERSION 2.2) +project(LINPACK VERSION 2.3) set(USE_DEPRECATED_HPP_HEADER No) -set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size") +set(DEFAULT_MATRIX_SIZE 2 CACHE STRING "Default matrix size") set(LOCAL_MEM_BLOCK_LOG 5 CACHE STRING "Used to define the width and height of the block stored in local memory") set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers") set(NUM_REPLICATIONS 1 CACHE STRING "Number of times the matrix multiplication kernel will be replicated") @@ -11,6 +11,8 @@ set(TEST_UNIFORM No CACHE BOOL "All tests executed by CTest will be executed wit set(TEST_EMULATION Yes CACHE BOOL "All tests executed by CTest will be executed with emulation kernels") set(DISTRIBUTED_VALIDATION Yes CACHE BOOL "Use the distributed validation scheme instead of validation on rank 0") +set(COMMUNICATION_TYPE_SUPPORT_ENABLED Yes) + if (TEST_UNIFORM) set(TEST_HOST_FLAGS "--uniform") endif() diff --git a/LINPACK/Readme.md b/LINPACK/Readme.md index b2b65b6d..e0e276c1 100644 --- a/LINPACK/Readme.md +++ b/LINPACK/Readme.md @@ -20,13 +20,13 @@ The targets below can be used to build the benchmark and its kernels, where `VEN Only the LU facotrization without pivoting is implemented on FPGA and external channels are used to calculate the solution in a 2D torus of FPGAs. - The kernel targets are: + The kernel targets are listed below. 
`COMM_TYPE` can be IEC for Intel external channel (only available for vendor Intel) and PCIE for communication via PCIe and MPI. | Target | Description | | ------------------------------ | ---------------------------------------------- | - | hpl_torus_`VENDOR` | Synthesizes the kernel (takes several hours!) | - | hpl_torus_report_`VENDOR` | Just compile kernel and create reports | - | hpl_torus_emulate_`VENDOR` | Create a n emulation kernel | + | hpl_torus_`COMM_TYPE`_`VENDOR` | Synthesizes the kernel (takes several hours!) | + | hpl_torus_`COMM_TYPE`_report_`VENDOR` | Just compile kernel and create reports | + | hpl_torus_`COMM_TYPE`_emulate_`VENDOR` | Create a n emulation kernel | You can build for example the host application by running @@ -69,14 +69,12 @@ For more information on available input parameters run ./Linpack_intel -h Implementation of the LINPACK benchmark proposed in the HPCC benchmark suite for FPGA. - Version: 2.2 + Version: 2.3 MPI Version: 3.1 - Config. Time: Wed Apr 14 09:31:37 UTC 2021 - Git Commit: 60651eb-dirty Usage: - ./bin/Linpack_intel [OPTION...] + bin/Linpack_intel [OPTION...] -f, --file arg Kernel file name -n, arg Number of repetitions (default: 10) @@ -86,29 +84,34 @@ For more information on available input parameters run data types. --device arg Index of the device that has to be used. If not given you will be asked which device to use if there are - multiple devices available. (default: -1) + multiple devices available. (default: 0) --platform arg Index of the platform that has to be used. If not given you will be asked which platform to use if there - are multiple platforms available. (default: -1) - -r, arg Number of used kernel replications (default: 3) + are multiple platforms available. 
(default: 0) + -r, arg Number of used kernel replications (default: 1) + --comm-type arg Used communication type for inter-FPGA communication + (default: AUTO) --test Only test given configuration and skip execution and validation -h, --help Print this help -m, arg Matrix size in number of blocks in one dimension for a singe MPI rank. Total matrix will have size m * - sqrt(MPI_size) (default: 1024) + sqrt(MPI_size) (default: 2) -b, arg Log2 of the block size in number of values in one - dimension (default: 3) + dimension (default: 5) --uniform Generate a uniform matrix instead of a diagonally dominant. This has to be supported by the FPGA kernel! --emulation Use kernel arguments for emulation. This may be necessary to simulate persistent local memory on the FPGA +Available options for `--comm-type`: +- `IEC`: Intel external channels are used by the kernels for communication. +- `PCIE`: PCIe and MPI are used to exchange data between FPGAs over the CPU. To execute the unit and integration tests for Intel devices run - CL_CONTEXT_EMULATOR_DEVICE=1 ./Linpack_test_intel -f KERNEL_FILE_NAME + ./Linpack_test_intel -f KERNEL_FILE_NAME in the `bin` folder within the build directory. It will run an emulation of the kernel and execute some functionality tests. diff --git a/LINPACK/configs/Xilinx_U250_B8_SB3_R3.cmake b/LINPACK/configs/Xilinx_U250_B8_SB3_R3.cmake new file mode 100644 index 00000000..c5f40c60 --- /dev/null +++ b/LINPACK/configs/Xilinx_U250_B8_SB3_R3.cmake @@ -0,0 +1,25 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u250_xdma_201830_2" CACHE STRING "" FORCE) + +# LINPACK specific options +set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE) +set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE) +set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) +set(NUM_REPLICATIONS 3 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE) + +set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/pre_synthesis.u250.tcl" CACHE STRING "Compile settings file" FORCE) +set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_pcie.u250.generator.ini" CACHE STRING "Link settings file" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE) +set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE) + diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R2.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R2.cmake new file mode 100644 index 00000000..a9adfef5 --- /dev/null +++ b/LINPACK/configs/Xilinx_U280_B8_SB3_R2.cmake @@ -0,0 +1,25 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) + +# LINPACK specific options +set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE) +set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE) +set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE) + +set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE) +set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_pcie.ddr.generator.ini" CACHE STRING "Link settings file" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE) +set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE) + diff --git a/LINPACK/settings/pre_synthesis.u250.tcl b/LINPACK/settings/pre_synthesis.u250.tcl new file mode 100644 index 00000000..5a5a9373 --- /dev/null +++ b/LINPACK/settings/pre_synthesis.u250.tcl @@ -0,0 +1,6 @@ + +# Allow reordeing of math operations to increase parallelism +config_compile -unsafe_math_optimizations + +# Reduce number of memory ports to reduce resource uage for GMI +#config_interface -m_axi_auto_max_ports false \ No newline at end of file diff --git a/LINPACK/settings/settings.compile.xilinx.lu_blocked_pvt.ddr.ini b/LINPACK/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini similarity index 100% rename from LINPACK/settings/settings.compile.xilinx.lu_blocked_pvt.ddr.ini rename to LINPACK/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini diff --git 
a/LINPACK/settings/settings.compile.xilinx.hpl_torus_pcie.u250.ini b/LINPACK/settings/settings.compile.xilinx.hpl_torus_pcie.u250.ini new file mode 100644 index 00000000..a1334eb2 --- /dev/null +++ b/LINPACK/settings/settings.compile.xilinx.hpl_torus_pcie.u250.ini @@ -0,0 +1,4 @@ +kernel_frequency=300 + +[hls] +max_memory_ports=all diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.ddr.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.ddr.generator.ini new file mode 100644 index 00000000..e032e407 --- /dev/null +++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.ddr.generator.ini @@ -0,0 +1,34 @@ +[connectivity] +nk=lu:1 +nk=left_update:1 +nk=top_update:1 +nk=inner_update_mm0:$PY_CODE_GEN num_replications$ + +# slrs +# all special kernels are on SLR0. MM kernels are put on all remaining SLRs using RR +slr=lu_1:SLR0 +slr=left_update_1:SLR0 +slr=top_update_1:SLR0 +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +slr=inner_update_mm0_$PY_CODE_GEN i +1$:SLR$PY_CODE_GEN (i +1) % 3$ +# PY_CODE_GEN block_end + +# matrix ports +sp=lu_1.m_axi_gmem0:DDR[0] +sp=lu_1.m_axi_gmem1:DDR[0] +sp=lu_1.m_axi_gmem2:DDR[1] + +sp=top_update_1.m_axi_gmem0:DDR[0] +sp=top_update_1.m_axi_gmem1:DDR[0] +sp=top_update_1.m_axi_gmem2:DDR[0] + +sp=left_update_1.m_axi_gmem0:DDR[0] +sp=left_update_1.m_axi_gmem1:DDR[1] +sp=left_update_1.m_axi_gmem2:DDR[1] + +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:DDR[0] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:DDR[1] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:DDR[0] +# PY_CODE_GEN block_end + diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.u250.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.u250.generator.ini new file mode 100644 index 00000000..2da7f651 --- /dev/null +++ 
b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.u250.generator.ini @@ -0,0 +1,26 @@ +[connectivity] +nk=lu:1 +nk=left_update:1 +nk=top_update:1 +nk=inner_update_mm0:$PY_CODE_GEN num_replications$ + +# slrs +# all special kernels are on SLR1. MM kernels are put on all remaining SLRs using RR +slr=lu_1:SLR1 +slr=left_update_1:SLR1 +slr=top_update_1:SLR1 +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +slr=inner_update_mm0_$PY_CODE_GEN i +1$:SLR$PY_CODE_GEN (i +2) % 4$ +# PY_CODE_GEN block_end + +# matrix ports +sp=lu_1.m_axi_gmem:DDR[1] + +sp=top_update_1.m_axi_gmem:DDR[1] + +sp=left_update_1.m_axi_gmem:DDR[1] + +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem:DDR[1] +# PY_CODE_GEN block_end + diff --git a/LINPACK/settings/settings.link.xilinx.lu_blocked_pvt.ddr.ini b/LINPACK/settings/settings.link.xilinx.lu_blocked_pvt.ddr.ini deleted file mode 100644 index f2117a92..00000000 --- a/LINPACK/settings/settings.link.xilinx.lu_blocked_pvt.ddr.ini +++ /dev/null @@ -1,9 +0,0 @@ -[connectivity] -nk=gefa:1 - -# slrs -slr=gefa_1:SLR0 - -# matrix ports -sp=gefa_1.m_axi_gmem0:DDR[0] -sp=gefa_1.m_axi_gmem1:DDR[1] diff --git a/LINPACK/src/device/CMakeLists.txt b/LINPACK/src/device/CMakeLists.txt index 2c5c0bce..7a28cc56 100644 --- a/LINPACK/src/device/CMakeLists.txt +++ b/LINPACK/src/device/CMakeLists.txt @@ -2,18 +2,18 @@ include(${CMAKE_SOURCE_DIR}/../cmake/kernelTargets.cmake) if (INTELFPGAOPENCL_FOUND) - generate_kernel_targets_intel(hpl_torus) - add_test(NAME test_emulation_intel COMMAND Linpack_intel -f hpl_torus_emulate.aocx -m 2 -n 1 ${TEST_HOST_FLAGS} + generate_kernel_targets_intel(hpl_torus_IEC hpl_torus_PCIE) + add_test(NAME test_emulation_intel COMMAND Linpack_intel -f hpl_torus_PCIE_emulate.aocx -m 2 -n 1 ${TEST_HOST_FLAGS} WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) - add_test(NAME test_output_parsing_intel 
COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Linpack_intel ${TEST_HOST_FLAGS} -f hpl_torus_emulate.aocx -m 2 -n 1 + add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Linpack_intel ${TEST_HOST_FLAGS} -f hpl_torus_PCIE_emulate.aocx -m 2 -n 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() if (VITIS_FOUND) - generate_kernel_targets_xilinx(hpl_torus) - add_test(NAME test_emulation_xilinx COMMAND Linpack_xilinx -f hpl_torus_emulate.xclbin -m 2 -n 1 ${TEST_HOST_FLAGS} + generate_kernel_targets_xilinx(hpl_torus_PCIE) + add_test(NAME test_emulation_xilinx COMMAND Linpack_xilinx -f hpl_torus_PCIE_emulate.xclbin -m 2 -n 1 ${TEST_HOST_FLAGS} WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) - add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Linpack_xilinx ${TEST_HOST_FLAGS} -f hpl_torus_emulate.xclbin -m 2 -n 1 + add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Linpack_xilinx ${TEST_HOST_FLAGS} -f hpl_torus_PCIE_emulate.xclbin -m 2 -n 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() diff --git a/LINPACK/src/device/hpl_torus.cl b/LINPACK/src/device/hpl_torus_IEC.cl similarity index 100% rename from LINPACK/src/device/hpl_torus.cl rename to LINPACK/src/device/hpl_torus_IEC.cl diff --git a/LINPACK/src/device/hpl_torus_PCIE.cl b/LINPACK/src/device/hpl_torus_PCIE.cl new file mode 100644 index 00000000..0f31f0d4 --- /dev/null +++ b/LINPACK/src/device/hpl_torus_PCIE.cl @@ -0,0 +1,831 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to 
permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#include "parameters.h" + +// Current implementation uses __fpga_reg call to add additional registers for +#ifdef XILINX_FPGA +#define __fpga_reg(x) x +#endif + +#define BLOCK_SIZE (1 << LOCAL_MEM_BLOCK_LOG) +#define GEMM_BLOCK (1 << REGISTER_BLOCK_LOG) + +#ifdef INTEL_FPGA +#pragma OPENCL EXTENSION cl_intel_channels : enable +#endif + +typedef struct tmp_channel_chunk { DEVICE_DATA_TYPE data[GEMM_BLOCK];} ch_chunk_t; + +/** +Executes a single step of the LU factorization. + +This method takes a partially solved 8x8 matrix and calculates the next step of the LU factorization +The method needs 7 (GEMM_BLOCK-1) calls to perform a single LU factorization. This is done to reduce resource usage, +since all upcomng calls are anyway depending on the results of the previous call and there is no way +to pipeline multiple executions. + +A is the input block that might be partially computed +step is the current step and must be a value between 0 to GEMM_BLOCK-2. 
After step GEMM_BLOCK-2, the block is factorized + */ +void +lu_block(const DEVICE_DATA_TYPE A[GEMM_BLOCK][GEMM_BLOCK], const int step, DEVICE_DATA_TYPE A_out[GEMM_BLOCK][GEMM_BLOCK]) { + + // Read current line from input + DEVICE_DATA_TYPE line[GEMM_BLOCK]; + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i=0; i < GEMM_BLOCK; i++) { + line[i] = A[step][i]; + } + + // calculate the inverse of the diagonal element for the scaling + DEVICE_DATA_TYPE inv_scale_a = -1.0 / line[step]; + + // Scale the current row + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i=0; i < GEMM_BLOCK; i++) { + if (i > step) { + line[i] = line[i] * inv_scale_a; + } + } + line[step] = inv_scale_a; + + // Update all rows fully unrolled + // The multiply adds are fully independent + //__attribute__((opencl_unroll_hint(GEMM_BLOCK))) + // Unrolling disabled for this loop to save resources + for (int j = 0; j < GEMM_BLOCK; j++) { + DEVICE_DATA_TYPE curr_scale = A[j][step]; + // Update a single row. If it is already updated, just write back the value, if it is the current row + // write back the value in "line", else update the value + if (j != step) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i = 0; i < GEMM_BLOCK; i++) { + A_out[j][i] = (i > step && j > step) ? A[j][i] + line[i] * curr_scale : A[j][i]; + } + } + else { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i = 0; i < GEMM_BLOCK; i++) { + A_out[j][i] = line[i]; + } + } + } +} + +/** +This function can be used to update blocks using with three different operations. +It will execute the update for a single row in the block. 
The update is completed after GEMM_BLOCK calls of this +update function + +operation_type: 0 for top = the top row of blocks will need a triangular MM + 1 for left = the left column of blocks will need a triangular MM, matrices have to be transposed + 2 for inner block == all inner blocks will be updated with a MM + */ +void +update_block(const DEVICE_DATA_TYPE a[GEMM_BLOCK][GEMM_BLOCK], + const DEVICE_DATA_TYPE top[GEMM_BLOCK], + const DEVICE_DATA_TYPE left_or_lu[GEMM_BLOCK], + DEVICE_DATA_TYPE out[GEMM_BLOCK][GEMM_BLOCK], + const int current_row, + const int operation_type) { + + // Define different operation types of function + const int op_top = 0; + const int op_left = 1; + const int op_inner = 2; + + // Transpose the input matrices if the target is a left block + DEVICE_DATA_TYPE current_block[GEMM_BLOCK][GEMM_BLOCK] __attribute__((register, xcl_array_partition(complete, 1), xcl_array_partition(complete, 2))); + if (operation_type == op_left) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int ii =0; ii < GEMM_BLOCK; ii++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + current_block[ii][jj] = __fpga_reg(a[jj][ii]); + } + } + } + else { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int ii =0; ii < GEMM_BLOCK; ii++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + current_block[ii][jj] = __fpga_reg(a[ii][jj]); + } + } + } + + // Generate the first scalling array depending on the operation type + DEVICE_DATA_TYPE scale_row[GEMM_BLOCK] __attribute__((register, xcl_array_partition(complete, 1))); + if (operation_type == op_inner) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + scale_row[jj] = top[jj]; + } + } + else { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + scale_row[jj] = current_block[current_row][jj]; + } + } + if (operation_type == op_top) 
{ + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + scale_row[jj] *= left_or_lu[current_row]; + } + } + + DEVICE_DATA_TYPE tmp[GEMM_BLOCK][GEMM_BLOCK] __attribute__((register, xcl_array_partition(complete, 1), xcl_array_partition(complete, 2))); + // scale all values with the pre calculated scaling array and the second input + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int ii =0; ii < GEMM_BLOCK; ii++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + // left_or_lu_block are stored transposed to simplify the data access here + tmp[ii][jj] = current_block[ii][jj] + scale_row[jj] * left_or_lu[ii]; + } + } + + // overwrite results that were calculated altough they are not needed for the triangular operations left and top + if (operation_type != op_inner) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int ii =0; ii < GEMM_BLOCK; ii++) { + if (ii == current_row) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + tmp[ii][jj] = scale_row[jj]; + } + } + else if (ii < current_row) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + tmp[ii][jj] = current_block[ii][jj]; + } + } + } + } + + // write result back and transpose if necessary + if (operation_type == op_left) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int ii =0; ii < GEMM_BLOCK; ii++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + out[ii][jj] = __fpga_reg(tmp[jj][ii]); + } + } + } + else { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int ii =0; ii < GEMM_BLOCK; ii++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + out[ii][jj] = __fpga_reg(tmp[ii][jj]); + } + } + } +} + +__attribute__((uses_global_work_offset(0))) +__kernel +void +lu(__global DEVICE_DATA_TYPE* restrict a, + __global 
DEVICE_DATA_TYPE* restrict a_block_trans, + __global DEVICE_DATA_TYPE* restrict a_block, + const uint block_col, + const uint block_row, + const uint blocks_per_row) { + + local DEVICE_DATA_TYPE a_buffer[BLOCK_SIZE/GEMM_BLOCK][BLOCK_SIZE/GEMM_BLOCK][GEMM_BLOCK][GEMM_BLOCK] __attribute((xcl_array_partition(complete, 3),xcl_array_partition(complete, 4))); + + // Store current row and column in separate buffers for + // easier access in the deep pipeline + // need to be declared as local to prevent the compiler from + local DEVICE_DATA_TYPE top_buffer[BLOCK_SIZE/GEMM_BLOCK][GEMM_BLOCK] __attribute((xcl_array_partition(complete, 2))); + local DEVICE_DATA_TYPE left_buffer[BLOCK_SIZE/GEMM_BLOCK][GEMM_BLOCK] __attribute((xcl_array_partition(complete, 2))); + + // Load block to local memory + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj]; + } + } + } + } + + // For each row in the matrix update whole matrix. 
+ // The iterations depend on each other, so loop pipelining is disabled here + #pragma disable_loop_pipelining + for (int gk = 0; gk < BLOCK_SIZE; gk++) { + + int k = gk / GEMM_BLOCK; + int kk = gk & (GEMM_BLOCK - 1); + + // Read in current LU block + DEVICE_DATA_TYPE lu_a_buffer_in[GEMM_BLOCK][GEMM_BLOCK]; + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int ii =0; ii < GEMM_BLOCK; ii++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + lu_a_buffer_in[ii][jj] = a_buffer[k][k][ii][jj]; + } + } + + DEVICE_DATA_TYPE lu_a_buffer_out[GEMM_BLOCK][GEMM_BLOCK]; + DEVICE_DATA_TYPE lu_a_buffer_out_row[GEMM_BLOCK]; + DEVICE_DATA_TYPE lu_a_buffer_out_col[GEMM_BLOCK]; + // Calculate next row and column of LU factorization and store in local memory buffer + lu_block(lu_a_buffer_in, kk, lu_a_buffer_out); + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int ii =0; ii < GEMM_BLOCK; ii++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + a_buffer[k][k][ii][jj] = lu_a_buffer_out[ii][jj]; + } + } + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + lu_a_buffer_out_row[jj] = lu_a_buffer_out[kk][jj]; + } + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + lu_a_buffer_out_col[jj] = lu_a_buffer_out[jj][kk]; + } + + // The update pipeline does not need to be executed for the last + // row of blocks + if (gk < BLOCK_SIZE - GEMM_BLOCK) { + + // update all left blocks + for (int tj = 1; tj < BLOCK_SIZE/GEMM_BLOCK; tj++) { + + int j = k; + int i = tj; + + if (i > k) { + // copy the correct block in the second input buffer + // this depends on the operations that has to be executed + DEVICE_DATA_TYPE second_input[GEMM_BLOCK]; + + // left matrix block will be calculated + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + second_input[jj] = 
__fpga_reg(lu_a_buffer_out_row[jj]); + } + DEVICE_DATA_TYPE a_input[GEMM_BLOCK][GEMM_BLOCK] __attribute__((xcl_array_partition(complete, 1),xcl_array_partition(complete, 2))); + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int ii =0; ii < GEMM_BLOCK; ii++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj = 0; jj < GEMM_BLOCK; jj++) { + a_input[ii][jj] = __fpga_reg(a_buffer[i][j][ii][jj]); + } + } + DEVICE_DATA_TYPE top_input[GEMM_BLOCK]; + DEVICE_DATA_TYPE out[GEMM_BLOCK][GEMM_BLOCK] __attribute__((register, xcl_array_partition(complete, 1), xcl_array_partition(complete, 2))); + update_block(a_input, + top_input, + second_input, + out, + kk, + 1); + + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int ii =0; ii < GEMM_BLOCK; ii++) { + left_buffer[i][ii] = __fpga_reg(out[ii][kk]); + } + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int ii =0; ii < GEMM_BLOCK; ii++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj = 0; jj < GEMM_BLOCK; jj++) { + a_buffer[i][j][ii][jj] = __fpga_reg(out[ii][jj]); + } + } + } + } + + // Update all other blocks with the new calculated row and column + // First update top blocks, then update left blocks, then all inner blocks + // ti == 0: top blocks + // ti == 1: left blocks + // ti > 1: inner blocks +#ifdef INTEL_FPGA + #pragma loop_coalesce + #pragma ivdep safelen(BLOCK_SIZE/GEMM_BLOCK - 1) +#endif + for (int ti = 0; ti < BLOCK_SIZE/GEMM_BLOCK - k; ti++) { +#ifdef INTEL_FPGA + #pragma ivdep +#endif + for (int tj = 1; tj < BLOCK_SIZE/GEMM_BLOCK; tj++) { + + int j = tj; + int i = ti + k; + // always execute the pipeline for whole rows of matrix blocks. + // Only execute update for blocks that are required. 
+ // This helps to keep constant latencies between data dependencies of the pipeline stages + if ((i > k || ti == 0) && j > k ) { + + // copy the correct block in the second input buffer + // this depends on the operations that has to be executed + DEVICE_DATA_TYPE second_input[GEMM_BLOCK]; + if (ti == 0) { + // top matrix block will be calculated + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + second_input[jj] = __fpga_reg(lu_a_buffer_out_col[jj]); + } + } + else { + // inner block will be calculated + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + second_input[jj] = __fpga_reg(left_buffer[i][jj]); + } + } + DEVICE_DATA_TYPE a_input[GEMM_BLOCK][GEMM_BLOCK] __attribute__((xcl_array_partition(complete, 1),xcl_array_partition(complete, 2))); + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int ii =0; ii < GEMM_BLOCK; ii++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj = 0; jj < GEMM_BLOCK; jj++) { + a_input[ii][jj] = __fpga_reg(a_buffer[i][j][ii][jj]); + } + } + DEVICE_DATA_TYPE top_input[GEMM_BLOCK]; + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + top_input[jj] = __fpga_reg(top_buffer[j][jj]); + } + DEVICE_DATA_TYPE out[GEMM_BLOCK][GEMM_BLOCK] __attribute__((register, xcl_array_partition(complete, 1), xcl_array_partition(complete, 2))); + update_block(a_input, + top_input, + second_input, + out, + kk, + (ti == 0) ? 
0 : 2); + if (ti == 0) { + // only update in the first row + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + top_buffer[j][jj] = __fpga_reg(out[kk][jj]); + } + } + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int ii =0; ii < GEMM_BLOCK; ii++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj = 0; jj < GEMM_BLOCK; jj++) { + a_buffer[i][j][ii][jj] = __fpga_reg(out[ii][jj]); + } + } + } + } + } + } + } + + // Store block to global memory + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj]; + } + } + } + } + // Store current block in global memory also transposed to allow easier access from the top kernel + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + a_block_trans[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[j][i][jj][ii]; + } + } + } + } + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + a_block[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj]; + } + } + } + } +} + +/** +Update the blocks to the right of the current LU block + + */ + __attribute__((uses_global_work_offset(0))) +__kernel +void top_update(__global DEVICE_DATA_TYPE* restrict a, 
+ __global DEVICE_DATA_TYPE* restrict top_block, + __global DEVICE_DATA_TYPE* restrict lu_global_buffer_transposed, + const uint is_first_block, + const uint block_col, + const uint block_row, + const uint blocks_per_row) { + + // Store current block in local memory + local DEVICE_DATA_TYPE a_buffer[BLOCK_SIZE/GEMM_BLOCK][BLOCK_SIZE/GEMM_BLOCK][GEMM_BLOCK][GEMM_BLOCK] __attribute((xcl_array_partition(complete, 3),xcl_array_partition(complete, 4))); + + + // Load block to local memory + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj]; + } + } + } + } + + // For each row in the matrix update whole matrix. + // The iterations depend on each other, so loop pipelining is disabled here + #pragma disable_loop_pipelining + for (int gk = 0; gk < BLOCK_SIZE; gk++) { + + int k = gk / GEMM_BLOCK; + int kk = gk & (GEMM_BLOCK - 1); + + DEVICE_DATA_TYPE current_lu_col[BLOCK_SIZE/GEMM_BLOCK][GEMM_BLOCK] __attribute((xcl_array_partition(complete, 2))); + DEVICE_DATA_TYPE current_row[BLOCK_SIZE/GEMM_BLOCK][GEMM_BLOCK] __attribute((xcl_array_partition(complete, 2))); + DEVICE_DATA_TYPE current_scale; + + for (int col = 0; col < BLOCK_SIZE / GEMM_BLOCK; col++) { + ch_chunk_t col_in; + + DEVICE_DATA_TYPE scale_chunk[GEMM_BLOCK] __attribute((xcl_array_partition(complete, 1))); + + // get current row chunk + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i =0; i < GEMM_BLOCK; i++) { + scale_chunk[i] = a_buffer[k][col][kk][i]; + } + + // if current column data is still available read it in and store it in buffer + if (col < BLOCK_SIZE / GEMM_BLOCK - k) { + // Load LU data from global memory instead of receiving 
it from the channel + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i=0; i < GEMM_BLOCK; i++) { + col_in.data[i] = lu_global_buffer_transposed[gk * BLOCK_SIZE + (col + k) * GEMM_BLOCK + i]; + } + if (col == 0) { + current_scale = col_in.data[kk]; + } + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i =0; i < GEMM_BLOCK; i++) { + current_lu_col[col][i] = (col > 0 || i > kk) ? col_in.data[i] : 0.f; + } + } + + // scale current row chunk with the rows scale factor received over the external channel + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i =0; i < GEMM_BLOCK; i++) { + scale_chunk[i] = scale_chunk[i] * current_scale; + } + + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i =0; i < GEMM_BLOCK; i++) { + current_row[col][i] = scale_chunk[i]; + } + + // Update local memory buffer with chunk + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i =0; i < GEMM_BLOCK; i++) { + a_buffer[k][col][kk][i] = scale_chunk[i]; + } + } + + // Update all remaining rows + #pragma loop_coalesce + for (int row = 0; row < BLOCK_SIZE/GEMM_BLOCK - k; row++) { + // Update whole rows! 
+ __attribute__((xcl_pipeline_loop(1))) + for (int curr_col = 0; curr_col < BLOCK_SIZE/GEMM_BLOCK; curr_col++) { + DEVICE_DATA_TYPE colbuf[GEMM_BLOCK]; + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int j=0; j < GEMM_BLOCK; j++) { + colbuf[j] = current_lu_col[row][j]; + } + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i = 0; i < GEMM_BLOCK; i++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int j=0; j < GEMM_BLOCK; j++) { + a_buffer[row + k][curr_col][i][j] += colbuf[i] * current_row[curr_col][j]; + } + } + } + } + } + + // Store block to global memory + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj]; + } + } + } + } + // Store current block separately for easier transmission over host + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + top_block[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj]; + } + } + } + } +} + +/** +Update the blocks below the current LU block + + */ + __attribute__((uses_global_work_offset(0))) +__kernel +void left_update(__global DEVICE_DATA_TYPE* restrict a, + __global DEVICE_DATA_TYPE* restrict left_block, + __global DEVICE_DATA_TYPE* restrict lu_global_buffer, + const uint is_first_block, + const uint block_col, + const uint block_row, + const uint blocks_per_row) { + + // Store current block in local memory + local DEVICE_DATA_TYPE 
a_buffer[BLOCK_SIZE/GEMM_BLOCK][BLOCK_SIZE/GEMM_BLOCK][GEMM_BLOCK][GEMM_BLOCK] __attribute((xcl_array_partition(complete, 3),xcl_array_partition(complete, 4))); + + // Load block to local memory + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj]; + } + } + } + } + + // For each row in the matrix update whole matrix. + // The iterations depend on each other, so loop pipelining is disabled here + #pragma disable_loop_pipelining + for (int gk = 0; gk < BLOCK_SIZE; gk++) { + + int k = gk / GEMM_BLOCK; + int kk = gk & (GEMM_BLOCK - 1); + + DEVICE_DATA_TYPE current_lu_row[BLOCK_SIZE/GEMM_BLOCK][GEMM_BLOCK] __attribute((xcl_array_partition(complete, 2))); + DEVICE_DATA_TYPE current_col[BLOCK_SIZE/GEMM_BLOCK][GEMM_BLOCK] __attribute((xcl_array_partition(complete, 2))); + + for (int col = 0; col < BLOCK_SIZE / GEMM_BLOCK; col++) { + DEVICE_DATA_TYPE chunk[GEMM_BLOCK]; + // get current row chunk + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i =0; i < GEMM_BLOCK; i++) { + chunk[i] = a_buffer[col][k][i][kk]; + } + + // Store chunk for later update + ch_chunk_t col_out; + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i =0; i < GEMM_BLOCK; i++) { + current_col[col][i] = chunk[i]; + } + + ch_chunk_t row_in; + + // if current column data is still available read it in and store it in buffer + if (col < BLOCK_SIZE / GEMM_BLOCK - k) { + // Load LU data from global memory + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i=0; i < GEMM_BLOCK; i++) { + row_in.data[i] = lu_global_buffer[gk * BLOCK_SIZE + (col + k) * GEMM_BLOCK + i]; + } + 
__attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i =0; i < GEMM_BLOCK; i++) { + current_lu_row[col][i] = (col > 0 || i > kk) ? row_in.data[i] : 0.f; + } + } + } + + // Update all rows + #pragma loop_coalesce + // Update only remaining row chunks + #pragma ivdep + for (int curr_col = 0; curr_col < BLOCK_SIZE/GEMM_BLOCK - k; curr_col++) { +#ifdef INTEL_FPGA + #pragma ivdep +#endif +#ifdef XILINX_FPGA + __attribute__((xcl_pipeline_loop(1))) +#endif + for (int row = 0; row < BLOCK_SIZE/GEMM_BLOCK; row++) { + DEVICE_DATA_TYPE colbuf[GEMM_BLOCK]; + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int j=0; j < GEMM_BLOCK; j++) { + colbuf[j] = current_col[row][j]; + } + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i = 0; i < GEMM_BLOCK; i++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int j=0; j < GEMM_BLOCK; j++) { + a_buffer[row][curr_col + k][i][j] += current_lu_row[curr_col][j] * colbuf[i]; + } + } + } + } + } + + // Store block to global memory + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj]; + } + } + } + } + + // Store current block separately for easier transmission over host + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + left_block[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[j][i][jj][ii]; + } + } + } + } +} + +// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] + 
+/** +Update the inner blocks using the left and right column and rows + + */ + __attribute__((uses_global_work_offset(0))) +__kernel +void inner_update_mm/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE* restrict a, + __global DEVICE_DATA_TYPE* restrict left_global_buffer, + __global DEVICE_DATA_TYPE* restrict top_global_buffer, + const uint block_col, + const uint block_row, + const uint blocks_per_row) { + + // Store current block in local memory + local DEVICE_DATA_TYPE a_buffer[BLOCK_SIZE/GEMM_BLOCK][BLOCK_SIZE/GEMM_BLOCK][GEMM_BLOCK][GEMM_BLOCK] __attribute((xcl_array_partition(complete, 3),xcl_array_partition(complete, 4))); + local DEVICE_DATA_TYPE top_buffer[BLOCK_SIZE/GEMM_BLOCK][BLOCK_SIZE/GEMM_BLOCK][GEMM_BLOCK][GEMM_BLOCK] __attribute((xcl_array_partition(complete, 3),xcl_array_partition(complete, 4))); + local DEVICE_DATA_TYPE left_buffer[BLOCK_SIZE/GEMM_BLOCK][BLOCK_SIZE/GEMM_BLOCK][GEMM_BLOCK][GEMM_BLOCK] __attribute((xcl_array_partition(complete, 3),xcl_array_partition(complete, 4))); + + // Load blocks to local memory + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj]; + } + } + } + } + + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + top_buffer[i][j][ii][jj] = top_global_buffer[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj]; + } + } + } + } + + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j 
=0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + left_buffer[i][j][ii][jj] = left_global_buffer[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj]; + } + } + } + } + + // Update whole block + #pragma ivdep array(a_buffer) safelen((BLOCK_SIZE/GEMM_BLOCK)*(BLOCK_SIZE/GEMM_BLOCK)) + for (int c = 0; c < (BLOCK_SIZE/GEMM_BLOCK) * (BLOCK_SIZE/GEMM_BLOCK) * (BLOCK_SIZE/GEMM_BLOCK); c++) { + + int mcol = c / ((BLOCK_SIZE/GEMM_BLOCK)*(BLOCK_SIZE/GEMM_BLOCK)); + int row = (c / (BLOCK_SIZE/GEMM_BLOCK)) & ((BLOCK_SIZE/GEMM_BLOCK) - 1); + int curr_col = c & ((BLOCK_SIZE/GEMM_BLOCK) - 1); + + DEVICE_DATA_TYPE top_sub[GEMM_BLOCK][GEMM_BLOCK]; + DEVICE_DATA_TYPE left_sub[GEMM_BLOCK][GEMM_BLOCK]; + + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i = 0; i < GEMM_BLOCK; i++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int j=0; j < GEMM_BLOCK; j++) { + top_sub[i][j] = top_buffer[mcol][curr_col][i][j]; + } + } + + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i = 0; i < GEMM_BLOCK; i++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int j=0; j < GEMM_BLOCK; j++) { + left_sub[i][j] = left_buffer[mcol][row][i][j]; + } + } + + DEVICE_DATA_TYPE result_sub[GEMM_BLOCK][GEMM_BLOCK]; + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i = 0; i < GEMM_BLOCK; i++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int j = 0; j < GEMM_BLOCK; j++) { + // Calculate sum of whole column and only write it back once + DEVICE_DATA_TYPE sum = 0.0; + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int k=0; k < GEMM_BLOCK; k++) { + sum += left_sub[k][i] * top_sub[k][j]; + } + result_sub[i][j] = sum; + } + } + + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i = 0; i < GEMM_BLOCK; i++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int j=0; j < GEMM_BLOCK; j++) { + a_buffer[row][curr_col][i][j] += 
__fpga_reg(result_sub[i][j]); + } + } + } + + // Store block to global memory + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj]; + } + } + } + } +} + +// PY_CODE_GEN block_end diff --git a/LINPACK/src/host/CMakeLists.txt b/LINPACK/src/host/CMakeLists.txt index ee126aca..d8feb95d 100755 --- a/LINPACK/src/host/CMakeLists.txt +++ b/LINPACK/src/host/CMakeLists.txt @@ -1,6 +1,6 @@ add_subdirectory(../../../shared ${CMAKE_BINARY_DIR}/lib/hpccbase) -set(HOST_SOURCE execution_blocked_pvt.cpp linpack_benchmark.cpp gmres.c blas.c) +set(HOST_SOURCE linpack_benchmark.cpp gmres.c blas.c) set(HOST_EXE_NAME Linpack) set(LIB_NAME lp) diff --git a/LINPACK/src/host/execution.h b/LINPACK/src/host/execution.h index 153d4be0..a4ebf6f3 100644 --- a/LINPACK/src/host/execution.h +++ b/LINPACK/src/host/execution.h @@ -27,7 +27,6 @@ SOFTWARE. #include /* External library headers */ -#include "CL/cl2.hpp" #include "parameters.h" #include "linpack_benchmark.hpp" diff --git a/LINPACK/src/host/execution_blocked_pvt.cpp b/LINPACK/src/host/execution_types/execution_iec.hpp similarity index 99% rename from LINPACK/src/host/execution_blocked_pvt.cpp rename to LINPACK/src/host/execution_types/execution_iec.hpp index 7ad5499a..ea426799 100644 --- a/LINPACK/src/host/execution_blocked_pvt.cpp +++ b/LINPACK/src/host/execution_types/execution_iec.hpp @@ -19,9 +19,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ - -/* Related header files */ -#include "execution.h" +#ifndef EXECUTION_TYPES_EXECUTION_IEC_HPP +#define EXECUTION_TYPES_EXECUTION_IEC_HPP /* C++ standard library headers */ #include @@ -31,17 +30,19 @@ SOFTWARE. #include /* External library headers */ -#include "CL/cl2.hpp" #if QUARTUS_MAJOR_VERSION > 18 #include "CL/cl_ext_intelfpga.h" #endif -namespace bm_execution { +#include "parameters.h" +#include "linpack_benchmark.hpp" -/* - Prepare kernels and execute benchmark +namespace linpack { +namespace execution { +namespace iec { - @copydoc bm_execution::calculate() +/* + Prepare kernels and execute benchmark for a bitstream that makes use of intel external channels */ std::unique_ptr calculate(const hpcc_base::ExecutionSettings&config, @@ -724,5 +725,8 @@ calculate(const hpcc_base::ExecutionSettings&co return results; } -} // namespace bm_execution +} // namespace iec +} // namespace execution +} // namespace linpack +#endif \ No newline at end of file diff --git a/LINPACK/src/host/execution_types/execution_pcie.hpp b/LINPACK/src/host/execution_types/execution_pcie.hpp new file mode 100644 index 00000000..6a83dbbe --- /dev/null +++ b/LINPACK/src/host/execution_types/execution_pcie.hpp @@ -0,0 +1,627 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef EXECUTION_TYPES_EXECUTION_PCIE_HPP +#define EXECUTION_TYPES_EXECUTION_PCIE_HPP + +/* C++ standard library headers */ +#include +#include +#include +#include +#include + +/* External library headers */ +#if QUARTUS_MAJOR_VERSION > 18 +#include "CL/cl_ext_intelfpga.h" +#endif + +#include "parameters.h" +#include "linpack_benchmark.hpp" + +namespace linpack { +namespace execution { +namespace pcie { + +/* + Prepare kernels and execute benchmark + + @copydoc bm_execution::calculate() +*/ +std::unique_ptr +calculate(const hpcc_base::ExecutionSettings&config, + HOST_DATA_TYPE* A, + HOST_DATA_TYPE* b, + cl_int* ipvt) { + + int err; + + uint blocks_per_row = config.programSettings->matrixSize / config.programSettings->blockSize; + + // Communicate with all ranks in the same row of the torus + MPI_Comm row_communicator; + MPI_Comm col_communicator; + + MPI_Comm_split(MPI_COMM_WORLD, config.programSettings->torus_row, 0, &row_communicator); + MPI_Comm_split(MPI_COMM_WORLD, config.programSettings->torus_col, 0, &col_communicator); + + cl::CommandQueue buffer_queue(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + + // Create Buffers for input and output + cl::Buffer Buffer_a(*config.context, CL_MEM_READ_WRITE, + sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize); + cl::Buffer Buffer_b(*config.context, CL_MEM_READ_WRITE, + sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize); + cl::Buffer Buffer_pivot(*config.context, CL_MEM_READ_WRITE, + 
sizeof(cl_int)*config.programSettings->matrixSize); + + // Buffers only used to store data received over the network layer + // The content will not be modified by the host + cl::Buffer Buffer_lu1(*config.context, CL_MEM_READ_WRITE, + sizeof(HOST_DATA_TYPE)*(config.programSettings->blockSize)*(config.programSettings->blockSize)); + cl::Buffer Buffer_lu2(*config.context, CL_MEM_READ_WRITE, + sizeof(HOST_DATA_TYPE)*(config.programSettings->blockSize)*(config.programSettings->blockSize)); + cl::Buffer Buffer_top(*config.context, CL_MEM_READ_WRITE, + sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize * (config.programSettings->blockSize)); + cl::Buffer Buffer_left(*config.context, CL_MEM_READ_WRITE, + sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize * (config.programSettings->blockSize)); + cl::Buffer Buffer_network_scaling(*config.context, CL_MEM_READ_WRITE, + sizeof(HOST_DATA_TYPE)*(config.programSettings->blockSize)); + + /* --- Setup MPI communication and required additional buffers --- */ + + HOST_DATA_TYPE *lu_block, *lu_trans_block; + posix_memalign(reinterpret_cast(&lu_block), 1024, sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize)*(config.programSettings->blockSize)); + posix_memalign(reinterpret_cast(&lu_trans_block), 1024, sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize)*(config.programSettings->blockSize)); + + std::vector left_blocks(blocks_per_row); + std::vector top_blocks(blocks_per_row); + + for (int i =0; i < blocks_per_row; i++) { + posix_memalign(reinterpret_cast(&left_blocks[i]), 1024, sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize)*(config.programSettings->blockSize)); + posix_memalign(reinterpret_cast(&top_blocks[i]), 1024, sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize)*(config.programSettings->blockSize)); + } + + /* --- Execute actual benchmark kernels --- */ + + double t; + std::vector gefaExecutionTimes; + std::vector geslExecutionTimes; + std::vector gefaWaitTimes; + for (int i 
= 0; i < config.programSettings->numRepetitions; i++) { + + err = buffer_queue.enqueueWriteBuffer(Buffer_a, CL_TRUE, 0, + sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize, A); + ASSERT_CL(err) + err = buffer_queue.enqueueWriteBuffer(Buffer_b, CL_TRUE, 0, + sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize, b); + ASSERT_CL(err) + buffer_queue.finish(); + + // Command queues + // A new command queue is created for every iteration of the algorithm to reduce the overhead + // of too large queues + std::list lu_queues; + std::list top_queues; + std::list left_queues; + std::list> left_buffers; + std::list> top_buffers; + std::list> inner_queues; + std::list> kernels; + + // User event that is used to start actual execution of benchmark kernels + cl::UserEvent start_event(*config.context, &err); + ASSERT_CL(err); + std::list> all_events; + all_events.emplace_back(); + all_events.back().emplace_back(start_event); + all_events.emplace_back(); + + left_buffers.emplace_back(); + top_buffers.emplace_back(); + kernels.emplace_back(); + inner_queues.emplace_back(); + for (uint rep = 0; rep < config.programSettings->kernelReplications; rep++) { + inner_queues.back().emplace_back(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + } + + std::chrono::time_point t1, t2, twait1, twait2; + std::chrono::duration currentwaittime = std::chrono::duration::zero(); + + uint current_replication = 0; + + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << "Start! 
" << std::endl; + MPI_Barrier(MPI_COMM_WORLD); + t1 = std::chrono::high_resolution_clock::now(); + // Trigger the user event that will start the first tasks in the queue + start_event.setStatus(CL_COMPLETE); + + // For every row of blocks create kernels and enqueue them + for (int block_row=0; block_row < config.programSettings->matrixSize / config.programSettings->blockSize * config.programSettings->torus_width; block_row++) { + + // Create Command queues + lu_queues.emplace_back(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + top_queues.emplace_back(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + left_queues.emplace_back(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + + // already emplace new buffer list for next iteration since left and top buffers need to be stored until all MMs are executed. + // this is only the case after the next iteration is finished, because the inner MMs are calculated overlapped with the next iteration! + left_buffers.emplace_back(); + top_buffers.emplace_back(); + + int local_block_row_remainder = (block_row % config.programSettings->torus_width); + int local_block_row= (block_row / config.programSettings->torus_width); + bool in_same_row_as_lu = local_block_row_remainder == config.programSettings->torus_row; + bool in_same_col_as_lu = local_block_row_remainder == config.programSettings->torus_col; + int start_row_index = local_block_row + ((local_block_row_remainder >= config.programSettings->torus_row) ? 1: 0); + int start_col_index = local_block_row + ((local_block_row_remainder >= config.programSettings->torus_col) ? 1: 0); + int num_left_blocks = (in_same_col_as_lu) ? blocks_per_row - start_row_index : 0; + int num_top_blocks = (in_same_row_as_lu) ? blocks_per_row - start_col_index : 0; + int num_inner_block_rows = (blocks_per_row - start_row_index); + int num_inner_block_cols = (num_inner_block_rows > 0) ? 
(blocks_per_row - start_col_index) : 0; + num_inner_block_rows = (num_inner_block_cols > 0) ?num_inner_block_rows : 0; + int num_network_layer_executions = (config.programSettings->matrixSize / config.programSettings->blockSize) - std::min(start_col_index, start_row_index); + num_network_layer_executions = std::max(num_network_layer_executions, 1); + std::vector network_layer_op_flags(num_network_layer_executions); + std::fill(network_layer_op_flags.begin(), network_layer_op_flags.end(), 0); + bool is_calulating_lu_block = (in_same_col_as_lu && in_same_row_as_lu); + + if (is_calulating_lu_block) { + // create the LU kernel + kernels.back().emplace_back(*config.program, "lu", + &err); +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " LU " << local_block_row << "," << local_block_row << std::endl; +#endif + err = kernels.back().back().setArg(0, Buffer_a); + ASSERT_CL(err); + err = kernels.back().back().setArg(1, Buffer_lu1); + ASSERT_CL(err); + err = kernels.back().back().setArg(2, Buffer_lu2); + ASSERT_CL(err); + err = kernels.back().back().setArg(3, local_block_row); + ASSERT_CL(err) + err = kernels.back().back().setArg(4, local_block_row); + ASSERT_CL(err) + err =kernels.back().back().setArg(5, config.programSettings->matrixSize / config.programSettings->blockSize); + ASSERT_CL(err) + all_events.back().emplace_back(); + err = lu_queues.back().enqueueNDRangeKernel(kernels.back().back(), cl::NullRange, cl::NDRange(1), cl::NullRange, &(*std::prev(std::prev(all_events.end())))); + ASSERT_CL(err) + // read back result of LU calculation so it can be distributed + err = lu_queues.back().enqueueReadBuffer(Buffer_lu2, CL_TRUE, 0, + sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize*config.programSettings->blockSize, lu_block); + ASSERT_CL(err) + err = lu_queues.back().enqueueReadBuffer(Buffer_lu1, CL_TRUE, 0, + 
sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize*config.programSettings->blockSize, lu_trans_block, NULL, &all_events.back().back()); + ASSERT_CL(err) + } + + // Exchange LU blocks on all ranks to prevent stalls in MPI broadcast + // All tasks until now need to be executed so we can use the result of the LU factorization and communicate it via MPI with the other FPGAs + lu_queues.back().finish(); + + // Broadcast LU block in column to update all left blocks + MPI_Bcast(lu_block, config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, col_communicator); + // Broadcast LU block in row to update all top blocks + MPI_Bcast(lu_trans_block, config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, row_communicator); + + if (num_top_blocks > 0) { + + // Copy LU block to FPGA for calulation of top blocks only if required + err = top_queues.back().enqueueWriteBuffer(Buffer_lu1, CL_TRUE, 0, + sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize*config.programSettings->blockSize, lu_trans_block); + ASSERT_CL(err) + + // Create top kernels + for (int tops=start_col_index; tops < (config.programSettings->matrixSize / config.programSettings->blockSize); tops++) { + kernels.back().emplace_back(*config.program, "top_update", + &err); +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " Top " << local_block_row << "," << tops << std::endl; +#endif + ASSERT_CL(err); + err = kernels.back().back().setArg(0, Buffer_a); + ASSERT_CL(err); + err = kernels.back().back().setArg(1, Buffer_top); + ASSERT_CL(err); + err = kernels.back().back().setArg(2, Buffer_lu1); + ASSERT_CL(err) + err = kernels.back().back().setArg(3, (tops == start_col_index) ? 
CL_TRUE : CL_FALSE); + ASSERT_CL(err) + err = kernels.back().back().setArg(4, tops); + ASSERT_CL(err) + err = kernels.back().back().setArg(5, local_block_row); + ASSERT_CL(err) + err = kernels.back().back().setArg(6, config.programSettings->matrixSize / config.programSettings->blockSize); + ASSERT_CL(err) + + err = top_queues.back().enqueueNDRangeKernel(kernels.back().back(), cl::NullRange, cl::NDRange(1), cl::NullRange, &(*std::prev(std::prev(all_events.end())))); + ASSERT_CL(err) + + if (tops + 1 == (config.programSettings->matrixSize / config.programSettings->blockSize)) { + all_events.back().emplace_back(); + err = top_queues.back().enqueueReadBuffer(Buffer_top, CL_TRUE, 0, + sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize*config.programSettings->blockSize, top_blocks[tops - start_col_index], + &(*std::prev(std::prev(all_events.end()))), &(all_events.back().back())); + ASSERT_CL(err) + } + else { + err = top_queues.back().enqueueReadBuffer(Buffer_top, CL_TRUE, 0, + sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize*config.programSettings->blockSize, top_blocks[tops - start_col_index]); + ASSERT_CL(err) + } + + } + } + if (num_left_blocks > 0) { + + // Copy LU block to FPGA for calulation of left blocks only if required + err = left_queues.back().enqueueWriteBuffer(Buffer_lu2, CL_TRUE, 0, + sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize*config.programSettings->blockSize, lu_block); + ASSERT_CL(err) + // Create left kernels + for (int tops=start_row_index; tops < (config.programSettings->matrixSize / config.programSettings->blockSize); tops++) { + kernels.back().emplace_back(*config.program, "left_update", + &err); +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " Left " <matrixSize / config.programSettings->blockSize); + ASSERT_CL(err) + + err = left_queues.back().enqueueNDRangeKernel(kernels.back().back(), cl::NullRange, cl::NDRange(1), cl::NullRange, 
&(*std::prev(std::prev(all_events.end())))); + ASSERT_CL(err) + + if (tops + 1 == (config.programSettings->matrixSize / config.programSettings->blockSize)) { + all_events.back().emplace_back(); + err = left_queues.back().enqueueReadBuffer(Buffer_left, CL_TRUE, 0, + sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize*config.programSettings->blockSize, left_blocks[tops - start_row_index], + &(*std::prev(std::prev(all_events.end()))), &(all_events.back().back())); + ASSERT_CL(err) + } + else { + err = left_queues.back().enqueueReadBuffer(Buffer_left, CL_TRUE, 0, + sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize*config.programSettings->blockSize, left_blocks[tops - start_row_index]); + ASSERT_CL(err) + } + } + } + // Wait until all top and left blocks are calculated + top_queues.back().finish(); + left_queues.back().finish(); + + // Send the left and top blocks to all other ranks so they can be used to update all inner blocks + for (int lbi=0; lbi < blocks_per_row - local_block_row; lbi++) { + MPI_Bcast(left_blocks[lbi], config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, row_communicator); + } + for (int tbi=0; tbi < blocks_per_row - local_block_row; tbi++) { + MPI_Bcast(top_blocks[tbi], config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, col_communicator); + } + + // update all remaining inner blocks using only global memory + + all_events.emplace_back(); + //auto communication_events = all_events.back(); + + // Write all left and top blocks to FPGA memory + for (int lbi=0; lbi < num_inner_block_rows; lbi++) { + left_buffers.back().emplace_back(*config.context, CL_MEM_READ_WRITE, + sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize * (config.programSettings->blockSize)); + err = inner_queues.back()[0].enqueueWriteBuffer(left_buffers.back().back(), CL_TRUE, 0, + 
sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize*config.programSettings->blockSize, left_blocks[lbi]); + } + for (int tbi=0; tbi < num_inner_block_cols; tbi++) { + top_buffers.back().emplace_back(*config.context, CL_MEM_READ_WRITE, + sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize * (config.programSettings->blockSize)); + err = inner_queues.back()[0].enqueueWriteBuffer(top_buffers.back().back(), CL_TRUE, 0, + sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize*config.programSettings->blockSize, top_blocks[tbi]); + } + + uint current_update = 0; + uint total_inner_updates_first_row = top_buffers.back().size(); + uint updates_per_replication = total_inner_updates_first_row / config.programSettings->kernelReplications; + uint total_inner_updates = (top_buffers.back().size() - 1) * (left_buffers.back().size() - 1); + uint total_updates_per_replication = total_inner_updates/ config.programSettings->kernelReplications; + + // Wait until data is copied to FPGA + inner_queues.back()[0].finish(); + + for (auto l = std::next(left_buffers.back().begin()); l < left_buffers.back().end(); l++) { + // select the matrix multiplication kernel that should be used for this block updated + kernels.back().emplace_back(*config.program, ("inner_update_mm" + std::to_string(current_replication)).c_str(), + &err); + + int block_col = static_cast((config.programSettings->matrixSize / config.programSettings->blockSize) - num_inner_block_cols); + int block_row = static_cast((config.programSettings->matrixSize / config.programSettings->blockSize) - num_inner_block_rows + std::distance(left_buffers.back().begin(), l)); + ASSERT_CL(err); + err = kernels.back().back().setArg(0, Buffer_a); + ASSERT_CL(err); + err = kernels.back().back().setArg(1, *l); + ASSERT_CL(err) + err = kernels.back().back().setArg(2, *top_buffers.back().begin()); + ASSERT_CL(err) + err = kernels.back().back().setArg(3, block_col); + ASSERT_CL(err) + err = kernels.back().back().setArg(4, block_row); + 
ASSERT_CL(err) + err = kernels.back().back().setArg(5, blocks_per_row); + ASSERT_CL(err) + + if ((left_buffers.back().size() - 1) - current_update <= config.programSettings->kernelReplications) { +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " Inner L Ev " << block_row << "," << block_col << std::endl; +#endif + // this is the last taks that will be enqueued in this queue, so create an event + all_events.back().emplace_back(); + // Distribute the workload over all available matrix multiplication kernels + err = inner_queues.back()[(current_replication)].enqueueNDRangeKernel(kernels.back().back(), cl::NullRange, cl::NDRange(1), cl::NullRange, &(*std::prev(std::prev(all_events.end()))), &(all_events.back().back())); + //err = inner_queues.back()[(current_replication)].enqueueNDRangeKernel(kernels.back().back(), cl::NullRange, cl::NDRange(1), cl::NullRange, &communication_events, &(all_events.back().back())); + } + else { +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " Inner L " << block_row << "," << block_col << std::endl; +#endif + // Distribute the workload over all available matrix multiplication kernels + err = inner_queues.back()[(current_replication)].enqueueNDRangeKernel(kernels.back().back(), cl::NullRange, cl::NDRange(1), cl::NullRange, &(*std::prev(std::prev(all_events.end())))); + //err = inner_queues.back()[(current_replication)].enqueueNDRangeKernel(kernels.back().back(), cl::NullRange, cl::NDRange(1), cl::NullRange, &communication_events); + } + current_update++; + current_replication = (current_replication + 1) % config.programSettings->kernelReplications; + } + + current_update = 0; + for (auto t = top_buffers.back().begin(); t < top_buffers.back().end(); t++) { + // select the matrix multiplication kernel that should be used for this block updated + kernels.back().emplace_back(*config.program, 
("inner_update_mm" + std::to_string(current_replication)).c_str(), + &err); + + int block_col = static_cast((config.programSettings->matrixSize / config.programSettings->blockSize) - num_inner_block_cols + std::distance(top_buffers.back().begin(), t)); + int block_row = static_cast((config.programSettings->matrixSize / config.programSettings->blockSize) - num_inner_block_rows); + + ASSERT_CL(err); + err = kernels.back().back().setArg(0, Buffer_a); + ASSERT_CL(err); + err = kernels.back().back().setArg(1, *left_buffers.back().begin()); + ASSERT_CL(err) + err = kernels.back().back().setArg(2, *t); + ASSERT_CL(err) + err = kernels.back().back().setArg(3, block_col); + ASSERT_CL(err) + err = kernels.back().back().setArg(4, block_row); + ASSERT_CL(err) + err = kernels.back().back().setArg(5, blocks_per_row); + ASSERT_CL(err) + // If number of blocks is not dividable by the number of replications, the first replications will do one update more + if (top_buffers.back().size() - current_update <= config.programSettings->kernelReplications) { +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " Inner T Ev " << block_row << "," << block_col << std::endl; +#endif + // this is the last taks that will be enqueued in this queue, so create an event + all_events.back().emplace_back(); + // Distribute the workload over all available matrix multiplication kernels + err = inner_queues.back()[(current_replication)].enqueueNDRangeKernel(kernels.back().back(), cl::NullRange, cl::NDRange(1), cl::NullRange, &(*std::prev(std::prev(all_events.end()))), &(all_events.back().back())); + } + else { +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " Inner T " << block_row << "," << block_col << std::endl; +#endif + // Distribute the workload over all available matrix multiplication kernels + err = 
inner_queues.back()[(current_replication)].enqueueNDRangeKernel(kernels.back().back(), cl::NullRange, cl::NDRange(1), cl::NullRange, &(*std::prev(std::prev(all_events.end())))); + } + ASSERT_CL(err) + current_update++; + current_replication = (current_replication + 1) % config.programSettings->kernelReplications; + } + + // count the inner MM already to next iteration by creating new buffers in the queue + all_events.emplace_back(); + kernels.emplace_back(); + inner_queues.emplace_back(); + current_update = 0; + for (uint rep = 0; rep < config.programSettings->kernelReplications; rep++) { + inner_queues.back().emplace_back(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + } + + for (auto l = std::next(left_buffers.back().begin()); l < left_buffers.back().end(); l++) { + for (auto t = std::next(top_buffers.back().begin()); t < top_buffers.back().end(); t++) { + // select the matrix multiplication kernel that should be used for this block updated + kernels.back().emplace_back(*config.program, ("inner_update_mm" + std::to_string(current_replication)).c_str(), + &err); + + int block_col = static_cast((config.programSettings->matrixSize / config.programSettings->blockSize) - num_inner_block_cols + std::distance(top_buffers.back().begin(), t)); + int block_row = static_cast((config.programSettings->matrixSize / config.programSettings->blockSize) - num_inner_block_rows + std::distance(left_buffers.back().begin(), l)); + + ASSERT_CL(err); + err = kernels.back().back().setArg(0, Buffer_a); + ASSERT_CL(err); + err = kernels.back().back().setArg(1, *l); + ASSERT_CL(err) + err = kernels.back().back().setArg(2, *t); + ASSERT_CL(err) + err = kernels.back().back().setArg(3, block_col); + ASSERT_CL(err) + err = kernels.back().back().setArg(4, block_row); + ASSERT_CL(err) + err = kernels.back().back().setArg(5, blocks_per_row); + ASSERT_CL(err) + + // If number of blocks is not dividable by the number of replications, the first replications will do one update more + if 
(((top_buffers.back().size() - 1) * (left_buffers.back().size() - 1)) - current_update <= config.programSettings->kernelReplications) { +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " Inner Ev " << block_row << "," << block_col << std::endl; +#endif + // this is the last taks that will be enqueued in this queue, so create an event + all_events.back().emplace_back(); + // Distribute the workload over all available matrix multiplication kernels + err = inner_queues.back()[(current_replication)].enqueueNDRangeKernel(kernels.back().back(), cl::NullRange, cl::NDRange(1), cl::NullRange, &(*std::prev(std::prev(std::prev(all_events.end())))), &(all_events.back().back())); + } + else { +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " Inner " << block_row << "," << block_col << std::endl; +#endif + // Distribute the workload over all available matrix multiplication kernels + err = inner_queues.back()[(current_replication)].enqueueNDRangeKernel(kernels.back().back(), cl::NullRange, cl::NDRange(1), cl::NullRange, &(*std::prev(std::prev(std::prev(all_events.end()))))); + } + + ASSERT_CL(err) + current_update++; + current_replication = (current_replication + 1) % config.programSettings->kernelReplications; + } + } +#ifndef NDEBUG + MPI_Barrier(MPI_COMM_WORLD); + if (is_calulating_lu_block) std::cout << "---------------" << std::endl; + + // // // Execute GEFA + // if (block_row == 0) { + // MPI_Barrier(MPI_COMM_WORLD); + // t1 = std::chrono::high_resolution_clock::now(); + // // Trigger the user event that will start the first tasks in the queue + // start_event.setStatus(CL_COMPLETE); + // } +#endif + +#ifndef NDEBUG + cl::Event::waitForEvents(all_events.back()); + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " Done " << block_row << std::endl; + + if (block_row == 
blocks_per_row * config.programSettings->torus_width - 1) { + // wait until the last LU queue is done since it will be the last required operation + lu_queues.back().finish(); + t2 = std::chrono::high_resolution_clock::now(); + + // Finish all other queues + top_queues.back().finish(); + left_queues.back().finish(); + cl::Event::waitForEvents(all_events.back()); + + } +#endif + } +#ifdef NDEBUG + int count = 0; + for (auto evs : all_events) { + count++; + cl::Event::waitForEvents(evs); + // std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << "Step " << count << " of " << all_events.size() << std::endl; + } + lu_queues.back().finish(); + t2 = std::chrono::high_resolution_clock::now(); + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << "End! " << std::endl; +#endif + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << "Wait time: " << currentwaittime.count() << "s" << std::endl; + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " Exit " << i << std::endl; +#endif + + std::chrono::duration timespan = + std::chrono::duration_cast> + (t2 - t1); + gefaExecutionTimes.push_back(timespan.count()); + + // Execute GESL + t1 = std::chrono::high_resolution_clock::now(); + // lu_queue.enqueueTask(geslkernel); + // lu_queue.finish(); + t2 = std::chrono::high_resolution_clock::now(); + timespan = std::chrono::duration_cast>(t2 - t1); + geslExecutionTimes.push_back(timespan.count()); + } + + /* --- Read back results from Device --- */ + +#ifdef USE_SVM + err = clEnqueueSVMUnmap(compute_queue(), + reinterpret_cast(A), 0, + NULL, NULL); + ASSERT_CL(err) + err = clEnqueueSVMUnmap(compute_queue(), + reinterpret_cast(b), 0, + NULL, NULL); + ASSERT_CL(err) + err = clEnqueueSVMUnmap(compute_queue(), + reinterpret_cast(ipvt), 0, + NULL, NULL); + 
ASSERT_CL(err) + + // read back result from temporary buffer + for (int k=0; k < config.programSettings->matrixSize * config.programSettings->matrixSize; k++) { + A[k] = A_tmp[k]; + } + clSVMFree((*config.context)(), reinterpret_cast(A_tmp)); + +#else + buffer_queue.enqueueReadBuffer(Buffer_a, CL_TRUE, 0, + sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize, A); + // buffer_queue.enqueueReadBuffer(Buffer_b, CL_TRUE, 0, + // sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize, b); + if (!config.programSettings->isDiagonallyDominant) { + buffer_queue.enqueueReadBuffer(Buffer_pivot, CL_TRUE, 0, + sizeof(cl_int)*config.programSettings->matrixSize, ipvt); + } +#endif + + /* --- Clean up MPI communication buffers --- */ + free(lu_block); + free(lu_trans_block); + + for (int i =0; i < left_blocks.size(); i++) { + free(top_blocks[i]); + free(left_blocks[i]); + } + + MPI_Comm_free(&row_communicator); + MPI_Comm_free(&col_communicator); + + std::unique_ptr results( + new linpack::LinpackExecutionTimings{gefaExecutionTimes, geslExecutionTimes}); + + MPI_Barrier(MPI_COMM_WORLD); + + return results; +} + +} // namespace pcie +} // namespace execution +} // namespace linpack + +#endif \ No newline at end of file diff --git a/LINPACK/src/host/execution_types/execution_types.hpp b/LINPACK/src/host/execution_types/execution_types.hpp new file mode 100644 index 00000000..975dd4cf --- /dev/null +++ b/LINPACK/src/host/execution_types/execution_types.hpp @@ -0,0 +1,28 @@ +/* +Copyright (c) 2021 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following 
conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef EXECUTION_TYPES_HPP +#define EXECUTION_TYPES_HPP + +#include "execution_types/execution_pcie.hpp" +#include "execution_types/execution_iec.hpp" + +#endif \ No newline at end of file diff --git a/LINPACK/src/host/linpack_benchmark.cpp b/LINPACK/src/host/linpack_benchmark.cpp index 43628906..1ffc6fcd 100644 --- a/LINPACK/src/host/linpack_benchmark.cpp +++ b/LINPACK/src/host/linpack_benchmark.cpp @@ -31,7 +31,8 @@ SOFTWARE. 
#include /* Project's headers */ -#include "execution.h" +#include "communication_types.hpp" +#include "execution_types/execution_types.hpp" #include "parameters.h" linpack::LinpackProgramSettings::LinpackProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), @@ -106,7 +107,12 @@ linpack::LinpackBenchmark::addAdditionalParseOptions(cxxopts::Options &options) std::unique_ptr linpack::LinpackBenchmark::executeKernel(LinpackData &data) { - auto timings = bm_execution::calculate(*executionSettings, data.A, data.b, data.ipvt); + std::unique_ptr timings; + switch (executionSettings->programSettings->communicationType) { + case hpcc_base::CommunicationType::pcie_mpi : timings = execution::pcie::calculate(*executionSettings, data.A, data.b, data.ipvt); break; + case hpcc_base::CommunicationType::intel_external_channels: timings = execution::iec::calculate(*executionSettings, data.A, data.b, data.ipvt); break; + default: throw std::runtime_error("No calculate method implemented for communication type " + commToString(executionSettings->programSettings->communicationType)); + } #ifdef DISTRIBUTED_VALIDATION distributed_gesl_nopvt_ref(data); #endif @@ -369,7 +375,7 @@ linpack::LinpackBenchmark::validateOutputAndPrintError(linpack::LinpackData &dat for (int j = 0; j < n; j++) { // For each element below it for (int i = 0; i < n; i++) { - std::cout << ref_result->A[n * j + i] << ", "; + std::cout << std::abs(ref_result->A[n * j + i] - data.A[n * j + i]) << ", "; } std::cout << std::endl; } diff --git a/LINPACK/tests/test_kernel_communication.cpp b/LINPACK/tests/test_kernel_communication.cpp index bc677f5b..db40ea97 100644 --- a/LINPACK/tests/test_kernel_communication.cpp +++ b/LINPACK/tests/test_kernel_communication.cpp @@ -24,6 +24,9 @@ class LinpackKernelCommunicationTest : public testing::Test { bm = std::unique_ptr(new linpack::LinpackBenchmark(global_argc, global_argv)); bm->getExecutionSettings().programSettings->isDiagonallyDominant = true; 
bm->getExecutionSettings().programSettings->matrixSize = BLOCK_SIZE; + if (bm->getExecutionSettings().programSettings->communicationType != hpcc_base::CommunicationType::intel_external_channels) { + GTEST_SKIP() << "This test is IEC Specific but other kernel is used"; + } data = bm->generateInputData(); setupExternalChannelFiles(); } @@ -76,6 +79,9 @@ class LinpackKernelCommunicationTestLU : public LinpackKernelCommunicationTest { void SetUp() override { LinpackKernelCommunicationTest::SetUp(); + if (bm->getExecutionSettings().programSettings->communicationType != hpcc_base::CommunicationType::intel_external_channels) { + GTEST_SKIP() << "This test is IEC Specific but other kernel is used"; + } executeKernel(); } @@ -609,29 +615,29 @@ TEST_F(LinpackKernelCommunicationTestLeft, LeftBlockExternalResultisCorrect) { uint matrix_size = bm->getExecutionSettings().programSettings->matrixSize; auto gefa_data = bm->generateInputData(); - // generate uniformly distributed block as top block + // generate uniformly distributed block as left block bm->getExecutionSettings().programSettings->isDiagonallyDominant = false; auto ref_data = bm->generateInputData(); bm->getExecutionSettings().programSettings->isDiagonallyDominant = true; linpack::gefa_ref_nopvt(gefa_data->A, matrix_size,matrix_size); - // For each diagnonal element + // reference implementation to update left block for (int k = 0; k < matrix_size; k++) { - // For each row below the current row for (int j = 0; j < matrix_size; j++) { - // multiply current column to current row and add it up for (int i = k + 1; i < matrix_size; i++) { ref_data->A[j * matrix_size + i] += ref_data->A[j * matrix_size + k] * gefa_data->A[k * matrix_size + i]; } } } - double total_error = 0.0; + double max_error = 0.0; for (int i = 0; i < bm->getExecutionSettings().programSettings->matrixSize; i++) { for (int j = 0; j < bm->getExecutionSettings().programSettings->matrixSize; j++) { - total_error += std::abs(ref_data->A[i * 
bm->getExecutionSettings().programSettings->matrixSize + j] - data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j]); + max_error = std::max(max_error, static_cast(std::abs(ref_data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j] - data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j]))); } } - EXPECT_FLOAT_EQ(total_error, 0.0); + // tolerated delta between expected and real result is machine epsilon + double delta = std::numeric_limits::epsilon(); + EXPECT_NEAR(max_error, 0.0, delta); } TEST_F(LinpackKernelCommunicationTestLeft, LeftBlockGlobalMemLUBufferContentSameAsLUBlock) { @@ -781,13 +787,15 @@ TEST_F(LinpackKernelCommunicationTestTop, TopBlockExternalResultisCorrect) { } } } - double total_error = 0.0; + double max_error = 0.0; for (int i = 0; i < bm->getExecutionSettings().programSettings->matrixSize; i++) { for (int j = 0; j < bm->getExecutionSettings().programSettings->matrixSize; j++) { - total_error += std::abs(ref_data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j] - data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j]); + max_error = std::max(max_error, static_cast(std::abs(ref_data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j] - data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j]))); } } - EXPECT_FLOAT_EQ(total_error, 0.0); + // tolerated delta between expected and real result is machine epsilon + double delta = std::numeric_limits::epsilon(); + EXPECT_NEAR(max_error, 0.0, delta); } TEST_F(LinpackKernelCommunicationTestTop, TopBlockExternalChannelOutputToRightCorrectAmountOfData) { @@ -898,13 +906,15 @@ TEST_F(LinpackKernelCommunicationTestTop, TopBlockExternalChannelOutputToTopCorr TEST_F(LinpackKernelCommunicationTestLU, LUBlockExternalResultisSameAsRef) { auto data2 = bm->generateInputData(); linpack::gefa_ref_nopvt(data2->A, 
bm->getExecutionSettings().programSettings->matrixSize,bm->getExecutionSettings().programSettings->matrixSize); - double total_error = 0.0; + double max_error = 0.0; for (int i = 0; i < bm->getExecutionSettings().programSettings->matrixSize; i++) { for (int j = 0; j < bm->getExecutionSettings().programSettings->matrixSize; j++) { - total_error += std::abs(data2->A[i * bm->getExecutionSettings().programSettings->matrixSize + j] - data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j]); + max_error = std::max(max_error, static_cast(std::abs(data2->A[i * bm->getExecutionSettings().programSettings->matrixSize + j] - data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j]))); } } - EXPECT_FLOAT_EQ(total_error, 0.0); + // tolerated delta between expected and real result is machine epsilon + double delta = std::numeric_limits::epsilon(); + EXPECT_NEAR(max_error, 0.0, delta); } @@ -1024,13 +1034,15 @@ TEST_F(LinpackKernelCommunicationTestLeftOut, LeftBlockExternalResultisCorrect) } } } - double total_error = 0.0; + double max_error = 0.0; for (int i = 0; i < bm->getExecutionSettings().programSettings->matrixSize; i++) { for (int j = 0; j < bm->getExecutionSettings().programSettings->matrixSize; j++) { - total_error += std::abs(ref_data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j] - data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j]); + max_error = std::max(max_error, static_cast(std::abs(ref_data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j] - data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j]))); } } - EXPECT_FLOAT_EQ(total_error, 0.0); + // tolerated delta between expected and real result is machine epsilon + double delta = std::numeric_limits::epsilon(); + EXPECT_NEAR(max_error, 0.0, delta); } @@ -1058,13 +1070,15 @@ TEST_F(LinpackKernelCommunicationTestTopOut, TopBlockExternalResultisCorrect) { } } } - double total_error = 0.0; + double max_error = 0.0; 
for (int i = 0; i < bm->getExecutionSettings().programSettings->matrixSize; i++) { for (int j = 0; j < bm->getExecutionSettings().programSettings->matrixSize; j++) { - total_error += std::abs(ref_data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j] - data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j]); + max_error = std::max(max_error, static_cast(std::abs(ref_data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j] - data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j]))); } } - EXPECT_FLOAT_EQ(total_error, 0.0); + // tolerated delta between expected and real result is machine epsilon + double delta = std::numeric_limits::epsilon(); + EXPECT_NEAR(max_error, 0.0, delta); } @@ -1197,13 +1211,15 @@ TEST_F(LinpackKernelCommunicationTestAll, AllBlockExternalResultisCorrect) { linpack::gefa_ref_nopvt(ref_data->A, matrix_size, matrix_size); - double total_error = 0.0; + double max_error = 0.0; for (int i = 0; i < bm->getExecutionSettings().programSettings->matrixSize; i++) { for (int j = 0; j < bm->getExecutionSettings().programSettings->matrixSize; j++) { - total_error += std::abs(ref_data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j] - data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j]); + max_error = std::max(max_error, static_cast(std::abs(ref_data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j] - data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j]))); } } - EXPECT_FLOAT_EQ(total_error, 0.0); + // tolerated delta between expected and real result is machine epsilon times matrix width + double delta = std::numeric_limits::epsilon(); + EXPECT_NEAR(max_error, 0.0, delta); } TEST_F(LinpackKernelCommunicationTestAll, AllBlockExternalChannelOutputToRightCorrectAmountOfData) { diff --git a/PTRANS/CHANGELOG b/PTRANS/CHANGELOG index f74b3445..7338ff54 100644 --- a/PTRANS/CHANGELOG +++ b/PTRANS/CHANGELOG @@ -2,6 +2,18 @@ This 
file contains all changes made to the source code for each release. +## 1.5 + +#### Changed: +- Refactored the code to support different execution kernels and data distributions +- Changed formatting of the output metrics + +#### Added: +- CPU only implementation of diagonal and PQ data distribution +- FPGA kernel with communication via PCIe and MPI for diagonal and PQ distribution +- FPGA kernel with communication via external channels for PQ distribution + + ## 1.4 #### Changed: diff --git a/PTRANS/CMakeLists.txt b/PTRANS/CMakeLists.txt index 6a577787..7728fe2a 100755 --- a/PTRANS/CMakeLists.txt +++ b/PTRANS/CMakeLists.txt @@ -1,16 +1,23 @@ cmake_minimum_required(VERSION 3.13) -project(PTRANS VERSION 1.4) +project(PTRANS VERSION 1.5) set(READ_KERNEL_NAME transpose_read CACHE STRING "Name of the OpenCL kernel that reads A and sends it over external channel") set(WRITE_KERNEL_NAME transpose_write CACHE STRING "Name of the OpenCL kernel that receives A, adds C to it and stores result") set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices") +set(DEFAULT_COMM_TYPE "AUTO" CACHE STRING "Default communication type if nothing else is given over the --connectivity parameter") +set(DEFAULT_DIST_TYPE "AUTO" CACHE STRING "Default distribution type if nothing is specified over the --handler parameter") set(BLOCK_SIZE 512 CACHE STRING "Block size used in the FPGA kernel") set(CHANNEL_WIDTH 8 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory ") # NUM_REPLICATIONS set to 2 by default to allow build and execution of both versions of the transpose kernel set(NUM_REPLICATIONS 2 CACHE STRING "Number of times the kernels will be replicated") +set(USE_BUFFER_WRITE_RECT_FOR_A No CACHE BOOL "Only valid for PQ with IEC. Use the enqueueWriteBufferRect call to copy only the relevant part of A to memory bank of each replication. 
Whole matrix A will be copied otherwise.") +set(XILINX_UNROLL_INNER_LOOPS No CACHE BOOL "When building for Xilinx devices, unroll the inner loops to create a single pipeline per block and keep memory bursts. This is a tradeoff between resource usage and performance.") -mark_as_advanced(READ_KERNEL_NAME WRITE_KERNEL_NAME) +mark_as_advanced(READ_KERNEL_NAME WRITE_KERNEL_NAME USE_BUFFER_WRITE_RECT_FOR_A XILINX_UNROLL_INNER_LOOPS) set(USE_MPI Yes) +set(USE_OPENMP Yes) +set(USE_DEPRECATED_HPP_HEADER No) +set(COMMUNICATION_TYPE_SUPPORT_ENABLED Yes) include(${CMAKE_SOURCE_DIR}/../cmake/general_benchmark_build_setup.cmake) diff --git a/PTRANS/README.md b/PTRANS/README.md index a711ab42..dff4ed06 100644 --- a/PTRANS/README.md +++ b/PTRANS/README.md @@ -12,7 +12,8 @@ _Introduction to the HPCChallenge Benchmark Suite_ available ## Additional Dependencies -The benchmark needs no additional dependencies than the ones given in the main [README](../README.md). +In addition to the dependencies defined in the main [README](../README.md), the benchmark optionally requires MKL for the CPU only implementation of the benchmark. +If MKL is not found, the benchmark can still be built without support for the CPU only execution! ## Build @@ -37,7 +38,8 @@ The targets below can be used to build the benchmark and its kernels: The currently supported values for KERNEL_FILE_NAME are listed below where `transpose_diagonal` is set to be the default for the base run: -- transpose_diagonal +- `transpose_diagonal`: Transposes a matrix that is distributed with the diagonal data handler +- `transpose_pq`: Transposes a matrix that is distributed with the PQ data handler. P = Q has to hold! 
You can build for example the host application by running @@ -51,7 +53,6 @@ Next to the common configuration options given in the [README](../README.md) of Name | Default | Description | ---------------- |-------------|--------------------------------------| - `DATA_TYPE` | float | Data type used for calculation | `READ_KERNEL_NAME` | transpose_read | Name of the kernel that reads A from global memory and sends it to an external channel (only needed for own implementations) | `WRITE_KERNEL_NAME` | transpose_write | Name of the kernel that receives a from an external channel and adds it to B (only needed for own implementations) | `BLOCK_SIZE` | 512 | Block size used by the kernel to transpose the matrix | @@ -75,35 +76,58 @@ For more information on available input parameters run The clock precision seems to be 1.00000e+01ns ------------------------------------------------------------- Implementation of the matrix transposition benchmark proposed in the HPCC benchmark suite for FPGA. - Version: 1.3 + Version: 1.5 + + MPI Version: 3.1 + Config. Time: Fri Jul 16 11:51:37 UTC 2021 + Git Commit: 2a12191-dirty Usage: - ./Transpose_xilinx [OPTION...] - - -f, --file arg Kernel file name - -n, arg Number of repetitions (default: 10) - -i, Use memory Interleaving - --skip-validation Skip the validation of the output data. This will - speed up execution and helps when working with special - data types. - --device arg Index of the device that has to be used. If not - given you will be asked which device to use if there are - multiple devices available. (default: -1) - --platform arg Index of the platform that has to be used. If not - given you will be asked which platform to use if there - are multiple platforms available. 
(default: -1) - -r, arg Number of used kernel replications (default: 4) - --test Only test given configuration and skip execution and - validation - -h, --help Print this help - -m, arg Matrix size in number of blocks in one dimension - (default: 8) - -b, arg Block size in number of values in one dimension - (default: 512) - --handler arg Specify the used data handler that distributes the - data over devices and memory banks (default: distext) + bin/Transpose_intel [OPTION...] + + -f, --file arg Kernel file name + -n, arg Number of repetitions (default: 10) + -i, Use memory Interleaving + --skip-validation Skip the validation of the output data. This will + speed up execution and helps when working with + special data types. + --device arg Index of the device that has to be used. If not + given you will be asked which device to use if + there are multiple devices available. (default: -1) + --platform arg Index of the platform that has to be used. If not + given you will be asked which platform to use if + there are multiple platforms available. (default: + -1) + -r, arg Number of used kernel replications (default: 4) + --test Only test given configuration and skip execution + and validation + -h, --help Print this help + -m, arg Matrix size in number of blocks in one dimension + (default: 8) + -b, arg Block size in number of values in one dimension + (default: 512) + --comm-type arg Used communication type for inter-FPGA communication + (default: AUTO) + --distribute-buffers Distribute buffers over memory banks. This will + use three memory banks instead of one for a single + kernel replication, but kernel replications may + interfere. This is an Intel only attribute, since + buffer placement is decided at compile time for + Xilinx FPGAs. + --handler arg Specify the used data handler that distributes + the data over devices and memory banks (default: + DIAG) +Available options for `--comm-type`: + +- `CPU`: CPU only execution. MKL required. 
+- `IEC`: Intel external channels are used by the kernels for communication. +- `PCIE`: PCIe and MPI are used to exchange data between FPGAs over the CPU. + +Possible options for `--handler`: +- `DIAG`: Diagonal distribution between FPGAs. Simplifies memory accesses by creating one-dimensional array of matrix blocks. +- `PQ`: PQ distribution of data between FPGAs. P = Q, similar to the distribution used in the LINPAK implementation. To execute the unit and integration tests run @@ -111,26 +135,30 @@ To execute the unit and integration tests run in the `bin` folder within the build directory. It will run an emulation of the kernel and execute some functionality tests. - ## Output Interpretation An example output from an emulation is given below: + ------------------------------------------------------------- + Validate output... + ------------------------------------------------------------- Maximum error: 7.62939e-06 < 1.19209e-05 Mach. Epsilon: 1.19209e-07 - Validation Time: 4.69627e+00 s - calc calc FLOPS Net [GB/s] Mem [GB/s] - avg: 1.15169e-01 3.72929e+10 1.49172e+11 4.47515e+11 - best: 1.14216e-01 3.76041e+10 1.50416e+11 4.51249e+11 + Validation Time: 4.66312e+00 s + total [s] transfer [s] calc [s] calc FLOPS Mem [B/s] PCIe [B/s] + avg: 1.15886e+00 1.04112e+00 1.17743e-01 9.11940e+09 1.09433e+11 1.23760e+10 + best: 1.13323e+00 1.02481e+00 1.08424e-01 9.90319e+09 1.18838e+11 1.25730e+10 Validation: SUCCESS! -The output gives the average and best calculation time for the transposition with the derived metrics. +The output gives the average and best calculation time for the transposition and important derived metrics based on these times. For the average and best timings, we have the following columns: -- `calc`: Calculation time in seconds, which is the pure kernel execution time without data transfer from the host. -- `calc FLOPS`: Achieved FLOPS just considering the calculation time. -- `Net [GB/s]`: Used total network bandwidth in GB/s. 
-- `Mem [GB/s]`: Used total global memory bandwidth in GB/s. +- `total [s]`: Total execution time in seconds of a whole repetition of the experiment that includes transfer and calcuation time. +- `transfer [s]`: Time in seconds that is required to transfer the data buffers to a memory location that can be accessed by the kernels on the FPGA board. +- `calc [s]`: Time in seconds to execute a single repetition of the matrix transposition also including communication between devices. +- `calc FLOPS`: Derived floating-point operations per second based on the calculation time. +- `Mem [B/s]`: Derived bandwidth of the memory that is accessed by the FPGA kernels during calculation based on the calculation time. +- `PCIe [B/s]`: Derived bandwidth of the transfer interface that is used to copy the buffers to a memory location accessible by the FPGA based on the transfer time. The `Maximum Error` field shows the largest error that was computed. Since the arithmetic intensity of the algorithm is quite low and only one addition is required to calculate one value of the result matrix, the error should be close to the machine epsilon, which depends on the chosen data type. diff --git a/PTRANS/configs/Nallatech_520N_CPU.cmake b/PTRANS/configs/Nallatech_520N_CPU.cmake new file mode 100644 index 00000000..cbe60d34 --- /dev/null +++ b/PTRANS/configs/Nallatech_520N_CPU.cmake @@ -0,0 +1,20 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) + +set(CMAKE_CXX_FLAGS "-march=native" CACHE STRING "Additional flags used for every build type" FORCE) +set(CMAKE_C_FLAGS "-march=native" CACHE STRING "Additional flags used for every build type" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE) +set(BLOCK_SIZE 16 CACHE STRING "Block size used in the FPGA kernel" FORCE) +set(NUM_REPLICATIONS 1 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) diff --git a/PTRANS/configs/Nallatech_520N_pcie.cmake b/PTRANS/configs/Nallatech_520N_pcie.cmake new file mode 100644 index 00000000..e5a49184 --- /dev/null +++ b/PTRANS/configs/Nallatech_520N_pcie.cmake @@ -0,0 +1,20 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] -DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "p520_hpc_sg280l" CACHE STRING "" FORCE) +set(AOC_FLAGS "-fpc -fp-relaxed -no-interleaving=default" CACHE STRING "" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE) +set(BLOCK_SIZE 512 CACHE STRING "Block size used in the FPGA kernel" FORCE) +set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. 
Also specifies the width of memory" FORCE) +set(NUM_REPLICATIONS 4 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) diff --git a/PTRANS/configs/Xilinx_U250_DDR_PCIE.cmake b/PTRANS/configs/Xilinx_U250_DDR_PCIE.cmake new file mode 100644 index 00000000..6f446e97 --- /dev/null +++ b/PTRANS/configs/Xilinx_U250_DDR_PCIE.cmake @@ -0,0 +1,23 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] -DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u250_xdma_201830_2" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_pcie.u250.generator.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.u250.ini CACHE FILEPATH "" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE) +set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE) +set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. 
Also specifies the width of memory" FORCE) +set(NUM_REPLICATIONS 4 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/PTRANS/configs/Xilinx_U250_DDR_PCIE_unroll.cmake b/PTRANS/configs/Xilinx_U250_DDR_PCIE_unroll.cmake new file mode 100644 index 00000000..533a44bf --- /dev/null +++ b/PTRANS/configs/Xilinx_U250_DDR_PCIE_unroll.cmake @@ -0,0 +1,24 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] -DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u250_xdma_201830_2" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_pcie.u250.generator.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.u250.ini CACHE FILEPATH "" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE) +set(BLOCK_SIZE 64 CACHE STRING "Block size used in the FPGA kernel" FORCE) +set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory" FORCE) +set(NUM_REPLICATIONS 4 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) +set(XILINX_UNROLL_INNER_LOOPS Yes CACHE BOOL "When building for Xilinx devices, unroll the inner loops to create a single pipeline per block and keep memory bursts. This is a tradeoff between resource usage and performance." 
FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake b/PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake new file mode 100644 index 00000000..46ef245c --- /dev/null +++ b/PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake @@ -0,0 +1,23 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] -DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini CACHE FILEPATH "" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE) +set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE) +set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/PTRANS/scripts/build_520n_pcie.sh b/PTRANS/scripts/build_520n_pcie.sh new file mode 100644 index 00000000..09de20c2 --- /dev/null +++ b/PTRANS/scripts/build_520n_pcie.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# +# Synthesize the PTRANS kernel for the Nallatech 520N board. +# This is an example script, how the synthesis can be started on Noctua using a HPCC FPGA configuration file. 
+# Submit this script to sbatch in this folder! +# +#SBATCH -p fpgasyn +#SBATCH -J PTRANS + +module load intelFPGA_pro/20.4.0 +module load nalla_pcie/19.4.0_hpc +module load intel +module load devel/CMake/3.15.3-GCCcore-8.3.0 + +SCRIPT_PATH=${SLURM_SUBMIT_DIR} + +BENCHMARK_DIR=${SCRIPT_PATH}/../ + +SYNTH_DIR=${PFS_SCRATCH}/synth/520n/multi_fpga/PTRANS/pq_pcie + +CONFIG_NAMES=("Nallatech_520N_pcie") + +for r in "${CONFIG_NAMES[@]}"; do + BUILD_DIR=${SYNTH_DIR}/20.4.0-19.4.0_hpc-${r} + + mkdir -p ${BUILD_DIR} + cd ${BUILD_DIR} + + cmake ${BENCHMARK_DIR} -DCMAKE_BUILD_TYPE=Release -DHPCC_FPGA_CONFIG=${BENCHMARK_DIR}/configs/${r}.cmake + + make transpose_PQ_PCIE_intel Transpose_intel +done diff --git a/PTRANS/scripts/build_u250.sh b/PTRANS/scripts/build_u250.sh new file mode 100644 index 00000000..49da9930 --- /dev/null +++ b/PTRANS/scripts/build_u250.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +SCRIPT_PATH=${PWD} + +BENCHMARK_DIR=${SCRIPT_PATH}/../ + +SYNTH_DIR=/mnt/local/meyermar/synth/u250/PTRANS + +CONFIG_NAMES=("Xilinx_U250_DDR_PCIE") + +for r in "${CONFIG_NAMES[@]}"; do + BUILD_DIR=${SYNTH_DIR}/${r} + + mkdir -p ${BUILD_DIR} + cd ${BUILD_DIR} + + cmake ${BENCHMARK_DIR} -DCMAKE_BUILD_TYPE=Release -DHPCC_FPGA_CONFIG=${BENCHMARK_DIR}/configs/${r}.cmake + + make transpose_PQ_PCIE_xilinx Transpose_xilinx +done diff --git a/PTRANS/scripts/build_u250_unroll.sh b/PTRANS/scripts/build_u250_unroll.sh new file mode 100644 index 00000000..01f7ada4 --- /dev/null +++ b/PTRANS/scripts/build_u250_unroll.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +SCRIPT_PATH=${PWD} + +BENCHMARK_DIR=${SCRIPT_PATH}/../ + +SYNTH_DIR=/mnt/local/meyermar/synth/u250/PTRANS + +CONFIG_NAMES=("Xilinx_U250_DDR_PCIE_unroll") + +for r in "${CONFIG_NAMES[@]}"; do + BUILD_DIR=${SYNTH_DIR}/${r} + + mkdir -p ${BUILD_DIR} + cd ${BUILD_DIR} + + cmake ${BENCHMARK_DIR} -DCMAKE_BUILD_TYPE=Release -DHPCC_FPGA_CONFIG=${BENCHMARK_DIR}/configs/${r}.cmake + + make transpose_PQ_PCIE_xilinx Transpose_xilinx +done diff --git 
a/PTRANS/scripts/build_u280_alveo.sh b/PTRANS/scripts/build_u280_alveo.sh new file mode 100644 index 00000000..a175d755 --- /dev/null +++ b/PTRANS/scripts/build_u280_alveo.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +SCRIPT_PATH=${PWD} + +BENCHMARK_DIR=${SCRIPT_PATH}/../ + +SYNTH_DIR=/mnt/scratch/meyermar/synth/u280/PTRANS + +CONFIG_NAMES=("Xilinx_U280_DDR_PCIE") + +for r in "${CONFIG_NAMES[@]}"; do + BUILD_DIR=${SYNTH_DIR}/${r} + + mkdir -p ${BUILD_DIR} + cd ${BUILD_DIR} + + cmake ${BENCHMARK_DIR} -DCMAKE_BUILD_TYPE=Release -DHPCC_FPGA_CONFIG=${BENCHMARK_DIR}/configs/${r}.cmake + + make transpose_PQ_PCIE_xilinx Transpose_xilinx +done diff --git a/PTRANS/scripts/build_u280_alveo_ddr_singleloop.sh b/PTRANS/scripts/build_u280_alveo_ddr_singleloop.sh new file mode 100644 index 00000000..2e011ab3 --- /dev/null +++ b/PTRANS/scripts/build_u280_alveo_ddr_singleloop.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +SCRIPT_PATH=${PWD} + +BENCHMARK_DIR=${SCRIPT_PATH}/../ + +SYNTH_DIR=/mnt/scratch/meyermar/synth/u280/PTRANS + +CONFIG_NAMES=("Xilinx_U280_DDR_PCIE") + +for r in "${CONFIG_NAMES[@]}"; do + BUILD_DIR=${SYNTH_DIR}/${r}-singleloop + + mkdir -p ${BUILD_DIR} + cd ${BUILD_DIR} + + cmake ${BENCHMARK_DIR} -DCMAKE_BUILD_TYPE=Release -DHPCC_FPGA_CONFIG=${BENCHMARK_DIR}/configs/${r}.cmake + + make transpose_PQ_PCIE_singleloop_xilinx Transpose_xilinx +done diff --git a/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini b/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini new file mode 100644 index 00000000..bce5c3ff --- /dev/null +++ b/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini @@ -0,0 +1,2 @@ +kernel_frequency=300 + diff --git a/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini b/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini new file mode 100644 index 00000000..7e52533c --- /dev/null +++ b/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini @@ -0,0 +1,4 @@ +kernel_frequency=450 + +[hls] 
+max_memory_ports=all diff --git a/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.u250.ini b/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.u250.ini new file mode 100644 index 00000000..a1334eb2 --- /dev/null +++ b/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.u250.ini @@ -0,0 +1,4 @@ +kernel_frequency=300 + +[hls] +max_memory_ports=all diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini new file mode 100644 index 00000000..882d5af1 --- /dev/null +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini @@ -0,0 +1,19 @@ + +# Set number of available SLRs +# PY_CODE_GEN num_slrs = 3 +# PY_CODE_GEN num_ddrs = 2 + +[connectivity] +nk=transpose0:$PY_CODE_GEN num_replications$ + +# Assign kernels to the SLRs +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +slr=transpose0_$PY_CODE_GEN i + 1$:SLR$PY_CODE_GEN i % num_slrs$ +# PY_CODE_GEN block_end + +# Assign the kernels to the memory ports +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem0:DDR[$PY_CODE_GEN i % num_ddrs$] +sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem1:DDR[$PY_CODE_GEN i % num_ddrs$] +sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem2:DDR[$PY_CODE_GEN i % num_ddrs$] +# PY_CODE_GEN block_end diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.u250.generator.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.u250.generator.ini new file mode 100644 index 00000000..f5fbbfa7 --- /dev/null +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.u250.generator.ini @@ -0,0 +1,17 @@ + +# Set number of available SLRs +# PY_CODE_GEN num_slrs = 4 +# PY_CODE_GEN num_ddrs = 4 + +[connectivity] +nk=transpose0:$PY_CODE_GEN num_replications$ + +# Assign kernels to the SLRs +# PY_CODE_GEN block_start 
[replace(local_variables=locals()) for i in range(num_replications)] +slr=transpose0_$PY_CODE_GEN i + 1$:SLR$PY_CODE_GEN i % num_slrs$ +# PY_CODE_GEN block_end + +# Assign the kernels to the memory ports +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem:DDR[$PY_CODE_GEN i % num_ddrs$] +# PY_CODE_GEN block_end diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie_spread_banks.hbm.generator.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie_spread_banks.hbm.generator.ini new file mode 100644 index 00000000..bda2578e --- /dev/null +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie_spread_banks.hbm.generator.ini @@ -0,0 +1,19 @@ + +# Set number of available SLRs +# PY_CODE_GEN num_slrs = 3 +# PY_CODE_GEN num_hbms = 32 + +[connectivity] +nk=transpose0:$PY_CODE_GEN num_replications$ + +# Assign kernels to the SLRs +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +slr=transpose0_$PY_CODE_GEN i + 1$:SLR$PY_CODE_GEN i % num_slrs$ +# PY_CODE_GEN block_end + +# Assign the kernels to the memory ports +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[$PY_CODE_GEN (3*i) % num_hbms$] +sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[$PY_CODE_GEN (3*i + 1) % num_hbms$] +sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[$PY_CODE_GEN (3*i + 2) % num_hbms$] +# PY_CODE_GEN block_end diff --git a/PTRANS/src/common/parameters.h.in b/PTRANS/src/common/parameters.h.in index 1ea39a89..cccac277 100644 --- a/PTRANS/src/common/parameters.h.in +++ b/PTRANS/src/common/parameters.h.in @@ -7,6 +7,8 @@ #define WRITE_KERNEL_NAME "@WRITE_KERNEL_NAME@" #define DEFAULT_REPETITIONS @DEFAULT_REPETITIONS@ #define DEFAULT_MATRIX_SIZE @DEFAULT_MATRIX_SIZE@ +#define DEFAULT_COMM_TYPE "@DEFAULT_COMM_TYPE@" +#define DEFAULT_DIST_TYPE 
"@DEFAULT_DIST_TYPE@" #define DEFAULT_PLATFORM @DEFAULT_PLATFORM@ #define DEFAULT_DEVICE @DEFAULT_DEVICE@ @@ -22,6 +24,8 @@ #define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@ #cmakedefine USE_SVM +#cmakedefine USE_BUFFER_WRITE_RECT_FOR_A +#cmakedefine XILINX_UNROLL_INNER_LOOPS /* Short description of the program. diff --git a/PTRANS/src/device/CMakeLists.txt b/PTRANS/src/device/CMakeLists.txt index 4d5f9338..7542a861 100644 --- a/PTRANS/src/device/CMakeLists.txt +++ b/PTRANS/src/device/CMakeLists.txt @@ -1,17 +1,19 @@ +set(KERNEL_REPLICATION_ENABLED Yes CACHE INTERNAL "Enables kernel replication in the CMake target generation function") + include(${CMAKE_SOURCE_DIR}/../cmake/kernelTargets.cmake) if (INTELFPGAOPENCL_FOUND) - generate_kernel_targets_intel(transpose_diagonal transpose_diagonal_c2) - add_test(NAME test_emulation_diagonal_intel COMMAND Transpose_intel -f transpose_diagonal_emulate.aocx -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) - add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Transpose_intel -f transpose_diagonal_emulate.aocx -n 1 -m 1 + generate_kernel_targets_intel(transpose_DIAG_IEC transpose_c2_DIAG_IEC transpose_PQ_PCIE transpose_DIAG_PCIE transpose_PQ_IEC) + add_test(NAME test_emulation_diagonal_intel COMMAND Transpose_intel -f transpose_DIAG_IEC_emulate.aocx -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Transpose_intel -f transpose_DIAG_IEC_emulate.aocx -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() if (VITIS_FOUND) - generate_kernel_targets_xilinx(transpose_diagonal) - add_test(NAME test_emulation_diagonal_xilinx COMMAND Transpose_xilinx -f transpose_diagonal_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) - add_test(NAME test_output_parsing_xilinx COMMAND 
${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Transpose_xilinx -f transpose_diagonal_emulate.xclbin -n 1 -m 1 + generate_kernel_targets_xilinx(transpose_PQ_PCIE transpose_DIAG_PCIE) + add_test(NAME test_emulation_PQ_PCIE_xilinx COMMAND Transpose_xilinx -f transpose_PQ_PCIE_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Transpose_xilinx -f transpose_PQ_PCIE_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() diff --git a/PTRANS/src/device/transpose_diagonal.cl b/PTRANS/src/device/transpose_DIAG_IEC.cl similarity index 100% rename from PTRANS/src/device/transpose_diagonal.cl rename to PTRANS/src/device/transpose_DIAG_IEC.cl diff --git a/PTRANS/src/device/transpose_DIAG_PCIE.cl b/PTRANS/src/device/transpose_DIAG_PCIE.cl new file mode 100644 index 00000000..614800f3 --- /dev/null +++ b/PTRANS/src/device/transpose_DIAG_PCIE.cl @@ -0,0 +1,175 @@ +/****************************************************************************** + * Author: Arjun Ramaswami + * + * Edited by Marius Meyer: + * - Adapt to used kernel signature + * - Change to row-column loop structure + *****************************************************************************/ + +#include "parameters.h" + +/** +* Load a block of A into local memory in a reordered fashion +* to transpose it half-way +* +* +* @param A Buffer for matrix A +* @param local_buffer The local memory buffer the block is stored into +* @param current_block Index of the current block used to calculate the offset in global memory +* +*/ +void +load_chunk_of_a(__global DEVICE_DATA_TYPE *restrict A, + DEVICE_DATA_TYPE local_buffer[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], + const ulong current_block, + const ulong chunk) { + + ulong local_mem_converted_row = chunk; + + DEVICE_DATA_TYPE rotate_in[CHANNEL_WIDTH]; + + ulong load_address = 
current_block * BLOCK_SIZE * BLOCK_SIZE + chunk * CHANNEL_WIDTH; + + // Blocks of a will be stored columnwise in global memory +__attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + rotate_in[unroll_count] = A[load_address + unroll_count]; + } + + unsigned rot = (chunk / (BLOCK_SIZE / CHANNEL_WIDTH)) & (CHANNEL_WIDTH - 1); + + // rotate temporary buffer to store data into local buffer +__attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + // every block of (N / CHANNEL_WIDTH), rotates the index by 1 + // store in double buffer + local_buffer[local_mem_converted_row][unroll_count] = rotate_in[(unroll_count + CHANNEL_WIDTH - rot) + & (CHANNEL_WIDTH - 1)]; + } +} + +/** +* send a chunk of A into local memory in a reordered fashion +* to transpose it half-way +* +* +* @param A Buffer for matrix A +* @param local_buffer The local memory buffer the block is stored into +* @param current_block Index of the current block used to calculate the offset in global memory +* +*/ +void +load_chunk_of_trans_a(const DEVICE_DATA_TYPE local_buffer[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], + DEVICE_DATA_TYPE chunk_out[CHANNEL_WIDTH], + const ulong chunk) { + + DEVICE_DATA_TYPE rotate_out[CHANNEL_WIDTH]; + + ulong base = (chunk & (BLOCK_SIZE / CHANNEL_WIDTH - 1)) * BLOCK_SIZE; + ulong offset = (chunk / (BLOCK_SIZE / CHANNEL_WIDTH)) / CHANNEL_WIDTH; + + +__attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + unsigned rot = ((CHANNEL_WIDTH + unroll_count - (chunk / (BLOCK_SIZE / CHANNEL_WIDTH))) * (BLOCK_SIZE / CHANNEL_WIDTH)) & + (BLOCK_SIZE - 1); + unsigned row_rotate = base + offset + rot; + rotate_out[unroll_count] = local_buffer[row_rotate][unroll_count]; + } + + unsigned rot_out = (chunk / (BLOCK_SIZE / CHANNEL_WIDTH)) & 
(CHANNEL_WIDTH - 1); + + // rotate temporary buffer to store data into local buffer + __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + chunk_out[unroll_count] = rotate_out[(unroll_count + rot_out) & (CHANNEL_WIDTH - 1)]; + } + + +} + +void +add_a_and_b(__global DEVICE_DATA_TYPE *restrict B, + const DEVICE_DATA_TYPE local_buffer_a[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], + DEVICE_DATA_TYPE local_buffer_a_plus_b[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], + const ulong block, + const ulong chunk) { + + DEVICE_DATA_TYPE data_chunk[CHANNEL_WIDTH]; + + load_chunk_of_trans_a(local_buffer_a, data_chunk, chunk); + + ulong ls_address = block * BLOCK_SIZE * BLOCK_SIZE + chunk * CHANNEL_WIDTH; + // load tranposed A from global memory + __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + data_chunk[unroll_count] += B[ls_address + unroll_count]; + } + + __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + local_buffer_a_plus_b[chunk][unroll_count] = data_chunk[unroll_count]; + } +} + + +void +store_a(__global DEVICE_DATA_TYPE *restrict A_out, + const DEVICE_DATA_TYPE local_buffer_a_plus_b[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], + const ulong block, + const ulong chunk) { + + ulong ls_address = block * BLOCK_SIZE * BLOCK_SIZE + chunk * CHANNEL_WIDTH; + + __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + A_out[ls_address + unroll_count] = local_buffer_a_plus_b[chunk][unroll_count]; + } +} + +// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] + +/** + * Read blocks of matrix A and transpose them in memory. + * Write the block into an external channel. 
+ * + * Will do the following: + * + * A -> trans(A) -> ext. ch + * + * @param A Buffer for matrix A + * @param B Buffer for matrix B + * @param A_out Buffer for result matrix + * @param number_of_blocks The number of blocks that will be processed starting from the block offset + */ +__attribute__((max_global_work_dim(0))) +__kernel +void transpose/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, + __global DEVICE_DATA_TYPE *restrict B, + __global DEVICE_DATA_TYPE *restrict A_out, + const uint number_of_blocks) { + + // local memory double buffer for a matrix block + DEVICE_DATA_TYPE a_block[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH] __attribute__((xcl_array_partition(cyclic, CHANNEL_WIDTH,1))) __attribute__((xcl_array_partition(cyclic, CHANNEL_WIDTH,2))); + // local memory double buffer for a matrix block + DEVICE_DATA_TYPE a_plus_b_block[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH] __attribute__((xcl_array_partition(cyclic, CHANNEL_WIDTH,1))) __attribute__((xcl_array_partition(cyclic, CHANNEL_WIDTH,2))); + + // transpose the matrix block-wise from global memory + #pragma loop_coalesce + for (uint block = 0; block < number_of_blocks; block++) { + // Combine all three steps in single pipeline to reduce overhead + for (uint chunk = 0; chunk < 3 * BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH; chunk++) { + uint current_chunk = chunk & (BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH - 1); + switch (chunk / (BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH)) { + // read in block of A from global memory and store it in a memory efficient manner for transpose + case 0: load_chunk_of_a(A, a_block, block, current_chunk); break; + // read transposed block of A from local memory buffer and add B from global memory to it + case 1: add_a_and_b(B, a_block, a_plus_b_block, block, current_chunk); break; + // Store result in global memory + case 2: store_a(A_out, a_plus_b_block, block, current_chunk); break; + } + } + } +} + +// PY_CODE_GEN block_end diff --git 
a/PTRANS/src/device/transpose_PQ_IEC.cl b/PTRANS/src/device/transpose_PQ_IEC.cl new file mode 100644 index 00000000..e219ae1c --- /dev/null +++ b/PTRANS/src/device/transpose_PQ_IEC.cl @@ -0,0 +1,205 @@ +/****************************************************************************** + * Author: Arjun Ramaswami + * + * Edited by Marius Meyer: + * - Adapt to used kernel signature + * - Change to row-column loop structure + *****************************************************************************/ + +#include "parameters.h" + +// Need some depth to our channels to accommodate their bursty filling. +#ifdef INTEL_FPGA +#pragma OPENCL EXTENSION cl_intel_channels : enable + +typedef struct { + DEVICE_DATA_TYPE data[CHANNEL_WIDTH]; +} ch_data; + +// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_total_replications)] +// Channel used to send the transposed blocks of A +channel ch_data chan_a_out/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_output_ch" + str(i) + "\""*/), depth(1))); +channel ch_data chan_a_in/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_input_ch" + str(2 * (i // 2) + ((i + 1) % 2)) + "\""*/), depth(1))); +// PY_CODE_GEN block_end +#endif + +/** +* Load a block of A into local memory in a reordered fashion +* to transpose it half-way +* +* +* @param A Buffer for matrix A +* @param local_buffer The local memory buffer the block is stored into +* @param current_block Index of the current block used to calculate the offset in global memory +* +*/ +void +load_chunk_of_a(__global DEVICE_DATA_TYPE *restrict A, + DEVICE_DATA_TYPE local_buffer[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], + const ulong block_row, + const ulong block_col, + const ulong width_in_blocks, + const ulong row, + const ulong col) { + + ulong local_mem_converted_row = row * (BLOCK_SIZE / CHANNEL_WIDTH) + col; + + DEVICE_DATA_TYPE rotate_in[CHANNEL_WIDTH]; + + ulong load_address = block_row * BLOCK_SIZE * BLOCK_SIZE * 
width_in_blocks + + block_col * BLOCK_SIZE + + row * BLOCK_SIZE * width_in_blocks + + col * CHANNEL_WIDTH; + + // Blocks of a will be stored columnwise in global memory +__attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + rotate_in[unroll_count] = A[load_address + unroll_count]; + } + + unsigned rot = row & (CHANNEL_WIDTH - 1); + + // rotate temporary buffer to store data into local buffer +__attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + // every block of (N / CHANNEL_WIDTH), rotates the index by 1 + // store in double buffer + local_buffer[local_mem_converted_row][unroll_count] = rotate_in[(unroll_count + CHANNEL_WIDTH - rot) + & (CHANNEL_WIDTH - 1)]; + } +} + +// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_total_replications)] + +/** +* send a chunk of A into local memory in a reordered fashion +* to transpose it half-way +* +* +* @param A Buffer for matrix A +* @param local_buffer The local memory buffer the block is stored into +* @param current_block Index of the current block used to calculate the offset in global memory +* +*/ +void +send_chunk_of_a/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE local_buffer[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], + const ulong row, + const ulong col) { + + DEVICE_DATA_TYPE rotate_out[CHANNEL_WIDTH]; + + ulong base = col * BLOCK_SIZE; + ulong offset = row / CHANNEL_WIDTH; + + +__attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + unsigned rot = ((CHANNEL_WIDTH + unroll_count - row) * (BLOCK_SIZE / CHANNEL_WIDTH)) & + (BLOCK_SIZE - 1); + unsigned row_rotate = base + offset + rot; + rotate_out[unroll_count] = local_buffer[row_rotate][unroll_count]; + } + + unsigned rot_out = row & (CHANNEL_WIDTH - 1); + + ch_data data; + // rotate 
temporary buffer to store data into local buffer +__attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + data.data[unroll_count] = rotate_out[(unroll_count + rot_out) & (CHANNEL_WIDTH - 1)]; + } + + write_channel_intel(chan_a_out/*PY_CODE_GEN i*/, data); +} + +/** + * Read blocks of matrix A and transpose them in memory. + * Write the block into an external channel. + * + * Will do the following: + * + * A -> trans(A) -> ext. ch + * + * @param A Buffer for matrix A + * @param block_offset The first block that will be processed in the provided buffer + * @param number_of_blocks The number of blocks that will be processed starting from the block offset + */ +__attribute__((max_global_work_dim(0))) +__kernel +void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, + const ulong offset, + const ulong width_in_blocks, + const ulong height_in_blocks, + const ulong number_of_blocks) { + + // local memory double buffer for a matrix block + DEVICE_DATA_TYPE a_block[2][BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH] __attribute__((xcl_array_partition(cyclic, CHANNEL_WIDTH,1))) __attribute__((xcl_array_partition(cyclic, CHANNEL_WIDTH,2))); + + // transpose the matrix block-wise from global memory + // One extra iteration to empty double buffer + #pragma loop_coalesce + for (ulong block = offset; block < number_of_blocks + offset + 1; block++) { + // read in block from global memory and store it in a memory efficient manner + for (ulong row = 0; row < BLOCK_SIZE; row++) { + for (ulong col = 0; col < BLOCK_SIZE / CHANNEL_WIDTH; col++) { + if (block < number_of_blocks + offset) { + ulong block_col = block / height_in_blocks; + ulong block_row = block % height_in_blocks; + load_chunk_of_a(A, a_block[block & 1], block_row, block_col, width_in_blocks, row, col); + } + if (block > offset) { + send_chunk_of_a/*PY_CODE_GEN i*/(a_block[(block - 1) & 1], row, col); + } + } + } + } 
+} + +/** + * Will add a matrix from external channel and matrix from global memory and store result in global memory. + * + * Will do the following: + * + * ext. ch + B --> A_out + * + * where A_out, ext. ch and B are matrices of size matrixSize*matrixSize + * + * @param B Buffer for matrix B + * @param A_out Output buffer for result matrix + * @param block_offset The first block that will be processed in the provided buffer + * @param number_of_blocks The number of blocks that will be processed starting from the block offset + */ +__attribute__((max_global_work_dim(0))) +__kernel +void transpose_write/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict B, + __global DEVICE_DATA_TYPE *restrict A_out, + const ulong offset, + const ulong width_in_blocks, + const ulong number_of_blocks) { + + #pragma loop_coalesce + for (ulong block = offset; block < number_of_blocks + offset; block++) { + // complete matrix transposition and write the result back to global memory + for (ulong row = 0; row < BLOCK_SIZE; row++) { + for (ulong col = 0; col < BLOCK_SIZE / CHANNEL_WIDTH; col++) { + + ch_data data = read_channel_intel(chan_a_in/*PY_CODE_GEN i*/); + + ulong block_col = block % width_in_blocks; + ulong block_row = block / width_in_blocks; + + // rotate temporary buffer to store data into local buffer +__attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + ulong ls_address = block_row * BLOCK_SIZE * BLOCK_SIZE * width_in_blocks + + block_col * BLOCK_SIZE + + row * BLOCK_SIZE * width_in_blocks + + col * CHANNEL_WIDTH + unroll_count; + A_out[ls_address] = data.data[unroll_count] + B[ls_address]; + } + } + } + } +} + +// PY_CODE_GEN block_end diff --git a/PTRANS/src/device/transpose_PQ_PCIE.cl b/PTRANS/src/device/transpose_PQ_PCIE.cl new file mode 100644 index 00000000..5e8fb034 --- /dev/null +++ b/PTRANS/src/device/transpose_PQ_PCIE.cl @@ -0,0 +1,185 @@ 
+/****************************************************************************** + * Author: Arjun Ramaswami + * + * Edited by Marius Meyer: + * - Adapt to used kernel signature + * - Change to row-column loop structure + *****************************************************************************/ + +#include "parameters.h" + +// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] + +/** + * Read blocks of matrix A and transpose them in memory. + * Write the block into an external channel. + * + * Will do the following: + * + * A -> trans(A) -> ext. ch + * + * @param A Buffer for matrix A + * @param B Buffer for matrix B + * @param A_out Buffer for result matrix + * @param offset Offset in blocks that is used to read the current block of A. Since A is read column-wise + on the block level, the whole matrix A might be written to global memory and the relevant columns + need to be picked using this offset. + * @param number_of_blocks The number of blocks that will be processed starting from the block offset + * @param width_in_blocks The with of matrix A in blocks + * @param height_in_blocks The height of matix A in blocks + */ +__attribute__((max_global_work_dim(0))) +__kernel +void transpose/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, + __global DEVICE_DATA_TYPE *restrict B, + __global DEVICE_DATA_TYPE *restrict A_out, + const uint offset, + const uint number_of_blocks, + const uint width_in_blocks, + const uint height_in_blocks) { + + // local memory double buffer for a matrix block + DEVICE_DATA_TYPE a_block[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH] __attribute__((xcl_array_partition(cyclic, CHANNEL_WIDTH,1))) __attribute__((xcl_array_partition(cyclic, CHANNEL_WIDTH,2))); + // local memory double buffer for a matrix block + DEVICE_DATA_TYPE a_plus_b_block[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH] __attribute__((xcl_array_partition(cyclic, CHANNEL_WIDTH,1))) 
__attribute__((xcl_array_partition(cyclic, CHANNEL_WIDTH,2))); + + // transpose the matrix block-wise from global memory + for (uint block = 0; block < number_of_blocks; block++) { +#ifdef INTEL_FPGA + // Load A to local memory + #pragma loop_coalesce +#endif +#ifdef XILINX_FPGA +#ifdef XILINX_UNROLL_INNER_LOOPS + __attribute__((xcl_pipeline_loop(BLOCK_SIZE / CHANNEL_WIDTH))) +#endif +#endif + for (uint row = 0; row < BLOCK_SIZE; row++) { +#ifdef XILINX_FPGA +#ifndef XILINX_UNROLL_INNER_LOOPS + __attribute__((xcl_pipeline_loop(1))) +#endif +#endif + for (uint col = 0; col < BLOCK_SIZE / CHANNEL_WIDTH; col++) { + ulong block_row_a = (block + offset) / width_in_blocks; + ulong block_col_a = (block + offset) % width_in_blocks; + ulong ls_address_trans = block_col_a * BLOCK_SIZE * BLOCK_SIZE * height_in_blocks + + block_row_a * BLOCK_SIZE + + row * BLOCK_SIZE * height_in_blocks; + + // read in block of A from global memory and store it in a memory efficient manner for transpose + DEVICE_DATA_TYPE rotate_in[CHANNEL_WIDTH]; + + // Blocks of a will be stored columnwise in global memory + __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + rotate_in[unroll_count] = A[ls_address_trans + col * CHANNEL_WIDTH + unroll_count]; + } + + uint chunk = row * (BLOCK_SIZE / CHANNEL_WIDTH) + col; + + unsigned rot = (row) & (CHANNEL_WIDTH - 1); + + // rotate temporary buffer to store data into local buffer + __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + // every block of (N / CHANNEL_WIDTH), rotates the index by 1 + // store in double buffer + a_block[chunk][unroll_count] = rotate_in[(unroll_count + CHANNEL_WIDTH - rot) + & (CHANNEL_WIDTH - 1)]; + } + } + } + + // Read transposed A from local memory and add B +#ifdef INTEL_FPGA + // Load A to local memory + #pragma loop_coalesce +#endif +#ifdef XILINX_FPGA +#ifdef 
XILINX_UNROLL_INNER_LOOPS + __attribute__((xcl_pipeline_loop(BLOCK_SIZE / CHANNEL_WIDTH))) +#endif +#endif + for (uint row = 0; row < BLOCK_SIZE; row++) { +#ifdef XILINX_FPGA +#ifndef XILINX_UNROLL_INNER_LOOPS + __attribute__((xcl_pipeline_loop(1))) +#endif +#endif + for (uint col = 0; col < BLOCK_SIZE / CHANNEL_WIDTH; col++) { + ulong block_row = block / width_in_blocks; + ulong block_col = block % width_in_blocks; + ulong ls_address_row = block_row * BLOCK_SIZE * BLOCK_SIZE * width_in_blocks + + block_col * BLOCK_SIZE + + row * BLOCK_SIZE * width_in_blocks; + uint chunk = row * (BLOCK_SIZE / CHANNEL_WIDTH) + col; + + DEVICE_DATA_TYPE data_chunk[CHANNEL_WIDTH]; + DEVICE_DATA_TYPE rotate_out[CHANNEL_WIDTH]; + + uint base = col * BLOCK_SIZE; + uint offset = row / CHANNEL_WIDTH; + + __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + unsigned rot = ((CHANNEL_WIDTH + unroll_count - row) * (BLOCK_SIZE / CHANNEL_WIDTH)) & + (BLOCK_SIZE - 1); + unsigned row_rotate = base + offset + rot; + rotate_out[unroll_count] = a_block[row_rotate][unroll_count]; + } + + unsigned rot_out = row & (CHANNEL_WIDTH - 1); + + // rotate temporary buffer to store data into local buffer + __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + data_chunk[unroll_count] = rotate_out[(unroll_count + rot_out) & (CHANNEL_WIDTH - 1)]; + } + + // load tranposed A from global memory + __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + data_chunk[unroll_count] += B[ls_address_row + col * CHANNEL_WIDTH + unroll_count]; + } + + __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + a_plus_b_block[chunk][unroll_count] = data_chunk[unroll_count]; + } + } + } + // Write back result 
+#ifdef INTEL_FPGA + // Load A to local memory + #pragma loop_coalesce +#endif +#ifdef XILINX_FPGA +#ifdef XILINX_UNROLL_INNER_LOOPS + __attribute__((xcl_pipeline_loop(BLOCK_SIZE / CHANNEL_WIDTH))) +#endif +#endif + for (uint row = 0; row < BLOCK_SIZE; row++) { +#ifdef XILINX_FPGA +#ifndef XILINX_UNROLL_INNER_LOOPS + __attribute__((xcl_pipeline_loop(1))) +#endif +#endif + for (uint col = 0; col < BLOCK_SIZE / CHANNEL_WIDTH; col++) { + ulong block_row = block / width_in_blocks; + ulong block_col = block % width_in_blocks; + ulong ls_address_row = block_row * BLOCK_SIZE * BLOCK_SIZE * width_in_blocks + + block_col * BLOCK_SIZE + + row * BLOCK_SIZE * width_in_blocks; + uint chunk = row * (BLOCK_SIZE / CHANNEL_WIDTH) + col; + + __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + A_out[ls_address_row + col * CHANNEL_WIDTH + unroll_count] = a_plus_b_block[chunk][unroll_count]; + } + } + } + } +} + +// PY_CODE_GEN block_end diff --git a/PTRANS/src/device/transpose_diagonal_c2.cl b/PTRANS/src/device/transpose_c2_DIAG_IEC.cl similarity index 100% rename from PTRANS/src/device/transpose_diagonal_c2.cl rename to PTRANS/src/device/transpose_c2_DIAG_IEC.cl diff --git a/PTRANS/src/host/CMakeLists.txt b/PTRANS/src/host/CMakeLists.txt index 9d8f2c99..89b45ff8 100755 --- a/PTRANS/src/host/CMakeLists.txt +++ b/PTRANS/src/host/CMakeLists.txt @@ -1,10 +1,14 @@ add_subdirectory(../../../shared ${CMAKE_BINARY_DIR}/lib/hpccbase) -set(HOST_SOURCE execution_default.cpp transpose_benchmark.cpp transpose_handlers.cpp transpose_data.cpp) +set(HOST_SOURCE transpose_benchmark.cpp transpose_data.cpp) set(HOST_EXE_NAME Transpose) set(LIB_NAME trans) +set(BLA_VENDOR Intel10_64lp) +set(BLA_STATIC ON) +find_package(BLAS) + if (INTELFPGAOPENCL_FOUND) add_library(${LIB_NAME}_intel STATIC ${HOST_SOURCE}) target_include_directories(${LIB_NAME}_intel PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common 
${IntelFPGAOpenCL_INCLUDE_DIRS}) @@ -16,6 +20,12 @@ if (INTELFPGAOPENCL_FOUND) if (USE_SVM) target_compile_definitions(${LIB_NAME}_intel PRIVATE -DCL_VERSION_2_0) endif() + if (BLAS_FOUND) + target_link_libraries(${LIB_NAME}_intel ${BLAS_LIBRARIES}) + target_compile_definitions(${LIB_NAME}_intel PRIVATE -DMKL_FOUND) + target_compile_options(${LIB_NAME}_intel PRIVATE "${BLAS_LINKER_FLAGS}") + target_include_directories(${LIB_NAME}_intel PRIVATE "$ENV{MKL_ROOT}/include") + endif() target_compile_definitions(${LIB_NAME}_intel PRIVATE -DINTEL_FPGA) target_compile_options(${LIB_NAME}_intel PRIVATE "${OpenMP_CXX_FLAGS}") add_test(NAME test_intel_host_executable COMMAND $ -h) diff --git a/PTRANS/src/host/data_handlers/data_handler_types.h b/PTRANS/src/host/data_handlers/data_handler_types.h new file mode 100644 index 00000000..e15bb7b6 --- /dev/null +++ b/PTRANS/src/host/data_handlers/data_handler_types.h @@ -0,0 +1,85 @@ +/* +Copyright (c) 2021 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_DATA_HANDLER_TPYES_H_ +#define SRC_HOST_DATA_HANDLER_TPYES_H_ + +/* C++ standard library headers */ +#include + +namespace transpose { +namespace data_handler { + +/** + * @brief This enumeration contains all available data handler types. + * + */ +typedef enum _DataHandlerType { + + /** + * @brief The matrix is already blockwise diagonally distributed which only required data exchange with a single node + * + */ + diagonal, + + /** + * @brief Classical distribution of the matrix in a PQ grid + * + */ + pq, + + /** + * @brief Automatically detect distribution scheme from kernel file name + * + */ + automatic + + + +} DataHandlerType; + +static const std::map comm_to_str_map{ + {"DIAG", DataHandlerType::diagonal}, + {"PQ", DataHandlerType::pq}, + {"AUTO", DataHandlerType::automatic} + }; + +static std::string handlerToString(DataHandlerType c) { + for (auto& entry : comm_to_str_map) { + if (entry.second == c) { + return entry.first; + } + } + throw new std::runtime_error("Communication type could not be converted to string!"); +} + +static DataHandlerType stringToHandler(std::string comm_name) { + auto result = comm_to_str_map.find(comm_name); + if (result != comm_to_str_map.end()) { + return result->second; + } + throw new std::runtime_error("Communication type could not be converted from string: " + comm_name); +} + +} +} + +#endif \ No newline at end of file diff --git a/PTRANS/src/host/data_handlers/diagonal.hpp b/PTRANS/src/host/data_handlers/diagonal.hpp new file mode 100644 index 00000000..e1d72f3b --- /dev/null +++ b/PTRANS/src/host/data_handlers/diagonal.hpp @@ -0,0 +1,210 @@ +/* +Copyright (c) 2020 Marius Meyer + +Permission is hereby granted, free of charge, to any person 
obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#ifndef SRC_HOST_TRANSPOSE_HANDLER_DIAGONAL_HPP_ +#define SRC_HOST_TRANSPOSE_HANDLER_DIAGONAL_HPP_ + +/* C++ standard library headers */ +#include +#include + +/* Project's headers */ +#include "handler.hpp" + +/** + * @brief Contains all classes and methods needed by the Transpose benchmark + * + */ +namespace transpose { + + namespace data_handler { + +/** + * @brief Transposes the data over external channels, so every part of a pair is located on a different FPGA. + * Data will be distributed to the ranks such that only a fixed pair of ranks will communicate to exchange + * the missing data. e.g. for N ranks, the pairs will be (0, N/2), (1, N/2 + 1), ... 
+ * + */ +class DistributedDiagonalTransposeDataHandler : public TransposeDataHandler { + +private: + + /** + * @brief Number of diagonal ranks that will sent the blcoks to themselves + * + */ + int num_diagonal_ranks; + + /** + * @brief MPI data for matrix blocks + * + */ + MPI_Datatype data_block; + +public: + + /** + * @brief Generate data for transposition based on the implemented distribution scheme + * + * @param settings The execution settings that contain information about the data size + * @return std::unique_ptr The generated data + */ + std::unique_ptr + generateData(hpcc_base::ExecutionSettings& settings) override { + MPI_Type_contiguous(settings.programSettings->blockSize * settings.programSettings->blockSize, MPI_FLOAT, &data_block); + MPI_Type_commit(&data_block); + + int width_in_blocks = settings.programSettings->matrixSize / settings.programSettings->blockSize; + + int avg_blocks_per_rank = (width_in_blocks * width_in_blocks) / mpi_comm_size; + int avg_diagonal_blocks = width_in_blocks; + if (avg_blocks_per_rank > 0) { + avg_diagonal_blocks = (width_in_blocks / avg_blocks_per_rank); + } + num_diagonal_ranks = std::max(avg_diagonal_blocks, 1); + + if (num_diagonal_ranks % 2 != mpi_comm_size % 2) { + #ifndef NDEBUG + std::cout << "Rank " << mpi_comm_rank << ": Fail 1!" << std::endl; + #endif + // Abort if there is a too high difference in the number of matrix blocks between the MPI ranks + throw std::runtime_error("Matrix size and MPI ranks to not allow fair distribution of blocks! Increase or reduce the number of MPI ranks by 1."); + } + if ((mpi_comm_size - num_diagonal_ranks) % 2 != 0 || (mpi_comm_size - num_diagonal_ranks) == 0 && width_in_blocks > 1) { + #ifndef NDEBUG + std::cout << "Rank " << mpi_comm_rank << ": Fail 2!" << std::endl; + #endif + throw std::runtime_error("Not possible to create pairs of MPI ranks for lower and upper half of matrix. 
Increase number of MPI ranks!."); + } + bool this_rank_is_diagonal = mpi_comm_rank >= (mpi_comm_size - num_diagonal_ranks); + int blocks_if_diagonal = width_in_blocks / num_diagonal_ranks + ( (mpi_comm_rank - (mpi_comm_size - num_diagonal_ranks)) < (width_in_blocks % num_diagonal_ranks) ? 1 : 0); + int blocks_if_not_diagonal = 0; + if ((mpi_comm_size - num_diagonal_ranks) > 0 ) { + blocks_if_not_diagonal = (width_in_blocks * (width_in_blocks - 1)) / (mpi_comm_size - num_diagonal_ranks) + (mpi_comm_rank < ((width_in_blocks * (width_in_blocks - 1)) % (mpi_comm_size - num_diagonal_ranks)) ? 1 : 0); + } + + + int blocks_per_rank = (this_rank_is_diagonal) ? blocks_if_diagonal : blocks_if_not_diagonal; + + if (mpi_comm_rank == 0) { + std::cout << "Diag. blocks per rank: " << blocks_if_diagonal << std::endl; + std::cout << "Blocks per rank: " << blocks_if_not_diagonal << std::endl; + std::cout << "Loopback ranks for diagonal blocks: " << num_diagonal_ranks << std::endl; + } + // Height of a matrix generated for a single memory bank on a single MPI rank + int data_height_per_rank = blocks_per_rank * settings.programSettings->blockSize; + + #ifndef NDEBUG + std::cout << "Rank " << mpi_comm_rank << ": NumBlocks = " << blocks_per_rank << std::endl; + #endif + + // Allocate memory for a single device and all its memory banks + auto d = std::unique_ptr(new transpose::TransposeData(*settings.context, settings.programSettings->blockSize, blocks_per_rank)); + + // Fill the allocated memory with pseudo random values + std::mt19937 gen(mpi_comm_rank); + std::uniform_real_distribution<> dis(-100.0, 100.0); + for (size_t i = 0; i < data_height_per_rank; i++) { + for (size_t j = 0; j < settings.programSettings->blockSize; j++) { + d->A[i * settings.programSettings->blockSize + j] = dis(gen); + d->B[i * settings.programSettings->blockSize + j] = dis(gen); + d->result[i * settings.programSettings->blockSize + j] = 0.0; + } + } + + return d; + } + + /** + * @brief Exchange the data blocks 
for verification + * + * @param data The data that was generated locally and will be exchanged with other MPI ranks + * Exchanged data will be stored in the same object. + */ + void + exchangeData(TransposeData& data) override { + + #ifndef NDEBUG + // std::cout << "Start data exchange " << mpi_comm_rank << std::endl; + #endif + // Only need to exchange data, if rank has a partner + if (mpi_comm_rank < mpi_comm_size - num_diagonal_ranks) { + + int first_upper_half_rank = (mpi_comm_size - num_diagonal_ranks)/2; + int pair_rank = (mpi_comm_rank >= first_upper_half_rank) ? mpi_comm_rank - first_upper_half_rank : mpi_comm_rank + first_upper_half_rank; + + // To re-calculate the matrix transposition locally on this host, we need to + // exchange matrix A for every kernel replication + // The order of the matrix blocks does not change during the exchange, because they are distributed diagonally + // and will be handled in the order below: + // + // . . 1 3 + // . . . 2 + // 1 . . . + // 3 2 . . + MPI_Status status; + + size_t remaining_data_size = data.numBlocks; + size_t offset = 0; + while (remaining_data_size > 0) { + int next_chunk = (remaining_data_size > std::numeric_limits::max()) ? 
std::numeric_limits::max(): remaining_data_size; + MPI_Sendrecv(&data.A[offset], next_chunk, data_block, pair_rank, 0, &data.exchange[offset], next_chunk, data_block, pair_rank, 0, MPI_COMM_WORLD, &status); + + remaining_data_size -= next_chunk; + offset += static_cast(next_chunk) * static_cast(data.blockSize * data.blockSize); + } + + // Exchange window pointers + HOST_DATA_TYPE* tmp = data.exchange; + data.exchange = data.A; + data.A = tmp; + } + #ifndef NDEBUG + // std::cout << "End data exchange " << mpi_comm_rank << std::endl; + #endif + } + + void + reference_transpose(TransposeData& data) { + size_t block_offset = data.blockSize * data.blockSize; + for (size_t b = 0; b < data.numBlocks; b++) { + for (size_t i = 0; i < data.blockSize; i++) { + for (size_t j = 0; j < data.blockSize; j++) { + data.A[b * block_offset + j * data.blockSize + i] -= (data.result[b * block_offset + i * data.blockSize + j] + - data.B[b * block_offset + i * data.blockSize + j]); + } + } + } + } + + DistributedDiagonalTransposeDataHandler(int mpi_rank, int mpi_size): TransposeDataHandler(mpi_rank, mpi_size) { + if (mpi_rank >= mpi_size) { + throw std::runtime_error("MPI rank must be smaller the MPI world size!"); + } + } + +}; + +} +} + +#endif diff --git a/PTRANS/src/host/transpose_handlers.hpp b/PTRANS/src/host/data_handlers/handler.hpp similarity index 58% rename from PTRANS/src/host/transpose_handlers.hpp rename to PTRANS/src/host/data_handlers/handler.hpp index c30a7acd..fe1293fe 100644 --- a/PTRANS/src/host/transpose_handlers.hpp +++ b/PTRANS/src/host/data_handlers/handler.hpp @@ -27,20 +27,14 @@ SOFTWARE. 
#include /* Project's headers */ -#include "transpose_data.hpp" - -/** - * @brief String that identifies the transpose::DistributedExternalTransposeDataHandler - * - */ -#define TRANSPOSE_HANDLERS_DIST_DIAG "distdiag" +#include "../transpose_data.hpp" /** * @brief Contains all classes and methods needed by the Transpose benchmark * */ namespace transpose { - +namespace data_handler { /** * @brief The parallel matrix transposition is designed to support different kinds of data distribution. * This abstract class provides the necessary methods that need to be implemented for every data distribution scheme. @@ -85,6 +79,9 @@ class TransposeDataHandler { virtual void exchangeData(TransposeData& data) = 0; + virtual void + reference_transpose(TransposeData& data) = 0; + /** * @brief Construct a new Transpose Data Handler object and initialize the MPI rank and MPI size variables if MPI is used * @@ -93,69 +90,7 @@ class TransposeDataHandler { }; -#ifdef _USE_MPI_ - -/** - * @brief Transposes the data over external channels, so every part of a pair is located on a different FPGA. - * Data will be distributed to the ranks such that only a fixed pair of ranks will communicate to exchange - * the missing data. e.g. for N ranks, the pairs will be (0, N/2), (1, N/2 + 1), ... 
- * - */ -class DistributedDiagonalTransposeDataHandler : public transpose::TransposeDataHandler { - -private: - - /** - * @brief Number of diagonal ranks that will sent the blcoks to themselves - * - */ - int num_diagonal_ranks; - -public: - - /** - * @brief Generate data for transposition based on the implemented distribution scheme - * - * @param settings The execution settings that contain information about the data size - * @return std::unique_ptr The generated data - */ - std::unique_ptr - generateData(hpcc_base::ExecutionSettings& settings) override; - - /** - * @brief Exchange the data blocks for verification - * - * @param data The data that was generated locally and will be exchanged with other MPI ranks - * Exchanged data will be stored in the same object. - */ - void - exchangeData(TransposeData& data) override; - - DistributedDiagonalTransposeDataHandler(int mpi_rank, int mpi_size); - -}; - -#endif - -/** - * @brief Generate a data handler object - * - * @tparam T The class of the data handler object - * @return std::unique_ptr a unique poiinter to the generated data handler object - */ -template -std::unique_ptr -generateDataHandler(int rank, int size) { - return std::unique_ptr(new T(rank, size)); } - -/** - * @brief A map that contains the mapping from plain strings to the data handler object that should be used in the program - * - */ -extern std::map (*)(int rank, int size)> dataHandlerIdentifierMap; - - } #endif diff --git a/PTRANS/src/host/data_handlers/pq.hpp b/PTRANS/src/host/data_handlers/pq.hpp new file mode 100644 index 00000000..369cab31 --- /dev/null +++ b/PTRANS/src/host/data_handlers/pq.hpp @@ -0,0 +1,161 @@ +/* +Copyright (c) 2020 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, 
sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#ifndef SRC_HOST_TRANSPOSE_HANDLERS_PQ_HPP_ +#define SRC_HOST_TRANSPOSE_HANDLERS_PQ_HPP_ + +/* C++ standard library headers */ +#include + +/* Project's headers */ +#include "handler.hpp" + +/** + * @brief Contains all classes and methods needed by the Transpose benchmark + * + */ +namespace transpose { +namespace data_handler { + +class DistributedPQTransposeDataHandler : public TransposeDataHandler { + +private: + + /** + * @brief Number of diagonal ranks that will sent the blcoks to themselves + * + */ + int width_per_rank; + + int pq_row; + + int pq_col; + + int pq_width; + + MPI_Datatype data_block; + +public: + + /** + * @brief Generate data for transposition based on the implemented distribution scheme + * + * @param settings The execution settings that contain information about the data size + * @return std::unique_ptr The generated data + */ + std::unique_ptr + generateData(hpcc_base::ExecutionSettings& settings) override { + int width_in_blocks = settings.programSettings->matrixSize / settings.programSettings->blockSize; + + // A data block is strided! 
+ MPI_Type_contiguous(settings.programSettings->blockSize * settings.programSettings->blockSize, MPI_FLOAT, &data_block); + MPI_Type_commit(&data_block); + + width_per_rank = width_in_blocks / pq_width; + pq_row = mpi_comm_rank / pq_width; + pq_col = mpi_comm_rank % pq_width; + + int blocks_per_rank = width_per_rank * width_per_rank; + + // Allocate memory for a single device and all its memory banks + auto d = std::unique_ptr(new transpose::TransposeData(*settings.context, settings.programSettings->blockSize, blocks_per_rank)); + + // Fill the allocated memory with pseudo random values + std::mt19937 gen(mpi_comm_rank); + std::uniform_real_distribution<> dis(-100.0, 100.0); + for (size_t i = 0; i < blocks_per_rank * settings.programSettings->blockSize; i++) { + for (size_t j = 0; j < settings.programSettings->blockSize; j++) { + d->A[i * settings.programSettings->blockSize + j] = dis(gen); + d->B[i * settings.programSettings->blockSize + j] = dis(gen); + d->result[i * settings.programSettings->blockSize + j] = 0.0; + } + } + + return d; + } + + /** + * @brief Exchange the data blocks for verification + * + * @param data The data that was generated locally and will be exchanged with other MPI ranks + * Exchanged data will be stored in the same object. + */ + void + exchangeData(TransposeData& data) override { + + if (pq_col != pq_row) { + + int pair_rank = pq_width * pq_col + pq_row; + + // To re-calculate the matrix transposition locally on this host, we need to + // exchange matrix A for every kernel replication + // The order of the matrix blocks does not change during the exchange, because they are distributed diagonally + // and will be handled in the order below: + // + // . . 1 3 + // . . . 2 + // 1 . . . + // 3 2 . . + MPI_Status status; + + size_t remaining_data_size = data.numBlocks; + size_t offset = 0; + while (remaining_data_size > 0) { + int next_chunk = (remaining_data_size > std::numeric_limits::max()) ? 
std::numeric_limits::max(): remaining_data_size; + MPI_Sendrecv(&data.A[offset], next_chunk, data_block, pair_rank, 0, &data.exchange[offset], next_chunk, data_block, pair_rank, 0, MPI_COMM_WORLD, &status); + + remaining_data_size -= next_chunk; + offset += static_cast(next_chunk) * static_cast(data.blockSize * data.blockSize); + } + + // Exchange window pointers + HOST_DATA_TYPE* tmp = data.exchange; + data.exchange = data.A; + data.A = tmp; + } + + } + + void + reference_transpose(TransposeData& data) { + for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { + for (size_t j = 0; j < width_per_rank * data.blockSize; j++) { + data.A[j * width_per_rank * data.blockSize + i] -= (data.result[i * width_per_rank * data.blockSize + j] - data.B[i * width_per_rank * data.blockSize + j]); + } + } + } + + DistributedPQTransposeDataHandler(int mpi_rank, int mpi_size) : TransposeDataHandler(mpi_rank, mpi_size) { + int sqrt_size = std::sqrt(mpi_size); + if (sqrt_size * sqrt_size != mpi_size) { + throw std::runtime_error("Number of MPI ranks must have an integer as square root since P = Q has to hold!"); + } + pq_width = std::sqrt(mpi_size); + } + +}; + + +} +} + +#endif diff --git a/PTRANS/src/host/execution_types/execution_cpu.hpp b/PTRANS/src/host/execution_types/execution_cpu.hpp new file mode 100644 index 00000000..f7e369a1 --- /dev/null +++ b/PTRANS/src/host/execution_types/execution_cpu.hpp @@ -0,0 +1,128 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all 
+copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_CPU_EXECUTION_H_ +#define SRC_HOST_CPU_EXECUTION_H_ + +#ifdef MKL_FOUND + +/* C++ standard library headers */ +#include +#include +#include + +/* External library headers */ +#include "mpi.h" +#include "mkl_trans.h" + +/* Project's headers */ +#include "data_handlers/handler.hpp" + +namespace transpose +{ + namespace fpga_execution + { + namespace cpu + { + + /** + * @brief Transpose and add the matrices using MKL routines + * + * @param config The progrma configuration + * @param data data object that contains all required data for the execution + * @return std::unique_ptr The measured execution times + */ + static std::unique_ptr + calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler) + { + int err; + + std::vector transferTimings; + std::vector calculationTimings; + + if (data.blockSize != BLOCK_SIZE) { + throw std::runtime_error("Block size for CPU hardcoded to " + std::to_string(BLOCK_SIZE) + ". 
Recompile to use different block sizes!"); + } + + ulong local_matrix_width = std::sqrt(data.numBlocks); + + for (int repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) + { + + std::chrono::duration transferTime(0); + + MPI_Barrier(MPI_COMM_WORLD); + + auto startCalculation = std::chrono::high_resolution_clock::now(); + + // Exchange A data via PCIe and MPI + handler.exchangeData(data); + + + switch (config.programSettings->dataHandlerIdentifier) { + case transpose::data_handler::DataHandlerType::diagonal: + #pragma omp parallel for + for (ulong offset=0; offset < data.numBlocks * BLOCK_SIZE * BLOCK_SIZE; offset += BLOCK_SIZE * BLOCK_SIZE) { + mkl_somatadd('R', 'T', 'N', BLOCK_SIZE, BLOCK_SIZE, 1.0, &data.A[offset], BLOCK_SIZE, 1.0, &data.B[offset], BLOCK_SIZE, &data.result[offset], BLOCK_SIZE); + } + break; + case transpose::data_handler::DataHandlerType::pq: + #pragma omp parallel for + for (ulong yoffset=0; yoffset < BLOCK_SIZE * local_matrix_width; yoffset += BLOCK_SIZE) { + for (ulong xoffset=0; xoffset < BLOCK_SIZE * local_matrix_width; xoffset += BLOCK_SIZE) { + ulong toffset = xoffset * BLOCK_SIZE * local_matrix_width + yoffset; + ulong offset = yoffset * BLOCK_SIZE * local_matrix_width + xoffset; + mkl_somatadd('R', 'T', 'N', BLOCK_SIZE, BLOCK_SIZE, 1.0, &data.A[toffset], BLOCK_SIZE * local_matrix_width, 1.0, &data.B[offset], BLOCK_SIZE * local_matrix_width, &data.result[offset], BLOCK_SIZE * local_matrix_width); + } + } + break; + default: throw std::runtime_error("Given data handler is not supported by CPU implementation: " + transpose::data_handler::handlerToString(config.programSettings->dataHandlerIdentifier)); + } + + + auto endCalculation = std::chrono::high_resolution_clock::now(); +#ifndef NDEBUG + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " + << "Done i=" << repetition << std::endl; +#endif + std::chrono::duration calculationTime = + 
std::chrono::duration_cast>(endCalculation - startCalculation); + calculationTimings.push_back(calculationTime.count()); + + // Transfer back data for next repetition! + handler.exchangeData(data); + + transferTimings.push_back(transferTime.count()); + } + + std::unique_ptr result(new transpose::TransposeExecutionTimings{ + transferTimings, + calculationTimings}); + return result; + } + + } // namespace bm_execution + } +} +#endif // MKL_FOUND +#endif // SRC_HOST_CPU_EXECUTION_H_ diff --git a/PTRANS/src/host/execution_default.cpp b/PTRANS/src/host/execution_types/execution_intel.hpp similarity index 84% rename from PTRANS/src/host/execution_default.cpp rename to PTRANS/src/host/execution_types/execution_intel.hpp index 232362c1..4b278db9 100644 --- a/PTRANS/src/host/execution_default.cpp +++ b/PTRANS/src/host/execution_types/execution_intel.hpp @@ -19,30 +19,37 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ - -/* Related header files */ -#include "execution.h" +#ifndef SRC_HOST_INTEL_EXECUTION_H_ +#define SRC_HOST_INTEL_EXECUTION_H_ /* C++ standard library headers */ #include #include #include -/* External library headers */ -#include "CL/cl.hpp" - /* Project's headers */ +#include "transpose_benchmark.hpp" +#include "data_handlers/data_handler_types.h" -namespace bm_execution { +namespace transpose { +namespace fpga_execution { +namespace intel { - /* - Implementation for the single kernel. 
- @copydoc bm_execution::calculate() - */ - std::unique_ptr + /** + * @brief Transpose and add the matrices using the OpenCL kernel using a diagonal distribution and Intel external channels for communication + * + * @param config The progrma configuration + * @param data data object that contains all required data for the execution on the FPGA + * @return std::unique_ptr The measured execution times + */ +static std::unique_ptr calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data) { int err; + if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::diagonal) { + throw std::runtime_error("Used data handler not supported by execution handler!"); + } + std::vector bufferSizeList; std::vector bufferListA; std::vector bufferListB; @@ -78,12 +85,12 @@ namespace bm_execution { // Define the memory bank the buffers will be placed in if (config.programSettings->distributeBuffers) { memory_bank_info_a = ((((r * 3) % 7) + 1) << 16); - memory_bank_info_a = ((((r * 3 + 1) % 7) + 1) << 16); + memory_bank_info_b = ((((r * 3 + 1) % 7) + 1) << 16); memory_bank_info_out = ((((r * 3 + 2) % 7) + 1) << 16); } else { memory_bank_info_a = ((r + 1) << 16); - memory_bank_info_a = ((r + 1) << 16); + memory_bank_info_b = ((r + 1) << 16); memory_bank_info_out = ((r + 1) << 16); } } @@ -122,10 +129,18 @@ namespace bm_execution { #endif // TODO If SVM, the start index might be different because all replcations // access the same buffer! 
- err = transposeWriteKernel.setArg(2, static_cast(0)); - ASSERT_CL(err) - err = transposeReadKernel.setArg(1, static_cast(0)); - ASSERT_CL(err) + if (config.programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::pq) { + err = transposeWriteKernel.setArg(2, static_cast(std::sqrt(data.numBlocks))); + ASSERT_CL(err) + err = transposeReadKernel.setArg(1, static_cast(std::sqrt(data.numBlocks))); + ASSERT_CL(err) + } + else { + err = transposeWriteKernel.setArg(2, static_cast(0)); + ASSERT_CL(err) + err = transposeReadKernel.setArg(1, static_cast(0)); + ASSERT_CL(err) + } err = transposeWriteKernel.setArg(3, static_cast(blocks_per_replication)); ASSERT_CL(err) err = transposeReadKernel.setArg(2, static_cast(blocks_per_replication)); @@ -190,8 +205,8 @@ namespace bm_execution { auto startCalculation = std::chrono::high_resolution_clock::now(); for (int r = 0; r < transposeReadKernelList.size(); r++) { - writeCommandQueueList[r].enqueueTask(transposeWriteKernelList[r]); - readCommandQueueList[r].enqueueTask(transposeReadKernelList[r]); + writeCommandQueueList[r].enqueueNDRangeKernel(transposeWriteKernelList[r], cl::NullRange, cl::NDRange(1)); + readCommandQueueList[r].enqueueNDRangeKernel(transposeReadKernelList[r], cl::NullRange, cl::NDRange(1)); } for (int r = 0; r < transposeReadKernelList.size(); r++) { writeCommandQueueList[r].finish(); @@ -241,4 +256,8 @@ namespace bm_execution { return result; } -} // namespace bm_execution +} // namespace transpose +} // namespace fpga_execution +} // namespace intel + +#endif \ No newline at end of file diff --git a/PTRANS/src/host/execution_types/execution_intel_pq.hpp b/PTRANS/src/host/execution_types/execution_intel_pq.hpp new file mode 100644 index 00000000..e9249b8e --- /dev/null +++ b/PTRANS/src/host/execution_types/execution_intel_pq.hpp @@ -0,0 +1,288 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and 
associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_INTEL_PQ_EXECUTION_H_ +#define SRC_HOST_INTEL_PQ_EXECUTION_H_ + +/* C++ standard library headers */ +#include +#include +#include + +/* Project's headers */ +#include "transpose_benchmark.hpp" +#include "data_handlers/data_handler_types.h" + +namespace transpose { +namespace fpga_execution { +namespace intel_pq { + + /** + * @brief Transpose and add the matrices using the OpenCL kernel using a PQ distribution and Intel external channels for communication + * + * @param config The progrma configuration + * @param data data object that contains all required data for the execution on the FPGA + * @return std::unique_ptr The measured execution times + */ +static std::unique_ptr + calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data) { + int err; + + if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) { + throw std::runtime_error("Used data handler not supported by execution handler!"); + } + +#ifdef USE_SVM + throw new std::runtime_error("SVM not 
supported in the host implementation of this communication method"); +#endif + + std::vector bufferSizeList; + std::vector bufferStartList; + std::vector bufferOffsetList; + std::vector bufferListA; + std::vector bufferListB; + std::vector bufferListA_out; + std::vector transposeReadKernelList; + std::vector transposeWriteKernelList; + std::vector readCommandQueueList; + std::vector writeCommandQueueList; + + size_t local_matrix_width = std::sqrt(data.numBlocks); + size_t local_matrix_width_bytes = local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); + + size_t total_offset = 0; + + // Setup the kernels depending on the number of kernel replications + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + + // Calculate how many blocks the current kernel replication will need to process. + size_t blocks_per_replication = (local_matrix_width / config.programSettings->kernelReplications * local_matrix_width); + size_t blocks_remainder = local_matrix_width % config.programSettings->kernelReplications; + if (blocks_remainder > r) { + // Catch the case, that the number of blocks is not divisible by the number of kernel replications + blocks_per_replication += local_matrix_width; + } + if (blocks_per_replication < 1) { + continue; + } + + size_t buffer_size = blocks_per_replication * data.blockSize * data.blockSize; + bufferSizeList.push_back(buffer_size); + bufferStartList.push_back(total_offset); + + total_offset += blocks_per_replication; + + int memory_bank_info_a = 0; + int memory_bank_info_b = 0; + int memory_bank_info_out = 0; +#ifdef INTEL_FPGA + if (!config.programSettings->useMemoryInterleaving) { + // Define the memory bank the buffers will be placed in + if (config.programSettings->distributeBuffers) { + memory_bank_info_a = ((((r * 3) % 7) + 1) << 16); + memory_bank_info_b = ((((r * 3 + 1) % 7) + 1) << 16); + memory_bank_info_out = ((((r * 3 + 2) % 7) + 1) << 16); + } + else { + memory_bank_info_a = ((r + 1) << 16); + 
memory_bank_info_b = ((r + 1) << 16); + memory_bank_info_out = ((r + 1) << 16); + } + } +#endif +#ifdef USE_BUFFER_WRITE_RECT_FOR_A + cl::Buffer bufferA(*config.context, CL_MEM_READ_ONLY | memory_bank_info_a, + buffer_size * sizeof(HOST_DATA_TYPE)); +#else + cl::Buffer bufferA(*config.context, CL_MEM_READ_ONLY | memory_bank_info_a, + data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE)); +#endif + cl::Buffer bufferB(*config.context, CL_MEM_READ_ONLY | memory_bank_info_b, + buffer_size * sizeof(HOST_DATA_TYPE)); + cl::Buffer bufferA_out(*config.context, CL_MEM_WRITE_ONLY | memory_bank_info_out, + buffer_size * sizeof(HOST_DATA_TYPE)); + + // TODO the kernel name may need to be changed for Xilinx support + cl::Kernel transposeReadKernel(*config.program, (READ_KERNEL_NAME + std::to_string(r)).c_str(), &err); + ASSERT_CL(err) + cl::Kernel transposeWriteKernel(*config.program, (WRITE_KERNEL_NAME + std::to_string(r)).c_str(), &err); + ASSERT_CL(err) + + err = transposeReadKernel.setArg(0, bufferA); + ASSERT_CL(err) + err = transposeWriteKernel.setArg(0, bufferB); + ASSERT_CL(err) + err = transposeWriteKernel.setArg(1, bufferA_out); + ASSERT_CL(err) + + // Row offset in blocks + err = transposeWriteKernel.setArg(2, static_cast(0)); + ASSERT_CL(err) + + // Width of the whole local matrix in blocks + err = transposeWriteKernel.setArg(3, static_cast(local_matrix_width)); + ASSERT_CL(err) +#ifndef USE_BUFFER_WRITE_RECT_FOR_A + // Row offset in blocks + err = transposeReadKernel.setArg(1, static_cast(bufferStartList[r])); + ASSERT_CL(err) + err = transposeReadKernel.setArg(2, static_cast(local_matrix_width)); + ASSERT_CL(err) +#else + // Row offset in blocks + err = transposeReadKernel.setArg(1, static_cast(0)); + ASSERT_CL(err) + err = transposeReadKernel.setArg(2, static_cast((bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize))); + ASSERT_CL(err) +#endif + + // Height of the whole local matrix in blocks + err = 
transposeReadKernel.setArg(3, static_cast(local_matrix_width )); + ASSERT_CL(err) + + // total number of blocks that are processed in this replication + err = transposeWriteKernel.setArg(4, static_cast(blocks_per_replication)); + ASSERT_CL(err) + err = transposeReadKernel.setArg(4, static_cast(blocks_per_replication)); + ASSERT_CL(err) + + cl::CommandQueue readQueue(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + cl::CommandQueue writeQueue(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + + readCommandQueueList.push_back(readQueue); + writeCommandQueueList.push_back(writeQueue); + bufferListA.push_back(bufferA); + bufferListB.push_back(bufferB); + bufferListA_out.push_back(bufferA_out); + transposeReadKernelList.push_back(transposeReadKernel); + transposeWriteKernelList.push_back(transposeWriteKernel); + } + + std::vector transferTimings; + std::vector calculationTimings; + + for (int repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) { + + auto startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeReadKernelList.size(); r++) { + writeCommandQueueList[r].enqueueWriteBuffer(bufferListB[r], CL_FALSE, 0, + bufferSizeList[r]* sizeof(HOST_DATA_TYPE), &data.B[bufferStartList[r] * data.blockSize * data.blockSize]); +#ifdef USE_BUFFER_WRITE_RECT_FOR_A +#ifndef USE_DEPRECATED_HPP_HEADER + cl::array deviceOffset; + cl::array hostOffset; + cl::array rectShape; +#else + cl::size_t<3> deviceOffset; + cl::size_t<3> hostOffset; + cl::size_t<3> rectShape; +#endif + deviceOffset[0] = 0; + deviceOffset[1] = 0; + deviceOffset[2] = 0; + hostOffset[0] = (bufferStartList[r]) / local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); + hostOffset[1] = 0; + hostOffset[2] = 0; + rectShape[0] = (bufferSizeList[r]) / (local_matrix_width * data.blockSize) * sizeof(HOST_DATA_TYPE); + rectShape[1] = local_matrix_width* data.blockSize; + rectShape[2] = 1L; + 
readCommandQueueList[r].enqueueWriteBufferRect(bufferListA[r],CL_FALSE, + deviceOffset, + hostOffset, + rectShape, + (bufferSizeList[r]) / (local_matrix_width * data.blockSize) * sizeof(HOST_DATA_TYPE), 0, + local_matrix_width* data.blockSize*sizeof(HOST_DATA_TYPE), 0, + data.A); +#else + readCommandQueueList[r].enqueueWriteBuffer(bufferListA[r], CL_FALSE, 0, + data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE), data.A); +#endif + + } + for (int r = 0; r < transposeReadKernelList.size(); r++) { + readCommandQueueList[r].finish(); + writeCommandQueueList[r].finish(); + } + auto endTransfer = std::chrono::high_resolution_clock::now(); + std::chrono::duration transferTime = + std::chrono::duration_cast> + (endTransfer - startTransfer); + + MPI_Barrier(MPI_COMM_WORLD); + + auto startCalculation = std::chrono::high_resolution_clock::now(); + for (int r = 0; r < transposeReadKernelList.size(); r++) { + writeCommandQueueList[r].enqueueNDRangeKernel(transposeWriteKernelList[r], cl::NullRange, cl::NDRange(1)); + readCommandQueueList[r].enqueueNDRangeKernel(transposeReadKernelList[r], cl::NullRange, cl::NDRange(1)); + } + for (int r = 0; r < transposeReadKernelList.size(); r++) { + writeCommandQueueList[r].finish(); +#ifndef NDEBUG + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " << "Write done r=" << r << ", i=" << repetition << std::endl; +#endif + readCommandQueueList[r].finish(); +#ifndef NDEBUG + mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " << "Read done r=" << r << ", i=" << repetition << std::endl; +#endif + } + auto endCalculation = std::chrono::high_resolution_clock::now(); +#ifndef NDEBUG + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " << "Done i=" << repetition << std::endl; +#endif + std::chrono::duration calculationTime = + std::chrono::duration_cast> + (endCalculation - 
startCalculation); + calculationTimings.push_back(calculationTime.count()); + + startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeReadKernelList.size(); r++) { + writeCommandQueueList[r].enqueueReadBuffer(bufferListA_out[r], CL_TRUE, 0, + bufferSizeList[r]* sizeof(HOST_DATA_TYPE), &data.result[bufferStartList[r] * data.blockSize * data.blockSize]); + } + endTransfer = std::chrono::high_resolution_clock::now(); + transferTime += + std::chrono::duration_cast> + (endTransfer - startTransfer); + transferTimings.push_back(transferTime.count()); + } + + std::unique_ptr result(new transpose::TransposeExecutionTimings{ + transferTimings, + calculationTimings + }); + return result; + } + +} // namespace transpose +} // namespace fpga_execution +} // namespace intel + +#endif \ No newline at end of file diff --git a/PTRANS/src/host/execution_types/execution_pcie.hpp b/PTRANS/src/host/execution_types/execution_pcie.hpp new file mode 100644 index 00000000..5e29ad2e --- /dev/null +++ b/PTRANS/src/host/execution_types/execution_pcie.hpp @@ -0,0 +1,240 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_PCIE_EXECUTION_H_ +#define SRC_HOST_PCIE_EXECUTION_H_ + +/* C++ standard library headers */ +#include +#include +#include + +/* External library headers */ +#include "mpi.h" + +/* Project's headers */ +#include "data_handlers/handler.hpp" + +namespace transpose +{ + namespace fpga_execution + { + namespace pcie + { + + /** + * @brief Transpose and add the matrices using the OpenCL kernel using a diagonal distribution and PCIe+MPI over the host for communication + * + * @param config The progrma configuration + * @param data data object that contains all required data for the execution on the FPGA + * @param handler data handler instance that should be used to exchange data between hosts + * @return std::unique_ptr The measured execution times + */ + static std::unique_ptr + calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler) + { + int err; + +#ifdef USE_SVM + throw new std::runtime_error("SVM not supported in the host implementation of this communication method"); +#endif + if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::diagonal) { + throw std::runtime_error("Used data handler not supported by execution handler!"); + } + + std::vector bufferSizeList; + std::vector bufferListA; + std::vector bufferListB; + std::vector bufferListA_out; + std::vector transposeKernelList; + std::vector transCommandQueueList; + + size_t local_matrix_width = std::sqrt(data.numBlocks); + + // Setup the kernels depending on the number of kernel replications + for (int r = 0; r < config.programSettings->kernelReplications; r++) + { + + // Calculate how many blocks the current 
kernel replication will need to process. + size_t blocks_per_replication = data.numBlocks / config.programSettings->kernelReplications; + size_t blocks_remainder = data.numBlocks % config.programSettings->kernelReplications; + if (blocks_remainder > r) + { + // Catch the case, that the number of blocks is not divisible by the number of kernel replications + blocks_per_replication += 1; + } + if (blocks_per_replication < 1) + { + continue; + } + + size_t buffer_size = data.blockSize * (data.blockSize * blocks_per_replication); + + bufferSizeList.push_back(buffer_size); + + int memory_bank_info_a = 0; + int memory_bank_info_b = 0; + int memory_bank_info_out = 0; +#ifdef INTEL_FPGA + if (!config.programSettings->useMemoryInterleaving) + { + // Define the memory bank the buffers will be placed in + if (config.programSettings->distributeBuffers) + { + memory_bank_info_a = ((((r * 3) % 7) + 1) << 16); + memory_bank_info_b = ((((r * 3 + 1) % 7) + 1) << 16); + memory_bank_info_out = ((((r * 3 + 2) % 7) + 1) << 16); + } + else + { + memory_bank_info_a = ((r + 1) << 16); + memory_bank_info_b = ((r + 1) << 16); + memory_bank_info_out = ((r + 1) << 16); + } + } +#endif + cl::Buffer bufferA(*config.context, CL_MEM_READ_ONLY | memory_bank_info_a, + buffer_size * sizeof(HOST_DATA_TYPE)); + cl::Buffer bufferB(*config.context, CL_MEM_READ_ONLY | memory_bank_info_b, + buffer_size * sizeof(HOST_DATA_TYPE)); + cl::Buffer bufferA_out(*config.context, CL_MEM_WRITE_ONLY | memory_bank_info_out, + buffer_size * sizeof(HOST_DATA_TYPE)); + + // TODO the kernel name may need to be changed for Xilinx support + cl::Kernel transposeKernel(*config.program, ("transpose" + std::to_string(r)).c_str(), &err); + ASSERT_CL(err) + + + err = transposeKernel.setArg(0, bufferA); + ASSERT_CL(err) + err = transposeKernel.setArg(1, bufferB); + ASSERT_CL(err) + err = transposeKernel.setArg(2, bufferA_out); + ASSERT_CL(err) + err = transposeKernel.setArg(3, static_cast(blocks_per_replication)); + ASSERT_CL(err) 
+ + cl::CommandQueue transQueue(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + + transCommandQueueList.push_back(transQueue); + bufferListA.push_back(bufferA); + bufferListB.push_back(bufferB); + bufferListA_out.push_back(bufferA_out); + transposeKernelList.push_back(transposeKernel); + } + + std::vector<double> transferTimings; + std::vector<double> calculationTimings; + + for (int repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) + { + + MPI_Barrier(MPI_COMM_WORLD); + + auto startTransfer = std::chrono::high_resolution_clock::now(); + size_t bufferOffset = 0; + + for (int r = 0; r < transposeKernelList.size(); r++) + { + transCommandQueueList[r].enqueueWriteBuffer(bufferListB[r], CL_TRUE, 0, + bufferSizeList[r] * sizeof(HOST_DATA_TYPE), &data.B[bufferOffset]); + transCommandQueueList[r].enqueueWriteBuffer(bufferListA[r], CL_TRUE, 0, + bufferSizeList[r] * sizeof(HOST_DATA_TYPE), &data.A[bufferOffset]); + bufferOffset += bufferSizeList[r]; + } + + auto endTransfer = std::chrono::high_resolution_clock::now(); + std::chrono::duration<double> transferTime = + std::chrono::duration_cast<std::chrono::duration<double>>(endTransfer - startTransfer); + + MPI_Barrier(MPI_COMM_WORLD); + + auto startCalculation = std::chrono::high_resolution_clock::now(); + bufferOffset = 0; + for (int r = 0; r < transposeKernelList.size(); r++) + { + transCommandQueueList[r].enqueueReadBuffer(bufferListA[r], CL_TRUE, 0, + bufferSizeList[r] * sizeof(HOST_DATA_TYPE), &data.A[bufferOffset]); + bufferOffset += bufferSizeList[r]; + } + + // Exchange A data via PCIe and MPI + handler.exchangeData(data); + + bufferOffset = 0; + for (int r = 0; r < transposeKernelList.size(); r++) + { + transCommandQueueList[r].enqueueWriteBuffer(bufferListA[r], CL_FALSE, 0, + bufferSizeList[r] * sizeof(HOST_DATA_TYPE), &data.A[bufferOffset]); + bufferOffset += bufferSizeList[r]; + } + + for (int r = 0; r < transposeKernelList.size(); r++) + { + transCommandQueueList[r].enqueueNDRangeKernel(transposeKernelList[r],
cl::NullRange, cl::NDRange(1)); + } + for (int r = 0; r < transposeKernelList.size(); r++) + { + transCommandQueueList[r].finish(); + } + + auto endCalculation = std::chrono::high_resolution_clock::now(); +#ifndef NDEBUG + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " + << "Done i=" << repetition << std::endl; +#endif + std::chrono::duration<double> calculationTime = + std::chrono::duration_cast<std::chrono::duration<double>>(endCalculation - startCalculation); + calculationTimings.push_back(calculationTime.count()); + + // Transfer back data for next repetition! + handler.exchangeData(data); + + bufferOffset = 0; + startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) + { + transCommandQueueList[r].enqueueReadBuffer(bufferListA_out[r], CL_TRUE, 0, + bufferSizeList[r] * sizeof(HOST_DATA_TYPE), &data.result[bufferOffset]); + bufferOffset += bufferSizeList[r]; + } + + endTransfer = std::chrono::high_resolution_clock::now(); + transferTime += + std::chrono::duration_cast<std::chrono::duration<double>>(endTransfer - startTransfer); + transferTimings.push_back(transferTime.count()); + } + + std::unique_ptr<transpose::TransposeExecutionTimings> result(new transpose::TransposeExecutionTimings{ + transferTimings, + calculationTimings}); + return result; + } + + } // namespace bm_execution + } +} + +#endif // SRC_HOST_PCIE_EXECUTION_H_ diff --git a/PTRANS/src/host/execution_types/execution_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_pcie_pq.hpp new file mode 100644 index 00000000..93c891a0 --- /dev/null +++ b/PTRANS/src/host/execution_types/execution_pcie_pq.hpp @@ -0,0 +1,344 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and
to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_PCIE_PQ_EXECUTION_H_ +#define SRC_HOST_PCIE_PQ_EXECUTION_H_ + +/* C++ standard library headers */ +#include +#include +#include + +/* Project's headers */ +#include "transpose_benchmark.hpp" +#include "data_handlers/data_handler_types.h" + +namespace transpose { +namespace fpga_execution { +namespace pcie_pq { + + /** + * @brief Transpose and add the matrices using the OpenCL kernel using a PQ distribution and PCIe+MPI over the host for communication + * + * @param config The progrma configuration + * @param data data object that contains all required data for the execution on the FPGA + * @param handler data handler instance that should be used to exchange data between hosts + * @return std::unique_ptr The measured execution times + */ +static std::unique_ptr + calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::TransposeDataHandler &handler) { + int err; + + if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) { + throw std::runtime_error("Used data handler not supported by execution handler!"); + } +#ifdef USE_SVM + throw new std::runtime_error("SVM not supported in the host implementation of this communication method"); +#endif + + std::vector 
bufferSizeList; + std::vector bufferStartList; + std::vector bufferOffsetList; + std::vector bufferListA; + std::vector bufferListB; + std::vector bufferListA_out; + std::vector transposeKernelList; + std::vector transCommandQueueList; + + size_t local_matrix_width = std::sqrt(data.numBlocks); + size_t local_matrix_width_bytes = local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); + + size_t total_offset = 0; + + // Setup the kernels depending on the number of kernel replications + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + + // Calculate how many blocks the current kernel replication will need to process. + size_t blocks_per_replication = (local_matrix_width / config.programSettings->kernelReplications * local_matrix_width); + size_t blocks_remainder = local_matrix_width % config.programSettings->kernelReplications; + if (blocks_remainder > r) { + // Catch the case, that the number of blocks is not divisible by the number of kernel replications + blocks_per_replication += local_matrix_width; + } + if (blocks_per_replication < 1) { + continue; + } + + size_t buffer_size = blocks_per_replication * data.blockSize * data.blockSize; + bufferSizeList.push_back(buffer_size); + bufferStartList.push_back(total_offset); + + total_offset += blocks_per_replication; + + int memory_bank_info_a = 0; + int memory_bank_info_b = 0; + int memory_bank_info_out = 0; +#ifdef INTEL_FPGA + if (!config.programSettings->useMemoryInterleaving) { + // Define the memory bank the buffers will be placed in + if (config.programSettings->distributeBuffers) { + memory_bank_info_a = ((((r * 3) % 7) + 1) << 16); + memory_bank_info_b = ((((r * 3 + 1) % 7) + 1) << 16); + memory_bank_info_out = ((((r * 3 + 2) % 7) + 1) << 16); + } + else { + memory_bank_info_a = ((r + 1) << 16); + memory_bank_info_b = ((r + 1) << 16); + memory_bank_info_out = ((r + 1) << 16); + } + } +#endif +#ifdef USE_BUFFER_WRITE_RECT_FOR_A + cl::Buffer bufferA(*config.context, CL_MEM_READ_ONLY 
| memory_bank_info_a, + buffer_size * sizeof(HOST_DATA_TYPE)); +#else + cl::Buffer bufferA(*config.context, CL_MEM_READ_ONLY | memory_bank_info_a, + data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE)); +#endif + cl::Buffer bufferB(*config.context, CL_MEM_READ_ONLY | memory_bank_info_b, + buffer_size * sizeof(HOST_DATA_TYPE)); + cl::Buffer bufferA_out(*config.context, CL_MEM_WRITE_ONLY | memory_bank_info_out, + buffer_size * sizeof(HOST_DATA_TYPE)); + +#ifdef INTEL_FPGA + cl::Kernel transposeKernel(*config.program, ("transpose" + std::to_string(r)).c_str(), &err); + ASSERT_CL(err) +#endif +#ifdef XILINX_FPGA + // create the kernels + cl::Kernel transposeKernel(*config.program, ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str(), + &err); + ASSERT_CL(err); +#endif + + err = transposeKernel.setArg(0, bufferA); + ASSERT_CL(err) + err = transposeKernel.setArg(1, bufferB); + ASSERT_CL(err) + err = transposeKernel.setArg(2, bufferA_out); + ASSERT_CL(err) + err = transposeKernel.setArg(4, static_cast(blocks_per_replication)); + ASSERT_CL(err) + err = transposeKernel.setArg(5, static_cast(local_matrix_width)); + ASSERT_CL(err) +#ifndef USE_BUFFER_WRITE_RECT_FOR_A + err = transposeKernel.setArg(6, static_cast(local_matrix_width)); + ASSERT_CL(err) + err = transposeKernel.setArg(3, static_cast(bufferStartList[r])); + ASSERT_CL(err) +#else + err = transposeKernel.setArg(6, static_cast((bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize))); + ASSERT_CL(err) + err = transposeKernel.setArg(3, static_cast(0)); + ASSERT_CL(err) +#endif + + + cl::CommandQueue transQueue(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + + transCommandQueueList.push_back(transQueue); + bufferListA.push_back(bufferA); + bufferListB.push_back(bufferB); + bufferListA_out.push_back(bufferA_out); + transposeKernelList.push_back(transposeKernel); + } + + std::vector transferTimings; + std::vector calculationTimings; + + for (int 
repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) { + + auto startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) { + transCommandQueueList[r].enqueueWriteBuffer(bufferListB[r], CL_FALSE, 0, + bufferSizeList[r]* sizeof(HOST_DATA_TYPE), &data.B[bufferStartList[r] * data.blockSize * data.blockSize]); +#ifdef USE_BUFFER_WRITE_RECT_FOR_A +#ifndef USE_DEPRECATED_HPP_HEADER + cl::array deviceOffset; + cl::array hostOffset; + cl::array rectShape; +#else + cl::size_t<3> deviceOffset; + cl::size_t<3> hostOffset; + cl::size_t<3> rectShape; +#endif + deviceOffset[0] = 0; + deviceOffset[1] = 0; + deviceOffset[2] = 0; + hostOffset[0] = (bufferStartList[r]) / local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); + hostOffset[1] = 0; + hostOffset[2] = 0; + rectShape[0] = (bufferSizeList[r]) / (local_matrix_width * data.blockSize) * sizeof(HOST_DATA_TYPE); + rectShape[1] = local_matrix_width* data.blockSize; + rectShape[2] = 1L; + transCommandQueueList[r].enqueueWriteBufferRect(bufferListA[r],CL_FALSE, + deviceOffset, + hostOffset, + rectShape, + (bufferSizeList[r]) / (local_matrix_width * data.blockSize) * sizeof(HOST_DATA_TYPE), 0, + local_matrix_width* data.blockSize*sizeof(HOST_DATA_TYPE), 0, + data.A); +#else + transCommandQueueList[r].enqueueWriteBuffer(bufferListA[r], CL_FALSE, 0, + data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE), data.A); +#endif + + } + for (int r = 0; r < transposeKernelList.size(); r++) { + transCommandQueueList[r].finish(); + } + auto endTransfer = std::chrono::high_resolution_clock::now(); + + std::chrono::duration transferTime = + std::chrono::duration_cast> + (endTransfer - startTransfer); + + MPI_Barrier(MPI_COMM_WORLD); + + auto startCalculation = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) + { +#ifdef USE_BUFFER_WRITE_RECT_FOR_A +#ifndef USE_DEPRECATED_HPP_HEADER 
+ cl::array deviceOffset; + cl::array hostOffset; + cl::array rectShape; +#else + cl::size_t<3> deviceOffset; + cl::size_t<3> hostOffset; + cl::size_t<3> rectShape; +#endif + deviceOffset[0] = 0; + deviceOffset[1] = 0; + deviceOffset[2] = 0; + hostOffset[0] = (bufferStartList[r]) / local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); + hostOffset[1] = 0; + hostOffset[2] = 0; + rectShape[0] = (bufferSizeList[r]) / (local_matrix_width * data.blockSize) * sizeof(HOST_DATA_TYPE); + rectShape[1] = local_matrix_width* data.blockSize; + rectShape[2] = 1L; + transCommandQueueList[r].enqueueReadBufferRect(bufferListA[r],CL_FALSE, + deviceOffset, + hostOffset, + rectShape, + (bufferSizeList[r]) / (local_matrix_width * data.blockSize) * sizeof(HOST_DATA_TYPE), 0, + local_matrix_width* data.blockSize*sizeof(HOST_DATA_TYPE), 0, + data.A); +#else + transCommandQueueList[r].enqueueReadBuffer(bufferListA[r], CL_FALSE, 0, + data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE), data.A); +#endif + } + + + // Exchange A data via PCIe and MPI + handler.exchangeData(data); + + for (int r = 0; r < transposeKernelList.size(); r++) + { +#ifdef USE_BUFFER_WRITE_RECT_FOR_A +#ifndef USE_DEPRECATED_HPP_HEADER + cl::array deviceOffset; + cl::array hostOffset; + cl::array rectShape; +#else + cl::size_t<3> deviceOffset; + cl::size_t<3> hostOffset; + cl::size_t<3> rectShape; +#endif + deviceOffset[0] = 0; + deviceOffset[1] = 0; + deviceOffset[2] = 0; + hostOffset[0] = (bufferStartList[r]) / local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); + hostOffset[1] = 0; + hostOffset[2] = 0; + rectShape[0] = (bufferSizeList[r]) / (local_matrix_width * data.blockSize) * sizeof(HOST_DATA_TYPE); + rectShape[1] = local_matrix_width* data.blockSize; + rectShape[2] = 1L; + transCommandQueueList[r].enqueueWriteBufferRect(bufferListA[r],CL_FALSE, + deviceOffset, + hostOffset, + rectShape, + (bufferSizeList[r]) / (local_matrix_width * data.blockSize) * sizeof(HOST_DATA_TYPE), 
0, + local_matrix_width* data.blockSize*sizeof(HOST_DATA_TYPE), 0, + data.A); +#else + transCommandQueueList[r].enqueueWriteBuffer(bufferListA[r], CL_TRUE, 0, + data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE), data.A); +#endif + } +#ifndef NDEBUG + auto startKernelCalculation = std::chrono::high_resolution_clock::now(); +#endif + for (int r = 0; r < transposeKernelList.size(); r++) + { + transCommandQueueList[r].enqueueNDRangeKernel(transposeKernelList[r], cl::NullRange, cl::NDRange(1)); + } + for (int r = 0; r < transposeKernelList.size(); r++) + { + transCommandQueueList[r].finish(); + } + auto endCalculation = std::chrono::high_resolution_clock::now(); +#ifndef NDEBUG + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " << "Done i=" << repetition << std::endl; + std::cout << "Kernel execution time: " << std::chrono::duration_cast>(endCalculation - startKernelCalculation).count() + << "s (" << ((config.programSettings->matrixSize * config.programSettings->matrixSize * sizeof(HOST_DATA_TYPE) * 3) + / std::chrono::duration_cast>(endCalculation - startKernelCalculation).count() * 1.0e-9) << " GB/s)" << std::endl; +#endif + + // Transfer back data for next repetition! 
+ handler.exchangeData(data); + + std::chrono::duration calculationTime = + std::chrono::duration_cast> + (endCalculation - startCalculation); + calculationTimings.push_back(calculationTime.count()); + + startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) { + transCommandQueueList[r].enqueueReadBuffer(bufferListA_out[r], CL_TRUE, 0, + bufferSizeList[r]* sizeof(HOST_DATA_TYPE), &data.result[bufferStartList[r] * data.blockSize * data.blockSize]); + } + endTransfer = std::chrono::high_resolution_clock::now(); + transferTime += + std::chrono::duration_cast> + (endTransfer - startTransfer); + transferTimings.push_back(transferTime.count()); + } + + std::unique_ptr result(new transpose::TransposeExecutionTimings{ + transferTimings, + calculationTimings + }); + return result; + } + +} // namespace transpose +} // namespace fpga_execution +} // namespace intel + +#endif \ No newline at end of file diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp index cb114097..7657b85d 100644 --- a/PTRANS/src/host/transpose_benchmark.cpp +++ b/PTRANS/src/host/transpose_benchmark.cpp @@ -31,12 +31,24 @@ SOFTWARE. 
#include /* Project's headers */ -#include "execution.h" +#include "execution_types/execution_intel.hpp" +#include "execution_types/execution_intel_pq.hpp" +#include "execution_types/execution_pcie.hpp" +#include "execution_types/execution_pcie_pq.hpp" +#include "execution_types/execution_cpu.hpp" +#include "communication_types.hpp" + +#include "data_handlers/data_handler_types.h" +#include "data_handlers/diagonal.hpp" +#include "data_handlers/pq.hpp" + #include "parameters.h" + transpose::TransposeBenchmark::TransposeBenchmark(int argc, char* argv[]) : HpccFpgaBenchmark(argc, argv) { - setupBenchmark(argc, argv); - setTransposeDataHandler(executionSettings->programSettings->dataHandlerIdentifier); + if (setupBenchmark(argc, argv)) { + setTransposeDataHandler(executionSettings->programSettings->dataHandlerIdentifier); + } } void @@ -48,12 +60,31 @@ transpose::TransposeBenchmark::addAdditionalParseOptions(cxxopts::Options &optio cxxopts::value()->default_value(std::to_string(BLOCK_SIZE))) ("distribute-buffers", "Distribute buffers over memory banks. This will use three memory banks instead of one for a single kernel replication, but kernel replications may interfere. 
This is an Intel only attribute, since buffer placement is decided at compile time for Xilinx FPGAs.") ("handler", "Specify the used data handler that distributes the data over devices and memory banks", - cxxopts::value()->default_value(TRANSPOSE_HANDLERS_DIST_DIAG)); + cxxopts::value()->default_value(DEFAULT_DIST_TYPE)); } std::unique_ptr transpose::TransposeBenchmark::executeKernel(TransposeData &data) { - return bm_execution::calculate(*executionSettings, data); + switch (executionSettings->programSettings->communicationType) { + case hpcc_base::CommunicationType::intel_external_channels: + if (executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { + return transpose::fpga_execution::intel::calculate(*executionSettings, data); + } + else { + return transpose::fpga_execution::intel_pq::calculate(*executionSettings, data); + } break; + case hpcc_base::CommunicationType::pcie_mpi : + if (executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { + return transpose::fpga_execution::pcie::calculate(*executionSettings, data, *dataHandler); + } + else { + return transpose::fpga_execution::pcie_pq::calculate(*executionSettings, data, *dataHandler); + } break; +#ifdef MKL_FOUND + case hpcc_base::CommunicationType::cpu_only : return transpose::fpga_execution::cpu::calculate(*executionSettings, data, *dataHandler); break; +#endif + default: throw std::runtime_error("No calculate method implemented for communication type " + commToString(executionSettings->programSettings->communicationType)); + } } void @@ -63,39 +94,50 @@ transpose::TransposeBenchmark::collectAndPrintResults(const transpose::Transpose // Number of experiment repetitions uint number_measurements = output.calculationTimings.size(); std::vector max_measures(number_measurements); + std::vector max_transfers(number_measurements); #ifdef _USE_MPI_ // Copy the object variable to a local variable 
to make it accessible to the lambda function int mpi_size = mpi_comm_size; MPI_Reduce(output.calculationTimings.data(), max_measures.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + MPI_Reduce(output.transferTimings.data(), max_transfers.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); #else std::copy(output.calculationTimings.begin(), output.calculationTimings.end(), max_measures.begin()); + std::copy(output.transferTimings.begin(), output.transferTimings.end(), max_transfers.begin()); #endif double avgCalculationTime = accumulate(max_measures.begin(), max_measures.end(), 0.0) / max_measures.size(); double minCalculationTime = *min_element(max_measures.begin(), max_measures.end()); + double avgTransferTime = accumulate(max_transfers.begin(), max_transfers.end(), 0.0) + / max_transfers.size(); + double minTransferTime = *min_element(max_transfers.begin(), max_transfers.end()); + double avgCalcFLOPS = flops / avgCalculationTime; double maxCalcFLOPS = flops / minCalculationTime; double avgMemBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / avgCalculationTime; - double avgNetworkBandwidth = flops * sizeof(HOST_DATA_TYPE) / avgCalculationTime; double maxMemBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / minCalculationTime; - double maxNetworkBandwidth = flops * sizeof(HOST_DATA_TYPE) / minCalculationTime; + double avgTransferBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / avgTransferTime; + double maxTransferBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / minTransferTime; if (mpi_comm_rank == 0) { - std::cout << " calc calc FLOPS Net [B/s] Mem [B/s]" << std::endl; - std::cout << "avg: " << avgCalculationTime + std::cout << " total [s] transfer [s] calc [s] calc FLOPS Mem [B/s] PCIe [B/s]" << std::endl; + std::cout << "avg: " << (avgTransferTime + avgCalculationTime) + << " " << avgTransferTime + << " " << avgCalculationTime << " " << avgCalcFLOPS - << " " << avgNetworkBandwidth << " " << avgMemBandwidth + << " " << 
avgTransferBandwidth << std::endl; - std::cout << "best: " << minCalculationTime + std::cout << "best: " << (minTransferTime + minCalculationTime) + << " " << minTransferTime + << " " << minCalculationTime << " " << maxCalcFLOPS - << " " << maxNetworkBandwidth << " " << maxMemBandwidth + << " " << maxTransferBandwidth << std::endl; } } @@ -111,15 +153,7 @@ transpose::TransposeBenchmark::validateOutputAndPrintError(transpose::TransposeD // exchange the data using MPI depending on the chosen distribution scheme dataHandler->exchangeData(data); - size_t block_offset = executionSettings->programSettings->blockSize * executionSettings->programSettings->blockSize; - for (size_t b = 0; b < data.numBlocks; b++) { - for (size_t i = 0; i < executionSettings->programSettings->blockSize; i++) { - for (size_t j = 0; j < executionSettings->programSettings->blockSize; j++) { - data.A[b * block_offset + j * executionSettings->programSettings->blockSize + i] -= (data.result[b * block_offset + i * executionSettings->programSettings->blockSize + j] - - data.B[b * block_offset + i * executionSettings->programSettings->blockSize + j]); - } - } - } + dataHandler->reference_transpose(data); double max_error = 0.0; for (size_t i = 0; i < executionSettings->programSettings->blockSize * executionSettings->programSettings->blockSize * data.numBlocks; i++) { @@ -138,9 +172,12 @@ transpose::TransposeBenchmark::validateOutputAndPrintError(transpose::TransposeD } void -transpose::TransposeBenchmark::setTransposeDataHandler(std::string dataHandlerIdentifier) { - if (transpose::dataHandlerIdentifierMap.find(dataHandlerIdentifier) == transpose::dataHandlerIdentifierMap.end()) { - throw std::runtime_error("Could not match selected data handler: " + dataHandlerIdentifier); +transpose::TransposeBenchmark::setTransposeDataHandler(transpose::data_handler::DataHandlerType dataHandlerIdentifier) { + switch (dataHandlerIdentifier) { + case transpose::data_handler::DataHandlerType::diagonal: dataHandler = 
std::unique_ptr(new transpose::data_handler::DistributedDiagonalTransposeDataHandler(mpi_comm_rank, mpi_comm_size)); break; + case transpose::data_handler::DataHandlerType::pq: dataHandler = std::unique_ptr(new transpose::data_handler::DistributedPQTransposeDataHandler(mpi_comm_rank, mpi_comm_size)); break; + default: throw std::runtime_error("Could not match selected data handler: " + transpose::data_handler::handlerToString(dataHandlerIdentifier)); } - dataHandler = transpose::dataHandlerIdentifierMap[dataHandlerIdentifier](mpi_comm_rank, mpi_comm_size); + + } diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index e57f4a1c..5de333ca 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -30,7 +30,9 @@ SOFTWARE. /* Project's headers */ #include "hpcc_benchmark.hpp" #include "transpose_data.hpp" -#include "transpose_handlers.hpp" + +#include "data_handlers/data_handler_types.h" +#include "data_handlers/handler.hpp" #include "parameters.h" @@ -56,7 +58,7 @@ class TransposeBenchmark : public hpcc_base::HpccFpgaBenchmark dataHandler; + std::unique_ptr dataHandler; public: @@ -73,7 +75,7 @@ class TransposeBenchmark : public hpcc_base::HpccFpgaBenchmark #include "transpose_data.hpp" +#include "data_handlers/data_handler_types.h" +#include "communication_types.hpp" transpose::TransposeProgramSettings::TransposeProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), matrixSize(results["m"].as() * results["b"].as()), - blockSize(results["b"].as()), dataHandlerIdentifier(results["handler"].as()), + blockSize(results["b"].as()), dataHandlerIdentifier(transpose::data_handler::stringToHandler(results["handler"].as())), distributeBuffers(results["distribute-buffers"].count() > 0) { + // auto detect data distribution type if required + if (dataHandlerIdentifier == transpose::data_handler::DataHandlerType::automatic) { + if (kernelFileName.find("_"+ 
transpose::data_handler::handlerToString(transpose::data_handler::DataHandlerType::diagonal) +"_") != kernelFileName.npos) { + dataHandlerIdentifier = transpose::data_handler::DataHandlerType::diagonal; + } + else if (kernelFileName.find("_"+ transpose::data_handler::handlerToString(transpose::data_handler::DataHandlerType::pq) + "_") != kernelFileName.npos) { + dataHandlerIdentifier = transpose::data_handler::DataHandlerType::pq; + } + if (dataHandlerIdentifier == transpose::data_handler::DataHandlerType::automatic) { + throw std::runtime_error("Required data distribution could not be detected from kernel file name!"); + } + } } std::map transpose::TransposeProgramSettings::getSettingsMap() { auto map = hpcc_base::BaseSettings::getSettingsMap(); - map["Matrix Size"] = std::to_string(matrixSize); + int mpi_size; +#ifdef _USE_MPI_ + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); +#endif + map["Matrix Size"] = std::to_string(matrixSize * static_cast(std::sqrt(mpi_size))); map["Block Size"] = std::to_string(blockSize); map["Dist. Buffers"] = distributeBuffers ? 
"Yes" : "No"; - map["Data Handler"] = dataHandlerIdentifier; + map["Data Handler"] = transpose::data_handler::handlerToString(dataHandlerIdentifier); return map; } @@ -31,6 +50,9 @@ transpose::TransposeData::TransposeData(cl::Context context, uint block_size, ui result = reinterpret_cast( clSVMAlloc(context(), 0 , block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 1024)); + exchange = reinterpret_cast( + clSVMAlloc(context(), 0 , + block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 1024)); #else posix_memalign(reinterpret_cast(&A), 64, sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); @@ -38,6 +60,8 @@ transpose::TransposeData::TransposeData(cl::Context context, uint block_size, ui sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); posix_memalign(reinterpret_cast(&result), 64, sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); + posix_memalign(reinterpret_cast(&exchange), 64, + sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); #endif } } @@ -48,10 +72,12 @@ transpose::TransposeData::~TransposeData() { clSVMFree(context(), reinterpret_cast(A));}); clSVMFree(context(), reinterpret_cast(B));}); clSVMFree(context(), reinterpret_cast(result));}); + clSVMFree(context(), reinterpret_cast(exchange));}); #else free(A); free(B); free(result); + free(exchange); #endif } } diff --git a/PTRANS/src/host/transpose_data.hpp b/PTRANS/src/host/transpose_data.hpp index 4eaa684b..1e318fc0 100644 --- a/PTRANS/src/host/transpose_data.hpp +++ b/PTRANS/src/host/transpose_data.hpp @@ -28,6 +28,8 @@ SOFTWARE. 
/* Project's headers */ #include "hpcc_benchmark.hpp" +#include "data_handlers/data_handler_types.h" + /** * @brief Contains all classes and methods needed by the Transpose benchmark @@ -58,7 +60,7 @@ class TransposeProgramSettings : public hpcc_base::BaseSettings { * @brief Identifier of the used data handler * */ - std::string dataHandlerIdentifier; + transpose::data_handler::DataHandlerType dataHandlerIdentifier; /** * @brief If true, the three buffers for A,B and A_out will be placed on three different memory banks, if possible @@ -107,6 +109,12 @@ class TransposeData { */ HOST_DATA_TYPE* result; + /** + * @brief Data buffer used during data exchange of matrices + * + */ + HOST_DATA_TYPE* exchange; + /** * @brief Number of matrix blocks that are stored in every matrix A, B and result. Blocks are * always stored columnwise. diff --git a/PTRANS/src/host/transpose_handlers.cpp b/PTRANS/src/host/transpose_handlers.cpp deleted file mode 100644 index e3b18c39..00000000 --- a/PTRANS/src/host/transpose_handlers.cpp +++ /dev/null @@ -1,162 +0,0 @@ -/* -Copyright (c) 2020 Marius Meyer - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -#include - -#ifdef _USE_MPI_ -#include "mpi.h" -#endif - -#include "transpose_handlers.hpp" - -// Add every data handler that should be selectable from the command line into this map -// and also specify a string identifier for it -std::map (*)(int rank, int size)> transpose::dataHandlerIdentifierMap{ - // distributed external data handler -#ifdef _USE_MPI_ - {TRANSPOSE_HANDLERS_DIST_DIAG, &generateDataHandler} -#endif - }; - -#ifdef _USE_MPI_ - -transpose::DistributedDiagonalTransposeDataHandler::DistributedDiagonalTransposeDataHandler(int rank, int size) : TransposeDataHandler(rank, size) { - if (rank >= size) { - throw std::runtime_error("MPI rank must be smaller the MPI world size!"); - } -} - - -std::unique_ptr transpose::DistributedDiagonalTransposeDataHandler::generateData(hpcc_base::ExecutionSettings& settings) { - int width_in_blocks = settings.programSettings->matrixSize / settings.programSettings->blockSize; - - int avg_blocks_per_rank = (width_in_blocks * width_in_blocks) / mpi_comm_size; - int avg_diagonal_blocks = width_in_blocks; - if (avg_blocks_per_rank > 0) { - avg_diagonal_blocks = (width_in_blocks / avg_blocks_per_rank); - } - num_diagonal_ranks = std::max(avg_diagonal_blocks, 1); - - if (num_diagonal_ranks % 2 != mpi_comm_size % 2) { - #ifndef NDEBUG - std::cout << "Rank " << mpi_comm_rank << ": Fail 1!" << std::endl; - #endif - // Abort if there is a too high difference in the number of matrix blocks between the MPI ranks - throw std::runtime_error("Matrix size and MPI ranks to not allow fair distribution of blocks! 
Increase or reduce the number of MPI ranks by 1."); - } - if ((mpi_comm_size - num_diagonal_ranks) % 2 != 0 || (mpi_comm_size - num_diagonal_ranks) == 0 && width_in_blocks > 1) { - #ifndef NDEBUG - std::cout << "Rank " << mpi_comm_rank << ": Fail 2!" << std::endl; - #endif - throw std::runtime_error("Not possible to create pairs of MPI ranks for lower and upper half of matrix. Increase number of MPI ranks!."); - } - bool this_rank_is_diagonal = mpi_comm_rank >= (mpi_comm_size - num_diagonal_ranks); - int blocks_if_diagonal = width_in_blocks / num_diagonal_ranks + ( (mpi_comm_rank - (mpi_comm_size - num_diagonal_ranks)) < (width_in_blocks % num_diagonal_ranks) ? 1 : 0); - int blocks_if_not_diagonal = 0; - if ((mpi_comm_size - num_diagonal_ranks) > 0 ) { - blocks_if_not_diagonal = (width_in_blocks * (width_in_blocks - 1)) / (mpi_comm_size - num_diagonal_ranks) + (mpi_comm_rank < ((width_in_blocks * (width_in_blocks - 1)) % (mpi_comm_size - num_diagonal_ranks)) ? 1 : 0); - } - - - int blocks_per_rank = (this_rank_is_diagonal) ? blocks_if_diagonal : blocks_if_not_diagonal; - - if (mpi_comm_rank == 0) { - std::cout << "Diag. 
blocks per rank: " << blocks_if_diagonal << std::endl; - std::cout << "Blocks per rank: " << blocks_if_not_diagonal << std::endl; - std::cout << "Loopback ranks for diagonal blocks: " << num_diagonal_ranks << std::endl; - } - // Height of a matrix generated for a single memory bank on a single MPI rank - int data_height_per_rank = blocks_per_rank * settings.programSettings->blockSize; - -#ifndef NDEBUG - std::cout << "Rank " << mpi_comm_rank << ": NumBlocks = " << blocks_per_rank << std::endl; -#endif - - // Allocate memory for a single device and all its memory banks - auto d = std::unique_ptr(new transpose::TransposeData(*settings.context, settings.programSettings->blockSize, blocks_per_rank)); - - // Fill the allocated memory with pseudo random values - std::mt19937 gen(mpi_comm_rank); - std::uniform_real_distribution<> dis(-100.0, 100.0); - for (size_t i = 0; i < data_height_per_rank; i++) { - for (size_t j = 0; j < settings.programSettings->blockSize; j++) { - d->A[i * settings.programSettings->blockSize + j] = dis(gen); - d->B[i * settings.programSettings->blockSize + j] = dis(gen); - d->result[i * settings.programSettings->blockSize + j] = 0.0; - } - } - - return d; -} - -void transpose::DistributedDiagonalTransposeDataHandler::exchangeData(transpose::TransposeData& data) { -#ifndef NDEBUG - // std::cout << "Start data exchange " << mpi_comm_rank << std::endl; -#endif - // Only need to exchange data, if rank has a partner - if (mpi_comm_rank < mpi_comm_size - num_diagonal_ranks) { - int first_upper_half_rank = (mpi_comm_size - num_diagonal_ranks)/2; - int pair_rank = (mpi_comm_rank >= first_upper_half_rank) ? 
mpi_comm_rank - first_upper_half_rank : mpi_comm_rank + first_upper_half_rank; - - // To re-calculate the matrix transposition locally on this host, we need to - // exchange matrix A for every kernel replication - // The order of the matrix blocks does not change during the exchange, because they are distributed diagonally - // and will be handled in the order below: - // - // . . 1 3 - // . . . 2 - // 1 . . . - // 3 2 . . - MPI_Status status; - size_t remaining_data_size = static_cast(data.blockSize) * data.blockSize * data.numBlocks; - size_t offset = 0; - while (remaining_data_size > 0) { - int next_chunk = (remaining_data_size > std::numeric_limits::max()) ? std::numeric_limits::max(): remaining_data_size; -#ifndef NDEBUG - // std::cout << "Rank " << mpi_comm_rank << " " << next_chunk << " to " << pair_rank << std::endl; -#endif - if (pair_rank > mpi_comm_rank) { - MPI_Send(&data.A[offset], next_chunk, MPI_FLOAT, pair_rank, 0, MPI_COMM_WORLD); - MPI_Recv(&data.A[offset], next_chunk, MPI_FLOAT, pair_rank, 0, MPI_COMM_WORLD, &status); - } - else { - std::vector buffer(next_chunk); - for (int i = 0; i < next_chunk; i++) { - buffer[i] = data.A[offset + i]; - } - MPI_Recv(&data.A[offset], next_chunk, MPI_FLOAT, pair_rank, 0, MPI_COMM_WORLD, &status); - MPI_Send(buffer.data(), next_chunk, MPI_FLOAT, pair_rank, 0, MPI_COMM_WORLD); - } - // MPI_Sendrecv_replace(&data.A[offset], next_chunk, MPI_FLOAT, pair_rank, 0, pair_rank, 0, MPI_COMM_WORLD, &status); - #ifndef NDEBUG - // std::cout << "Rank " << mpi_comm_rank << " Done!"<< std::endl; -#endif - remaining_data_size -= next_chunk; - offset += next_chunk; - } - } -#ifndef NDEBUG - // std::cout << "End data exchange " << mpi_comm_rank << std::endl; -#endif -} - -#endif diff --git a/PTRANS/tests/test_host_functionality.cpp b/PTRANS/tests/test_host_functionality.cpp index 8db00927..1733cf15 100644 --- a/PTRANS/tests/test_host_functionality.cpp +++ b/PTRANS/tests/test_host_functionality.cpp @@ -40,7 +40,7 @@ 
TEST_F(TransposeHostTest, OutputsCorrectFormatHeader) { std::cout.rdbuf(oldStdOutBuffer); EXPECT_THAT(newStdOutBuffer.str(), - ::testing::MatchesRegex("(\\s+)calc(\\s+)calc\\sFLOPS(\\s+)Net\\s\\[B/s\\](\\s+)Mem\\s\\[B/s\\]\n.*")); + ::testing::MatchesRegex("(\\s+)total\\s\\[s\\](\\s+)transfer\\s\\[s\\](\\s+)calc\\s\\[s\\](\\s+)calc\\sFLOPS(\\s+)Mem\\s\\[B/s\\](\\s+)PCIe\\s\\[B/s\\]\n.*")); } /** @@ -66,7 +66,7 @@ TEST_F(TransposeHostTest, OutputsCorrectFormatValues) { std::cout.rdbuf(oldStdOutBuffer); EXPECT_THAT(newStdOutBuffer.str(), - ::testing::MatchesRegex(".*\navg:\\s+1\\.00000e\\+00.*\n.*\n")); + ::testing::MatchesRegex(".*\navg:\\s+2\\.00000e\\+00\\s+1\\.00000e\\+00\\s+1\\.00000e\\+00.*\n.*\n")); } /** diff --git a/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp b/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp index 93673b59..d7bc0c7f 100644 --- a/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp +++ b/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp @@ -65,6 +65,34 @@ TEST_F(TransposeKernelTest, FPGACorrectBStaysTheSame) { EXPECT_FLOAT_EQ(aggregated_error, 0.0); } +/** + * Tests if B will not be transposed + */ +TEST_F(TransposeKernelTest, FPGACorrectBStaysTheSame4Blocks) { + if (bm->getExecutionSettings().programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { + // Diagonal data handler does not support this test, since matrix is stored differently in memory buffer + return; + } + matrix_size = BLOCK_SIZE * bm->getExecutionSettings().programSettings->kernelReplications; + bm->getExecutionSettings().programSettings->matrixSize = matrix_size; + data = bm->generateInputData(); + createChannelFilesAndSymbolicLinks(); + for (int i = 0; i < matrix_size; i++) { + for (int j = 0; j < matrix_size; j++) { + data->A[i * matrix_size + j] = 0.0; + data->B[i * matrix_size + j] = i * matrix_size + j; + } + } + bm->executeKernel(*data); + double aggregated_error = 0.0; + 
for (int i = 0; i < matrix_size; i++) { + for (int j = 0; j < matrix_size; j++) { + aggregated_error += std::abs(data->result[i * matrix_size + j] - data->B[i * matrix_size + j]); + } + } + EXPECT_FLOAT_EQ(aggregated_error, 0.0); +} + /** * Tests if a block of A will be correctly transposed */ @@ -85,6 +113,34 @@ TEST_F(TransposeKernelTest, FPGAABlockIsTransposed) { EXPECT_FLOAT_EQ(aggregated_error, 0.0); } +/** + * Tests if a block of A will be correctly transposed + */ +TEST_F(TransposeKernelTest, FPGAABlockIsTransposed4Blocks) { + if (bm->getExecutionSettings().programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { + // Diagonal data handler does not support this test, since matrix is stored differently in memory buffer + return; + } + matrix_size = BLOCK_SIZE * bm->getExecutionSettings().programSettings->kernelReplications; + bm->getExecutionSettings().programSettings->matrixSize = matrix_size; + data = bm->generateInputData(); + createChannelFilesAndSymbolicLinks(); + for (int i = 0; i < matrix_size; i++) { + for (int j = 0; j < matrix_size; j++) { + data->A[i * matrix_size + j] = i * matrix_size + j; + data->B[i * matrix_size + j] = 0.0; + } + } + bm->executeKernel(*data); + double aggregated_error = 0.0; + for (int i = 0; i < matrix_size; i++) { + for (int j = 0; j < matrix_size; j++) { + aggregated_error += std::abs(data->result[i * matrix_size + j] - data->A[j * matrix_size + i]); + } + } + EXPECT_FLOAT_EQ(aggregated_error, 0.0); +} + /** * Tests if matrix A and B will be summed up in the result */ @@ -105,6 +161,34 @@ TEST_F(TransposeKernelTest, FPGAAAndBAreSummedUp) { EXPECT_FLOAT_EQ(aggregated_error, 0.0); } +/** + * Tests if matrix A and B will be summed up in the result + */ +TEST_F(TransposeKernelTest, FPGAAAndBAreSummedUp4Blocks) { + if (bm->getExecutionSettings().programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { + // Diagonal data handler does not support this 
test, since matrix is stored differently in memory buffer + return; + } + matrix_size = BLOCK_SIZE * bm->getExecutionSettings().programSettings->kernelReplications; + bm->getExecutionSettings().programSettings->matrixSize = matrix_size; + data = bm->generateInputData(); + createChannelFilesAndSymbolicLinks(); + for (int i = 0; i < matrix_size; i++) { + for (int j = 0; j < matrix_size; j++) { + data->A[i * matrix_size + j] = 1.0; + data->B[i * matrix_size + j] = i * matrix_size + j; + } + } + bm->executeKernel(*data); + double aggregated_error = 0.0; + for (int i = 0; i < matrix_size; i++) { + for (int j = 0; j < matrix_size; j++) { + aggregated_error += std::abs(data->result[i * matrix_size + j] - (data->B[i * matrix_size + j] + 1.0)); + } + } + EXPECT_FLOAT_EQ(aggregated_error, 0.0); +} + /** * Checks the size and values of the timing measurements that are retured by calculate. diff --git a/PTRANS/tests/test_transpose_data_handlers.cpp b/PTRANS/tests/test_transpose_data_handlers.cpp index 6f6d0249..9b666bc2 100644 --- a/PTRANS/tests/test_transpose_data_handlers.cpp +++ b/PTRANS/tests/test_transpose_data_handlers.cpp @@ -7,7 +7,7 @@ #include "test_program_settings.h" #include "gmock/gmock-matchers.h" #include "transpose_benchmark.hpp" -#include "transpose_handlers.hpp" +#include "data_handlers/diagonal.hpp" struct TransposeHandlersTest : testing::Test { @@ -15,6 +15,7 @@ struct TransposeHandlersTest : testing::Test { TransposeHandlersTest() { bm = std::unique_ptr( new transpose::TransposeBenchmark(global_argc, global_argv)); + bm->setTransposeDataHandler(transpose::data_handler::DataHandlerType::diagonal); } void SetUp() override { @@ -29,11 +30,11 @@ struct TransposeHandlersTest : testing::Test { * Test DitExt class instantiation */ TEST_F(TransposeHandlersTest, DistDiagCreateHandlerSuccess) { - EXPECT_NO_THROW(transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](0,1)); + 
EXPECT_NO_THROW(transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1)); } TEST_F(TransposeHandlersTest, DistDiagCreateHandlerFail) { - EXPECT_THROW(transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](1,1), std::runtime_error); + EXPECT_THROW(transpose::data_handler::DistributedDiagonalTransposeDataHandler(1,1), std::runtime_error); } /** @@ -48,8 +49,8 @@ TEST_F(TransposeHandlersTest, DistDiagNumberOfBlocksCorrectForMPI1Block1) { bm->getExecutionSettings().programSettings->matrixSize = 4* matrix_size_in_blocks; uint block_count = 0; for (int i=0; i < mpi_size; i++) { - auto h = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](i, mpi_size); - auto d = h->generateData(bm->getExecutionSettings()); + auto h = transpose::data_handler::DistributedDiagonalTransposeDataHandler(i, mpi_size); + auto d = h.generateData(bm->getExecutionSettings()); block_count += d->numBlocks; } EXPECT_EQ(block_count, matrix_size_in_blocks * matrix_size_in_blocks); @@ -63,8 +64,8 @@ TEST_F(TransposeHandlersTest, DistDiagNumberOfBlocksCorrectForMPI3Block3) { bm->getExecutionSettings().programSettings->matrixSize = 4* matrix_size_in_blocks; uint block_count = 0; for (int i=0; i < mpi_size; i++) { - auto h = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](i, mpi_size); - auto d = h->generateData(bm->getExecutionSettings()); + auto h = transpose::data_handler::DistributedDiagonalTransposeDataHandler(i, mpi_size); + auto d = h.generateData(bm->getExecutionSettings()); block_count += d->numBlocks; } EXPECT_EQ(block_count, matrix_size_in_blocks * matrix_size_in_blocks); @@ -78,8 +79,8 @@ TEST_F(TransposeHandlersTest, DistDiagNumberOfBlocksCorrectForMPI9Block3) { bm->getExecutionSettings().programSettings->matrixSize = 4* matrix_size_in_blocks; uint block_count = 0; for (int i=0; i < mpi_size; i++) { - auto h = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](i, mpi_size); - auto d = 
h->generateData(bm->getExecutionSettings()); + auto h = transpose::data_handler::DistributedDiagonalTransposeDataHandler(i, mpi_size); + auto d = h.generateData(bm->getExecutionSettings()); block_count += d->numBlocks; } EXPECT_EQ(block_count, matrix_size_in_blocks * matrix_size_in_blocks); @@ -93,8 +94,8 @@ TEST_F(TransposeHandlersTest, DistDiagNumberOfBlocksCorrectForMPI5Block4) { bm->getExecutionSettings().programSettings->matrixSize = 4* matrix_size_in_blocks; uint block_count = 0; for (int i=0; i < mpi_size; i++) { - auto h = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](i, mpi_size); - auto d = h->generateData(bm->getExecutionSettings()); + auto h = transpose::data_handler::DistributedDiagonalTransposeDataHandler(i, mpi_size); + auto d = h.generateData(bm->getExecutionSettings()); block_count += d->numBlocks; } EXPECT_EQ(block_count, matrix_size_in_blocks * matrix_size_in_blocks); @@ -105,45 +106,45 @@ TEST_F(TransposeHandlersTest, DistDiagNumberOfBlocksCorrectForMPI5Block4) { * */ TEST_F(TransposeHandlersTest, DataGenerationDistDiagSucceedsForMPISizeEquals1SingleBlock) { - auto handler = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](0,1); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); bm->getExecutionSettings().programSettings->blockSize = 4; bm->getExecutionSettings().programSettings->matrixSize = 4; - EXPECT_NO_THROW(handler->generateData(bm->getExecutionSettings())); + EXPECT_NO_THROW(handler.generateData(bm->getExecutionSettings())); } TEST_F(TransposeHandlersTest, DataGenerationDistDiagSucceedsForMPISizeEquals1Blocks9) { - auto handler = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](0,1); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); bm->getExecutionSettings().programSettings->blockSize = 4; bm->getExecutionSettings().programSettings->matrixSize = 4*3; - 
EXPECT_THROW(handler->generateData(bm->getExecutionSettings()), std::runtime_error); + EXPECT_THROW(handler.generateData(bm->getExecutionSettings()), std::runtime_error); } TEST_F(TransposeHandlersTest, DataGenerationDistDiagSucceedsForMPISizeEquals3Blocks9) { - auto handler = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](0,3); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,3); bm->getExecutionSettings().programSettings->blockSize = 4; bm->getExecutionSettings().programSettings->matrixSize = 4*3; - EXPECT_NO_THROW(handler->generateData(bm->getExecutionSettings())); + EXPECT_NO_THROW(handler.generateData(bm->getExecutionSettings())); } TEST_F(TransposeHandlersTest, DataGenerationDistDiagFailsForMPISizeEquals3Blocks1) { - auto handler = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](0,3); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,3); bm->getExecutionSettings().programSettings->blockSize = 4; bm->getExecutionSettings().programSettings->matrixSize = 4; - EXPECT_NO_THROW(handler->generateData(bm->getExecutionSettings())); + EXPECT_NO_THROW(handler.generateData(bm->getExecutionSettings())); } TEST_F(TransposeHandlersTest, DataGenerationDistDiagFailsForMPISizeEquals3Blocks4) { - auto handler = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](0,3); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,3); bm->getExecutionSettings().programSettings->blockSize = 4; bm->getExecutionSettings().programSettings->matrixSize = 4 * 2; - EXPECT_THROW(handler->generateData(bm->getExecutionSettings()), std::runtime_error); + EXPECT_THROW(handler.generateData(bm->getExecutionSettings()), std::runtime_error); } TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagForOneReplication) { bm->getExecutionSettings().programSettings->kernelReplications = 1; bm->getExecutionSettings().programSettings->matrixSize = 
bm->getExecutionSettings().programSettings->blockSize; - auto handler = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](0,1); - auto data = handler->generateData(bm->getExecutionSettings()); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); + auto data = handler.generateData(bm->getExecutionSettings()); EXPECT_EQ(data->blockSize, bm->getExecutionSettings().programSettings->blockSize); EXPECT_EQ(data->numBlocks, 1); } @@ -151,8 +152,8 @@ TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagForOneReplication) { TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagForTwoReplications) { bm->getExecutionSettings().programSettings->kernelReplications = 2; bm->getExecutionSettings().programSettings->matrixSize = bm->getExecutionSettings().programSettings->blockSize; - auto handler = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](0,1); - auto data = handler->generateData(bm->getExecutionSettings()); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); + auto data = handler.generateData(bm->getExecutionSettings()); EXPECT_EQ(data->blockSize, bm->getExecutionSettings().programSettings->blockSize); EXPECT_EQ(data->numBlocks, 1); } @@ -160,9 +161,9 @@ TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagForTwoReplications) { TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagReproducableA) { bm->getExecutionSettings().programSettings->kernelReplications = 2; bm->getExecutionSettings().programSettings->matrixSize = bm->getExecutionSettings().programSettings->blockSize; - auto handler = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](0,1); - auto data = handler->generateData(bm->getExecutionSettings()); - auto data2 = handler->generateData(bm->getExecutionSettings()); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); + auto data = handler.generateData(bm->getExecutionSettings()); + auto data2 = 
handler.generateData(bm->getExecutionSettings()); double aggregated_error = 0.0; for (int i = 0; i < data->blockSize * data->blockSize * data->numBlocks; i++) { aggregated_error += std::fabs(data->A[i] - data2->A[i]); @@ -173,9 +174,9 @@ TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagReproducableA) { TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagReproducableB) { bm->getExecutionSettings().programSettings->kernelReplications = 2; bm->getExecutionSettings().programSettings->matrixSize = bm->getExecutionSettings().programSettings->blockSize; - auto handler = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](0,1); - auto data = handler->generateData(bm->getExecutionSettings()); - auto data2 = handler->generateData(bm->getExecutionSettings()); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); + auto data = handler.generateData(bm->getExecutionSettings()); + auto data2 = handler.generateData(bm->getExecutionSettings()); double aggregated_error = 0.0; for (int i = 0; i < data->blockSize * data->blockSize * data->numBlocks; i++) { aggregated_error += std::fabs(data->B[i] - data2->B[i]); @@ -186,10 +187,10 @@ TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagReproducableB) { TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagExchangeWorksForSingleRank) { bm->getExecutionSettings().programSettings->kernelReplications = 2; bm->getExecutionSettings().programSettings->matrixSize = bm->getExecutionSettings().programSettings->blockSize; - auto handler = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](0,1); - auto data = handler->generateData(bm->getExecutionSettings()); - auto data2 = handler->generateData(bm->getExecutionSettings()); - handler->exchangeData(*data); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); + auto data = handler.generateData(bm->getExecutionSettings()); + auto data2 = 
handler.generateData(bm->getExecutionSettings()); + handler.exchangeData(*data); double aggregated_error = 0.0; for (int i = 0; i < data->blockSize * data->blockSize * data->numBlocks; i++) { aggregated_error += std::fabs(data->A[i] - data2->A[i]); diff --git a/README.md b/README.md index c75d7ab0..4f7325c8 100755 --- a/README.md +++ b/README.md @@ -227,7 +227,7 @@ This allows higher memory bandwidths during kernel execution. - *High Bandwidth Memory (HBM)*: The FPGA fabric itself is equipped with memory banks that can be accessed by the host to copy data. Compared to DDR, this memory type consists of more, but smaller memory banks so that the host needs to split the data between all memory banks to achieve the best performance. Still, the total achievable memory bandwidth is much higher compared to DDR. The following three tables contain an overview of the compatibility of all benchmarks that use global memory with the three mentioned memory types. -b_eff is not included since it does not use global memory. +b_eff does use global memory only for validation. Still, the support for different memory types needs to be implemented on the host side. Full support of the benchmark is indicated with a **Yes**, functionally correct behavior but performance limitations are indicated with **(Yes)**, no support is indicated with **No**. For Xilinx, all benchmarks need a compatible compile- and link-settings-file to map the kernel memory ports to the available memory banks. LINPACK, PTRANS and b_eff are currently not working with Xilinx FPGAs because the implementations lack support for inter-FPGA communication on these devices. @@ -239,10 +239,11 @@ Support will be added subsequently. 
|--------------|------------|--------------| | STREAM | Yes | Yes | | RandomAccess | Yes | Yes | -| PTRANS | Yes | No | -| LINPACK | Yes | No | +| PTRANS | Yes | Yes | +| LINPACK | Yes | Yes | | GEMM | Yes | Yes | -| FFT | Yes | Yes | +| FFT | Yes | Yes | +| b_eff | Yes | Yes | #### HBM @@ -257,6 +258,7 @@ Support will be added subsequently. | LINPACK | No | No | | GEMM | Yes | Yes | | FFT | Yes | Yes | +| b_eff | No | No | #### SVM @@ -270,6 +272,7 @@ SVM could not be tested with Xilinx-based boards, yet. Thus, they are considered | LINPACK | No | No | | GEMM | Yes | No | | FFT | Yes | No | +| b_eff | No | No | ## Publications diff --git a/STREAM/CMakeLists.txt b/STREAM/CMakeLists.txt index b087939b..67f66982 100755 --- a/STREAM/CMakeLists.txt +++ b/STREAM/CMakeLists.txt @@ -11,8 +11,10 @@ set(INNER_LOOP_BUFFERS ON CACHE BOOL "Put the local memory buffers inside the ou mark_as_advanced(INNER_LOOP_BUFFERS) -# Set the data type since optional vector types are used -set(DATA_TYPE float) +# Set the data type if not defined before to set up vector types +if (NOT DEFINED DATA_TYPE) + set(DATA_TYPE float) +endif() set(HOST_DATA_TYPE cl_${DATA_TYPE}) if (VECTOR_COUNT GREATER 1) set(DEVICE_DATA_TYPE ${DATA_TYPE}${VECTOR_COUNT}) diff --git a/b_eff/CMakeLists.txt b/b_eff/CMakeLists.txt index ecdd99ac..0793aa92 100755 --- a/b_eff/CMakeLists.txt +++ b/b_eff/CMakeLists.txt @@ -14,6 +14,8 @@ set(NUM_REPLICATIONS 2 CACHE STRING "") set(USE_MPI Yes) set(USE_DEPRECATED_HPP_HEADER No) +set(COMMUNICATION_TYPE_SUPPORT_ENABLED Yes) + set(DATA_TYPE char) include(${CMAKE_SOURCE_DIR}/../cmake/general_benchmark_build_setup.cmake) unset(DATA_TYPE CACHE) diff --git a/b_eff/src/device/CMakeLists.txt b/b_eff/src/device/CMakeLists.txt index 4f34a65a..8316a884 100644 --- a/b_eff/src/device/CMakeLists.txt +++ b/b_eff/src/device/CMakeLists.txt @@ -3,8 +3,12 @@ set(KERNEL_REPLICATION_ENABLED Yes CACHE INTERNAL "Enables kernel replication in set(NUM_REPLICATIONS 2) 
include(${CMAKE_SOURCE_DIR}/../cmake/kernelTargets.cmake) -generate_kernel_targets_intel(communication_bw520n) -add_test(NAME test_emulation_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_emulate.aocx -l 1 -u 10 -m 0 -n 1 +generate_kernel_targets_intel(communication_bw520n_IEC) +add_test(NAME test_emulation_iec_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 10 -m 0 -n 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) -add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Network_intel -f communication_bw520n_emulate.aocx -l 1 -u 1 -m 20 -n 1 +add_test(NAME test_emulation_cpu_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx --comm-type CPU -l 1 -u 10 -m 0 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) +add_test(NAME test_emulation_pcie_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx --comm-type PCIE -l 1 -u 10 -m 0 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) +add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 1 -m 20 -n 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) diff --git a/b_eff/src/device/communication_bw520n.cl b/b_eff/src/device/communication_bw520n_IEC.cl similarity index 100% rename from b_eff/src/device/communication_bw520n.cl rename to b_eff/src/device/communication_bw520n_IEC.cl diff --git a/b_eff/src/host/CMakeLists.txt b/b_eff/src/host/CMakeLists.txt index 44bb1832..fb08281f 100755 --- a/b_eff/src/host/CMakeLists.txt +++ b/b_eff/src/host/CMakeLists.txt @@ -1,5 
+1,5 @@ add_subdirectory(../../../shared ${CMAKE_BINARY_DIR}/lib/hpccbase) -set(HOST_SOURCE execution_default.cpp network_benchmark.cpp) +set(HOST_SOURCE network_benchmark.cpp) include_directories(${MPI_CXX_INCLUDE_PATH}) set(HOST_EXE_NAME Network) diff --git a/PTRANS/src/host/execution.h b/b_eff/src/host/execution_types/execution.hpp similarity index 56% rename from PTRANS/src/host/execution.h rename to b_eff/src/host/execution_types/execution.hpp index 81ed371c..df630838 100644 --- a/PTRANS/src/host/execution.h +++ b/b_eff/src/host/execution_types/execution.hpp @@ -19,32 +19,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#ifndef SRC_HOST_EXECUTION_H_ -#define SRC_HOST_EXECUTION_H_ -/* C++ standard library headers */ -#include -#include - -/* External library headers */ -#include "CL/cl.hpp" -#include "parameters.h" -#include "transpose_benchmark.hpp" - - -namespace bm_execution { - - -/** - * @brief Transpose and add the matrices using the OpenCL kernel - * - * @param config The progrma configuration - * @param data data object that contains all required data for the execution on the FPGA - * @return std::unique_ptr The measured execution times - */ - std::unique_ptr - calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data); - -} // namespace bm_execution - -#endif // SRC_HOST_EXECUTION_H_ +#include "execution_types/execution_cpu.hpp" +#include "execution_types/execution_pcie.hpp" +#include "execution_types/execution_iec.hpp" \ No newline at end of file diff --git a/b_eff/src/host/execution_types/execution_cpu.hpp b/b_eff/src/host/execution_types/execution_cpu.hpp new file mode 100644 index 00000000..778dc2f1 --- /dev/null +++ b/b_eff/src/host/execution_types/execution_cpu.hpp @@ -0,0 +1,118 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a 
copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_EXECUTION_TYPES_EXECUTION_CPU_HPP +#define SRC_HOST_EXECUTION_TYPES_EXECUTION_CPU_HPP + +/* C++ standard library headers */ +#include +#include +#include + +/* External library headers */ +#include "mpi.h" + +/* Project's headers */ + +namespace network::execution_types::cpu { + + /* + Implementation for the single kernel. 
+ @copydoc bm_execution::calculate() + */ + std::shared_ptr + calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, + cl::vector &validationData) { + + int err; + std::vector sendQueues; + std::vector dummyBuffers; + std::vector> dummyBufferContents; + + cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize)); + + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + + int current_size; + MPI_Comm_size(MPI_COMM_WORLD, & current_size); + + std::vector calculationTimings; + for (uint r =0; r < config.programSettings->numRepetitions; r++) { + sendQueues.clear(); + dummyBuffers.clear(); + dummyBufferContents.clear(); + // Create all kernels and buffers. The kernel pairs are generated twice to utilize all channels + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + + dummyBuffers.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE) * size_in_bytes,0,&err)); + ASSERT_CL(err) + + dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); + + cl::CommandQueue sendQueue(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + + sendQueue.enqueueWriteBuffer(dummyBuffers.back(), CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents.back().data()); + + sendQueues.push_back(sendQueue); + + } + double calculationTime = 0.0; + for (int i = 0; i < config.programSettings->kernelReplications; i++) { + MPI_Barrier(MPI_COMM_WORLD); + auto startCalculation = std::chrono::high_resolution_clock::now(); + for (int l = 0; l < looplength; l++) { + MPI_Sendrecv(dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, + dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } + auto endCalculation = 
std::chrono::high_resolution_clock::now(); + calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); + #ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Enqueued " << r << "," << i << std::endl; + #endif + } + calculationTimings.push_back(calculationTime); +#ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Done " << r << std::endl; +#endif + } + // Read validation data from FPGA will be placed sequentially in buffer for all replications + // The data order should not matter, because every byte should have the same value! + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + err = sendQueues[r].enqueueReadBuffer(dummyBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]); + ASSERT_CL(err); + } + std::shared_ptr result(new network::ExecutionTimings{ + looplength, + messageSize, + calculationTimings + }); + return result; + } + +} // namespace bm_execution + +#endif diff --git a/b_eff/src/host/execution_default.cpp b/b_eff/src/host/execution_types/execution_iec.hpp similarity index 95% rename from b_eff/src/host/execution_default.cpp rename to b_eff/src/host/execution_types/execution_iec.hpp index 1c5cd908..bed54f09 100644 --- a/b_eff/src/host/execution_default.cpp +++ b/b_eff/src/host/execution_types/execution_iec.hpp @@ -19,9 +19,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ - -/* Related header files */ -#include "execution.h" +#ifndef SRC_HOST_EXECUTION_TYPES_EXECUTION_IEC_HPP +#define SRC_HOST_EXECUTION_TYPES_EXECUTION_IEC_HPP /* C++ standard library headers */ #include @@ -34,7 +33,7 @@ SOFTWARE. /* Project's headers */ -namespace bm_execution { +namespace network::execution_types::iec { /* Implementation for the single kernel. @@ -126,7 +125,7 @@ namespace bm_execution { // Read validation data from FPGA will be placed sequentially in buffer for all replications // The data order should not matter, because every byte should have the same value! for (int r = 0; r < config.programSettings->kernelReplications; r++) { - err = recvQueues[r].enqueueReadBuffer(validationBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / 2, &validationData.data()[r * validationData.size() / 2]); + err = recvQueues[r].enqueueReadBuffer(validationBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]); ASSERT_CL(err); } std::shared_ptr result(new network::ExecutionTimings{ @@ -138,3 +137,5 @@ namespace bm_execution { } } // namespace bm_execution + +#endif \ No newline at end of file diff --git a/b_eff/src/host/execution_types/execution_pcie.hpp b/b_eff/src/host/execution_types/execution_pcie.hpp new file mode 100644 index 00000000..73156b7e --- /dev/null +++ b/b_eff/src/host/execution_types/execution_pcie.hpp @@ -0,0 +1,124 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the 
following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_EXECUTION_TYPES_EXECUTION_PCIE_HPP +#define SRC_HOST_EXECUTION_TYPES_EXECUTION_PCIE_HPP + +/* C++ standard library headers */ +#include +#include +#include + +/* External library headers */ +#include "mpi.h" + +/* Project's headers */ + +namespace network::execution_types::pcie { + + /* + Implementation for the single kernel. + @copydoc bm_execution::calculate() + */ + std::shared_ptr + calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, + cl::vector &validationData) { + + int err; + std::vector sendQueues; + std::vector dummyBuffers; + std::vector> dummyBufferContents; + + cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize)); + + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + + int current_size; + MPI_Comm_size(MPI_COMM_WORLD, & current_size); + + std::vector calculationTimings; + for (uint r =0; r < config.programSettings->numRepetitions; r++) { + sendQueues.clear(); + dummyBuffers.clear(); + dummyBufferContents.clear(); + // Create all kernels and buffers. 
The kernel pairs are generated twice to utilize all channels + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + + dummyBuffers.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE) * size_in_bytes,0,&err)); + ASSERT_CL(err) + + dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); + + cl::CommandQueue sendQueue(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + + sendQueue.enqueueWriteBuffer(dummyBuffers.back(), CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents.back().data()); + + sendQueues.push_back(sendQueue); + + } + double calculationTime = 0.0; + for (int i = 0; i < config.programSettings->kernelReplications; i++) { + MPI_Barrier(MPI_COMM_WORLD); + auto startCalculation = std::chrono::high_resolution_clock::now(); + for (int l = 0; l < looplength; l++) { + + sendQueues[i].enqueueReadBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data()); + + MPI_Sendrecv(dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, + dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + + sendQueues[i].enqueueWriteBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data()); + + } + auto endCalculation = std::chrono::high_resolution_clock::now(); + calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); + #ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Enqueued " << r << "," << i << std::endl; + #endif + } + calculationTimings.push_back(calculationTime); +#ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank 
" << current_rank << ": Done " << r << std::endl; +#endif + } + // Read validation data from FPGA will be placed sequentially in buffer for all replications + // The data order should not matter, because every byte should have the same value! + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + err = sendQueues[r].enqueueReadBuffer(dummyBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]); + ASSERT_CL(err); + } + std::shared_ptr result(new network::ExecutionTimings{ + looplength, + messageSize, + calculationTimings + }); + return result; + } + +} // namespace network::execution_types::pcie + +#endif diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp index 8a310a14..9d1512e4 100644 --- a/b_eff/src/host/network_benchmark.cpp +++ b/b_eff/src/host/network_benchmark.cpp @@ -31,7 +31,7 @@ SOFTWARE. #include /* Project's headers */ -#include "execution.h" +#include "execution_types/execution.hpp" #include "parameters.h" network::NetworkProgramSettings::NetworkProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), @@ -50,6 +50,7 @@ network::NetworkProgramSettings::getSettingsMap() { network::NetworkData::NetworkDataItem::NetworkDataItem(unsigned int _messageSize, unsigned int _loopLength) : messageSize(_messageSize), loopLength(_loopLength), validationBuffer(CHANNEL_WIDTH * 2 * 2, 0) { + // TODO: fix the validation buffer size to use the variable number of kernel replications and channels // Validation data buffer should be big enough to fit the data of two channels // for every repetition. 
The number of kernel replications is fixed to 2, which // also needs to be multiplied with the buffer size @@ -104,7 +105,14 @@ network::NetworkBenchmark::executeKernel(NetworkData &data) { if (world_rank == 0) { std::cout << "Measure for " << (1 << run.messageSize) << " Byte" << std::endl; } - timing_results.push_back(bm_execution::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer)); + std::shared_ptr timing; + switch (executionSettings->programSettings->communicationType) { + case hpcc_base::CommunicationType::cpu_only: timing = execution_types::cpu::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break; + case hpcc_base::CommunicationType::pcie_mpi: timing = execution_types::pcie::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break; + case hpcc_base::CommunicationType::intel_external_channels: timing = execution_types::iec::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break; + default: throw std::runtime_error("Selected Communication type not supported: " + hpcc_base::commToString(executionSettings->programSettings->communicationType)); + } + timing_results.push_back(timing); } std::unique_ptr collected_results = std::unique_ptr (new network::NetworkExecutionTimings()); diff --git a/b_eff/tests/test_kernel_functionality_and_host_integration.cpp b/b_eff/tests/test_kernel_functionality_and_host_integration.cpp index a564cedc..ba254201 100644 --- a/b_eff/tests/test_kernel_functionality_and_host_integration.cpp +++ b/b_eff/tests/test_kernel_functionality_and_host_integration.cpp @@ -10,7 +10,7 @@ #include "test_program_settings.h" #include -struct NetworkKernelTest : testing::Test { +struct NetworkKernelTest : testing::TestWithParam { std::unique_ptr bm; std::unique_ptr data; unsigned numberOfChannels = 4; @@ -22,6 +22,7 @@ struct NetworkKernelTest : testing::Test { void SetUp() override { bm = std::unique_ptr(new 
network::NetworkBenchmark(global_argc, global_argv)); bm->getExecutionSettings().programSettings->numRepetitions = 1; + bm->getExecutionSettings().programSettings->communicationType = GetParam(); data = bm->generateInputData(); createChannelFilesAndSymbolicLinks(); } @@ -47,7 +48,7 @@ struct NetworkKernelTest : testing::Test { /** * Tests if calculate returns the correct execution results */ -TEST_F(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor111) { +TEST_P(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor111) { data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(1,1)); auto result = bm->executeKernel(*data); @@ -59,7 +60,7 @@ TEST_F(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor111) { /** * Tests if calculate returns the correct execution results for multiple repetitions */ -TEST_F(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor842) { +TEST_P(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor842) { bm->getExecutionSettings().programSettings->numRepetitions = 2; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(8,4)); @@ -72,7 +73,11 @@ TEST_F(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor842) { /** * Tests if data is written to the channels for small message sizes */ -TEST_F(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingOneChannel) { +TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingOneChannel) { + if (bm->getExecutionSettings().programSettings->communicationType != hpcc_base::CommunicationType::intel_external_channels) { + // Skip this test if no IEC are used, because they are specific to the IEC emulation based on files + GTEST_SKIP(); + } const unsigned messageSize = std::log2(CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); @@ -96,7 +101,11 @@ TEST_F(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingOneChannel) /** * Tests if data is 
written to the channels for small message sizes filling two channels */ -TEST_F(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingTwoChannels) { +TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingTwoChannels) { + if (bm->getExecutionSettings().programSettings->communicationType != hpcc_base::CommunicationType::intel_external_channels) { + // Skip this test if no IEC are used, because they are specific to the IEC emulation based on files + GTEST_SKIP(); + } const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); @@ -117,7 +126,11 @@ TEST_F(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingTwoChannels /** * Tests if data is written to the channels for message sizes filling more than two channels */ -TEST_F(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingMoreThanTwoChannels) { +TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingMoreThanTwoChannels) { + if (bm->getExecutionSettings().programSettings->communicationType != hpcc_base::CommunicationType::intel_external_channels) { + // Skip this test if no IEC are used, because they are specific to the IEC emulation based on files + GTEST_SKIP(); + } const unsigned messageSize = std::log2(8 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 1; data->items.clear(); @@ -138,7 +151,11 @@ TEST_F(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingMoreThanTwo /** * Tests if correct data is written to the channels */ -TEST_F(NetworkKernelTest, CorrectDataIsWrittenToChannel) { +TEST_P(NetworkKernelTest, CorrectDataIsWrittenToChannel) { + if (bm->getExecutionSettings().programSettings->communicationType != hpcc_base::CommunicationType::intel_external_channels) { + // Skip this test if no IEC are used, because they are specific to the IEC emulation based on files + GTEST_SKIP(); + } const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / 
sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); @@ -158,7 +175,7 @@ TEST_F(NetworkKernelTest, CorrectDataIsWrittenToChannel) { delete [] buffer; } -TEST_F(NetworkKernelTest, ValidationDataIsStoredCorrectlyForTwoChannels) { +TEST_P(NetworkKernelTest, ValidationDataIsStoredCorrectlyForTwoChannels) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); @@ -173,7 +190,7 @@ TEST_F(NetworkKernelTest, ValidationDataIsStoredCorrectlyForTwoChannels) { EXPECT_TRUE(all_same); } -TEST_F(NetworkKernelTest, ValidationDataIsStoredCorrectlyForSmallMessageSize) { +TEST_P(NetworkKernelTest, ValidationDataIsStoredCorrectlyForSmallMessageSize) { const unsigned messageSize = 0; const unsigned looplength = 4; data->items.clear(); @@ -188,7 +205,7 @@ TEST_F(NetworkKernelTest, ValidationDataIsStoredCorrectlyForSmallMessageSize) { EXPECT_TRUE(all_same); } -TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength4) { +TEST_P(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength4) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); @@ -197,7 +214,7 @@ TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength4) { EXPECT_EQ(CHANNEL_WIDTH * 2 * 2, data->items[0].validationBuffer.size()); } -TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength1) { +TEST_P(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength1) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 1; data->items.clear(); @@ -206,7 +223,7 @@ TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength1) { EXPECT_EQ(CHANNEL_WIDTH * 2 * 2, data->items[0].validationBuffer.size()); } -TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForDifferentMessageSize) { +TEST_P(NetworkKernelTest, 
ValidationDataHasCorrectSizeForDifferentMessageSize) { const unsigned messageSize = 0; const unsigned looplength = 1; data->items.clear(); @@ -215,7 +232,7 @@ TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForDifferentMessageSize) { EXPECT_EQ(looplength * CHANNEL_WIDTH * 2 * 2, data->items[0].validationBuffer.size()); } -TEST_F(NetworkKernelTest, ValidationDataSingleItemWrongCheckFails) { +TEST_P(NetworkKernelTest, ValidationDataSingleItemWrongCheckFails) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const HOST_DATA_TYPE expected_data = static_cast(messageSize & 255); const unsigned looplength = 4; @@ -226,7 +243,7 @@ TEST_F(NetworkKernelTest, ValidationDataSingleItemWrongCheckFails) { EXPECT_FALSE(bm->validateOutputAndPrintError(*data)); } -TEST_F(NetworkKernelTest, ValidationDataWrongCheckFails) { +TEST_P(NetworkKernelTest, ValidationDataWrongCheckFails) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const HOST_DATA_TYPE expected_data = static_cast(messageSize & 255); const unsigned looplength = 4; @@ -236,7 +253,7 @@ TEST_F(NetworkKernelTest, ValidationDataWrongCheckFails) { EXPECT_FALSE(bm->validateOutputAndPrintError(*data)); } -TEST_F(NetworkKernelTest, ValidationDataCorrectCheckSuccessful) { +TEST_P(NetworkKernelTest, ValidationDataCorrectCheckSuccessful) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const HOST_DATA_TYPE expected_data = static_cast(messageSize & 255); const unsigned looplength = 4; @@ -246,7 +263,7 @@ TEST_F(NetworkKernelTest, ValidationDataCorrectCheckSuccessful) { EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); } -TEST_F(NetworkKernelTest, ValidationDataCorrectOneMessageSizeAfterExecution) { +TEST_P(NetworkKernelTest, ValidationDataCorrectOneMessageSizeAfterExecution) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); @@ 
-258,7 +275,7 @@ TEST_F(NetworkKernelTest, ValidationDataCorrectOneMessageSizeAfterExecution) { // This test is disabled because it does not work with the current implementation of the // external channels in software emulation. The different kernel executions will read // the old data from the channel file, which will lead to a failing validation! -TEST_F(NetworkKernelTest, DISABLED_ValidationDataCorrectTwoMessageSizesAfterExecution) { +TEST_P(NetworkKernelTest, DISABLED_ValidationDataCorrectTwoMessageSizesAfterExecution) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); @@ -268,7 +285,7 @@ TEST_F(NetworkKernelTest, DISABLED_ValidationDataCorrectTwoMessageSizesAfterExec EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); } -TEST_F(NetworkKernelTest, ValidationDataWrongTwoMessageSizesAfterExecution) { +TEST_P(NetworkKernelTest, ValidationDataWrongTwoMessageSizesAfterExecution) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); @@ -279,3 +296,9 @@ TEST_F(NetworkKernelTest, ValidationDataWrongTwoMessageSizesAfterExecution) { EXPECT_FALSE(bm->validateOutputAndPrintError(*data)); } + + +INSTANTIATE_TEST_CASE_P( + NetworkKernelParametrizedTests, + NetworkKernelTest, + ::testing::Values(hpcc_base::CommunicationType::intel_external_channels,hpcc_base::CommunicationType::cpu_only, hpcc_base::CommunicationType::pcie_mpi)); diff --git a/cmake/general_benchmark_build_setup.cmake b/cmake/general_benchmark_build_setup.cmake index 2624dcaa..28f696f3 100644 --- a/cmake/general_benchmark_build_setup.cmake +++ b/cmake/general_benchmark_build_setup.cmake @@ -16,6 +16,12 @@ else() set(header_default Yes) endif() +if(DEFINED COMMUNICATION_TYPE_SUPPORT_ENABLED) + set(comm_support_default ${COMMUNICATION_TYPE_SUPPORT_ENABLED}) +else() + set(comm_support_default No) +endif() + # Host code specific options 
set(DEFAULT_REPETITIONS 10 CACHE STRING "Default number of repetitions") set(DEFAULT_DEVICE -1 CACHE STRING "Index of the default device to use") @@ -29,8 +35,9 @@ set(USE_DEPRECATED_HPP_HEADER ${header_default} CACHE BOOL "Flag that indicates set(HPCC_FPGA_CONFIG ${HPCC_FPGA_CONFIG} CACHE FILEPATH "Configuration file that is used to overwrite the default configuration") set(NUM_REPLICATIONS 4 CACHE STRING "Number of times the kernels will be replicated") set(KERNEL_REPLICATION_ENABLED Yes CACHE INTERNAL "Enables kernel replication for the OpenCL kernel targets") +set(COMMUNICATION_TYPE_SUPPORT_ENABLED ${comm_support_default} CACHE INTERNAL "Enables the support for the selection of the communication type which has to be implemented by the specific benchmark") -mark_as_advanced(KERNEL_REPLICATION_ENABLED) +mark_as_advanced(KERNEL_REPLICATION_ENABLED COMMUNICATION_TYPE_SUPPORT_ENABLED) if (NOT KERNEL_REPLICATION_ENABLED) # Only define NUM_REPLICATIONS if kernel replications is enabled unset(NUM_REPLICATIONS) @@ -119,6 +126,11 @@ if (USE_DEPRECATED_HPP_HEADER) add_definitions(-DUSE_DEPRECATED_HPP_HEADER) endif() +# set the communication type flag if required +if (COMMUNICATION_TYPE_SUPPORT_ENABLED) + add_definitions(-DCOMMUNICATION_TYPE_SUPPORT_ENABLED) +endif() + # Set OpenCL version that should be used set(HPCC_FPGA_OPENCL_VERSION 200 CACHE STRING "OpenCL version that should be used for the host code compilation") mark_as_advanced(HPCC_FPGA_OPENCL_VERSION) diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake index 20d6506d..1d7e667f 100644 --- a/cmake/kernelTargets.cmake +++ b/cmake/kernelTargets.cmake @@ -1,6 +1,5 @@ set(COMPILER_INCLUDES "-I${CMAKE_BINARY_DIR}/src/common/" "-I${CMAKE_CURRENT_SOURCE_DIR}") -set(CLFLAGS --config ${XILINX_COMPILE_SETTINGS_FILE}) set(Vitis_EMULATION_CONFIG_UTIL $ENV{XILINX_VITIS}/bin/emconfigutil) @@ -45,6 +44,13 @@ function(generate_kernel_targets_xilinx) set(local_CLFLAGS ${CLFLAGS} -DXILINX_FPGA) list(APPEND 
local_CLFLAGS --report_dir=${xilinx_report_folder} --log_dir=${xilinx_report_folder}/logs) + string(REGEX MATCH "^.+\.tcl" is_tcl_script ${XILINX_COMPILE_SETTINGS_FILE}) + if (is_tcl_script) + set(CLFLAGS --hls.pre_tcl ${XILINX_COMPILE_SETTINGS_FILE}) + else() + set(CLFLAGS --config ${XILINX_COMPILE_SETTINGS_FILE}) + endif() + # build emulation config for device add_custom_command(OUTPUT ${EXECUTABLE_OUTPUT_PATH}/emconfig.json COMMAND ${Vitis_EMULATION_CONFIG_UTIL} -f ${FPGA_BOARD_NAME} --od ${EXECUTABLE_OUTPUT_PATH} @@ -164,7 +170,7 @@ function(generate_kernel_targets_intel) DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bitstream_f} ) add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bitstream_emulate_f} - COMMAND ${IntelFPGAOpenCL_AOC} ${source_f} -DEMULATE -DINTEL_FPGA ${COMPILER_INCLUDES} ${AOC_FLAGS} -legacy-emulator -march=emulator + COMMAND ${IntelFPGAOpenCL_AOC} ${source_f} -DEMULATE -DINTEL_FPGA ${COMPILER_INCLUDES} ${AOC_FLAGS} -march=emulator -o ${CMAKE_CURRENT_BINARY_DIR}/${bitstream_emulate_f} MAIN_DEPENDENCY ${source_f} DEPENDS ${CMAKE_BINARY_DIR}/src/common/parameters.h diff --git a/cmake/unitTestTargets.cmake b/cmake/unitTestTargets.cmake index 921cf5ab..2597017b 100644 --- a/cmake/unitTestTargets.cmake +++ b/cmake/unitTestTargets.cmake @@ -13,8 +13,9 @@ if (INTELFPGAOPENCL_FOUND) endif() target_compile_options(${HOST_EXE_NAME}_test_intel PRIVATE "${OpenMP_CXX_FLAGS}") foreach (kernel_target ${kernel_emulation_targets_intel}) + set(additional_commands "") string(REPLACE "_intel" ".aocx" kernel_name ${kernel_target}) - add_test(NAME test_unit_${kernel_target} COMMAND $ -f ${kernel_name} ${TEST_HOST_FLAGS} WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_unit_${kernel_target} COMMAND $ ${additional_commands} -f ${kernel_name} ${TEST_HOST_FLAGS} WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endforeach(kernel_target) endif() diff --git a/scripts/evaluation/parse_raw_to_csv.py b/scripts/evaluation/parse_raw_to_csv.py index 
5b57fd57..f5e437d2 100755 --- a/scripts/evaluation/parse_raw_to_csv.py +++ b/scripts/evaluation/parse_raw_to_csv.py @@ -9,10 +9,10 @@ import sys # Regular expressions for the raw output of all -fft_regex = "Version:\\s+(?P.+)\n(.*\n)+Batch\\sSize\\s+(?P\d+)\nFFT\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+res\.\\serror\\s+mach\.\\seps\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+avg\\s+best\n\\s+Time\\s+in\\s+s:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+GFLOPS:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" +fft_regex = "Version:\\s+(?P.+)\n(.*\n)+Batch\\sSize\\s+(?P\d+)\n(.*\n)FFT\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+res\.\\serror\\s+mach\.\\seps\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+avg\\s+best\n\\s+Time\\s+in\\s+s:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+GFLOPS:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" gemm_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GFLOPS\n\\s+(?P.+)\\s+(?P.+)\\s+(?P.+)" ra_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+Error:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GUOPS\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" -trans_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s*Maximum\\serror:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+calc\\s+calc\\s+FLOPS\\s+Net\\s+\\[B/s\\]\\s+Mem\\s+\\[B/s\\]\n\\s*avg:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s*best:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" +trans_regex = 
"Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s*Maximum\\serror:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+total\\s\\[s\\]\\s+transfer\\s\\[s\\]\\s+calc\\s\\[s\\]\\s+calc\\s+FLOPS\\s+Mem\\s+\\[B/s\\]\\s+PCIe\\s+\\[B/s\\]\n\\s*avg:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e|inf)+)\n\\s*best:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e|inf)+)" stream_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+\\d+\\s+\\((?P(\d|\.|\+|-|e)+)(.*\n)+Data\\sType\\s+(?P.+)\n(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Kernel\\sType\\s+(?P.+)\n(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+Function\\s+Best\\sRate\\sMB/s\\s+Avg\\stime\\ss\\s+Min\\stime\\s+Max\\stime\n\\s+Add\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Copy\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+PCI\\sread\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+PCI\\swrite\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Scale\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Triad\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" linpack_regex = 
"Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep.+\n\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+Method\\s+\\s+best\\s+mean\\s+GFLOPS(\\s*\n)\\s+total\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(\\s*\n)\\s+GEFA\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(\\s*\n)\\s+GESL\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" diff --git a/scripts/test_all.sh b/scripts/test_all.sh index fd8b710c..081e8474 100755 --- a/scripts/test_all.sh +++ b/scripts/test_all.sh @@ -12,7 +12,6 @@ # ./test_all.sh -DFPGA_BOARD_NAME=other_board # - SCRIPT_PATH=$( cd "$(dirname $0)"; pwd -P) PROJECT_ROOT=${SCRIPT_PATH}/.. @@ -24,8 +23,12 @@ TEST_LOG_FILE=${TEST_DIR}/lasttests.log BENCHMARKS=("b_eff" "FFT" "GEMM" "LINPACK" "PTRANS" "RandomAccess" "STREAM") -# Xilinx benchmarks: -#BENCHMARKS=("RandomAccess" "STREAM") +if [ "$1" != "inc" ]; then + echo "Clean build directory, use option 'inc' to prevent this!" + rm -rf ${TEST_DIR} +else + echo "Do incremental build based on previous run!" +fi mkdir -p $TEST_DIR rm -f $BUILD_LOG_FILE @@ -37,11 +40,16 @@ echo "Start building hosts code, tests and emulation kernel for all benchmarks." for bm in ${BENCHMARKS[@]}; do echo "Building $bm..." 
+ if [ -f ${TEST_DIR}/$bm/BUILD_SUCCESS ]; then + continue + else + rm -rf ${TEST_DIR}/$bm + fi cd $TEST_DIR mkdir -p $bm ret=0 cd $bm - cmake ${PROJECT_ROOT}/$bm -DDEFAULT_DEVICE=0 -DDEFAULT_PLATFORM=0 -DBLOCK_SIZE=32 $@ &>> $BUILD_LOG_FILE + cmake ${PROJECT_ROOT}/$bm -DDEFAULT_DEVICE=0 -DDEFAULT_PLATFORM=0 -DBLOCK_SIZE=32 &>> $BUILD_LOG_FILE ret=$(($ret + $?)) make -j 40 VERBOSE=1 all &>> $BUILD_LOG_FILE ret=$(($ret + $?)) @@ -50,12 +58,16 @@ for bm in ${BENCHMARKS[@]}; do echo "For more information see $BUILD_LOG_FILE" exit $ret fi + touch ${TEST_DIR}/$bm/BUILD_SUCCESS done echo "Start testing all benchmarks" for bm in ${BENCHMARKS[@]}; do echo "Testing $bm..." + if [ -f ${TEST_DIR}/$bm/TEST_SUCCESS ]; then + continue + fi cd $TEST_DIR ret=0 cd $bm @@ -85,13 +97,14 @@ for bm in ${BENCHMARKS[@]}; do ln -s kernel_output_ch3 kernel_input_ch2 cd .. fi - make XCL_EMULATION_MODE=sw_emu CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test &>> $TEST_LOG_FILE + make XCL_EMULATION_MODE=sw_emu CTEST_OUTPUT_ON_FAILURE=1 test &>> $TEST_LOG_FILE ret=$(($ret + $?)) if [ $ret -ne 0 ]; then echo "Failed testing $bm" echo "For more information see $TEST_LOG_FILE" exit $ret fi + touch ${TEST_DIR}/$bm/TEST_SUCCESS done echo "-----------" diff --git a/shared/include/communication_types.hpp b/shared/include/communication_types.hpp new file mode 100644 index 00000000..bb46bb8d --- /dev/null +++ b/shared/include/communication_types.hpp @@ -0,0 +1,124 @@ +/* +Copyright (c) 2021 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this 
 permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef HPCC_BASE_COMMUNICATION_TYPES_H_ +#define HPCC_BASE_COMMUNICATION_TYPES_H_ + +#define DEFAULT_COMM_TYPE "AUTO" + +#include + +namespace hpcc_base { + +/** + * @brief This enumeration contains all available communication types. They differ in the way data is exchanged between FPGAs. A special case is cpu_only which can be used to implement CPU references + * + */ +typedef enum _CommunicationType { + + /** + * @brief Communication using the external channels extension + * + */ + intel_external_channels, + + /** + * @brief Copy the data from FPGA to CPU and send it via MPI + * + */ + pcie_mpi, + + /** + * @brief Communication using the Streaming Message Interface + * + */ + smi, + + /** + * @brief Calculate the benchmark on CPU instead of FPGA + * + */ + cpu_only, + + /** + * @brief Indicates, that the use of the communication type is disabled + * + */ + unsupported, + + /** + * @brief Automatically detect communication type from kernel file name + * + */ + automatic + +} CommunicationType; + +static const std::map comm_to_str_map{ + {"IEC", CommunicationType::intel_external_channels}, + {"PCIE", CommunicationType::pcie_mpi}, + {"SMI", CommunicationType::smi}, + {"CPU", CommunicationType::cpu_only}, + {"UNSUPPORTED", CommunicationType::unsupported}, + {"AUTO", CommunicationType::automatic} + }; + +/** + * @brief Serializes an enum of type CommunicationType into a string. 
The resulting string can be used with the function retrieveCommunicationType to get back the enum. + * + * @param e the communication type that should be converted into a string + * @return std::string String representation of the communication type + */ +static std::string commToString(CommunicationType c) { + for (auto& entry : comm_to_str_map) { + if (entry.second == c) { + return entry.first; + } + } + throw std::runtime_error("Communication type could not be converted to string!"); +} + +/** + * @brief Deserializes a string into a enum of type CommunicationType. If the execution type is auto, the given kernel file name is used to determine the communication type. If this is not possible, an exception is thrown + * + * @param exe_name String serialization of the communication tpye + * @param kernel_filename the name of the used bitstream file + * @return CommunicationType the determined communication type. Will throw a runtime error if it is not possible to retrieve the execution type + */ +static CommunicationType retrieveCommunicationType(std::string comm_name, std::string kernel_filename) { + auto result = comm_to_str_map.find(comm_name); + if (result != comm_to_str_map.end()) { + if (result->second == CommunicationType::automatic) { + for (auto &comm_type: comm_to_str_map) { + if (kernel_filename.find(comm_type.first) != std::string::npos) { + return comm_type.second; + } + } + throw std::runtime_error("Communication type could not be autodetected from kernel_filename: " + kernel_filename); + } else { + return result->second; + } + } + throw std::runtime_error("Communication type could not be converted from string: " + comm_name); +} +} + +#endif \ No newline at end of file diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 1d3bd92b..f61a1e91 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -26,6 +26,7 @@ SOFTWARE. 
/* Project's headers */ #include "setup/fpga_setup.hpp" +#include "communication_types.hpp" #include "cxxopts.hpp" #include "parameters.h" @@ -115,6 +116,12 @@ class BaseSettings { */ bool testOnly; + /** + * @brief Type of inter-FPGA communication used + * + */ + CommunicationType communicationType; + /** * @brief Construct a new Base Settings object * @@ -134,6 +141,11 @@ class BaseSettings { kernelReplications(results.count("r") > 0 ? results["r"].as() : NUM_REPLICATIONS), #else kernelReplications(results.count("r") > 0 ? results["r"].as() : 1), +#endif +#ifdef COMMUNICATION_TYPE_SUPPORT_ENABLED + communicationType(retrieveCommunicationType(results["comm-type"].as(), results["f"].as())), +#else + communicationType(retrieveCommunicationType("UNSUPPORTED", results["f"].as())), #endif testOnly(static_cast(results.count("test"))) {} @@ -153,7 +165,8 @@ class BaseSettings { str_mpi_ranks = std::to_string(mpi_size); } return {{"Repetitions", std::to_string(numRepetitions)}, {"Kernel Replications", std::to_string(kernelReplications)}, - {"Kernel File", kernelFileName}, {"MPI Ranks", str_mpi_ranks}, {"Test Mode", std::to_string(testOnly)}}; + {"Kernel File", kernelFileName}, {"MPI Ranks", str_mpi_ranks}, {"Test Mode", testOnly ? "Yes" : "No"}, + {"Communication Type", commToString(communicationType)}}; } }; @@ -372,6 +385,10 @@ class HpccFpgaBenchmark { #ifdef NUM_REPLICATIONS ("r", "Number of used kernel replications", cxxopts::value()->default_value(std::to_string(NUM_REPLICATIONS))) +#endif +#ifdef COMMUNICATION_TYPE_SUPPORT_ENABLED + ("comm-type", "Used communication type for inter-FPGA communication", + cxxopts::value()->default_value(DEFAULT_COMM_TYPE)) #endif ("test", "Only test given configuration and skip execution and validation") ("h,help", "Print this help"); @@ -391,10 +408,7 @@ class HpccFpgaBenchmark { // Check parsed options and handle special cases if (result.count("f") <= 0) { - // Path to the kernel file is mandatory - exit if not given! 
- std::cerr << "Kernel file must be given! Aborting" << std::endl; - std::cout << options.help() << std::endl; - throw fpga_setup::FpgaSetupException("Mandatory option is missing"); + throw fpga_setup::FpgaSetupException("Mandatory option is missing! Use -h to show all available options. ERROR: Kernel file must be given with option -f!"); } // Create program settings from program arguments @@ -403,9 +417,7 @@ class HpccFpgaBenchmark { return sharedSettings; } catch (const cxxopts::OptionException& e) { - std::cerr << "Error while parsing input parameters: "<< e.what() << std::endl; - std::cout << options.help() << std::endl; - throw fpga_setup::FpgaSetupException("Input parameters could not be parsed"); + throw fpga_setup::FpgaSetupException("Input parameters could not be parsed! Use -h to show all available options. ERROR: " + std::string(e.what())); } } @@ -462,16 +474,21 @@ class HpccFpgaBenchmark { std::unique_ptr programSettings = parseProgramParameters(tmp_argc, tmp_argv); - auto usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, - programSettings->defaultDevice); + std::unique_ptr context; + std::unique_ptr program; + std::unique_ptr usedDevice; - auto context = std::unique_ptr(new cl::Context(*usedDevice)); - auto program = fpga_setup::fpgaSetup(context.get(), {*usedDevice}, - &programSettings->kernelFileName); + if (!programSettings->testOnly) { + usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, + programSettings->defaultDevice); - executionSettings = std::unique_ptr>(new ExecutionSettings(std::move(programSettings), std::move(usedDevice), - std::move(context), std::move(program))); + context = std::unique_ptr(new cl::Context(*usedDevice)); + program = fpga_setup::fpgaSetup(context.get(), {*usedDevice}, + &programSettings->kernelFileName); + } + executionSettings = std::unique_ptr>(new ExecutionSettings(std::move(programSettings), std::move(usedDevice), + std::move(context), std::move(program))); if 
(mpi_comm_rank == 0) { if (!checkInputParameters()) { std::cerr << "ERROR: Input parameter check failed!" << std::endl; @@ -508,9 +525,16 @@ class HpccFpgaBenchmark { executeBenchmark() { if (!benchmark_setup_succeeded) { - std::cerr << "Benchmark execution started without running the benchmark setup!" << std::endl; + std::cerr << "Benchmark execution started without successfully running the benchmark setup!" << std::endl; return false; } + if (executionSettings->programSettings->testOnly) { + if (mpi_comm_rank == 0) { + std::cout << "TEST MODE ENABLED: SKIP DATA GENERATION, EXECUTION, AND VALIDATION!" << std::endl; + std::cout << "SUCCESSFULLY parsed input parameters!" << std::endl; + } + return benchmark_setup_succeeded; + } if (mpi_comm_rank == 0) { std::cout << HLINE << "Start benchmark using the given configuration. Generating data..." << std::endl << HLINE; @@ -531,45 +555,39 @@ class HpccFpgaBenchmark { } bool validateSuccess = false; - if (!executionSettings->programSettings->testOnly) { - auto exe_start = std::chrono::high_resolution_clock::now(); - std::unique_ptr output = executeKernel(*data); + auto exe_start = std::chrono::high_resolution_clock::now(); + std::unique_ptr output = executeKernel(*data); #ifdef _USE_MPI_ - MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); #endif - std::chrono::duration exe_time = std::chrono::high_resolution_clock::now() - exe_start; + std::chrono::duration exe_time = std::chrono::high_resolution_clock::now() - exe_start; - if (mpi_comm_rank == 0) { - std::cout << "Execution Time: " << exe_time.count() << " s" << std::endl; - std::cout << HLINE << "Validate output..." 
<< std::endl - << HLINE; - } - - if (!executionSettings->programSettings->skipValidation) { - auto eval_start = std::chrono::high_resolution_clock::now(); - validateSuccess = validateOutputAndPrintError(*data); - std::chrono::duration eval_time = std::chrono::high_resolution_clock::now() - eval_start; + if (mpi_comm_rank == 0) { + std::cout << "Execution Time: " << exe_time.count() << " s" << std::endl; + std::cout << HLINE << "Validate output..." << std::endl + << HLINE; + } - if (mpi_comm_rank == 0) { - std::cout << "Validation Time: " << eval_time.count() << " s" << std::endl; - } - } - collectAndPrintResults(*output); + if (!executionSettings->programSettings->skipValidation) { + auto eval_start = std::chrono::high_resolution_clock::now(); + validateSuccess = validateOutputAndPrintError(*data); + std::chrono::duration eval_time = std::chrono::high_resolution_clock::now() - eval_start; if (mpi_comm_rank == 0) { - if (!validateSuccess) { - std::cerr << "ERROR: VALIDATION OF OUTPUT DATA FAILED!" << std::endl; - } - else { - std::cout << "Validation: SUCCESS!" << std::endl; - } + std::cout << "Validation Time: " << eval_time.count() << " s" << std::endl; } - } - else { - std::cout << "TEST MODE ENABLED: SKIP EXECUTION AND VALIDATION!" << std::endl; + collectAndPrintResults(*output); + + if (mpi_comm_rank == 0) { + if (!validateSuccess) { + std::cerr << "ERROR: VALIDATION OF OUTPUT DATA FAILED!" << std::endl; + } + else { + std::cout << "Validation: SUCCESS!" 
<< std::endl; + } } return validateSuccess; @@ -645,7 +663,12 @@ template std::ostream& operator<<(std::ostream& os, ExecutionSettings const& printedExecutionSettings){ std::string device_name; os << std::left; + if (!printedExecutionSettings.programSettings->testOnly) { printedExecutionSettings.device->getInfo(CL_DEVICE_NAME, &device_name); + } + else { + device_name = "TEST RUN: Not selected!"; + } for (auto k : printedExecutionSettings.programSettings->getSettingsMap()) { os << std::setw(2 * ENTRY_SPACE) << k.first << k.second << std::endl; } diff --git a/shared/setup/fpga_setup.cpp b/shared/setup/fpga_setup.cpp index 33a4b5c8..dd1ddd28 100644 --- a/shared/setup/fpga_setup.cpp +++ b/shared/setup/fpga_setup.cpp @@ -166,6 +166,10 @@ Sets up the given FPGA with the kernel in the provided file. // Create the Program from the AOCX file. cl::Program program(*context, deviceList, mybinaries, NULL, &err); ASSERT_CL(err) + + // Build the program (required for fast emulation on Intel) + ASSERT_CL(program.build()); + if (world_rank == 0) { std::cout << "Prepared FPGA successfully for global Execution!" << std::endl; @@ -298,6 +302,10 @@ choose a device. 
} else { chosenDeviceId = static_cast(world_rank % deviceList.size()); } + } else if (deviceList.size() == 1) { + chosenDeviceId = 0; + } else { + throw std::runtime_error("No devices found for selected Platform!"); } if (world_rank == 0) { diff --git a/shared/tests/hpcc_base_benchmark_test.cpp b/shared/tests/hpcc_base_benchmark_test.cpp index bc3c9bdb..a93a2a69 100644 --- a/shared/tests/hpcc_base_benchmark_test.cpp +++ b/shared/tests/hpcc_base_benchmark_test.cpp @@ -66,6 +66,7 @@ class SuccessBenchmark : public hpcc_base::HpccFpgaBenchmark::checkInputParameters(); + } + } + SuccessBenchmark() : HpccFpgaBenchmark(0, { nullptr}) {} }; @@ -106,13 +117,18 @@ class BaseHpccBenchmarkTest :public ::testing::Test { BaseHpccBenchmarkTest() { bm = std::unique_ptr(new SuccessBenchmark()); - bool success = bm->setupBenchmark(global_argc, global_argv); - EXPECT_TRUE(success); + bm->setupBenchmark(global_argc, global_argv); } }; +TEST_F(BaseHpccBenchmarkTest, SetupSucceedsForBenchmarkTest) { + bool success = bm->setupBenchmark(global_argc, global_argv); + EXPECT_TRUE(success); +} + + /** * Checks if the testing flag works as expected */ @@ -122,15 +138,14 @@ TEST_F(BaseHpccBenchmarkTest, AllExecutedWhenNotTestOnly) { EXPECT_EQ(bm->validateOutputcalled, 1); EXPECT_EQ(bm->executeKernelcalled, 1); EXPECT_EQ(bm->generateInputDatacalled, 1); - } -TEST_F(BaseHpccBenchmarkTest, GenerateInputExecutedWhenTestOnly) { +TEST_F(BaseHpccBenchmarkTest, NothingExecutedWhenTestOnly) { bm->getExecutionSettings().programSettings->testOnly = true; bm->executeBenchmark(); EXPECT_EQ(bm->validateOutputcalled, 0); EXPECT_EQ(bm->executeKernelcalled, 0); - EXPECT_EQ(bm->generateInputDatacalled, 1); + EXPECT_EQ(bm->generateInputDatacalled, 0); } TEST_F(BaseHpccBenchmarkTest, ExecutionSuccessWhenNotTestOnly) { @@ -139,11 +154,17 @@ TEST_F(BaseHpccBenchmarkTest, ExecutionSuccessWhenNotTestOnly) { } -TEST_F(BaseHpccBenchmarkTest, ExecutionFailsWhenTestOnly) { +TEST_F(BaseHpccBenchmarkTest, 
ExecutionFailsWhenTestOnlyAndSetupFails) { bm->getExecutionSettings().programSettings->testOnly = true; + bm->forceSetupFail = true; + bm->setupBenchmark(global_argc, global_argv); EXPECT_FALSE(bm->executeBenchmark()); } +TEST_F(BaseHpccBenchmarkTest, ExecutionSuccessWhenTestOnlyAndSetupSuccess) { + bm->getExecutionSettings().programSettings->testOnly = true; + EXPECT_TRUE(bm->executeBenchmark()); +} /** * Checks if using default platform and device is successful