diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1b1812b0..8d0bf4ae 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -9,7 +9,7 @@ default: tags: - jacamar before_script: - - module load intelFPGA_pro/20.4.0_max bittware_520n/20.4.0_max intel devel/CMake/3.15.3-GCCcore-8.3.0 + - module load intelFPGA_pro/20.4.0 bittware_520n/20.4.0_max intel devel/CMake/3.15.3-GCCcore-8.3.0 ### # @@ -38,7 +38,50 @@ build:STREAM: - scripts/**/* - cmake/**/* - .gitlab-ci.yml - + +build:STREAM_HP: + stage: build + script: + - rm -rf build + - mkdir -p build + - cd build + - cmake ../STREAM -DDATA_TYPE=half -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=32 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 + - make -j 40 all + artifacts: + paths: + - build/bin/stream_kernels_single_emulate.aocx + - build/bin/stream_kernels_emulate.aocx + - build/bin/STREAM_FPGA_intel + - build/bin/STREAM_FPGA_test_intel + only: + changes: + - STREAM/**/* + - shared/**/* + - scripts/**/* + - cmake/**/* + - .gitlab-ci.yml + +build:STREAM_DP: + stage: build + script: + - rm -rf build + - mkdir -p build + - cd build + - cmake ../STREAM -DDATA_TYPE=double -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=8 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 + - make -j 40 all + artifacts: + paths: + - build/bin/stream_kernels_single_emulate.aocx + - build/bin/stream_kernels_emulate.aocx + - build/bin/STREAM_FPGA_intel + - build/bin/STREAM_FPGA_test_intel + only: + changes: + - STREAM/**/* + - shared/**/* + - scripts/**/* + - cmake/**/* + - .gitlab-ci.yml build:RandomAccess: stage: build @@ -72,8 +115,11 @@ build:PTRANS: - make -j 40 all artifacts: paths: - - build/bin/transpose_diagonal_emulate.aocx - - build/bin/transpose_diagonal_c2_emulate.aocx + - build/bin/transpose_DIAG_IEC_emulate.aocx + - build/bin/transpose_PQ_IEC_emulate.aocx + - build/bin/transpose_PQ_PCIE_emulate.aocx + - build/bin/transpose_DIAG_PCIE_emulate.aocx + - build/bin/transpose_c2_DIAG_IEC_emulate.aocx - build/bin/Transpose_intel - build/bin/Transpose_test_intel only: @@ 
-90,11 +136,12 @@ build:LINPACK: - rm -rf build - mkdir -p build - cd build - - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 + - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DUSE_PCIE_MPI_COMMUNICATION=Yes - make -j 40 all artifacts: paths: - - build/bin/hpl_torus_emulate.aocx + - build/bin/hpl_torus_PCIE_emulate.aocx + - build/bin/hpl_torus_IEC_emulate.aocx - build/bin/Linpack_intel - build/bin/Linpack_test_intel only: @@ -147,6 +194,27 @@ build:GEMM_HP_REP2: - cmake/**/* - .gitlab-ci.yml +build:GEMM_DP_REP2: + stage: build + script: + - rm -rf build + - mkdir -p build + - cd build + - cmake ../GEMM -DDATA_TYPE=double -DNUM_REPLICATIONS=2 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 + - make -j 40 all + artifacts: + paths: + - build/bin/gemm_base_emulate.aocx + - build/bin/GEMM_intel + - build/bin/GEMM_test_intel + only: + changes: + - GEMM/**/* + - shared/**/* + - scripts/**/* + - cmake/**/* + - .gitlab-ci.yml + build:FFT: stage: build script: @@ -199,11 +267,7 @@ build:b_eff: - make -j 40 all artifacts: paths: - - build/bin/communication_bw520n_emulate.aocx - - build/bin/communication_bw520n_combined_loops_emulate.aocx - - build/bin/communication_bw520n_disable_pipelining_emulate.aocx - - build/bin/Network_intel - - build/bin/Network_test_intel + - build/bin/* only: changes: - b_eff/**/* @@ -223,7 +287,7 @@ test:STREAM: script: - cd build - cmake ../STREAM -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test + - make CTEST_OUTPUT_ON_FAILURE=1 test dependencies: - build:STREAM artifacts: @@ -238,13 +302,57 @@ test:STREAM: - cmake/**/* - .gitlab-ci.yml needs: ["build:STREAM"] + +test:STREAM_HP: + stage: test + script: + - cd build + - cmake ../STREAM -DDATA_TYPE=half -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=32 -DDEFAULT_PLATFORM=0 
-DDEFAULT_DEVICE=0 + - make CTEST_OUTPUT_ON_FAILURE=1 test + dependencies: + - build:STREAM_HP + artifacts: + when: on_failure + paths: + - build/Testing/Temporary/LastTest.log + only: + changes: + - STREAM/**/* + - shared/**/* + - scripts/**/* + - cmake/**/* + - .gitlab-ci.yml + needs: ["build:STREAM_HP"] + # Allow failure because: The intel emulator does not seem to support half precision kernel arguments (CL_INVALID_ARG_SIZE) + allow_failure: true + +test:STREAM_DP: + stage: test + script: + - cd build + - cmake ../STREAM -DDATA_TYPE=double -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=8 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 + - make CTEST_OUTPUT_ON_FAILURE=1 test + dependencies: + - build:STREAM_DP + artifacts: + when: on_failure + paths: + - build/Testing/Temporary/LastTest.log + only: + changes: + - STREAM/**/* + - shared/**/* + - scripts/**/* + - cmake/**/* + - .gitlab-ci.yml + needs: ["build:STREAM_DP"] test:RandomAccess: stage: test script: - cd build - cmake ../RandomAccess -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test + - make CTEST_OUTPUT_ON_FAILURE=1 test dependencies: - build:RandomAccess artifacts: @@ -275,7 +383,7 @@ test:PTRANS: - ln -s kernel_output_ch1 kernel_input_ch0 - ln -s kernel_output_ch3 kernel_input_ch2 - cd .. 
- - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test + - make CTEST_OUTPUT_ON_FAILURE=1 test dependencies: - build:PTRANS artifacts: @@ -296,7 +404,7 @@ test:LINPACK: script: - cd build - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 - - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test + - make CTEST_OUTPUT_ON_FAILURE=1 test dependencies: - build:LINPACK artifacts: @@ -317,7 +425,7 @@ test:GEMM: script: - cd build - cmake ../GEMM -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 - - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test + - make CTEST_OUTPUT_ON_FAILURE=1 test dependencies: - build:GEMM artifacts: @@ -338,7 +446,7 @@ test:GEMM_HP_REP2: script: - cd build - cmake ../GEMM -DNUM_REPLICATIONS=2 -DATA_TYPE=half -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 - - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test + - make CTEST_OUTPUT_ON_FAILURE=1 test dependencies: - build:GEMM_HP_REP2 artifacts: @@ -353,13 +461,36 @@ test:GEMM_HP_REP2: - cmake/**/* - .gitlab-ci.yml needs: ["build:GEMM_HP_REP2"] + # Allow failure because: The intel emulator does not seem to support half precision kernel arguments (CL_INVALID_ARG_SIZE) + allow_failure: true + +test:GEMM_DP_REP2: + stage: test + script: + - cd build + - cmake ../GEMM -DNUM_REPLICATIONS=2 -DATA_TYPE=double -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 + - make CTEST_OUTPUT_ON_FAILURE=1 test + dependencies: + - build:GEMM_DP_REP2 + artifacts: + when: on_failure + paths: + - build/Testing/Temporary/LastTest.log + only: + changes: + - GEMM/**/* + - shared/**/* + - scripts/**/* + - cmake/**/* + - .gitlab-ci.yml + needs: ["build:GEMM_DP_REP2"] test:FFT: stage: test script: - cd build - cmake ../FFT -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test + 
- make CTEST_OUTPUT_ON_FAILURE=1 test dependencies: - build:FFT artifacts: @@ -380,7 +511,7 @@ test:FFT_small: script: - cd build - cmake ../FFT -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOG_FFT_SIZE=4 -DNUM_REPLICATIONS=2 - - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test + - make CTEST_OUTPUT_ON_FAILURE=1 test dependencies: - build:FFT_small artifacts: @@ -411,7 +542,7 @@ test:b_eff: - ln -s kernel_output_ch1 kernel_input_ch0 - ln -s kernel_output_ch3 kernel_input_ch2 - cd .. - - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test + - make CTEST_OUTPUT_ON_FAILURE=1 test dependencies: - build:b_eff artifacts: diff --git a/FFT/src/device/fft1d_float_8.cl b/FFT/src/device/fft1d_float_8.cl index 778bec18..5a411aef 100644 --- a/FFT/src/device/fft1d_float_8.cl +++ b/FFT/src/device/fft1d_float_8.cl @@ -109,29 +109,31 @@ void fetch/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]["i // for iter iterations and one additional iteration to empty the last buffer for(unsigned k = 0; k < (iter + 1) * (N / POINTS); k++){ + + if (k < iter * ( N / POINTS)) { - float2 read_chunk[POINTS]; - - // Read the next 8 values from global memory - // in the last iteration just read garbage, but the data will not be forwarded over the pipes. - // This allows the use of memory bursts here. - // Also the data is shifted every N/POINTS/POINTS iterations - __attribute__((opencl_unroll_hint(POINTS))) - for(int j = 0; j < POINTS; j++){ - // Shift the data depending on the total FFT size - // Shifts every new chunk by one. If N/POINTS is a multiple of POINTS, the shifting is reduced to prevent mappings to the same bank. - unsigned shift = ((LOGN - LOGPOINTS - LOGPOINTS > 0) ? 
(k & (N/POINTS - 1)) >> (LOGN - LOGPOINTS - LOGPOINTS) : (k & (N/POINTS - 1))); - unsigned final_buffer_pos = (j + shift) & (POINTS - 1); - read_chunk[final_buffer_pos] = src[(k << LOGPOINTS) + j]; - } + float2 read_chunk[POINTS]; - // Write the shifted data into the memory buffer - __attribute__((opencl_unroll_hint(POINTS))) - for(int j = 0; j < POINTS; j++){ - unsigned local_i = k & (2 * N/POINTS - 1); - buf[local_i][j] = read_chunk[j]; - } + // Read the next 8 values from global memory + // in the last iteration just read garbage, but the data will not be forwarded over the pipes. + // This allows the use of memory bursts here. + // Also the data is shifted every N/POINTS/POINTS iterations + __attribute__((opencl_unroll_hint(POINTS))) + for(int j = 0; j < POINTS; j++){ + // Shift the data depending on the total FFT size + // Shifts every new chunk by one. If N/POINTS is a multiple of POINTS, the shifting is reduced to prevent mappings to the same bank. + unsigned shift = ((LOGN - LOGPOINTS - LOGPOINTS > 0) ? (k & (N/POINTS - 1)) >> (LOGN - LOGPOINTS - LOGPOINTS) : (k & (N/POINTS - 1))); + unsigned final_buffer_pos = (j + shift) & (POINTS - 1); + read_chunk[final_buffer_pos] = src[(k << LOGPOINTS) + j]; + } + // Write the shifted data into the memory buffer + __attribute__((opencl_unroll_hint(POINTS))) + for(int j = 0; j < POINTS; j++){ + unsigned local_i = k & (2 * N/POINTS - 1); + buf[local_i][j] = read_chunk[j]; + } + } if (k >= ( N / POINTS)) { float2x8 buf2x8; diff --git a/FFT/src/host/execution_default.cpp b/FFT/src/host/execution_default.cpp index bafd18c7..614560ae 100644 --- a/FFT/src/host/execution_default.cpp +++ b/FFT/src/host/execution_default.cpp @@ -29,7 +29,6 @@ SOFTWARE. 
#include /* External library headers */ -#include "CL/cl.hpp" #ifdef INTEL_FPGA #ifdef USE_HBM // CL_HETEROGENEOUS_INTELFPGA is defined here diff --git a/GEMM/Readme.md b/GEMM/Readme.md index ebe13caf..831194bd 100755 --- a/GEMM/Readme.md +++ b/GEMM/Readme.md @@ -21,6 +21,8 @@ If available, the benchmark will use `sgemm_` to validate the calculation instea For matrix sizes above 1000x1000 we recommend using such a library to speed up the benchmark execution. Using such a library will not change the performance result of the benchmark but might affect the reported error of the calculation. +For half precision support, the IEEE 754-based half-precision floating-point library by Christian Rau is used and a copy is provided with this code. + ## Build CMake is used as the build system. @@ -53,7 +55,7 @@ Next to the common configuration options given in the [README](../README.md) of Name | Default | Description | ---------------- |-------------|--------------------------------------| - `DATA_TYPE` | float (also supported: half, double) | Data type used for calculation | + `DATA_TYPE` | float (also supported: half, double) | Data type used for calculation. *Note: Currently, half-precision does not work on Intel FPGAs because they can not be passed as kernel argument per value.* | `DEFAULT_MATRIX_SIZE` | 8 | The default size of the quadratic matrices in blocks | `BLOCK_SIZE` | 512 | Block size used by the kernel for calculation | `GEMM_SIZE` | 8 | Block size of the fully unrolled matrix multiplication in registers | diff --git a/LINPACK/CHANGELOG b/LINPACK/CHANGELOG index cb070b06..3e86dedc 100644 --- a/LINPACK/CHANGELOG +++ b/LINPACK/CHANGELOG @@ -2,11 +2,17 @@ This file contains all changes made to the source code for each release. 
+## 2.3 +#### Changed: +- Refactored the code to support different execution kernels and data distributions +#### Added: +- FPGA kernel with communication via PCIe and MPI + ## 2.2 #### Added: - LU facotrization kernel w/o pivoting in quadratic torus -- Distributed calculation of GEL on CPU nodes and validation +- Distributed calculation of GESL on CPU nodes and validation ## 2.1 diff --git a/LINPACK/CMakeLists.txt b/LINPACK/CMakeLists.txt index bec71e7e..72ae009c 100755 --- a/LINPACK/CMakeLists.txt +++ b/LINPACK/CMakeLists.txt @@ -1,9 +1,9 @@ cmake_minimum_required(VERSION 3.1) -project(LINPACK VERSION 2.2) +project(LINPACK VERSION 2.3) set(USE_DEPRECATED_HPP_HEADER No) -set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size") +set(DEFAULT_MATRIX_SIZE 2 CACHE STRING "Default matrix size") set(LOCAL_MEM_BLOCK_LOG 5 CACHE STRING "Used to define the width and height of the block stored in local memory") set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers") set(NUM_REPLICATIONS 1 CACHE STRING "Number of times the matrix multiplication kernel will be replicated") @@ -11,6 +11,8 @@ set(TEST_UNIFORM No CACHE BOOL "All tests executed by CTest will be executed wit set(TEST_EMULATION Yes CACHE BOOL "All tests executed by CTest will be executed with emulation kernels") set(DISTRIBUTED_VALIDATION Yes CACHE BOOL "Use the distributed validation scheme instead of validation on rank 0") +set(COMMUNICATION_TYPE_SUPPORT_ENABLED Yes) + if (TEST_UNIFORM) set(TEST_HOST_FLAGS "--uniform") endif() diff --git a/LINPACK/Readme.md b/LINPACK/Readme.md index b2b65b6d..e0e276c1 100644 --- a/LINPACK/Readme.md +++ b/LINPACK/Readme.md @@ -20,13 +20,13 @@ The targets below can be used to build the benchmark and its kernels, where `VEN Only the LU facotrization without pivoting is implemented on FPGA and external channels are used to calculate the solution in a 2D torus of FPGAs. - The kernel targets are: + The kernel targets are listed below. 
`COMM_TYPE` can be IEC for Intel external channel (only available for vendor Intel) and PCIE for communication via PCIe and MPI. | Target | Description | | ------------------------------ | ---------------------------------------------- | - | hpl_torus_`VENDOR` | Synthesizes the kernel (takes several hours!) | - | hpl_torus_report_`VENDOR` | Just compile kernel and create reports | - | hpl_torus_emulate_`VENDOR` | Create a n emulation kernel | + | hpl_torus_`COMM_TYPE`_`VENDOR` | Synthesizes the kernel (takes several hours!) | + | hpl_torus_`COMM_TYPE`_report_`VENDOR` | Just compile kernel and create reports | + | hpl_torus_`COMM_TYPE`_emulate_`VENDOR` | Create a n emulation kernel | You can build for example the host application by running @@ -69,14 +69,12 @@ For more information on available input parameters run ./Linpack_intel -h Implementation of the LINPACK benchmark proposed in the HPCC benchmark suite for FPGA. - Version: 2.2 + Version: 2.3 MPI Version: 3.1 - Config. Time: Wed Apr 14 09:31:37 UTC 2021 - Git Commit: 60651eb-dirty Usage: - ./bin/Linpack_intel [OPTION...] + bin/Linpack_intel [OPTION...] -f, --file arg Kernel file name -n, arg Number of repetitions (default: 10) @@ -86,29 +84,34 @@ For more information on available input parameters run data types. --device arg Index of the device that has to be used. If not given you will be asked which device to use if there are - multiple devices available. (default: -1) + multiple devices available. (default: 0) --platform arg Index of the platform that has to be used. If not given you will be asked which platform to use if there - are multiple platforms available. (default: -1) - -r, arg Number of used kernel replications (default: 3) + are multiple platforms available. 
(default: 0) + -r, arg Number of used kernel replications (default: 1) + --comm-type arg Used communication type for inter-FPGA communication + (default: AUTO) --test Only test given configuration and skip execution and validation -h, --help Print this help -m, arg Matrix size in number of blocks in one dimension for a singe MPI rank. Total matrix will have size m * - sqrt(MPI_size) (default: 1024) + sqrt(MPI_size) (default: 2) -b, arg Log2 of the block size in number of values in one - dimension (default: 3) + dimension (default: 5) --uniform Generate a uniform matrix instead of a diagonally dominant. This has to be supported by the FPGA kernel! --emulation Use kernel arguments for emulation. This may be necessary to simulate persistent local memory on the FPGA +Available options for `--comm-type`: +- `IEC`: Intel external channels are used by the kernels for communication. +- `PCIE`: PCIe and MPI are used to exchange data between FPGAs over the CPU. To execute the unit and integration tests for Intel devices run - CL_CONTEXT_EMULATOR_DEVICE=1 ./Linpack_test_intel -f KERNEL_FILE_NAME + ./Linpack_test_intel -f KERNEL_FILE_NAME in the `bin` folder within the build directory. It will run an emulation of the kernel and execute some functionality tests. diff --git a/LINPACK/configs/Xilinx_U250_B8_SB3_R3.cmake b/LINPACK/configs/Xilinx_U250_B8_SB3_R3.cmake new file mode 100644 index 00000000..c5f40c60 --- /dev/null +++ b/LINPACK/configs/Xilinx_U250_B8_SB3_R3.cmake @@ -0,0 +1,25 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u250_xdma_201830_2" CACHE STRING "" FORCE) + +# LINPACK specific options +set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE) +set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE) +set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) +set(NUM_REPLICATIONS 3 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE) + +set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/pre_synthesis.u250.tcl" CACHE STRING "Compile settings file" FORCE) +set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_pcie.u250.generator.ini" CACHE STRING "Link settings file" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE) +set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE) + diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R2.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R2.cmake new file mode 100644 index 00000000..a9adfef5 --- /dev/null +++ b/LINPACK/configs/Xilinx_U280_B8_SB3_R2.cmake @@ -0,0 +1,25 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) + +# LINPACK specific options +set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE) +set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE) +set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE) + +set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE) +set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_pcie.ddr.generator.ini" CACHE STRING "Link settings file" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE) +set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE) + diff --git a/LINPACK/settings/pre_synthesis.u250.tcl b/LINPACK/settings/pre_synthesis.u250.tcl new file mode 100644 index 00000000..5a5a9373 --- /dev/null +++ b/LINPACK/settings/pre_synthesis.u250.tcl @@ -0,0 +1,6 @@ + +# Allow reordeing of math operations to increase parallelism +config_compile -unsafe_math_optimizations + +# Reduce number of memory ports to reduce resource uage for GMI +#config_interface -m_axi_auto_max_ports false \ No newline at end of file diff --git a/LINPACK/settings/settings.compile.xilinx.lu_blocked_pvt.ddr.ini b/LINPACK/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini similarity index 100% rename from LINPACK/settings/settings.compile.xilinx.lu_blocked_pvt.ddr.ini rename to LINPACK/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini diff --git 
a/LINPACK/settings/settings.compile.xilinx.hpl_torus_pcie.u250.ini b/LINPACK/settings/settings.compile.xilinx.hpl_torus_pcie.u250.ini new file mode 100644 index 00000000..a1334eb2 --- /dev/null +++ b/LINPACK/settings/settings.compile.xilinx.hpl_torus_pcie.u250.ini @@ -0,0 +1,4 @@ +kernel_frequency=300 + +[hls] +max_memory_ports=all diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.ddr.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.ddr.generator.ini new file mode 100644 index 00000000..e032e407 --- /dev/null +++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.ddr.generator.ini @@ -0,0 +1,34 @@ +[connectivity] +nk=lu:1 +nk=left_update:1 +nk=top_update:1 +nk=inner_update_mm0:$PY_CODE_GEN num_replications$ + +# slrs +# all special kernels are on SLR0. MM kernels are put on all remaining SLRs using RR +slr=lu_1:SLR0 +slr=left_update_1:SLR0 +slr=top_update_1:SLR0 +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +slr=inner_update_mm0_$PY_CODE_GEN i +1$:SLR$PY_CODE_GEN (i +1) % 3$ +# PY_CODE_GEN block_end + +# matrix ports +sp=lu_1.m_axi_gmem0:DDR[0] +sp=lu_1.m_axi_gmem1:DDR[0] +sp=lu_1.m_axi_gmem2:DDR[1] + +sp=top_update_1.m_axi_gmem0:DDR[0] +sp=top_update_1.m_axi_gmem1:DDR[0] +sp=top_update_1.m_axi_gmem2:DDR[0] + +sp=left_update_1.m_axi_gmem0:DDR[0] +sp=left_update_1.m_axi_gmem1:DDR[1] +sp=left_update_1.m_axi_gmem2:DDR[1] + +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:DDR[0] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:DDR[1] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:DDR[0] +# PY_CODE_GEN block_end + diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.u250.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.u250.generator.ini new file mode 100644 index 00000000..2da7f651 --- /dev/null +++ 
b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.u250.generator.ini @@ -0,0 +1,26 @@ +[connectivity] +nk=lu:1 +nk=left_update:1 +nk=top_update:1 +nk=inner_update_mm0:$PY_CODE_GEN num_replications$ + +# slrs +# all special kernels are on SLR1. MM kernels are put on all remaining SLRs using RR +slr=lu_1:SLR1 +slr=left_update_1:SLR1 +slr=top_update_1:SLR1 +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +slr=inner_update_mm0_$PY_CODE_GEN i +1$:SLR$PY_CODE_GEN (i +2) % 4$ +# PY_CODE_GEN block_end + +# matrix ports +sp=lu_1.m_axi_gmem:DDR[1] + +sp=top_update_1.m_axi_gmem:DDR[1] + +sp=left_update_1.m_axi_gmem:DDR[1] + +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem:DDR[1] +# PY_CODE_GEN block_end + diff --git a/LINPACK/settings/settings.link.xilinx.lu_blocked_pvt.ddr.ini b/LINPACK/settings/settings.link.xilinx.lu_blocked_pvt.ddr.ini deleted file mode 100644 index f2117a92..00000000 --- a/LINPACK/settings/settings.link.xilinx.lu_blocked_pvt.ddr.ini +++ /dev/null @@ -1,9 +0,0 @@ -[connectivity] -nk=gefa:1 - -# slrs -slr=gefa_1:SLR0 - -# matrix ports -sp=gefa_1.m_axi_gmem0:DDR[0] -sp=gefa_1.m_axi_gmem1:DDR[1] diff --git a/LINPACK/src/device/CMakeLists.txt b/LINPACK/src/device/CMakeLists.txt index 2c5c0bce..7a28cc56 100644 --- a/LINPACK/src/device/CMakeLists.txt +++ b/LINPACK/src/device/CMakeLists.txt @@ -2,18 +2,18 @@ include(${CMAKE_SOURCE_DIR}/../cmake/kernelTargets.cmake) if (INTELFPGAOPENCL_FOUND) - generate_kernel_targets_intel(hpl_torus) - add_test(NAME test_emulation_intel COMMAND Linpack_intel -f hpl_torus_emulate.aocx -m 2 -n 1 ${TEST_HOST_FLAGS} + generate_kernel_targets_intel(hpl_torus_IEC hpl_torus_PCIE) + add_test(NAME test_emulation_intel COMMAND Linpack_intel -f hpl_torus_PCIE_emulate.aocx -m 2 -n 1 ${TEST_HOST_FLAGS} WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) - add_test(NAME test_output_parsing_intel 
COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Linpack_intel ${TEST_HOST_FLAGS} -f hpl_torus_emulate.aocx -m 2 -n 1 + add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Linpack_intel ${TEST_HOST_FLAGS} -f hpl_torus_PCIE_emulate.aocx -m 2 -n 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() if (VITIS_FOUND) - generate_kernel_targets_xilinx(hpl_torus) - add_test(NAME test_emulation_xilinx COMMAND Linpack_xilinx -f hpl_torus_emulate.xclbin -m 2 -n 1 ${TEST_HOST_FLAGS} + generate_kernel_targets_xilinx(hpl_torus_PCIE) + add_test(NAME test_emulation_xilinx COMMAND Linpack_xilinx -f hpl_torus_PCIE_emulate.xclbin -m 2 -n 1 ${TEST_HOST_FLAGS} WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) - add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Linpack_xilinx ${TEST_HOST_FLAGS} -f hpl_torus_emulate.xclbin -m 2 -n 1 + add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Linpack_xilinx ${TEST_HOST_FLAGS} -f hpl_torus_PCIE_emulate.xclbin -m 2 -n 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() diff --git a/LINPACK/src/device/hpl_torus.cl b/LINPACK/src/device/hpl_torus_IEC.cl similarity index 100% rename from LINPACK/src/device/hpl_torus.cl rename to LINPACK/src/device/hpl_torus_IEC.cl diff --git a/LINPACK/src/device/hpl_torus_PCIE.cl b/LINPACK/src/device/hpl_torus_PCIE.cl new file mode 100644 index 00000000..0f31f0d4 --- /dev/null +++ b/LINPACK/src/device/hpl_torus_PCIE.cl @@ -0,0 +1,831 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to 
permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#include "parameters.h" + +// Current implementation uses __fpga_reg call to add additional registers for +#ifdef XILINX_FPGA +#define __fpga_reg(x) x +#endif + +#define BLOCK_SIZE (1 << LOCAL_MEM_BLOCK_LOG) +#define GEMM_BLOCK (1 << REGISTER_BLOCK_LOG) + +#ifdef INTEL_FPGA +#pragma OPENCL EXTENSION cl_intel_channels : enable +#endif + +typedef struct tmp_channel_chunk { DEVICE_DATA_TYPE data[GEMM_BLOCK];} ch_chunk_t; + +/** +Executes a single step of the LU factorization. + +This method takes a partially solved 8x8 matrix and calculates the next step of the LU factorization +The method needs 7 (GEMM_BLOCK-1) calls to perform a single LU factorization. This is done to reduce resource usage, +since all upcomng calls are anyway depending on the results of the previous call and there is no way +to pipeline multiple executions. + +A is the input block that might be partially computed +step is the current step and must be a value between 0 to GEMM_BLOCK-2. 
After step GEMM_BLOCK-2, the block is factorized + */ +void +lu_block(const DEVICE_DATA_TYPE A[GEMM_BLOCK][GEMM_BLOCK], const int step, DEVICE_DATA_TYPE A_out[GEMM_BLOCK][GEMM_BLOCK]) { + + // Read current line from input + DEVICE_DATA_TYPE line[GEMM_BLOCK]; + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i=0; i < GEMM_BLOCK; i++) { + line[i] = A[step][i]; + } + + // calculate the inverse of the diagonal element for the scaling + DEVICE_DATA_TYPE inv_scale_a = -1.0 / line[step]; + + // Scale the current row + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i=0; i < GEMM_BLOCK; i++) { + if (i > step) { + line[i] = line[i] * inv_scale_a; + } + } + line[step] = inv_scale_a; + + // Update all rows fully unrolled + // The multiply adds are fully independent + //__attribute__((opencl_unroll_hint(GEMM_BLOCK))) + // Unrolling disabled for this loop to save resources + for (int j = 0; j < GEMM_BLOCK; j++) { + DEVICE_DATA_TYPE curr_scale = A[j][step]; + // Update a single row. If it is already updated, just write back the value, if it is the current row + // write back the value in "line", else update the value + if (j != step) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i = 0; i < GEMM_BLOCK; i++) { + A_out[j][i] = (i > step && j > step) ? A[j][i] + line[i] * curr_scale : A[j][i]; + } + } + else { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i = 0; i < GEMM_BLOCK; i++) { + A_out[j][i] = line[i]; + } + } + } +} + +/** +This function can be used to update blocks using with three different operations. +It will execute the update for a single row in the block. 
The update is completed after GEMM_BLOCK calls of this +update function + +operation_type: 0 for top = the top row of blocks will need a triangular MM + 1 for left = the left column of blocks will need a triangular MM, matrices have to be transposed + 2 for inner block == all inner blocks will be updated with a MM + */ +void +update_block(const DEVICE_DATA_TYPE a[GEMM_BLOCK][GEMM_BLOCK], + const DEVICE_DATA_TYPE top[GEMM_BLOCK], + const DEVICE_DATA_TYPE left_or_lu[GEMM_BLOCK], + DEVICE_DATA_TYPE out[GEMM_BLOCK][GEMM_BLOCK], + const int current_row, + const int operation_type) { + + // Define different operation types of function + const int op_top = 0; + const int op_left = 1; + const int op_inner = 2; + + // Transpose the input matrices if the target is a left block + DEVICE_DATA_TYPE current_block[GEMM_BLOCK][GEMM_BLOCK] __attribute__((register, xcl_array_partition(complete, 1), xcl_array_partition(complete, 2))); + if (operation_type == op_left) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int ii =0; ii < GEMM_BLOCK; ii++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + current_block[ii][jj] = __fpga_reg(a[jj][ii]); + } + } + } + else { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int ii =0; ii < GEMM_BLOCK; ii++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + current_block[ii][jj] = __fpga_reg(a[ii][jj]); + } + } + } + + // Generate the first scalling array depending on the operation type + DEVICE_DATA_TYPE scale_row[GEMM_BLOCK] __attribute__((register, xcl_array_partition(complete, 1))); + if (operation_type == op_inner) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + scale_row[jj] = top[jj]; + } + } + else { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + scale_row[jj] = current_block[current_row][jj]; + } + } + if (operation_type == op_top) 
{ + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + scale_row[jj] *= left_or_lu[current_row]; + } + } + + DEVICE_DATA_TYPE tmp[GEMM_BLOCK][GEMM_BLOCK] __attribute__((register, xcl_array_partition(complete, 1), xcl_array_partition(complete, 2))); + // scale all values with the pre calculated scaling array and the second input + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int ii =0; ii < GEMM_BLOCK; ii++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + // left_or_lu_block are stored transposed to simplify the data access here + tmp[ii][jj] = current_block[ii][jj] + scale_row[jj] * left_or_lu[ii]; + } + } + + // overwrite results that were calculated altough they are not needed for the triangular operations left and top + if (operation_type != op_inner) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int ii =0; ii < GEMM_BLOCK; ii++) { + if (ii == current_row) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + tmp[ii][jj] = scale_row[jj]; + } + } + else if (ii < current_row) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + tmp[ii][jj] = current_block[ii][jj]; + } + } + } + } + + // write result back and transpose if necessary + if (operation_type == op_left) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int ii =0; ii < GEMM_BLOCK; ii++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + out[ii][jj] = __fpga_reg(tmp[jj][ii]); + } + } + } + else { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int ii =0; ii < GEMM_BLOCK; ii++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + out[ii][jj] = __fpga_reg(tmp[ii][jj]); + } + } + } +} + +__attribute__((uses_global_work_offset(0))) +__kernel +void +lu(__global DEVICE_DATA_TYPE* restrict a, + __global 
DEVICE_DATA_TYPE* restrict a_block_trans, + __global DEVICE_DATA_TYPE* restrict a_block, + const uint block_col, + const uint block_row, + const uint blocks_per_row) { + + local DEVICE_DATA_TYPE a_buffer[BLOCK_SIZE/GEMM_BLOCK][BLOCK_SIZE/GEMM_BLOCK][GEMM_BLOCK][GEMM_BLOCK] __attribute((xcl_array_partition(complete, 3),xcl_array_partition(complete, 4))); + + // Store current row and column in separate buffers for + // easier access in the deep pipeline + // need to be declared as local to prevent the compiler from + local DEVICE_DATA_TYPE top_buffer[BLOCK_SIZE/GEMM_BLOCK][GEMM_BLOCK] __attribute((xcl_array_partition(complete, 2))); + local DEVICE_DATA_TYPE left_buffer[BLOCK_SIZE/GEMM_BLOCK][GEMM_BLOCK] __attribute((xcl_array_partition(complete, 2))); + + // Load block to local memory + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj]; + } + } + } + } + + // For each row in the matrix update whole matrix. 
+ // The iterations depend on each other, so loop pipelining is disabled here + #pragma disable_loop_pipelining + for (int gk = 0; gk < BLOCK_SIZE; gk++) { + + int k = gk / GEMM_BLOCK; + int kk = gk & (GEMM_BLOCK - 1); + + // Read in current LU block + DEVICE_DATA_TYPE lu_a_buffer_in[GEMM_BLOCK][GEMM_BLOCK]; + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int ii =0; ii < GEMM_BLOCK; ii++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + lu_a_buffer_in[ii][jj] = a_buffer[k][k][ii][jj]; + } + } + + DEVICE_DATA_TYPE lu_a_buffer_out[GEMM_BLOCK][GEMM_BLOCK]; + DEVICE_DATA_TYPE lu_a_buffer_out_row[GEMM_BLOCK]; + DEVICE_DATA_TYPE lu_a_buffer_out_col[GEMM_BLOCK]; + // Calculate next row and column of LU factorization and store in local memory buffer + lu_block(lu_a_buffer_in, kk, lu_a_buffer_out); + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int ii =0; ii < GEMM_BLOCK; ii++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + a_buffer[k][k][ii][jj] = lu_a_buffer_out[ii][jj]; + } + } + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + lu_a_buffer_out_row[jj] = lu_a_buffer_out[kk][jj]; + } + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + lu_a_buffer_out_col[jj] = lu_a_buffer_out[jj][kk]; + } + + // The update pipeline does not need to be executed for the last + // row of blocks + if (gk < BLOCK_SIZE - GEMM_BLOCK) { + + // update all left blocks + for (int tj = 1; tj < BLOCK_SIZE/GEMM_BLOCK; tj++) { + + int j = k; + int i = tj; + + if (i > k) { + // copy the correct block in the second input buffer + // this depends on the operations that has to be executed + DEVICE_DATA_TYPE second_input[GEMM_BLOCK]; + + // left matrix block will be calculated + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + second_input[jj] = 
__fpga_reg(lu_a_buffer_out_row[jj]); + } + DEVICE_DATA_TYPE a_input[GEMM_BLOCK][GEMM_BLOCK] __attribute__((xcl_array_partition(complete, 1),xcl_array_partition(complete, 2))); + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int ii =0; ii < GEMM_BLOCK; ii++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj = 0; jj < GEMM_BLOCK; jj++) { + a_input[ii][jj] = __fpga_reg(a_buffer[i][j][ii][jj]); + } + } + DEVICE_DATA_TYPE top_input[GEMM_BLOCK]; + DEVICE_DATA_TYPE out[GEMM_BLOCK][GEMM_BLOCK] __attribute__((register, xcl_array_partition(complete, 1), xcl_array_partition(complete, 2))); + update_block(a_input, + top_input, + second_input, + out, + kk, + 1); + + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int ii =0; ii < GEMM_BLOCK; ii++) { + left_buffer[i][ii] = __fpga_reg(out[ii][kk]); + } + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int ii =0; ii < GEMM_BLOCK; ii++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj = 0; jj < GEMM_BLOCK; jj++) { + a_buffer[i][j][ii][jj] = __fpga_reg(out[ii][jj]); + } + } + } + } + + // Update all other blocks with the new calculated row and column + // First update top blocks, then update left blocks, then all inner blocks + // ti == 0: top blocks + // ti == 1: left blocks + // ti > 1: inner blocks +#ifdef INTEL_FPGA + #pragma loop_coalesce + #pragma ivdep safelen(BLOCK_SIZE/GEMM_BLOCK - 1) +#endif + for (int ti = 0; ti < BLOCK_SIZE/GEMM_BLOCK - k; ti++) { +#ifdef INTEL_FPGA + #pragma ivdep +#endif + for (int tj = 1; tj < BLOCK_SIZE/GEMM_BLOCK; tj++) { + + int j = tj; + int i = ti + k; + // always execute the pipeline for whole rows of matrix blocks. + // Only execute update for blocks that are required. 
+ // This helps to keep constant latencies between data dependencies of the pipeline stages + if ((i > k || ti == 0) && j > k ) { + + // copy the correct block in the second input buffer + // this depends on the operations that has to be executed + DEVICE_DATA_TYPE second_input[GEMM_BLOCK]; + if (ti == 0) { + // top matrix block will be calculated + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + second_input[jj] = __fpga_reg(lu_a_buffer_out_col[jj]); + } + } + else { + // inner block will be calculated + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + second_input[jj] = __fpga_reg(left_buffer[i][jj]); + } + } + DEVICE_DATA_TYPE a_input[GEMM_BLOCK][GEMM_BLOCK] __attribute__((xcl_array_partition(complete, 1),xcl_array_partition(complete, 2))); + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int ii =0; ii < GEMM_BLOCK; ii++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj = 0; jj < GEMM_BLOCK; jj++) { + a_input[ii][jj] = __fpga_reg(a_buffer[i][j][ii][jj]); + } + } + DEVICE_DATA_TYPE top_input[GEMM_BLOCK]; + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + top_input[jj] = __fpga_reg(top_buffer[j][jj]); + } + DEVICE_DATA_TYPE out[GEMM_BLOCK][GEMM_BLOCK] __attribute__((register, xcl_array_partition(complete, 1), xcl_array_partition(complete, 2))); + update_block(a_input, + top_input, + second_input, + out, + kk, + (ti == 0) ? 
0 : 2); + if (ti == 0) { + // only update in the first row + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + top_buffer[j][jj] = __fpga_reg(out[kk][jj]); + } + } + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int ii =0; ii < GEMM_BLOCK; ii++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj = 0; jj < GEMM_BLOCK; jj++) { + a_buffer[i][j][ii][jj] = __fpga_reg(out[ii][jj]); + } + } + } + } + } + } + } + + // Store block to global memory + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj]; + } + } + } + } + // Store current block in global memory also transposed to allow easier access from the top kernel + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + a_block_trans[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[j][i][jj][ii]; + } + } + } + } + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + a_block[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj]; + } + } + } + } +} + +/** +Update the blocks to the right of the current LU block + + */ + __attribute__((uses_global_work_offset(0))) +__kernel +void top_update(__global DEVICE_DATA_TYPE* restrict a, 
+ __global DEVICE_DATA_TYPE* restrict top_block, + __global DEVICE_DATA_TYPE* restrict lu_global_buffer_transposed, + const uint is_first_block, + const uint block_col, + const uint block_row, + const uint blocks_per_row) { + + // Store current block in local memory + local DEVICE_DATA_TYPE a_buffer[BLOCK_SIZE/GEMM_BLOCK][BLOCK_SIZE/GEMM_BLOCK][GEMM_BLOCK][GEMM_BLOCK] __attribute((xcl_array_partition(complete, 3),xcl_array_partition(complete, 4))); + + + // Load block to local memory + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj]; + } + } + } + } + + // For each row in the matrix update whole matrix. + // The iterations depend on each other, so loop pipelining is disabled here + #pragma disable_loop_pipelining + for (int gk = 0; gk < BLOCK_SIZE; gk++) { + + int k = gk / GEMM_BLOCK; + int kk = gk & (GEMM_BLOCK - 1); + + DEVICE_DATA_TYPE current_lu_col[BLOCK_SIZE/GEMM_BLOCK][GEMM_BLOCK] __attribute((xcl_array_partition(complete, 2))); + DEVICE_DATA_TYPE current_row[BLOCK_SIZE/GEMM_BLOCK][GEMM_BLOCK] __attribute((xcl_array_partition(complete, 2))); + DEVICE_DATA_TYPE current_scale; + + for (int col = 0; col < BLOCK_SIZE / GEMM_BLOCK; col++) { + ch_chunk_t col_in; + + DEVICE_DATA_TYPE scale_chunk[GEMM_BLOCK] __attribute((xcl_array_partition(complete, 1))); + + // get current row chunk + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i =0; i < GEMM_BLOCK; i++) { + scale_chunk[i] = a_buffer[k][col][kk][i]; + } + + // if current column data is still available read it in and store it in buffer + if (col < BLOCK_SIZE / GEMM_BLOCK - k) { + // Load LU data from global memory instead of receiving 
it from the channel + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i=0; i < GEMM_BLOCK; i++) { + col_in.data[i] = lu_global_buffer_transposed[gk * BLOCK_SIZE + (col + k) * GEMM_BLOCK + i]; + } + if (col == 0) { + current_scale = col_in.data[kk]; + } + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i =0; i < GEMM_BLOCK; i++) { + current_lu_col[col][i] = (col > 0 || i > kk) ? col_in.data[i] : 0.f; + } + } + + // scale current row chunk with the rows scale factor received over the external channel + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i =0; i < GEMM_BLOCK; i++) { + scale_chunk[i] = scale_chunk[i] * current_scale; + } + + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i =0; i < GEMM_BLOCK; i++) { + current_row[col][i] = scale_chunk[i]; + } + + // Update local memory buffer with chunk + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i =0; i < GEMM_BLOCK; i++) { + a_buffer[k][col][kk][i] = scale_chunk[i]; + } + } + + // Update all remaining rows + #pragma loop_coalesce + for (int row = 0; row < BLOCK_SIZE/GEMM_BLOCK - k; row++) { + // Update whole rows! 
+ __attribute__((xcl_pipeline_loop(1))) + for (int curr_col = 0; curr_col < BLOCK_SIZE/GEMM_BLOCK; curr_col++) { + DEVICE_DATA_TYPE colbuf[GEMM_BLOCK]; + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int j=0; j < GEMM_BLOCK; j++) { + colbuf[j] = current_lu_col[row][j]; + } + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i = 0; i < GEMM_BLOCK; i++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int j=0; j < GEMM_BLOCK; j++) { + a_buffer[row + k][curr_col][i][j] += colbuf[i] * current_row[curr_col][j]; + } + } + } + } + } + + // Store block to global memory + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj]; + } + } + } + } + // Store current block separately for easier transmission over host + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + top_block[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj]; + } + } + } + } +} + +/** +Update the blocks below the current LU block + + */ + __attribute__((uses_global_work_offset(0))) +__kernel +void left_update(__global DEVICE_DATA_TYPE* restrict a, + __global DEVICE_DATA_TYPE* restrict left_block, + __global DEVICE_DATA_TYPE* restrict lu_global_buffer, + const uint is_first_block, + const uint block_col, + const uint block_row, + const uint blocks_per_row) { + + // Store current block in local memory + local DEVICE_DATA_TYPE 
a_buffer[BLOCK_SIZE/GEMM_BLOCK][BLOCK_SIZE/GEMM_BLOCK][GEMM_BLOCK][GEMM_BLOCK] __attribute((xcl_array_partition(complete, 3),xcl_array_partition(complete, 4))); + + // Load block to local memory + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj]; + } + } + } + } + + // For each row in the matrix update whole matrix. + // The iterations depend on each other, so loop pipelining is disabled here + #pragma disable_loop_pipelining + for (int gk = 0; gk < BLOCK_SIZE; gk++) { + + int k = gk / GEMM_BLOCK; + int kk = gk & (GEMM_BLOCK - 1); + + DEVICE_DATA_TYPE current_lu_row[BLOCK_SIZE/GEMM_BLOCK][GEMM_BLOCK] __attribute((xcl_array_partition(complete, 2))); + DEVICE_DATA_TYPE current_col[BLOCK_SIZE/GEMM_BLOCK][GEMM_BLOCK] __attribute((xcl_array_partition(complete, 2))); + + for (int col = 0; col < BLOCK_SIZE / GEMM_BLOCK; col++) { + DEVICE_DATA_TYPE chunk[GEMM_BLOCK]; + // get current row chunk + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i =0; i < GEMM_BLOCK; i++) { + chunk[i] = a_buffer[col][k][i][kk]; + } + + // Store chunk for later update + ch_chunk_t col_out; + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i =0; i < GEMM_BLOCK; i++) { + current_col[col][i] = chunk[i]; + } + + ch_chunk_t row_in; + + // if current column data is still available read it in and store it in buffer + if (col < BLOCK_SIZE / GEMM_BLOCK - k) { + // Load LU data from global memory + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i=0; i < GEMM_BLOCK; i++) { + row_in.data[i] = lu_global_buffer[gk * BLOCK_SIZE + (col + k) * GEMM_BLOCK + i]; + } + 
__attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i =0; i < GEMM_BLOCK; i++) { + current_lu_row[col][i] = (col > 0 || i > kk) ? row_in.data[i] : 0.f; + } + } + } + + // Update all rows + #pragma loop_coalesce + // Update only remaining row chunks + #pragma ivdep + for (int curr_col = 0; curr_col < BLOCK_SIZE/GEMM_BLOCK - k; curr_col++) { +#ifdef INTEL_FPGA + #pragma ivdep +#endif +#ifdef XILINX_FPGA + __attribute__((xcl_pipeline_loop(1))) +#endif + for (int row = 0; row < BLOCK_SIZE/GEMM_BLOCK; row++) { + DEVICE_DATA_TYPE colbuf[GEMM_BLOCK]; + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int j=0; j < GEMM_BLOCK; j++) { + colbuf[j] = current_col[row][j]; + } + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i = 0; i < GEMM_BLOCK; i++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int j=0; j < GEMM_BLOCK; j++) { + a_buffer[row][curr_col + k][i][j] += current_lu_row[curr_col][j] * colbuf[i]; + } + } + } + } + } + + // Store block to global memory + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj]; + } + } + } + } + + // Store current block separately for easier transmission over host + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + left_block[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[j][i][jj][ii]; + } + } + } + } +} + +// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] + 
+/** +Update the inner blocks using the left and right column and rows + + */ + __attribute__((uses_global_work_offset(0))) +__kernel +void inner_update_mm/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE* restrict a, + __global DEVICE_DATA_TYPE* restrict left_global_buffer, + __global DEVICE_DATA_TYPE* restrict top_global_buffer, + const uint block_col, + const uint block_row, + const uint blocks_per_row) { + + // Store current block in local memory + local DEVICE_DATA_TYPE a_buffer[BLOCK_SIZE/GEMM_BLOCK][BLOCK_SIZE/GEMM_BLOCK][GEMM_BLOCK][GEMM_BLOCK] __attribute((xcl_array_partition(complete, 3),xcl_array_partition(complete, 4))); + local DEVICE_DATA_TYPE top_buffer[BLOCK_SIZE/GEMM_BLOCK][BLOCK_SIZE/GEMM_BLOCK][GEMM_BLOCK][GEMM_BLOCK] __attribute((xcl_array_partition(complete, 3),xcl_array_partition(complete, 4))); + local DEVICE_DATA_TYPE left_buffer[BLOCK_SIZE/GEMM_BLOCK][BLOCK_SIZE/GEMM_BLOCK][GEMM_BLOCK][GEMM_BLOCK] __attribute((xcl_array_partition(complete, 3),xcl_array_partition(complete, 4))); + + // Load blocks to local memory + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj]; + } + } + } + } + + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + top_buffer[i][j][ii][jj] = top_global_buffer[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj]; + } + } + } + } + + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j 
=0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + left_buffer[i][j][ii][jj] = left_global_buffer[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj]; + } + } + } + } + + // Update whole block + #pragma ivdep array(a_buffer) safelen((BLOCK_SIZE/GEMM_BLOCK)*(BLOCK_SIZE/GEMM_BLOCK)) + for (int c = 0; c < (BLOCK_SIZE/GEMM_BLOCK) * (BLOCK_SIZE/GEMM_BLOCK) * (BLOCK_SIZE/GEMM_BLOCK); c++) { + + int mcol = c / ((BLOCK_SIZE/GEMM_BLOCK)*(BLOCK_SIZE/GEMM_BLOCK)); + int row = (c / (BLOCK_SIZE/GEMM_BLOCK)) & ((BLOCK_SIZE/GEMM_BLOCK) - 1); + int curr_col = c & ((BLOCK_SIZE/GEMM_BLOCK) - 1); + + DEVICE_DATA_TYPE top_sub[GEMM_BLOCK][GEMM_BLOCK]; + DEVICE_DATA_TYPE left_sub[GEMM_BLOCK][GEMM_BLOCK]; + + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i = 0; i < GEMM_BLOCK; i++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int j=0; j < GEMM_BLOCK; j++) { + top_sub[i][j] = top_buffer[mcol][curr_col][i][j]; + } + } + + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i = 0; i < GEMM_BLOCK; i++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int j=0; j < GEMM_BLOCK; j++) { + left_sub[i][j] = left_buffer[mcol][row][i][j]; + } + } + + DEVICE_DATA_TYPE result_sub[GEMM_BLOCK][GEMM_BLOCK]; + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i = 0; i < GEMM_BLOCK; i++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int j = 0; j < GEMM_BLOCK; j++) { + // Calculate sum of whole column and only write it back once + DEVICE_DATA_TYPE sum = 0.0; + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int k=0; k < GEMM_BLOCK; k++) { + sum += left_sub[k][i] * top_sub[k][j]; + } + result_sub[i][j] = sum; + } + } + + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int i = 0; i < GEMM_BLOCK; i++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int j=0; j < GEMM_BLOCK; j++) { + a_buffer[row][curr_col][i][j] += 
__fpga_reg(result_sub[i][j]); + } + } + } + + // Store block to global memory + #pragma loop_coalesce + for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { + for (int ii =0; ii < GEMM_BLOCK; ii++) { + for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) + for (int jj =0; jj < GEMM_BLOCK; jj++) { + a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj]; + } + } + } + } +} + +// PY_CODE_GEN block_end diff --git a/LINPACK/src/host/CMakeLists.txt b/LINPACK/src/host/CMakeLists.txt index ee126aca..d8feb95d 100755 --- a/LINPACK/src/host/CMakeLists.txt +++ b/LINPACK/src/host/CMakeLists.txt @@ -1,6 +1,6 @@ add_subdirectory(../../../shared ${CMAKE_BINARY_DIR}/lib/hpccbase) -set(HOST_SOURCE execution_blocked_pvt.cpp linpack_benchmark.cpp gmres.c blas.c) +set(HOST_SOURCE linpack_benchmark.cpp gmres.c blas.c) set(HOST_EXE_NAME Linpack) set(LIB_NAME lp) diff --git a/LINPACK/src/host/execution.h b/LINPACK/src/host/execution.h index 153d4be0..a4ebf6f3 100644 --- a/LINPACK/src/host/execution.h +++ b/LINPACK/src/host/execution.h @@ -27,7 +27,6 @@ SOFTWARE. #include /* External library headers */ -#include "CL/cl2.hpp" #include "parameters.h" #include "linpack_benchmark.hpp" diff --git a/LINPACK/src/host/execution_blocked_pvt.cpp b/LINPACK/src/host/execution_types/execution_iec.hpp similarity index 99% rename from LINPACK/src/host/execution_blocked_pvt.cpp rename to LINPACK/src/host/execution_types/execution_iec.hpp index 7ad5499a..ea426799 100644 --- a/LINPACK/src/host/execution_blocked_pvt.cpp +++ b/LINPACK/src/host/execution_types/execution_iec.hpp @@ -19,9 +19,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ - -/* Related header files */ -#include "execution.h" +#ifndef EXECUTION_TYPES_EXECUTION_IEC_HPP +#define EXECUTION_TYPES_EXECUTION_IEC_HPP /* C++ standard library headers */ #include @@ -31,17 +30,19 @@ SOFTWARE. #include /* External library headers */ -#include "CL/cl2.hpp" #if QUARTUS_MAJOR_VERSION > 18 #include "CL/cl_ext_intelfpga.h" #endif -namespace bm_execution { +#include "parameters.h" +#include "linpack_benchmark.hpp" -/* - Prepare kernels and execute benchmark +namespace linpack { +namespace execution { +namespace iec { - @copydoc bm_execution::calculate() +/* + Prepare kernels and execute benchmark for a bitstream that makes use of intel external channels */ std::unique_ptr calculate(const hpcc_base::ExecutionSettings&config, @@ -724,5 +725,8 @@ calculate(const hpcc_base::ExecutionSettings&co return results; } -} // namespace bm_execution +} // namespace iec +} // namespace execution +} // namespace linpack +#endif \ No newline at end of file diff --git a/LINPACK/src/host/execution_types/execution_pcie.hpp b/LINPACK/src/host/execution_types/execution_pcie.hpp new file mode 100644 index 00000000..6a83dbbe --- /dev/null +++ b/LINPACK/src/host/execution_types/execution_pcie.hpp @@ -0,0 +1,627 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef EXECUTION_TYPES_EXECUTION_PCIE_HPP +#define EXECUTION_TYPES_EXECUTION_PCIE_HPP + +/* C++ standard library headers */ +#include +#include +#include +#include +#include + +/* External library headers */ +#if QUARTUS_MAJOR_VERSION > 18 +#include "CL/cl_ext_intelfpga.h" +#endif + +#include "parameters.h" +#include "linpack_benchmark.hpp" + +namespace linpack { +namespace execution { +namespace pcie { + +/* + Prepare kernels and execute benchmark + + @copydoc bm_execution::calculate() +*/ +std::unique_ptr +calculate(const hpcc_base::ExecutionSettings&config, + HOST_DATA_TYPE* A, + HOST_DATA_TYPE* b, + cl_int* ipvt) { + + int err; + + uint blocks_per_row = config.programSettings->matrixSize / config.programSettings->blockSize; + + // Communicate with all ranks in the same row of the torus + MPI_Comm row_communicator; + MPI_Comm col_communicator; + + MPI_Comm_split(MPI_COMM_WORLD, config.programSettings->torus_row, 0, &row_communicator); + MPI_Comm_split(MPI_COMM_WORLD, config.programSettings->torus_col, 0, &col_communicator); + + cl::CommandQueue buffer_queue(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + + // Create Buffers for input and output + cl::Buffer Buffer_a(*config.context, CL_MEM_READ_WRITE, + sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize); + cl::Buffer Buffer_b(*config.context, CL_MEM_READ_WRITE, + sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize); + cl::Buffer Buffer_pivot(*config.context, CL_MEM_READ_WRITE, + 
sizeof(cl_int)*config.programSettings->matrixSize); + + // Buffers only used to store data received over the network layer + // The content will not be modified by the host + cl::Buffer Buffer_lu1(*config.context, CL_MEM_READ_WRITE, + sizeof(HOST_DATA_TYPE)*(config.programSettings->blockSize)*(config.programSettings->blockSize)); + cl::Buffer Buffer_lu2(*config.context, CL_MEM_READ_WRITE, + sizeof(HOST_DATA_TYPE)*(config.programSettings->blockSize)*(config.programSettings->blockSize)); + cl::Buffer Buffer_top(*config.context, CL_MEM_READ_WRITE, + sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize * (config.programSettings->blockSize)); + cl::Buffer Buffer_left(*config.context, CL_MEM_READ_WRITE, + sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize * (config.programSettings->blockSize)); + cl::Buffer Buffer_network_scaling(*config.context, CL_MEM_READ_WRITE, + sizeof(HOST_DATA_TYPE)*(config.programSettings->blockSize)); + + /* --- Setup MPI communication and required additional buffers --- */ + + HOST_DATA_TYPE *lu_block, *lu_trans_block; + posix_memalign(reinterpret_cast(&lu_block), 1024, sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize)*(config.programSettings->blockSize)); + posix_memalign(reinterpret_cast(&lu_trans_block), 1024, sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize)*(config.programSettings->blockSize)); + + std::vector left_blocks(blocks_per_row); + std::vector top_blocks(blocks_per_row); + + for (int i =0; i < blocks_per_row; i++) { + posix_memalign(reinterpret_cast(&left_blocks[i]), 1024, sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize)*(config.programSettings->blockSize)); + posix_memalign(reinterpret_cast(&top_blocks[i]), 1024, sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize)*(config.programSettings->blockSize)); + } + + /* --- Execute actual benchmark kernels --- */ + + double t; + std::vector gefaExecutionTimes; + std::vector geslExecutionTimes; + std::vector gefaWaitTimes; + for (int i 
= 0; i < config.programSettings->numRepetitions; i++) { + + err = buffer_queue.enqueueWriteBuffer(Buffer_a, CL_TRUE, 0, + sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize, A); + ASSERT_CL(err) + err = buffer_queue.enqueueWriteBuffer(Buffer_b, CL_TRUE, 0, + sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize, b); + ASSERT_CL(err) + buffer_queue.finish(); + + // Command queues + // A new command queue is created for every iteration of the algorithm to reduce the overhead + // of too large queues + std::list lu_queues; + std::list top_queues; + std::list left_queues; + std::list> left_buffers; + std::list> top_buffers; + std::list> inner_queues; + std::list> kernels; + + // User event that is used to start actual execution of benchmark kernels + cl::UserEvent start_event(*config.context, &err); + ASSERT_CL(err); + std::list> all_events; + all_events.emplace_back(); + all_events.back().emplace_back(start_event); + all_events.emplace_back(); + + left_buffers.emplace_back(); + top_buffers.emplace_back(); + kernels.emplace_back(); + inner_queues.emplace_back(); + for (uint rep = 0; rep < config.programSettings->kernelReplications; rep++) { + inner_queues.back().emplace_back(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + } + + std::chrono::time_point t1, t2, twait1, twait2; + std::chrono::duration currentwaittime = std::chrono::duration::zero(); + + uint current_replication = 0; + + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << "Start! 
" << std::endl; + MPI_Barrier(MPI_COMM_WORLD); + t1 = std::chrono::high_resolution_clock::now(); + // Trigger the user event that will start the first tasks in the queue + start_event.setStatus(CL_COMPLETE); + + // For every row of blocks create kernels and enqueue them + for (int block_row=0; block_row < config.programSettings->matrixSize / config.programSettings->blockSize * config.programSettings->torus_width; block_row++) { + + // Create Command queues + lu_queues.emplace_back(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + top_queues.emplace_back(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + left_queues.emplace_back(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + + // already emplace new buffer list for next iteration since left and top buffers need to be stored until all MMs are executed. + // this is only the case after the next iteration is finished, because the inner MMs are calculated overlapped with the next iteration! + left_buffers.emplace_back(); + top_buffers.emplace_back(); + + int local_block_row_remainder = (block_row % config.programSettings->torus_width); + int local_block_row= (block_row / config.programSettings->torus_width); + bool in_same_row_as_lu = local_block_row_remainder == config.programSettings->torus_row; + bool in_same_col_as_lu = local_block_row_remainder == config.programSettings->torus_col; + int start_row_index = local_block_row + ((local_block_row_remainder >= config.programSettings->torus_row) ? 1: 0); + int start_col_index = local_block_row + ((local_block_row_remainder >= config.programSettings->torus_col) ? 1: 0); + int num_left_blocks = (in_same_col_as_lu) ? blocks_per_row - start_row_index : 0; + int num_top_blocks = (in_same_row_as_lu) ? blocks_per_row - start_col_index : 0; + int num_inner_block_rows = (blocks_per_row - start_row_index); + int num_inner_block_cols = (num_inner_block_rows > 0) ? 
(blocks_per_row - start_col_index) : 0; + num_inner_block_rows = (num_inner_block_cols > 0) ?num_inner_block_rows : 0; + int num_network_layer_executions = (config.programSettings->matrixSize / config.programSettings->blockSize) - std::min(start_col_index, start_row_index); + num_network_layer_executions = std::max(num_network_layer_executions, 1); + std::vector network_layer_op_flags(num_network_layer_executions); + std::fill(network_layer_op_flags.begin(), network_layer_op_flags.end(), 0); + bool is_calulating_lu_block = (in_same_col_as_lu && in_same_row_as_lu); + + if (is_calulating_lu_block) { + // create the LU kernel + kernels.back().emplace_back(*config.program, "lu", + &err); +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " LU " << local_block_row << "," << local_block_row << std::endl; +#endif + err = kernels.back().back().setArg(0, Buffer_a); + ASSERT_CL(err); + err = kernels.back().back().setArg(1, Buffer_lu1); + ASSERT_CL(err); + err = kernels.back().back().setArg(2, Buffer_lu2); + ASSERT_CL(err); + err = kernels.back().back().setArg(3, local_block_row); + ASSERT_CL(err) + err = kernels.back().back().setArg(4, local_block_row); + ASSERT_CL(err) + err =kernels.back().back().setArg(5, config.programSettings->matrixSize / config.programSettings->blockSize); + ASSERT_CL(err) + all_events.back().emplace_back(); + err = lu_queues.back().enqueueNDRangeKernel(kernels.back().back(), cl::NullRange, cl::NDRange(1), cl::NullRange, &(*std::prev(std::prev(all_events.end())))); + ASSERT_CL(err) + // read back result of LU calculation so it can be distributed + err = lu_queues.back().enqueueReadBuffer(Buffer_lu2, CL_TRUE, 0, + sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize*config.programSettings->blockSize, lu_block); + ASSERT_CL(err) + err = lu_queues.back().enqueueReadBuffer(Buffer_lu1, CL_TRUE, 0, + 
sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize*config.programSettings->blockSize, lu_trans_block, NULL, &all_events.back().back()); + ASSERT_CL(err) + } + + // Exchange LU blocks on all ranks to prevent stalls in MPI broadcast + // All tasks until now need to be executed so we can use the result of the LU factorization and communicate it via MPI with the other FPGAs + lu_queues.back().finish(); + + // Broadcast LU block in column to update all left blocks + MPI_Bcast(lu_block, config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, col_communicator); + // Broadcast LU block in row to update all top blocks + MPI_Bcast(lu_trans_block, config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, row_communicator); + + if (num_top_blocks > 0) { + + // Copy LU block to FPGA for calulation of top blocks only if required + err = top_queues.back().enqueueWriteBuffer(Buffer_lu1, CL_TRUE, 0, + sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize*config.programSettings->blockSize, lu_trans_block); + ASSERT_CL(err) + + // Create top kernels + for (int tops=start_col_index; tops < (config.programSettings->matrixSize / config.programSettings->blockSize); tops++) { + kernels.back().emplace_back(*config.program, "top_update", + &err); +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " Top " << local_block_row << "," << tops << std::endl; +#endif + ASSERT_CL(err); + err = kernels.back().back().setArg(0, Buffer_a); + ASSERT_CL(err); + err = kernels.back().back().setArg(1, Buffer_top); + ASSERT_CL(err); + err = kernels.back().back().setArg(2, Buffer_lu1); + ASSERT_CL(err) + err = kernels.back().back().setArg(3, (tops == start_col_index) ? 
CL_TRUE : CL_FALSE); + ASSERT_CL(err) + err = kernels.back().back().setArg(4, tops); + ASSERT_CL(err) + err = kernels.back().back().setArg(5, local_block_row); + ASSERT_CL(err) + err = kernels.back().back().setArg(6, config.programSettings->matrixSize / config.programSettings->blockSize); + ASSERT_CL(err) + + err = top_queues.back().enqueueNDRangeKernel(kernels.back().back(), cl::NullRange, cl::NDRange(1), cl::NullRange, &(*std::prev(std::prev(all_events.end())))); + ASSERT_CL(err) + + if (tops + 1 == (config.programSettings->matrixSize / config.programSettings->blockSize)) { + all_events.back().emplace_back(); + err = top_queues.back().enqueueReadBuffer(Buffer_top, CL_TRUE, 0, + sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize*config.programSettings->blockSize, top_blocks[tops - start_col_index], + &(*std::prev(std::prev(all_events.end()))), &(all_events.back().back())); + ASSERT_CL(err) + } + else { + err = top_queues.back().enqueueReadBuffer(Buffer_top, CL_TRUE, 0, + sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize*config.programSettings->blockSize, top_blocks[tops - start_col_index]); + ASSERT_CL(err) + } + + } + } + if (num_left_blocks > 0) { + + // Copy LU block to FPGA for calulation of left blocks only if required + err = left_queues.back().enqueueWriteBuffer(Buffer_lu2, CL_TRUE, 0, + sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize*config.programSettings->blockSize, lu_block); + ASSERT_CL(err) + // Create left kernels + for (int tops=start_row_index; tops < (config.programSettings->matrixSize / config.programSettings->blockSize); tops++) { + kernels.back().emplace_back(*config.program, "left_update", + &err); +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " Left " <matrixSize / config.programSettings->blockSize); + ASSERT_CL(err) + + err = left_queues.back().enqueueNDRangeKernel(kernels.back().back(), cl::NullRange, cl::NDRange(1), cl::NullRange, 
&(*std::prev(std::prev(all_events.end())))); + ASSERT_CL(err) + + if (tops + 1 == (config.programSettings->matrixSize / config.programSettings->blockSize)) { + all_events.back().emplace_back(); + err = left_queues.back().enqueueReadBuffer(Buffer_left, CL_TRUE, 0, + sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize*config.programSettings->blockSize, left_blocks[tops - start_row_index], + &(*std::prev(std::prev(all_events.end()))), &(all_events.back().back())); + ASSERT_CL(err) + } + else { + err = left_queues.back().enqueueReadBuffer(Buffer_left, CL_TRUE, 0, + sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize*config.programSettings->blockSize, left_blocks[tops - start_row_index]); + ASSERT_CL(err) + } + } + } + // Wait until all top and left blocks are calculated + top_queues.back().finish(); + left_queues.back().finish(); + + // Send the left and top blocks to all other ranks so they can be used to update all inner blocks + for (int lbi=0; lbi < blocks_per_row - local_block_row; lbi++) { + MPI_Bcast(left_blocks[lbi], config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, row_communicator); + } + for (int tbi=0; tbi < blocks_per_row - local_block_row; tbi++) { + MPI_Bcast(top_blocks[tbi], config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, col_communicator); + } + + // update all remaining inner blocks using only global memory + + all_events.emplace_back(); + //auto communication_events = all_events.back(); + + // Write all left and top blocks to FPGA memory + for (int lbi=0; lbi < num_inner_block_rows; lbi++) { + left_buffers.back().emplace_back(*config.context, CL_MEM_READ_WRITE, + sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize * (config.programSettings->blockSize)); + err = inner_queues.back()[0].enqueueWriteBuffer(left_buffers.back().back(), CL_TRUE, 0, + 
sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize*config.programSettings->blockSize, left_blocks[lbi]); + } + for (int tbi=0; tbi < num_inner_block_cols; tbi++) { + top_buffers.back().emplace_back(*config.context, CL_MEM_READ_WRITE, + sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize * (config.programSettings->blockSize)); + err = inner_queues.back()[0].enqueueWriteBuffer(top_buffers.back().back(), CL_TRUE, 0, + sizeof(HOST_DATA_TYPE)*config.programSettings->blockSize*config.programSettings->blockSize, top_blocks[tbi]); + } + + uint current_update = 0; + uint total_inner_updates_first_row = top_buffers.back().size(); + uint updates_per_replication = total_inner_updates_first_row / config.programSettings->kernelReplications; + uint total_inner_updates = (top_buffers.back().size() - 1) * (left_buffers.back().size() - 1); + uint total_updates_per_replication = total_inner_updates/ config.programSettings->kernelReplications; + + // Wait until data is copied to FPGA + inner_queues.back()[0].finish(); + + for (auto l = std::next(left_buffers.back().begin()); l < left_buffers.back().end(); l++) { + // select the matrix multiplication kernel that should be used for this block updated + kernels.back().emplace_back(*config.program, ("inner_update_mm" + std::to_string(current_replication)).c_str(), + &err); + + int block_col = static_cast((config.programSettings->matrixSize / config.programSettings->blockSize) - num_inner_block_cols); + int block_row = static_cast((config.programSettings->matrixSize / config.programSettings->blockSize) - num_inner_block_rows + std::distance(left_buffers.back().begin(), l)); + ASSERT_CL(err); + err = kernels.back().back().setArg(0, Buffer_a); + ASSERT_CL(err); + err = kernels.back().back().setArg(1, *l); + ASSERT_CL(err) + err = kernels.back().back().setArg(2, *top_buffers.back().begin()); + ASSERT_CL(err) + err = kernels.back().back().setArg(3, block_col); + ASSERT_CL(err) + err = kernels.back().back().setArg(4, block_row); + 
ASSERT_CL(err) + err = kernels.back().back().setArg(5, blocks_per_row); + ASSERT_CL(err) + + if ((left_buffers.back().size() - 1) - current_update <= config.programSettings->kernelReplications) { +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " Inner L Ev " << block_row << "," << block_col << std::endl; +#endif + // this is the last taks that will be enqueued in this queue, so create an event + all_events.back().emplace_back(); + // Distribute the workload over all available matrix multiplication kernels + err = inner_queues.back()[(current_replication)].enqueueNDRangeKernel(kernels.back().back(), cl::NullRange, cl::NDRange(1), cl::NullRange, &(*std::prev(std::prev(all_events.end()))), &(all_events.back().back())); + //err = inner_queues.back()[(current_replication)].enqueueNDRangeKernel(kernels.back().back(), cl::NullRange, cl::NDRange(1), cl::NullRange, &communication_events, &(all_events.back().back())); + } + else { +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " Inner L " << block_row << "," << block_col << std::endl; +#endif + // Distribute the workload over all available matrix multiplication kernels + err = inner_queues.back()[(current_replication)].enqueueNDRangeKernel(kernels.back().back(), cl::NullRange, cl::NDRange(1), cl::NullRange, &(*std::prev(std::prev(all_events.end())))); + //err = inner_queues.back()[(current_replication)].enqueueNDRangeKernel(kernels.back().back(), cl::NullRange, cl::NDRange(1), cl::NullRange, &communication_events); + } + current_update++; + current_replication = (current_replication + 1) % config.programSettings->kernelReplications; + } + + current_update = 0; + for (auto t = top_buffers.back().begin(); t < top_buffers.back().end(); t++) { + // select the matrix multiplication kernel that should be used for this block updated + kernels.back().emplace_back(*config.program, 
("inner_update_mm" + std::to_string(current_replication)).c_str(), + &err); + + int block_col = static_cast((config.programSettings->matrixSize / config.programSettings->blockSize) - num_inner_block_cols + std::distance(top_buffers.back().begin(), t)); + int block_row = static_cast((config.programSettings->matrixSize / config.programSettings->blockSize) - num_inner_block_rows); + + ASSERT_CL(err); + err = kernels.back().back().setArg(0, Buffer_a); + ASSERT_CL(err); + err = kernels.back().back().setArg(1, *left_buffers.back().begin()); + ASSERT_CL(err) + err = kernels.back().back().setArg(2, *t); + ASSERT_CL(err) + err = kernels.back().back().setArg(3, block_col); + ASSERT_CL(err) + err = kernels.back().back().setArg(4, block_row); + ASSERT_CL(err) + err = kernels.back().back().setArg(5, blocks_per_row); + ASSERT_CL(err) + // If number of blocks is not dividable by the number of replications, the first replications will do one update more + if (top_buffers.back().size() - current_update <= config.programSettings->kernelReplications) { +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " Inner T Ev " << block_row << "," << block_col << std::endl; +#endif + // this is the last taks that will be enqueued in this queue, so create an event + all_events.back().emplace_back(); + // Distribute the workload over all available matrix multiplication kernels + err = inner_queues.back()[(current_replication)].enqueueNDRangeKernel(kernels.back().back(), cl::NullRange, cl::NDRange(1), cl::NullRange, &(*std::prev(std::prev(all_events.end()))), &(all_events.back().back())); + } + else { +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " Inner T " << block_row << "," << block_col << std::endl; +#endif + // Distribute the workload over all available matrix multiplication kernels + err = 
inner_queues.back()[(current_replication)].enqueueNDRangeKernel(kernels.back().back(), cl::NullRange, cl::NDRange(1), cl::NullRange, &(*std::prev(std::prev(all_events.end())))); + } + ASSERT_CL(err) + current_update++; + current_replication = (current_replication + 1) % config.programSettings->kernelReplications; + } + + // count the inner MM already to next iteration by creating new buffers in the queue + all_events.emplace_back(); + kernels.emplace_back(); + inner_queues.emplace_back(); + current_update = 0; + for (uint rep = 0; rep < config.programSettings->kernelReplications; rep++) { + inner_queues.back().emplace_back(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + } + + for (auto l = std::next(left_buffers.back().begin()); l < left_buffers.back().end(); l++) { + for (auto t = std::next(top_buffers.back().begin()); t < top_buffers.back().end(); t++) { + // select the matrix multiplication kernel that should be used for this block updated + kernels.back().emplace_back(*config.program, ("inner_update_mm" + std::to_string(current_replication)).c_str(), + &err); + + int block_col = static_cast((config.programSettings->matrixSize / config.programSettings->blockSize) - num_inner_block_cols + std::distance(top_buffers.back().begin(), t)); + int block_row = static_cast((config.programSettings->matrixSize / config.programSettings->blockSize) - num_inner_block_rows + std::distance(left_buffers.back().begin(), l)); + + ASSERT_CL(err); + err = kernels.back().back().setArg(0, Buffer_a); + ASSERT_CL(err); + err = kernels.back().back().setArg(1, *l); + ASSERT_CL(err) + err = kernels.back().back().setArg(2, *t); + ASSERT_CL(err) + err = kernels.back().back().setArg(3, block_col); + ASSERT_CL(err) + err = kernels.back().back().setArg(4, block_row); + ASSERT_CL(err) + err = kernels.back().back().setArg(5, blocks_per_row); + ASSERT_CL(err) + + // If number of blocks is not dividable by the number of replications, the first replications will do one update more + if 
(((top_buffers.back().size() - 1) * (left_buffers.back().size() - 1)) - current_update <= config.programSettings->kernelReplications) { +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " Inner Ev " << block_row << "," << block_col << std::endl; +#endif + // this is the last taks that will be enqueued in this queue, so create an event + all_events.back().emplace_back(); + // Distribute the workload over all available matrix multiplication kernels + err = inner_queues.back()[(current_replication)].enqueueNDRangeKernel(kernels.back().back(), cl::NullRange, cl::NDRange(1), cl::NullRange, &(*std::prev(std::prev(std::prev(all_events.end())))), &(all_events.back().back())); + } + else { +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " Inner " << block_row << "," << block_col << std::endl; +#endif + // Distribute the workload over all available matrix multiplication kernels + err = inner_queues.back()[(current_replication)].enqueueNDRangeKernel(kernels.back().back(), cl::NullRange, cl::NDRange(1), cl::NullRange, &(*std::prev(std::prev(std::prev(all_events.end()))))); + } + + ASSERT_CL(err) + current_update++; + current_replication = (current_replication + 1) % config.programSettings->kernelReplications; + } + } +#ifndef NDEBUG + MPI_Barrier(MPI_COMM_WORLD); + if (is_calulating_lu_block) std::cout << "---------------" << std::endl; + + // // // Execute GEFA + // if (block_row == 0) { + // MPI_Barrier(MPI_COMM_WORLD); + // t1 = std::chrono::high_resolution_clock::now(); + // // Trigger the user event that will start the first tasks in the queue + // start_event.setStatus(CL_COMPLETE); + // } +#endif + +#ifndef NDEBUG + cl::Event::waitForEvents(all_events.back()); + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " Done " << block_row << std::endl; + + if (block_row == 
blocks_per_row * config.programSettings->torus_width - 1) { + // wait until the last LU queue is done since it will be the last required operation + lu_queues.back().finish(); + t2 = std::chrono::high_resolution_clock::now(); + + // Finish all other queues + top_queues.back().finish(); + left_queues.back().finish(); + cl::Event::waitForEvents(all_events.back()); + + } +#endif + } +#ifdef NDEBUG + int count = 0; + for (auto evs : all_events) { + count++; + cl::Event::waitForEvents(evs); + // std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << "Step " << count << " of " << all_events.size() << std::endl; + } + lu_queues.back().finish(); + t2 = std::chrono::high_resolution_clock::now(); + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << "End! " << std::endl; +#endif + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << "Wait time: " << currentwaittime.count() << "s" << std::endl; + std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " Exit " << i << std::endl; +#endif + + std::chrono::duration timespan = + std::chrono::duration_cast> + (t2 - t1); + gefaExecutionTimes.push_back(timespan.count()); + + // Execute GESL + t1 = std::chrono::high_resolution_clock::now(); + // lu_queue.enqueueTask(geslkernel); + // lu_queue.finish(); + t2 = std::chrono::high_resolution_clock::now(); + timespan = std::chrono::duration_cast>(t2 - t1); + geslExecutionTimes.push_back(timespan.count()); + } + + /* --- Read back results from Device --- */ + +#ifdef USE_SVM + err = clEnqueueSVMUnmap(compute_queue(), + reinterpret_cast(A), 0, + NULL, NULL); + ASSERT_CL(err) + err = clEnqueueSVMUnmap(compute_queue(), + reinterpret_cast(b), 0, + NULL, NULL); + ASSERT_CL(err) + err = clEnqueueSVMUnmap(compute_queue(), + reinterpret_cast(ipvt), 0, + NULL, NULL); + 
ASSERT_CL(err) + + // read back result from temporary buffer + for (int k=0; k < config.programSettings->matrixSize * config.programSettings->matrixSize; k++) { + A[k] = A_tmp[k]; + } + clSVMFree((*config.context)(), reinterpret_cast(A_tmp)); + +#else + buffer_queue.enqueueReadBuffer(Buffer_a, CL_TRUE, 0, + sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize, A); + // buffer_queue.enqueueReadBuffer(Buffer_b, CL_TRUE, 0, + // sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize, b); + if (!config.programSettings->isDiagonallyDominant) { + buffer_queue.enqueueReadBuffer(Buffer_pivot, CL_TRUE, 0, + sizeof(cl_int)*config.programSettings->matrixSize, ipvt); + } +#endif + + /* --- Clean up MPI communication buffers --- */ + free(lu_block); + free(lu_trans_block); + + for (int i =0; i < left_blocks.size(); i++) { + free(top_blocks[i]); + free(left_blocks[i]); + } + + MPI_Comm_free(&row_communicator); + MPI_Comm_free(&col_communicator); + + std::unique_ptr results( + new linpack::LinpackExecutionTimings{gefaExecutionTimes, geslExecutionTimes}); + + MPI_Barrier(MPI_COMM_WORLD); + + return results; +} + +} // namespace pcie +} // namespace execution +} // namespace linpack + +#endif \ No newline at end of file diff --git a/LINPACK/src/host/execution_types/execution_types.hpp b/LINPACK/src/host/execution_types/execution_types.hpp new file mode 100644 index 00000000..975dd4cf --- /dev/null +++ b/LINPACK/src/host/execution_types/execution_types.hpp @@ -0,0 +1,28 @@ +/* +Copyright (c) 2021 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following 
conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef EXECUTION_TYPES_HPP +#define EXECUTION_TYPES_HPP + +#include "execution_types/execution_pcie.hpp" +#include "execution_types/execution_iec.hpp" + +#endif \ No newline at end of file diff --git a/LINPACK/src/host/linpack_benchmark.cpp b/LINPACK/src/host/linpack_benchmark.cpp index 43628906..1ffc6fcd 100644 --- a/LINPACK/src/host/linpack_benchmark.cpp +++ b/LINPACK/src/host/linpack_benchmark.cpp @@ -31,7 +31,8 @@ SOFTWARE. 
#include /* Project's headers */ -#include "execution.h" +#include "communication_types.hpp" +#include "execution_types/execution_types.hpp" #include "parameters.h" linpack::LinpackProgramSettings::LinpackProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), @@ -106,7 +107,12 @@ linpack::LinpackBenchmark::addAdditionalParseOptions(cxxopts::Options &options) std::unique_ptr linpack::LinpackBenchmark::executeKernel(LinpackData &data) { - auto timings = bm_execution::calculate(*executionSettings, data.A, data.b, data.ipvt); + std::unique_ptr timings; + switch (executionSettings->programSettings->communicationType) { + case hpcc_base::CommunicationType::pcie_mpi : timings = execution::pcie::calculate(*executionSettings, data.A, data.b, data.ipvt); break; + case hpcc_base::CommunicationType::intel_external_channels: timings = execution::iec::calculate(*executionSettings, data.A, data.b, data.ipvt); break; + default: throw std::runtime_error("No calculate method implemented for communication type " + commToString(executionSettings->programSettings->communicationType)); + } #ifdef DISTRIBUTED_VALIDATION distributed_gesl_nopvt_ref(data); #endif @@ -369,7 +375,7 @@ linpack::LinpackBenchmark::validateOutputAndPrintError(linpack::LinpackData &dat for (int j = 0; j < n; j++) { // For each element below it for (int i = 0; i < n; i++) { - std::cout << ref_result->A[n * j + i] << ", "; + std::cout << std::abs(ref_result->A[n * j + i] - data.A[n * j + i]) << ", "; } std::cout << std::endl; } diff --git a/LINPACK/tests/test_kernel_communication.cpp b/LINPACK/tests/test_kernel_communication.cpp index bc677f5b..db40ea97 100644 --- a/LINPACK/tests/test_kernel_communication.cpp +++ b/LINPACK/tests/test_kernel_communication.cpp @@ -24,6 +24,9 @@ class LinpackKernelCommunicationTest : public testing::Test { bm = std::unique_ptr(new linpack::LinpackBenchmark(global_argc, global_argv)); bm->getExecutionSettings().programSettings->isDiagonallyDominant = true; 
bm->getExecutionSettings().programSettings->matrixSize = BLOCK_SIZE; + if (bm->getExecutionSettings().programSettings->communicationType != hpcc_base::CommunicationType::intel_external_channels) { + GTEST_SKIP() << "This test is IEC Specific but other kernel is used"; + } data = bm->generateInputData(); setupExternalChannelFiles(); } @@ -76,6 +79,9 @@ class LinpackKernelCommunicationTestLU : public LinpackKernelCommunicationTest { void SetUp() override { LinpackKernelCommunicationTest::SetUp(); + if (bm->getExecutionSettings().programSettings->communicationType != hpcc_base::CommunicationType::intel_external_channels) { + GTEST_SKIP() << "This test is IEC Specific but other kernel is used"; + } executeKernel(); } @@ -609,29 +615,29 @@ TEST_F(LinpackKernelCommunicationTestLeft, LeftBlockExternalResultisCorrect) { uint matrix_size = bm->getExecutionSettings().programSettings->matrixSize; auto gefa_data = bm->generateInputData(); - // generate uniformly distributed block as top block + // generate uniformly distributed block as left block bm->getExecutionSettings().programSettings->isDiagonallyDominant = false; auto ref_data = bm->generateInputData(); bm->getExecutionSettings().programSettings->isDiagonallyDominant = true; linpack::gefa_ref_nopvt(gefa_data->A, matrix_size,matrix_size); - // For each diagnonal element + // reference implementation to update left block for (int k = 0; k < matrix_size; k++) { - // For each row below the current row for (int j = 0; j < matrix_size; j++) { - // multiply current column to current row and add it up for (int i = k + 1; i < matrix_size; i++) { ref_data->A[j * matrix_size + i] += ref_data->A[j * matrix_size + k] * gefa_data->A[k * matrix_size + i]; } } } - double total_error = 0.0; + double max_error = 0.0; for (int i = 0; i < bm->getExecutionSettings().programSettings->matrixSize; i++) { for (int j = 0; j < bm->getExecutionSettings().programSettings->matrixSize; j++) { - total_error += std::abs(ref_data->A[i * 
bm->getExecutionSettings().programSettings->matrixSize + j] - data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j]); + max_error = std::max(max_error, static_cast(std::abs(ref_data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j] - data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j]))); } } - EXPECT_FLOAT_EQ(total_error, 0.0); + // tolerated delta between expected and real result is machine epsilon + double delta = std::numeric_limits::epsilon(); + EXPECT_NEAR(max_error, 0.0, delta); } TEST_F(LinpackKernelCommunicationTestLeft, LeftBlockGlobalMemLUBufferContentSameAsLUBlock) { @@ -781,13 +787,15 @@ TEST_F(LinpackKernelCommunicationTestTop, TopBlockExternalResultisCorrect) { } } } - double total_error = 0.0; + double max_error = 0.0; for (int i = 0; i < bm->getExecutionSettings().programSettings->matrixSize; i++) { for (int j = 0; j < bm->getExecutionSettings().programSettings->matrixSize; j++) { - total_error += std::abs(ref_data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j] - data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j]); + max_error = std::max(max_error, static_cast(std::abs(ref_data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j] - data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j]))); } } - EXPECT_FLOAT_EQ(total_error, 0.0); + // tolerated delta between expected and real result is machine epsilon + double delta = std::numeric_limits::epsilon(); + EXPECT_NEAR(max_error, 0.0, delta); } TEST_F(LinpackKernelCommunicationTestTop, TopBlockExternalChannelOutputToRightCorrectAmountOfData) { @@ -898,13 +906,15 @@ TEST_F(LinpackKernelCommunicationTestTop, TopBlockExternalChannelOutputToTopCorr TEST_F(LinpackKernelCommunicationTestLU, LUBlockExternalResultisSameAsRef) { auto data2 = bm->generateInputData(); linpack::gefa_ref_nopvt(data2->A, 
bm->getExecutionSettings().programSettings->matrixSize,bm->getExecutionSettings().programSettings->matrixSize); - double total_error = 0.0; + double max_error = 0.0; for (int i = 0; i < bm->getExecutionSettings().programSettings->matrixSize; i++) { for (int j = 0; j < bm->getExecutionSettings().programSettings->matrixSize; j++) { - total_error += std::abs(data2->A[i * bm->getExecutionSettings().programSettings->matrixSize + j] - data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j]); + max_error = std::max(max_error, static_cast(std::abs(data2->A[i * bm->getExecutionSettings().programSettings->matrixSize + j] - data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j]))); } } - EXPECT_FLOAT_EQ(total_error, 0.0); + // tolerated delta between expected and real result is machine epsilon + double delta = std::numeric_limits::epsilon(); + EXPECT_NEAR(max_error, 0.0, delta); } @@ -1024,13 +1034,15 @@ TEST_F(LinpackKernelCommunicationTestLeftOut, LeftBlockExternalResultisCorrect) } } } - double total_error = 0.0; + double max_error = 0.0; for (int i = 0; i < bm->getExecutionSettings().programSettings->matrixSize; i++) { for (int j = 0; j < bm->getExecutionSettings().programSettings->matrixSize; j++) { - total_error += std::abs(ref_data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j] - data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j]); + max_error = std::max(max_error, static_cast(std::abs(ref_data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j] - data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j]))); } } - EXPECT_FLOAT_EQ(total_error, 0.0); + // tolerated delta between expected and real result is machine epsilon + double delta = std::numeric_limits::epsilon(); + EXPECT_NEAR(max_error, 0.0, delta); } @@ -1058,13 +1070,15 @@ TEST_F(LinpackKernelCommunicationTestTopOut, TopBlockExternalResultisCorrect) { } } } - double total_error = 0.0; + double max_error = 0.0; 
for (int i = 0; i < bm->getExecutionSettings().programSettings->matrixSize; i++) { for (int j = 0; j < bm->getExecutionSettings().programSettings->matrixSize; j++) { - total_error += std::abs(ref_data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j] - data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j]); + max_error = std::max(max_error, static_cast(std::abs(ref_data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j] - data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j]))); } } - EXPECT_FLOAT_EQ(total_error, 0.0); + // tolerated delta between expected and real result is machine epsilon + double delta = std::numeric_limits::epsilon(); + EXPECT_NEAR(max_error, 0.0, delta); } @@ -1197,13 +1211,15 @@ TEST_F(LinpackKernelCommunicationTestAll, AllBlockExternalResultisCorrect) { linpack::gefa_ref_nopvt(ref_data->A, matrix_size, matrix_size); - double total_error = 0.0; + double max_error = 0.0; for (int i = 0; i < bm->getExecutionSettings().programSettings->matrixSize; i++) { for (int j = 0; j < bm->getExecutionSettings().programSettings->matrixSize; j++) { - total_error += std::abs(ref_data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j] - data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j]); + max_error = std::max(max_error, static_cast(std::abs(ref_data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j] - data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j]))); } } - EXPECT_FLOAT_EQ(total_error, 0.0); + // tolerated delta between expected and real result is machine epsilon times matrix width + double delta = std::numeric_limits::epsilon(); + EXPECT_NEAR(max_error, 0.0, delta); } TEST_F(LinpackKernelCommunicationTestAll, AllBlockExternalChannelOutputToRightCorrectAmountOfData) { diff --git a/PTRANS/CHANGELOG b/PTRANS/CHANGELOG index f74b3445..7338ff54 100644 --- a/PTRANS/CHANGELOG +++ b/PTRANS/CHANGELOG @@ -2,6 +2,18 @@ This 
file contains all changes made to the source code for each release. +## 1.5 + +#### Changed: +- Refactored the code to support different execution kernels and data distributions +- Changed formatting of the output metrics + +#### Added: +- CPU only implementation of diagonal and PQ data distribution +- FPGA kernel with communication via PCIe and MPI for diagonal and PQ distribution +- FPGA kernel with communication via external channels for PQ distribution + + ## 1.4 #### Changed: diff --git a/PTRANS/CMakeLists.txt b/PTRANS/CMakeLists.txt index 6a577787..7728fe2a 100755 --- a/PTRANS/CMakeLists.txt +++ b/PTRANS/CMakeLists.txt @@ -1,16 +1,23 @@ cmake_minimum_required(VERSION 3.13) -project(PTRANS VERSION 1.4) +project(PTRANS VERSION 1.5) set(READ_KERNEL_NAME transpose_read CACHE STRING "Name of the OpenCL kernel that reads A and sends it over external channel") set(WRITE_KERNEL_NAME transpose_write CACHE STRING "Name of the OpenCL kernel that receives A, adds C to it and stores result") set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices") +set(DEFAULT_COMM_TYPE "AUTO" CACHE STRING "Default communication type if nothing else is given over the --connectivity parameter") +set(DEFAULT_DIST_TYPE "AUTO" CACHE STRING "Default distribution type if nothing is specified over the --handler parameter") set(BLOCK_SIZE 512 CACHE STRING "Block size used in the FPGA kernel") set(CHANNEL_WIDTH 8 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory ") # NUM_REPLICATIONS set to 2 by default to allow build and execution of both versions of the transpose kernel set(NUM_REPLICATIONS 2 CACHE STRING "Number of times the kernels will be replicated") +set(USE_BUFFER_WRITE_RECT_FOR_A No CACHE BOOL "Only valid for PQ with IEC. Use the enqueueWriteBufferRect call to copy only the relevant part of A to memory bank of each replication. 
Whole matrix A will be copied otherwise.") +set(XILINX_UNROLL_INNER_LOOPS No CACHE BOOL "When building for Xilinx devices, unroll the inner loops to create a single pipeline per block and keep memory bursts. This is a tradeoff between resource usage and performance.") -mark_as_advanced(READ_KERNEL_NAME WRITE_KERNEL_NAME) +mark_as_advanced(READ_KERNEL_NAME WRITE_KERNEL_NAME USE_BUFFER_WRITE_RECT_FOR_A XILINX_UNROLL_INNER_LOOPS) set(USE_MPI Yes) +set(USE_OPENMP Yes) +set(USE_DEPRECATED_HPP_HEADER No) +set(COMMUNICATION_TYPE_SUPPORT_ENABLED Yes) include(${CMAKE_SOURCE_DIR}/../cmake/general_benchmark_build_setup.cmake) diff --git a/PTRANS/README.md b/PTRANS/README.md index a711ab42..dff4ed06 100644 --- a/PTRANS/README.md +++ b/PTRANS/README.md @@ -12,7 +12,8 @@ _Introduction to the HPCChallenge Benchmark Suite_ available ## Additional Dependencies -The benchmark needs no additional dependencies than the ones given in the main [README](../README.md). +In addition to the dependencies defined in the main [README](../README.md), the benchmark optionally requires MKL for the CPU only implementation of the benchmark. +If MKL is not found, the benchmark can still be built without support for the CPU only execution! ## Build @@ -37,7 +38,8 @@ The targets below can be used to build the benchmark and its kernels: The currently supported values for KERNEL_FILE_NAME are listed below where `transpose_diagonal` is set to be the default for the base run: -- transpose_diagonal +- `transpose_diagonal`: Transposes a matrix that is distributed with the diagonal data handler +- `transpose_pq`: Transposes a matrix that is distributed with the PQ data handler. P = Q has to hold! 
You can build for example the host application by running @@ -51,7 +53,6 @@ Next to the common configuration options given in the [README](../README.md) of Name | Default | Description | ---------------- |-------------|--------------------------------------| - `DATA_TYPE` | float | Data type used for calculation | `READ_KERNEL_NAME` | transpose_read | Name of the kernel that reads A from global memory and sends it to an external channel (only needed for own implementations) | `WRITE_KERNEL_NAME` | transpose_write | Name of the kernel that receives a from an external channel and adds it to B (only needed for own implementations) | `BLOCK_SIZE` | 512 | Block size used by the kernel to transpose the matrix | @@ -75,35 +76,58 @@ For more information on available input parameters run The clock precision seems to be 1.00000e+01ns ------------------------------------------------------------- Implementation of the matrix transposition benchmark proposed in the HPCC benchmark suite for FPGA. - Version: 1.3 + Version: 1.5 + + MPI Version: 3.1 + Config. Time: Fri Jul 16 11:51:37 UTC 2021 + Git Commit: 2a12191-dirty Usage: - ./Transpose_xilinx [OPTION...] - - -f, --file arg Kernel file name - -n, arg Number of repetitions (default: 10) - -i, Use memory Interleaving - --skip-validation Skip the validation of the output data. This will - speed up execution and helps when working with special - data types. - --device arg Index of the device that has to be used. If not - given you will be asked which device to use if there are - multiple devices available. (default: -1) - --platform arg Index of the platform that has to be used. If not - given you will be asked which platform to use if there - are multiple platforms available. 
(default: -1) - -r, arg Number of used kernel replications (default: 4) - --test Only test given configuration and skip execution and - validation - -h, --help Print this help - -m, arg Matrix size in number of blocks in one dimension - (default: 8) - -b, arg Block size in number of values in one dimension - (default: 512) - --handler arg Specify the used data handler that distributes the - data over devices and memory banks (default: distext) + bin/Transpose_intel [OPTION...] + + -f, --file arg Kernel file name + -n, arg Number of repetitions (default: 10) + -i, Use memory Interleaving + --skip-validation Skip the validation of the output data. This will + speed up execution and helps when working with + special data types. + --device arg Index of the device that has to be used. If not + given you will be asked which device to use if + there are multiple devices available. (default: -1) + --platform arg Index of the platform that has to be used. If not + given you will be asked which platform to use if + there are multiple platforms available. (default: + -1) + -r, arg Number of used kernel replications (default: 4) + --test Only test given configuration and skip execution + and validation + -h, --help Print this help + -m, arg Matrix size in number of blocks in one dimension + (default: 8) + -b, arg Block size in number of values in one dimension + (default: 512) + --comm-type arg Used communication type for inter-FPGA communication + (default: AUTO) + --distribute-buffers Distribute buffers over memory banks. This will + use three memory banks instead of one for a single + kernel replication, but kernel replications may + interfere. This is an Intel only attribute, since + buffer placement is decided at compile time for + Xilinx FPGAs. + --handler arg Specify the used data handler that distributes + the data over devices and memory banks (default: + DIAG) +Available options for `--comm-type`: + +- `CPU`: CPU only execution. MKL required. 
+- `IEC`: Intel external channels are used by the kernels for communication. +- `PCIE`: PCIe and MPI are used to exchange data between FPGAs over the CPU. + +Possible options for `--handler`: +- `DIAG`: Diagonal distribution between FPGAs. Simplifies memory accesses by creating one-dimensional array of matrix blocks. +- `PQ`: PQ distribution of data between FPGAs. P = Q, similar to the distribution used in the LINPAK implementation. To execute the unit and integration tests run @@ -111,26 +135,30 @@ To execute the unit and integration tests run in the `bin` folder within the build directory. It will run an emulation of the kernel and execute some functionality tests. - ## Output Interpretation An example output from an emulation is given below: + ------------------------------------------------------------- + Validate output... + ------------------------------------------------------------- Maximum error: 7.62939e-06 < 1.19209e-05 Mach. Epsilon: 1.19209e-07 - Validation Time: 4.69627e+00 s - calc calc FLOPS Net [GB/s] Mem [GB/s] - avg: 1.15169e-01 3.72929e+10 1.49172e+11 4.47515e+11 - best: 1.14216e-01 3.76041e+10 1.50416e+11 4.51249e+11 + Validation Time: 4.66312e+00 s + total [s] transfer [s] calc [s] calc FLOPS Mem [B/s] PCIe [B/s] + avg: 1.15886e+00 1.04112e+00 1.17743e-01 9.11940e+09 1.09433e+11 1.23760e+10 + best: 1.13323e+00 1.02481e+00 1.08424e-01 9.90319e+09 1.18838e+11 1.25730e+10 Validation: SUCCESS! -The output gives the average and best calculation time for the transposition with the derived metrics. +The output gives the average and best calculation time for the transposition and important derived metrics based on these times. For the average and best timings, we have the following columns: -- `calc`: Calculation time in seconds, which is the pure kernel execution time without data transfer from the host. -- `calc FLOPS`: Achieved FLOPS just considering the calculation time. -- `Net [GB/s]`: Used total network bandwidth in GB/s. 
-- `Mem [GB/s]`: Used total global memory bandwidth in GB/s. +- `total [s]`: Total execution time in seconds of a whole repetition of the experiment that includes transfer and calcuation time. +- `transfer [s]`: Time in seconds that is required to transfer the data buffers to a memory location that can be accessed by the kernels on the FPGA board. +- `calc [s]`: Time in seconds to execute a single repetition of the matrix transposition also including communication between devices. +- `calc FLOPS`: Derived floating-point operations per second based on the calculation time. +- `Mem [B/s]`: Derived bandwidth of the memory that is accessed by the FPGA kernels during calculation based on the calculation time. +- `PCIe [B/s]`: Derived bandwidth of the transfer interface that is used to copy the buffers to a memory location accessible by the FPGA based on the transfer time. The `Maximum Error` field shows the largest error that was computed. Since the arithmetic intensity of the algorithm is quite low and only one addition is required to calculate one value of the result matrix, the error should be close to the machine epsilon, which depends on the chosen data type. diff --git a/PTRANS/configs/Nallatech_520N_CPU.cmake b/PTRANS/configs/Nallatech_520N_CPU.cmake new file mode 100644 index 00000000..cbe60d34 --- /dev/null +++ b/PTRANS/configs/Nallatech_520N_CPU.cmake @@ -0,0 +1,20 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) + +set(CMAKE_CXX_FLAGS "-march=native" CACHE STRING "Additional flags used for every build type" FORCE) +set(CMAKE_C_FLAGS "-march=native" CACHE STRING "Additional flags used for every build type" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE) +set(BLOCK_SIZE 16 CACHE STRING "Block size used in the FPGA kernel" FORCE) +set(NUM_REPLICATIONS 1 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) diff --git a/PTRANS/configs/Nallatech_520N_pcie.cmake b/PTRANS/configs/Nallatech_520N_pcie.cmake new file mode 100644 index 00000000..e5a49184 --- /dev/null +++ b/PTRANS/configs/Nallatech_520N_pcie.cmake @@ -0,0 +1,20 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] -DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "p520_hpc_sg280l" CACHE STRING "" FORCE) +set(AOC_FLAGS "-fpc -fp-relaxed -no-interleaving=default" CACHE STRING "" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE) +set(BLOCK_SIZE 512 CACHE STRING "Block size used in the FPGA kernel" FORCE) +set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. 
Also specifies the width of memory" FORCE) +set(NUM_REPLICATIONS 4 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) diff --git a/PTRANS/configs/Xilinx_U250_DDR_PCIE.cmake b/PTRANS/configs/Xilinx_U250_DDR_PCIE.cmake new file mode 100644 index 00000000..6f446e97 --- /dev/null +++ b/PTRANS/configs/Xilinx_U250_DDR_PCIE.cmake @@ -0,0 +1,23 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] -DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u250_xdma_201830_2" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_pcie.u250.generator.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.u250.ini CACHE FILEPATH "" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE) +set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE) +set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. 
Also specifies the width of memory" FORCE) +set(NUM_REPLICATIONS 4 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/PTRANS/configs/Xilinx_U250_DDR_PCIE_unroll.cmake b/PTRANS/configs/Xilinx_U250_DDR_PCIE_unroll.cmake new file mode 100644 index 00000000..533a44bf --- /dev/null +++ b/PTRANS/configs/Xilinx_U250_DDR_PCIE_unroll.cmake @@ -0,0 +1,24 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] -DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u250_xdma_201830_2" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_pcie.u250.generator.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.u250.ini CACHE FILEPATH "" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE) +set(BLOCK_SIZE 64 CACHE STRING "Block size used in the FPGA kernel" FORCE) +set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory" FORCE) +set(NUM_REPLICATIONS 4 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) +set(XILINX_UNROLL_INNER_LOOPS Yes CACHE BOOL "When building for Xilinx devices, unroll the inner loops to create a single pipeline per block and keep memory bursts. This is a tradeoff between resource usage and performance." 
FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake b/PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake new file mode 100644 index 00000000..46ef245c --- /dev/null +++ b/PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake @@ -0,0 +1,23 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] -DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini CACHE FILEPATH "" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE) +set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE) +set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/PTRANS/scripts/build_520n_pcie.sh b/PTRANS/scripts/build_520n_pcie.sh new file mode 100644 index 00000000..09de20c2 --- /dev/null +++ b/PTRANS/scripts/build_520n_pcie.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# +# Synthesize the PTRANS kernel for the Nallatech 520N board. +# This is an example script, how the synthesis can be started on Noctua using a HPCC FPGA configuration file. 
+# Submit this script to sbatch in this folder! +# +#SBATCH -p fpgasyn +#SBATCH -J PTRANS + +module load intelFPGA_pro/20.4.0 +module load nalla_pcie/19.4.0_hpc +module load intel +module load devel/CMake/3.15.3-GCCcore-8.3.0 + +SCRIPT_PATH=${SLURM_SUBMIT_DIR} + +BENCHMARK_DIR=${SCRIPT_PATH}/../ + +SYNTH_DIR=${PFS_SCRATCH}/synth/520n/multi_fpga/PTRANS/pq_pcie + +CONFIG_NAMES=("Nallatech_520N_pcie") + +for r in "${CONFIG_NAMES[@]}"; do + BUILD_DIR=${SYNTH_DIR}/20.4.0-19.4.0_hpc-${r} + + mkdir -p ${BUILD_DIR} + cd ${BUILD_DIR} + + cmake ${BENCHMARK_DIR} -DCMAKE_BUILD_TYPE=Release -DHPCC_FPGA_CONFIG=${BENCHMARK_DIR}/configs/${r}.cmake + + make transpose_PQ_PCIE_intel Transpose_intel +done diff --git a/PTRANS/scripts/build_u250.sh b/PTRANS/scripts/build_u250.sh new file mode 100644 index 00000000..49da9930 --- /dev/null +++ b/PTRANS/scripts/build_u250.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +SCRIPT_PATH=${PWD} + +BENCHMARK_DIR=${SCRIPT_PATH}/../ + +SYNTH_DIR=/mnt/local/meyermar/synth/u250/PTRANS + +CONFIG_NAMES=("Xilinx_U250_DDR_PCIE") + +for r in "${CONFIG_NAMES[@]}"; do + BUILD_DIR=${SYNTH_DIR}/${r} + + mkdir -p ${BUILD_DIR} + cd ${BUILD_DIR} + + cmake ${BENCHMARK_DIR} -DCMAKE_BUILD_TYPE=Release -DHPCC_FPGA_CONFIG=${BENCHMARK_DIR}/configs/${r}.cmake + + make transpose_PQ_PCIE_xilinx Transpose_xilinx +done diff --git a/PTRANS/scripts/build_u250_unroll.sh b/PTRANS/scripts/build_u250_unroll.sh new file mode 100644 index 00000000..01f7ada4 --- /dev/null +++ b/PTRANS/scripts/build_u250_unroll.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +SCRIPT_PATH=${PWD} + +BENCHMARK_DIR=${SCRIPT_PATH}/../ + +SYNTH_DIR=/mnt/local/meyermar/synth/u250/PTRANS + +CONFIG_NAMES=("Xilinx_U250_DDR_PCIE_unroll") + +for r in "${CONFIG_NAMES[@]}"; do + BUILD_DIR=${SYNTH_DIR}/${r} + + mkdir -p ${BUILD_DIR} + cd ${BUILD_DIR} + + cmake ${BENCHMARK_DIR} -DCMAKE_BUILD_TYPE=Release -DHPCC_FPGA_CONFIG=${BENCHMARK_DIR}/configs/${r}.cmake + + make transpose_PQ_PCIE_xilinx Transpose_xilinx +done diff --git 
a/PTRANS/scripts/build_u280_alveo.sh b/PTRANS/scripts/build_u280_alveo.sh new file mode 100644 index 00000000..a175d755 --- /dev/null +++ b/PTRANS/scripts/build_u280_alveo.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +SCRIPT_PATH=${PWD} + +BENCHMARK_DIR=${SCRIPT_PATH}/../ + +SYNTH_DIR=/mnt/scratch/meyermar/synth/u280/PTRANS + +CONFIG_NAMES=("Xilinx_U280_DDR_PCIE") + +for r in "${CONFIG_NAMES[@]}"; do + BUILD_DIR=${SYNTH_DIR}/${r} + + mkdir -p ${BUILD_DIR} + cd ${BUILD_DIR} + + cmake ${BENCHMARK_DIR} -DCMAKE_BUILD_TYPE=Release -DHPCC_FPGA_CONFIG=${BENCHMARK_DIR}/configs/${r}.cmake + + make transpose_PQ_PCIE_xilinx Transpose_xilinx +done diff --git a/PTRANS/scripts/build_u280_alveo_ddr_singleloop.sh b/PTRANS/scripts/build_u280_alveo_ddr_singleloop.sh new file mode 100644 index 00000000..2e011ab3 --- /dev/null +++ b/PTRANS/scripts/build_u280_alveo_ddr_singleloop.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +SCRIPT_PATH=${PWD} + +BENCHMARK_DIR=${SCRIPT_PATH}/../ + +SYNTH_DIR=/mnt/scratch/meyermar/synth/u280/PTRANS + +CONFIG_NAMES=("Xilinx_U280_DDR_PCIE") + +for r in "${CONFIG_NAMES[@]}"; do + BUILD_DIR=${SYNTH_DIR}/${r}-singleloop + + mkdir -p ${BUILD_DIR} + cd ${BUILD_DIR} + + cmake ${BENCHMARK_DIR} -DCMAKE_BUILD_TYPE=Release -DHPCC_FPGA_CONFIG=${BENCHMARK_DIR}/configs/${r}.cmake + + make transpose_PQ_PCIE_singleloop_xilinx Transpose_xilinx +done diff --git a/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini b/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini new file mode 100644 index 00000000..bce5c3ff --- /dev/null +++ b/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini @@ -0,0 +1,2 @@ +kernel_frequency=300 + diff --git a/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini b/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini new file mode 100644 index 00000000..7e52533c --- /dev/null +++ b/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini @@ -0,0 +1,4 @@ +kernel_frequency=450 + +[hls] 
+max_memory_ports=all diff --git a/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.u250.ini b/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.u250.ini new file mode 100644 index 00000000..a1334eb2 --- /dev/null +++ b/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.u250.ini @@ -0,0 +1,4 @@ +kernel_frequency=300 + +[hls] +max_memory_ports=all diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini new file mode 100644 index 00000000..882d5af1 --- /dev/null +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini @@ -0,0 +1,19 @@ + +# Set number of available SLRs +# PY_CODE_GEN num_slrs = 3 +# PY_CODE_GEN num_ddrs = 2 + +[connectivity] +nk=transpose0:$PY_CODE_GEN num_replications$ + +# Assign kernels to the SLRs +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +slr=transpose0_$PY_CODE_GEN i + 1$:SLR$PY_CODE_GEN i % num_slrs$ +# PY_CODE_GEN block_end + +# Assign the kernels to the memory ports +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem0:DDR[$PY_CODE_GEN i % num_ddrs$] +sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem1:DDR[$PY_CODE_GEN i % num_ddrs$] +sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem2:DDR[$PY_CODE_GEN i % num_ddrs$] +# PY_CODE_GEN block_end diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.u250.generator.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.u250.generator.ini new file mode 100644 index 00000000..f5fbbfa7 --- /dev/null +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.u250.generator.ini @@ -0,0 +1,17 @@ + +# Set number of available SLRs +# PY_CODE_GEN num_slrs = 4 +# PY_CODE_GEN num_ddrs = 4 + +[connectivity] +nk=transpose0:$PY_CODE_GEN num_replications$ + +# Assign kernels to the SLRs +# PY_CODE_GEN block_start 
[replace(local_variables=locals()) for i in range(num_replications)] +slr=transpose0_$PY_CODE_GEN i + 1$:SLR$PY_CODE_GEN i % num_slrs$ +# PY_CODE_GEN block_end + +# Assign the kernels to the memory ports +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem:DDR[$PY_CODE_GEN i % num_ddrs$] +# PY_CODE_GEN block_end diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie_spread_banks.hbm.generator.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie_spread_banks.hbm.generator.ini new file mode 100644 index 00000000..bda2578e --- /dev/null +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie_spread_banks.hbm.generator.ini @@ -0,0 +1,19 @@ + +# Set number of available SLRs +# PY_CODE_GEN num_slrs = 3 +# PY_CODE_GEN num_hbms = 32 + +[connectivity] +nk=transpose0:$PY_CODE_GEN num_replications$ + +# Assign kernels to the SLRs +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +slr=transpose0_$PY_CODE_GEN i + 1$:SLR$PY_CODE_GEN i % num_slrs$ +# PY_CODE_GEN block_end + +# Assign the kernels to the memory ports +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[$PY_CODE_GEN (3*i) % num_hbms$] +sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[$PY_CODE_GEN (3*i + 1) % num_hbms$] +sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[$PY_CODE_GEN (3*i + 2) % num_hbms$] +# PY_CODE_GEN block_end diff --git a/PTRANS/src/common/parameters.h.in b/PTRANS/src/common/parameters.h.in index 1ea39a89..cccac277 100644 --- a/PTRANS/src/common/parameters.h.in +++ b/PTRANS/src/common/parameters.h.in @@ -7,6 +7,8 @@ #define WRITE_KERNEL_NAME "@WRITE_KERNEL_NAME@" #define DEFAULT_REPETITIONS @DEFAULT_REPETITIONS@ #define DEFAULT_MATRIX_SIZE @DEFAULT_MATRIX_SIZE@ +#define DEFAULT_COMM_TYPE "@DEFAULT_COMM_TYPE@" +#define DEFAULT_DIST_TYPE 
"@DEFAULT_DIST_TYPE@" #define DEFAULT_PLATFORM @DEFAULT_PLATFORM@ #define DEFAULT_DEVICE @DEFAULT_DEVICE@ @@ -22,6 +24,8 @@ #define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@ #cmakedefine USE_SVM +#cmakedefine USE_BUFFER_WRITE_RECT_FOR_A +#cmakedefine XILINX_UNROLL_INNER_LOOPS /* Short description of the program. diff --git a/PTRANS/src/device/CMakeLists.txt b/PTRANS/src/device/CMakeLists.txt index 4d5f9338..7542a861 100644 --- a/PTRANS/src/device/CMakeLists.txt +++ b/PTRANS/src/device/CMakeLists.txt @@ -1,17 +1,19 @@ +set(KERNEL_REPLICATION_ENABLED Yes CACHE INTERNAL "Enables kernel replication in the CMake target generation function") + include(${CMAKE_SOURCE_DIR}/../cmake/kernelTargets.cmake) if (INTELFPGAOPENCL_FOUND) - generate_kernel_targets_intel(transpose_diagonal transpose_diagonal_c2) - add_test(NAME test_emulation_diagonal_intel COMMAND Transpose_intel -f transpose_diagonal_emulate.aocx -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) - add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Transpose_intel -f transpose_diagonal_emulate.aocx -n 1 -m 1 + generate_kernel_targets_intel(transpose_DIAG_IEC transpose_c2_DIAG_IEC transpose_PQ_PCIE transpose_DIAG_PCIE transpose_PQ_IEC) + add_test(NAME test_emulation_diagonal_intel COMMAND Transpose_intel -f transpose_DIAG_IEC_emulate.aocx -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Transpose_intel -f transpose_DIAG_IEC_emulate.aocx -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() if (VITIS_FOUND) - generate_kernel_targets_xilinx(transpose_diagonal) - add_test(NAME test_emulation_diagonal_xilinx COMMAND Transpose_xilinx -f transpose_diagonal_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) - add_test(NAME test_output_parsing_xilinx COMMAND 
${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Transpose_xilinx -f transpose_diagonal_emulate.xclbin -n 1 -m 1 + generate_kernel_targets_xilinx(transpose_PQ_PCIE transpose_DIAG_PCIE) + add_test(NAME test_emulation_PQ_PCIE_xilinx COMMAND Transpose_xilinx -f transpose_PQ_PCIE_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Transpose_xilinx -f transpose_PQ_PCIE_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() diff --git a/PTRANS/src/device/transpose_diagonal.cl b/PTRANS/src/device/transpose_DIAG_IEC.cl similarity index 100% rename from PTRANS/src/device/transpose_diagonal.cl rename to PTRANS/src/device/transpose_DIAG_IEC.cl diff --git a/PTRANS/src/device/transpose_DIAG_PCIE.cl b/PTRANS/src/device/transpose_DIAG_PCIE.cl new file mode 100644 index 00000000..614800f3 --- /dev/null +++ b/PTRANS/src/device/transpose_DIAG_PCIE.cl @@ -0,0 +1,175 @@ +/****************************************************************************** + * Author: Arjun Ramaswami + * + * Edited by Marius Meyer: + * - Adapt to used kernel signature + * - Change to row-column loop structure + *****************************************************************************/ + +#include "parameters.h" + +/** +* Load a block of A into local memory in a reordered fashion +* to transpose it half-way +* +* +* @param A Buffer for matrix A +* @param local_buffer The local memory buffer the block is stored into +* @param current_block Index of the current block used to calculate the offset in global memory +* +*/ +void +load_chunk_of_a(__global DEVICE_DATA_TYPE *restrict A, + DEVICE_DATA_TYPE local_buffer[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], + const ulong current_block, + const ulong chunk) { + + ulong local_mem_converted_row = chunk; + + DEVICE_DATA_TYPE rotate_in[CHANNEL_WIDTH]; + + ulong load_address = 
current_block * BLOCK_SIZE * BLOCK_SIZE + chunk * CHANNEL_WIDTH; + + // Blocks of a will be stored columnwise in global memory +__attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + rotate_in[unroll_count] = A[load_address + unroll_count]; + } + + unsigned rot = (chunk / (BLOCK_SIZE / CHANNEL_WIDTH)) & (CHANNEL_WIDTH - 1); + + // rotate temporary buffer to store data into local buffer +__attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + // every block of (N / CHANNEL_WIDTH), rotates the index by 1 + // store in double buffer + local_buffer[local_mem_converted_row][unroll_count] = rotate_in[(unroll_count + CHANNEL_WIDTH - rot) + & (CHANNEL_WIDTH - 1)]; + } +} + +/** +* send a chunk of A into local memory in a reordered fashion +* to transpose it half-way +* +* +* @param A Buffer for matrix A +* @param local_buffer The local memory buffer the block is stored into +* @param current_block Index of the current block used to calculate the offset in global memory +* +*/ +void +load_chunk_of_trans_a(const DEVICE_DATA_TYPE local_buffer[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], + DEVICE_DATA_TYPE chunk_out[CHANNEL_WIDTH], + const ulong chunk) { + + DEVICE_DATA_TYPE rotate_out[CHANNEL_WIDTH]; + + ulong base = (chunk & (BLOCK_SIZE / CHANNEL_WIDTH - 1)) * BLOCK_SIZE; + ulong offset = (chunk / (BLOCK_SIZE / CHANNEL_WIDTH)) / CHANNEL_WIDTH; + + +__attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + unsigned rot = ((CHANNEL_WIDTH + unroll_count - (chunk / (BLOCK_SIZE / CHANNEL_WIDTH))) * (BLOCK_SIZE / CHANNEL_WIDTH)) & + (BLOCK_SIZE - 1); + unsigned row_rotate = base + offset + rot; + rotate_out[unroll_count] = local_buffer[row_rotate][unroll_count]; + } + + unsigned rot_out = (chunk / (BLOCK_SIZE / CHANNEL_WIDTH)) & 
(CHANNEL_WIDTH - 1); + + // rotate temporary buffer to store data into local buffer + __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + chunk_out[unroll_count] = rotate_out[(unroll_count + rot_out) & (CHANNEL_WIDTH - 1)]; + } + + +} + +void +add_a_and_b(__global DEVICE_DATA_TYPE *restrict B, + const DEVICE_DATA_TYPE local_buffer_a[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], + DEVICE_DATA_TYPE local_buffer_a_plus_b[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], + const ulong block, + const ulong chunk) { + + DEVICE_DATA_TYPE data_chunk[CHANNEL_WIDTH]; + + load_chunk_of_trans_a(local_buffer_a, data_chunk, chunk); + + ulong ls_address = block * BLOCK_SIZE * BLOCK_SIZE + chunk * CHANNEL_WIDTH; + // load tranposed A from global memory + __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + data_chunk[unroll_count] += B[ls_address + unroll_count]; + } + + __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + local_buffer_a_plus_b[chunk][unroll_count] = data_chunk[unroll_count]; + } +} + + +void +store_a(__global DEVICE_DATA_TYPE *restrict A_out, + const DEVICE_DATA_TYPE local_buffer_a_plus_b[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], + const ulong block, + const ulong chunk) { + + ulong ls_address = block * BLOCK_SIZE * BLOCK_SIZE + chunk * CHANNEL_WIDTH; + + __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + A_out[ls_address + unroll_count] = local_buffer_a_plus_b[chunk][unroll_count]; + } +} + +// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] + +/** + * Read blocks of matrix A and transpose them in memory. + * Write the block into an external channel. 
+ * + * Will do the following: + * + * A -> trans(A) -> ext. ch + * + * @param A Buffer for matrix A + * @param B Buffer for matrix B + * @param A_out Buffer for result matrix + * @param number_of_blocks The number of blocks that will be processed starting from the block offset + */ +__attribute__((max_global_work_dim(0))) +__kernel +void transpose/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, + __global DEVICE_DATA_TYPE *restrict B, + __global DEVICE_DATA_TYPE *restrict A_out, + const uint number_of_blocks) { + + // local memory double buffer for a matrix block + DEVICE_DATA_TYPE a_block[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH] __attribute__((xcl_array_partition(cyclic, CHANNEL_WIDTH,1))) __attribute__((xcl_array_partition(cyclic, CHANNEL_WIDTH,2))); + // local memory double buffer for a matrix block + DEVICE_DATA_TYPE a_plus_b_block[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH] __attribute__((xcl_array_partition(cyclic, CHANNEL_WIDTH,1))) __attribute__((xcl_array_partition(cyclic, CHANNEL_WIDTH,2))); + + // transpose the matrix block-wise from global memory + #pragma loop_coalesce + for (uint block = 0; block < number_of_blocks; block++) { + // Combine all three steps in single pipeline to reduce overhead + for (uint chunk = 0; chunk < 3 * BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH; chunk++) { + uint current_chunk = chunk & (BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH - 1); + switch (chunk / (BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH)) { + // read in block of A from global memory and store it in a memory efficient manner for transpose + case 0: load_chunk_of_a(A, a_block, block, current_chunk); break; + // read transposed block of A from local memory buffer and add B from global memory to it + case 1: add_a_and_b(B, a_block, a_plus_b_block, block, current_chunk); break; + // Store result in global memory + case 2: store_a(A_out, a_plus_b_block, block, current_chunk); break; + } + } + } +} + +// PY_CODE_GEN block_end diff --git 
a/PTRANS/src/device/transpose_PQ_IEC.cl b/PTRANS/src/device/transpose_PQ_IEC.cl new file mode 100644 index 00000000..e219ae1c --- /dev/null +++ b/PTRANS/src/device/transpose_PQ_IEC.cl @@ -0,0 +1,205 @@ +/****************************************************************************** + * Author: Arjun Ramaswami + * + * Edited by Marius Meyer: + * - Adapt to used kernel signature + * - Change to row-column loop structure + *****************************************************************************/ + +#include "parameters.h" + +// Need some depth to our channels to accommodate their bursty filling. +#ifdef INTEL_FPGA +#pragma OPENCL EXTENSION cl_intel_channels : enable + +typedef struct { + DEVICE_DATA_TYPE data[CHANNEL_WIDTH]; +} ch_data; + +// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_total_replications)] +// Channel used to send the transposed blocks of A +channel ch_data chan_a_out/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_output_ch" + str(i) + "\""*/), depth(1))); +channel ch_data chan_a_in/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_input_ch" + str(2 * (i // 2) + ((i + 1) % 2)) + "\""*/), depth(1))); +// PY_CODE_GEN block_end +#endif + +/** +* Load a block of A into local memory in a reordered fashion +* to transpose it half-way +* +* +* @param A Buffer for matrix A +* @param local_buffer The local memory buffer the block is stored into +* @param current_block Index of the current block used to calculate the offset in global memory +* +*/ +void +load_chunk_of_a(__global DEVICE_DATA_TYPE *restrict A, + DEVICE_DATA_TYPE local_buffer[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], + const ulong block_row, + const ulong block_col, + const ulong width_in_blocks, + const ulong row, + const ulong col) { + + ulong local_mem_converted_row = row * (BLOCK_SIZE / CHANNEL_WIDTH) + col; + + DEVICE_DATA_TYPE rotate_in[CHANNEL_WIDTH]; + + ulong load_address = block_row * BLOCK_SIZE * BLOCK_SIZE * 
width_in_blocks + + block_col * BLOCK_SIZE + + row * BLOCK_SIZE * width_in_blocks + + col * CHANNEL_WIDTH; + + // Blocks of a will be stored columnwise in global memory +__attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + rotate_in[unroll_count] = A[load_address + unroll_count]; + } + + unsigned rot = row & (CHANNEL_WIDTH - 1); + + // rotate temporary buffer to store data into local buffer +__attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + // every block of (N / CHANNEL_WIDTH), rotates the index by 1 + // store in double buffer + local_buffer[local_mem_converted_row][unroll_count] = rotate_in[(unroll_count + CHANNEL_WIDTH - rot) + & (CHANNEL_WIDTH - 1)]; + } +} + +// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_total_replications)] + +/** +* send a chunk of A into local memory in a reordered fashion +* to transpose it half-way +* +* +* @param A Buffer for matrix A +* @param local_buffer The local memory buffer the block is stored into +* @param current_block Index of the current block used to calculate the offset in global memory +* +*/ +void +send_chunk_of_a/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE local_buffer[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], + const ulong row, + const ulong col) { + + DEVICE_DATA_TYPE rotate_out[CHANNEL_WIDTH]; + + ulong base = col * BLOCK_SIZE; + ulong offset = row / CHANNEL_WIDTH; + + +__attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + unsigned rot = ((CHANNEL_WIDTH + unroll_count - row) * (BLOCK_SIZE / CHANNEL_WIDTH)) & + (BLOCK_SIZE - 1); + unsigned row_rotate = base + offset + rot; + rotate_out[unroll_count] = local_buffer[row_rotate][unroll_count]; + } + + unsigned rot_out = row & (CHANNEL_WIDTH - 1); + + ch_data data; + // rotate 
temporary buffer to store data into local buffer +__attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + data.data[unroll_count] = rotate_out[(unroll_count + rot_out) & (CHANNEL_WIDTH - 1)]; + } + + write_channel_intel(chan_a_out/*PY_CODE_GEN i*/, data); +} + +/** + * Read blocks of matrix A and transpose them in memory. + * Write the block into an external channel. + * + * Will do the following: + * + * A -> trans(A) -> ext. ch + * + * @param A Buffer for matrix A + * @param block_offset The first block that will be processed in the provided buffer + * @param number_of_blocks The number of blocks that will be processed starting from the block offset + */ +__attribute__((max_global_work_dim(0))) +__kernel +void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, + const ulong offset, + const ulong width_in_blocks, + const ulong height_in_blocks, + const ulong number_of_blocks) { + + // local memory double buffer for a matrix block + DEVICE_DATA_TYPE a_block[2][BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH] __attribute__((xcl_array_partition(cyclic, CHANNEL_WIDTH,1))) __attribute__((xcl_array_partition(cyclic, CHANNEL_WIDTH,2))); + + // transpose the matrix block-wise from global memory + // One extra iteration to empty double buffer + #pragma loop_coalesce + for (ulong block = offset; block < number_of_blocks + offset + 1; block++) { + // read in block from global memory and store it in a memory efficient manner + for (ulong row = 0; row < BLOCK_SIZE; row++) { + for (ulong col = 0; col < BLOCK_SIZE / CHANNEL_WIDTH; col++) { + if (block < number_of_blocks + offset) { + ulong block_col = block / height_in_blocks; + ulong block_row = block % height_in_blocks; + load_chunk_of_a(A, a_block[block & 1], block_row, block_col, width_in_blocks, row, col); + } + if (block > offset) { + send_chunk_of_a/*PY_CODE_GEN i*/(a_block[(block - 1) & 1], row, col); + } + } + } + } 
+} + +/** + * Will add a matrix from external channel and matrix from global memory and store result in global memory. + * + * Will do the following: + * + * ext. ch + B --> A_out + * + * where A_out, ext. ch and B are matrices of size matrixSize*matrixSize + * + * @param B Buffer for matrix B + * @param A_out Output buffer for result matrix + * @param block_offset The first block that will be processed in the provided buffer + * @param number_of_blocks The number of blocks that will be processed starting from the block offset + */ +__attribute__((max_global_work_dim(0))) +__kernel +void transpose_write/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict B, + __global DEVICE_DATA_TYPE *restrict A_out, + const ulong offset, + const ulong width_in_blocks, + const ulong number_of_blocks) { + + #pragma loop_coalesce + for (ulong block = offset; block < number_of_blocks + offset; block++) { + // complete matrix transposition and write the result back to global memory + for (ulong row = 0; row < BLOCK_SIZE; row++) { + for (ulong col = 0; col < BLOCK_SIZE / CHANNEL_WIDTH; col++) { + + ch_data data = read_channel_intel(chan_a_in/*PY_CODE_GEN i*/); + + ulong block_col = block % width_in_blocks; + ulong block_row = block / width_in_blocks; + + // rotate temporary buffer to store data into local buffer +__attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + ulong ls_address = block_row * BLOCK_SIZE * BLOCK_SIZE * width_in_blocks + + block_col * BLOCK_SIZE + + row * BLOCK_SIZE * width_in_blocks + + col * CHANNEL_WIDTH + unroll_count; + A_out[ls_address] = data.data[unroll_count] + B[ls_address]; + } + } + } + } +} + +// PY_CODE_GEN block_end diff --git a/PTRANS/src/device/transpose_PQ_PCIE.cl b/PTRANS/src/device/transpose_PQ_PCIE.cl new file mode 100644 index 00000000..5e8fb034 --- /dev/null +++ b/PTRANS/src/device/transpose_PQ_PCIE.cl @@ -0,0 +1,185 @@ 
+/****************************************************************************** + * Author: Arjun Ramaswami + * + * Edited by Marius Meyer: + * - Adapt to used kernel signature + * - Change to row-column loop structure + *****************************************************************************/ + +#include "parameters.h" + +// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] + +/** + * Read blocks of matrix A and transpose them in memory. + * Write the block into an external channel. + * + * Will do the following: + * + * A -> trans(A) -> ext. ch + * + * @param A Buffer for matrix A + * @param B Buffer for matrix B + * @param A_out Buffer for result matrix + * @param offset Offset in blocks that is used to read the current block of A. Since A is read column-wise + on the block level, the whole matrix A might be written to global memory and the relevant columns + need to be picked using this offset. + * @param number_of_blocks The number of blocks that will be processed starting from the block offset + * @param width_in_blocks The with of matrix A in blocks + * @param height_in_blocks The height of matix A in blocks + */ +__attribute__((max_global_work_dim(0))) +__kernel +void transpose/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, + __global DEVICE_DATA_TYPE *restrict B, + __global DEVICE_DATA_TYPE *restrict A_out, + const uint offset, + const uint number_of_blocks, + const uint width_in_blocks, + const uint height_in_blocks) { + + // local memory double buffer for a matrix block + DEVICE_DATA_TYPE a_block[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH] __attribute__((xcl_array_partition(cyclic, CHANNEL_WIDTH,1))) __attribute__((xcl_array_partition(cyclic, CHANNEL_WIDTH,2))); + // local memory double buffer for a matrix block + DEVICE_DATA_TYPE a_plus_b_block[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH] __attribute__((xcl_array_partition(cyclic, CHANNEL_WIDTH,1))) 
__attribute__((xcl_array_partition(cyclic, CHANNEL_WIDTH,2))); + + // transpose the matrix block-wise from global memory + for (uint block = 0; block < number_of_blocks; block++) { +#ifdef INTEL_FPGA + // Load A to local memory + #pragma loop_coalesce +#endif +#ifdef XILINX_FPGA +#ifdef XILINX_UNROLL_INNER_LOOPS + __attribute__((xcl_pipeline_loop(BLOCK_SIZE / CHANNEL_WIDTH))) +#endif +#endif + for (uint row = 0; row < BLOCK_SIZE; row++) { +#ifdef XILINX_FPGA +#ifndef XILINX_UNROLL_INNER_LOOPS + __attribute__((xcl_pipeline_loop(1))) +#endif +#endif + for (uint col = 0; col < BLOCK_SIZE / CHANNEL_WIDTH; col++) { + ulong block_row_a = (block + offset) / width_in_blocks; + ulong block_col_a = (block + offset) % width_in_blocks; + ulong ls_address_trans = block_col_a * BLOCK_SIZE * BLOCK_SIZE * height_in_blocks + + block_row_a * BLOCK_SIZE + + row * BLOCK_SIZE * height_in_blocks; + + // read in block of A from global memory and store it in a memory efficient manner for transpose + DEVICE_DATA_TYPE rotate_in[CHANNEL_WIDTH]; + + // Blocks of a will be stored columnwise in global memory + __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + rotate_in[unroll_count] = A[ls_address_trans + col * CHANNEL_WIDTH + unroll_count]; + } + + uint chunk = row * (BLOCK_SIZE / CHANNEL_WIDTH) + col; + + unsigned rot = (row) & (CHANNEL_WIDTH - 1); + + // rotate temporary buffer to store data into local buffer + __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + // every block of (N / CHANNEL_WIDTH), rotates the index by 1 + // store in double buffer + a_block[chunk][unroll_count] = rotate_in[(unroll_count + CHANNEL_WIDTH - rot) + & (CHANNEL_WIDTH - 1)]; + } + } + } + + // Read transposed A from local memory and add B +#ifdef INTEL_FPGA + // Load A to local memory + #pragma loop_coalesce +#endif +#ifdef XILINX_FPGA +#ifdef 
XILINX_UNROLL_INNER_LOOPS + __attribute__((xcl_pipeline_loop(BLOCK_SIZE / CHANNEL_WIDTH))) +#endif +#endif + for (uint row = 0; row < BLOCK_SIZE; row++) { +#ifdef XILINX_FPGA +#ifndef XILINX_UNROLL_INNER_LOOPS + __attribute__((xcl_pipeline_loop(1))) +#endif +#endif + for (uint col = 0; col < BLOCK_SIZE / CHANNEL_WIDTH; col++) { + ulong block_row = block / width_in_blocks; + ulong block_col = block % width_in_blocks; + ulong ls_address_row = block_row * BLOCK_SIZE * BLOCK_SIZE * width_in_blocks + + block_col * BLOCK_SIZE + + row * BLOCK_SIZE * width_in_blocks; + uint chunk = row * (BLOCK_SIZE / CHANNEL_WIDTH) + col; + + DEVICE_DATA_TYPE data_chunk[CHANNEL_WIDTH]; + DEVICE_DATA_TYPE rotate_out[CHANNEL_WIDTH]; + + uint base = col * BLOCK_SIZE; + uint offset = row / CHANNEL_WIDTH; + + __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + unsigned rot = ((CHANNEL_WIDTH + unroll_count - row) * (BLOCK_SIZE / CHANNEL_WIDTH)) & + (BLOCK_SIZE - 1); + unsigned row_rotate = base + offset + rot; + rotate_out[unroll_count] = a_block[row_rotate][unroll_count]; + } + + unsigned rot_out = row & (CHANNEL_WIDTH - 1); + + // rotate temporary buffer to store data into local buffer + __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + data_chunk[unroll_count] = rotate_out[(unroll_count + rot_out) & (CHANNEL_WIDTH - 1)]; + } + + // load tranposed A from global memory + __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + data_chunk[unroll_count] += B[ls_address_row + col * CHANNEL_WIDTH + unroll_count]; + } + + __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + a_plus_b_block[chunk][unroll_count] = data_chunk[unroll_count]; + } + } + } + // Write back result 
+#ifdef INTEL_FPGA + // Load A to local memory + #pragma loop_coalesce +#endif +#ifdef XILINX_FPGA +#ifdef XILINX_UNROLL_INNER_LOOPS + __attribute__((xcl_pipeline_loop(BLOCK_SIZE / CHANNEL_WIDTH))) +#endif +#endif + for (uint row = 0; row < BLOCK_SIZE; row++) { +#ifdef XILINX_FPGA +#ifndef XILINX_UNROLL_INNER_LOOPS + __attribute__((xcl_pipeline_loop(1))) +#endif +#endif + for (uint col = 0; col < BLOCK_SIZE / CHANNEL_WIDTH; col++) { + ulong block_row = block / width_in_blocks; + ulong block_col = block % width_in_blocks; + ulong ls_address_row = block_row * BLOCK_SIZE * BLOCK_SIZE * width_in_blocks + + block_col * BLOCK_SIZE + + row * BLOCK_SIZE * width_in_blocks; + uint chunk = row * (BLOCK_SIZE / CHANNEL_WIDTH) + col; + + __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) + for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH; unroll_count++) { + A_out[ls_address_row + col * CHANNEL_WIDTH + unroll_count] = a_plus_b_block[chunk][unroll_count]; + } + } + } + } +} + +// PY_CODE_GEN block_end diff --git a/PTRANS/src/device/transpose_diagonal_c2.cl b/PTRANS/src/device/transpose_c2_DIAG_IEC.cl similarity index 100% rename from PTRANS/src/device/transpose_diagonal_c2.cl rename to PTRANS/src/device/transpose_c2_DIAG_IEC.cl diff --git a/PTRANS/src/host/CMakeLists.txt b/PTRANS/src/host/CMakeLists.txt index 9d8f2c99..89b45ff8 100755 --- a/PTRANS/src/host/CMakeLists.txt +++ b/PTRANS/src/host/CMakeLists.txt @@ -1,10 +1,14 @@ add_subdirectory(../../../shared ${CMAKE_BINARY_DIR}/lib/hpccbase) -set(HOST_SOURCE execution_default.cpp transpose_benchmark.cpp transpose_handlers.cpp transpose_data.cpp) +set(HOST_SOURCE transpose_benchmark.cpp transpose_data.cpp) set(HOST_EXE_NAME Transpose) set(LIB_NAME trans) +set(BLA_VENDOR Intel10_64lp) +set(BLA_STATIC ON) +find_package(BLAS) + if (INTELFPGAOPENCL_FOUND) add_library(${LIB_NAME}_intel STATIC ${HOST_SOURCE}) target_include_directories(${LIB_NAME}_intel PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common 
${IntelFPGAOpenCL_INCLUDE_DIRS}) @@ -16,6 +20,12 @@ if (INTELFPGAOPENCL_FOUND) if (USE_SVM) target_compile_definitions(${LIB_NAME}_intel PRIVATE -DCL_VERSION_2_0) endif() + if (BLAS_FOUND) + target_link_libraries(${LIB_NAME}_intel ${BLAS_LIBRARIES}) + target_compile_definitions(${LIB_NAME}_intel PRIVATE -DMKL_FOUND) + target_compile_options(${LIB_NAME}_intel PRIVATE "${BLAS_LINKER_FLAGS}") + target_include_directories(${LIB_NAME}_intel PRIVATE "$ENV{MKL_ROOT}/include") + endif() target_compile_definitions(${LIB_NAME}_intel PRIVATE -DINTEL_FPGA) target_compile_options(${LIB_NAME}_intel PRIVATE "${OpenMP_CXX_FLAGS}") add_test(NAME test_intel_host_executable COMMAND $ -h) diff --git a/PTRANS/src/host/data_handlers/data_handler_types.h b/PTRANS/src/host/data_handlers/data_handler_types.h new file mode 100644 index 00000000..e15bb7b6 --- /dev/null +++ b/PTRANS/src/host/data_handlers/data_handler_types.h @@ -0,0 +1,85 @@ +/* +Copyright (c) 2021 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_DATA_HANDLER_TPYES_H_ +#define SRC_HOST_DATA_HANDLER_TPYES_H_ + +/* C++ standard library headers */ +#include + +namespace transpose { +namespace data_handler { + +/** + * @brief This enumeration contains all available data handler types. + * + */ +typedef enum _DataHandlerType { + + /** + * @brief The matrix is already blockwise diagonally distributed which only required data exchange with a single node + * + */ + diagonal, + + /** + * @brief Classical distribution of the matrix in a PQ grid + * + */ + pq, + + /** + * @brief Automatically detect distribution scheme from kernel file name + * + */ + automatic + + + +} DataHandlerType; + +static const std::map comm_to_str_map{ + {"DIAG", DataHandlerType::diagonal}, + {"PQ", DataHandlerType::pq}, + {"AUTO", DataHandlerType::automatic} + }; + +static std::string handlerToString(DataHandlerType c) { + for (auto& entry : comm_to_str_map) { + if (entry.second == c) { + return entry.first; + } + } + throw new std::runtime_error("Communication type could not be converted to string!"); +} + +static DataHandlerType stringToHandler(std::string comm_name) { + auto result = comm_to_str_map.find(comm_name); + if (result != comm_to_str_map.end()) { + return result->second; + } + throw new std::runtime_error("Communication type could not be converted from string: " + comm_name); +} + +} +} + +#endif \ No newline at end of file diff --git a/PTRANS/src/host/data_handlers/diagonal.hpp b/PTRANS/src/host/data_handlers/diagonal.hpp new file mode 100644 index 00000000..e1d72f3b --- /dev/null +++ b/PTRANS/src/host/data_handlers/diagonal.hpp @@ -0,0 +1,210 @@ +/* +Copyright (c) 2020 Marius Meyer + +Permission is hereby granted, free of charge, to any person 
obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#ifndef SRC_HOST_TRANSPOSE_HANDLER_DIAGONAL_HPP_ +#define SRC_HOST_TRANSPOSE_HANDLER_DIAGONAL_HPP_ + +/* C++ standard library headers */ +#include +#include + +/* Project's headers */ +#include "handler.hpp" + +/** + * @brief Contains all classes and methods needed by the Transpose benchmark + * + */ +namespace transpose { + + namespace data_handler { + +/** + * @brief Transposes the data over external channels, so every part of a pair is located on a different FPGA. + * Data will be distributed to the ranks such that only a fixed pair of ranks will communicate to exchange + * the missing data. e.g. for N ranks, the pairs will be (0, N/2), (1, N/2 + 1), ... 
+ * + */ +class DistributedDiagonalTransposeDataHandler : public TransposeDataHandler { + +private: + + /** + * @brief Number of diagonal ranks that will sent the blcoks to themselves + * + */ + int num_diagonal_ranks; + + /** + * @brief MPI data for matrix blocks + * + */ + MPI_Datatype data_block; + +public: + + /** + * @brief Generate data for transposition based on the implemented distribution scheme + * + * @param settings The execution settings that contain information about the data size + * @return std::unique_ptr The generated data + */ + std::unique_ptr + generateData(hpcc_base::ExecutionSettings& settings) override { + MPI_Type_contiguous(settings.programSettings->blockSize * settings.programSettings->blockSize, MPI_FLOAT, &data_block); + MPI_Type_commit(&data_block); + + int width_in_blocks = settings.programSettings->matrixSize / settings.programSettings->blockSize; + + int avg_blocks_per_rank = (width_in_blocks * width_in_blocks) / mpi_comm_size; + int avg_diagonal_blocks = width_in_blocks; + if (avg_blocks_per_rank > 0) { + avg_diagonal_blocks = (width_in_blocks / avg_blocks_per_rank); + } + num_diagonal_ranks = std::max(avg_diagonal_blocks, 1); + + if (num_diagonal_ranks % 2 != mpi_comm_size % 2) { + #ifndef NDEBUG + std::cout << "Rank " << mpi_comm_rank << ": Fail 1!" << std::endl; + #endif + // Abort if there is a too high difference in the number of matrix blocks between the MPI ranks + throw std::runtime_error("Matrix size and MPI ranks to not allow fair distribution of blocks! Increase or reduce the number of MPI ranks by 1."); + } + if ((mpi_comm_size - num_diagonal_ranks) % 2 != 0 || (mpi_comm_size - num_diagonal_ranks) == 0 && width_in_blocks > 1) { + #ifndef NDEBUG + std::cout << "Rank " << mpi_comm_rank << ": Fail 2!" << std::endl; + #endif + throw std::runtime_error("Not possible to create pairs of MPI ranks for lower and upper half of matrix. 
Increase number of MPI ranks!."); + } + bool this_rank_is_diagonal = mpi_comm_rank >= (mpi_comm_size - num_diagonal_ranks); + int blocks_if_diagonal = width_in_blocks / num_diagonal_ranks + ( (mpi_comm_rank - (mpi_comm_size - num_diagonal_ranks)) < (width_in_blocks % num_diagonal_ranks) ? 1 : 0); + int blocks_if_not_diagonal = 0; + if ((mpi_comm_size - num_diagonal_ranks) > 0 ) { + blocks_if_not_diagonal = (width_in_blocks * (width_in_blocks - 1)) / (mpi_comm_size - num_diagonal_ranks) + (mpi_comm_rank < ((width_in_blocks * (width_in_blocks - 1)) % (mpi_comm_size - num_diagonal_ranks)) ? 1 : 0); + } + + + int blocks_per_rank = (this_rank_is_diagonal) ? blocks_if_diagonal : blocks_if_not_diagonal; + + if (mpi_comm_rank == 0) { + std::cout << "Diag. blocks per rank: " << blocks_if_diagonal << std::endl; + std::cout << "Blocks per rank: " << blocks_if_not_diagonal << std::endl; + std::cout << "Loopback ranks for diagonal blocks: " << num_diagonal_ranks << std::endl; + } + // Height of a matrix generated for a single memory bank on a single MPI rank + int data_height_per_rank = blocks_per_rank * settings.programSettings->blockSize; + + #ifndef NDEBUG + std::cout << "Rank " << mpi_comm_rank << ": NumBlocks = " << blocks_per_rank << std::endl; + #endif + + // Allocate memory for a single device and all its memory banks + auto d = std::unique_ptr(new transpose::TransposeData(*settings.context, settings.programSettings->blockSize, blocks_per_rank)); + + // Fill the allocated memory with pseudo random values + std::mt19937 gen(mpi_comm_rank); + std::uniform_real_distribution<> dis(-100.0, 100.0); + for (size_t i = 0; i < data_height_per_rank; i++) { + for (size_t j = 0; j < settings.programSettings->blockSize; j++) { + d->A[i * settings.programSettings->blockSize + j] = dis(gen); + d->B[i * settings.programSettings->blockSize + j] = dis(gen); + d->result[i * settings.programSettings->blockSize + j] = 0.0; + } + } + + return d; + } + + /** + * @brief Exchange the data blocks 
for verification + * + * @param data The data that was generated locally and will be exchanged with other MPI ranks + * Exchanged data will be stored in the same object. + */ + void + exchangeData(TransposeData& data) override { + + #ifndef NDEBUG + // std::cout << "Start data exchange " << mpi_comm_rank << std::endl; + #endif + // Only need to exchange data, if rank has a partner + if (mpi_comm_rank < mpi_comm_size - num_diagonal_ranks) { + + int first_upper_half_rank = (mpi_comm_size - num_diagonal_ranks)/2; + int pair_rank = (mpi_comm_rank >= first_upper_half_rank) ? mpi_comm_rank - first_upper_half_rank : mpi_comm_rank + first_upper_half_rank; + + // To re-calculate the matrix transposition locally on this host, we need to + // exchange matrix A for every kernel replication + // The order of the matrix blocks does not change during the exchange, because they are distributed diagonally + // and will be handled in the order below: + // + // . . 1 3 + // . . . 2 + // 1 . . . + // 3 2 . . + MPI_Status status; + + size_t remaining_data_size = data.numBlocks; + size_t offset = 0; + while (remaining_data_size > 0) { + int next_chunk = (remaining_data_size > std::numeric_limits::max()) ? 
std::numeric_limits::max(): remaining_data_size; + MPI_Sendrecv(&data.A[offset], next_chunk, data_block, pair_rank, 0, &data.exchange[offset], next_chunk, data_block, pair_rank, 0, MPI_COMM_WORLD, &status); + + remaining_data_size -= next_chunk; + offset += static_cast(next_chunk) * static_cast(data.blockSize * data.blockSize); + } + + // Exchange window pointers + HOST_DATA_TYPE* tmp = data.exchange; + data.exchange = data.A; + data.A = tmp; + } + #ifndef NDEBUG + // std::cout << "End data exchange " << mpi_comm_rank << std::endl; + #endif + } + + void + reference_transpose(TransposeData& data) { + size_t block_offset = data.blockSize * data.blockSize; + for (size_t b = 0; b < data.numBlocks; b++) { + for (size_t i = 0; i < data.blockSize; i++) { + for (size_t j = 0; j < data.blockSize; j++) { + data.A[b * block_offset + j * data.blockSize + i] -= (data.result[b * block_offset + i * data.blockSize + j] + - data.B[b * block_offset + i * data.blockSize + j]); + } + } + } + } + + DistributedDiagonalTransposeDataHandler(int mpi_rank, int mpi_size): TransposeDataHandler(mpi_rank, mpi_size) { + if (mpi_rank >= mpi_size) { + throw std::runtime_error("MPI rank must be smaller the MPI world size!"); + } + } + +}; + +} +} + +#endif diff --git a/PTRANS/src/host/transpose_handlers.hpp b/PTRANS/src/host/data_handlers/handler.hpp similarity index 58% rename from PTRANS/src/host/transpose_handlers.hpp rename to PTRANS/src/host/data_handlers/handler.hpp index c30a7acd..fe1293fe 100644 --- a/PTRANS/src/host/transpose_handlers.hpp +++ b/PTRANS/src/host/data_handlers/handler.hpp @@ -27,20 +27,14 @@ SOFTWARE. 
#include /* Project's headers */ -#include "transpose_data.hpp" - -/** - * @brief String that identifies the transpose::DistributedExternalTransposeDataHandler - * - */ -#define TRANSPOSE_HANDLERS_DIST_DIAG "distdiag" +#include "../transpose_data.hpp" /** * @brief Contains all classes and methods needed by the Transpose benchmark * */ namespace transpose { - +namespace data_handler { /** * @brief The parallel matrix transposition is designed to support different kinds of data distribution. * This abstract class provides the necessary methods that need to be implemented for every data distribution scheme. @@ -85,6 +79,9 @@ class TransposeDataHandler { virtual void exchangeData(TransposeData& data) = 0; + virtual void + reference_transpose(TransposeData& data) = 0; + /** * @brief Construct a new Transpose Data Handler object and initialize the MPI rank and MPI size variables if MPI is used * @@ -93,69 +90,7 @@ class TransposeDataHandler { }; -#ifdef _USE_MPI_ - -/** - * @brief Transposes the data over external channels, so every part of a pair is located on a different FPGA. - * Data will be distributed to the ranks such that only a fixed pair of ranks will communicate to exchange - * the missing data. e.g. for N ranks, the pairs will be (0, N/2), (1, N/2 + 1), ... 
- * - */ -class DistributedDiagonalTransposeDataHandler : public transpose::TransposeDataHandler { - -private: - - /** - * @brief Number of diagonal ranks that will sent the blcoks to themselves - * - */ - int num_diagonal_ranks; - -public: - - /** - * @brief Generate data for transposition based on the implemented distribution scheme - * - * @param settings The execution settings that contain information about the data size - * @return std::unique_ptr The generated data - */ - std::unique_ptr - generateData(hpcc_base::ExecutionSettings& settings) override; - - /** - * @brief Exchange the data blocks for verification - * - * @param data The data that was generated locally and will be exchanged with other MPI ranks - * Exchanged data will be stored in the same object. - */ - void - exchangeData(TransposeData& data) override; - - DistributedDiagonalTransposeDataHandler(int mpi_rank, int mpi_size); - -}; - -#endif - -/** - * @brief Generate a data handler object - * - * @tparam T The class of the data handler object - * @return std::unique_ptr a unique poiinter to the generated data handler object - */ -template -std::unique_ptr -generateDataHandler(int rank, int size) { - return std::unique_ptr(new T(rank, size)); } - -/** - * @brief A map that contains the mapping from plain strings to the data handler object that should be used in the program - * - */ -extern std::map (*)(int rank, int size)> dataHandlerIdentifierMap; - - } #endif diff --git a/PTRANS/src/host/data_handlers/pq.hpp b/PTRANS/src/host/data_handlers/pq.hpp new file mode 100644 index 00000000..369cab31 --- /dev/null +++ b/PTRANS/src/host/data_handlers/pq.hpp @@ -0,0 +1,161 @@ +/* +Copyright (c) 2020 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, 
sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#ifndef SRC_HOST_TRANSPOSE_HANDLERS_PQ_HPP_ +#define SRC_HOST_TRANSPOSE_HANDLERS_PQ_HPP_ + +/* C++ standard library headers */ +#include + +/* Project's headers */ +#include "handler.hpp" + +/** + * @brief Contains all classes and methods needed by the Transpose benchmark + * + */ +namespace transpose { +namespace data_handler { + +class DistributedPQTransposeDataHandler : public TransposeDataHandler { + +private: + + /** + * @brief Number of diagonal ranks that will sent the blcoks to themselves + * + */ + int width_per_rank; + + int pq_row; + + int pq_col; + + int pq_width; + + MPI_Datatype data_block; + +public: + + /** + * @brief Generate data for transposition based on the implemented distribution scheme + * + * @param settings The execution settings that contain information about the data size + * @return std::unique_ptr The generated data + */ + std::unique_ptr + generateData(hpcc_base::ExecutionSettings& settings) override { + int width_in_blocks = settings.programSettings->matrixSize / settings.programSettings->blockSize; + + // A data block is strided! 
+ MPI_Type_contiguous(settings.programSettings->blockSize * settings.programSettings->blockSize, MPI_FLOAT, &data_block); + MPI_Type_commit(&data_block); + + width_per_rank = width_in_blocks / pq_width; + pq_row = mpi_comm_rank / pq_width; + pq_col = mpi_comm_rank % pq_width; + + int blocks_per_rank = width_per_rank * width_per_rank; + + // Allocate memory for a single device and all its memory banks + auto d = std::unique_ptr(new transpose::TransposeData(*settings.context, settings.programSettings->blockSize, blocks_per_rank)); + + // Fill the allocated memory with pseudo random values + std::mt19937 gen(mpi_comm_rank); + std::uniform_real_distribution<> dis(-100.0, 100.0); + for (size_t i = 0; i < blocks_per_rank * settings.programSettings->blockSize; i++) { + for (size_t j = 0; j < settings.programSettings->blockSize; j++) { + d->A[i * settings.programSettings->blockSize + j] = dis(gen); + d->B[i * settings.programSettings->blockSize + j] = dis(gen); + d->result[i * settings.programSettings->blockSize + j] = 0.0; + } + } + + return d; + } + + /** + * @brief Exchange the data blocks for verification + * + * @param data The data that was generated locally and will be exchanged with other MPI ranks + * Exchanged data will be stored in the same object. + */ + void + exchangeData(TransposeData& data) override { + + if (pq_col != pq_row) { + + int pair_rank = pq_width * pq_col + pq_row; + + // To re-calculate the matrix transposition locally on this host, we need to + // exchange matrix A for every kernel replication + // The order of the matrix blocks does not change during the exchange, because they are distributed diagonally + // and will be handled in the order below: + // + // . . 1 3 + // . . . 2 + // 1 . . . + // 3 2 . . + MPI_Status status; + + size_t remaining_data_size = data.numBlocks; + size_t offset = 0; + while (remaining_data_size > 0) { + int next_chunk = (remaining_data_size > std::numeric_limits::max()) ? 
std::numeric_limits::max(): remaining_data_size; + MPI_Sendrecv(&data.A[offset], next_chunk, data_block, pair_rank, 0, &data.exchange[offset], next_chunk, data_block, pair_rank, 0, MPI_COMM_WORLD, &status); + + remaining_data_size -= next_chunk; + offset += static_cast(next_chunk) * static_cast(data.blockSize * data.blockSize); + } + + // Exchange window pointers + HOST_DATA_TYPE* tmp = data.exchange; + data.exchange = data.A; + data.A = tmp; + } + + } + + void + reference_transpose(TransposeData& data) { + for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { + for (size_t j = 0; j < width_per_rank * data.blockSize; j++) { + data.A[j * width_per_rank * data.blockSize + i] -= (data.result[i * width_per_rank * data.blockSize + j] - data.B[i * width_per_rank * data.blockSize + j]); + } + } + } + + DistributedPQTransposeDataHandler(int mpi_rank, int mpi_size) : TransposeDataHandler(mpi_rank, mpi_size) { + int sqrt_size = std::sqrt(mpi_size); + if (sqrt_size * sqrt_size != mpi_size) { + throw std::runtime_error("Number of MPI ranks must have an integer as square root since P = Q has to hold!"); + } + pq_width = std::sqrt(mpi_size); + } + +}; + + +} +} + +#endif diff --git a/PTRANS/src/host/execution_types/execution_cpu.hpp b/PTRANS/src/host/execution_types/execution_cpu.hpp new file mode 100644 index 00000000..f7e369a1 --- /dev/null +++ b/PTRANS/src/host/execution_types/execution_cpu.hpp @@ -0,0 +1,128 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all 
+copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_CPU_EXECUTION_H_ +#define SRC_HOST_CPU_EXECUTION_H_ + +#ifdef MKL_FOUND + +/* C++ standard library headers */ +#include +#include +#include + +/* External library headers */ +#include "mpi.h" +#include "mkl_trans.h" + +/* Project's headers */ +#include "data_handlers/handler.hpp" + +namespace transpose +{ + namespace fpga_execution + { + namespace cpu + { + + /** + * @brief Transpose and add the matrices using MKL routines + * + * @param config The progrma configuration + * @param data data object that contains all required data for the execution + * @return std::unique_ptr The measured execution times + */ + static std::unique_ptr + calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler) + { + int err; + + std::vector transferTimings; + std::vector calculationTimings; + + if (data.blockSize != BLOCK_SIZE) { + throw std::runtime_error("Block size for CPU hardcoded to " + std::to_string(BLOCK_SIZE) + ". 
Recompile to use different block sizes!"); + } + + ulong local_matrix_width = std::sqrt(data.numBlocks); + + for (int repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) + { + + std::chrono::duration transferTime(0); + + MPI_Barrier(MPI_COMM_WORLD); + + auto startCalculation = std::chrono::high_resolution_clock::now(); + + // Exchange A data via PCIe and MPI + handler.exchangeData(data); + + + switch (config.programSettings->dataHandlerIdentifier) { + case transpose::data_handler::DataHandlerType::diagonal: + #pragma omp parallel for + for (ulong offset=0; offset < data.numBlocks * BLOCK_SIZE * BLOCK_SIZE; offset += BLOCK_SIZE * BLOCK_SIZE) { + mkl_somatadd('R', 'T', 'N', BLOCK_SIZE, BLOCK_SIZE, 1.0, &data.A[offset], BLOCK_SIZE, 1.0, &data.B[offset], BLOCK_SIZE, &data.result[offset], BLOCK_SIZE); + } + break; + case transpose::data_handler::DataHandlerType::pq: + #pragma omp parallel for + for (ulong yoffset=0; yoffset < BLOCK_SIZE * local_matrix_width; yoffset += BLOCK_SIZE) { + for (ulong xoffset=0; xoffset < BLOCK_SIZE * local_matrix_width; xoffset += BLOCK_SIZE) { + ulong toffset = xoffset * BLOCK_SIZE * local_matrix_width + yoffset; + ulong offset = yoffset * BLOCK_SIZE * local_matrix_width + xoffset; + mkl_somatadd('R', 'T', 'N', BLOCK_SIZE, BLOCK_SIZE, 1.0, &data.A[toffset], BLOCK_SIZE * local_matrix_width, 1.0, &data.B[offset], BLOCK_SIZE * local_matrix_width, &data.result[offset], BLOCK_SIZE * local_matrix_width); + } + } + break; + default: throw std::runtime_error("Given data handler is not supported by CPU implementation: " + transpose::data_handler::handlerToString(config.programSettings->dataHandlerIdentifier)); + } + + + auto endCalculation = std::chrono::high_resolution_clock::now(); +#ifndef NDEBUG + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " + << "Done i=" << repetition << std::endl; +#endif + std::chrono::duration calculationTime = + 
std::chrono::duration_cast>(endCalculation - startCalculation); + calculationTimings.push_back(calculationTime.count()); + + // Transfer back data for next repetition! + handler.exchangeData(data); + + transferTimings.push_back(transferTime.count()); + } + + std::unique_ptr result(new transpose::TransposeExecutionTimings{ + transferTimings, + calculationTimings}); + return result; + } + + } // namespace bm_execution + } +} +#endif // MKL_FOUND +#endif // SRC_HOST_CPU_EXECUTION_H_ diff --git a/PTRANS/src/host/execution_default.cpp b/PTRANS/src/host/execution_types/execution_intel.hpp similarity index 84% rename from PTRANS/src/host/execution_default.cpp rename to PTRANS/src/host/execution_types/execution_intel.hpp index 232362c1..4b278db9 100644 --- a/PTRANS/src/host/execution_default.cpp +++ b/PTRANS/src/host/execution_types/execution_intel.hpp @@ -19,30 +19,37 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ - -/* Related header files */ -#include "execution.h" +#ifndef SRC_HOST_INTEL_EXECUTION_H_ +#define SRC_HOST_INTEL_EXECUTION_H_ /* C++ standard library headers */ #include #include #include -/* External library headers */ -#include "CL/cl.hpp" - /* Project's headers */ +#include "transpose_benchmark.hpp" +#include "data_handlers/data_handler_types.h" -namespace bm_execution { +namespace transpose { +namespace fpga_execution { +namespace intel { - /* - Implementation for the single kernel. 
- @copydoc bm_execution::calculate() - */ - std::unique_ptr + /** + * @brief Transpose and add the matrices using the OpenCL kernel using a diagonal distribution and Intel external channels for communication + * + * @param config The progrma configuration + * @param data data object that contains all required data for the execution on the FPGA + * @return std::unique_ptr The measured execution times + */ +static std::unique_ptr calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data) { int err; + if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::diagonal) { + throw std::runtime_error("Used data handler not supported by execution handler!"); + } + std::vector bufferSizeList; std::vector bufferListA; std::vector bufferListB; @@ -78,12 +85,12 @@ namespace bm_execution { // Define the memory bank the buffers will be placed in if (config.programSettings->distributeBuffers) { memory_bank_info_a = ((((r * 3) % 7) + 1) << 16); - memory_bank_info_a = ((((r * 3 + 1) % 7) + 1) << 16); + memory_bank_info_b = ((((r * 3 + 1) % 7) + 1) << 16); memory_bank_info_out = ((((r * 3 + 2) % 7) + 1) << 16); } else { memory_bank_info_a = ((r + 1) << 16); - memory_bank_info_a = ((r + 1) << 16); + memory_bank_info_b = ((r + 1) << 16); memory_bank_info_out = ((r + 1) << 16); } } @@ -122,10 +129,18 @@ namespace bm_execution { #endif // TODO If SVM, the start index might be different because all replcations // access the same buffer! 
- err = transposeWriteKernel.setArg(2, static_cast(0)); - ASSERT_CL(err) - err = transposeReadKernel.setArg(1, static_cast(0)); - ASSERT_CL(err) + if (config.programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::pq) { + err = transposeWriteKernel.setArg(2, static_cast(std::sqrt(data.numBlocks))); + ASSERT_CL(err) + err = transposeReadKernel.setArg(1, static_cast(std::sqrt(data.numBlocks))); + ASSERT_CL(err) + } + else { + err = transposeWriteKernel.setArg(2, static_cast(0)); + ASSERT_CL(err) + err = transposeReadKernel.setArg(1, static_cast(0)); + ASSERT_CL(err) + } err = transposeWriteKernel.setArg(3, static_cast(blocks_per_replication)); ASSERT_CL(err) err = transposeReadKernel.setArg(2, static_cast(blocks_per_replication)); @@ -190,8 +205,8 @@ namespace bm_execution { auto startCalculation = std::chrono::high_resolution_clock::now(); for (int r = 0; r < transposeReadKernelList.size(); r++) { - writeCommandQueueList[r].enqueueTask(transposeWriteKernelList[r]); - readCommandQueueList[r].enqueueTask(transposeReadKernelList[r]); + writeCommandQueueList[r].enqueueNDRangeKernel(transposeWriteKernelList[r], cl::NullRange, cl::NDRange(1)); + readCommandQueueList[r].enqueueNDRangeKernel(transposeReadKernelList[r], cl::NullRange, cl::NDRange(1)); } for (int r = 0; r < transposeReadKernelList.size(); r++) { writeCommandQueueList[r].finish(); @@ -241,4 +256,8 @@ namespace bm_execution { return result; } -} // namespace bm_execution +} // namespace transpose +} // namespace fpga_execution +} // namespace intel + +#endif \ No newline at end of file diff --git a/PTRANS/src/host/execution_types/execution_intel_pq.hpp b/PTRANS/src/host/execution_types/execution_intel_pq.hpp new file mode 100644 index 00000000..e9249b8e --- /dev/null +++ b/PTRANS/src/host/execution_types/execution_intel_pq.hpp @@ -0,0 +1,288 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and 
associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_INTEL_PQ_EXECUTION_H_ +#define SRC_HOST_INTEL_PQ_EXECUTION_H_ + +/* C++ standard library headers */ +#include +#include +#include + +/* Project's headers */ +#include "transpose_benchmark.hpp" +#include "data_handlers/data_handler_types.h" + +namespace transpose { +namespace fpga_execution { +namespace intel_pq { + + /** + * @brief Transpose and add the matrices using the OpenCL kernel using a PQ distribution and Intel external channels for communication + * + * @param config The progrma configuration + * @param data data object that contains all required data for the execution on the FPGA + * @return std::unique_ptr The measured execution times + */ +static std::unique_ptr + calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data) { + int err; + + if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) { + throw std::runtime_error("Used data handler not supported by execution handler!"); + } + +#ifdef USE_SVM + throw new std::runtime_error("SVM not 
supported in the host implementation of this communication method"); +#endif + + std::vector bufferSizeList; + std::vector bufferStartList; + std::vector bufferOffsetList; + std::vector bufferListA; + std::vector bufferListB; + std::vector bufferListA_out; + std::vector transposeReadKernelList; + std::vector transposeWriteKernelList; + std::vector readCommandQueueList; + std::vector writeCommandQueueList; + + size_t local_matrix_width = std::sqrt(data.numBlocks); + size_t local_matrix_width_bytes = local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); + + size_t total_offset = 0; + + // Setup the kernels depending on the number of kernel replications + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + + // Calculate how many blocks the current kernel replication will need to process. + size_t blocks_per_replication = (local_matrix_width / config.programSettings->kernelReplications * local_matrix_width); + size_t blocks_remainder = local_matrix_width % config.programSettings->kernelReplications; + if (blocks_remainder > r) { + // Catch the case, that the number of blocks is not divisible by the number of kernel replications + blocks_per_replication += local_matrix_width; + } + if (blocks_per_replication < 1) { + continue; + } + + size_t buffer_size = blocks_per_replication * data.blockSize * data.blockSize; + bufferSizeList.push_back(buffer_size); + bufferStartList.push_back(total_offset); + + total_offset += blocks_per_replication; + + int memory_bank_info_a = 0; + int memory_bank_info_b = 0; + int memory_bank_info_out = 0; +#ifdef INTEL_FPGA + if (!config.programSettings->useMemoryInterleaving) { + // Define the memory bank the buffers will be placed in + if (config.programSettings->distributeBuffers) { + memory_bank_info_a = ((((r * 3) % 7) + 1) << 16); + memory_bank_info_b = ((((r * 3 + 1) % 7) + 1) << 16); + memory_bank_info_out = ((((r * 3 + 2) % 7) + 1) << 16); + } + else { + memory_bank_info_a = ((r + 1) << 16); + 
memory_bank_info_b = ((r + 1) << 16); + memory_bank_info_out = ((r + 1) << 16); + } + } +#endif +#ifdef USE_BUFFER_WRITE_RECT_FOR_A + cl::Buffer bufferA(*config.context, CL_MEM_READ_ONLY | memory_bank_info_a, + buffer_size * sizeof(HOST_DATA_TYPE)); +#else + cl::Buffer bufferA(*config.context, CL_MEM_READ_ONLY | memory_bank_info_a, + data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE)); +#endif + cl::Buffer bufferB(*config.context, CL_MEM_READ_ONLY | memory_bank_info_b, + buffer_size * sizeof(HOST_DATA_TYPE)); + cl::Buffer bufferA_out(*config.context, CL_MEM_WRITE_ONLY | memory_bank_info_out, + buffer_size * sizeof(HOST_DATA_TYPE)); + + // TODO the kernel name may need to be changed for Xilinx support + cl::Kernel transposeReadKernel(*config.program, (READ_KERNEL_NAME + std::to_string(r)).c_str(), &err); + ASSERT_CL(err) + cl::Kernel transposeWriteKernel(*config.program, (WRITE_KERNEL_NAME + std::to_string(r)).c_str(), &err); + ASSERT_CL(err) + + err = transposeReadKernel.setArg(0, bufferA); + ASSERT_CL(err) + err = transposeWriteKernel.setArg(0, bufferB); + ASSERT_CL(err) + err = transposeWriteKernel.setArg(1, bufferA_out); + ASSERT_CL(err) + + // Row offset in blocks + err = transposeWriteKernel.setArg(2, static_cast(0)); + ASSERT_CL(err) + + // Width of the whole local matrix in blocks + err = transposeWriteKernel.setArg(3, static_cast(local_matrix_width)); + ASSERT_CL(err) +#ifndef USE_BUFFER_WRITE_RECT_FOR_A + // Row offset in blocks + err = transposeReadKernel.setArg(1, static_cast(bufferStartList[r])); + ASSERT_CL(err) + err = transposeReadKernel.setArg(2, static_cast(local_matrix_width)); + ASSERT_CL(err) +#else + // Row offset in blocks + err = transposeReadKernel.setArg(1, static_cast(0)); + ASSERT_CL(err) + err = transposeReadKernel.setArg(2, static_cast((bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize))); + ASSERT_CL(err) +#endif + + // Height of the whole local matrix in blocks + err = 
transposeReadKernel.setArg(3, static_cast(local_matrix_width )); + ASSERT_CL(err) + + // total number of blocks that are processed in this replication + err = transposeWriteKernel.setArg(4, static_cast(blocks_per_replication)); + ASSERT_CL(err) + err = transposeReadKernel.setArg(4, static_cast(blocks_per_replication)); + ASSERT_CL(err) + + cl::CommandQueue readQueue(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + cl::CommandQueue writeQueue(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + + readCommandQueueList.push_back(readQueue); + writeCommandQueueList.push_back(writeQueue); + bufferListA.push_back(bufferA); + bufferListB.push_back(bufferB); + bufferListA_out.push_back(bufferA_out); + transposeReadKernelList.push_back(transposeReadKernel); + transposeWriteKernelList.push_back(transposeWriteKernel); + } + + std::vector transferTimings; + std::vector calculationTimings; + + for (int repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) { + + auto startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeReadKernelList.size(); r++) { + writeCommandQueueList[r].enqueueWriteBuffer(bufferListB[r], CL_FALSE, 0, + bufferSizeList[r]* sizeof(HOST_DATA_TYPE), &data.B[bufferStartList[r] * data.blockSize * data.blockSize]); +#ifdef USE_BUFFER_WRITE_RECT_FOR_A +#ifndef USE_DEPRECATED_HPP_HEADER + cl::array deviceOffset; + cl::array hostOffset; + cl::array rectShape; +#else + cl::size_t<3> deviceOffset; + cl::size_t<3> hostOffset; + cl::size_t<3> rectShape; +#endif + deviceOffset[0] = 0; + deviceOffset[1] = 0; + deviceOffset[2] = 0; + hostOffset[0] = (bufferStartList[r]) / local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); + hostOffset[1] = 0; + hostOffset[2] = 0; + rectShape[0] = (bufferSizeList[r]) / (local_matrix_width * data.blockSize) * sizeof(HOST_DATA_TYPE); + rectShape[1] = local_matrix_width* data.blockSize; + rectShape[2] = 1L; + 
readCommandQueueList[r].enqueueWriteBufferRect(bufferListA[r],CL_FALSE, + deviceOffset, + hostOffset, + rectShape, + (bufferSizeList[r]) / (local_matrix_width * data.blockSize) * sizeof(HOST_DATA_TYPE), 0, + local_matrix_width* data.blockSize*sizeof(HOST_DATA_TYPE), 0, + data.A); +#else + readCommandQueueList[r].enqueueWriteBuffer(bufferListA[r], CL_FALSE, 0, + data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE), data.A); +#endif + + } + for (int r = 0; r < transposeReadKernelList.size(); r++) { + readCommandQueueList[r].finish(); + writeCommandQueueList[r].finish(); + } + auto endTransfer = std::chrono::high_resolution_clock::now(); + std::chrono::duration transferTime = + std::chrono::duration_cast> + (endTransfer - startTransfer); + + MPI_Barrier(MPI_COMM_WORLD); + + auto startCalculation = std::chrono::high_resolution_clock::now(); + for (int r = 0; r < transposeReadKernelList.size(); r++) { + writeCommandQueueList[r].enqueueNDRangeKernel(transposeWriteKernelList[r], cl::NullRange, cl::NDRange(1)); + readCommandQueueList[r].enqueueNDRangeKernel(transposeReadKernelList[r], cl::NullRange, cl::NDRange(1)); + } + for (int r = 0; r < transposeReadKernelList.size(); r++) { + writeCommandQueueList[r].finish(); +#ifndef NDEBUG + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " << "Write done r=" << r << ", i=" << repetition << std::endl; +#endif + readCommandQueueList[r].finish(); +#ifndef NDEBUG + mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " << "Read done r=" << r << ", i=" << repetition << std::endl; +#endif + } + auto endCalculation = std::chrono::high_resolution_clock::now(); +#ifndef NDEBUG + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " << "Done i=" << repetition << std::endl; +#endif + std::chrono::duration calculationTime = + std::chrono::duration_cast> + (endCalculation - 
startCalculation); + calculationTimings.push_back(calculationTime.count()); + + startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeReadKernelList.size(); r++) { + writeCommandQueueList[r].enqueueReadBuffer(bufferListA_out[r], CL_TRUE, 0, + bufferSizeList[r]* sizeof(HOST_DATA_TYPE), &data.result[bufferStartList[r] * data.blockSize * data.blockSize]); + } + endTransfer = std::chrono::high_resolution_clock::now(); + transferTime += + std::chrono::duration_cast> + (endTransfer - startTransfer); + transferTimings.push_back(transferTime.count()); + } + + std::unique_ptr result(new transpose::TransposeExecutionTimings{ + transferTimings, + calculationTimings + }); + return result; + } + +} // namespace transpose +} // namespace fpga_execution +} // namespace intel + +#endif \ No newline at end of file diff --git a/PTRANS/src/host/execution_types/execution_pcie.hpp b/PTRANS/src/host/execution_types/execution_pcie.hpp new file mode 100644 index 00000000..5e29ad2e --- /dev/null +++ b/PTRANS/src/host/execution_types/execution_pcie.hpp @@ -0,0 +1,240 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_PCIE_EXECUTION_H_ +#define SRC_HOST_PCIE_EXECUTION_H_ + +/* C++ standard library headers */ +#include +#include +#include + +/* External library headers */ +#include "mpi.h" + +/* Project's headers */ +#include "data_handlers/handler.hpp" + +namespace transpose +{ + namespace fpga_execution + { + namespace pcie + { + + /** + * @brief Transpose and add the matrices using the OpenCL kernel using a diagonal distribution and PCIe+MPI over the host for communication + * + * @param config The progrma configuration + * @param data data object that contains all required data for the execution on the FPGA + * @param handler data handler instance that should be used to exchange data between hosts + * @return std::unique_ptr The measured execution times + */ + static std::unique_ptr + calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler) + { + int err; + +#ifdef USE_SVM + throw new std::runtime_error("SVM not supported in the host implementation of this communication method"); +#endif + if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::diagonal) { + throw std::runtime_error("Used data handler not supported by execution handler!"); + } + + std::vector bufferSizeList; + std::vector bufferListA; + std::vector bufferListB; + std::vector bufferListA_out; + std::vector transposeKernelList; + std::vector transCommandQueueList; + + size_t local_matrix_width = std::sqrt(data.numBlocks); + + // Setup the kernels depending on the number of kernel replications + for (int r = 0; r < config.programSettings->kernelReplications; r++) + { + + // Calculate how many blocks the current 
kernel replication will need to process. + size_t blocks_per_replication = data.numBlocks / config.programSettings->kernelReplications; + size_t blocks_remainder = data.numBlocks % config.programSettings->kernelReplications; + if (blocks_remainder > r) + { + // Catch the case, that the number of blocks is not divisible by the number of kernel replications + blocks_per_replication += 1; + } + if (blocks_per_replication < 1) + { + continue; + } + + size_t buffer_size = data.blockSize * (data.blockSize * blocks_per_replication); + + bufferSizeList.push_back(buffer_size); + + int memory_bank_info_a = 0; + int memory_bank_info_b = 0; + int memory_bank_info_out = 0; +#ifdef INTEL_FPGA + if (!config.programSettings->useMemoryInterleaving) + { + // Define the memory bank the buffers will be placed in + if (config.programSettings->distributeBuffers) + { + memory_bank_info_a = ((((r * 3) % 7) + 1) << 16); + memory_bank_info_b = ((((r * 3 + 1) % 7) + 1) << 16); + memory_bank_info_out = ((((r * 3 + 2) % 7) + 1) << 16); + } + else + { + memory_bank_info_a = ((r + 1) << 16); + memory_bank_info_b = ((r + 1) << 16); + memory_bank_info_out = ((r + 1) << 16); + } + } +#endif + cl::Buffer bufferA(*config.context, CL_MEM_READ_ONLY | memory_bank_info_a, + buffer_size * sizeof(HOST_DATA_TYPE)); + cl::Buffer bufferB(*config.context, CL_MEM_READ_ONLY | memory_bank_info_b, + buffer_size * sizeof(HOST_DATA_TYPE)); + cl::Buffer bufferA_out(*config.context, CL_MEM_WRITE_ONLY | memory_bank_info_out, + buffer_size * sizeof(HOST_DATA_TYPE)); + + // TODO the kernel name may need to be changed for Xilinx support + cl::Kernel transposeKernel(*config.program, ("transpose" + std::to_string(r)).c_str(), &err); + ASSERT_CL(err) + + + err = transposeKernel.setArg(0, bufferA); + ASSERT_CL(err) + err = transposeKernel.setArg(1, bufferB); + ASSERT_CL(err) + err = transposeKernel.setArg(2, bufferA_out); + ASSERT_CL(err) + err = transposeKernel.setArg(3, static_cast(blocks_per_replication)); + ASSERT_CL(err) 
+ + cl::CommandQueue transQueue(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + + transCommandQueueList.push_back(transQueue); + bufferListA.push_back(bufferA); + bufferListB.push_back(bufferB); + bufferListA_out.push_back(bufferA_out); + transposeKernelList.push_back(transposeKernel); + } + + std::vector<double> transferTimings; + std::vector<double> calculationTimings; + + for (int repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) + { + + MPI_Barrier(MPI_COMM_WORLD); + + auto startTransfer = std::chrono::high_resolution_clock::now(); + size_t bufferOffset = 0; + + for (int r = 0; r < transposeKernelList.size(); r++) + { + transCommandQueueList[r].enqueueWriteBuffer(bufferListB[r], CL_TRUE, 0, + bufferSizeList[r] * sizeof(HOST_DATA_TYPE), &data.B[bufferOffset]); + transCommandQueueList[r].enqueueWriteBuffer(bufferListA[r], CL_TRUE, 0, + bufferSizeList[r] * sizeof(HOST_DATA_TYPE), &data.A[bufferOffset]); + bufferOffset += bufferSizeList[r]; + } + + auto endTransfer = std::chrono::high_resolution_clock::now(); + std::chrono::duration<double> transferTime = + std::chrono::duration_cast<std::chrono::duration<double>>(endTransfer - startTransfer); + + MPI_Barrier(MPI_COMM_WORLD); + + auto startCalculation = std::chrono::high_resolution_clock::now(); + bufferOffset = 0; + for (int r = 0; r < transposeKernelList.size(); r++) + { + transCommandQueueList[r].enqueueReadBuffer(bufferListA[r], CL_TRUE, 0, + bufferSizeList[r] * sizeof(HOST_DATA_TYPE), &data.A[bufferOffset]); + bufferOffset += bufferSizeList[r]; + } + + // Exchange A data via PCIe and MPI + handler.exchangeData(data); + + bufferOffset = 0; + for (int r = 0; r < transposeKernelList.size(); r++) + { + transCommandQueueList[r].enqueueWriteBuffer(bufferListA[r], CL_FALSE, 0, + bufferSizeList[r] * sizeof(HOST_DATA_TYPE), &data.A[bufferOffset]); + bufferOffset += bufferSizeList[r]; + } + + for (int r = 0; r < transposeKernelList.size(); r++) + { + transCommandQueueList[r].enqueueNDRangeKernel(transposeKernelList[r],
cl::NullRange, cl::NDRange(1)); + } + for (int r = 0; r < transposeKernelList.size(); r++) + { + transCommandQueueList[r].finish(); + } + + auto endCalculation = std::chrono::high_resolution_clock::now(); +#ifndef NDEBUG + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " + << "Done i=" << repetition << std::endl; +#endif + std::chrono::duration<double> calculationTime = + std::chrono::duration_cast<std::chrono::duration<double>>(endCalculation - startCalculation); + calculationTimings.push_back(calculationTime.count()); + + // Transfer back data for next repetition! + handler.exchangeData(data); + + bufferOffset = 0; + startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) + { + transCommandQueueList[r].enqueueReadBuffer(bufferListA_out[r], CL_TRUE, 0, + bufferSizeList[r] * sizeof(HOST_DATA_TYPE), &data.result[bufferOffset]); + bufferOffset += bufferSizeList[r]; + } + + endTransfer = std::chrono::high_resolution_clock::now(); + transferTime += + std::chrono::duration_cast<std::chrono::duration<double>>(endTransfer - startTransfer); + transferTimings.push_back(transferTime.count()); + } + + std::unique_ptr<transpose::TransposeExecutionTimings> result(new transpose::TransposeExecutionTimings{ + transferTimings, + calculationTimings}); + return result; + } + + } // namespace bm_execution + } +} + +#endif // SRC_HOST_PCIE_EXECUTION_H_ diff --git a/PTRANS/src/host/execution_types/execution_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_pcie_pq.hpp new file mode 100644 index 00000000..93c891a0 --- /dev/null +++ b/PTRANS/src/host/execution_types/execution_pcie_pq.hpp @@ -0,0 +1,344 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and
to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_PCIE_PQ_EXECUTION_H_ +#define SRC_HOST_PCIE_PQ_EXECUTION_H_ + +/* C++ standard library headers */ +#include +#include +#include + +/* Project's headers */ +#include "transpose_benchmark.hpp" +#include "data_handlers/data_handler_types.h" + +namespace transpose { +namespace fpga_execution { +namespace pcie_pq { + + /** + * @brief Transpose and add the matrices using the OpenCL kernel using a PQ distribution and PCIe+MPI over the host for communication + * + * @param config The progrma configuration + * @param data data object that contains all required data for the execution on the FPGA + * @param handler data handler instance that should be used to exchange data between hosts + * @return std::unique_ptr The measured execution times + */ +static std::unique_ptr + calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::TransposeDataHandler &handler) { + int err; + + if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) { + throw std::runtime_error("Used data handler not supported by execution handler!"); + } +#ifdef USE_SVM + throw new std::runtime_error("SVM not supported in the host implementation of this communication method"); +#endif + + std::vector 
bufferSizeList; + std::vector bufferStartList; + std::vector bufferOffsetList; + std::vector bufferListA; + std::vector bufferListB; + std::vector bufferListA_out; + std::vector transposeKernelList; + std::vector transCommandQueueList; + + size_t local_matrix_width = std::sqrt(data.numBlocks); + size_t local_matrix_width_bytes = local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); + + size_t total_offset = 0; + + // Setup the kernels depending on the number of kernel replications + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + + // Calculate how many blocks the current kernel replication will need to process. + size_t blocks_per_replication = (local_matrix_width / config.programSettings->kernelReplications * local_matrix_width); + size_t blocks_remainder = local_matrix_width % config.programSettings->kernelReplications; + if (blocks_remainder > r) { + // Catch the case, that the number of blocks is not divisible by the number of kernel replications + blocks_per_replication += local_matrix_width; + } + if (blocks_per_replication < 1) { + continue; + } + + size_t buffer_size = blocks_per_replication * data.blockSize * data.blockSize; + bufferSizeList.push_back(buffer_size); + bufferStartList.push_back(total_offset); + + total_offset += blocks_per_replication; + + int memory_bank_info_a = 0; + int memory_bank_info_b = 0; + int memory_bank_info_out = 0; +#ifdef INTEL_FPGA + if (!config.programSettings->useMemoryInterleaving) { + // Define the memory bank the buffers will be placed in + if (config.programSettings->distributeBuffers) { + memory_bank_info_a = ((((r * 3) % 7) + 1) << 16); + memory_bank_info_b = ((((r * 3 + 1) % 7) + 1) << 16); + memory_bank_info_out = ((((r * 3 + 2) % 7) + 1) << 16); + } + else { + memory_bank_info_a = ((r + 1) << 16); + memory_bank_info_b = ((r + 1) << 16); + memory_bank_info_out = ((r + 1) << 16); + } + } +#endif +#ifdef USE_BUFFER_WRITE_RECT_FOR_A + cl::Buffer bufferA(*config.context, CL_MEM_READ_ONLY 
| memory_bank_info_a, + buffer_size * sizeof(HOST_DATA_TYPE)); +#else + cl::Buffer bufferA(*config.context, CL_MEM_READ_ONLY | memory_bank_info_a, + data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE)); +#endif + cl::Buffer bufferB(*config.context, CL_MEM_READ_ONLY | memory_bank_info_b, + buffer_size * sizeof(HOST_DATA_TYPE)); + cl::Buffer bufferA_out(*config.context, CL_MEM_WRITE_ONLY | memory_bank_info_out, + buffer_size * sizeof(HOST_DATA_TYPE)); + +#ifdef INTEL_FPGA + cl::Kernel transposeKernel(*config.program, ("transpose" + std::to_string(r)).c_str(), &err); + ASSERT_CL(err) +#endif +#ifdef XILINX_FPGA + // create the kernels + cl::Kernel transposeKernel(*config.program, ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str(), + &err); + ASSERT_CL(err); +#endif + + err = transposeKernel.setArg(0, bufferA); + ASSERT_CL(err) + err = transposeKernel.setArg(1, bufferB); + ASSERT_CL(err) + err = transposeKernel.setArg(2, bufferA_out); + ASSERT_CL(err) + err = transposeKernel.setArg(4, static_cast(blocks_per_replication)); + ASSERT_CL(err) + err = transposeKernel.setArg(5, static_cast(local_matrix_width)); + ASSERT_CL(err) +#ifndef USE_BUFFER_WRITE_RECT_FOR_A + err = transposeKernel.setArg(6, static_cast(local_matrix_width)); + ASSERT_CL(err) + err = transposeKernel.setArg(3, static_cast(bufferStartList[r])); + ASSERT_CL(err) +#else + err = transposeKernel.setArg(6, static_cast((bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize))); + ASSERT_CL(err) + err = transposeKernel.setArg(3, static_cast(0)); + ASSERT_CL(err) +#endif + + + cl::CommandQueue transQueue(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + + transCommandQueueList.push_back(transQueue); + bufferListA.push_back(bufferA); + bufferListB.push_back(bufferB); + bufferListA_out.push_back(bufferA_out); + transposeKernelList.push_back(transposeKernel); + } + + std::vector transferTimings; + std::vector calculationTimings; + + for (int 
repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) { + + auto startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) { + transCommandQueueList[r].enqueueWriteBuffer(bufferListB[r], CL_FALSE, 0, + bufferSizeList[r]* sizeof(HOST_DATA_TYPE), &data.B[bufferStartList[r] * data.blockSize * data.blockSize]); +#ifdef USE_BUFFER_WRITE_RECT_FOR_A +#ifndef USE_DEPRECATED_HPP_HEADER + cl::array deviceOffset; + cl::array hostOffset; + cl::array rectShape; +#else + cl::size_t<3> deviceOffset; + cl::size_t<3> hostOffset; + cl::size_t<3> rectShape; +#endif + deviceOffset[0] = 0; + deviceOffset[1] = 0; + deviceOffset[2] = 0; + hostOffset[0] = (bufferStartList[r]) / local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); + hostOffset[1] = 0; + hostOffset[2] = 0; + rectShape[0] = (bufferSizeList[r]) / (local_matrix_width * data.blockSize) * sizeof(HOST_DATA_TYPE); + rectShape[1] = local_matrix_width* data.blockSize; + rectShape[2] = 1L; + transCommandQueueList[r].enqueueWriteBufferRect(bufferListA[r],CL_FALSE, + deviceOffset, + hostOffset, + rectShape, + (bufferSizeList[r]) / (local_matrix_width * data.blockSize) * sizeof(HOST_DATA_TYPE), 0, + local_matrix_width* data.blockSize*sizeof(HOST_DATA_TYPE), 0, + data.A); +#else + transCommandQueueList[r].enqueueWriteBuffer(bufferListA[r], CL_FALSE, 0, + data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE), data.A); +#endif + + } + for (int r = 0; r < transposeKernelList.size(); r++) { + transCommandQueueList[r].finish(); + } + auto endTransfer = std::chrono::high_resolution_clock::now(); + + std::chrono::duration transferTime = + std::chrono::duration_cast> + (endTransfer - startTransfer); + + MPI_Barrier(MPI_COMM_WORLD); + + auto startCalculation = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) + { +#ifdef USE_BUFFER_WRITE_RECT_FOR_A +#ifndef USE_DEPRECATED_HPP_HEADER 
+ cl::array deviceOffset; + cl::array hostOffset; + cl::array rectShape; +#else + cl::size_t<3> deviceOffset; + cl::size_t<3> hostOffset; + cl::size_t<3> rectShape; +#endif + deviceOffset[0] = 0; + deviceOffset[1] = 0; + deviceOffset[2] = 0; + hostOffset[0] = (bufferStartList[r]) / local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); + hostOffset[1] = 0; + hostOffset[2] = 0; + rectShape[0] = (bufferSizeList[r]) / (local_matrix_width * data.blockSize) * sizeof(HOST_DATA_TYPE); + rectShape[1] = local_matrix_width* data.blockSize; + rectShape[2] = 1L; + transCommandQueueList[r].enqueueReadBufferRect(bufferListA[r],CL_FALSE, + deviceOffset, + hostOffset, + rectShape, + (bufferSizeList[r]) / (local_matrix_width * data.blockSize) * sizeof(HOST_DATA_TYPE), 0, + local_matrix_width* data.blockSize*sizeof(HOST_DATA_TYPE), 0, + data.A); +#else + transCommandQueueList[r].enqueueReadBuffer(bufferListA[r], CL_FALSE, 0, + data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE), data.A); +#endif + } + + + // Exchange A data via PCIe and MPI + handler.exchangeData(data); + + for (int r = 0; r < transposeKernelList.size(); r++) + { +#ifdef USE_BUFFER_WRITE_RECT_FOR_A +#ifndef USE_DEPRECATED_HPP_HEADER + cl::array deviceOffset; + cl::array hostOffset; + cl::array rectShape; +#else + cl::size_t<3> deviceOffset; + cl::size_t<3> hostOffset; + cl::size_t<3> rectShape; +#endif + deviceOffset[0] = 0; + deviceOffset[1] = 0; + deviceOffset[2] = 0; + hostOffset[0] = (bufferStartList[r]) / local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); + hostOffset[1] = 0; + hostOffset[2] = 0; + rectShape[0] = (bufferSizeList[r]) / (local_matrix_width * data.blockSize) * sizeof(HOST_DATA_TYPE); + rectShape[1] = local_matrix_width* data.blockSize; + rectShape[2] = 1L; + transCommandQueueList[r].enqueueWriteBufferRect(bufferListA[r],CL_FALSE, + deviceOffset, + hostOffset, + rectShape, + (bufferSizeList[r]) / (local_matrix_width * data.blockSize) * sizeof(HOST_DATA_TYPE), 
0, + local_matrix_width* data.blockSize*sizeof(HOST_DATA_TYPE), 0, + data.A); +#else + transCommandQueueList[r].enqueueWriteBuffer(bufferListA[r], CL_TRUE, 0, + data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE), data.A); +#endif + } +#ifndef NDEBUG + auto startKernelCalculation = std::chrono::high_resolution_clock::now(); +#endif + for (int r = 0; r < transposeKernelList.size(); r++) + { + transCommandQueueList[r].enqueueNDRangeKernel(transposeKernelList[r], cl::NullRange, cl::NDRange(1)); + } + for (int r = 0; r < transposeKernelList.size(); r++) + { + transCommandQueueList[r].finish(); + } + auto endCalculation = std::chrono::high_resolution_clock::now(); +#ifndef NDEBUG + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " << "Done i=" << repetition << std::endl; + std::cout << "Kernel execution time: " << std::chrono::duration_cast>(endCalculation - startKernelCalculation).count() + << "s (" << ((config.programSettings->matrixSize * config.programSettings->matrixSize * sizeof(HOST_DATA_TYPE) * 3) + / std::chrono::duration_cast>(endCalculation - startKernelCalculation).count() * 1.0e-9) << " GB/s)" << std::endl; +#endif + + // Transfer back data for next repetition! 
+ handler.exchangeData(data); + + std::chrono::duration calculationTime = + std::chrono::duration_cast> + (endCalculation - startCalculation); + calculationTimings.push_back(calculationTime.count()); + + startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) { + transCommandQueueList[r].enqueueReadBuffer(bufferListA_out[r], CL_TRUE, 0, + bufferSizeList[r]* sizeof(HOST_DATA_TYPE), &data.result[bufferStartList[r] * data.blockSize * data.blockSize]); + } + endTransfer = std::chrono::high_resolution_clock::now(); + transferTime += + std::chrono::duration_cast> + (endTransfer - startTransfer); + transferTimings.push_back(transferTime.count()); + } + + std::unique_ptr result(new transpose::TransposeExecutionTimings{ + transferTimings, + calculationTimings + }); + return result; + } + +} // namespace transpose +} // namespace fpga_execution +} // namespace intel + +#endif \ No newline at end of file diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp index cb114097..7657b85d 100644 --- a/PTRANS/src/host/transpose_benchmark.cpp +++ b/PTRANS/src/host/transpose_benchmark.cpp @@ -31,12 +31,24 @@ SOFTWARE. 
#include /* Project's headers */ -#include "execution.h" +#include "execution_types/execution_intel.hpp" +#include "execution_types/execution_intel_pq.hpp" +#include "execution_types/execution_pcie.hpp" +#include "execution_types/execution_pcie_pq.hpp" +#include "execution_types/execution_cpu.hpp" +#include "communication_types.hpp" + +#include "data_handlers/data_handler_types.h" +#include "data_handlers/diagonal.hpp" +#include "data_handlers/pq.hpp" + #include "parameters.h" + transpose::TransposeBenchmark::TransposeBenchmark(int argc, char* argv[]) : HpccFpgaBenchmark(argc, argv) { - setupBenchmark(argc, argv); - setTransposeDataHandler(executionSettings->programSettings->dataHandlerIdentifier); + if (setupBenchmark(argc, argv)) { + setTransposeDataHandler(executionSettings->programSettings->dataHandlerIdentifier); + } } void @@ -48,12 +60,31 @@ transpose::TransposeBenchmark::addAdditionalParseOptions(cxxopts::Options &optio cxxopts::value()->default_value(std::to_string(BLOCK_SIZE))) ("distribute-buffers", "Distribute buffers over memory banks. This will use three memory banks instead of one for a single kernel replication, but kernel replications may interfere. 
This is an Intel only attribute, since buffer placement is decided at compile time for Xilinx FPGAs.") ("handler", "Specify the used data handler that distributes the data over devices and memory banks", - cxxopts::value()->default_value(TRANSPOSE_HANDLERS_DIST_DIAG)); + cxxopts::value()->default_value(DEFAULT_DIST_TYPE)); } std::unique_ptr transpose::TransposeBenchmark::executeKernel(TransposeData &data) { - return bm_execution::calculate(*executionSettings, data); + switch (executionSettings->programSettings->communicationType) { + case hpcc_base::CommunicationType::intel_external_channels: + if (executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { + return transpose::fpga_execution::intel::calculate(*executionSettings, data); + } + else { + return transpose::fpga_execution::intel_pq::calculate(*executionSettings, data); + } break; + case hpcc_base::CommunicationType::pcie_mpi : + if (executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { + return transpose::fpga_execution::pcie::calculate(*executionSettings, data, *dataHandler); + } + else { + return transpose::fpga_execution::pcie_pq::calculate(*executionSettings, data, *dataHandler); + } break; +#ifdef MKL_FOUND + case hpcc_base::CommunicationType::cpu_only : return transpose::fpga_execution::cpu::calculate(*executionSettings, data, *dataHandler); break; +#endif + default: throw std::runtime_error("No calculate method implemented for communication type " + commToString(executionSettings->programSettings->communicationType)); + } } void @@ -63,39 +94,50 @@ transpose::TransposeBenchmark::collectAndPrintResults(const transpose::Transpose // Number of experiment repetitions uint number_measurements = output.calculationTimings.size(); std::vector max_measures(number_measurements); + std::vector max_transfers(number_measurements); #ifdef _USE_MPI_ // Copy the object variable to a local variable 
to make it accessible to the lambda function int mpi_size = mpi_comm_size; MPI_Reduce(output.calculationTimings.data(), max_measures.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + MPI_Reduce(output.transferTimings.data(), max_transfers.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); #else std::copy(output.calculationTimings.begin(), output.calculationTimings.end(), max_measures.begin()); + std::copy(output.transferTimings.begin(), output.transferTimings.end(), max_transfers.begin()); #endif double avgCalculationTime = accumulate(max_measures.begin(), max_measures.end(), 0.0) / max_measures.size(); double minCalculationTime = *min_element(max_measures.begin(), max_measures.end()); + double avgTransferTime = accumulate(max_transfers.begin(), max_transfers.end(), 0.0) + / max_transfers.size(); + double minTransferTime = *min_element(max_transfers.begin(), max_transfers.end()); + double avgCalcFLOPS = flops / avgCalculationTime; double maxCalcFLOPS = flops / minCalculationTime; double avgMemBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / avgCalculationTime; - double avgNetworkBandwidth = flops * sizeof(HOST_DATA_TYPE) / avgCalculationTime; double maxMemBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / minCalculationTime; - double maxNetworkBandwidth = flops * sizeof(HOST_DATA_TYPE) / minCalculationTime; + double avgTransferBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / avgTransferTime; + double maxTransferBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / minTransferTime; if (mpi_comm_rank == 0) { - std::cout << " calc calc FLOPS Net [B/s] Mem [B/s]" << std::endl; - std::cout << "avg: " << avgCalculationTime + std::cout << " total [s] transfer [s] calc [s] calc FLOPS Mem [B/s] PCIe [B/s]" << std::endl; + std::cout << "avg: " << (avgTransferTime + avgCalculationTime) + << " " << avgTransferTime + << " " << avgCalculationTime << " " << avgCalcFLOPS - << " " << avgNetworkBandwidth << " " << avgMemBandwidth + << " " << 
avgTransferBandwidth << std::endl; - std::cout << "best: " << minCalculationTime + std::cout << "best: " << (minTransferTime + minCalculationTime) + << " " << minTransferTime + << " " << minCalculationTime << " " << maxCalcFLOPS - << " " << maxNetworkBandwidth << " " << maxMemBandwidth + << " " << maxTransferBandwidth << std::endl; } } @@ -111,15 +153,7 @@ transpose::TransposeBenchmark::validateOutputAndPrintError(transpose::TransposeD // exchange the data using MPI depending on the chosen distribution scheme dataHandler->exchangeData(data); - size_t block_offset = executionSettings->programSettings->blockSize * executionSettings->programSettings->blockSize; - for (size_t b = 0; b < data.numBlocks; b++) { - for (size_t i = 0; i < executionSettings->programSettings->blockSize; i++) { - for (size_t j = 0; j < executionSettings->programSettings->blockSize; j++) { - data.A[b * block_offset + j * executionSettings->programSettings->blockSize + i] -= (data.result[b * block_offset + i * executionSettings->programSettings->blockSize + j] - - data.B[b * block_offset + i * executionSettings->programSettings->blockSize + j]); - } - } - } + dataHandler->reference_transpose(data); double max_error = 0.0; for (size_t i = 0; i < executionSettings->programSettings->blockSize * executionSettings->programSettings->blockSize * data.numBlocks; i++) { @@ -138,9 +172,12 @@ transpose::TransposeBenchmark::validateOutputAndPrintError(transpose::TransposeD } void -transpose::TransposeBenchmark::setTransposeDataHandler(std::string dataHandlerIdentifier) { - if (transpose::dataHandlerIdentifierMap.find(dataHandlerIdentifier) == transpose::dataHandlerIdentifierMap.end()) { - throw std::runtime_error("Could not match selected data handler: " + dataHandlerIdentifier); +transpose::TransposeBenchmark::setTransposeDataHandler(transpose::data_handler::DataHandlerType dataHandlerIdentifier) { + switch (dataHandlerIdentifier) { + case transpose::data_handler::DataHandlerType::diagonal: dataHandler = 
std::unique_ptr(new transpose::data_handler::DistributedDiagonalTransposeDataHandler(mpi_comm_rank, mpi_comm_size)); break; + case transpose::data_handler::DataHandlerType::pq: dataHandler = std::unique_ptr(new transpose::data_handler::DistributedPQTransposeDataHandler(mpi_comm_rank, mpi_comm_size)); break; + default: throw std::runtime_error("Could not match selected data handler: " + transpose::data_handler::handlerToString(dataHandlerIdentifier)); } - dataHandler = transpose::dataHandlerIdentifierMap[dataHandlerIdentifier](mpi_comm_rank, mpi_comm_size); + + } diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index e57f4a1c..5de333ca 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -30,7 +30,9 @@ SOFTWARE. /* Project's headers */ #include "hpcc_benchmark.hpp" #include "transpose_data.hpp" -#include "transpose_handlers.hpp" + +#include "data_handlers/data_handler_types.h" +#include "data_handlers/handler.hpp" #include "parameters.h" @@ -56,7 +58,7 @@ class TransposeBenchmark : public hpcc_base::HpccFpgaBenchmark dataHandler; + std::unique_ptr dataHandler; public: @@ -73,7 +75,7 @@ class TransposeBenchmark : public hpcc_base::HpccFpgaBenchmark #include "transpose_data.hpp" +#include "data_handlers/data_handler_types.h" +#include "communication_types.hpp" transpose::TransposeProgramSettings::TransposeProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), matrixSize(results["m"].as() * results["b"].as()), - blockSize(results["b"].as()), dataHandlerIdentifier(results["handler"].as()), + blockSize(results["b"].as()), dataHandlerIdentifier(transpose::data_handler::stringToHandler(results["handler"].as())), distributeBuffers(results["distribute-buffers"].count() > 0) { + // auto detect data distribution type if required + if (dataHandlerIdentifier == transpose::data_handler::DataHandlerType::automatic) { + if (kernelFileName.find("_"+ 
transpose::data_handler::handlerToString(transpose::data_handler::DataHandlerType::diagonal) +"_") != kernelFileName.npos) { + dataHandlerIdentifier = transpose::data_handler::DataHandlerType::diagonal; + } + else if (kernelFileName.find("_"+ transpose::data_handler::handlerToString(transpose::data_handler::DataHandlerType::pq) + "_") != kernelFileName.npos) { + dataHandlerIdentifier = transpose::data_handler::DataHandlerType::pq; + } + if (dataHandlerIdentifier == transpose::data_handler::DataHandlerType::automatic) { + throw std::runtime_error("Required data distribution could not be detected from kernel file name!"); + } + } } std::map transpose::TransposeProgramSettings::getSettingsMap() { auto map = hpcc_base::BaseSettings::getSettingsMap(); - map["Matrix Size"] = std::to_string(matrixSize); + int mpi_size; +#ifdef _USE_MPI_ + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); +#endif + map["Matrix Size"] = std::to_string(matrixSize * static_cast(std::sqrt(mpi_size))); map["Block Size"] = std::to_string(blockSize); map["Dist. Buffers"] = distributeBuffers ? 
"Yes" : "No"; - map["Data Handler"] = dataHandlerIdentifier; + map["Data Handler"] = transpose::data_handler::handlerToString(dataHandlerIdentifier); return map; } @@ -31,6 +50,9 @@ transpose::TransposeData::TransposeData(cl::Context context, uint block_size, ui result = reinterpret_cast( clSVMAlloc(context(), 0 , block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 1024)); + exchange = reinterpret_cast( + clSVMAlloc(context(), 0 , + block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 1024)); #else posix_memalign(reinterpret_cast(&A), 64, sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); @@ -38,6 +60,8 @@ transpose::TransposeData::TransposeData(cl::Context context, uint block_size, ui sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); posix_memalign(reinterpret_cast(&result), 64, sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); + posix_memalign(reinterpret_cast(&exchange), 64, + sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); #endif } } @@ -48,10 +72,12 @@ transpose::TransposeData::~TransposeData() { clSVMFree(context(), reinterpret_cast(A));}); clSVMFree(context(), reinterpret_cast(B));}); clSVMFree(context(), reinterpret_cast(result));}); + clSVMFree(context(), reinterpret_cast(exchange));}); #else free(A); free(B); free(result); + free(exchange); #endif } } diff --git a/PTRANS/src/host/transpose_data.hpp b/PTRANS/src/host/transpose_data.hpp index 4eaa684b..1e318fc0 100644 --- a/PTRANS/src/host/transpose_data.hpp +++ b/PTRANS/src/host/transpose_data.hpp @@ -28,6 +28,8 @@ SOFTWARE. 
/* Project's headers */ #include "hpcc_benchmark.hpp" +#include "data_handlers/data_handler_types.h" + /** * @brief Contains all classes and methods needed by the Transpose benchmark @@ -58,7 +60,7 @@ class TransposeProgramSettings : public hpcc_base::BaseSettings { * @brief Identifier of the used data handler * */ - std::string dataHandlerIdentifier; + transpose::data_handler::DataHandlerType dataHandlerIdentifier; /** * @brief If true, the three buffers for A,B and A_out will be placed on three different memory banks, if possible @@ -107,6 +109,12 @@ class TransposeData { */ HOST_DATA_TYPE* result; + /** + * @brief Data buffer used during data exchange of matrices + * + */ + HOST_DATA_TYPE* exchange; + /** * @brief Number of matrix blocks that are stored in every matrix A, B and result. Blocks are * always stored columnwise. diff --git a/PTRANS/src/host/transpose_handlers.cpp b/PTRANS/src/host/transpose_handlers.cpp deleted file mode 100644 index e3b18c39..00000000 --- a/PTRANS/src/host/transpose_handlers.cpp +++ /dev/null @@ -1,162 +0,0 @@ -/* -Copyright (c) 2020 Marius Meyer - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -#include - -#ifdef _USE_MPI_ -#include "mpi.h" -#endif - -#include "transpose_handlers.hpp" - -// Add every data handler that should be selectable from the command line into this map -// and also specify a string identifier for it -std::map (*)(int rank, int size)> transpose::dataHandlerIdentifierMap{ - // distributed external data handler -#ifdef _USE_MPI_ - {TRANSPOSE_HANDLERS_DIST_DIAG, &generateDataHandler} -#endif - }; - -#ifdef _USE_MPI_ - -transpose::DistributedDiagonalTransposeDataHandler::DistributedDiagonalTransposeDataHandler(int rank, int size) : TransposeDataHandler(rank, size) { - if (rank >= size) { - throw std::runtime_error("MPI rank must be smaller the MPI world size!"); - } -} - - -std::unique_ptr transpose::DistributedDiagonalTransposeDataHandler::generateData(hpcc_base::ExecutionSettings& settings) { - int width_in_blocks = settings.programSettings->matrixSize / settings.programSettings->blockSize; - - int avg_blocks_per_rank = (width_in_blocks * width_in_blocks) / mpi_comm_size; - int avg_diagonal_blocks = width_in_blocks; - if (avg_blocks_per_rank > 0) { - avg_diagonal_blocks = (width_in_blocks / avg_blocks_per_rank); - } - num_diagonal_ranks = std::max(avg_diagonal_blocks, 1); - - if (num_diagonal_ranks % 2 != mpi_comm_size % 2) { - #ifndef NDEBUG - std::cout << "Rank " << mpi_comm_rank << ": Fail 1!" << std::endl; - #endif - // Abort if there is a too high difference in the number of matrix blocks between the MPI ranks - throw std::runtime_error("Matrix size and MPI ranks to not allow fair distribution of blocks! 
Increase or reduce the number of MPI ranks by 1."); - } - if ((mpi_comm_size - num_diagonal_ranks) % 2 != 0 || (mpi_comm_size - num_diagonal_ranks) == 0 && width_in_blocks > 1) { - #ifndef NDEBUG - std::cout << "Rank " << mpi_comm_rank << ": Fail 2!" << std::endl; - #endif - throw std::runtime_error("Not possible to create pairs of MPI ranks for lower and upper half of matrix. Increase number of MPI ranks!."); - } - bool this_rank_is_diagonal = mpi_comm_rank >= (mpi_comm_size - num_diagonal_ranks); - int blocks_if_diagonal = width_in_blocks / num_diagonal_ranks + ( (mpi_comm_rank - (mpi_comm_size - num_diagonal_ranks)) < (width_in_blocks % num_diagonal_ranks) ? 1 : 0); - int blocks_if_not_diagonal = 0; - if ((mpi_comm_size - num_diagonal_ranks) > 0 ) { - blocks_if_not_diagonal = (width_in_blocks * (width_in_blocks - 1)) / (mpi_comm_size - num_diagonal_ranks) + (mpi_comm_rank < ((width_in_blocks * (width_in_blocks - 1)) % (mpi_comm_size - num_diagonal_ranks)) ? 1 : 0); - } - - - int blocks_per_rank = (this_rank_is_diagonal) ? blocks_if_diagonal : blocks_if_not_diagonal; - - if (mpi_comm_rank == 0) { - std::cout << "Diag. 
blocks per rank: " << blocks_if_diagonal << std::endl; - std::cout << "Blocks per rank: " << blocks_if_not_diagonal << std::endl; - std::cout << "Loopback ranks for diagonal blocks: " << num_diagonal_ranks << std::endl; - } - // Height of a matrix generated for a single memory bank on a single MPI rank - int data_height_per_rank = blocks_per_rank * settings.programSettings->blockSize; - -#ifndef NDEBUG - std::cout << "Rank " << mpi_comm_rank << ": NumBlocks = " << blocks_per_rank << std::endl; -#endif - - // Allocate memory for a single device and all its memory banks - auto d = std::unique_ptr(new transpose::TransposeData(*settings.context, settings.programSettings->blockSize, blocks_per_rank)); - - // Fill the allocated memory with pseudo random values - std::mt19937 gen(mpi_comm_rank); - std::uniform_real_distribution<> dis(-100.0, 100.0); - for (size_t i = 0; i < data_height_per_rank; i++) { - for (size_t j = 0; j < settings.programSettings->blockSize; j++) { - d->A[i * settings.programSettings->blockSize + j] = dis(gen); - d->B[i * settings.programSettings->blockSize + j] = dis(gen); - d->result[i * settings.programSettings->blockSize + j] = 0.0; - } - } - - return d; -} - -void transpose::DistributedDiagonalTransposeDataHandler::exchangeData(transpose::TransposeData& data) { -#ifndef NDEBUG - // std::cout << "Start data exchange " << mpi_comm_rank << std::endl; -#endif - // Only need to exchange data, if rank has a partner - if (mpi_comm_rank < mpi_comm_size - num_diagonal_ranks) { - int first_upper_half_rank = (mpi_comm_size - num_diagonal_ranks)/2; - int pair_rank = (mpi_comm_rank >= first_upper_half_rank) ? 
mpi_comm_rank - first_upper_half_rank : mpi_comm_rank + first_upper_half_rank; - - // To re-calculate the matrix transposition locally on this host, we need to - // exchange matrix A for every kernel replication - // The order of the matrix blocks does not change during the exchange, because they are distributed diagonally - // and will be handled in the order below: - // - // . . 1 3 - // . . . 2 - // 1 . . . - // 3 2 . . - MPI_Status status; - size_t remaining_data_size = static_cast(data.blockSize) * data.blockSize * data.numBlocks; - size_t offset = 0; - while (remaining_data_size > 0) { - int next_chunk = (remaining_data_size > std::numeric_limits::max()) ? std::numeric_limits::max(): remaining_data_size; -#ifndef NDEBUG - // std::cout << "Rank " << mpi_comm_rank << " " << next_chunk << " to " << pair_rank << std::endl; -#endif - if (pair_rank > mpi_comm_rank) { - MPI_Send(&data.A[offset], next_chunk, MPI_FLOAT, pair_rank, 0, MPI_COMM_WORLD); - MPI_Recv(&data.A[offset], next_chunk, MPI_FLOAT, pair_rank, 0, MPI_COMM_WORLD, &status); - } - else { - std::vector buffer(next_chunk); - for (int i = 0; i < next_chunk; i++) { - buffer[i] = data.A[offset + i]; - } - MPI_Recv(&data.A[offset], next_chunk, MPI_FLOAT, pair_rank, 0, MPI_COMM_WORLD, &status); - MPI_Send(buffer.data(), next_chunk, MPI_FLOAT, pair_rank, 0, MPI_COMM_WORLD); - } - // MPI_Sendrecv_replace(&data.A[offset], next_chunk, MPI_FLOAT, pair_rank, 0, pair_rank, 0, MPI_COMM_WORLD, &status); - #ifndef NDEBUG - // std::cout << "Rank " << mpi_comm_rank << " Done!"<< std::endl; -#endif - remaining_data_size -= next_chunk; - offset += next_chunk; - } - } -#ifndef NDEBUG - // std::cout << "End data exchange " << mpi_comm_rank << std::endl; -#endif -} - -#endif diff --git a/PTRANS/tests/test_host_functionality.cpp b/PTRANS/tests/test_host_functionality.cpp index 8db00927..1733cf15 100644 --- a/PTRANS/tests/test_host_functionality.cpp +++ b/PTRANS/tests/test_host_functionality.cpp @@ -40,7 +40,7 @@ 
TEST_F(TransposeHostTest, OutputsCorrectFormatHeader) { std::cout.rdbuf(oldStdOutBuffer); EXPECT_THAT(newStdOutBuffer.str(), - ::testing::MatchesRegex("(\\s+)calc(\\s+)calc\\sFLOPS(\\s+)Net\\s\\[B/s\\](\\s+)Mem\\s\\[B/s\\]\n.*")); + ::testing::MatchesRegex("(\\s+)total\\s\\[s\\](\\s+)transfer\\s\\[s\\](\\s+)calc\\s\\[s\\](\\s+)calc\\sFLOPS(\\s+)Mem\\s\\[B/s\\](\\s+)PCIe\\s\\[B/s\\]\n.*")); } /** @@ -66,7 +66,7 @@ TEST_F(TransposeHostTest, OutputsCorrectFormatValues) { std::cout.rdbuf(oldStdOutBuffer); EXPECT_THAT(newStdOutBuffer.str(), - ::testing::MatchesRegex(".*\navg:\\s+1\\.00000e\\+00.*\n.*\n")); + ::testing::MatchesRegex(".*\navg:\\s+2\\.00000e\\+00\\s+1\\.00000e\\+00\\s+1\\.00000e\\+00.*\n.*\n")); } /** diff --git a/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp b/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp index 93673b59..d7bc0c7f 100644 --- a/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp +++ b/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp @@ -65,6 +65,34 @@ TEST_F(TransposeKernelTest, FPGACorrectBStaysTheSame) { EXPECT_FLOAT_EQ(aggregated_error, 0.0); } +/** + * Tests if B will not be transposed + */ +TEST_F(TransposeKernelTest, FPGACorrectBStaysTheSame4Blocks) { + if (bm->getExecutionSettings().programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { + // Diagonal data handler does not support this test, since matrix is stored differently in memory buffer + return; + } + matrix_size = BLOCK_SIZE * bm->getExecutionSettings().programSettings->kernelReplications; + bm->getExecutionSettings().programSettings->matrixSize = matrix_size; + data = bm->generateInputData(); + createChannelFilesAndSymbolicLinks(); + for (int i = 0; i < matrix_size; i++) { + for (int j = 0; j < matrix_size; j++) { + data->A[i * matrix_size + j] = 0.0; + data->B[i * matrix_size + j] = i * matrix_size + j; + } + } + bm->executeKernel(*data); + double aggregated_error = 0.0; + 
for (int i = 0; i < matrix_size; i++) { + for (int j = 0; j < matrix_size; j++) { + aggregated_error += std::abs(data->result[i * matrix_size + j] - data->B[i * matrix_size + j]); + } + } + EXPECT_FLOAT_EQ(aggregated_error, 0.0); +} + /** * Tests if a block of A will be correctly transposed */ @@ -85,6 +113,34 @@ TEST_F(TransposeKernelTest, FPGAABlockIsTransposed) { EXPECT_FLOAT_EQ(aggregated_error, 0.0); } +/** + * Tests if a block of A will be correctly transposed + */ +TEST_F(TransposeKernelTest, FPGAABlockIsTransposed4Blocks) { + if (bm->getExecutionSettings().programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { + // Diagonal data handler does not support this test, since matrix is stored differently in memory buffer + return; + } + matrix_size = BLOCK_SIZE * bm->getExecutionSettings().programSettings->kernelReplications; + bm->getExecutionSettings().programSettings->matrixSize = matrix_size; + data = bm->generateInputData(); + createChannelFilesAndSymbolicLinks(); + for (int i = 0; i < matrix_size; i++) { + for (int j = 0; j < matrix_size; j++) { + data->A[i * matrix_size + j] = i * matrix_size + j; + data->B[i * matrix_size + j] = 0.0; + } + } + bm->executeKernel(*data); + double aggregated_error = 0.0; + for (int i = 0; i < matrix_size; i++) { + for (int j = 0; j < matrix_size; j++) { + aggregated_error += std::abs(data->result[i * matrix_size + j] - data->A[j * matrix_size + i]); + } + } + EXPECT_FLOAT_EQ(aggregated_error, 0.0); +} + /** * Tests if matrix A and B will be summed up in the result */ @@ -105,6 +161,34 @@ TEST_F(TransposeKernelTest, FPGAAAndBAreSummedUp) { EXPECT_FLOAT_EQ(aggregated_error, 0.0); } +/** + * Tests if matrix A and B will be summed up in the result + */ +TEST_F(TransposeKernelTest, FPGAAAndBAreSummedUp4Blocks) { + if (bm->getExecutionSettings().programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { + // Diagonal data handler does not support this 
test, since matrix is stored differently in memory buffer + return; + } + matrix_size = BLOCK_SIZE * bm->getExecutionSettings().programSettings->kernelReplications; + bm->getExecutionSettings().programSettings->matrixSize = matrix_size; + data = bm->generateInputData(); + createChannelFilesAndSymbolicLinks(); + for (int i = 0; i < matrix_size; i++) { + for (int j = 0; j < matrix_size; j++) { + data->A[i * matrix_size + j] = 1.0; + data->B[i * matrix_size + j] = i * matrix_size + j; + } + } + bm->executeKernel(*data); + double aggregated_error = 0.0; + for (int i = 0; i < matrix_size; i++) { + for (int j = 0; j < matrix_size; j++) { + aggregated_error += std::abs(data->result[i * matrix_size + j] - (data->B[i * matrix_size + j] + 1.0)); + } + } + EXPECT_FLOAT_EQ(aggregated_error, 0.0); +} + /** * Checks the size and values of the timing measurements that are retured by calculate. diff --git a/PTRANS/tests/test_transpose_data_handlers.cpp b/PTRANS/tests/test_transpose_data_handlers.cpp index 6f6d0249..9b666bc2 100644 --- a/PTRANS/tests/test_transpose_data_handlers.cpp +++ b/PTRANS/tests/test_transpose_data_handlers.cpp @@ -7,7 +7,7 @@ #include "test_program_settings.h" #include "gmock/gmock-matchers.h" #include "transpose_benchmark.hpp" -#include "transpose_handlers.hpp" +#include "data_handlers/diagonal.hpp" struct TransposeHandlersTest : testing::Test { @@ -15,6 +15,7 @@ struct TransposeHandlersTest : testing::Test { TransposeHandlersTest() { bm = std::unique_ptr( new transpose::TransposeBenchmark(global_argc, global_argv)); + bm->setTransposeDataHandler(transpose::data_handler::DataHandlerType::diagonal); } void SetUp() override { @@ -29,11 +30,11 @@ struct TransposeHandlersTest : testing::Test { * Test DitExt class instantiation */ TEST_F(TransposeHandlersTest, DistDiagCreateHandlerSuccess) { - EXPECT_NO_THROW(transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](0,1)); + 
EXPECT_NO_THROW(transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1)); } TEST_F(TransposeHandlersTest, DistDiagCreateHandlerFail) { - EXPECT_THROW(transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](1,1), std::runtime_error); + EXPECT_THROW(transpose::data_handler::DistributedDiagonalTransposeDataHandler(1,1), std::runtime_error); } /** @@ -48,8 +49,8 @@ TEST_F(TransposeHandlersTest, DistDiagNumberOfBlocksCorrectForMPI1Block1) { bm->getExecutionSettings().programSettings->matrixSize = 4* matrix_size_in_blocks; uint block_count = 0; for (int i=0; i < mpi_size; i++) { - auto h = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](i, mpi_size); - auto d = h->generateData(bm->getExecutionSettings()); + auto h = transpose::data_handler::DistributedDiagonalTransposeDataHandler(i, mpi_size); + auto d = h.generateData(bm->getExecutionSettings()); block_count += d->numBlocks; } EXPECT_EQ(block_count, matrix_size_in_blocks * matrix_size_in_blocks); @@ -63,8 +64,8 @@ TEST_F(TransposeHandlersTest, DistDiagNumberOfBlocksCorrectForMPI3Block3) { bm->getExecutionSettings().programSettings->matrixSize = 4* matrix_size_in_blocks; uint block_count = 0; for (int i=0; i < mpi_size; i++) { - auto h = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](i, mpi_size); - auto d = h->generateData(bm->getExecutionSettings()); + auto h = transpose::data_handler::DistributedDiagonalTransposeDataHandler(i, mpi_size); + auto d = h.generateData(bm->getExecutionSettings()); block_count += d->numBlocks; } EXPECT_EQ(block_count, matrix_size_in_blocks * matrix_size_in_blocks); @@ -78,8 +79,8 @@ TEST_F(TransposeHandlersTest, DistDiagNumberOfBlocksCorrectForMPI9Block3) { bm->getExecutionSettings().programSettings->matrixSize = 4* matrix_size_in_blocks; uint block_count = 0; for (int i=0; i < mpi_size; i++) { - auto h = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](i, mpi_size); - auto d = 
h->generateData(bm->getExecutionSettings()); + auto h = transpose::data_handler::DistributedDiagonalTransposeDataHandler(i, mpi_size); + auto d = h.generateData(bm->getExecutionSettings()); block_count += d->numBlocks; } EXPECT_EQ(block_count, matrix_size_in_blocks * matrix_size_in_blocks); @@ -93,8 +94,8 @@ TEST_F(TransposeHandlersTest, DistDiagNumberOfBlocksCorrectForMPI5Block4) { bm->getExecutionSettings().programSettings->matrixSize = 4* matrix_size_in_blocks; uint block_count = 0; for (int i=0; i < mpi_size; i++) { - auto h = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](i, mpi_size); - auto d = h->generateData(bm->getExecutionSettings()); + auto h = transpose::data_handler::DistributedDiagonalTransposeDataHandler(i, mpi_size); + auto d = h.generateData(bm->getExecutionSettings()); block_count += d->numBlocks; } EXPECT_EQ(block_count, matrix_size_in_blocks * matrix_size_in_blocks); @@ -105,45 +106,45 @@ TEST_F(TransposeHandlersTest, DistDiagNumberOfBlocksCorrectForMPI5Block4) { * */ TEST_F(TransposeHandlersTest, DataGenerationDistDiagSucceedsForMPISizeEquals1SingleBlock) { - auto handler = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](0,1); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); bm->getExecutionSettings().programSettings->blockSize = 4; bm->getExecutionSettings().programSettings->matrixSize = 4; - EXPECT_NO_THROW(handler->generateData(bm->getExecutionSettings())); + EXPECT_NO_THROW(handler.generateData(bm->getExecutionSettings())); } TEST_F(TransposeHandlersTest, DataGenerationDistDiagSucceedsForMPISizeEquals1Blocks9) { - auto handler = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](0,1); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); bm->getExecutionSettings().programSettings->blockSize = 4; bm->getExecutionSettings().programSettings->matrixSize = 4*3; - 
EXPECT_THROW(handler->generateData(bm->getExecutionSettings()), std::runtime_error); + EXPECT_THROW(handler.generateData(bm->getExecutionSettings()), std::runtime_error); } TEST_F(TransposeHandlersTest, DataGenerationDistDiagSucceedsForMPISizeEquals3Blocks9) { - auto handler = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](0,3); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,3); bm->getExecutionSettings().programSettings->blockSize = 4; bm->getExecutionSettings().programSettings->matrixSize = 4*3; - EXPECT_NO_THROW(handler->generateData(bm->getExecutionSettings())); + EXPECT_NO_THROW(handler.generateData(bm->getExecutionSettings())); } TEST_F(TransposeHandlersTest, DataGenerationDistDiagFailsForMPISizeEquals3Blocks1) { - auto handler = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](0,3); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,3); bm->getExecutionSettings().programSettings->blockSize = 4; bm->getExecutionSettings().programSettings->matrixSize = 4; - EXPECT_NO_THROW(handler->generateData(bm->getExecutionSettings())); + EXPECT_NO_THROW(handler.generateData(bm->getExecutionSettings())); } TEST_F(TransposeHandlersTest, DataGenerationDistDiagFailsForMPISizeEquals3Blocks4) { - auto handler = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](0,3); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,3); bm->getExecutionSettings().programSettings->blockSize = 4; bm->getExecutionSettings().programSettings->matrixSize = 4 * 2; - EXPECT_THROW(handler->generateData(bm->getExecutionSettings()), std::runtime_error); + EXPECT_THROW(handler.generateData(bm->getExecutionSettings()), std::runtime_error); } TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagForOneReplication) { bm->getExecutionSettings().programSettings->kernelReplications = 1; bm->getExecutionSettings().programSettings->matrixSize = 
bm->getExecutionSettings().programSettings->blockSize; - auto handler = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](0,1); - auto data = handler->generateData(bm->getExecutionSettings()); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); + auto data = handler.generateData(bm->getExecutionSettings()); EXPECT_EQ(data->blockSize, bm->getExecutionSettings().programSettings->blockSize); EXPECT_EQ(data->numBlocks, 1); } @@ -151,8 +152,8 @@ TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagForOneReplication) { TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagForTwoReplications) { bm->getExecutionSettings().programSettings->kernelReplications = 2; bm->getExecutionSettings().programSettings->matrixSize = bm->getExecutionSettings().programSettings->blockSize; - auto handler = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](0,1); - auto data = handler->generateData(bm->getExecutionSettings()); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); + auto data = handler.generateData(bm->getExecutionSettings()); EXPECT_EQ(data->blockSize, bm->getExecutionSettings().programSettings->blockSize); EXPECT_EQ(data->numBlocks, 1); } @@ -160,9 +161,9 @@ TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagForTwoReplications) { TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagReproducableA) { bm->getExecutionSettings().programSettings->kernelReplications = 2; bm->getExecutionSettings().programSettings->matrixSize = bm->getExecutionSettings().programSettings->blockSize; - auto handler = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](0,1); - auto data = handler->generateData(bm->getExecutionSettings()); - auto data2 = handler->generateData(bm->getExecutionSettings()); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); + auto data = handler.generateData(bm->getExecutionSettings()); + auto data2 = 
handler.generateData(bm->getExecutionSettings()); double aggregated_error = 0.0; for (int i = 0; i < data->blockSize * data->blockSize * data->numBlocks; i++) { aggregated_error += std::fabs(data->A[i] - data2->A[i]); @@ -173,9 +174,9 @@ TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagReproducableA) { TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagReproducableB) { bm->getExecutionSettings().programSettings->kernelReplications = 2; bm->getExecutionSettings().programSettings->matrixSize = bm->getExecutionSettings().programSettings->blockSize; - auto handler = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](0,1); - auto data = handler->generateData(bm->getExecutionSettings()); - auto data2 = handler->generateData(bm->getExecutionSettings()); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); + auto data = handler.generateData(bm->getExecutionSettings()); + auto data2 = handler.generateData(bm->getExecutionSettings()); double aggregated_error = 0.0; for (int i = 0; i < data->blockSize * data->blockSize * data->numBlocks; i++) { aggregated_error += std::fabs(data->B[i] - data2->B[i]); @@ -186,10 +187,10 @@ TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagReproducableB) { TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagExchangeWorksForSingleRank) { bm->getExecutionSettings().programSettings->kernelReplications = 2; bm->getExecutionSettings().programSettings->matrixSize = bm->getExecutionSettings().programSettings->blockSize; - auto handler = transpose::dataHandlerIdentifierMap[TRANSPOSE_HANDLERS_DIST_DIAG](0,1); - auto data = handler->generateData(bm->getExecutionSettings()); - auto data2 = handler->generateData(bm->getExecutionSettings()); - handler->exchangeData(*data); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); + auto data = handler.generateData(bm->getExecutionSettings()); + auto data2 = 
handler.generateData(bm->getExecutionSettings()); + handler.exchangeData(*data); double aggregated_error = 0.0; for (int i = 0; i < data->blockSize * data->blockSize * data->numBlocks; i++) { aggregated_error += std::fabs(data->A[i] - data2->A[i]); diff --git a/README.md b/README.md index c75d7ab0..4f7325c8 100755 --- a/README.md +++ b/README.md @@ -227,7 +227,7 @@ This allows higher memory bandwidths during kernel execution. - *High Bandwidth Memory (HBM)*: The FPGA fabric itself is equipped with memory banks that can be accessed by the host to copy data. Compared to DDR, this memory type consists of more, but smaller memory banks so that the host needs to split the data between all memory banks to achieve the best performance. Still, the total achievable memory bandwidth is much higher compared to DDR. The following three tables contain an overview of the compatibility of all benchmarks that use global memory with the three mentioned memory types. -b_eff is not included since it does not use global memory. +b_eff does use global memory only for validation. Still, the support for different memory types needs to be implemented on the host side. Full support of the benchmark is indicated with a **Yes**, functionally correct behavior but performance limitations are indicated with **(Yes)**, no support is indicated with **No**. For Xilinx, all benchmarks need a compatible compile- and link-settings-file to map the kernel memory ports to the available memory banks. LINPACK, PTRANS and b_eff are currently not working with Xilinx FPGAs because the implementations lack support for inter-FPGA communication on these devices. @@ -239,10 +239,11 @@ Support will be added subsequently. 
|--------------|------------|--------------| | STREAM | Yes | Yes | | RandomAccess | Yes | Yes | -| PTRANS | Yes | No | -| LINPACK | Yes | No | +| PTRANS | Yes | Yes | +| LINPACK | Yes | Yes | | GEMM | Yes | Yes | -| FFT | Yes | Yes | +| FFT | Yes | Yes | +| b_eff | Yes | Yes | #### HBM @@ -257,6 +258,7 @@ Support will be added subsequently. | LINPACK | No | No | | GEMM | Yes | Yes | | FFT | Yes | Yes | +| b_eff | No | No | #### SVM @@ -270,6 +272,7 @@ SVM could not be tested with Xilinx-based boards, yet. Thus, they are considered | LINPACK | No | No | | GEMM | Yes | No | | FFT | Yes | No | +| b_eff | No | No | ## Publications diff --git a/STREAM/CMakeLists.txt b/STREAM/CMakeLists.txt index b087939b..67f66982 100755 --- a/STREAM/CMakeLists.txt +++ b/STREAM/CMakeLists.txt @@ -11,8 +11,10 @@ set(INNER_LOOP_BUFFERS ON CACHE BOOL "Put the local memory buffers inside the ou mark_as_advanced(INNER_LOOP_BUFFERS) -# Set the data type since optional vector types are used -set(DATA_TYPE float) +# Set the data type if not defined before to set up vector types +if (NOT DEFINED DATA_TYPE) + set(DATA_TYPE float) +endif() set(HOST_DATA_TYPE cl_${DATA_TYPE}) if (VECTOR_COUNT GREATER 1) set(DEVICE_DATA_TYPE ${DATA_TYPE}${VECTOR_COUNT}) diff --git a/b_eff/CMakeLists.txt b/b_eff/CMakeLists.txt index ecdd99ac..0793aa92 100755 --- a/b_eff/CMakeLists.txt +++ b/b_eff/CMakeLists.txt @@ -14,6 +14,8 @@ set(NUM_REPLICATIONS 2 CACHE STRING "") set(USE_MPI Yes) set(USE_DEPRECATED_HPP_HEADER No) +set(COMMUNICATION_TYPE_SUPPORT_ENABLED Yes) + set(DATA_TYPE char) include(${CMAKE_SOURCE_DIR}/../cmake/general_benchmark_build_setup.cmake) unset(DATA_TYPE CACHE) diff --git a/b_eff/src/device/CMakeLists.txt b/b_eff/src/device/CMakeLists.txt index 4f34a65a..8316a884 100644 --- a/b_eff/src/device/CMakeLists.txt +++ b/b_eff/src/device/CMakeLists.txt @@ -3,8 +3,12 @@ set(KERNEL_REPLICATION_ENABLED Yes CACHE INTERNAL "Enables kernel replication in set(NUM_REPLICATIONS 2) 
include(${CMAKE_SOURCE_DIR}/../cmake/kernelTargets.cmake) -generate_kernel_targets_intel(communication_bw520n) -add_test(NAME test_emulation_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_emulate.aocx -l 1 -u 10 -m 0 -n 1 +generate_kernel_targets_intel(communication_bw520n_IEC) +add_test(NAME test_emulation_iec_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 10 -m 0 -n 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) -add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Network_intel -f communication_bw520n_emulate.aocx -l 1 -u 1 -m 20 -n 1 +add_test(NAME test_emulation_cpu_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx --comm-type CPU -l 1 -u 10 -m 0 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) +add_test(NAME test_emulation_pcie_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx --comm-type PCIE -l 1 -u 10 -m 0 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) +add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 1 -m 20 -n 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) diff --git a/b_eff/src/device/communication_bw520n.cl b/b_eff/src/device/communication_bw520n_IEC.cl similarity index 100% rename from b_eff/src/device/communication_bw520n.cl rename to b_eff/src/device/communication_bw520n_IEC.cl diff --git a/b_eff/src/host/CMakeLists.txt b/b_eff/src/host/CMakeLists.txt index 44bb1832..fb08281f 100755 --- a/b_eff/src/host/CMakeLists.txt +++ b/b_eff/src/host/CMakeLists.txt @@ -1,5 
+1,5 @@ add_subdirectory(../../../shared ${CMAKE_BINARY_DIR}/lib/hpccbase) -set(HOST_SOURCE execution_default.cpp network_benchmark.cpp) +set(HOST_SOURCE network_benchmark.cpp) include_directories(${MPI_CXX_INCLUDE_PATH}) set(HOST_EXE_NAME Network) diff --git a/PTRANS/src/host/execution.h b/b_eff/src/host/execution_types/execution.hpp similarity index 56% rename from PTRANS/src/host/execution.h rename to b_eff/src/host/execution_types/execution.hpp index 81ed371c..df630838 100644 --- a/PTRANS/src/host/execution.h +++ b/b_eff/src/host/execution_types/execution.hpp @@ -19,32 +19,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#ifndef SRC_HOST_EXECUTION_H_ -#define SRC_HOST_EXECUTION_H_ -/* C++ standard library headers */ -#include -#include - -/* External library headers */ -#include "CL/cl.hpp" -#include "parameters.h" -#include "transpose_benchmark.hpp" - - -namespace bm_execution { - - -/** - * @brief Transpose and add the matrices using the OpenCL kernel - * - * @param config The progrma configuration - * @param data data object that contains all required data for the execution on the FPGA - * @return std::unique_ptr The measured execution times - */ - std::unique_ptr - calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data); - -} // namespace bm_execution - -#endif // SRC_HOST_EXECUTION_H_ +#include "execution_types/execution_cpu.hpp" +#include "execution_types/execution_pcie.hpp" +#include "execution_types/execution_iec.hpp" \ No newline at end of file diff --git a/b_eff/src/host/execution_types/execution_cpu.hpp b/b_eff/src/host/execution_types/execution_cpu.hpp new file mode 100644 index 00000000..778dc2f1 --- /dev/null +++ b/b_eff/src/host/execution_types/execution_cpu.hpp @@ -0,0 +1,118 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a 
copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_EXECUTION_TYPES_EXECUTION_CPU_HPP +#define SRC_HOST_EXECUTION_TYPES_EXECUTION_CPU_HPP + +/* C++ standard library headers */ +#include +#include +#include + +/* External library headers */ +#include "mpi.h" + +/* Project's headers */ + +namespace network::execution_types::cpu { + + /* + Implementation for the single kernel. 
+ @copydoc bm_execution::calculate() + */ + std::shared_ptr + calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, + cl::vector &validationData) { + + int err; + std::vector sendQueues; + std::vector dummyBuffers; + std::vector> dummyBufferContents; + + cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize)); + + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + + int current_size; + MPI_Comm_size(MPI_COMM_WORLD, & current_size); + + std::vector calculationTimings; + for (uint r =0; r < config.programSettings->numRepetitions; r++) { + sendQueues.clear(); + dummyBuffers.clear(); + dummyBufferContents.clear(); + // Create all kernels and buffers. The kernel pairs are generated twice to utilize all channels + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + + dummyBuffers.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE) * size_in_bytes,0,&err)); + ASSERT_CL(err) + + dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); + + cl::CommandQueue sendQueue(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + + sendQueue.enqueueWriteBuffer(dummyBuffers.back(), CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents.back().data()); + + sendQueues.push_back(sendQueue); + + } + double calculationTime = 0.0; + for (int i = 0; i < config.programSettings->kernelReplications; i++) { + MPI_Barrier(MPI_COMM_WORLD); + auto startCalculation = std::chrono::high_resolution_clock::now(); + for (int l = 0; l < looplength; l++) { + MPI_Sendrecv(dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, + dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } + auto endCalculation = 
std::chrono::high_resolution_clock::now(); + calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); + #ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Enqueued " << r << "," << i << std::endl; + #endif + } + calculationTimings.push_back(calculationTime); +#ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Done " << r << std::endl; +#endif + } + // Read validation data from FPGA will be placed sequentially in buffer for all replications + // The data order should not matter, because every byte should have the same value! + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + err = sendQueues[r].enqueueReadBuffer(dummyBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]); + ASSERT_CL(err); + } + std::shared_ptr result(new network::ExecutionTimings{ + looplength, + messageSize, + calculationTimings + }); + return result; + } + +} // namespace bm_execution + +#endif diff --git a/b_eff/src/host/execution_default.cpp b/b_eff/src/host/execution_types/execution_iec.hpp similarity index 95% rename from b_eff/src/host/execution_default.cpp rename to b_eff/src/host/execution_types/execution_iec.hpp index 1c5cd908..bed54f09 100644 --- a/b_eff/src/host/execution_default.cpp +++ b/b_eff/src/host/execution_types/execution_iec.hpp @@ -19,9 +19,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ - -/* Related header files */ -#include "execution.h" +#ifndef SRC_HOST_EXECUTION_TYPES_EXECUTION_IEC_HPP +#define SRC_HOST_EXECUTION_TYPES_EXECUTION_IEC_HPP /* C++ standard library headers */ #include @@ -34,7 +33,7 @@ SOFTWARE. /* Project's headers */ -namespace bm_execution { +namespace network::execution_types::iec { /* Implementation for the single kernel. @@ -126,7 +125,7 @@ namespace bm_execution { // Read validation data from FPGA will be placed sequentially in buffer for all replications // The data order should not matter, because every byte should have the same value! for (int r = 0; r < config.programSettings->kernelReplications; r++) { - err = recvQueues[r].enqueueReadBuffer(validationBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / 2, &validationData.data()[r * validationData.size() / 2]); + err = recvQueues[r].enqueueReadBuffer(validationBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]); ASSERT_CL(err); } std::shared_ptr result(new network::ExecutionTimings{ @@ -138,3 +137,5 @@ namespace bm_execution { } } // namespace bm_execution + +#endif \ No newline at end of file diff --git a/b_eff/src/host/execution_types/execution_pcie.hpp b/b_eff/src/host/execution_types/execution_pcie.hpp new file mode 100644 index 00000000..73156b7e --- /dev/null +++ b/b_eff/src/host/execution_types/execution_pcie.hpp @@ -0,0 +1,124 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the 
following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_EXECUTION_TYPES_EXECUTION_PCIE_HPP +#define SRC_HOST_EXECUTION_TYPES_EXECUTION_PCIE_HPP + +/* C++ standard library headers */ +#include +#include +#include + +/* External library headers */ +#include "mpi.h" + +/* Project's headers */ + +namespace network::execution_types::pcie { + + /* + Implementation for the single kernel. + @copydoc bm_execution::calculate() + */ + std::shared_ptr + calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, + cl::vector &validationData) { + + int err; + std::vector sendQueues; + std::vector dummyBuffers; + std::vector> dummyBufferContents; + + cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize)); + + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + + int current_size; + MPI_Comm_size(MPI_COMM_WORLD, & current_size); + + std::vector calculationTimings; + for (uint r =0; r < config.programSettings->numRepetitions; r++) { + sendQueues.clear(); + dummyBuffers.clear(); + dummyBufferContents.clear(); + // Create all kernels and buffers. 
The kernel pairs are generated twice to utilize all channels + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + + dummyBuffers.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE) * size_in_bytes,0,&err)); + ASSERT_CL(err) + + dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); + + cl::CommandQueue sendQueue(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + + sendQueue.enqueueWriteBuffer(dummyBuffers.back(), CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents.back().data()); + + sendQueues.push_back(sendQueue); + + } + double calculationTime = 0.0; + for (int i = 0; i < config.programSettings->kernelReplications; i++) { + MPI_Barrier(MPI_COMM_WORLD); + auto startCalculation = std::chrono::high_resolution_clock::now(); + for (int l = 0; l < looplength; l++) { + + sendQueues[i].enqueueReadBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data()); + + MPI_Sendrecv(dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, + dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + + sendQueues[i].enqueueWriteBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data()); + + } + auto endCalculation = std::chrono::high_resolution_clock::now(); + calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); + #ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Enqueued " << r << "," << i << std::endl; + #endif + } + calculationTimings.push_back(calculationTime); +#ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank 
" << current_rank << ": Done " << r << std::endl; +#endif + } + // Read validation data from FPGA will be placed sequentially in buffer for all replications + // The data order should not matter, because every byte should have the same value! + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + err = sendQueues[r].enqueueReadBuffer(dummyBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]); + ASSERT_CL(err); + } + std::shared_ptr result(new network::ExecutionTimings{ + looplength, + messageSize, + calculationTimings + }); + return result; + } + +} // namespace network::execution_types::pcie + +#endif diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp index 8a310a14..9d1512e4 100644 --- a/b_eff/src/host/network_benchmark.cpp +++ b/b_eff/src/host/network_benchmark.cpp @@ -31,7 +31,7 @@ SOFTWARE. #include /* Project's headers */ -#include "execution.h" +#include "execution_types/execution.hpp" #include "parameters.h" network::NetworkProgramSettings::NetworkProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), @@ -50,6 +50,7 @@ network::NetworkProgramSettings::getSettingsMap() { network::NetworkData::NetworkDataItem::NetworkDataItem(unsigned int _messageSize, unsigned int _loopLength) : messageSize(_messageSize), loopLength(_loopLength), validationBuffer(CHANNEL_WIDTH * 2 * 2, 0) { + // TODO: fix the validation buffer size to use the variable number of kernel replications and channels // Validation data buffer should be big enough to fit the data of two channels // for every repetition. 
The number of kernel replications is fixed to 2, which // also needs to be multiplied with the buffer size @@ -104,7 +105,14 @@ network::NetworkBenchmark::executeKernel(NetworkData &data) { if (world_rank == 0) { std::cout << "Measure for " << (1 << run.messageSize) << " Byte" << std::endl; } - timing_results.push_back(bm_execution::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer)); + std::shared_ptr timing; + switch (executionSettings->programSettings->communicationType) { + case hpcc_base::CommunicationType::cpu_only: timing = execution_types::cpu::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break; + case hpcc_base::CommunicationType::pcie_mpi: timing = execution_types::pcie::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break; + case hpcc_base::CommunicationType::intel_external_channels: timing = execution_types::iec::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break; + default: throw std::runtime_error("Selected Communication type not supported: " + hpcc_base::commToString(executionSettings->programSettings->communicationType)); + } + timing_results.push_back(timing); } std::unique_ptr collected_results = std::unique_ptr (new network::NetworkExecutionTimings()); diff --git a/b_eff/tests/test_kernel_functionality_and_host_integration.cpp b/b_eff/tests/test_kernel_functionality_and_host_integration.cpp index a564cedc..ba254201 100644 --- a/b_eff/tests/test_kernel_functionality_and_host_integration.cpp +++ b/b_eff/tests/test_kernel_functionality_and_host_integration.cpp @@ -10,7 +10,7 @@ #include "test_program_settings.h" #include -struct NetworkKernelTest : testing::Test { +struct NetworkKernelTest : testing::TestWithParam { std::unique_ptr bm; std::unique_ptr data; unsigned numberOfChannels = 4; @@ -22,6 +22,7 @@ struct NetworkKernelTest : testing::Test { void SetUp() override { bm = std::unique_ptr(new 
network::NetworkBenchmark(global_argc, global_argv)); bm->getExecutionSettings().programSettings->numRepetitions = 1; + bm->getExecutionSettings().programSettings->communicationType = GetParam(); data = bm->generateInputData(); createChannelFilesAndSymbolicLinks(); } @@ -47,7 +48,7 @@ struct NetworkKernelTest : testing::Test { /** * Tests if calculate returns the correct execution results */ -TEST_F(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor111) { +TEST_P(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor111) { data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(1,1)); auto result = bm->executeKernel(*data); @@ -59,7 +60,7 @@ TEST_F(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor111) { /** * Tests if calculate returns the correct execution results for multiple repetitions */ -TEST_F(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor842) { +TEST_P(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor842) { bm->getExecutionSettings().programSettings->numRepetitions = 2; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(8,4)); @@ -72,7 +73,11 @@ TEST_F(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor842) { /** * Tests if data is written to the channels for small message sizes */ -TEST_F(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingOneChannel) { +TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingOneChannel) { + if (bm->getExecutionSettings().programSettings->communicationType != hpcc_base::CommunicationType::intel_external_channels) { + // Skip this test if no IEC are used, because they are specific to the IEC emulation based on files + GTEST_SKIP(); + } const unsigned messageSize = std::log2(CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); @@ -96,7 +101,11 @@ TEST_F(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingOneChannel) /** * Tests if data is 
written to the channels for small message sizes filling two channels */ -TEST_F(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingTwoChannels) { +TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingTwoChannels) { + if (bm->getExecutionSettings().programSettings->communicationType != hpcc_base::CommunicationType::intel_external_channels) { + // Skip this test if no IEC are used, because they are specific to the IEC emulation based on files + GTEST_SKIP(); + } const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); @@ -117,7 +126,11 @@ TEST_F(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingTwoChannels /** * Tests if data is written to the channels for message sizes filling more than two channels */ -TEST_F(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingMoreThanTwoChannels) { +TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingMoreThanTwoChannels) { + if (bm->getExecutionSettings().programSettings->communicationType != hpcc_base::CommunicationType::intel_external_channels) { + // Skip this test if no IEC are used, because they are specific to the IEC emulation based on files + GTEST_SKIP(); + } const unsigned messageSize = std::log2(8 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 1; data->items.clear(); @@ -138,7 +151,11 @@ TEST_F(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingMoreThanTwo /** * Tests if correct data is written to the channels */ -TEST_F(NetworkKernelTest, CorrectDataIsWrittenToChannel) { +TEST_P(NetworkKernelTest, CorrectDataIsWrittenToChannel) { + if (bm->getExecutionSettings().programSettings->communicationType != hpcc_base::CommunicationType::intel_external_channels) { + // Skip this test if no IEC are used, because they are specific to the IEC emulation based on files + GTEST_SKIP(); + } const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / 
sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); @@ -158,7 +175,7 @@ TEST_F(NetworkKernelTest, CorrectDataIsWrittenToChannel) { delete [] buffer; } -TEST_F(NetworkKernelTest, ValidationDataIsStoredCorrectlyForTwoChannels) { +TEST_P(NetworkKernelTest, ValidationDataIsStoredCorrectlyForTwoChannels) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); @@ -173,7 +190,7 @@ TEST_F(NetworkKernelTest, ValidationDataIsStoredCorrectlyForTwoChannels) { EXPECT_TRUE(all_same); } -TEST_F(NetworkKernelTest, ValidationDataIsStoredCorrectlyForSmallMessageSize) { +TEST_P(NetworkKernelTest, ValidationDataIsStoredCorrectlyForSmallMessageSize) { const unsigned messageSize = 0; const unsigned looplength = 4; data->items.clear(); @@ -188,7 +205,7 @@ TEST_F(NetworkKernelTest, ValidationDataIsStoredCorrectlyForSmallMessageSize) { EXPECT_TRUE(all_same); } -TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength4) { +TEST_P(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength4) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); @@ -197,7 +214,7 @@ TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength4) { EXPECT_EQ(CHANNEL_WIDTH * 2 * 2, data->items[0].validationBuffer.size()); } -TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength1) { +TEST_P(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength1) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 1; data->items.clear(); @@ -206,7 +223,7 @@ TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength1) { EXPECT_EQ(CHANNEL_WIDTH * 2 * 2, data->items[0].validationBuffer.size()); } -TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForDifferentMessageSize) { +TEST_P(NetworkKernelTest, 
ValidationDataHasCorrectSizeForDifferentMessageSize) { const unsigned messageSize = 0; const unsigned looplength = 1; data->items.clear(); @@ -215,7 +232,7 @@ TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForDifferentMessageSize) { EXPECT_EQ(looplength * CHANNEL_WIDTH * 2 * 2, data->items[0].validationBuffer.size()); } -TEST_F(NetworkKernelTest, ValidationDataSingleItemWrongCheckFails) { +TEST_P(NetworkKernelTest, ValidationDataSingleItemWrongCheckFails) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const HOST_DATA_TYPE expected_data = static_cast(messageSize & 255); const unsigned looplength = 4; @@ -226,7 +243,7 @@ TEST_F(NetworkKernelTest, ValidationDataSingleItemWrongCheckFails) { EXPECT_FALSE(bm->validateOutputAndPrintError(*data)); } -TEST_F(NetworkKernelTest, ValidationDataWrongCheckFails) { +TEST_P(NetworkKernelTest, ValidationDataWrongCheckFails) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const HOST_DATA_TYPE expected_data = static_cast(messageSize & 255); const unsigned looplength = 4; @@ -236,7 +253,7 @@ TEST_F(NetworkKernelTest, ValidationDataWrongCheckFails) { EXPECT_FALSE(bm->validateOutputAndPrintError(*data)); } -TEST_F(NetworkKernelTest, ValidationDataCorrectCheckSuccessful) { +TEST_P(NetworkKernelTest, ValidationDataCorrectCheckSuccessful) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const HOST_DATA_TYPE expected_data = static_cast(messageSize & 255); const unsigned looplength = 4; @@ -246,7 +263,7 @@ TEST_F(NetworkKernelTest, ValidationDataCorrectCheckSuccessful) { EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); } -TEST_F(NetworkKernelTest, ValidationDataCorrectOneMessageSizeAfterExecution) { +TEST_P(NetworkKernelTest, ValidationDataCorrectOneMessageSizeAfterExecution) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); @@ 
-258,7 +275,7 @@ TEST_F(NetworkKernelTest, ValidationDataCorrectOneMessageSizeAfterExecution) { // This test is disabled because it does not work with the current implementation of the // external channels in software emulation. The different kernel executions will read // the old data from the channel file, which will lead to a failing validation! -TEST_F(NetworkKernelTest, DISABLED_ValidationDataCorrectTwoMessageSizesAfterExecution) { +TEST_P(NetworkKernelTest, DISABLED_ValidationDataCorrectTwoMessageSizesAfterExecution) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); @@ -268,7 +285,7 @@ TEST_F(NetworkKernelTest, DISABLED_ValidationDataCorrectTwoMessageSizesAfterExec EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); } -TEST_F(NetworkKernelTest, ValidationDataWrongTwoMessageSizesAfterExecution) { +TEST_P(NetworkKernelTest, ValidationDataWrongTwoMessageSizesAfterExecution) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); @@ -279,3 +296,9 @@ TEST_F(NetworkKernelTest, ValidationDataWrongTwoMessageSizesAfterExecution) { EXPECT_FALSE(bm->validateOutputAndPrintError(*data)); } + + +INSTANTIATE_TEST_CASE_P( + NetworkKernelParametrizedTests, + NetworkKernelTest, + ::testing::Values(hpcc_base::CommunicationType::intel_external_channels,hpcc_base::CommunicationType::cpu_only, hpcc_base::CommunicationType::pcie_mpi)); diff --git a/cmake/general_benchmark_build_setup.cmake b/cmake/general_benchmark_build_setup.cmake index 2624dcaa..28f696f3 100644 --- a/cmake/general_benchmark_build_setup.cmake +++ b/cmake/general_benchmark_build_setup.cmake @@ -16,6 +16,12 @@ else() set(header_default Yes) endif() +if(DEFINED COMMUNICATION_TYPE_SUPPORT_ENABLED) + set(comm_support_default ${COMMUNICATION_TYPE_SUPPORT_ENABLED}) +else() + set(comm_support_default No) +endif() + # Host code specific options 
set(DEFAULT_REPETITIONS 10 CACHE STRING "Default number of repetitions") set(DEFAULT_DEVICE -1 CACHE STRING "Index of the default device to use") @@ -29,8 +35,9 @@ set(USE_DEPRECATED_HPP_HEADER ${header_default} CACHE BOOL "Flag that indicates set(HPCC_FPGA_CONFIG ${HPCC_FPGA_CONFIG} CACHE FILEPATH "Configuration file that is used to overwrite the default configuration") set(NUM_REPLICATIONS 4 CACHE STRING "Number of times the kernels will be replicated") set(KERNEL_REPLICATION_ENABLED Yes CACHE INTERNAL "Enables kernel replication for the OpenCL kernel targets") +set(COMMUNICATION_TYPE_SUPPORT_ENABLED ${comm_support_default} CACHE INTERNAL "Enables the support for the selection of the communication type which has to be implemented by the specific benchmark") -mark_as_advanced(KERNEL_REPLICATION_ENABLED) +mark_as_advanced(KERNEL_REPLICATION_ENABLED COMMUNICATION_TYPE_SUPPORT_ENABLED) if (NOT KERNEL_REPLICATION_ENABLED) # Only define NUM_REPLICATIONS if kernel replications is enabled unset(NUM_REPLICATIONS) @@ -119,6 +126,11 @@ if (USE_DEPRECATED_HPP_HEADER) add_definitions(-DUSE_DEPRECATED_HPP_HEADER) endif() +# set the communication type flag if required +if (COMMUNICATION_TYPE_SUPPORT_ENABLED) + add_definitions(-DCOMMUNICATION_TYPE_SUPPORT_ENABLED) +endif() + # Set OpenCL version that should be used set(HPCC_FPGA_OPENCL_VERSION 200 CACHE STRING "OpenCL version that should be used for the host code compilation") mark_as_advanced(HPCC_FPGA_OPENCL_VERSION) diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake index 20d6506d..1d7e667f 100644 --- a/cmake/kernelTargets.cmake +++ b/cmake/kernelTargets.cmake @@ -1,6 +1,5 @@ set(COMPILER_INCLUDES "-I${CMAKE_BINARY_DIR}/src/common/" "-I${CMAKE_CURRENT_SOURCE_DIR}") -set(CLFLAGS --config ${XILINX_COMPILE_SETTINGS_FILE}) set(Vitis_EMULATION_CONFIG_UTIL $ENV{XILINX_VITIS}/bin/emconfigutil) @@ -45,6 +44,13 @@ function(generate_kernel_targets_xilinx) set(local_CLFLAGS ${CLFLAGS} -DXILINX_FPGA) list(APPEND 
local_CLFLAGS --report_dir=${xilinx_report_folder} --log_dir=${xilinx_report_folder}/logs) + string(REGEX MATCH "^.+\.tcl" is_tcl_script ${XILINX_COMPILE_SETTINGS_FILE}) + if (is_tcl_script) + set(CLFLAGS --hls.pre_tcl ${XILINX_COMPILE_SETTINGS_FILE}) + else() + set(CLFLAGS --config ${XILINX_COMPILE_SETTINGS_FILE}) + endif() + # build emulation config for device add_custom_command(OUTPUT ${EXECUTABLE_OUTPUT_PATH}/emconfig.json COMMAND ${Vitis_EMULATION_CONFIG_UTIL} -f ${FPGA_BOARD_NAME} --od ${EXECUTABLE_OUTPUT_PATH} @@ -164,7 +170,7 @@ function(generate_kernel_targets_intel) DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bitstream_f} ) add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bitstream_emulate_f} - COMMAND ${IntelFPGAOpenCL_AOC} ${source_f} -DEMULATE -DINTEL_FPGA ${COMPILER_INCLUDES} ${AOC_FLAGS} -legacy-emulator -march=emulator + COMMAND ${IntelFPGAOpenCL_AOC} ${source_f} -DEMULATE -DINTEL_FPGA ${COMPILER_INCLUDES} ${AOC_FLAGS} -march=emulator -o ${CMAKE_CURRENT_BINARY_DIR}/${bitstream_emulate_f} MAIN_DEPENDENCY ${source_f} DEPENDS ${CMAKE_BINARY_DIR}/src/common/parameters.h diff --git a/cmake/unitTestTargets.cmake b/cmake/unitTestTargets.cmake index 921cf5ab..2597017b 100644 --- a/cmake/unitTestTargets.cmake +++ b/cmake/unitTestTargets.cmake @@ -13,8 +13,9 @@ if (INTELFPGAOPENCL_FOUND) endif() target_compile_options(${HOST_EXE_NAME}_test_intel PRIVATE "${OpenMP_CXX_FLAGS}") foreach (kernel_target ${kernel_emulation_targets_intel}) + set(additional_commands "") string(REPLACE "_intel" ".aocx" kernel_name ${kernel_target}) - add_test(NAME test_unit_${kernel_target} COMMAND $ -f ${kernel_name} ${TEST_HOST_FLAGS} WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_unit_${kernel_target} COMMAND $ ${additional_commands} -f ${kernel_name} ${TEST_HOST_FLAGS} WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endforeach(kernel_target) endif() diff --git a/scripts/evaluation/parse_raw_to_csv.py b/scripts/evaluation/parse_raw_to_csv.py index 
5b57fd57..f5e437d2 100755 --- a/scripts/evaluation/parse_raw_to_csv.py +++ b/scripts/evaluation/parse_raw_to_csv.py @@ -9,10 +9,10 @@ import sys # Regular expressions for the raw output of all -fft_regex = "Version:\\s+(?P.+)\n(.*\n)+Batch\\sSize\\s+(?P\d+)\nFFT\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+res\.\\serror\\s+mach\.\\seps\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+avg\\s+best\n\\s+Time\\s+in\\s+s:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+GFLOPS:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" +fft_regex = "Version:\\s+(?P.+)\n(.*\n)+Batch\\sSize\\s+(?P\d+)\n(.*\n)FFT\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+res\.\\serror\\s+mach\.\\seps\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+avg\\s+best\n\\s+Time\\s+in\\s+s:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+GFLOPS:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" gemm_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GFLOPS\n\\s+(?P.+)\\s+(?P.+)\\s+(?P.+)" ra_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+Error:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GUOPS\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" -trans_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s*Maximum\\serror:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+calc\\s+calc\\s+FLOPS\\s+Net\\s+\\[B/s\\]\\s+Mem\\s+\\[B/s\\]\n\\s*avg:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s*best:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" +trans_regex = 
"Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s*Maximum\\serror:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+total\\s\\[s\\]\\s+transfer\\s\\[s\\]\\s+calc\\s\\[s\\]\\s+calc\\s+FLOPS\\s+Mem\\s+\\[B/s\\]\\s+PCIe\\s+\\[B/s\\]\n\\s*avg:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e|inf)+)\n\\s*best:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e|inf)+)" stream_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+\\d+\\s+\\((?P(\d|\.|\+|-|e)+)(.*\n)+Data\\sType\\s+(?P.+)\n(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Kernel\\sType\\s+(?P.+)\n(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+Function\\s+Best\\sRate\\sMB/s\\s+Avg\\stime\\ss\\s+Min\\stime\\s+Max\\stime\n\\s+Add\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Copy\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+PCI\\sread\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+PCI\\swrite\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Scale\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Triad\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" linpack_regex = 
"Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep.+\n\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+Method\\s+\\s+best\\s+mean\\s+GFLOPS(\\s*\n)\\s+total\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(\\s*\n)\\s+GEFA\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(\\s*\n)\\s+GESL\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" diff --git a/scripts/test_all.sh b/scripts/test_all.sh index fd8b710c..081e8474 100755 --- a/scripts/test_all.sh +++ b/scripts/test_all.sh @@ -12,7 +12,6 @@ # ./test_all.sh -DFPGA_BOARD_NAME=other_board # - SCRIPT_PATH=$( cd "$(dirname $0)"; pwd -P) PROJECT_ROOT=${SCRIPT_PATH}/.. @@ -24,8 +23,12 @@ TEST_LOG_FILE=${TEST_DIR}/lasttests.log BENCHMARKS=("b_eff" "FFT" "GEMM" "LINPACK" "PTRANS" "RandomAccess" "STREAM") -# Xilinx benchmarks: -#BENCHMARKS=("RandomAccess" "STREAM") +if [ "$1" != "inc" ]; then + echo "Clean build directory, use option 'inc' to prevent this!" + rm -rf ${TEST_DIR} +else + echo "Do incremental build based on previous run!" +fi mkdir -p $TEST_DIR rm -f $BUILD_LOG_FILE @@ -37,11 +40,16 @@ echo "Start building hosts code, tests and emulation kernel for all benchmarks." for bm in ${BENCHMARKS[@]}; do echo "Building $bm..." 
+ if [ -f ${TEST_DIR}/$bm/BUILD_SUCCESS ]; then + continue + else + rm -rf ${TEST_DIR}/$bm + fi cd $TEST_DIR mkdir -p $bm ret=0 cd $bm - cmake ${PROJECT_ROOT}/$bm -DDEFAULT_DEVICE=0 -DDEFAULT_PLATFORM=0 -DBLOCK_SIZE=32 $@ &>> $BUILD_LOG_FILE + cmake ${PROJECT_ROOT}/$bm -DDEFAULT_DEVICE=0 -DDEFAULT_PLATFORM=0 -DBLOCK_SIZE=32 &>> $BUILD_LOG_FILE ret=$(($ret + $?)) make -j 40 VERBOSE=1 all &>> $BUILD_LOG_FILE ret=$(($ret + $?)) @@ -50,12 +58,16 @@ for bm in ${BENCHMARKS[@]}; do echo "For more information see $BUILD_LOG_FILE" exit $ret fi + touch ${TEST_DIR}/$bm/BUILD_SUCCESS done echo "Start testing all benchmarks" for bm in ${BENCHMARKS[@]}; do echo "Testing $bm..." + if [ -f ${TEST_DIR}/$bm/TEST_SUCCESS ]; then + continue + fi cd $TEST_DIR ret=0 cd $bm @@ -85,13 +97,14 @@ for bm in ${BENCHMARKS[@]}; do ln -s kernel_output_ch3 kernel_input_ch2 cd .. fi - make XCL_EMULATION_MODE=sw_emu CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test &>> $TEST_LOG_FILE + make XCL_EMULATION_MODE=sw_emu CTEST_OUTPUT_ON_FAILURE=1 test &>> $TEST_LOG_FILE ret=$(($ret + $?)) if [ $ret -ne 0 ]; then echo "Failed testing $bm" echo "For more information see $TEST_LOG_FILE" exit $ret fi + touch ${TEST_DIR}/$bm/TEST_SUCCESS done echo "-----------" diff --git a/shared/include/communication_types.hpp b/shared/include/communication_types.hpp new file mode 100644 index 00000000..bb46bb8d --- /dev/null +++ b/shared/include/communication_types.hpp @@ -0,0 +1,124 @@ +/* +Copyright (c) 2021 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this 
 permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef HPCC_BASE_COMMUNICATION_TYPES_H_ +#define HPCC_BASE_COMMUNICATION_TYPES_H_ + +#define DEFAULT_COMM_TYPE "AUTO" + +#include + +namespace hpcc_base { + +/** + * @brief This enumeration contains all available communication types. They differ in the way data is exchanged between FPGAs. A special case is cpu_only which can be used to implement CPU references + * + */ +typedef enum _CommunicationType { + + /** + * @brief Communication using the external channels extension + * + */ + intel_external_channels, + + /** + * @brief Copy the data from FPGA to CPU and send it via MPI + * + */ + pcie_mpi, + + /** + * @brief Communication using the Streaming Message Interface + * + */ + smi, + + /** + * @brief Calculate the benchmark on CPU instead of FPGA + * + */ + cpu_only, + + /** + * @brief Indicates, that the use of the communication type is disabled + * + */ + unsupported, + + /** + * @brief Automatically detect communication type from kernel file name + * + */ + automatic + +} CommunicationType; + +static const std::map comm_to_str_map{ + {"IEC", CommunicationType::intel_external_channels}, + {"PCIE", CommunicationType::pcie_mpi}, + {"SMI", CommunicationType::smi}, + {"CPU", CommunicationType::cpu_only}, + {"UNSUPPORTED", CommunicationType::unsupported}, + {"AUTO", CommunicationType::automatic} + }; + +/** + * @brief Serializes an enum of type CommunicationType into a string. 
The resulting string can be used with the function retrieveCommunicationType to get back the enum. + * + * @param e the communication type that should be converted into a string + * @return std::string String representation of the communication type + */ +static std::string commToString(CommunicationType c) { + for (auto& entry : comm_to_str_map) { + if (entry.second == c) { + return entry.first; + } + } + throw std::runtime_error("Communication type could not be converted to string!"); +} + +/** + * @brief Deserializes a string into a enum of type CommunicationType. If the execution type is auto, the given kernel file name is used to determine the communication type. If this is not possible, an exception is thrown + * + * @param exe_name String serialization of the communication tpye + * @param kernel_filename the name of the used bitstream file + * @return CommunicationType the determined communication type. Will throw a runtime error if it is not possible to retrieve the execution type + */ +static CommunicationType retrieveCommunicationType(std::string comm_name, std::string kernel_filename) { + auto result = comm_to_str_map.find(comm_name); + if (result != comm_to_str_map.end()) { + if (result->second == CommunicationType::automatic) { + for (auto &comm_type: comm_to_str_map) { + if (kernel_filename.find(comm_type.first) != std::string::npos) { + return comm_type.second; + } + } + throw std::runtime_error("Communication type could not be autodetected from kernel_filename: " + kernel_filename); + } else { + return result->second; + } + } + throw std::runtime_error("Communication type could not be converted from string: " + comm_name); +} +} + +#endif \ No newline at end of file diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 1d3bd92b..f61a1e91 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -26,6 +26,7 @@ SOFTWARE. 
/* Project's headers */ #include "setup/fpga_setup.hpp" +#include "communication_types.hpp" #include "cxxopts.hpp" #include "parameters.h" @@ -115,6 +116,12 @@ class BaseSettings { */ bool testOnly; + /** + * @brief Type of inter-FPGA communication used + * + */ + CommunicationType communicationType; + /** * @brief Construct a new Base Settings object * @@ -134,6 +141,11 @@ class BaseSettings { kernelReplications(results.count("r") > 0 ? results["r"].as() : NUM_REPLICATIONS), #else kernelReplications(results.count("r") > 0 ? results["r"].as() : 1), +#endif +#ifdef COMMUNICATION_TYPE_SUPPORT_ENABLED + communicationType(retrieveCommunicationType(results["comm-type"].as(), results["f"].as())), +#else + communicationType(retrieveCommunicationType("UNSUPPORTED", results["f"].as())), #endif testOnly(static_cast(results.count("test"))) {} @@ -153,7 +165,8 @@ class BaseSettings { str_mpi_ranks = std::to_string(mpi_size); } return {{"Repetitions", std::to_string(numRepetitions)}, {"Kernel Replications", std::to_string(kernelReplications)}, - {"Kernel File", kernelFileName}, {"MPI Ranks", str_mpi_ranks}, {"Test Mode", std::to_string(testOnly)}}; + {"Kernel File", kernelFileName}, {"MPI Ranks", str_mpi_ranks}, {"Test Mode", testOnly ? "Yes" : "No"}, + {"Communication Type", commToString(communicationType)}}; } }; @@ -372,6 +385,10 @@ class HpccFpgaBenchmark { #ifdef NUM_REPLICATIONS ("r", "Number of used kernel replications", cxxopts::value()->default_value(std::to_string(NUM_REPLICATIONS))) +#endif +#ifdef COMMUNICATION_TYPE_SUPPORT_ENABLED + ("comm-type", "Used communication type for inter-FPGA communication", + cxxopts::value()->default_value(DEFAULT_COMM_TYPE)) #endif ("test", "Only test given configuration and skip execution and validation") ("h,help", "Print this help"); @@ -391,10 +408,7 @@ class HpccFpgaBenchmark { // Check parsed options and handle special cases if (result.count("f") <= 0) { - // Path to the kernel file is mandatory - exit if not given! 
- std::cerr << "Kernel file must be given! Aborting" << std::endl; - std::cout << options.help() << std::endl; - throw fpga_setup::FpgaSetupException("Mandatory option is missing"); + throw fpga_setup::FpgaSetupException("Mandatory option is missing! Use -h to show all available options. ERROR: Kernel file must be given with option -f!"); } // Create program settings from program arguments @@ -403,9 +417,7 @@ class HpccFpgaBenchmark { return sharedSettings; } catch (const cxxopts::OptionException& e) { - std::cerr << "Error while parsing input parameters: "<< e.what() << std::endl; - std::cout << options.help() << std::endl; - throw fpga_setup::FpgaSetupException("Input parameters could not be parsed"); + throw fpga_setup::FpgaSetupException("Input parameters could not be parsed! Use -h to show all available options. ERROR: " + std::string(e.what())); } } @@ -462,16 +474,21 @@ class HpccFpgaBenchmark { std::unique_ptr programSettings = parseProgramParameters(tmp_argc, tmp_argv); - auto usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, - programSettings->defaultDevice); + std::unique_ptr context; + std::unique_ptr program; + std::unique_ptr usedDevice; - auto context = std::unique_ptr(new cl::Context(*usedDevice)); - auto program = fpga_setup::fpgaSetup(context.get(), {*usedDevice}, - &programSettings->kernelFileName); + if (!programSettings->testOnly) { + usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, + programSettings->defaultDevice); - executionSettings = std::unique_ptr>(new ExecutionSettings(std::move(programSettings), std::move(usedDevice), - std::move(context), std::move(program))); + context = std::unique_ptr(new cl::Context(*usedDevice)); + program = fpga_setup::fpgaSetup(context.get(), {*usedDevice}, + &programSettings->kernelFileName); + } + executionSettings = std::unique_ptr>(new ExecutionSettings(std::move(programSettings), std::move(usedDevice), + std::move(context), std::move(program))); if 
(mpi_comm_rank == 0) { if (!checkInputParameters()) { std::cerr << "ERROR: Input parameter check failed!" << std::endl; @@ -508,9 +525,16 @@ class HpccFpgaBenchmark { executeBenchmark() { if (!benchmark_setup_succeeded) { - std::cerr << "Benchmark execution started without running the benchmark setup!" << std::endl; + std::cerr << "Benchmark execution started without successfully running the benchmark setup!" << std::endl; return false; } + if (executionSettings->programSettings->testOnly) { + if (mpi_comm_rank == 0) { + std::cout << "TEST MODE ENABLED: SKIP DATA GENERATION, EXECUTION, AND VALIDATION!" << std::endl; + std::cout << "SUCCESSFULLY parsed input parameters!" << std::endl; + } + return benchmark_setup_succeeded; + } if (mpi_comm_rank == 0) { std::cout << HLINE << "Start benchmark using the given configuration. Generating data..." << std::endl << HLINE; @@ -531,45 +555,39 @@ class HpccFpgaBenchmark { } bool validateSuccess = false; - if (!executionSettings->programSettings->testOnly) { - auto exe_start = std::chrono::high_resolution_clock::now(); - std::unique_ptr output = executeKernel(*data); + auto exe_start = std::chrono::high_resolution_clock::now(); + std::unique_ptr output = executeKernel(*data); #ifdef _USE_MPI_ - MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); #endif - std::chrono::duration exe_time = std::chrono::high_resolution_clock::now() - exe_start; + std::chrono::duration exe_time = std::chrono::high_resolution_clock::now() - exe_start; - if (mpi_comm_rank == 0) { - std::cout << "Execution Time: " << exe_time.count() << " s" << std::endl; - std::cout << HLINE << "Validate output..." 
<< std::endl - << HLINE; - } - - if (!executionSettings->programSettings->skipValidation) { - auto eval_start = std::chrono::high_resolution_clock::now(); - validateSuccess = validateOutputAndPrintError(*data); - std::chrono::duration eval_time = std::chrono::high_resolution_clock::now() - eval_start; + if (mpi_comm_rank == 0) { + std::cout << "Execution Time: " << exe_time.count() << " s" << std::endl; + std::cout << HLINE << "Validate output..." << std::endl + << HLINE; + } - if (mpi_comm_rank == 0) { - std::cout << "Validation Time: " << eval_time.count() << " s" << std::endl; - } - } - collectAndPrintResults(*output); + if (!executionSettings->programSettings->skipValidation) { + auto eval_start = std::chrono::high_resolution_clock::now(); + validateSuccess = validateOutputAndPrintError(*data); + std::chrono::duration eval_time = std::chrono::high_resolution_clock::now() - eval_start; if (mpi_comm_rank == 0) { - if (!validateSuccess) { - std::cerr << "ERROR: VALIDATION OF OUTPUT DATA FAILED!" << std::endl; - } - else { - std::cout << "Validation: SUCCESS!" << std::endl; - } + std::cout << "Validation Time: " << eval_time.count() << " s" << std::endl; } - } - else { - std::cout << "TEST MODE ENABLED: SKIP EXECUTION AND VALIDATION!" << std::endl; + collectAndPrintResults(*output); + + if (mpi_comm_rank == 0) { + if (!validateSuccess) { + std::cerr << "ERROR: VALIDATION OF OUTPUT DATA FAILED!" << std::endl; + } + else { + std::cout << "Validation: SUCCESS!" 
<< std::endl; + } } return validateSuccess; @@ -645,7 +663,12 @@ template std::ostream& operator<<(std::ostream& os, ExecutionSettings const& printedExecutionSettings){ std::string device_name; os << std::left; + if (!printedExecutionSettings.programSettings->testOnly) { printedExecutionSettings.device->getInfo(CL_DEVICE_NAME, &device_name); + } + else { + device_name = "TEST RUN: Not selected!"; + } for (auto k : printedExecutionSettings.programSettings->getSettingsMap()) { os << std::setw(2 * ENTRY_SPACE) << k.first << k.second << std::endl; } diff --git a/shared/setup/fpga_setup.cpp b/shared/setup/fpga_setup.cpp index 33a4b5c8..dd1ddd28 100644 --- a/shared/setup/fpga_setup.cpp +++ b/shared/setup/fpga_setup.cpp @@ -166,6 +166,10 @@ Sets up the given FPGA with the kernel in the provided file. // Create the Program from the AOCX file. cl::Program program(*context, deviceList, mybinaries, NULL, &err); ASSERT_CL(err) + + // Build the program (required for fast emulation on Intel) + ASSERT_CL(program.build()); + if (world_rank == 0) { std::cout << "Prepared FPGA successfully for global Execution!" << std::endl; @@ -298,6 +302,10 @@ choose a device. 
} else { chosenDeviceId = static_cast(world_rank % deviceList.size()); } + } else if (deviceList.size() == 1) { + chosenDeviceId = 0; + } else { + throw std::runtime_error("No devices found for selected Platform!"); } if (world_rank == 0) { diff --git a/shared/tests/hpcc_base_benchmark_test.cpp b/shared/tests/hpcc_base_benchmark_test.cpp index bc3c9bdb..a93a2a69 100644 --- a/shared/tests/hpcc_base_benchmark_test.cpp +++ b/shared/tests/hpcc_base_benchmark_test.cpp @@ -66,6 +66,7 @@ class SuccessBenchmark : public hpcc_base::HpccFpgaBenchmark::checkInputParameters(); + } + } + SuccessBenchmark() : HpccFpgaBenchmark(0, { nullptr}) {} }; @@ -106,13 +117,18 @@ class BaseHpccBenchmarkTest :public ::testing::Test { BaseHpccBenchmarkTest() { bm = std::unique_ptr(new SuccessBenchmark()); - bool success = bm->setupBenchmark(global_argc, global_argv); - EXPECT_TRUE(success); + bm->setupBenchmark(global_argc, global_argv); } }; +TEST_F(BaseHpccBenchmarkTest, SetupSucceedsForBenchmarkTest) { + bool success = bm->setupBenchmark(global_argc, global_argv); + EXPECT_TRUE(success); +} + + /** * Checks if the testing flag works as expected */ @@ -122,15 +138,14 @@ TEST_F(BaseHpccBenchmarkTest, AllExecutedWhenNotTestOnly) { EXPECT_EQ(bm->validateOutputcalled, 1); EXPECT_EQ(bm->executeKernelcalled, 1); EXPECT_EQ(bm->generateInputDatacalled, 1); - } -TEST_F(BaseHpccBenchmarkTest, GenerateInputExecutedWhenTestOnly) { +TEST_F(BaseHpccBenchmarkTest, NothingExecutedWhenTestOnly) { bm->getExecutionSettings().programSettings->testOnly = true; bm->executeBenchmark(); EXPECT_EQ(bm->validateOutputcalled, 0); EXPECT_EQ(bm->executeKernelcalled, 0); - EXPECT_EQ(bm->generateInputDatacalled, 1); + EXPECT_EQ(bm->generateInputDatacalled, 0); } TEST_F(BaseHpccBenchmarkTest, ExecutionSuccessWhenNotTestOnly) { @@ -139,11 +154,17 @@ TEST_F(BaseHpccBenchmarkTest, ExecutionSuccessWhenNotTestOnly) { } -TEST_F(BaseHpccBenchmarkTest, ExecutionFailsWhenTestOnly) { +TEST_F(BaseHpccBenchmarkTest, 
ExecutionFailsWhenTestOnlyAndSetupFails) { bm->getExecutionSettings().programSettings->testOnly = true; + bm->forceSetupFail = true; + bm->setupBenchmark(global_argc, global_argv); EXPECT_FALSE(bm->executeBenchmark()); } +TEST_F(BaseHpccBenchmarkTest, ExecutionSuccessWhenTestOnlyAndSetupSuccess) { + bm->getExecutionSettings().programSettings->testOnly = true; + EXPECT_TRUE(bm->executeBenchmark()); +} /** * Checks if using default platform and device is successful