From cc013890ced865f5c0e7ef6d234173422d0862eb Mon Sep 17 00:00:00 2001
From: Marius Meyer <mariusme@mail.upb.de>
Date: Mon, 12 Jul 2021 14:29:14 +0200
Subject: [PATCH 01/10] Add support for DP

---
 LINPACK/CMakeLists.txt                        |  1 -
 .../Nallatech_520N_B8_SB2_R2_DP_noring.cmake  | 22 ++++++++++++++++++
 LINPACK/src/common/parameters.h.in            |  6 +++++
 LINPACK/src/host/linpack_benchmark.cpp        | 23 ++++++++++---------
 .../test_host_reference_implementations.cpp   |  2 ++
 LINPACK/tests/test_kernel_communication.cpp   |  4 ++++
 shared/include/hpcc_benchmark.hpp             | 12 ++++------
 7 files changed, 51 insertions(+), 19 deletions(-)
 create mode 100644 LINPACK/configs/Nallatech_520N_B8_SB2_R2_DP_noring.cmake

diff --git a/LINPACK/CMakeLists.txt b/LINPACK/CMakeLists.txt
index 72ae009c..3618a556 100755
--- a/LINPACK/CMakeLists.txt
+++ b/LINPACK/CMakeLists.txt
@@ -21,7 +21,6 @@ if (TEST_EMULATION)
     set(TEST_HOST_FLAGS "--emulation")
 endif()
 
-set(DATA_TYPE float)
 if (DATA_TYPE STREQUAL "double")
     set(_DP Yes)
     message(STATUS "Set DP flag since data type seems to be double precision")
diff --git a/LINPACK/configs/Nallatech_520N_B8_SB2_R2_DP_noring.cmake b/LINPACK/configs/Nallatech_520N_B8_SB2_R2_DP_noring.cmake
new file mode 100644
index 00000000..e7c2261e
--- /dev/null
+++ b/LINPACK/configs/Nallatech_520N_B8_SB2_R2_DP_noring.cmake
@@ -0,0 +1,22 @@
+# This file contains the default configuration for the Nallatech 520N board
+# for the use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+#     cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(FPGA_BOARD_NAME "p520_max_sg280l" CACHE STRING "" FORCE)
+set(AOC_FLAGS "-fpc -fp-relaxed -seed=7" CACHE STRING "" FORCE)
+
+set(DATA_TYPE "double" CACHE STRING "The ued data type for calculation" FORCE)
+
+# LINPACK specific options
+set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE)
+set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE)
+set(REGISTER_BLOCK_LOG 2 CACHE STRING "Size of the block that will be manipulated in registers" FORCE)
+set(NUM_REPLICATIONS 2 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE)
+
diff --git a/LINPACK/src/common/parameters.h.in b/LINPACK/src/common/parameters.h.in
index 86f03eb0..b7ee43ef 100644
--- a/LINPACK/src/common/parameters.h.in
+++ b/LINPACK/src/common/parameters.h.in
@@ -12,6 +12,12 @@
 #define DEFAULT_MATRIX_SIZE @DEFAULT_MATRIX_SIZE@
 #cmakedefine _DP @_DP@
 
+#ifdef _DP
+#define MPI_DATA_TYPE MPI_DOUBLE
+#else
+#define MPI_DATA_TYPE MPI_FLOAT
+#endif
+
 /**
  * Device specific parameters
  */
diff --git a/LINPACK/src/host/linpack_benchmark.cpp b/LINPACK/src/host/linpack_benchmark.cpp
index 1ffc6fcd..8d5aaad2 100644
--- a/LINPACK/src/host/linpack_benchmark.cpp
+++ b/LINPACK/src/host/linpack_benchmark.cpp
@@ -54,6 +54,7 @@ linpack::LinpackProgramSettings::getSettingsMap() {
         map["Matrix Size"] = std::to_string(matrixSize);
         map["Block Size"] = std::to_string(blockSize);
         map["Emulate"] = (isEmulationKernel) ? "Yes" : "No";
+        map["Data Type"] = STR(HOST_DATA_TYPE);
         return map;
 }
 
@@ -226,7 +227,7 @@ linpack::LinpackBenchmark::generateInputData() {
                 local_row_sum += d->A[executionSettings->programSettings->matrixSize*j + i];
             } 
             HOST_DATA_TYPE row_sum = 0.0;
-            MPI_Reduce(&local_row_sum, &row_sum, 1, MPI_FLOAT, MPI_SUM, executionSettings->programSettings->torus_row, row_communicator);
+            MPI_Reduce(&local_row_sum, &row_sum, 1, MPI_DATA_TYPE, MPI_SUM, executionSettings->programSettings->torus_row, row_communicator);
             // insert row sum into matrix if it contains the diagonal block
             if (executionSettings->programSettings->torus_row == executionSettings->programSettings->torus_col) {
                 // update norm of local matrix
@@ -254,7 +255,7 @@ linpack::LinpackBenchmark::generateInputData() {
             local_col_sum += d->A[executionSettings->programSettings->matrixSize*i+j];
         }
         HOST_DATA_TYPE row_sum = 0.0;
-        MPI_Allreduce(&local_col_sum, &(d->b[j]), 1, MPI_FLOAT, MPI_SUM, col_communicator);   
+        MPI_Allreduce(&local_col_sum, &(d->b[j]), 1, MPI_DATA_TYPE, MPI_SUM, col_communicator);   
         d->normb = (d->b[j] > d->normb) ? d->b[j] : d->normb;   
     }
     return d;
@@ -270,12 +271,12 @@ linpack::LinpackBenchmark::validateOutputAndPrintError(linpack::LinpackData &dat
     if (mpi_comm_rank > 0) {
         for (int j = 0; j < executionSettings->programSettings->matrixSize; j++) {
             for (int i = 0; i < executionSettings->programSettings->matrixSize; i+= executionSettings->programSettings->blockSize) {
-                MPI_Send(&data.A[executionSettings->programSettings->matrixSize * j + i], executionSettings->programSettings->blockSize, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);
+                MPI_Send(&data.A[executionSettings->programSettings->matrixSize * j + i], executionSettings->programSettings->blockSize, MPI_DATA_TYPE, 0, 0, MPI_COMM_WORLD);
             }
         }
         if (executionSettings->programSettings->torus_row == 0) {
             for (int i = 0; i < executionSettings->programSettings->matrixSize; i+= executionSettings->programSettings->blockSize) {
-                MPI_Send(&data.b[i], executionSettings->programSettings->blockSize, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);
+                MPI_Send(&data.b[i], executionSettings->programSettings->blockSize, MPI_DATA_TYPE, 0, 0, MPI_COMM_WORLD);
             }
         }
         residn = 0;
@@ -292,7 +293,7 @@ linpack::LinpackBenchmark::validateOutputAndPrintError(linpack::LinpackData &dat
                 int recvrow= (j / executionSettings->programSettings->blockSize) % executionSettings->programSettings->torus_width;
                 int recvrank = executionSettings->programSettings->torus_width * recvrow + recvcol;
                 if (recvrank > 0) {
-                    MPI_Recv(&total_a[j * executionSettings->programSettings->matrixSize * executionSettings->programSettings->torus_width + i],executionSettings->programSettings->blockSize, MPI_FLOAT, recvrank, 0, MPI_COMM_WORLD,  &status);
+                    MPI_Recv(&total_a[j * executionSettings->programSettings->matrixSize * executionSettings->programSettings->torus_width + i],executionSettings->programSettings->blockSize, MPI_DATA_TYPE, recvrank, 0, MPI_COMM_WORLD,  &status);
                 }
                 else {
                     for (int k=0; k < executionSettings->programSettings->blockSize; k++) {
@@ -306,7 +307,7 @@ linpack::LinpackBenchmark::validateOutputAndPrintError(linpack::LinpackData &dat
         for (int i = 0; i < executionSettings->programSettings->matrixSize* executionSettings->programSettings->torus_width; i+= executionSettings->programSettings->blockSize) {
             int recvcol= (i / executionSettings->programSettings->blockSize) % executionSettings->programSettings->torus_width;
             if (recvcol > 0) {
-                MPI_Recv(&total_b[i], executionSettings->programSettings->blockSize, MPI_FLOAT, recvcol, 0, MPI_COMM_WORLD, &status);
+                MPI_Recv(&total_b[i], executionSettings->programSettings->blockSize, MPI_DATA_TYPE, recvcol, 0, MPI_COMM_WORLD, &status);
             }
             else {
                 for (int k=0; k < executionSettings->programSettings->blockSize; k++) {
@@ -442,14 +443,14 @@ linpack::LinpackBenchmark::distributed_gesl_nopvt_ref(linpack::LinpackData& data
         if ((k / block_size) % executionSettings->programSettings->torus_width == executionSettings->programSettings->torus_row) {
             HOST_DATA_TYPE current_k;
             current_k = (local_k_index_col < matrix_size) ? b_tmp[local_k_index_col] : 0.0;
-            MPI_Bcast(&current_k, 1, MPI_FLOAT,  current_bcast, row_communicator);
+            MPI_Bcast(&current_k, 1, MPI_DATA_TYPE,  current_bcast, row_communicator);
             // For each row below add
             for (int i = start_offset; i < matrix_size; i++) {
                 // add solved upper row to current row
                 tmp_scaled_b[i] = current_k * data.A[matrix_size * local_k_index_row + i];
             }
         }
-        MPI_Bcast(&tmp_scaled_b.data()[start_offset], matrix_size - start_offset, MPI_FLOAT, current_bcast, col_communicator);
+        MPI_Bcast(&tmp_scaled_b.data()[start_offset], matrix_size - start_offset, MPI_DATA_TYPE, current_bcast, col_communicator);
         for (int i = start_offset; i < matrix_size; i++) {
             // add solved upper row to current row
             b_tmp[i] += tmp_scaled_b[i];
@@ -475,11 +476,11 @@ linpack::LinpackBenchmark::distributed_gesl_nopvt_ref(linpack::LinpackData& data
         }
 
         HOST_DATA_TYPE scale_element = (local_k_index_col < matrix_size && local_k_index_row < matrix_size) ? b_tmp[local_k_index_col] * data.A[matrix_size * local_k_index_row + local_k_index_col] : 0.0;
-        MPI_Bcast(&scale_element, 1, MPI_FLOAT, executionSettings->programSettings->torus_col, col_communicator);
+        MPI_Bcast(&scale_element, 1, MPI_DATA_TYPE, executionSettings->programSettings->torus_col, col_communicator);
         if ((k / block_size) % executionSettings->programSettings->torus_width == executionSettings->programSettings->torus_col) {
             b_tmp[local_k_index_col] = -scale_element;
         }
-        MPI_Bcast(&scale_element, 1, MPI_FLOAT, executionSettings->programSettings->torus_row, row_communicator);
+        MPI_Bcast(&scale_element, 1, MPI_DATA_TYPE, executionSettings->programSettings->torus_row, row_communicator);
         size_t end_offset = local_k_index_col;
 
         std::vector<HOST_DATA_TYPE> tmp_scaled_b(matrix_size, 0.0);
@@ -490,7 +491,7 @@ linpack::LinpackBenchmark::distributed_gesl_nopvt_ref(linpack::LinpackData& data
             }
         }
         int current_bcast = (k / block_size) % executionSettings->programSettings->torus_width;
-        MPI_Bcast(tmp_scaled_b.data(), end_offset, MPI_FLOAT, current_bcast, col_communicator);
+        MPI_Bcast(tmp_scaled_b.data(), end_offset, MPI_DATA_TYPE, current_bcast, col_communicator);
         for (int i = 0; i < end_offset; i++) {
             // add solved upper row to current row
             b_tmp[i] += tmp_scaled_b[i];
diff --git a/LINPACK/tests/test_host_reference_implementations.cpp b/LINPACK/tests/test_host_reference_implementations.cpp
index 59dd0f47..b1c7c8fc 100644
--- a/LINPACK/tests/test_host_reference_implementations.cpp
+++ b/LINPACK/tests/test_host_reference_implementations.cpp
@@ -52,6 +52,7 @@ TEST_F(LinpackHostTest, GenerateDiagonallyDominantMatrixWorksCorrectly) {
     }
 }
 
+#ifndef _DP
 TEST_F(LinpackHostTest, ReferenceSolveGMRES) {
     data = bm->generateInputData();
     auto A = std::unique_ptr<double[]>(new double[array_size * array_size]);
@@ -75,6 +76,7 @@ TEST_F(LinpackHostTest, ReferenceSolveGMRES) {
     }
     EXPECT_TRUE(bm->validateOutputAndPrintError(*data));
 }
+#endif
 
 TEST_F(LinpackHostTest, ReferenceSolveWithPivoting) {
     bm->getExecutionSettings().programSettings->isDiagonallyDominant = false;
diff --git a/LINPACK/tests/test_kernel_communication.cpp b/LINPACK/tests/test_kernel_communication.cpp
index db40ea97..99171f35 100644
--- a/LINPACK/tests/test_kernel_communication.cpp
+++ b/LINPACK/tests/test_kernel_communication.cpp
@@ -1192,6 +1192,9 @@ class LinpackKernelCommunicationTestAll : public LinpackKernelCommunicationTest
         top_queue.finish();
         compute_queue.finish();
         inner_queue.finish();
+        network_queue_l.finish();
+        network_queue_t.finish();
+        network_queue_br.finish();
         inner_queue.enqueueNDRangeKernel(innerkernel, cl::NullRange, cl::NDRange(1),cl::NullRange);
         inner_queue.finish();
         network_queue_br.enqueueNDRangeKernel(network_br2, cl::NullRange, cl::NDRange(1),cl::NullRange);
@@ -1217,6 +1220,7 @@ TEST_F(LinpackKernelCommunicationTestAll, AllBlockExternalResultisCorrect) {
             max_error = std::max(max_error, static_cast<double>(std::abs(ref_data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j] - data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j])));
         }
     }
+
     // tolerated delta between expected and real result is machine epsilon times matrix width
     double delta = std::numeric_limits<HOST_DATA_TYPE>::epsilon();
     EXPECT_NEAR(max_error, 0.0, delta);
diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp
index f61a1e91..17e17bb9 100644
--- a/shared/include/hpcc_benchmark.hpp
+++ b/shared/include/hpcc_benchmark.hpp
@@ -24,23 +24,21 @@ SOFTWARE.
 
 #include <memory>
 
-/* Project's headers */
-#include "setup/fpga_setup.hpp"
-#include "communication_types.hpp"
-#include "cxxopts.hpp"
-#include "parameters.h"
-
 /* External library headers */
 #ifdef USE_DEPRECATED_HPP_HEADER
 #include "CL/cl.hpp"
 #else
 #include OPENCL_HPP_HEADER
 #endif
-
 #ifdef _USE_MPI_
 #include "mpi.h"
 #endif
 
+/* Project's headers */
+#include "setup/fpga_setup.hpp"
+#include "cxxopts.hpp"
+#include "parameters.h"
+#include "communication_types.hpp"
 
 #define STR_EXPAND(tok) #tok
 #define STR(tok) STR_EXPAND(tok)

From 0392a542b5185898c535fda7164182790c6b17e4 Mon Sep 17 00:00:00 2001
From: Marius Meyer <mariusme@mail.upb.de>
Date: Mon, 12 Jul 2021 14:31:53 +0200
Subject: [PATCH 02/10] Update benchmark version

---
 LINPACK/CHANGELOG      | 5 +++++
 LINPACK/CMakeLists.txt | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/LINPACK/CHANGELOG b/LINPACK/CHANGELOG
index 3e86dedc..70f0b7b4 100644
--- a/LINPACK/CHANGELOG
+++ b/LINPACK/CHANGELOG
@@ -2,6 +2,11 @@
 
 This file contains all changes made to the source code for each release.
 
+
+## 2.4
+#### Added:
+- Support for double-precision floating-point
+
 ## 2.3
 #### Changed:
 - Refactored the code to support different execution kernels and data distributions
diff --git a/LINPACK/CMakeLists.txt b/LINPACK/CMakeLists.txt
index 3618a556..bbc58b49 100755
--- a/LINPACK/CMakeLists.txt
+++ b/LINPACK/CMakeLists.txt
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.1)
-project(LINPACK VERSION 2.3)
+project(LINPACK VERSION 2.4)
 
 set(USE_DEPRECATED_HPP_HEADER No)
 

From 9ce3cf4181e721c1ff801406749a60565a614f22 Mon Sep 17 00:00:00 2001
From: Marius Meyer <mariusme@mail.upb.de>
Date: Mon, 12 Jul 2021 14:34:11 +0200
Subject: [PATCH 03/10] Add HPL DP to gitlab CI

---
 .gitlab-ci.yml | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 8d0bf4ae..b6887fe4 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -152,6 +152,27 @@ build:LINPACK:
       - cmake/**/*
       - .gitlab-ci.yml
 
+
+build:LINPACK_DP:
+  stage: build
+  script:
+    - rm -rf build
+    - mkdir -p build
+    - cd build
+    - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DDATA_TYPE=double
+    - make -j 40 all
+  artifacts:
+    paths:
+      - build/bin/hpl_torus_emulate.aocx
+      - build/bin/Linpack_intel
+      - build/bin/Linpack_test_intel
+  only:
+    changes:
+      - LINPACK/**/*
+      - shared/**/*
+      - scripts/**/*
+      - cmake/**/*
+
 build:GEMM:
   stage: build
   script:
@@ -420,6 +441,26 @@ test:LINPACK:
       - .gitlab-ci.yml
   needs: ["build:LINPACK"]
 
+test:LINPACK_DP:
+  stage: test
+  script:
+    - cd build
+    - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0  -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DDATA_TYPE=double
+    - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test
+  dependencies:
+    - build:LINPACK_DP
+  artifacts:
+    when: on_failure
+    paths:
+      - build/Testing/Temporary/LastTest.log
+  only:
+    changes:
+      - LINPACK/**/*
+      - shared/**/*
+      - scripts/**/*
+      - cmake/**/*
+  needs: ["build:LINPACK_DP"]
+
 test:GEMM:
   stage: test
   script:

From b6f2cf2f090a9adec9d1adea6804272f2319b7c0 Mon Sep 17 00:00:00 2001
From: Marius Meyer <mariusme@mail.upb.de>
Date: Wed, 21 Jul 2021 09:05:30 +0200
Subject: [PATCH 04/10] Adjust config for DP

---
 ...DP_noring.cmake => Nallatech_520N_B8_SB2_R5_DP_noring.cmake} | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename LINPACK/configs/{Nallatech_520N_B8_SB2_R2_DP_noring.cmake => Nallatech_520N_B8_SB2_R5_DP_noring.cmake} (93%)

diff --git a/LINPACK/configs/Nallatech_520N_B8_SB2_R2_DP_noring.cmake b/LINPACK/configs/Nallatech_520N_B8_SB2_R5_DP_noring.cmake
similarity index 93%
rename from LINPACK/configs/Nallatech_520N_B8_SB2_R2_DP_noring.cmake
rename to LINPACK/configs/Nallatech_520N_B8_SB2_R5_DP_noring.cmake
index e7c2261e..1f60c377 100644
--- a/LINPACK/configs/Nallatech_520N_B8_SB2_R2_DP_noring.cmake
+++ b/LINPACK/configs/Nallatech_520N_B8_SB2_R5_DP_noring.cmake
@@ -18,5 +18,5 @@ set(DATA_TYPE "double" CACHE STRING "The ued data type for calculation" FORCE)
 set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE)
 set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE)
 set(REGISTER_BLOCK_LOG 2 CACHE STRING "Size of the block that will be manipulated in registers" FORCE)
-set(NUM_REPLICATIONS 2 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE)
+set(NUM_REPLICATIONS 5 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE)
 

From bf264ba0f821813095ee62d85b1d7cc1c5a099da Mon Sep 17 00:00:00 2001
From: Marius Meyer <mariusme@mail.upb.de>
Date: Wed, 21 Jul 2021 10:17:18 +0200
Subject: [PATCH 05/10] Move _DP flag to general configuration script

---
 LINPACK/CMakeLists.txt                    | 7 -------
 cmake/general_benchmark_build_setup.cmake | 6 ++++++
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/LINPACK/CMakeLists.txt b/LINPACK/CMakeLists.txt
index bbc58b49..e12347ab 100755
--- a/LINPACK/CMakeLists.txt
+++ b/LINPACK/CMakeLists.txt
@@ -21,13 +21,6 @@ if (TEST_EMULATION)
     set(TEST_HOST_FLAGS "--emulation")
 endif()
 
-if (DATA_TYPE STREQUAL "double")
-    set(_DP Yes)
-    message(STATUS "Set DP flag since data type seems to be double precision")
-else()
-    set(_DP No)
-endif()
-
 set(USE_OPENMP Yes)
 set(USE_MPI Yes)
 
diff --git a/cmake/general_benchmark_build_setup.cmake b/cmake/general_benchmark_build_setup.cmake
index 28f696f3..15dfd9b1 100644
--- a/cmake/general_benchmark_build_setup.cmake
+++ b/cmake/general_benchmark_build_setup.cmake
@@ -61,6 +61,12 @@ if (NOT HOST_DATA_TYPE OR NOT DEVICE_DATA_TYPE)
     set(DEVICE_DATA_TYPE ${DATA_TYPE})
 endif()
 
+if (DATA_TYPE STREQUAL "double")
+    set(_DP Yes CACHE BOOL "Use double-precision specific code for host and device.")
+    message(STATUS "Set DP flag since data type seems to be double precision")
+    mark_as_advanced(_DP)
+endif()
+
 # check configuration sanity
 if (USE_SVM AND USE_HBM)
     message(ERROR "Misconfiguration: Can not use USE_HBM and USE_SVM at the same time because they target different memory architectures")

From 1fae0a22f48e1eec0f9fd0382e7bb5f97abd3b74 Mon Sep 17 00:00:00 2001
From: Marius Meyer <mariusme@mail.upb.de>
Date: Wed, 6 Oct 2021 14:59:34 +0200
Subject: [PATCH 06/10] Update CI for new kernel naming scheme

---
 .gitlab-ci.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b6887fe4..461c1f9e 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -163,7 +163,8 @@ build:LINPACK_DP:
     - make -j 40 all
   artifacts:
     paths:
-      - build/bin/hpl_torus_emulate.aocx
+      - build/bin/hpl_torus_PCIE_emulate.aocx
+      - build/bin/hpl_torus_IEC_emulate.aocx
       - build/bin/Linpack_intel
       - build/bin/Linpack_test_intel
   only:

From 8b72ed32e859c537c7022cfc96df366455f17a0d Mon Sep 17 00:00:00 2001
From: Marius Meyer <mariusme@mail.upb.de>
Date: Thu, 7 Oct 2021 08:30:48 +0200
Subject: [PATCH 07/10] Clean up parameters

---
 LINPACK/src/common/parameters.h.in | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/LINPACK/src/common/parameters.h.in b/LINPACK/src/common/parameters.h.in
index b7ee43ef..f4896432 100644
--- a/LINPACK/src/common/parameters.h.in
+++ b/LINPACK/src/common/parameters.h.in
@@ -10,7 +10,7 @@
 #define DEFAULT_DEVICE @DEFAULT_DEVICE@
 #define HOST_DATA_TYPE @HOST_DATA_TYPE@
 #define DEFAULT_MATRIX_SIZE @DEFAULT_MATRIX_SIZE@
-#cmakedefine _DP @_DP@
+#cmakedefine _DP
 
 #ifdef _DP
 #define MPI_DATA_TYPE MPI_DOUBLE
@@ -22,7 +22,6 @@
  * Device specific parameters
  */
 #define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@
-#define GLOBAL_MEM_UNROLL @GLOBAL_MEM_UNROLL@
 #define LOCAL_MEM_BLOCK_LOG @LOCAL_MEM_BLOCK_LOG@
 #define REGISTER_BLOCK_LOG @REGISTER_BLOCK_LOG@
 #define NUM_REPLICATIONS @NUM_REPLICATIONS@

From e8eb303bcc7bbbe2a17d9d5c0d8a354799a6e0b0 Mon Sep 17 00:00:00 2001
From: Marius Meyer <mariusme@mail.upb.de>
Date: Thu, 7 Oct 2021 08:31:08 +0200
Subject: [PATCH 08/10] Use MPI_DATA_TYPE in PCIe implementation

---
 LINPACK/src/host/execution_types/execution_pcie.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/LINPACK/src/host/execution_types/execution_pcie.hpp b/LINPACK/src/host/execution_types/execution_pcie.hpp
index 6a83dbbe..895dbd86 100644
--- a/LINPACK/src/host/execution_types/execution_pcie.hpp
+++ b/LINPACK/src/host/execution_types/execution_pcie.hpp
@@ -225,9 +225,9 @@ calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings>&co
             lu_queues.back().finish();
 
             // Broadcast LU block in column to update all left blocks
-            MPI_Bcast(lu_block, config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, col_communicator);
+            MPI_Bcast(lu_block, config.programSettings->blockSize*config.programSettings->blockSize, MPI_DATA_TYPE, local_block_row_remainder, col_communicator);
             // Broadcast LU block in row to update all top blocks
-            MPI_Bcast(lu_trans_block, config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, row_communicator);
+            MPI_Bcast(lu_trans_block, config.programSettings->blockSize*config.programSettings->blockSize, MPI_DATA_TYPE, local_block_row_remainder, row_communicator);
 
             if (num_top_blocks > 0) {
 
@@ -329,10 +329,10 @@ calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings>&co
 
             // Send the left and top blocks to all other ranks so they can be used to update all inner blocks
             for (int lbi=0; lbi < blocks_per_row - local_block_row; lbi++) {
-                MPI_Bcast(left_blocks[lbi], config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, row_communicator);
+                MPI_Bcast(left_blocks[lbi], config.programSettings->blockSize*config.programSettings->blockSize, MPI_DATA_TYPE, local_block_row_remainder, row_communicator);
             }
             for (int tbi=0; tbi < blocks_per_row  - local_block_row; tbi++) {
-                MPI_Bcast(top_blocks[tbi], config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, col_communicator);
+                MPI_Bcast(top_blocks[tbi], config.programSettings->blockSize*config.programSettings->blockSize, MPI_DATA_TYPE, local_block_row_remainder, col_communicator);
             }
 
             // update all remaining inner blocks using only global memory

From a5e695da355a60d30993eb3c5a95b136a3183379 Mon Sep 17 00:00:00 2001
From: Marius Meyer <mariusme@mail.upb.de>
Date: Thu, 7 Oct 2021 08:31:48 +0200
Subject: [PATCH 09/10] Remove GLOBAL_MEM_UNROLL from kernels

---
 LINPACK/src/device/hpl_torus_IEC.cl  | 12 ++++++------
 LINPACK/src/device/hpl_torus_PCIE.cl | 20 ++++++++++----------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/LINPACK/src/device/hpl_torus_IEC.cl b/LINPACK/src/device/hpl_torus_IEC.cl
index e744804a..37b79aeb 100644
--- a/LINPACK/src/device/hpl_torus_IEC.cl
+++ b/LINPACK/src/device/hpl_torus_IEC.cl
@@ -404,7 +404,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE  + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj];
 				}
@@ -569,7 +569,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a[block_col * BLOCK_SIZE  + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
 				}
@@ -600,7 +600,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE  + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj];
 				}
@@ -709,7 +709,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a[block_col * BLOCK_SIZE  + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
 				}
@@ -739,7 +739,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE  + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj];
 				}
@@ -829,7 +829,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a[block_col * BLOCK_SIZE  + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
 				}
diff --git a/LINPACK/src/device/hpl_torus_PCIE.cl b/LINPACK/src/device/hpl_torus_PCIE.cl
index 0f31f0d4..bea5b705 100644
--- a/LINPACK/src/device/hpl_torus_PCIE.cl
+++ b/LINPACK/src/device/hpl_torus_PCIE.cl
@@ -229,7 +229,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE  + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj];
 				}
@@ -411,7 +411,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a[block_col * BLOCK_SIZE  + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
 				}
@@ -423,7 +423,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a_block_trans[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[j][i][jj][ii];
 				}
@@ -434,7 +434,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a_block[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
 				}
@@ -466,7 +466,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE  + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj];
 				}
@@ -558,7 +558,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a[block_col * BLOCK_SIZE  + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
 				}
@@ -570,7 +570,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					top_block[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
 				}
@@ -601,7 +601,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE  + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj];
 				}
@@ -684,7 +684,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a[block_col * BLOCK_SIZE  + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
 				}
@@ -697,7 +697,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					left_block[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[j][i][jj][ii];
 				}

From 62097c4e7d85ca7595f5dc8b8e0c07aa0c0a2387 Mon Sep 17 00:00:00 2001
From: Marius Meyer <mariusme@mail.upb.de>
Date: Thu, 7 Oct 2021 08:36:51 +0200
Subject: [PATCH 10/10] Add DATA_TYPE to readme

---
 LINPACK/Readme.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/LINPACK/Readme.md b/LINPACK/Readme.md
index e0e276c1..29e33cdb 100644
--- a/LINPACK/Readme.md
+++ b/LINPACK/Readme.md
@@ -51,6 +51,7 @@ Name             | Default     | Description                          |
 `DEFAULT_MATRIX_SIZE`| 1024 | Width and heigth of the input matrix |
 `REGISTER_BLOCK_LOG`| 3        | Size of the blocks that will be processed in registers (2^3=8 is the default) |
 `LOCAL_MEM_BLOCK_LOG`| 5        | Size of the blocks that will be processed in local memory (2^3=8 is the default) |
+`DATA_TYPE`     | float        | Used data type. Can be `float` or `double` |
 
 Moreover the environment variable `INTELFPGAOCLSDKROOT` has to be set to the root
 of the Intel FPGA SDK installation.