Merge branch 'hpl-dp' into 'master'

Fix HPL to support DP FP See merge request pc2/HPCC_FPGA!52
pc2 · Oct 7, 2021 · 48e0386 · 48e0386
2 parents ff640b0 + 62097c4
commit 48e0386
Show file tree

Hide file tree

Showing 14 changed files with 127 additions and 49 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -152,6 +152,28 @@ build:LINPACK:
       - cmake/**/*
       - .gitlab-ci.yml
 
+
+build:LINPACK_DP:
+  # Builds LINPACK configured with DATA_TYPE=double (double precision).
+  # The resulting kernels and host binaries are handed to test:LINPACK_DP
+  # as artifacts.
+  stage: build
+  script:
+    # Start from an empty build directory so no stale CMake cache is reused.
+    - rm -rf build
+    - mkdir -p build
+    - cd build
+    - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DDATA_TYPE=double
+    - make -j 40 all
+  artifacts:
+    paths:
+      - build/bin/hpl_torus_PCIE_emulate.aocx
+      - build/bin/hpl_torus_IEC_emulate.aocx
+      - build/bin/Linpack_intel
+      - build/bin/Linpack_test_intel
+  only:
+    changes:
+      - LINPACK/**/*
+      - shared/**/*
+      - scripts/**/*
+      - cmake/**/*
+      # Also rebuild when the CI definition itself changes, as the
+      # single-precision LINPACK jobs do.
+      - .gitlab-ci.yml
+
 build:GEMM:
   stage: build
   script:
@@ -420,6 +442,26 @@ test:LINPACK:
       - .gitlab-ci.yml
   needs: ["build:LINPACK"]
 
+test:LINPACK_DP:
+  # Runs the LINPACK test target for the double-precision configuration,
+  # using the artifacts produced by build:LINPACK_DP.
+  stage: test
+  script:
+    - cd build
+    # Re-run cmake with the same options that build:LINPACK_DP uses.
+    - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0  -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DDATA_TYPE=double
+    # Both settings are passed on the make command line; presumably they select
+    # the FPGA emulator device and make CTest print output of failing tests.
+    - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test
+  dependencies:
+    - build:LINPACK_DP
+  artifacts:
+    # The CTest log is only kept when the job fails.
+    when: on_failure
+    paths:
+      - build/Testing/Temporary/LastTest.log
+  only:
+    changes:
+      - LINPACK/**/*
+      - shared/**/*
+      - scripts/**/*
+      - cmake/**/*
+      # NOTE(review): test:LINPACK also lists .gitlab-ci.yml here; this job does
+      # not. Confirm whether that is intended, and if it is added, add it to
+      # build:LINPACK_DP as well so that `needs` can still be satisfied.
+  needs: ["build:LINPACK_DP"]
+
 test:GEMM:
   stage: test
   script:

diff --git a/LINPACK/CHANGELOG b/LINPACK/CHANGELOG
@@ -2,6 +2,11 @@
 
 This file contains all changes made to the source code for each release.
 
+
+## 2.4
+#### Added:
+- Support for double-precision floating-point
+
 ## 2.3
 #### Changed:
 - Refactored the code to support different execution kernels and data distributions

diff --git a/LINPACK/CMakeLists.txt b/LINPACK/CMakeLists.txt
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.1)
-project(LINPACK VERSION 2.3)
+project(LINPACK VERSION 2.4)
 
 set(USE_DEPRECATED_HPP_HEADER No)
 
@@ -21,14 +21,6 @@ if (TEST_EMULATION)
     set(TEST_HOST_FLAGS "--emulation")
 endif()
 
-set(DATA_TYPE float)
-if (DATA_TYPE STREQUAL "double")
-    set(_DP Yes)
-    message(STATUS "Set DP flag since data type seems to be double precision")
-else()
-    set(_DP No)
-endif()
-
 set(USE_OPENMP Yes)
 set(USE_MPI Yes)
 

diff --git a/LINPACK/Readme.md b/LINPACK/Readme.md
@@ -51,6 +51,7 @@ Name             | Default     | Description                          |
 `DEFAULT_MATRIX_SIZE`| 1024 | Width and height of the input matrix |
 `REGISTER_BLOCK_LOG`| 3        | Size of the blocks that will be processed in registers (2^3=8 is the default) |
 `LOCAL_MEM_BLOCK_LOG`| 5        | Size of the blocks that will be processed in local memory (2^5=32 is the default) |
+`DATA_TYPE`     | float        | Used data type. Can be `float` or `double` |
 
 Moreover the environment variable `INTELFPGAOCLSDKROOT` has to be set to the root
 of the Intel FPGA SDK installation.

diff --git a/LINPACK/configs/Nallatech_520N_B8_SB2_R5_DP_noring.cmake b/LINPACK/configs/Nallatech_520N_B8_SB2_R5_DP_noring.cmake
@@ -0,0 +1,22 @@
+# This file contains the default configuration for the Nallatech 520N board
+# for the use with double precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+#     cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+# Every value below is written to the CMake cache with FORCE, so it replaces
+# an entry of the same name that already exists in the cache.
+
+
+# Host and board setup
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(FPGA_BOARD_NAME "p520_max_sg280l" CACHE STRING "" FORCE)
+set(AOC_FLAGS "-fpc -fp-relaxed -seed=7" CACHE STRING "" FORCE)
+
+# Select double precision for the calculation
+set(DATA_TYPE "double" CACHE STRING "The used data type for calculation" FORCE)
+
+# LINPACK specific options
+set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE)
+set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE)
+set(REGISTER_BLOCK_LOG 2 CACHE STRING "Size of the block that will be manipulated in registers" FORCE)
+set(NUM_REPLICATIONS 5 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE)
+
diff --git a/LINPACK/src/common/parameters.h.in b/LINPACK/src/common/parameters.h.in
@@ -10,13 +10,18 @@
 #define DEFAULT_DEVICE @DEFAULT_DEVICE@
 #define HOST_DATA_TYPE @HOST_DATA_TYPE@
 #define DEFAULT_MATRIX_SIZE @DEFAULT_MATRIX_SIZE@
-#cmakedefine _DP @_DP@
+#cmakedefine _DP
+
+#ifdef _DP
+#define MPI_DATA_TYPE MPI_DOUBLE
+#else
+#define MPI_DATA_TYPE MPI_FLOAT
+#endif
 
 /**
  * Device specific parameters
  */
 #define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@
-#define GLOBAL_MEM_UNROLL @GLOBAL_MEM_UNROLL@
 #define LOCAL_MEM_BLOCK_LOG @LOCAL_MEM_BLOCK_LOG@
 #define REGISTER_BLOCK_LOG @REGISTER_BLOCK_LOG@
 #define NUM_REPLICATIONS @NUM_REPLICATIONS@

diff --git a/LINPACK/src/device/hpl_torus_IEC.cl b/LINPACK/src/device/hpl_torus_IEC.cl
@@ -404,7 +404,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE  + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj];
 				}
@@ -569,7 +569,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a[block_col * BLOCK_SIZE  + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
 				}
@@ -600,7 +600,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE  + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj];
 				}
@@ -709,7 +709,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a[block_col * BLOCK_SIZE  + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
 				}
@@ -739,7 +739,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE  + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj];
 				}
@@ -829,7 +829,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a[block_col * BLOCK_SIZE  + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
 				}

diff --git a/LINPACK/src/device/hpl_torus_PCIE.cl b/LINPACK/src/device/hpl_torus_PCIE.cl
@@ -229,7 +229,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE  + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj];
 				}
@@ -411,7 +411,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a[block_col * BLOCK_SIZE  + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
 				}
@@ -423,7 +423,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a_block_trans[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[j][i][jj][ii];
 				}
@@ -434,7 +434,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a_block[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
 				}
@@ -466,7 +466,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE  + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj];
 				}
@@ -558,7 +558,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a[block_col * BLOCK_SIZE  + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
 				}
@@ -570,7 +570,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					top_block[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
 				}
@@ -601,7 +601,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE  + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj];
 				}
@@ -684,7 +684,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					a[block_col * BLOCK_SIZE  + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
 				}
@@ -697,7 +697,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a,
 	for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
 		for (int ii =0; ii < GEMM_BLOCK; ii++) {
 			for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
-				__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
+				__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
 				for (int jj =0; jj < GEMM_BLOCK; jj++) {
 					left_block[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[j][i][jj][ii];
 				}

diff --git a/LINPACK/src/host/execution_types/execution_pcie.hpp b/LINPACK/src/host/execution_types/execution_pcie.hpp
@@ -225,9 +225,9 @@ calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings>&co
             lu_queues.back().finish();
 
             // Broadcast LU block in column to update all left blocks
-            MPI_Bcast(lu_block, config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, col_communicator);
+            MPI_Bcast(lu_block, config.programSettings->blockSize*config.programSettings->blockSize, MPI_DATA_TYPE, local_block_row_remainder, col_communicator);
             // Broadcast LU block in row to update all top blocks
-            MPI_Bcast(lu_trans_block, config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, row_communicator);
+            MPI_Bcast(lu_trans_block, config.programSettings->blockSize*config.programSettings->blockSize, MPI_DATA_TYPE, local_block_row_remainder, row_communicator);
 
             if (num_top_blocks > 0) {
 
@@ -329,10 +329,10 @@ calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings>&co
 
             // Send the left and top blocks to all other ranks so they can be used to update all inner blocks
             for (int lbi=0; lbi < blocks_per_row - local_block_row; lbi++) {
-                MPI_Bcast(left_blocks[lbi], config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, row_communicator);
+                MPI_Bcast(left_blocks[lbi], config.programSettings->blockSize*config.programSettings->blockSize, MPI_DATA_TYPE, local_block_row_remainder, row_communicator);
             }
             for (int tbi=0; tbi < blocks_per_row  - local_block_row; tbi++) {
-                MPI_Bcast(top_blocks[tbi], config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, col_communicator);
+                MPI_Bcast(top_blocks[tbi], config.programSettings->blockSize*config.programSettings->blockSize, MPI_DATA_TYPE, local_block_row_remainder, col_communicator);
             }
 
             // update all remaining inner blocks using only global memory