From cc013890ced865f5c0e7ef6d234173422d0862eb Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 12 Jul 2021 14:29:14 +0200 Subject: [PATCH 01/10] Add support for DP --- LINPACK/CMakeLists.txt | 1 - .../Nallatech_520N_B8_SB2_R2_DP_noring.cmake | 22 ++++++++++++++++++ LINPACK/src/common/parameters.h.in | 6 +++++ LINPACK/src/host/linpack_benchmark.cpp | 23 ++++++++++--------- .../test_host_reference_implementations.cpp | 2 ++ LINPACK/tests/test_kernel_communication.cpp | 4 ++++ shared/include/hpcc_benchmark.hpp | 12 ++++------ 7 files changed, 51 insertions(+), 19 deletions(-) create mode 100644 LINPACK/configs/Nallatech_520N_B8_SB2_R2_DP_noring.cmake diff --git a/LINPACK/CMakeLists.txt b/LINPACK/CMakeLists.txt index 72ae009c..3618a556 100755 --- a/LINPACK/CMakeLists.txt +++ b/LINPACK/CMakeLists.txt @@ -21,7 +21,6 @@ if (TEST_EMULATION) set(TEST_HOST_FLAGS "--emulation") endif() -set(DATA_TYPE float) if (DATA_TYPE STREQUAL "double") set(_DP Yes) message(STATUS "Set DP flag since data type seems to be double precision") diff --git a/LINPACK/configs/Nallatech_520N_B8_SB2_R2_DP_noring.cmake b/LINPACK/configs/Nallatech_520N_B8_SB2_R2_DP_noring.cmake new file mode 100644 index 00000000..e7c2261e --- /dev/null +++ b/LINPACK/configs/Nallatech_520N_B8_SB2_R2_DP_noring.cmake @@ -0,0 +1,22 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] -DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "p520_max_sg280l" CACHE STRING "" FORCE) +set(AOC_FLAGS "-fpc -fp-relaxed -seed=7" CACHE STRING "" FORCE) + +set(DATA_TYPE "double" CACHE STRING "The ued data type for calculation" FORCE) + +# LINPACK specific options +set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE) +set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE) +set(REGISTER_BLOCK_LOG 2 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE) + diff --git a/LINPACK/src/common/parameters.h.in b/LINPACK/src/common/parameters.h.in index 86f03eb0..b7ee43ef 100644 --- a/LINPACK/src/common/parameters.h.in +++ b/LINPACK/src/common/parameters.h.in @@ -12,6 +12,12 @@ #define DEFAULT_MATRIX_SIZE @DEFAULT_MATRIX_SIZE@ #cmakedefine _DP @_DP@ +#ifdef _DP +#define MPI_DATA_TYPE MPI_DOUBLE +#else +#define MPI_DATA_TYPE MPI_FLOAT +#endif + /** * Device specific parameters */ diff --git a/LINPACK/src/host/linpack_benchmark.cpp b/LINPACK/src/host/linpack_benchmark.cpp index 1ffc6fcd..8d5aaad2 100644 --- a/LINPACK/src/host/linpack_benchmark.cpp +++ b/LINPACK/src/host/linpack_benchmark.cpp @@ -54,6 +54,7 @@ linpack::LinpackProgramSettings::getSettingsMap() { map["Matrix Size"] = std::to_string(matrixSize); map["Block Size"] = std::to_string(blockSize); map["Emulate"] = (isEmulationKernel) ? "Yes" : "No"; + map["Data Type"] = STR(HOST_DATA_TYPE); return map; } @@ -226,7 +227,7 @@ linpack::LinpackBenchmark::generateInputData() { local_row_sum += d->A[executionSettings->programSettings->matrixSize*j + i]; } HOST_DATA_TYPE row_sum = 0.0; - MPI_Reduce(&local_row_sum, &row_sum, 1, MPI_FLOAT, MPI_SUM, executionSettings->programSettings->torus_row, row_communicator); + MPI_Reduce(&local_row_sum, &row_sum, 1, MPI_DATA_TYPE, MPI_SUM, executionSettings->programSettings->torus_row, row_communicator); // insert row sum into matrix if it contains the diagonal block if (executionSettings->programSettings->torus_row == executionSettings->programSettings->torus_col) { // update norm of local matrix @@ -254,7 +255,7 @@ linpack::LinpackBenchmark::generateInputData() { local_col_sum += d->A[executionSettings->programSettings->matrixSize*i+j]; } HOST_DATA_TYPE row_sum = 0.0; - MPI_Allreduce(&local_col_sum, &(d->b[j]), 1, MPI_FLOAT, MPI_SUM, col_communicator); + MPI_Allreduce(&local_col_sum, &(d->b[j]), 1, MPI_DATA_TYPE, MPI_SUM, col_communicator); d->normb = (d->b[j] > d->normb) ? d->b[j] : d->normb; } return d; @@ -270,12 +271,12 @@ linpack::LinpackBenchmark::validateOutputAndPrintError(linpack::LinpackData &dat if (mpi_comm_rank > 0) { for (int j = 0; j < executionSettings->programSettings->matrixSize; j++) { for (int i = 0; i < executionSettings->programSettings->matrixSize; i+= executionSettings->programSettings->blockSize) { - MPI_Send(&data.A[executionSettings->programSettings->matrixSize * j + i], executionSettings->programSettings->blockSize, MPI_FLOAT, 0, 0, MPI_COMM_WORLD); + MPI_Send(&data.A[executionSettings->programSettings->matrixSize * j + i], executionSettings->programSettings->blockSize, MPI_DATA_TYPE, 0, 0, MPI_COMM_WORLD); } } if (executionSettings->programSettings->torus_row == 0) { for (int i = 0; i < executionSettings->programSettings->matrixSize; i+= executionSettings->programSettings->blockSize) { - MPI_Send(&data.b[i], executionSettings->programSettings->blockSize, MPI_FLOAT, 0, 0, MPI_COMM_WORLD); + MPI_Send(&data.b[i], executionSettings->programSettings->blockSize, MPI_DATA_TYPE, 0, 0, MPI_COMM_WORLD); } } residn = 0; @@ -292,7 +293,7 @@ linpack::LinpackBenchmark::validateOutputAndPrintError(linpack::LinpackData &dat int recvrow= (j / executionSettings->programSettings->blockSize) % executionSettings->programSettings->torus_width; int recvrank = executionSettings->programSettings->torus_width * recvrow + recvcol; if (recvrank > 0) { - MPI_Recv(&total_a[j * executionSettings->programSettings->matrixSize * executionSettings->programSettings->torus_width + i],executionSettings->programSettings->blockSize, MPI_FLOAT, recvrank, 0, MPI_COMM_WORLD, &status); + MPI_Recv(&total_a[j * executionSettings->programSettings->matrixSize * executionSettings->programSettings->torus_width + i],executionSettings->programSettings->blockSize, MPI_DATA_TYPE, recvrank, 0, MPI_COMM_WORLD, &status); } else { for (int k=0; k < executionSettings->programSettings->blockSize; k++) { @@ -306,7 +307,7 @@ linpack::LinpackBenchmark::validateOutputAndPrintError(linpack::LinpackData &dat for (int i = 0; i < executionSettings->programSettings->matrixSize* executionSettings->programSettings->torus_width; i+= executionSettings->programSettings->blockSize) { int recvcol= (i / executionSettings->programSettings->blockSize) % executionSettings->programSettings->torus_width; if (recvcol > 0) { - MPI_Recv(&total_b[i], executionSettings->programSettings->blockSize, MPI_FLOAT, recvcol, 0, MPI_COMM_WORLD, &status); + MPI_Recv(&total_b[i], executionSettings->programSettings->blockSize, MPI_DATA_TYPE, recvcol, 0, MPI_COMM_WORLD, &status); } else { for (int k=0; k < executionSettings->programSettings->blockSize; k++) { @@ -442,14 +443,14 @@ linpack::LinpackBenchmark::distributed_gesl_nopvt_ref(linpack::LinpackData& data if ((k / block_size) % executionSettings->programSettings->torus_width == executionSettings->programSettings->torus_row) { HOST_DATA_TYPE current_k; current_k = (local_k_index_col < matrix_size) ? b_tmp[local_k_index_col] : 0.0; - MPI_Bcast(¤t_k, 1, MPI_FLOAT, current_bcast, row_communicator); + MPI_Bcast(¤t_k, 1, MPI_DATA_TYPE, current_bcast, row_communicator); // For each row below add for (int i = start_offset; i < matrix_size; i++) { // add solved upper row to current row tmp_scaled_b[i] = current_k * data.A[matrix_size * local_k_index_row + i]; } } - MPI_Bcast(&tmp_scaled_b.data()[start_offset], matrix_size - start_offset, MPI_FLOAT, current_bcast, col_communicator); + MPI_Bcast(&tmp_scaled_b.data()[start_offset], matrix_size - start_offset, MPI_DATA_TYPE, current_bcast, col_communicator); for (int i = start_offset; i < matrix_size; i++) { // add solved upper row to current row b_tmp[i] += tmp_scaled_b[i]; @@ -475,11 +476,11 @@ linpack::LinpackBenchmark::distributed_gesl_nopvt_ref(linpack::LinpackData& data } HOST_DATA_TYPE scale_element = (local_k_index_col < matrix_size && local_k_index_row < matrix_size) ? b_tmp[local_k_index_col] * data.A[matrix_size * local_k_index_row + local_k_index_col] : 0.0; - MPI_Bcast(&scale_element, 1, MPI_FLOAT, executionSettings->programSettings->torus_col, col_communicator); + MPI_Bcast(&scale_element, 1, MPI_DATA_TYPE, executionSettings->programSettings->torus_col, col_communicator); if ((k / block_size) % executionSettings->programSettings->torus_width == executionSettings->programSettings->torus_col) { b_tmp[local_k_index_col] = -scale_element; } - MPI_Bcast(&scale_element, 1, MPI_FLOAT, executionSettings->programSettings->torus_row, row_communicator); + MPI_Bcast(&scale_element, 1, MPI_DATA_TYPE, executionSettings->programSettings->torus_row, row_communicator); size_t end_offset = local_k_index_col; std::vector tmp_scaled_b(matrix_size, 0.0); @@ -490,7 +491,7 @@ linpack::LinpackBenchmark::distributed_gesl_nopvt_ref(linpack::LinpackData& data } } int current_bcast = (k / block_size) % executionSettings->programSettings->torus_width; - MPI_Bcast(tmp_scaled_b.data(), end_offset, MPI_FLOAT, current_bcast, col_communicator); + MPI_Bcast(tmp_scaled_b.data(), end_offset, MPI_DATA_TYPE, current_bcast, col_communicator); for (int i = 0; i < end_offset; i++) { // add solved upper row to current row b_tmp[i] += tmp_scaled_b[i]; diff --git a/LINPACK/tests/test_host_reference_implementations.cpp b/LINPACK/tests/test_host_reference_implementations.cpp index 59dd0f47..b1c7c8fc 100644 --- a/LINPACK/tests/test_host_reference_implementations.cpp +++ b/LINPACK/tests/test_host_reference_implementations.cpp @@ -52,6 +52,7 @@ TEST_F(LinpackHostTest, GenerateDiagonallyDominantMatrixWorksCorrectly) { } } +#ifndef _DP TEST_F(LinpackHostTest, ReferenceSolveGMRES) { data = bm->generateInputData(); auto A = std::unique_ptr(new double[array_size * array_size]); @@ -75,6 +76,7 @@ TEST_F(LinpackHostTest, ReferenceSolveGMRES) { } EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); } +#endif TEST_F(LinpackHostTest, ReferenceSolveWithPivoting) { bm->getExecutionSettings().programSettings->isDiagonallyDominant = false; diff --git a/LINPACK/tests/test_kernel_communication.cpp b/LINPACK/tests/test_kernel_communication.cpp index db40ea97..99171f35 100644 --- a/LINPACK/tests/test_kernel_communication.cpp +++ b/LINPACK/tests/test_kernel_communication.cpp @@ -1192,6 +1192,9 @@ class LinpackKernelCommunicationTestAll : public LinpackKernelCommunicationTest top_queue.finish(); compute_queue.finish(); inner_queue.finish(); + network_queue_l.finish(); + network_queue_t.finish(); + network_queue_br.finish(); inner_queue.enqueueNDRangeKernel(innerkernel, cl::NullRange, cl::NDRange(1),cl::NullRange); inner_queue.finish(); network_queue_br.enqueueNDRangeKernel(network_br2, cl::NullRange, cl::NDRange(1),cl::NullRange); @@ -1217,6 +1220,7 @@ TEST_F(LinpackKernelCommunicationTestAll, AllBlockExternalResultisCorrect) { max_error = std::max(max_error, static_cast(std::abs(ref_data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j] - data->A[i * bm->getExecutionSettings().programSettings->matrixSize + j]))); } } + // tolerated delta between expected and real result is machine epsilon times matrix width double delta = std::numeric_limits::epsilon(); EXPECT_NEAR(max_error, 0.0, delta); diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index f61a1e91..17e17bb9 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -24,23 +24,21 @@ SOFTWARE. #include -/* Project's headers */ -#include "setup/fpga_setup.hpp" -#include "communication_types.hpp" -#include "cxxopts.hpp" -#include "parameters.h" - /* External library headers */ #ifdef USE_DEPRECATED_HPP_HEADER #include "CL/cl.hpp" #else #include OPENCL_HPP_HEADER #endif - #ifdef _USE_MPI_ #include "mpi.h" #endif +/* Project's headers */ +#include "setup/fpga_setup.hpp" +#include "cxxopts.hpp" +#include "parameters.h" +#include "communication_types.hpp" #define STR_EXPAND(tok) #tok #define STR(tok) STR_EXPAND(tok) From 0392a542b5185898c535fda7164182790c6b17e4 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 12 Jul 2021 14:31:53 +0200 Subject: [PATCH 02/10] Update benchmark version --- LINPACK/CHANGELOG | 5 +++++ LINPACK/CMakeLists.txt | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/LINPACK/CHANGELOG b/LINPACK/CHANGELOG index 3e86dedc..70f0b7b4 100644 --- a/LINPACK/CHANGELOG +++ b/LINPACK/CHANGELOG @@ -2,6 +2,11 @@ This file contains all changes made to the source code for each release. + +## 2.4 +#### Added: +- Support for double-precision floating-point + ## 2.3 #### Changed: - Refactored the code to support different execution kernels and data distributions diff --git a/LINPACK/CMakeLists.txt b/LINPACK/CMakeLists.txt index 3618a556..bbc58b49 100755 --- a/LINPACK/CMakeLists.txt +++ b/LINPACK/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.1) -project(LINPACK VERSION 2.3) +project(LINPACK VERSION 2.4) set(USE_DEPRECATED_HPP_HEADER No) From 9ce3cf4181e721c1ff801406749a60565a614f22 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 12 Jul 2021 14:34:11 +0200 Subject: [PATCH 03/10] Add HPL DP to gitlab CI --- .gitlab-ci.yml | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8d0bf4ae..b6887fe4 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -152,6 +152,27 @@ build:LINPACK: - cmake/**/* - .gitlab-ci.yml + +build:LINPACK_DP: + stage: build + script: + - rm -rf build + - mkdir -p build + - cd build + - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DDATA_TYPE=double + - make -j 40 all + artifacts: + paths: + - build/bin/hpl_torus_emulate.aocx + - build/bin/Linpack_intel + - build/bin/Linpack_test_intel + only: + changes: + - LINPACK/**/* + - shared/**/* + - scripts/**/* + - cmake/**/* + build:GEMM: stage: build script: @@ -420,6 +441,26 @@ test:LINPACK: - .gitlab-ci.yml needs: ["build:LINPACK"] +test:LINPACK_DP: + stage: test + script: + - cd build + - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DDATA_TYPE=double + - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test + dependencies: + - build:LINPACK_DP + artifacts: + when: on_failure + paths: + - build/Testing/Temporary/LastTest.log + only: + changes: + - LINPACK/**/* + - shared/**/* + - scripts/**/* + - cmake/**/* + needs: ["build:LINPACK_DP"] + test:GEMM: stage: test script: From b6f2cf2f090a9adec9d1adea6804272f2319b7c0 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 21 Jul 2021 09:05:30 +0200 Subject: [PATCH 04/10] Adjust config for DP --- ...DP_noring.cmake => Nallatech_520N_B8_SB2_R5_DP_noring.cmake} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename LINPACK/configs/{Nallatech_520N_B8_SB2_R2_DP_noring.cmake => Nallatech_520N_B8_SB2_R5_DP_noring.cmake} (93%) diff --git a/LINPACK/configs/Nallatech_520N_B8_SB2_R2_DP_noring.cmake b/LINPACK/configs/Nallatech_520N_B8_SB2_R5_DP_noring.cmake similarity index 93% rename from LINPACK/configs/Nallatech_520N_B8_SB2_R2_DP_noring.cmake rename to LINPACK/configs/Nallatech_520N_B8_SB2_R5_DP_noring.cmake index e7c2261e..1f60c377 100644 --- a/LINPACK/configs/Nallatech_520N_B8_SB2_R2_DP_noring.cmake +++ b/LINPACK/configs/Nallatech_520N_B8_SB2_R5_DP_noring.cmake @@ -18,5 +18,5 @@ set(DATA_TYPE "double" CACHE STRING "The ued data type for calculation" FORCE) set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE) set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE) set(REGISTER_BLOCK_LOG 2 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) -set(NUM_REPLICATIONS 2 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE) +set(NUM_REPLICATIONS 5 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE) From bf264ba0f821813095ee62d85b1d7cc1c5a099da Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 21 Jul 2021 10:17:18 +0200 Subject: [PATCH 05/10] Move _DP flag to general configuration script --- LINPACK/CMakeLists.txt | 7 ------- cmake/general_benchmark_build_setup.cmake | 6 ++++++ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/LINPACK/CMakeLists.txt b/LINPACK/CMakeLists.txt index bbc58b49..e12347ab 100755 --- a/LINPACK/CMakeLists.txt +++ b/LINPACK/CMakeLists.txt @@ -21,13 +21,6 @@ if (TEST_EMULATION) set(TEST_HOST_FLAGS "--emulation") endif() -if (DATA_TYPE STREQUAL "double") - set(_DP Yes) - message(STATUS "Set DP flag since data type seems to be double precision") -else() - set(_DP No) -endif() - set(USE_OPENMP Yes) set(USE_MPI Yes) diff --git a/cmake/general_benchmark_build_setup.cmake b/cmake/general_benchmark_build_setup.cmake index 28f696f3..15dfd9b1 100644 --- a/cmake/general_benchmark_build_setup.cmake +++ b/cmake/general_benchmark_build_setup.cmake @@ -61,6 +61,12 @@ if (NOT HOST_DATA_TYPE OR NOT DEVICE_DATA_TYPE) set(DEVICE_DATA_TYPE ${DATA_TYPE}) endif() +if (DATA_TYPE STREQUAL "double") + set(_DP Yes CACHE BOOL "Use double-precision specific code for host and device.") + message(STATUS "Set DP flag since data type seems to be double precision") + mark_as_advanced(_DP) +endif() + # check configuration sanity if (USE_SVM AND USE_HBM) message(ERROR "Misconfiguration: Can not use USE_HBM and USE_SVM at the same time because they target different memory architectures") From 1fae0a22f48e1eec0f9fd0382e7bb5f97abd3b74 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 6 Oct 2021 14:59:34 +0200 Subject: [PATCH 06/10] Update CI for new kernel naming scheme --- .gitlab-ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b6887fe4..461c1f9e 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -163,7 +163,8 @@ build:LINPACK_DP: - make -j 40 all artifacts: paths: - - build/bin/hpl_torus_emulate.aocx + - build/bin/hpl_torus_PCIE_emulate.aocx + - build/bin/hpl_torus_IEC_emulate.aocx - build/bin/Linpack_intel - build/bin/Linpack_test_intel only: From 8b72ed32e859c537c7022cfc96df366455f17a0d Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 7 Oct 2021 08:30:48 +0200 Subject: [PATCH 07/10] Clean up parameters --- LINPACK/src/common/parameters.h.in | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/LINPACK/src/common/parameters.h.in b/LINPACK/src/common/parameters.h.in index b7ee43ef..f4896432 100644 --- a/LINPACK/src/common/parameters.h.in +++ b/LINPACK/src/common/parameters.h.in @@ -10,7 +10,7 @@ #define DEFAULT_DEVICE @DEFAULT_DEVICE@ #define HOST_DATA_TYPE @HOST_DATA_TYPE@ #define DEFAULT_MATRIX_SIZE @DEFAULT_MATRIX_SIZE@ -#cmakedefine _DP @_DP@ +#cmakedefine _DP #ifdef _DP #define MPI_DATA_TYPE MPI_DOUBLE @@ -22,7 +22,6 @@ * Device specific parameters */ #define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@ -#define GLOBAL_MEM_UNROLL @GLOBAL_MEM_UNROLL@ #define LOCAL_MEM_BLOCK_LOG @LOCAL_MEM_BLOCK_LOG@ #define REGISTER_BLOCK_LOG @REGISTER_BLOCK_LOG@ #define NUM_REPLICATIONS @NUM_REPLICATIONS@ From e8eb303bcc7bbbe2a17d9d5c0d8a354799a6e0b0 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 7 Oct 2021 08:31:08 +0200 Subject: [PATCH 08/10] Use MPI_DATA_TYPE in PCIe implementation --- LINPACK/src/host/execution_types/execution_pcie.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/LINPACK/src/host/execution_types/execution_pcie.hpp b/LINPACK/src/host/execution_types/execution_pcie.hpp index 6a83dbbe..895dbd86 100644 --- a/LINPACK/src/host/execution_types/execution_pcie.hpp +++ b/LINPACK/src/host/execution_types/execution_pcie.hpp @@ -225,9 +225,9 @@ calculate(const hpcc_base::ExecutionSettings&co lu_queues.back().finish(); // Broadcast LU block in column to update all left blocks - MPI_Bcast(lu_block, config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, col_communicator); + MPI_Bcast(lu_block, config.programSettings->blockSize*config.programSettings->blockSize, MPI_DATA_TYPE, local_block_row_remainder, col_communicator); // Broadcast LU block in row to update all top blocks - MPI_Bcast(lu_trans_block, config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, row_communicator); + MPI_Bcast(lu_trans_block, config.programSettings->blockSize*config.programSettings->blockSize, MPI_DATA_TYPE, local_block_row_remainder, row_communicator); if (num_top_blocks > 0) { @@ -329,10 +329,10 @@ calculate(const hpcc_base::ExecutionSettings&co // Send the left and top blocks to all other ranks so they can be used to update all inner blocks for (int lbi=0; lbi < blocks_per_row - local_block_row; lbi++) { - MPI_Bcast(left_blocks[lbi], config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, row_communicator); + MPI_Bcast(left_blocks[lbi], config.programSettings->blockSize*config.programSettings->blockSize, MPI_DATA_TYPE, local_block_row_remainder, row_communicator); } for (int tbi=0; tbi < blocks_per_row - local_block_row; tbi++) { - MPI_Bcast(top_blocks[tbi], config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, col_communicator); + MPI_Bcast(top_blocks[tbi], config.programSettings->blockSize*config.programSettings->blockSize, MPI_DATA_TYPE, local_block_row_remainder, col_communicator); } // update all remaining inner blocks using only global memory From a5e695da355a60d30993eb3c5a95b136a3183379 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 7 Oct 2021 08:31:48 +0200 Subject: [PATCH 09/10] Remove GLOBAL_MEM_UNROLL from kernels --- LINPACK/src/device/hpl_torus_IEC.cl | 12 ++++++------ LINPACK/src/device/hpl_torus_PCIE.cl | 20 ++++++++++---------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/LINPACK/src/device/hpl_torus_IEC.cl b/LINPACK/src/device/hpl_torus_IEC.cl index e744804a..37b79aeb 100644 --- a/LINPACK/src/device/hpl_torus_IEC.cl +++ b/LINPACK/src/device/hpl_torus_IEC.cl @@ -404,7 +404,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a, for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { for (int ii =0; ii < GEMM_BLOCK; ii++) { for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { - __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) for (int jj =0; jj < GEMM_BLOCK; jj++) { a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj]; } @@ -569,7 +569,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a, for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { for (int ii =0; ii < GEMM_BLOCK; ii++) { for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { - __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) for (int jj =0; jj < GEMM_BLOCK; jj++) { a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj]; } @@ -600,7 +600,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a, for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { for (int ii =0; ii < GEMM_BLOCK; ii++) { for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { - __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) for (int jj =0; jj < GEMM_BLOCK; jj++) { a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj]; } @@ -709,7 +709,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a, for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { for (int ii =0; ii < GEMM_BLOCK; ii++) { for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { - __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) for (int jj =0; jj < GEMM_BLOCK; jj++) { a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj]; } @@ -739,7 +739,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a, for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { for (int ii =0; ii < GEMM_BLOCK; ii++) { for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { - __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) for (int jj =0; jj < GEMM_BLOCK; jj++) { a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj]; } @@ -829,7 +829,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a, for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { for (int ii =0; ii < GEMM_BLOCK; ii++) { for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { - __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) for (int jj =0; jj < GEMM_BLOCK; jj++) { a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj]; } diff --git a/LINPACK/src/device/hpl_torus_PCIE.cl b/LINPACK/src/device/hpl_torus_PCIE.cl index 0f31f0d4..bea5b705 100644 --- a/LINPACK/src/device/hpl_torus_PCIE.cl +++ b/LINPACK/src/device/hpl_torus_PCIE.cl @@ -229,7 +229,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a, for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { for (int ii =0; ii < GEMM_BLOCK; ii++) { for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { - __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) for (int jj =0; jj < GEMM_BLOCK; jj++) { a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj]; } @@ -411,7 +411,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a, for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { for (int ii =0; ii < GEMM_BLOCK; ii++) { for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { - __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) for (int jj =0; jj < GEMM_BLOCK; jj++) { a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj]; } @@ -423,7 +423,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a, for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { for (int ii =0; ii < GEMM_BLOCK; ii++) { for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { - __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) for (int jj =0; jj < GEMM_BLOCK; jj++) { a_block_trans[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[j][i][jj][ii]; } @@ -434,7 +434,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a, for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { for (int ii =0; ii < GEMM_BLOCK; ii++) { for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { - __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) for (int jj =0; jj < GEMM_BLOCK; jj++) { a_block[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj]; } @@ -466,7 +466,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a, for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { for (int ii =0; ii < GEMM_BLOCK; ii++) { for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { - __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) for (int jj =0; jj < GEMM_BLOCK; jj++) { a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj]; } @@ -558,7 +558,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a, for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { for (int ii =0; ii < GEMM_BLOCK; ii++) { for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { - __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) for (int jj =0; jj < GEMM_BLOCK; jj++) { a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj]; } @@ -570,7 +570,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a, for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { for (int ii =0; ii < GEMM_BLOCK; ii++) { for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { - __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) for (int jj =0; jj < GEMM_BLOCK; jj++) { top_block[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj]; } @@ -601,7 +601,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a, for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { for (int ii =0; ii < GEMM_BLOCK; ii++) { for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { - __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) for (int jj =0; jj < GEMM_BLOCK; jj++) { a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj]; } @@ -684,7 +684,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a, for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { for (int ii =0; ii < GEMM_BLOCK; ii++) { for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { - __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) for (int jj =0; jj < GEMM_BLOCK; jj++) { a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj]; } @@ -697,7 +697,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a, for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) { for (int ii =0; ii < GEMM_BLOCK; ii++) { for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) { - __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) + __attribute__((opencl_unroll_hint(GEMM_BLOCK))) for (int jj =0; jj < GEMM_BLOCK; jj++) { left_block[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[j][i][jj][ii]; } From 62097c4e7d85ca7595f5dc8b8e0c07aa0c0a2387 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 7 Oct 2021 08:36:51 +0200 Subject: [PATCH 10/10] Add DATA_TYPE to readme --- LINPACK/Readme.md | 1 + 1 file changed, 1 insertion(+) diff --git a/LINPACK/Readme.md b/LINPACK/Readme.md index e0e276c1..29e33cdb 100644 --- a/LINPACK/Readme.md +++ b/LINPACK/Readme.md @@ -51,6 +51,7 @@ Name | Default | Description | `DEFAULT_MATRIX_SIZE`| 1024 | Width and heigth of the input matrix | `REGISTER_BLOCK_LOG`| 3 | Size of the blocks that will be processed in registers (2^3=8 is the default) | `LOCAL_MEM_BLOCK_LOG`| 5 | Size of the blocks that will be processed in local memory (2^3=8 is the default) | +`DATA_TYPE` | float | Used data type. Can be `float` or `double` | Moreover the environment variable `INTELFPGAOCLSDKROOT` has to be set to the root of the Intel FPGA SDK installation.