Skip to content

Commit

Permalink
Merge branch 'hpl-dp' into 'master'
Browse files Browse the repository at this point in the history
Fix HPL to support DP FP

See merge request pc2/HPCC_FPGA!52
  • Loading branch information
Mellich committed Oct 7, 2021
2 parents ff640b0 + 62097c4 commit 48e0386
Show file tree
Hide file tree
Showing 14 changed files with 127 additions and 49 deletions.
42 changes: 42 additions & 0 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,28 @@ build:LINPACK:
- cmake/**/*
- .gitlab-ci.yml


build:LINPACK_DP:
stage: build
script:
- rm -rf build
- mkdir -p build
- cd build
- cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DDATA_TYPE=double
- make -j 40 all
artifacts:
paths:
- build/bin/hpl_torus_PCIE_emulate.aocx
- build/bin/hpl_torus_IEC_emulate.aocx
- build/bin/Linpack_intel
- build/bin/Linpack_test_intel
only:
changes:
- LINPACK/**/*
- shared/**/*
- scripts/**/*
- cmake/**/*

build:GEMM:
stage: build
script:
Expand Down Expand Up @@ -420,6 +442,26 @@ test:LINPACK:
- .gitlab-ci.yml
needs: ["build:LINPACK"]

test:LINPACK_DP:
stage: test
script:
- cd build
- cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DDATA_TYPE=double
- make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test
dependencies:
- build:LINPACK_DP
artifacts:
when: on_failure
paths:
- build/Testing/Temporary/LastTest.log
only:
changes:
- LINPACK/**/*
- shared/**/*
- scripts/**/*
- cmake/**/*
needs: ["build:LINPACK_DP"]

test:GEMM:
stage: test
script:
Expand Down
5 changes: 5 additions & 0 deletions LINPACK/CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@

This file contains all changes made to the source code for each release.


## 2.4
#### Added:
- Support for double-precision floating-point

## 2.3
#### Changed:
- Refactored the code to support different execution kernels and data distributions
Expand Down
10 changes: 1 addition & 9 deletions LINPACK/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.1)
project(LINPACK VERSION 2.3)
project(LINPACK VERSION 2.4)

set(USE_DEPRECATED_HPP_HEADER No)

Expand All @@ -21,14 +21,6 @@ if (TEST_EMULATION)
set(TEST_HOST_FLAGS "--emulation")
endif()

set(DATA_TYPE float)
if (DATA_TYPE STREQUAL "double")
set(_DP Yes)
message(STATUS "Set DP flag since data type seems to be double precision")
else()
set(_DP No)
endif()

set(USE_OPENMP Yes)
set(USE_MPI Yes)

Expand Down
1 change: 1 addition & 0 deletions LINPACK/Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ Name | Default | Description |
`DEFAULT_MATRIX_SIZE`| 1024 | Width and heigth of the input matrix |
`REGISTER_BLOCK_LOG`| 3 | Size of the blocks that will be processed in registers (2^3=8 is the default) |
`LOCAL_MEM_BLOCK_LOG`| 5 | Size of the blocks that will be processed in local memory (2^3=8 is the default) |
`DATA_TYPE` | float | Used data type. Can be `float` or `double` |

Moreover the environment variable `INTELFPGAOCLSDKROOT` has to be set to the root
of the Intel FPGA SDK installation.
Expand Down
22 changes: 22 additions & 0 deletions LINPACK/configs/Nallatech_520N_B8_SB2_R5_DP_noring.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# This file contains the default configuration for the Nallatech 520N board
# for the use with single precision floating point values.
# To use this configuration file, call cmake with the parameter
#
# cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
#


set(USE_MPI Yes CACHE BOOL "" FORCE)
set(USE_SVM No CACHE BOOL "" FORCE)
set(USE_HBM No CACHE BOOL "" FORCE)
set(FPGA_BOARD_NAME "p520_max_sg280l" CACHE STRING "" FORCE)
set(AOC_FLAGS "-fpc -fp-relaxed -seed=7" CACHE STRING "" FORCE)

set(DATA_TYPE "double" CACHE STRING "The ued data type for calculation" FORCE)

# LINPACK specific options
set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE)
set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE)
set(REGISTER_BLOCK_LOG 2 CACHE STRING "Size of the block that will be manipulated in registers" FORCE)
set(NUM_REPLICATIONS 5 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE)

9 changes: 7 additions & 2 deletions LINPACK/src/common/parameters.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,18 @@
#define DEFAULT_DEVICE @DEFAULT_DEVICE@
#define HOST_DATA_TYPE @HOST_DATA_TYPE@
#define DEFAULT_MATRIX_SIZE @DEFAULT_MATRIX_SIZE@
#cmakedefine _DP @_DP@
#cmakedefine _DP

#ifdef _DP
#define MPI_DATA_TYPE MPI_DOUBLE
#else
#define MPI_DATA_TYPE MPI_FLOAT
#endif

/**
* Device specific parameters
*/
#define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@
#define GLOBAL_MEM_UNROLL @GLOBAL_MEM_UNROLL@
#define LOCAL_MEM_BLOCK_LOG @LOCAL_MEM_BLOCK_LOG@
#define REGISTER_BLOCK_LOG @REGISTER_BLOCK_LOG@
#define NUM_REPLICATIONS @NUM_REPLICATIONS@
Expand Down
12 changes: 6 additions & 6 deletions LINPACK/src/device/hpl_torus_IEC.cl
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
for (int ii =0; ii < GEMM_BLOCK; ii++) {
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
for (int jj =0; jj < GEMM_BLOCK; jj++) {
a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj];
}
Expand Down Expand Up @@ -569,7 +569,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
for (int ii =0; ii < GEMM_BLOCK; ii++) {
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
for (int jj =0; jj < GEMM_BLOCK; jj++) {
a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
}
Expand Down Expand Up @@ -600,7 +600,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a,
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
for (int ii =0; ii < GEMM_BLOCK; ii++) {
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
for (int jj =0; jj < GEMM_BLOCK; jj++) {
a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj];
}
Expand Down Expand Up @@ -709,7 +709,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a,
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
for (int ii =0; ii < GEMM_BLOCK; ii++) {
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
for (int jj =0; jj < GEMM_BLOCK; jj++) {
a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
}
Expand Down Expand Up @@ -739,7 +739,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a,
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
for (int ii =0; ii < GEMM_BLOCK; ii++) {
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
for (int jj =0; jj < GEMM_BLOCK; jj++) {
a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj];
}
Expand Down Expand Up @@ -829,7 +829,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a,
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
for (int ii =0; ii < GEMM_BLOCK; ii++) {
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
for (int jj =0; jj < GEMM_BLOCK; jj++) {
a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
}
Expand Down
20 changes: 10 additions & 10 deletions LINPACK/src/device/hpl_torus_PCIE.cl
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
for (int ii =0; ii < GEMM_BLOCK; ii++) {
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
for (int jj =0; jj < GEMM_BLOCK; jj++) {
a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj];
}
Expand Down Expand Up @@ -411,7 +411,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
for (int ii =0; ii < GEMM_BLOCK; ii++) {
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
for (int jj =0; jj < GEMM_BLOCK; jj++) {
a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
}
Expand All @@ -423,7 +423,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
for (int ii =0; ii < GEMM_BLOCK; ii++) {
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
for (int jj =0; jj < GEMM_BLOCK; jj++) {
a_block_trans[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[j][i][jj][ii];
}
Expand All @@ -434,7 +434,7 @@ lu(__global DEVICE_DATA_TYPE* restrict a,
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
for (int ii =0; ii < GEMM_BLOCK; ii++) {
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
for (int jj =0; jj < GEMM_BLOCK; jj++) {
a_block[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
}
Expand Down Expand Up @@ -466,7 +466,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a,
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
for (int ii =0; ii < GEMM_BLOCK; ii++) {
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
for (int jj =0; jj < GEMM_BLOCK; jj++) {
a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj];
}
Expand Down Expand Up @@ -558,7 +558,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a,
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
for (int ii =0; ii < GEMM_BLOCK; ii++) {
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
for (int jj =0; jj < GEMM_BLOCK; jj++) {
a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
}
Expand All @@ -570,7 +570,7 @@ void top_update(__global DEVICE_DATA_TYPE* restrict a,
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
for (int ii =0; ii < GEMM_BLOCK; ii++) {
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
for (int jj =0; jj < GEMM_BLOCK; jj++) {
top_block[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
}
Expand Down Expand Up @@ -601,7 +601,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a,
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
for (int ii =0; ii < GEMM_BLOCK; ii++) {
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
for (int jj =0; jj < GEMM_BLOCK; jj++) {
a_buffer[i][j][ii][jj] = a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj];
}
Expand Down Expand Up @@ -684,7 +684,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a,
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
for (int ii =0; ii < GEMM_BLOCK; ii++) {
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
for (int jj =0; jj < GEMM_BLOCK; jj++) {
a[block_col * BLOCK_SIZE + (block_row * BLOCK_SIZE + i * GEMM_BLOCK + ii) * BLOCK_SIZE * blocks_per_row + j * GEMM_BLOCK + jj] = a_buffer[i][j][ii][jj];
}
Expand All @@ -697,7 +697,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a,
for (int i =0; i < BLOCK_SIZE/GEMM_BLOCK; i++) {
for (int ii =0; ii < GEMM_BLOCK; ii++) {
for (int j =0; j < BLOCK_SIZE/GEMM_BLOCK; j++) {
__attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL)))
__attribute__((opencl_unroll_hint(GEMM_BLOCK)))
for (int jj =0; jj < GEMM_BLOCK; jj++) {
left_block[(i * GEMM_BLOCK + ii) * BLOCK_SIZE + j * GEMM_BLOCK + jj] = a_buffer[j][i][jj][ii];
}
Expand Down
8 changes: 4 additions & 4 deletions LINPACK/src/host/execution_types/execution_pcie.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -225,9 +225,9 @@ calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings>&co
lu_queues.back().finish();

// Broadcast LU block in column to update all left blocks
MPI_Bcast(lu_block, config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, col_communicator);
MPI_Bcast(lu_block, config.programSettings->blockSize*config.programSettings->blockSize, MPI_DATA_TYPE, local_block_row_remainder, col_communicator);
// Broadcast LU block in row to update all top blocks
MPI_Bcast(lu_trans_block, config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, row_communicator);
MPI_Bcast(lu_trans_block, config.programSettings->blockSize*config.programSettings->blockSize, MPI_DATA_TYPE, local_block_row_remainder, row_communicator);

if (num_top_blocks > 0) {

Expand Down Expand Up @@ -329,10 +329,10 @@ calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings>&co

// Send the left and top blocks to all other ranks so they can be used to update all inner blocks
for (int lbi=0; lbi < blocks_per_row - local_block_row; lbi++) {
MPI_Bcast(left_blocks[lbi], config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, row_communicator);
MPI_Bcast(left_blocks[lbi], config.programSettings->blockSize*config.programSettings->blockSize, MPI_DATA_TYPE, local_block_row_remainder, row_communicator);
}
for (int tbi=0; tbi < blocks_per_row - local_block_row; tbi++) {
MPI_Bcast(top_blocks[tbi], config.programSettings->blockSize*config.programSettings->blockSize, MPI_FLOAT, local_block_row_remainder, col_communicator);
MPI_Bcast(top_blocks[tbi], config.programSettings->blockSize*config.programSettings->blockSize, MPI_DATA_TYPE, local_block_row_remainder, col_communicator);
}

// update all remaining inner blocks using only global memory
Expand Down
Loading

0 comments on commit 48e0386

Please sign in to comment.