Skip to content

Commit

Permalink
Merge branch 'optimize-ptrans-xilinx' into 'master'
Browse files Browse the repository at this point in the history
Optimize PTRANS kernel for Xilinx Vitis

See merge request pc2/HPCC_FPGA!9
  • Loading branch information
Marius Meyer committed May 12, 2020
2 parents 949da2e + 04f7218 commit c73b039
Show file tree
Hide file tree
Showing 11 changed files with 43 additions and 29 deletions.
5 changes: 5 additions & 0 deletions PTRANS/CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@

This file contains all changes made to the source code for each release.

## 1.1

#### Added:
- Optimized kernel for Xilinx Vitis toolchain

## 1.0.1

#### Added:
Expand Down
4 changes: 2 additions & 2 deletions PTRANS/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
cmake_minimum_required(VERSION 3.1)
project(PTRANS VERSION 1.0.1)
project(PTRANS VERSION 1.1)

set(KERNEL_NAME transpose CACHE STRING "Name of the OpenCL kernel")
set(DEFAULT_MATRIX_SIZE 4096 CACHE STRING "Default size of the used matrices")
set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices")
set(BLOCK_SIZE 512 CACHE STRING "Block size used in the FPGA kernel")
set(GLOBAL_MEM_UNROLL 16 CACHE STRING "Unrolling factor used to stream data")

Expand Down
29 changes: 16 additions & 13 deletions PTRANS/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,25 +69,28 @@ For execution of the benchmark run:

For more information on available input parameters run

./trans_intel -h

$./trans_xilinx -h
Implementation of the matrix transposition benchmark proposed in the HPCC benchmark suite for FPGA.
Version: 0.1.2
Version: 1.0.1

Usage:
./trans_intel [OPTION...]

-f, --file arg Kernel file name
-n, arg Number of repetitions (default: 10)
-m, arg Matrix size (default: 4096)
--kernel arg Name of the kernel (default: transpose)
-i, --nointerleaving Disable memory interleaving
--device arg Index of the device that has to be used. If not given
./trans_xilinx [OPTION...]

-f, --file arg Kernel file name
-n, arg Number of repetitions (default: 10)
-m, arg Matrix size in number of blocks in one dimension
(default: 8)
-b, arg Block size in number of values in one dimension
(default: 512)
--kernel arg Name of the kernel (default: transpose)
-i, --nointerleaving Disable memory interleaving
--device arg Index of the device that has to be used. If not given
you will be asked which device to use if there are
multiple devices available. (default: -1)
--platform arg Index of the platform that has to be used. If not
--platform arg Index of the platform that has to be used. If not
given you will be asked which platform to use if there
are multiple platforms available. (default: -1)
-h, --help Print this help
-h, --help Print this help



Expand Down
8 changes: 4 additions & 4 deletions PTRANS/src/device/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@ include(${CMAKE_SOURCE_DIR}/../cmake/kernelTargets.cmake)

if (INTELFPGAOPENCL_FOUND)
generate_kernel_targets_intel(transpose_optimized)
add_test(NAME test_emulation_optimized_intel COMMAND trans_intel -f transpose_optimized_emulate.aocx -n 1 -m ${BLOCK_SIZE} WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./trans_intel -f transpose_optimized_emulate.aocx -n 1 -m ${BLOCK_SIZE}
add_test(NAME test_emulation_optimized_intel COMMAND trans_intel -f transpose_optimized_emulate.aocx -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./trans_intel -f transpose_optimized_emulate.aocx -n 1 -m 1
WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
endif()

if (VITIS_FOUND)
generate_kernel_targets_xilinx(transpose_optimized)
add_test(NAME test_emulation_optimized_xilinx COMMAND trans_xilinx -f transpose_optimized_emulate.xclbin -n 1 -m ${BLOCK_SIZE} WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./trans_xilinx -f transpose_optimized_emulate.xclbin -n 1 -m ${BLOCK_SIZE}
add_test(NAME test_emulation_optimized_xilinx COMMAND trans_xilinx -f transpose_optimized_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./trans_xilinx -f transpose_optimized_emulate.xclbin -n 1 -m 1
WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
endif()

10 changes: 5 additions & 5 deletions PTRANS/src/device/transpose_optimized.cl
Original file line number Diff line number Diff line change
Expand Up @@ -25,19 +25,19 @@
__attribute__((max_global_work_dim(0)))
__kernel
void transpose(__global DEVICE_DATA_TYPE *restrict A,
__global DEVICE_DATA_TYPE *restrict B,
__global DEVICE_DATA_TYPE *restrict A_out,
uint matrixSize) {
__global DEVICE_DATA_TYPE *restrict B,
__global DEVICE_DATA_TYPE *restrict A_out,
const uint number_of_blocks) {

const unsigned number_of_blocks = matrixSize / BLOCK_SIZE;
const unsigned matrixSize = number_of_blocks * BLOCK_SIZE;

// transpose the matrix block-wise from global memory
#pragma loop_coalesce 2
for (int block_row = 0; block_row < number_of_blocks; block_row++) {
for (int block_col = 0; block_col < number_of_blocks; block_col++) {

// local memory buffer for a matrix block
DEVICE_DATA_TYPE a_block[BLOCK_SIZE * BLOCK_SIZE / GLOBAL_MEM_UNROLL][GLOBAL_MEM_UNROLL];
DEVICE_DATA_TYPE a_block[BLOCK_SIZE * BLOCK_SIZE / GLOBAL_MEM_UNROLL][GLOBAL_MEM_UNROLL] __attribute__((xcl_array_partition(cyclic, GLOBAL_MEM_UNROLL,1))) __attribute__((xcl_array_partition(cyclic, GLOBAL_MEM_UNROLL,2)));

// read in block from global memory and store it in a memory efficient manner
#pragma loop_coalesce 2
Expand Down
1 change: 1 addition & 0 deletions PTRANS/src/host/execution.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ namespace bm_execution {
std::string kernelName;
uint repetitons;
cl_uint matrixSize;
cl_uint blockSize;
bool useMemInterleaving;
};

Expand Down
2 changes: 1 addition & 1 deletion PTRANS/src/host/execution_default.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ namespace bm_execution {
transposeKernel.setArg(0, bufferA);
transposeKernel.setArg(1, bufferB);
transposeKernel.setArg(2, bufferA_out);
transposeKernel.setArg(3, config->matrixSize);
transposeKernel.setArg(3, config->matrixSize / config->blockSize);

cl::CommandQueue queue(config->context);

Expand Down
3 changes: 2 additions & 1 deletion PTRANS/src/host/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ main(int argc, char *argv[]) {
context, device, program,
programSettings->kernelName,
programSettings->numRepetitions,
programSettings->matrixSize,
programSettings->matrixSize * programSettings->blockSize,
programSettings->blockSize,
programSettings->useMemInterleaving
});

Expand Down
8 changes: 5 additions & 3 deletions PTRANS/src/host/transpose_functionality.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,10 @@ parseProgramParameters(int argc, char *argv[]) {
("f,file", "Kernel file name", cxxopts::value<std::string>())
("n", "Number of repetitions",
cxxopts::value<uint>()->default_value(std::to_string(DEFAULT_REPETITIONS)))
("m", "Matrix size",
("m", "Matrix size in number of blocks in one dimension",
cxxopts::value<cl_uint>()->default_value(std::to_string(DEFAULT_MATRIX_SIZE)))
("b", "Block size in number of values in one dimension",
cxxopts::value<cl_uint>()->default_value(std::to_string(BLOCK_SIZE)))
("kernel", "Name of the kernel",
cxxopts::value<std::string>()->default_value(KERNEL_NAME))
("i,nointerleaving", "Disable memory interleaving")
Expand Down Expand Up @@ -88,6 +90,7 @@ parseProgramParameters(int argc, char *argv[]) {
std::shared_ptr<ProgramSettings> sharedSettings(
new ProgramSettings{result["n"].as<uint>(),
result["m"].as<cl_uint>(),
result["b"].as<cl_uint>(),
result["platform"].as<int>(),
result["device"].as<int>(),
static_cast<bool>(result.count("i") <= 0),
Expand Down Expand Up @@ -178,8 +181,7 @@ void printFinalConfiguration(const std::shared_ptr<ProgramSettings> &programSett
std::cout << "Summary:" << std::endl
<< "Repetitions: " << programSettings->numRepetitions
<< std::endl
<< "Matrix Size: " << programSettings->matrixSize
<< "x" << programSettings->matrixSize
<< "Matrix Size: " << programSettings->matrixSize * programSettings->blockSize
<< std::endl
<< "Memory Interleaving: " << (programSettings->useMemInterleaving ? "Yes" : "No")
<< std::endl
Expand Down
1 change: 1 addition & 0 deletions PTRANS/src/host/transpose_functionality.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ Moreover the version and build time is also compiled into the description.
struct ProgramSettings {
uint numRepetitions;
cl_uint matrixSize;
cl_uint blockSize;
int defaultPlatform;
int defaultDevice;
bool useMemInterleaving;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ struct OpenCLKernelTest : testing::Test {
KERNEL_NAME,
1,
matrix_size,
BLOCK_SIZE,
false
});
generateInputData(matrix_size, A, B);
Expand Down

0 comments on commit c73b039

Please sign in to comment.