diff --git a/PTRANS/CHANGELOG b/PTRANS/CHANGELOG index c7665867..e0c91f51 100644 --- a/PTRANS/CHANGELOG +++ b/PTRANS/CHANGELOG @@ -2,6 +2,11 @@ This file contains all changes made to the source code for each release. +## 1.1 + +#### Added: +- Optimized kernel for Xilinx Vitis toolchain + ## 1.0.1 #### Added: diff --git a/PTRANS/CMakeLists.txt b/PTRANS/CMakeLists.txt index 95326050..4af8b3ab 100755 --- a/PTRANS/CMakeLists.txt +++ b/PTRANS/CMakeLists.txt @@ -1,8 +1,8 @@ cmake_minimum_required(VERSION 3.1) -project(PTRANS VERSION 1.0.1) +project(PTRANS VERSION 1.1) set(KERNEL_NAME transpose CACHE STRING "Name of the OpenCL kernel") -set(DEFAULT_MATRIX_SIZE 4096 CACHE STRING "Default size of the used matrices") +set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices") set(BLOCK_SIZE 512 CACHE STRING "Block size used in the FPGA kernel") set(GLOBAL_MEM_UNROLL 16 CACHE STRING "Unrolling factor used to stream data") diff --git a/PTRANS/README.md b/PTRANS/README.md index 13b837a2..584c401a 100644 --- a/PTRANS/README.md +++ b/PTRANS/README.md @@ -69,25 +69,28 @@ For execution of the benchmark run: For more information on available input parameters run - ./trans_intel -h - + $./trans_xilinx -h Implementation of the matrix transposition benchmark proposed in the HPCC benchmark suite for FPGA. - Version: 0.1.2 + Version: 1.0.1 + Usage: - ./trans_intel [OPTION...] - - -f, --file arg Kernel file name - -n, arg Number of repetitions (default: 10) - -m, arg Matrix size (default: 4096) - --kernel arg Name of the kernel (default: transpose) - -i, --nointerleaving Disable memory interleaving - --device arg Index of the device that has to be used. If not given + ./trans_xilinx [OPTION...] + + -f, --file arg Kernel file name + -n, arg Number of repetitions (default: 10) + -m, arg Matrix size in number of blocks in one dimension + (default: 8) + -b, arg Block size in number of values in one dimension + (default: 512) + --kernel arg Name of the kernel (default: transpose) + -i, --nointerleaving Disable memory interleaving + --device arg Index of the device that has to be used. If not given you will be asked which device to use if there are multiple devices available. (default: -1) - --platform arg Index of the platform that has to be used. If not + --platform arg Index of the platform that has to be used. If not given you will be asked which platform to use if there are multiple platforms available. (default: -1) - -h, --help Print this help + -h, --help Print this help diff --git a/PTRANS/src/device/CMakeLists.txt b/PTRANS/src/device/CMakeLists.txt index ea702d0c..4bf2e7da 100644 --- a/PTRANS/src/device/CMakeLists.txt +++ b/PTRANS/src/device/CMakeLists.txt @@ -3,15 +3,15 @@ include(${CMAKE_SOURCE_DIR}/../cmake/kernelTargets.cmake) if (INTELFPGAOPENCL_FOUND) generate_kernel_targets_intel(transpose_optimized) - add_test(NAME test_emulation_optimized_intel COMMAND trans_intel -f transpose_optimized_emulate.aocx -n 1 -m ${BLOCK_SIZE} WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) - add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./trans_intel -f transpose_optimized_emulate.aocx -n 1 -m ${BLOCK_SIZE} + add_test(NAME test_emulation_optimized_intel COMMAND trans_intel -f transpose_optimized_emulate.aocx -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./trans_intel -f transpose_optimized_emulate.aocx -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() if (VITIS_FOUND) generate_kernel_targets_xilinx(transpose_optimized) - add_test(NAME test_emulation_optimized_xilinx COMMAND trans_xilinx -f transpose_optimized_emulate.xclbin -n 1 -m ${BLOCK_SIZE} WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) - add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./trans_xilinx -f transpose_optimized_emulate.xclbin -n 1 -m ${BLOCK_SIZE} + add_test(NAME test_emulation_optimized_xilinx COMMAND trans_xilinx -f transpose_optimized_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./trans_xilinx -f transpose_optimized_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() diff --git a/PTRANS/src/device/transpose_optimized.cl b/PTRANS/src/device/transpose_optimized.cl index 06496878..723f1c13 100644 --- a/PTRANS/src/device/transpose_optimized.cl +++ b/PTRANS/src/device/transpose_optimized.cl @@ -25,11 +25,11 @@ __attribute__((max_global_work_dim(0))) __kernel void transpose(__global DEVICE_DATA_TYPE *restrict A, - __global DEVICE_DATA_TYPE *restrict B, -__global DEVICE_DATA_TYPE *restrict A_out, - uint matrixSize) { + __global DEVICE_DATA_TYPE *restrict B, + __global DEVICE_DATA_TYPE *restrict A_out, + const uint number_of_blocks) { - const unsigned number_of_blocks = matrixSize / BLOCK_SIZE; + const unsigned matrixSize = number_of_blocks * BLOCK_SIZE; // transpose the matrix block-wise from global memory #pragma loop_coalesce 2 @@ -37,7 +37,7 @@ __global DEVICE_DATA_TYPE *restrict A_out, for (int block_col = 0; block_col < number_of_blocks; block_col++) { // local memory buffer for a matrix block - DEVICE_DATA_TYPE a_block[BLOCK_SIZE * BLOCK_SIZE / GLOBAL_MEM_UNROLL][GLOBAL_MEM_UNROLL]; + DEVICE_DATA_TYPE a_block[BLOCK_SIZE * BLOCK_SIZE / GLOBAL_MEM_UNROLL][GLOBAL_MEM_UNROLL] __attribute__((xcl_array_partition(cyclic, GLOBAL_MEM_UNROLL,1))) __attribute__((xcl_array_partition(cyclic, GLOBAL_MEM_UNROLL,2))); // read in block from global memory and store it in a memory efficient manner #pragma loop_coalesce 2 diff --git a/PTRANS/src/host/execution.h b/PTRANS/src/host/execution.h index e640ec74..de460742 100644 --- a/PTRANS/src/host/execution.h +++ b/PTRANS/src/host/execution.h @@ -40,6 +40,7 @@ namespace bm_execution { std::string kernelName; uint repetitons; cl_uint matrixSize; + cl_uint blockSize; bool useMemInterleaving; }; diff --git a/PTRANS/src/host/execution_default.cpp b/PTRANS/src/host/execution_default.cpp index 36f10a86..b5267d2e 100644 --- a/PTRANS/src/host/execution_default.cpp +++ b/PTRANS/src/host/execution_default.cpp @@ -55,7 +55,7 @@ namespace bm_execution { transposeKernel.setArg(0, bufferA); transposeKernel.setArg(1, bufferB); transposeKernel.setArg(2, bufferA_out); - transposeKernel.setArg(3, config->matrixSize); + transposeKernel.setArg(3, config->matrixSize / config->blockSize); cl::CommandQueue queue(config->context); diff --git a/PTRANS/src/host/main.cpp b/PTRANS/src/host/main.cpp index c2ed8a96..07d64b98 100644 --- a/PTRANS/src/host/main.cpp +++ b/PTRANS/src/host/main.cpp @@ -37,7 +37,8 @@ main(int argc, char *argv[]) { context, device, program, programSettings->kernelName, programSettings->numRepetitions, - programSettings->matrixSize, + programSettings->matrixSize * programSettings->blockSize, + programSettings->blockSize, programSettings->useMemInterleaving }); diff --git a/PTRANS/src/host/transpose_functionality.cpp b/PTRANS/src/host/transpose_functionality.cpp index cb832794..f4d98494 100644 --- a/PTRANS/src/host/transpose_functionality.cpp +++ b/PTRANS/src/host/transpose_functionality.cpp @@ -56,8 +56,10 @@ parseProgramParameters(int argc, char *argv[]) { ("f,file", "Kernel file name", cxxopts::value()) ("n", "Number of repetitions", cxxopts::value()->default_value(std::to_string(DEFAULT_REPETITIONS))) - ("m", "Matrix size", + ("m", "Matrix size in number of blocks in one dimension", cxxopts::value()->default_value(std::to_string(DEFAULT_MATRIX_SIZE))) + ("b", "Block size in number of values in one dimension", + cxxopts::value()->default_value(std::to_string(BLOCK_SIZE))) ("kernel", "Name of the kernel", cxxopts::value()->default_value(KERNEL_NAME)) ("i,nointerleaving", "Disable memory interleaving") @@ -88,6 +90,7 @@ parseProgramParameters(int argc, char *argv[]) { std::shared_ptr sharedSettings( new ProgramSettings{result["n"].as(), result["m"].as(), + result["b"].as(), result["platform"].as(), result["device"].as(), static_cast(result.count("i") <= 0), @@ -178,8 +181,7 @@ void printFinalConfiguration(const std::shared_ptr &programSett std::cout << "Summary:" << std::endl << "Repetitions: " << programSettings->numRepetitions << std::endl - << "Matrix Size: " << programSettings->matrixSize - << "x" << programSettings->matrixSize + << "Matrix Size: " << programSettings->matrixSize * programSettings->blockSize << std::endl << "Memory Interleaving: " << (programSettings->useMemInterleaving ? "Yes" : "No") << std::endl diff --git a/PTRANS/src/host/transpose_functionality.hpp b/PTRANS/src/host/transpose_functionality.hpp index 53001d4b..69c63375 100644 --- a/PTRANS/src/host/transpose_functionality.hpp +++ b/PTRANS/src/host/transpose_functionality.hpp @@ -46,6 +46,7 @@ Moreover the version and build time is also compiled into the description. struct ProgramSettings { uint numRepetitions; cl_uint matrixSize; + cl_uint blockSize; int defaultPlatform; int defaultDevice; bool useMemInterleaving; diff --git a/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp b/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp index 0aee88ca..d442f2a6 100644 --- a/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp +++ b/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp @@ -54,6 +54,7 @@ struct OpenCLKernelTest : testing::Test { KERNEL_NAME, 1, matrix_size, + BLOCK_SIZE, false }); generateInputData(matrix_size, A, B);