From 5fe425b8eca36f576b9f1f00aa86642a91fe5247 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 11 May 2020 16:42:17 +0200 Subject: [PATCH 01/45] Add shared test headers and code --- shared/setup/common_benchmark_io.hpp | 66 ++++++++++++++++++++++++++ shared/setup/fpga_setup.cpp | 4 +- shared/setup/test_fpga_setup.cpp | 4 +- shared/testing/main.cpp | 59 +++++++++++++++++++++++ shared/testing/test_program_settings.h | 27 +++++++++++ 5 files changed, 156 insertions(+), 4 deletions(-) create mode 100644 shared/setup/common_benchmark_io.hpp create mode 100644 shared/testing/main.cpp create mode 100644 shared/testing/test_program_settings.h diff --git a/shared/setup/common_benchmark_io.hpp b/shared/setup/common_benchmark_io.hpp new file mode 100644 index 00000000..72dd2834 --- /dev/null +++ b/shared/setup/common_benchmark_io.hpp @@ -0,0 +1,66 @@ +/* +Copyright (c) 2020 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ +#ifndef SRC_HOST_COMMON_BENCHMARK_IO_H_ +#define SRC_HOST_COMMON_BENCHMARK_IO_H_ + + +/* Project's headers */ +#include "program_settings.h" + +/* External library headers */ +#include "CL/cl.hpp" + +#define STR_EXPAND(tok) #tok +#define STR(tok) STR_EXPAND(tok) + +#define ENTRY_SPACE 15 + +/** +* Parses and returns program options using the cxxopts library. +* The parsed parameters depend on the benchmark that implements this +* function. +* The header file is used to specify a unified interface so it can also be used +* in the testing binary. +* cxxopts is used to parse the parameters. +* @see https://github.com/jarro2783/cxxopts +* +* @param argc Number of input parameters as it is provided by the main function +* @param argv Strings containing the input parameters as provided by the main function +* +* @return program settings that are created from the given program arguments +*/ +std::shared_ptr +parseProgramParameters(int argc, char *argv[]); + + +/** + * Prints the used configuration to std out before starting the actual benchmark. + * + * @param programSettings The program settings that are parsed from the command line + using parseProgramParameters + * @param device The device selected for execution + */ +void +printFinalConfiguration(const std::shared_ptr &programSettings, + const cl::Device &device); + +#endif diff --git a/shared/setup/fpga_setup.cpp b/shared/setup/fpga_setup.cpp index 794731d9..41a8739e 100644 --- a/shared/setup/fpga_setup.cpp +++ b/shared/setup/fpga_setup.cpp @@ -261,8 +261,8 @@ choose a device. if (defaultDevice < deviceList.size()) { chosenDeviceId = defaultDevice; } else { - std::cerr << "Default platform " << defaultDevice - << " can not be used. Available platforms: " + std::cerr << "Default device " << defaultDevice + << " can not be used. 
Available devices: " << deviceList.size() << std::endl; exit(1); } diff --git a/shared/setup/test_fpga_setup.cpp b/shared/setup/test_fpga_setup.cpp index 58eb13c1..6d8cd5be 100644 --- a/shared/setup/test_fpga_setup.cpp +++ b/shared/setup/test_fpga_setup.cpp @@ -20,7 +20,7 @@ TEST (FPGASetup, FindNonExistingPlatform) { // TODO regex does not work so for now its not tested! EXPECT_EXIT(fpga_setup::selectFPGADevice(DEFAULT_PLATFORM + 100, DEFAULT_DEVICE), ::testing::ExitedWithCode(1), - ::testing::MatchesRegex(".*")); + ::testing::MatchesRegex(".*?Default platform \\d+ can not be used. Available platforms: \\d+")); } /** @@ -31,7 +31,7 @@ TEST (FPGASetup, FindNonExistingDevice) { // TODO regex does not work so for now its not tested! EXPECT_EXIT(fpga_setup::selectFPGADevice(DEFAULT_PLATFORM, DEFAULT_DEVICE + 100), ::testing::ExitedWithCode(1), - ::testing::MatchesRegex(".*")); + ::testing::MatchesRegex(".*?Default device \\d+ can not be used. Available devices: \\d+")); } diff --git a/shared/testing/main.cpp b/shared/testing/main.cpp new file mode 100644 index 00000000..819cf53a --- /dev/null +++ b/shared/testing/main.cpp @@ -0,0 +1,59 @@ +/* +Copyright (c) 2020 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +/* Project's headers */ +#include "program_settings.h" +#include "setup/common_benchmark_io.hpp" +#include "setup/fpga_setup.hpp" +#include "test_program_settings.h" + +#include "gtest/gtest.h" +#include "CL/cl.hpp" + +std::shared_ptr programSettings; + +/** +The program entry point for the unit tests +*/ +int +main(int argc, char *argv[]) { + + std::cout << "THIS BINARY EXECUTES UNIT TESTS FOR THE FOLLOWING BENCHMARK:" << std::endl << std::endl; + + // Parse input parameters + programSettings = parseProgramParameters(argc, argv); + fpga_setup::setupEnvironmentAndClocks(); + + std::vector usedDevice = + fpga_setup::selectFPGADevice(programSettings->defaultPlatform, + programSettings->defaultDevice); + + // Print input parameters + printFinalConfiguration(programSettings, usedDevice[0]); + + ::testing::InitGoogleTest(); + + return RUN_ALL_TESTS(); + + +} + diff --git a/shared/testing/test_program_settings.h b/shared/testing/test_program_settings.h new file mode 100644 index 00000000..a9a70991 --- /dev/null +++ b/shared/testing/test_program_settings.h @@ -0,0 +1,27 @@ +/* +Copyright (c) 2020 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +/* Project's headers */ +#include "program_settings.h" + + +extern std::shared_ptr programSettings; From 69eea71cb3df2e3145d7c9c6899a6bda72485318 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 11 May 2020 16:42:55 +0200 Subject: [PATCH 02/45] Convert STREAM to shared test files --- STREAM/src/host/CMakeLists.txt | 3 +- .../common_benchmark_io_implementation.cpp | 103 ++++++++++++++++++ STREAM/src/host/main.cpp | 2 + STREAM/src/host/program_settings.h | 35 ++++++ STREAM/src/host/stream_functionality.cpp | 103 +----------------- STREAM/src/host/stream_functionality.hpp | 48 -------- STREAM/tests/CMakeLists.txt | 14 ++- ...nel_functionality_and_host_integration.cpp | 51 ++++----- 8 files changed, 173 insertions(+), 186 deletions(-) create mode 100644 STREAM/src/host/common_benchmark_io_implementation.cpp create mode 100644 STREAM/src/host/program_settings.h diff --git a/STREAM/src/host/CMakeLists.txt b/STREAM/src/host/CMakeLists.txt index 859bd2b5..66fa6946 100755 --- a/STREAM/src/host/CMakeLists.txt +++ b/STREAM/src/host/CMakeLists.txt @@ -1,6 +1,7 @@ include_directories(../../../extern/cxxopts/include ../../../shared) +include_directories(./) -set(HOST_SOURCE execution_default.cpp main.cpp ../../../shared/setup/fpga_setup.cpp stream_functionality.cpp) +set(HOST_SOURCE execution_default.cpp main.cpp ../../../shared/setup/fpga_setup.cpp common_benchmark_io_implementation.cpp stream_functionality.cpp) if (INTELFPGAOPENCL_FOUND) include_directories(${IntelFPGAOpenCL_INCLUDE_DIRS}) diff --git 
a/STREAM/src/host/common_benchmark_io_implementation.cpp b/STREAM/src/host/common_benchmark_io_implementation.cpp new file mode 100644 index 00000000..b95ad3d5 --- /dev/null +++ b/STREAM/src/host/common_benchmark_io_implementation.cpp @@ -0,0 +1,103 @@ + +#include "cxxopts.hpp" +#include "parameters.h" +#include "setup/common_benchmark_io.hpp" + +/** +Parses and returns program options using the cxxopts library. +Supports the following parameters: + - file name of the FPGA kernel file (-f,--file) + - number of repetitions (-n) + - number of kernel replications (-r) + - data size (-s) + - use memory interleaving +@see https://github.com/jarro2783/cxxopts + +@return program settings that are created from the given program arguments +*/ +std::shared_ptr +parseProgramParameters(int argc, char *argv[]) { + // Defining and parsing program options + cxxopts::Options options(argv[0], PROGRAM_DESCRIPTION); + options.add_options() + ("f,file", "Kernel file name", cxxopts::value()) + ("n", "Number of repetitions", + cxxopts::value()->default_value(std::to_string(DEFAULT_REPETITIONS))) + ("s", "Size of the data arrays", + cxxopts::value()->default_value(std::to_string(DEFAULT_ARRAY_LENGTH))) + ("r", "Number of kernel replications used", + cxxopts::value()->default_value(std::to_string(NUM_KERNEL_REPLICATIONS))) +#ifdef INTEL_FPGA + ("i", "Use memory Interleaving") +#endif + ("single-kernel", "Use the single kernel implementation") + ("device", "Index of the device that has to be used. If not given you "\ + "will be asked which device to use if there are multiple devices "\ + "available.", cxxopts::value()->default_value(std::to_string(DEFAULT_DEVICE))) + ("platform", "Index of the platform that has to be used. 
If not given "\ + "you will be asked which platform to use if there are multiple "\ + "platforms available.", + cxxopts::value()->default_value(std::to_string(DEFAULT_PLATFORM))) + ("h,help", "Print this help"); + cxxopts::ParseResult result = options.parse(argc, argv); + + if (result.count("h")) { + // Just print help when argument is given + std::cout << options.help() << std::endl; + exit(0); + } + // Check parsed options and handle special cases + if (result.count("f") <= 0) { + // Path to the kernel file is mandatory - exit if not given! + std::cerr << "Kernel file must be given! Aborting" << std::endl; + std::cout << options.help() << std::endl; + exit(1); + } + + // Create program settings from program arguments + std::shared_ptr sharedSettings( + new ProgramSettings{result["n"].as(), + result["s"].as(), + result["r"].as(), +#ifdef INTEL_FPGA + static_cast(result.count("i")), +#else + false, +#endif + result["platform"].as(), + result["device"].as(), + result["f"].as(), + static_cast(result.count("single-kernel"))}); + return sharedSettings; +} + +/** + * Prints the used configuration to std out before starting the actual benchmark. + * + * @param programSettings The program settings retrieved from the command line + * @param device The device used for execution + */ +void printFinalConfiguration(const std::shared_ptr &programSettings, + const cl::Device &device) {// Give setup summary + std::cout << PROGRAM_DESCRIPTION << std::endl; + std::cout << "Version: " << VERSION << std::endl << HLINE; + std::cout << "Summary:" << std::endl + << "Array Size: " + << static_cast(programSettings->streamArraySize * sizeof(HOST_DATA_TYPE)) << " Byte" + << std::endl + << "Data Type: " << STR(HOST_DATA_TYPE) + << std::endl + << "Repetitions: " << programSettings->numRepetitions + << std::endl + << "Kernel Replications: " << programSettings->kernelReplications + << std::endl + << "Kernel Type: " << (programSettings->useSingleKernel ? 
"Single" : "Separate") + << std::endl + << "Kernel File: " << programSettings->kernelFileName + << std::endl; + std::cout << "Device: " + << device.getInfo() << std::endl; + std::cout << HLINE + << "Start benchmark using the given configuration." << std::endl + << HLINE; +} diff --git a/STREAM/src/host/main.cpp b/STREAM/src/host/main.cpp index 2d9395a2..493e692b 100644 --- a/STREAM/src/host/main.cpp +++ b/STREAM/src/host/main.cpp @@ -3,6 +3,8 @@ // #include "stream_functionality.hpp" +#include "program_settings.h" +#include "setup/common_benchmark_io.hpp" #include "CL/opencl.h" /** diff --git a/STREAM/src/host/program_settings.h b/STREAM/src/host/program_settings.h new file mode 100644 index 00000000..53f8f34e --- /dev/null +++ b/STREAM/src/host/program_settings.h @@ -0,0 +1,35 @@ + +#ifndef SRC_HOST_PROGRAM_SETTINGS_H_ +#define SRC_HOST_PROGRAM_SETTINGS_H_ + +#include "parameters.h" + +/* C++ standard library headers */ +#include + +#include "CL/opencl.h" + + + +#define PROGRAM_DESCRIPTION "Implementation of the STREAM benchmark"\ + " proposed in the HPCC benchmark suite for FPGA.\n"\ + "Version: " VERSION "\n" + + +/** +* A struct that is used to store the program settings +* provided by command line arguments. +*/ +struct ProgramSettings { + uint numRepetitions; + uint streamArraySize; + uint kernelReplications; + bool useMemoryInterleaving; + int defaultPlatform; + int defaultDevice; + std::string kernelFileName; + bool useSingleKernel; +}; + + +#endif diff --git a/STREAM/src/host/stream_functionality.cpp b/STREAM/src/host/stream_functionality.cpp index ba70ccab..705c72fa 100644 --- a/STREAM/src/host/stream_functionality.cpp +++ b/STREAM/src/host/stream_functionality.cpp @@ -32,78 +32,10 @@ SOFTWARE. /* Project's headers */ #include "execution.h" -#include "cxxopts.hpp" #include "setup/fpga_setup.hpp" #include "parameters.h" - -/** -Parses and returns program options using the cxxopts library. 
-Supports the following parameters: - - file name of the FPGA kernel file (-f,--file) - - number of repetitions (-n) - - number of kernel replications (-r) - - data size (-d) - - use memory interleaving -@see https://github.com/jarro2783/cxxopts - -@return program settings that are created from the given program arguments -*/ -std::shared_ptr -parseProgramParameters(int argc, char *argv[]) { - // Defining and parsing program options - cxxopts::Options options(argv[0], PROGRAM_DESCRIPTION); - options.add_options() - ("f,file", "Kernel file name", cxxopts::value()) - ("n", "Number of repetitions", - cxxopts::value()->default_value(std::to_string(DEFAULT_REPETITIONS))) - ("s", "Size of the data arrays", - cxxopts::value()->default_value(std::to_string(DEFAULT_ARRAY_LENGTH))) - ("r", "Number of kernel replications used", - cxxopts::value()->default_value(std::to_string(NUM_KERNEL_REPLICATIONS))) -#ifdef INTEL_FPGA - ("i", "Use memory Interleaving") -#endif - ("single-kernel", "Use the single kernel implementation") - ("device", "Index of the device that has to be used. If not given you "\ - "will be asked which device to use if there are multiple devices "\ - "available.", cxxopts::value()->default_value(std::to_string(DEFAULT_DEVICE))) - ("platform", "Index of the platform that has to be used. If not given "\ - "you will be asked which platform to use if there are multiple "\ - "platforms available.", - cxxopts::value()->default_value(std::to_string(DEFAULT_PLATFORM))) - ("h,help", "Print this help"); - cxxopts::ParseResult result = options.parse(argc, argv); - - if (result.count("h")) { - // Just print help when argument is given - std::cout << options.help() << std::endl; - exit(0); - } - // Check parsed options and handle special cases - if (result.count("f") <= 0) { - // Path to the kernel file is mandatory - exit if not given! - std::cerr << "Kernel file must be given! 
Aborting" << std::endl; - std::cout << options.help() << std::endl; - exit(1); - } - - // Create program settings from program arguments - std::shared_ptr sharedSettings( - new ProgramSettings{result["n"].as(), - result["s"].as(), - result["r"].as(), -#ifdef INTEL_FPGA - static_cast(result.count("i")), -#else - false, -#endif - result["platform"].as(), - result["device"].as(), - result["f"].as(), - static_cast(result.count("single-kernel"))}); - return sharedSettings; -} - +#include "program_settings.h" +#include "setup/common_benchmark_io.hpp" /** Prints the execution results to stdout @@ -135,37 +67,6 @@ printResults(std::shared_ptr results) { } -/** - * Prints the used configuration to std out before starting the actual benchmark. - * - * @param programSettings The program settings retrieved from the command line - * @param device The device used for execution - */ -void printFinalConfiguration(const std::shared_ptr &programSettings, - const cl::Device &device) {// Give setup summary - std::cout << PROGRAM_DESCRIPTION << std::endl; - std::cout << "Version: " << VERSION << std::endl << HLINE; - std::cout << "Summary:" << std::endl - << "Array Size: " - << static_cast(programSettings->streamArraySize * sizeof(HOST_DATA_TYPE)) << " Byte" - << std::endl - << "Data Type: " << STR(HOST_DATA_TYPE) - << std::endl - << "Repetitions: " << programSettings->numRepetitions - << std::endl - << "Kernel Replications: " << programSettings->kernelReplications - << std::endl - << "Kernel Type: " << (programSettings->useSingleKernel ? "Single" : "Separate") - << std::endl - << "Kernel File: " << programSettings->kernelFileName - << std::endl; - std::cout << "Device: " - << device.getInfo() << std::endl; - std::cout << HLINE - << "Start benchmark using the given configuration." 
<< std::endl - << HLINE; -} - void generateInputData(HOST_DATA_TYPE* A, HOST_DATA_TYPE* B, HOST_DATA_TYPE* C, unsigned array_size) { for (int i=0; i< array_size; i++) { diff --git a/STREAM/src/host/stream_functionality.hpp b/STREAM/src/host/stream_functionality.hpp index 39db6a59..41e7e066 100644 --- a/STREAM/src/host/stream_functionality.hpp +++ b/STREAM/src/host/stream_functionality.hpp @@ -33,45 +33,6 @@ SOFTWARE. #include "setup/fpga_setup.hpp" #include "parameters.h" -/* -Short description of the program. -Moreover the version and build time is also compiled into the description. -*/ -#define STR_EXPAND(tok) #tok -#define STR(tok) STR_EXPAND(tok) - -#define PROGRAM_DESCRIPTION "Implementation of the STREAM benchmark"\ - " proposed in the HPCC benchmark suite for FPGA.\n"\ - "Version: " VERSION "\n" - -#define ENTRY_SPACE 15 - -struct ProgramSettings { - uint numRepetitions; - uint streamArraySize; - uint kernelReplications; - bool useMemoryInterleaving; - int defaultPlatform; - int defaultDevice; - std::string kernelFileName; - bool useSingleKernel; -}; - - -/** -Parses and returns program options using the cxxopts library. -Supports the following parameters: - - file name of the FPGA kernel file (-f,--file) - - number of repetitions (-n) - - number of kernel replications (-r) - - data size (-d) - - use memory interleaving -@see https://github.com/jarro2783/cxxopts - -@return program settings that are created from the given program arguments -*/ -std::shared_ptr -parseProgramParameters(int argc, char *argv[]); /** Prints the execution results to stdout @@ -81,15 +42,6 @@ Prints the execution results to stdout void printResults(std::shared_ptr results); -/** - * Prints the used configuration to std out before starting the actual benchmark. 
- * - * @param programSettings The program settings retrieved from the command line - * @param device The device used for execution - */ -void printFinalConfiguration(const std::shared_ptr &programSettings, - const cl::Device &device); - /** * Fill the data buffer with random number using the mersenne twister engine with diff --git a/STREAM/tests/CMakeLists.txt b/STREAM/tests/CMakeLists.txt index 2151c733..f9600de3 100755 --- a/STREAM/tests/CMakeLists.txt +++ b/STREAM/tests/CMakeLists.txt @@ -3,27 +3,29 @@ add_subdirectory(../../extern/googletest ${CMAKE_CURRENT_BINARY_DIR}/lib) include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) include_directories(${CMAKE_BINARY_DIR}/src/common ../../extern/cxxopts/include ../../shared) +include_directories(${CMAKE_SOURCE_DIR}/src/host/) -set(PROJECT_SOURCES ../../shared/setup/fpga_setup.cpp ../src/host/execution_default.cpp ../src/host/stream_functionality.cpp) -set(TEST_SOURCES ../../shared/setup/test_fpga_setup.cpp test_kernel_functionality_and_host_integration.cpp) +set(PROJECT_SOURCES ../../shared/setup/fpga_setup.cpp ../src/host/execution_default.cpp ../src/host/common_benchmark_io_implementation.cpp ../src/host/stream_functionality.cpp) +set(TEST_SOURCES ../../shared/setup/test_fpga_setup.cpp test_kernel_functionality_and_host_integration.cpp ../../shared/testing/main.cpp) if (INTELFPGAOPENCL_FOUND) include_directories(SYSTEM ${IntelFPGAOpenCL_INCLUDE_DIRS}) add_executable(Test_intel ${TEST_SOURCES} ${PROJECT_SOURCES}) - target_link_libraries(Test_intel gtest gmock gtest_main ${IntelFPGAOpenCL_LIBRARIES} "${OpenMP_CXX_FLAGS}") + target_link_libraries(Test_intel gtest gmock ${IntelFPGAOpenCL_LIBRARIES} "${OpenMP_CXX_FLAGS}") add_dependencies(Test_intel stream_kernels_emulate_intel stream_kernels_single_emulate_intel) target_compile_definitions(Test_intel PRIVATE -DINTEL_FPGA) target_compile_options(Test_intel PRIVATE "${OpenMP_CXX_FLAGS}") - add_test(NAME test_intel_unit COMMAND $ WORKING_DIRECTORY 
${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_intel_unit COMMAND $ -f stream_kernels_emulate.aocx WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_intel_single_unit COMMAND $ -f stream_kernels_single_emulate.aocx --single-kernel WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() if (Vitis_FOUND) include_directories(SYSTEM ${Vitis_INCLUDE_DIRS}) add_executable(Test_xilinx ${TEST_SOURCES} ${PROJECT_SOURCES}) - target_link_libraries(Test_xilinx gtest gmock gtest_main ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") + target_link_libraries(Test_xilinx gtest gmock ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") add_dependencies(Test_xilinx stream_kernels_single_emulate_xilinx) target_compile_definitions(Test_xilinx PRIVATE -DXILINX_FPGA) target_compile_options(Test_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") - add_test(NAME test_xilinx_unit COMMAND $ WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_xilinx_single_unit COMMAND $ -f stream_kernels_single_emulate.xclbin --single-kernel WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() \ No newline at end of file diff --git a/STREAM/tests/test_kernel_functionality_and_host_integration.cpp b/STREAM/tests/test_kernel_functionality_and_host_integration.cpp index 2019a7ba..d03e573d 100644 --- a/STREAM/tests/test_kernel_functionality_and_host_integration.cpp +++ b/STREAM/tests/test_kernel_functionality_and_host_integration.cpp @@ -5,10 +5,11 @@ #include "parameters.h" #include "../src/host/execution.h" #include "setup/fpga_setup.hpp" +#include "testing/test_program_settings.h" #include "../src/host/stream_functionality.hpp" -struct OpenCLKernelTest : testing::Test { +struct OpenCLKernelTest :public ::testing::Test { HOST_DATA_TYPE *A; HOST_DATA_TYPE *B; HOST_DATA_TYPE *C; @@ -25,10 +26,20 @@ struct OpenCLKernelTest : testing::Test { sizeof(HOST_DATA_TYPE) * array_size); } - void setupFPGA(std::string kernelFileName, bool is_single_kernel) { - std::vector device = 
fpga_setup::selectFPGADevice(DEFAULT_PLATFORM, DEFAULT_DEVICE); + void SetUp( ) { + std::cout << programSettings << std::endl; + setupFPGA(programSettings); + } + + void setupFPGA(std::shared_ptr settings) { + // Redirect stout buffer to local buffer to make checks possible + // std::stringstream newStdOutBuffer; + // std::streambuf *oldStdOutBuffer = std::cout.rdbuf(); + // std::cout.rdbuf(newStdOutBuffer.rdbuf()); + + std::vector device = fpga_setup::selectFPGADevice(settings->defaultPlatform, settings->defaultDevice); cl::Context context(device[0]); - cl::Program program = fpga_setup::fpgaSetup(&context, device, &kernelFileName); + cl::Program program = fpga_setup::fpgaSetup(&context, device, &settings->kernelFileName); config = std::make_shared( bm_execution::ExecutionConfiguration{ context, device[0], program, @@ -36,10 +47,13 @@ struct OpenCLKernelTest : testing::Test { NUM_KERNEL_REPLICATIONS, array_size, false, - is_single_kernel + settings->useSingleKernel }); HOST_DATA_TYPE norm; generateInputData(A, B, C, array_size); + + // Redirect stdout to old buffer + // std::cout.rdbuf(oldStdOutBuffer); } ~OpenCLKernelTest() override { @@ -49,20 +63,11 @@ struct OpenCLKernelTest : testing::Test { } }; -struct DifferentOpenCLKernelTest : OpenCLKernelTest, testing::WithParamInterface> { - DifferentOpenCLKernelTest() { - auto params = GetParam(); - auto kernel_file = std::get<0>(params); - bool is_single_kernel = std::get<1>(params); - setupFPGA(kernel_file, is_single_kernel); - } -}; - /** * Execution returns correct results for a single repetition */ -TEST_P(DifferentOpenCLKernelTest, FPGACorrectResultsOneRepetition) { +TEST_F(OpenCLKernelTest, FPGACorrectResultsOneRepetition) { auto result = bm_execution::calculate(config, A, B, C); for (int i = 0; i < array_size; i++) { @@ -75,7 +80,7 @@ TEST_P(DifferentOpenCLKernelTest, FPGACorrectResultsOneRepetition) { /** * Execution returns correct results for three repetitions */ -TEST_P(DifferentOpenCLKernelTest, 
FPGACorrectResultsThreeRepetition) { +TEST_F(OpenCLKernelTest, FPGACorrectResultsThreeRepetition) { config->repetitions = 3; auto result = bm_execution::calculate(config, A, B, C); for (int i = 0; i < array_size; i++) { @@ -84,17 +89,3 @@ TEST_P(DifferentOpenCLKernelTest, FPGACorrectResultsThreeRepetition) { EXPECT_FLOAT_EQ(C[i], 1800.0); } } - - -#ifdef INTEL_FPGA -INSTANTIATE_TEST_CASE_P(Default, DifferentOpenCLKernelTest, - testing::Values(std::make_tuple("stream_kernels_emulate.aocx", false), - std::make_tuple("stream_kernels_single_emulate.aocx", true)) -); -#endif - -#ifdef XILINX_FPGA -INSTANTIATE_TEST_CASE_P(Default, DifferentOpenCLKernelTest, - testing::Values(std::make_tuple("stream_kernels_single_emulate.xclbin", true)) -); -#endif \ No newline at end of file From 26c0f242ecb152601a0d60c5029f4b45bdb9e835 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 11 May 2020 19:37:37 +0200 Subject: [PATCH 03/45] Fix tests for STREAM --- ...nel_functionality_and_host_integration.cpp | 8 +++---- shared/setup/test_fpga_setup.cpp | 23 +++++++++++-------- shared/testing/main.cpp | 4 ++-- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/STREAM/tests/test_kernel_functionality_and_host_integration.cpp b/STREAM/tests/test_kernel_functionality_and_host_integration.cpp index d03e573d..74d52b1b 100644 --- a/STREAM/tests/test_kernel_functionality_and_host_integration.cpp +++ b/STREAM/tests/test_kernel_functionality_and_host_integration.cpp @@ -33,9 +33,9 @@ struct OpenCLKernelTest :public ::testing::Test { void setupFPGA(std::shared_ptr settings) { // Redirect stout buffer to local buffer to make checks possible - // std::stringstream newStdOutBuffer; - // std::streambuf *oldStdOutBuffer = std::cout.rdbuf(); - // std::cout.rdbuf(newStdOutBuffer.rdbuf()); + std::stringstream newStdOutBuffer; + std::streambuf *oldStdOutBuffer = std::cout.rdbuf(); + std::cout.rdbuf(newStdOutBuffer.rdbuf()); std::vector device = 
fpga_setup::selectFPGADevice(settings->defaultPlatform, settings->defaultDevice); cl::Context context(device[0]); @@ -53,7 +53,7 @@ struct OpenCLKernelTest :public ::testing::Test { generateInputData(A, B, C, array_size); // Redirect stdout to old buffer - // std::cout.rdbuf(oldStdOutBuffer); + std::cout.rdbuf(oldStdOutBuffer); } ~OpenCLKernelTest() override { diff --git a/shared/setup/test_fpga_setup.cpp b/shared/setup/test_fpga_setup.cpp index 6d8cd5be..c1842351 100644 --- a/shared/setup/test_fpga_setup.cpp +++ b/shared/setup/test_fpga_setup.cpp @@ -1,15 +1,20 @@ // // Created by Marius Meyer on 04.12.19 // + + #include "gtest/gtest.h" #include "setup/fpga_setup.hpp" #include "parameters.h" +#include "testing/test_program_settings.h" +#include "gmock/gmock.h" + /** * Check if it is possible to find the platform and device that are given as default */ TEST (FPGASetup, FindValidPlatformAndDevice) { - EXPECT_EQ (1, fpga_setup::selectFPGADevice(DEFAULT_PLATFORM, DEFAULT_DEVICE).size()); + EXPECT_EQ (1, fpga_setup::selectFPGADevice(programSettings->defaultPlatform, programSettings->defaultDevice).size()); } /** @@ -17,10 +22,11 @@ TEST (FPGASetup, FindValidPlatformAndDevice) { */ TEST (FPGASetup, FindNonExistingPlatform) { testing::FLAGS_gtest_death_test_style="threadsafe"; - // TODO regex does not work so for now its not tested! - EXPECT_EXIT(fpga_setup::selectFPGADevice(DEFAULT_PLATFORM + 100, DEFAULT_DEVICE), + std::stringstream fmt; + fmt << "Default platform " << programSettings->defaultPlatform + 100 << " can not be used. Available platforms: " ; + EXPECT_EXIT(fpga_setup::selectFPGADevice(programSettings->defaultPlatform + 100, programSettings->defaultDevice), ::testing::ExitedWithCode(1), - ::testing::MatchesRegex(".*?Default platform \\d+ can not be used. 
Available platforms: \\d+")); + ::testing::StartsWith(fmt.str())); } /** @@ -28,10 +34,9 @@ TEST (FPGASetup, FindNonExistingPlatform) { */ TEST (FPGASetup, FindNonExistingDevice) { testing::FLAGS_gtest_death_test_style="threadsafe"; - // TODO regex does not work so for now its not tested! - EXPECT_EXIT(fpga_setup::selectFPGADevice(DEFAULT_PLATFORM, DEFAULT_DEVICE + 100), + std::stringstream fmt; + fmt << "Default device " << programSettings->defaultDevice + 100 << " can not be used. Available devices: " ; + EXPECT_EXIT(fpga_setup::selectFPGADevice(programSettings->defaultPlatform, programSettings->defaultDevice + 100), ::testing::ExitedWithCode(1), - ::testing::MatchesRegex(".*?Default device \\d+ can not be used. Available devices: \\d+")); + ::testing::StartsWith(fmt.str())); } - - diff --git a/shared/testing/main.cpp b/shared/testing/main.cpp index 819cf53a..0de74d6a 100644 --- a/shared/testing/main.cpp +++ b/shared/testing/main.cpp @@ -39,6 +39,8 @@ main(int argc, char *argv[]) { std::cout << "THIS BINARY EXECUTES UNIT TESTS FOR THE FOLLOWING BENCHMARK:" << std::endl << std::endl; + ::testing::InitGoogleTest(&argc, argv); + // Parse input parameters programSettings = parseProgramParameters(argc, argv); fpga_setup::setupEnvironmentAndClocks(); @@ -50,8 +52,6 @@ main(int argc, char *argv[]) { // Print input parameters printFinalConfiguration(programSettings, usedDevice[0]); - ::testing::InitGoogleTest(); - return RUN_ALL_TESTS(); From b67018837063e1acfe0699b9fbb0071671eb1f58 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 12 May 2020 16:36:06 +0200 Subject: [PATCH 04/45] Add MPI to shared test main --- shared/testing/main.cpp | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/shared/testing/main.cpp b/shared/testing/main.cpp index 0de74d6a..9cbe8554 100644 --- a/shared/testing/main.cpp +++ b/shared/testing/main.cpp @@ -29,6 +29,21 @@ SOFTWARE. 
#include "gtest/gtest.h" #include "CL/cl.hpp" +#ifdef _USE_MPI_ +#include "mpi.h" + +class MPIEnvironment : public ::testing::Environment { +public: + MPIEnvironment(int* argc, char** argv[]) { + MPI_Init(argc, argv); + } + + ~MPIEnvironment() override { + MPI_Finalize(); + } +}; +#endif + std::shared_ptr programSettings; /** @@ -41,6 +56,11 @@ main(int argc, char *argv[]) { ::testing::InitGoogleTest(&argc, argv); +#ifdef _USE_MPI_ + ::testing::Environment* const mpi_env = + ::testing::AddGlobalTestEnvironment(new MPIEnvironment(&argc, &argv)); +#endif + // Parse input parameters programSettings = parseProgramParameters(argc, argv); fpga_setup::setupEnvironmentAndClocks(); @@ -54,6 +74,5 @@ main(int argc, char *argv[]) { return RUN_ALL_TESTS(); - } From 3c733ed9ad12a9de946cebf8737f18cf74ebe82c Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 12 May 2020 17:08:54 +0200 Subject: [PATCH 05/45] b_eff port to new test main --- b_eff/src/host/CMakeLists.txt | 3 +- .../common_benchmark_io_implementation.cpp | 80 +++++++++++++++++++ b_eff/src/host/main.cpp | 2 + b_eff/src/host/network_functionality.cpp | 78 +----------------- b_eff/src/host/network_functionality.hpp | 45 ----------- b_eff/src/host/program_settings.h | 31 +++++++ b_eff/tests/CMakeLists.txt | 9 ++- b_eff/tests/setup_mpi.cpp | 20 ----- ...nel_functionality_and_host_integration.cpp | 49 ++++-------- 9 files changed, 135 insertions(+), 182 deletions(-) create mode 100644 b_eff/src/host/common_benchmark_io_implementation.cpp create mode 100644 b_eff/src/host/program_settings.h delete mode 100644 b_eff/tests/setup_mpi.cpp diff --git a/b_eff/src/host/CMakeLists.txt b/b_eff/src/host/CMakeLists.txt index 867ffcb9..65fae838 100755 --- a/b_eff/src/host/CMakeLists.txt +++ b/b_eff/src/host/CMakeLists.txt @@ -3,8 +3,9 @@ include_directories(../../../extern/cxxopts/include ../../../shared) include_directories(${IntelFPGAOpenCL_INCLUDE_DIRS}) include_directories(${MPI_CXX_INCLUDE_PATH}) 
include_directories(${CMAKE_BINARY_DIR}/src/common) +include_directories(${CMAKE_SOURCE_DIR}/../shared/setup .) -set(HOST_SOURCE execution_default.cpp main.cpp ../../../shared/setup/fpga_setup.cpp network_functionality.cpp) +set(HOST_SOURCE execution_default.cpp main.cpp common_benchmark_io_implementation.cpp ../../../shared/setup/fpga_setup.cpp network_functionality.cpp) add_executable(fnet ${HOST_SOURCE}) target_link_libraries(fnet ${IntelFPGAOpenCL_LIBRARIES} ${MPI_LIBRARIES}) diff --git a/b_eff/src/host/common_benchmark_io_implementation.cpp b/b_eff/src/host/common_benchmark_io_implementation.cpp new file mode 100644 index 00000000..20d8354c --- /dev/null +++ b/b_eff/src/host/common_benchmark_io_implementation.cpp @@ -0,0 +1,80 @@ + +#include "cxxopts.hpp" +#include "parameters.h" +#include "setup/common_benchmark_io.hpp" + +/** +Parses and returns program options using the cxxopts library. +Supports the following parameters: + - file name of the FPGA kernel file (-f,--file) + - number of repetitions (-n) + - number of kernel replications (-r) + - data size (-d) + - use memory interleaving +@see https://github.com/jarro2783/cxxopts + +@return program settings that are created from the given program arguments +*/ +std::shared_ptr +parseProgramParameters(int argc, char *argv[]) { + // Defining and parsing program options + cxxopts::Options options(argv[0], PROGRAM_DESCRIPTION); + options.add_options() + ("f,file", "Kernel file name", cxxopts::value()) + ("n", "Number of repetitions", + cxxopts::value()->default_value(std::to_string(DEFAULT_REPETITIONS))) + ("l", "Inital looplength of Kernel", + cxxopts::value()->default_value(std::to_string(1u << 15u))) + ("device", "Index of the device that has to be used. If not given you "\ + "will be asked which device to use if there are multiple devices "\ + "available.", cxxopts::value()->default_value(std::to_string(DEFAULT_DEVICE))) + ("platform", "Index of the platform that has to be used. 
If not given "\ + "you will be asked which platform to use if there are multiple "\ + "platforms available.", + cxxopts::value()->default_value(std::to_string(DEFAULT_PLATFORM))) + ("h,help", "Print this help"); + cxxopts::ParseResult result = options.parse(argc, argv); + + if (result.count("h")) { + // Just print help when argument is given + std::cout << options.help() << std::endl; + exit(0); + } + // Check parsed options and handle special cases + if (result.count("f") <= 0) { + // Path to the kernel file is mandatory - exit if not given! + std::cerr << "Kernel file must be given! Aborting" << std::endl; + std::cout << options.help() << std::endl; + exit(1); + } + + // Create program settings from program arguments + std::shared_ptr sharedSettings( + new ProgramSettings{result["n"].as(), + result["l"].as(), + result["platform"].as(), + result["device"].as(), + result["f"].as()}); + return sharedSettings; +} + +/** + * Prints the used configuration to std out before starting the actual benchmark. + * + * @param programSettings The program settings retrieved from the command line + * @param device The device used for execution + */ +void printFinalConfiguration(const std::shared_ptr &programSettings, + const cl::Device &device) {// Give setup summary + std::cout << PROGRAM_DESCRIPTION << std::endl << HLINE; + std::cout << "Summary:" << std::endl + << "Repetitions: " << programSettings->numRepetitions + << std::endl + << "Kernel file: " << programSettings->kernelFileName + << std::endl; + std::cout << "Device: " + << device.getInfo() << std::endl; + std::cout << HLINE + << "Start benchmark using the given configuration." 
<< std::endl + << HLINE; +} diff --git a/b_eff/src/host/main.cpp b/b_eff/src/host/main.cpp index b2d8ce20..c3876dcb 100644 --- a/b_eff/src/host/main.cpp +++ b/b_eff/src/host/main.cpp @@ -5,6 +5,8 @@ #include #include "network_functionality.hpp" +#include "setup/common_benchmark_io.hpp" +#include "program_settings.h" #include "mpi.h" /** diff --git a/b_eff/src/host/network_functionality.cpp b/b_eff/src/host/network_functionality.cpp index 9ef28315..9f67cd60 100644 --- a/b_eff/src/host/network_functionality.cpp +++ b/b_eff/src/host/network_functionality.cpp @@ -34,64 +34,9 @@ SOFTWARE. #include "execution.h" #include "cxxopts.hpp" #include "setup/fpga_setup.hpp" +#include "setup/common_benchmark_io.hpp" #include "parameters.h" -/** -Parses and returns program options using the cxxopts library. -Supports the following parameters: - - file name of the FPGA kernel file (-f,--file) - - number of repetitions (-n) - - number of kernel replications (-r) - - data size (-d) - - use memory interleaving -@see https://github.com/jarro2783/cxxopts - -@return program settings that are created from the given program arguments -*/ -std::shared_ptr -parseProgramParameters(int argc, char *argv[]) { - // Defining and parsing program options - cxxopts::Options options(argv[0], PROGRAM_DESCRIPTION); - options.add_options() - ("f,file", "Kernel file name", cxxopts::value()) - ("n", "Number of repetitions", - cxxopts::value()->default_value(std::to_string(DEFAULT_REPETITIONS))) - ("l", "Inital looplength of Kernel", - cxxopts::value()->default_value(std::to_string(1u << 15u))) - ("device", "Index of the device that has to be used. If not given you "\ - "will be asked which device to use if there are multiple devices "\ - "available.", cxxopts::value()->default_value(std::to_string(DEFAULT_DEVICE))) - ("platform", "Index of the platform that has to be used. 
If not given "\ - "you will be asked which platform to use if there are multiple "\ - "platforms available.", - cxxopts::value()->default_value(std::to_string(DEFAULT_PLATFORM))) - ("h,help", "Print this help"); - cxxopts::ParseResult result = options.parse(argc, argv); - - if (result.count("h")) { - // Just print help when argument is given - std::cout << options.help() << std::endl; - exit(0); - } - // Check parsed options and handle special cases - if (result.count("f") <= 0) { - // Path to the kernel file is mandatory - exit if not given! - std::cerr << "Kernel file must be given! Aborting" << std::endl; - std::cout << options.help() << std::endl; - exit(1); - } - - // Create program settings from program arguments - std::shared_ptr sharedSettings( - new ProgramSettings{result["n"].as(), - result["l"].as(), - result["platform"].as(), - result["device"].as(), - result["f"].as()}); - return sharedSettings; -} - - /** Prints the execution results to stdout @@ -146,27 +91,6 @@ printResults(bm_execution::CollectedResultMap results) { } -/** - * Prints the used configuration to std out before starting the actual benchmark. - * - * @param programSettings The program settings retrieved from the command line - * @param device The device used for execution - */ -void printFinalConfiguration(const std::shared_ptr &programSettings, - const cl::Device &device) {// Give setup summary - std::cout << PROGRAM_DESCRIPTION << std::endl << HLINE; - std::cout << "Summary:" << std::endl - << "Repetitions: " << programSettings->numRepetitions - << std::endl - << "Kernel file: " << programSettings->kernelFileName - << std::endl; - std::cout << "Device: " - << device.getInfo() << std::endl; - std::cout << HLINE - << "Start benchmark using the given configuration." 
<< std::endl - << HLINE; -} - std::vector getMessageSizes() { std::vector sizes; for (uint i = 0; i < 13; i++) { diff --git a/b_eff/src/host/network_functionality.hpp b/b_eff/src/host/network_functionality.hpp index d1fc797c..cbcd3427 100644 --- a/b_eff/src/host/network_functionality.hpp +++ b/b_eff/src/host/network_functionality.hpp @@ -32,42 +32,6 @@ SOFTWARE. #include "setup/fpga_setup.hpp" #include "parameters.h" -/* -Short description of the program. -Moreover the version and build time is also compiled into the description. -*/ -#define STR_EXPAND(tok) #tok -#define STR(tok) STR_EXPAND(tok) - -#define PROGRAM_DESCRIPTION "Implementation of the effective bandwidth benchmark"\ - " proposed in the HPCC benchmark suite for FPGA.\n"\ - "Version: " STR(VERSION) - -#define ENTRY_SPACE 13 - -struct ProgramSettings { - uint numRepetitions; - uint looplength; - int defaultPlatform; - int defaultDevice; - std::string kernelFileName; -}; - - -/** -Parses and returns program options using the cxxopts library. -Supports the following parameters: - - file name of the FPGA kernel file (-f,--file) - - number of repetitions (-n) - - number of kernel replications (-r) - - data size (-d) - - use memory interleaving -@see https://github.com/jarro2783/cxxopts - -@return program settings that are created from the given program arguments -*/ -std::shared_ptr -parseProgramParameters(int argc, char *argv[]); /** Prints the execution results to stdout @@ -77,15 +41,6 @@ Prints the execution results to stdout void printResults(bm_execution::CollectedResultMap results); -/** - * Prints the used configuration to std out before starting the actual benchmark. 
- * - * @param programSettings The program settings retrieved from the command line - * @param device The device used for execution - */ -void printFinalConfiguration(const std::shared_ptr &programSettings, - const cl::Device &device); - std::vector getMessageSizes(); diff --git a/b_eff/src/host/program_settings.h b/b_eff/src/host/program_settings.h new file mode 100644 index 00000000..b5e7c0e3 --- /dev/null +++ b/b_eff/src/host/program_settings.h @@ -0,0 +1,31 @@ + +#ifndef SRC_HOST_PROGRAM_SETTINGS_H_ +#define SRC_HOST_PROGRAM_SETTINGS_H_ + +#include "parameters.h" + +/* C++ standard library headers */ +#include + +#include "CL/opencl.h" + +/* +Short description of the program. +Moreover the version and build time is also compiled into the description. +*/ + +#define PROGRAM_DESCRIPTION "Implementation of the effective bandwidth benchmark"\ + " proposed in the HPCC benchmark suite for FPGA.\n"\ + "Version: " VERSION "\n" + + +struct ProgramSettings { + uint numRepetitions; + uint looplength; + int defaultPlatform; + int defaultDevice; + std::string kernelFileName; +}; + + +#endif diff --git a/b_eff/tests/CMakeLists.txt b/b_eff/tests/CMakeLists.txt index ea71904f..d423fb66 100755 --- a/b_eff/tests/CMakeLists.txt +++ b/b_eff/tests/CMakeLists.txt @@ -5,12 +5,13 @@ include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) include_directories(${CMAKE_BINARY_DIR}/src/common ../../extern/cxxopts/include ../../shared) include_directories(SYSTEM ${IntelFPGAOpenCL_INCLUDE_DIRS}) include_directories(${MPI_CXX_INCLUDE_PATH}) +include_directories(${CMAKE_SOURCE_DIR}/src/host) -set(PROJECT_SOURCES ../../shared/setup/fpga_setup.cpp ../src/host/execution_default.cpp) -set(TEST_SOURCES ../../shared/setup/test_fpga_setup.cpp test_kernel_functionality_and_host_integration.cpp setup_mpi.cpp) +set(PROJECT_SOURCES ../../shared/setup/fpga_setup.cpp ../src/host/common_benchmark_io_implementation.cpp ../src/host/execution_default.cpp) +set(TEST_SOURCES 
../../shared/testing/main.cpp ../../shared/setup/test_fpga_setup.cpp test_kernel_functionality_and_host_integration.cpp) add_executable(Google_Tests_run ${TEST_SOURCES} ${PROJECT_SOURCES}) -target_link_libraries(Google_Tests_run gtest gmock gtest_main ${IntelFPGAOpenCL_LIBRARIES} ${MPI_LIBRARIES}) +target_link_libraries(Google_Tests_run gtest gmock ${IntelFPGAOpenCL_LIBRARIES} ${MPI_LIBRARIES}) add_dependencies(Google_Tests_run communication_bw520n_emulate_intel) target_compile_options(Google_Tests_run PRIVATE -D_USE_MPI_) -add_test(NAME test_intel_unit COMMAND $ WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) \ No newline at end of file +add_test(NAME test_intel_unit COMMAND $ -f communication_bw520n_emulate.aocx WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) \ No newline at end of file diff --git a/b_eff/tests/setup_mpi.cpp b/b_eff/tests/setup_mpi.cpp deleted file mode 100644 index b73314f0..00000000 --- a/b_eff/tests/setup_mpi.cpp +++ /dev/null @@ -1,20 +0,0 @@ -#include "gtest/gtest.h" -#include "mpi.h" - -class MPIEnvironment : public ::testing::Environment { -public: - ~MPIEnvironment() override {} - - // Override this to define how to set up the environment. - void SetUp() override { - MPI_Init(NULL, NULL); - } - - // Override this to define how to tear down the environment. 
- void TearDown() override { - MPI_Finalize(); - } -}; - -::testing::Environment* const mpi_env = - ::testing::AddGlobalTestEnvironment(new MPIEnvironment); \ No newline at end of file diff --git a/b_eff/tests/test_kernel_functionality_and_host_integration.cpp b/b_eff/tests/test_kernel_functionality_and_host_integration.cpp index fb7ebcf3..91c41472 100644 --- a/b_eff/tests/test_kernel_functionality_and_host_integration.cpp +++ b/b_eff/tests/test_kernel_functionality_and_host_integration.cpp @@ -4,11 +4,12 @@ #include #include "gtest/gtest.h" -#include "../src/host/execution.h" +#include "execution.h" #include "parameters.h" #include "setup/fpga_setup.hpp" #include "unistd.h" #include "mpi.h" +#include "testing/test_program_settings.h" #include struct OpenCLKernelTest : testing::Test { @@ -19,6 +20,11 @@ struct OpenCLKernelTest : testing::Test { std::string channelOutName = "kernel_output_ch"; std::string channelInName = "kernel_input_ch"; + OpenCLKernelTest() { + kernelFileName = programSettings->kernelFileName; + setupFPGA(); + } + void createChannelFilesAndSymbolicLinks() { for (int i=0; i < numberOfChannels; i++) { std::string fname = channelOutName + std::to_string(i); @@ -32,7 +38,7 @@ struct OpenCLKernelTest : testing::Test { void setupFPGA() { createChannelFilesAndSymbolicLinks(); - std::vector device = fpga_setup::selectFPGADevice(DEFAULT_PLATFORM, DEFAULT_DEVICE); + std::vector device = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, programSettings->defaultDevice); cl::Context context(device[0]); cl::Program program = fpga_setup::fpgaSetup(&context, device, &kernelFileName); config = std::make_shared( @@ -43,32 +49,10 @@ struct OpenCLKernelTest : testing::Test { } }; -/** - * Parametrized test takes a tuple of 4 parameters: - * - name of the emulation bitstream - * - number of channels - * - name of the external output channel descriptors - * - name of the external input channel descriptors - */ -struct DifferentOpenCLKernelTest : 
OpenCLKernelTest, testing::WithParamInterface> { - DifferentOpenCLKernelTest() { - auto params = GetParam(); - kernelFileName = std::get<0>(params); - numberOfChannels = std::get<1>(params); - channelOutName = std::get<2>(params); - channelInName = std::get<3>(params); - setupFPGA(); - } - - ~DifferentOpenCLKernelTest() { - } -}; - - /** * Tests if calculate returns the correct execution results */ -TEST_P(DifferentOpenCLKernelTest, CalculateReturnsCorrectExecutionResultFor111) { +TEST_F(OpenCLKernelTest, CalculateReturnsCorrectExecutionResultFor111) { config->repetitions = 1; auto result = bm_execution::calculate(config, 1,1); EXPECT_EQ(1, result->messageSize); @@ -79,7 +63,7 @@ TEST_P(DifferentOpenCLKernelTest, CalculateReturnsCorrectExecutionResultFor111) /** * Tests if calculate returns the correct execution results for multiple repetitions */ -TEST_P(DifferentOpenCLKernelTest, CalculateReturnsCorrectExecutionResultFor842) { +TEST_F(OpenCLKernelTest, CalculateReturnsCorrectExecutionResultFor842) { config->repetitions = 2; auto result = bm_execution::calculate(config, 8,4); EXPECT_EQ(8, result->messageSize); @@ -90,7 +74,7 @@ TEST_P(DifferentOpenCLKernelTest, CalculateReturnsCorrectExecutionResultFor842) /** * Tests if data is written to the channels for small message sizes */ -TEST_P(DifferentOpenCLKernelTest, DataIsWrittenToChannelForMessageSizeFillingOneChannel) { +TEST_F(OpenCLKernelTest, DataIsWrittenToChannelForMessageSizeFillingOneChannel) { config->repetitions = 1; const unsigned messageSize = CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE); const unsigned looplength = 4; @@ -113,7 +97,7 @@ TEST_P(DifferentOpenCLKernelTest, DataIsWrittenToChannelForMessageSizeFillingOne /** * Tests if data is written to the channels for small message sizes filling two channels */ -TEST_P(DifferentOpenCLKernelTest, DataIsWrittenToChannelForMessageSizeFillingTwoChannels) { +TEST_F(OpenCLKernelTest, DataIsWrittenToChannelForMessageSizeFillingTwoChannels) { config->repetitions = 1; 
const unsigned messageSize = 2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE); const unsigned looplength = 4; @@ -133,7 +117,7 @@ TEST_P(DifferentOpenCLKernelTest, DataIsWrittenToChannelForMessageSizeFillingTwo /** * Tests if data is written to the channels for message sizes filling more than two channels */ -TEST_P(DifferentOpenCLKernelTest, DataIsWrittenToChannelForMessageSizeFillingMoreThanTwoChannels) { +TEST_F(OpenCLKernelTest, DataIsWrittenToChannelForMessageSizeFillingMoreThanTwoChannels) { config->repetitions = 1; const unsigned messageSize = 4 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE); const unsigned looplength = 1; @@ -153,7 +137,7 @@ TEST_P(DifferentOpenCLKernelTest, DataIsWrittenToChannelForMessageSizeFillingMor /** * Tests if correct data is written to the channels */ -TEST_P(DifferentOpenCLKernelTest, CorrectDataIsWrittenToChannel) { +TEST_F(OpenCLKernelTest, CorrectDataIsWrittenToChannel) { config->repetitions = 1; const unsigned messageSize = 2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE); const unsigned looplength = 4; @@ -171,8 +155,3 @@ TEST_P(DifferentOpenCLKernelTest, CorrectDataIsWrittenToChannel) { } delete [] buffer; } - - - -INSTANTIATE_TEST_CASE_P(Default, DifferentOpenCLKernelTest, - testing::Values(std::make_tuple("communication_bw520n_emulate.aocx", 4, "kernel_output_ch", "kernel_input_ch"))); From b8ab2d1941c078fd340479ffb48b81a77c5c3ec5 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 12 May 2020 17:19:18 +0200 Subject: [PATCH 06/45] Clean up STREAM tests --- STREAM/src/device/CMakeLists.txt | 12 +++++++----- .../src/host/common_benchmark_io_implementation.cpp | 1 - STREAM/tests/CMakeLists.txt | 1 - 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/STREAM/src/device/CMakeLists.txt b/STREAM/src/device/CMakeLists.txt index b4e79b8d..c7c40331 100644 --- a/STREAM/src/device/CMakeLists.txt +++ b/STREAM/src/device/CMakeLists.txt @@ -3,20 +3,22 @@ set(KERNEL_REPLICATION_ENABLED Yes) 
include(${CMAKE_SOURCE_DIR}/../cmake/kernelTargets.cmake) +math(EXPR test_size "2 * ${DEVICE_BUFFER_SIZE} * ${NUM_REPLICATIONS} * ${GLOBAL_MEM_UNROLL} * ${VECTOR_COUNT}") + if (INTELFPGAOPENCL_FOUND) generate_kernel_targets_intel(stream_kernels stream_kernels_single) - add_test(NAME test_emulation_intel COMMAND STREAM_FPGA_intel -f stream_kernels_emulate.aocx -n 1 + add_test(NAME test_emulation_intel COMMAND STREAM_FPGA_intel -f stream_kernels_emulate.aocx -n 1 -s ${test_size} WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) - add_test(NAME test_single_emulation_intel COMMAND STREAM_FPGA_intel -f stream_kernels_single_emulate.aocx --single-kernel -n 1 + add_test(NAME test_single_emulation_intel COMMAND STREAM_FPGA_intel -f stream_kernels_single_emulate.aocx --single-kernel -n 1 -s ${test_size} WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) - add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./STREAM_FPGA_intel -f stream_kernels_single_emulate.aocx --single-kernel -n 1 + add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./STREAM_FPGA_intel -s ${test_size} -f stream_kernels_single_emulate.aocx --single-kernel -n 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() if (VITIS_FOUND) generate_kernel_targets_xilinx(stream_kernels stream_kernels_single) - add_test(NAME test_single_emulation_xilinx COMMAND STREAM_FPGA_xilinx -f stream_kernels_single_emulate.xclbin --single-kernel -n 1 + add_test(NAME test_single_emulation_xilinx COMMAND STREAM_FPGA_xilinx -f stream_kernels_single_emulate.xclbin --single-kernel -n 1 -s ${test_size} WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) - add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./STREAM_FPGA_xilinx -f stream_kernels_single_emulate.xclbin --single-kernel -n 1 + add_test(NAME test_output_parsing_xilinx COMMAND 
${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./STREAM_FPGA_xilinx -s ${test_size} -f stream_kernels_single_emulate.xclbin --single-kernel -n 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() diff --git a/STREAM/src/host/common_benchmark_io_implementation.cpp b/STREAM/src/host/common_benchmark_io_implementation.cpp index b95ad3d5..0c149f31 100644 --- a/STREAM/src/host/common_benchmark_io_implementation.cpp +++ b/STREAM/src/host/common_benchmark_io_implementation.cpp @@ -80,7 +80,6 @@ parseProgramParameters(int argc, char *argv[]) { void printFinalConfiguration(const std::shared_ptr &programSettings, const cl::Device &device) {// Give setup summary std::cout << PROGRAM_DESCRIPTION << std::endl; - std::cout << "Version: " << VERSION << std::endl << HLINE; std::cout << "Summary:" << std::endl << "Array Size: " << static_cast(programSettings->streamArraySize * sizeof(HOST_DATA_TYPE)) << " Byte" diff --git a/STREAM/tests/CMakeLists.txt b/STREAM/tests/CMakeLists.txt index f9600de3..df89235e 100755 --- a/STREAM/tests/CMakeLists.txt +++ b/STREAM/tests/CMakeLists.txt @@ -5,7 +5,6 @@ include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) include_directories(${CMAKE_BINARY_DIR}/src/common ../../extern/cxxopts/include ../../shared) include_directories(${CMAKE_SOURCE_DIR}/src/host/) - set(PROJECT_SOURCES ../../shared/setup/fpga_setup.cpp ../src/host/execution_default.cpp ../src/host/common_benchmark_io_implementation.cpp ../src/host/stream_functionality.cpp) set(TEST_SOURCES ../../shared/setup/test_fpga_setup.cpp test_kernel_functionality_and_host_integration.cpp ../../shared/testing/main.cpp) From 1e3e3687071f7f92428fc38a38170e15fd4d1e6e Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 12 May 2020 17:19:49 +0200 Subject: [PATCH 07/45] Port FFT to new test main --- FFT/src/host/CMakeLists.txt | 3 +- .../common_benchmark_io_implementation.cpp | 88 +++++++++++++++++++ FFT/src/host/fft_functionality.cpp | 85 +----------------- 
FFT/src/host/fft_functionality.hpp | 48 ---------- FFT/src/host/main.cpp | 1 + FFT/src/host/program_settings.h | 30 +++++++ FFT/tests/CMakeLists.txt | 14 +-- FFT/tests/test_execution_functionality.cpp | 10 ++- 8 files changed, 136 insertions(+), 143 deletions(-) create mode 100644 FFT/src/host/common_benchmark_io_implementation.cpp create mode 100644 FFT/src/host/program_settings.h diff --git a/FFT/src/host/CMakeLists.txt b/FFT/src/host/CMakeLists.txt index a2ad856f..20a5124f 100755 --- a/FFT/src/host/CMakeLists.txt +++ b/FFT/src/host/CMakeLists.txt @@ -1,8 +1,9 @@ include_directories(../../../extern/cxxopts/include ../../../shared) include_directories(${CMAKE_BINARY_DIR}/src/common) +include_directories(${CMAKE_SOURCE_DIR}/../shared/setup .) -set(HOST_SOURCE execution_default.cpp main.cpp ../../../shared/setup/fpga_setup.cpp fft_functionality.cpp) +set(HOST_SOURCE execution_default.cpp main.cpp common_benchmark_io_implementation.cpp ../../../shared/setup/fpga_setup.cpp fft_functionality.cpp) if (INTELFPGAOPENCL_FOUND) include_directories(${IntelFPGAOpenCL_INCLUDE_DIRS}) diff --git a/FFT/src/host/common_benchmark_io_implementation.cpp b/FFT/src/host/common_benchmark_io_implementation.cpp new file mode 100644 index 00000000..843c9d88 --- /dev/null +++ b/FFT/src/host/common_benchmark_io_implementation.cpp @@ -0,0 +1,88 @@ + +#include "cxxopts.hpp" +#include "parameters.h" +#include "setup/common_benchmark_io.hpp" + +/** +Parses and returns program options using the cxxopts library. 
+Supports the following parameters: + - file name of the FPGA kernel file (-f,--file) + - number of repetitions (-n) + - number of kernel replications (-r) + - data size (-d) + - use memory interleaving +@see https://github.com/jarro2783/cxxopts + +@return program settings that are created from the given program arguments +*/ +std::shared_ptr +parseProgramParameters(int argc, char *argv[]) { + // Defining and parsing program options + cxxopts::Options options(argv[0], PROGRAM_DESCRIPTION); + options.add_options() + ("f,file", "Kernel file name", cxxopts::value()) + ("n", "Number of repetitions", + cxxopts::value()->default_value(std::to_string(DEFAULT_REPETITIONS))) + ("i", "Multiplier for the used data size that will be i * FFT_SIZE", + cxxopts::value()->default_value(std::to_string(DEFAULT_ITERATIONS))) + ("inverse", "If set, the inverse FFT is calculated instead") + ("device", "Index of the device that has to be used. If not given you "\ + "will be asked which device to use if there are multiple devices "\ + "available.", cxxopts::value()->default_value(std::to_string(DEFAULT_DEVICE))) + ("platform", "Index of the platform that has to be used. If not given "\ + "you will be asked which platform to use if there are multiple "\ + "platforms available.", + cxxopts::value()->default_value(std::to_string(DEFAULT_PLATFORM))) + ("h,help", "Print this help"); + cxxopts::ParseResult result = options.parse(argc, argv); + + if (result.count("h")) { + // Just print help when argument is given + std::cout << options.help() << std::endl; + exit(0); + } + // Check parsed options and handle special cases + if (result.count("f") <= 0) { + // Path to the kernel file is mandatory - exit if not given! + std::cerr << "Kernel file must be given! 
Aborting" << std::endl; + std::cout << options.help() << std::endl; + exit(1); + } + + // Create program settings from program arguments + std::shared_ptr sharedSettings( + new ProgramSettings{result["n"].as(), + result["i"].as(), + static_cast(result.count("inverse")), + result["platform"].as(), + result["device"].as(), + result["f"].as()}); + return sharedSettings; +} + +/** + * Prints the used configuration to std out before starting the actual benchmark. + * + * @param programSettings The program settings retrieved from the command line + * @param device The device used for execution + */ +void printFinalConfiguration(const std::shared_ptr &programSettings, + const cl::Device &device) {// Give setup summary + std::cout << PROGRAM_DESCRIPTION << std::endl << HLINE; + std::cout << "Summary:" << std::endl + << "FFT Size: " << (1 << LOG_FFT_SIZE) + << std::endl + << "Data Size: " << programSettings->iterations << " * FFT Size * sizeof(" + << STR(HOST_DATA_TYPE) + << ") = " << static_cast((1 << LOG_FFT_SIZE) * programSettings->iterations * sizeof(HOST_DATA_TYPE)) << " Byte" + << std::endl + << "Repetitions: " << programSettings->numRepetitions + << std::endl + << "Kernel file: " << programSettings->kernelFileName + << std::endl; + std::cout << "Device: " + << device.getInfo() << std::endl; + std::cout << HLINE + << "Start benchmark using the given configuration." << std::endl + << HLINE; +} diff --git a/FFT/src/host/fft_functionality.cpp b/FFT/src/host/fft_functionality.cpp index 41d2ed6b..06d9a71f 100644 --- a/FFT/src/host/fft_functionality.cpp +++ b/FFT/src/host/fft_functionality.cpp @@ -34,65 +34,9 @@ SOFTWARE. #include "execution.h" #include "cxxopts.hpp" #include "setup/fpga_setup.hpp" +#include "setup/common_benchmark_io.hpp" #include "parameters.h" -/** -Parses and returns program options using the cxxopts library. 
-Supports the following parameters: - - file name of the FPGA kernel file (-f,--file) - - number of repetitions (-n) - - number of kernel replications (-r) - - data size (-d) - - use memory interleaving -@see https://github.com/jarro2783/cxxopts - -@return program settings that are created from the given program arguments -*/ -std::shared_ptr -parseProgramParameters(int argc, char *argv[]) { - // Defining and parsing program options - cxxopts::Options options(argv[0], PROGRAM_DESCRIPTION); - options.add_options() - ("f,file", "Kernel file name", cxxopts::value()) - ("n", "Number of repetitions", - cxxopts::value()->default_value(std::to_string(DEFAULT_REPETITIONS))) - ("i", "Multiplier for the used data size that will be i * FFT_SIZE", - cxxopts::value()->default_value(std::to_string(DEFAULT_ITERATIONS))) - ("inverse", "If set, the inverse FFT is calculated instead") - ("device", "Index of the device that has to be used. If not given you "\ - "will be asked which device to use if there are multiple devices "\ - "available.", cxxopts::value()->default_value(std::to_string(DEFAULT_DEVICE))) - ("platform", "Index of the platform that has to be used. If not given "\ - "you will be asked which platform to use if there are multiple "\ - "platforms available.", - cxxopts::value()->default_value(std::to_string(DEFAULT_PLATFORM))) - ("h,help", "Print this help"); - cxxopts::ParseResult result = options.parse(argc, argv); - - if (result.count("h")) { - // Just print help when argument is given - std::cout << options.help() << std::endl; - exit(0); - } - // Check parsed options and handle special cases - if (result.count("f") <= 0) { - // Path to the kernel file is mandatory - exit if not given! - std::cerr << "Kernel file must be given! 
Aborting" << std::endl; - std::cout << options.help() << std::endl; - exit(1); - } - - // Create program settings from program arguments - std::shared_ptr sharedSettings( - new ProgramSettings{result["n"].as(), - result["i"].as(), - static_cast(result.count("inverse")), - result["platform"].as(), - result["device"].as(), - result["f"].as()}); - return sharedSettings; -} - /** Prints the execution results to stdout @@ -117,33 +61,6 @@ printResults(std::shared_ptr results) { } -/** - * Prints the used configuration to std out before starting the actual benchmark. - * - * @param programSettings The program settings retrieved from the command line - * @param device The device used for execution - */ -void printFinalConfiguration(const std::shared_ptr &programSettings, - const cl::Device &device) {// Give setup summary - std::cout << PROGRAM_DESCRIPTION << std::endl << HLINE; - std::cout << "Summary:" << std::endl - << "FFT Size: " << (1 << LOG_FFT_SIZE) - << std::endl - << "Data Size: " << programSettings->iterations << " * FFT Size * sizeof(" - << STR(HOST_DATA_TYPE) - << ") = " << static_cast((1 << LOG_FFT_SIZE) * programSettings->iterations * sizeof(HOST_DATA_TYPE)) << " Byte" - << std::endl - << "Repetitions: " << programSettings->numRepetitions - << std::endl - << "Kernel file: " << programSettings->kernelFileName - << std::endl; - std::cout << "Device: " - << device.getInfo() << std::endl; - std::cout << HLINE - << "Start benchmark using the given configuration." << std::endl - << HLINE; -} - void generateInputData(std::complex* data, unsigned iterations) { std::mt19937 gen(0); diff --git a/FFT/src/host/fft_functionality.hpp b/FFT/src/host/fft_functionality.hpp index 2cd72c3b..c791eadf 100644 --- a/FFT/src/host/fft_functionality.hpp +++ b/FFT/src/host/fft_functionality.hpp @@ -33,44 +33,6 @@ SOFTWARE. #include "setup/fpga_setup.hpp" #include "parameters.h" -/* -Short description of the program. 
-Moreover the version and build time is also compiled into the description. -*/ -#define STR_EXPAND(tok) #tok -#define STR(tok) STR_EXPAND(tok) - -#define PROGRAM_DESCRIPTION "Implementation of the FFT benchmark"\ - " proposed in the HPCC benchmark suite for FPGA.\n"\ - "Version: " STR(VERSION) - -#define ENTRY_SPACE 13 - -struct ProgramSettings { - uint numRepetitions; - unsigned iterations; - bool inverse; - int defaultPlatform; - int defaultDevice; - std::string kernelFileName; -}; - - -/** -Parses and returns program options using the cxxopts library. -Supports the following parameters: - - file name of the FPGA kernel file (-f,--file) - - number of repetitions (-n) - - number of kernel replications (-r) - - data size (-d) - - use memory interleaving -@see https://github.com/jarro2783/cxxopts - -@return program settings that are created from the given program arguments -*/ -std::shared_ptr -parseProgramParameters(int argc, char *argv[]); - /** Prints the execution results to stdout @@ -79,16 +41,6 @@ Prints the execution results to stdout void printResults(std::shared_ptr results); -/** - * Prints the used configuration to std out before starting the actual benchmark. - * - * @param programSettings The program settings retrieved from the command line - * @param device The device used for execution - */ -void printFinalConfiguration(const std::shared_ptr &programSettings, - const cl::Device &device); - - /** * Fill the data buffer with random number using the mersenne twister engine with * seed 0. 
diff --git a/FFT/src/host/main.cpp b/FFT/src/host/main.cpp index a978a790..baedf3e5 100644 --- a/FFT/src/host/main.cpp +++ b/FFT/src/host/main.cpp @@ -3,6 +3,7 @@ // #include "fft_functionality.hpp" +#include "setup/common_benchmark_io.hpp" /** The program entry point diff --git a/FFT/src/host/program_settings.h b/FFT/src/host/program_settings.h new file mode 100644 index 00000000..02b3817f --- /dev/null +++ b/FFT/src/host/program_settings.h @@ -0,0 +1,30 @@ + +#ifndef SRC_HOST_PROGRAM_SETTINGS_H_ +#define SRC_HOST_PROGRAM_SETTINGS_H_ + +#include "parameters.h" + +/* C++ standard library headers */ +#include + +#include "CL/opencl.h" + +/* +Short description of the program. +Moreover the version and build time is also compiled into the description. +*/ +#define PROGRAM_DESCRIPTION "Implementation of the FFT benchmark"\ + " proposed in the HPCC benchmark suite for FPGA.\n"\ + "Version: " VERSION "\n" + +struct ProgramSettings { + uint numRepetitions; + unsigned iterations; + bool inverse; + int defaultPlatform; + int defaultDevice; + std::string kernelFileName; +}; + + +#endif diff --git a/FFT/tests/CMakeLists.txt b/FFT/tests/CMakeLists.txt index 43765c70..0c72e04b 100755 --- a/FFT/tests/CMakeLists.txt +++ b/FFT/tests/CMakeLists.txt @@ -3,25 +3,25 @@ add_subdirectory(../../extern/googletest ${CMAKE_CURRENT_BINARY_DIR}/lib) include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) include_directories(${CMAKE_BINARY_DIR}/src/common ../../extern/cxxopts/include ../../shared) +include_directories(${CMAKE_SOURCE_DIR}/src/host) - -set(PROJECT_SOURCES ../../shared/setup/fpga_setup.cpp ../src/host/execution_default.cpp ../src/host/fft_functionality.cpp) -set(TEST_SOURCES ../../shared/setup/test_fpga_setup.cpp "test_fft_functionality.cpp" test_execution_functionality.cpp) +set(PROJECT_SOURCES ../../shared/setup/fpga_setup.cpp ../src/host/common_benchmark_io_implementation.cpp ../src/host/execution_default.cpp ../src/host/fft_functionality.cpp) +set(TEST_SOURCES 
../../shared/testing/main.cpp ../../shared/setup/test_fpga_setup.cpp test_fft_functionality.cpp test_execution_functionality.cpp) if (INTELFPGAOPENCL_FOUND) include_directories(${IntelFPGAOpenCL_INCLUDE_DIRS}) add_executable(Test_intel ${TEST_SOURCES} ${PROJECT_SOURCES}) - target_link_libraries(Test_intel gtest gmock gtest_main ${IntelFPGAOpenCL_LIBRARIES}) + target_link_libraries(Test_intel gtest gmock ${IntelFPGAOpenCL_LIBRARIES}) target_compile_definitions(Test_intel PRIVATE -DINTEL_FPGA) add_dependencies(Test_intel fft1d_float_8_emulate_intel) - add_test(NAME test_intel_unit COMMAND $ WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_intel_unit COMMAND $ -f fft1d_float_8_emulate.aocx WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() if (Vitis_FOUND) include_directories(${Vitis_INCLUDE_DIRS}) add_executable(Test_xilinx ${TEST_SOURCES} ${PROJECT_SOURCES}) - target_link_libraries(Test_xilinx gtest gmock gtest_main ${Vitis_LIBRARIES}) + target_link_libraries(Test_xilinx gtest gmock ${Vitis_LIBRARIES}) target_compile_definitions(Test_xilinx PRIVATE -DXILINX_FPGA) add_dependencies(Test_xilinx fft1d_float_8_emulate_xilinx) - add_test(NAME test_xilinx_unit COMMAND $ WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_xilinx_unit COMMAND $ -f fft1d_float_8_emulate.xclbin WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() diff --git a/FFT/tests/test_execution_functionality.cpp b/FFT/tests/test_execution_functionality.cpp index 5dee9642..fe2cb27e 100644 --- a/FFT/tests/test_execution_functionality.cpp +++ b/FFT/tests/test_execution_functionality.cpp @@ -8,6 +8,7 @@ #include "parameters.h" #include "setup/fpga_setup.hpp" #include "../src/host/fft_functionality.hpp" +#include "testing/test_program_settings.h" struct OpenCLKernelTest : testing::Test { @@ -15,6 +16,12 @@ struct OpenCLKernelTest : testing::Test { std::shared_ptr config; unsigned repetitions = 10; + OpenCLKernelTest() { + kernelFileName = programSettings->kernelFileName; + 
setupFPGA(); + } + +// TODO fix test void setupFPGA() { std::vector device = fpga_setup::selectFPGADevice(DEFAULT_PLATFORM, DEFAULT_DEVICE); cl::Context context(device[0]); @@ -235,6 +242,3 @@ TEST_P (DifferentOpenCLKernelTest, FPGAiFFTAndCPUiFFTGiveSameResults) { free(data); free(data2); } - -INSTANTIATE_TEST_CASE_P(Default, DifferentOpenCLKernelTest, - testing::Values("fft1d_float_8_emulate.aocx")); From cde64316829f0cbb4466712a27b780b6f6a4a6fb Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 12 May 2020 17:20:15 +0200 Subject: [PATCH 08/45] Port GEMM to new test main --- GEMM/src/host/CMakeLists.txt | 3 +- .../common_benchmark_io_implementation.cpp | 101 ++++++++++++++++++ GEMM/src/host/gemm_functionality.cpp | 67 +----------- GEMM/src/host/gemm_functionality.hpp | 35 ------ GEMM/src/host/main.cpp | 25 +---- GEMM/src/host/program_settings.h | 36 +++++++ GEMM/tests/CMakeLists.txt | 15 +-- ...nel_functionality_and_host_integration.cpp | 20 ++-- 8 files changed, 157 insertions(+), 145 deletions(-) create mode 100644 GEMM/src/host/common_benchmark_io_implementation.cpp create mode 100644 GEMM/src/host/program_settings.h diff --git a/GEMM/src/host/CMakeLists.txt b/GEMM/src/host/CMakeLists.txt index d43ac29e..68c00da8 100755 --- a/GEMM/src/host/CMakeLists.txt +++ b/GEMM/src/host/CMakeLists.txt @@ -1,8 +1,9 @@ include_directories(../../../extern/cxxopts/include ../../../shared/) include_directories(${CMAKE_BINARY_DIR}/src/common) +include_directories(${CMAKE_SOURCE_DIR}/../shared/setup .) 
-set(HOST_SOURCE execution_cannon.cpp main.cpp ../../../shared/setup/fpga_setup.cpp gemm_functionality.cpp) +set(HOST_SOURCE execution_cannon.cpp main.cpp common_benchmark_io_implementation.cpp ../../../shared/setup/fpga_setup.cpp gemm_functionality.cpp) find_package(BLAS) diff --git a/GEMM/src/host/common_benchmark_io_implementation.cpp b/GEMM/src/host/common_benchmark_io_implementation.cpp new file mode 100644 index 00000000..227975ef --- /dev/null +++ b/GEMM/src/host/common_benchmark_io_implementation.cpp @@ -0,0 +1,101 @@ + +#include "cxxopts.hpp" +#include "parameters.h" +#include "setup/common_benchmark_io.hpp" + +/** +Parses and returns program options using the cxxopts library. +Supports the following parameters: + - file name of the FPGA kernel file (-f,--file) + - number of repetitions (-n) + - number of kernel replications (-r) + - data size (-d) + - use memory interleaving +@see https://github.com/jarro2783/cxxopts + +@return program settings that are created from the given program arguments +*/ +std::shared_ptr +parseProgramParameters(int argc, char *argv[]) { + cxxopts::Options options(argv[0], PROGRAM_DESCRIPTION); + options.add_options() + ("f,file", "Kernel file name", cxxopts::value()) + ("n", "Number of repetitions", + cxxopts::value()->default_value(std::to_string(DEFAULT_REPETITIONS))) + ("m", "Matrix size", + cxxopts::value()->default_value(std::to_string(DEFAULT_MATRIX_SIZE))) + ("kernel", "Name of the kernel", + cxxopts::value()->default_value(KERNEL_NAME)) +#ifdef INTEL_FPGA + ("i,interleaving", "Use memory interleaving on the FPGA") +#endif + ("device", "Index of the device that has to be used. If not given you "\ + "will be asked which device to use if there are multiple devices "\ + "available.", cxxopts::value()->default_value(std::to_string(DEFAULT_DEVICE))) + ("platform", "Index of the platform that has to be used. 
If not given "\ + "you will be asked which platform to use if there are multiple "\ + "platforms available.", + cxxopts::value()->default_value(std::to_string(DEFAULT_PLATFORM))) + ("h,help", "Print this help"); + cxxopts::ParseResult result = options.parse(argc, argv); + + if (result.count("h")) { + // Just print help when argument is given + std::cout << options.help() << std::endl; + exit(0); + } + // Check parsed options and handle special cases + if (result.count("f") <= 0) { + // Path to the kernel file is mandatory - exit if not given! + std::cerr << "Kernel file must be given! Aborting" << std::endl; + std::cout << options.help() << std::endl; + exit(1); + } + + // Create program settings from program arguments + std::shared_ptr sharedSettings( + new ProgramSettings{result["n"].as(), + result["m"].as(), + result["platform"].as(), + result["device"].as(), +#ifdef INTEL_FPGA + static_cast(result.count("i") > 0), +#else + false, +#endif + result["f"].as(), + result["kernel"].as()}); + return sharedSettings; +} + +/** + * Prints the used configuration to std out before starting the actual benchmark. + * + * @param programSettings The program settings retrieved from the command line + * @param device The device used for execution + */ +void printFinalConfiguration(const std::shared_ptr &programSettings, + const cl::Device &device) {// Give setup summary + std::cout << PROGRAM_DESCRIPTION << std::endl << HLINE; + std::cout << "Summary:" << std::endl + << "Kernel Repetitions: " << programSettings->numRepetitions + << std::endl + << "Total matrix size: " << programSettings->matrixSize + << std::endl + << "Memory Interleaving: " << programSettings->useMemInterleaving + << " (Intel only)" << std::endl + << "Kernel file: " << programSettings->kernelFileName + << std::endl + << "Device: " + << device.getInfo() << std::endl + << "Verification: " + #ifdef _USE_BLAS_ + << "external library" + #else + << "internal ref. 
implementation" + #endif + << std::endl + << HLINE + << "Start benchmark using the given configuration." << std::endl + << HLINE; +} diff --git a/GEMM/src/host/gemm_functionality.cpp b/GEMM/src/host/gemm_functionality.cpp index eec55b02..f9802f36 100755 --- a/GEMM/src/host/gemm_functionality.cpp +++ b/GEMM/src/host/gemm_functionality.cpp @@ -43,74 +43,9 @@ SOFTWARE. /* Project's headers */ #include "parameters.h" #include "execution.h" +#include "setup/common_benchmark_io.hpp" -/** -Parses and returns program options using the cxxopts library. -Supports the following parameters: - - file name of the FPGA kernel file (-f,--file) - - number of repetitions (-n) - - number of kernel replications (-r) - - data size (-d) - - use memory interleaving -@see https://github.com/jarro2783/cxxopts - -@return program settings that are created from the given program arguments -*/ -std::shared_ptr -parseProgramParameters(int argc, char * argv[]) { - // Defining and parsing program options - cxxopts::Options options(argv[0], PROGRAM_DESCRIPTION); - options.add_options() - ("f,file", "Kernel file name", cxxopts::value()) - ("n", "Number of repetitions", - cxxopts::value()->default_value(std::to_string(DEFAULT_REPETITIONS))) - ("m", "Matrix size", - cxxopts::value()->default_value(std::to_string(DEFAULT_MATRIX_SIZE))) - ("kernel", "Name of the kernel", - cxxopts::value()->default_value(KERNEL_NAME)) -#ifdef INTEL_FPGA - ("i,interleaving", "Use memory interleaving on the FPGA") -#endif - ("device", "Index of the device that has to be used. If not given you "\ - "will be asked which device to use if there are multiple devices "\ - "available.", cxxopts::value()->default_value(std::to_string(DEFAULT_DEVICE))) - ("platform", "Index of the platform that has to be used. 
If not given "\ - "you will be asked which platform to use if there are multiple "\ - "platforms available.", - cxxopts::value()->default_value(std::to_string(DEFAULT_PLATFORM))) - ("h,help", "Print this help"); - cxxopts::ParseResult result = options.parse(argc, argv); - - if (result.count("h")) { - // Just print help when argument is given - std::cout << options.help() << std::endl; - exit(0); - } - // Check parsed options and handle special cases - if (result.count("f") <= 0) { - // Path to the kernel file is mandatory - exit if not given! - std::cerr << "Kernel file must be given! Aborting" << std::endl; - std::cout << options.help() << std::endl; - exit(1); - } - - // Create program settings from program arguments - std::shared_ptr sharedSettings( - new ProgramSettings{result["n"].as(), - result["m"].as(), - result["platform"].as(), - result["device"].as(), -#ifdef INTEL_FPGA - static_cast(result.count("i") > 0), -#else - false, -#endif - result["f"].as(), - result["kernel"].as()}); - return sharedSettings; -} - /** Print the benchmark Results diff --git a/GEMM/src/host/gemm_functionality.hpp b/GEMM/src/host/gemm_functionality.hpp index e34ddcf5..4b09d2e0 100755 --- a/GEMM/src/host/gemm_functionality.hpp +++ b/GEMM/src/host/gemm_functionality.hpp @@ -29,13 +29,6 @@ SOFTWARE. #include "execution.h" #include "parameters.h" -/* -Short description of the program -*/ -#define PROGRAM_DESCRIPTION "Implementation of the GEMM benchmark"\ - " proposed in the HPCC benchmark adapted for FPGA\n"\ - "Version: " VERSION "\n" - /* Number of times the execution of the benchmark will be repeated. 
*/ @@ -50,17 +43,6 @@ The full name will be */ #define GEMM_KERNEL "gemm" -#define ENTRY_SPACE 13 - -struct ProgramSettings { - uint numRepetitions; - cl_uint matrixSize; - int defaultPlatform; - int defaultDevice; - bool useMemInterleaving; - std::string kernelFileName; - std::string kernelName; -}; #ifdef _USE_BLAS_ @@ -71,21 +53,6 @@ extern "C" void sgemm_(char*, char*, int*, int*,int*, float*, float*, int*, floa double checkGEMMresults(HOST_DATA_TYPE* c_res, cl_int lda, cl_int n); -/** -Parses and returns program options using the cxxopts library. -Supports the following parameters: - - file name of the FPGA kernel file (-f,--file) - - number of repetitions (-n) - - number of kernel replications (-r) - - data size (-d) - - use memory interleaving -@see https://github.com/jarro2783/cxxopts - -@return program settings that are created from the given program arguments -*/ -std::shared_ptr -parseProgramParameters(int argc, char * argv[]); - /** Print the benchmark results to stdout @@ -123,8 +90,6 @@ C = alpha * A * B + beta * C void gemm_ref( HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, HOST_DATA_TYPE* c, int n, HOST_DATA_TYPE alpha, HOST_DATA_TYPE beta); -double checkLINPACKresults (HOST_DATA_TYPE* b_res, cl_int lda, cl_int n); - HOST_DATA_TYPE epslon (HOST_DATA_TYPE x); #endif // SRC_HOST_COMMON_FUNCTIONALITY_H_ diff --git a/GEMM/src/host/main.cpp b/GEMM/src/host/main.cpp index 1b1418eb..596aabd6 100755 --- a/GEMM/src/host/main.cpp +++ b/GEMM/src/host/main.cpp @@ -4,6 +4,7 @@ #include "parameters.h" #include "gemm_functionality.hpp" +#include "setup/common_benchmark_io.hpp" #include "setup/fpga_setup.hpp" /** @@ -21,29 +22,7 @@ int main(int argc, char * argv[]) { cl::Program program = fpga_setup::fpgaSetup(&context, usedDevice, &programSettings->kernelFileName); - // Give setup summary - std::cout << PROGRAM_DESCRIPTION << std::endl << HLINE; - std::cout << "Summary:" << std::endl - << "Kernel Repetitions: " << programSettings->numRepetitions - << std::endl - << "Total 
matrix size: " << programSettings->matrixSize - << std::endl - << "Memory Interleaving: " << programSettings->useMemInterleaving - << " (Intel only)" << std::endl - << "Kernel file: " << programSettings->kernelFileName - << std::endl - << "Device: " - << usedDevice[0].getInfo() << std::endl - << "Verification: " - #ifdef _USE_BLAS_ - << "external library" - #else - << "internal ref. implementation" - #endif - << std::endl - << HLINE - << "Start benchmark using the given configuration." << std::endl - << HLINE; + printFinalConfiguration(programSettings, usedDevice[0]); std::shared_ptr config( new bm_execution::ExecutionConfiguration{ diff --git a/GEMM/src/host/program_settings.h b/GEMM/src/host/program_settings.h new file mode 100644 index 00000000..91f9bf36 --- /dev/null +++ b/GEMM/src/host/program_settings.h @@ -0,0 +1,36 @@ + +#ifndef SRC_HOST_PROGRAM_SETTINGS_H_ +#define SRC_HOST_PROGRAM_SETTINGS_H_ + +#include "parameters.h" + +/* C++ standard library headers */ +#include + +#include "CL/opencl.h" + +/* +Short description of the program. +Moreover the version and build time is also compiled into the description. 
+*/ + +/* +Short description of the program +*/ +#define PROGRAM_DESCRIPTION "Implementation of the GEMM benchmark"\ + " proposed in the HPCC benchmark adapted for FPGA\n"\ + "Version: " VERSION "\n" + + +struct ProgramSettings { + uint numRepetitions; + cl_uint matrixSize; + int defaultPlatform; + int defaultDevice; + bool useMemInterleaving; + std::string kernelFileName; + std::string kernelName; +}; + + +#endif diff --git a/GEMM/tests/CMakeLists.txt b/GEMM/tests/CMakeLists.txt index 754025f0..6ea2a4f4 100755 --- a/GEMM/tests/CMakeLists.txt +++ b/GEMM/tests/CMakeLists.txt @@ -3,24 +3,25 @@ add_subdirectory(../../extern/googletest ${CMAKE_CURRENT_BINARY_DIR}/lib) include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) include_directories(${CMAKE_BINARY_DIR}/src/common ../../extern/cxxopts/include ../../shared/) +include_directories(${CMAKE_SOURCE_DIR}/src/host) -set(PROJECT_SOURCES ../../shared/setup/fpga_setup.cpp ../src/host/gemm_functionality.cpp ../src/host/execution_cannon.cpp) -set(TEST_SOURCES ../../shared/setup/test_fpga_setup.cpp test_kernel_functionality_and_host_integration.cpp) +set(PROJECT_SOURCES ../../shared/setup/fpga_setup.cpp ../src/host/common_benchmark_io_implementation.cpp ../src/host/gemm_functionality.cpp ../src/host/execution_cannon.cpp) +set(TEST_SOURCES ../../shared/testing/main.cpp ../../shared/setup/test_fpga_setup.cpp test_kernel_functionality_and_host_integration.cpp) if (INTELFPGAOPENCL_FOUND) include_directories(${IntelFPGAOpenCL_INCLUDE_DIRS}) add_executable(Test_intel ${TEST_SOURCES} ${PROJECT_SOURCES}) - target_link_libraries(Test_intel gtest gmock gtest_main ${IntelFPGAOpenCL_LIBRARIES}) + target_link_libraries(Test_intel gtest gmock ${IntelFPGAOpenCL_LIBRARIES}) target_compile_definitions(GEMM_intel PRIVATE -DINTEL_FPGA) add_dependencies(Test_intel gemm_cannon_emulate_intel) - add_test(NAME test_intel_unit COMMAND $ WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_intel_unit COMMAND $ -f 
gemm_cannon_emulate.aocx WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() if (Vitis_FOUND) include_directories(${Vitis_INCLUDE_DIRS}) add_executable(Test_xilinx ${TEST_SOURCES} ${PROJECT_SOURCES}) - target_link_libraries(Test_xilinx gtest gmock gtest_main ${Vitis_LIBRARIES}) + target_link_libraries(Test_xilinx gtest gmock ${Vitis_LIBRARIES}) target_compile_definitions(Test_xilinx PRIVATE -DXILINX_FPGA) - add_dependencies(Test_xilinx gemm_cannon_emulate_xilinx) - add_test(NAME test_xilinx_unit COMMAND $ WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_dependencies(Test_xilinx gemm_cannon_emulate_xilinx) + add_test(NAME test_xilinx_unit COMMAND $ -f gemm_cannon_emulate.xclbin WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() diff --git a/GEMM/tests/test_kernel_functionality_and_host_integration.cpp b/GEMM/tests/test_kernel_functionality_and_host_integration.cpp index 35ed5a7b..af4b34a0 100755 --- a/GEMM/tests/test_kernel_functionality_and_host_integration.cpp +++ b/GEMM/tests/test_kernel_functionality_and_host_integration.cpp @@ -8,6 +8,7 @@ #include "../src/host/gemm_functionality.hpp" #include "parameters.h" #include "setup/fpga_setup.hpp" +#include "testing/test_program_settings.h" void ref_matmul(HOST_DATA_TYPE* A, HOST_DATA_TYPE* B, HOST_DATA_TYPE* C, int size) { @@ -43,8 +44,9 @@ struct OpenCLKernelTest : testing::Test { setupFPGA(); } + void setupFPGA() { - std::vector device = fpga_setup::selectFPGADevice(DEFAULT_PLATFORM, DEFAULT_DEVICE); + std::vector device = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, programSettings->defaultDevice); cl::Context context(device[0]); cl::Program program = fpga_setup::fpgaSetup(&context, device, &kernelFileName); config = std::make_shared( @@ -69,11 +71,11 @@ struct OpenCLKernelTest : testing::Test { } }; -struct DifferentOpenCLKernelTest : OpenCLKernelTest, testing::WithParamInterface> { +struct DifferentOpenCLKernelTest : OpenCLKernelTest, testing::WithParamInterface { 
DifferentOpenCLKernelTest() { auto params = GetParam(); - kernelFileName = std::get<0>(params); - matrix_size = std::get<1>(params) * BLOCK_SIZE; + kernelFileName = programSettings->kernelFileName; + matrix_size = params * BLOCK_SIZE; posix_memalign(reinterpret_cast(&A), 64, sizeof(HOST_DATA_TYPE) * matrix_size * matrix_size); posix_memalign(reinterpret_cast(&B), 64, @@ -204,14 +206,6 @@ TEST_P(DifferentOpenCLKernelTest, FPGACorrectbetaCplusalphaAB) { } } -#ifdef INTEL_FPGA INSTANTIATE_TEST_CASE_P(Default, DifferentOpenCLKernelTest, - testing::Combine(testing::Values("gemm_cannon_emulate.aocx"), testing::Values(1,2) - )); -#endif + testing::Values(1,2)); -#ifdef XILINXL_FPGA -INSTANTIATE_TEST_CASE_P(Default, DifferentOpenCLKernelTest, - testing::Combine(testing::Values("gemm_cannon_emulate.xclbin"), testing::Values(1,2) - )); -#endif From 054e0761bc0265e43cff62866318be2af038720d Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 12 May 2020 17:20:42 +0200 Subject: [PATCH 09/45] Port LINPACK to new test main --- LINPACK/src/host/CMakeLists.txt | 3 +- .../common_benchmark_io_implementation.cpp | 86 +++++++++++++++++++ LINPACK/src/host/linpack_functionality.cpp | 83 +----------------- LINPACK/src/host/linpack_functionality.hpp | 46 ---------- LINPACK/src/host/main.cpp | 1 + LINPACK/src/host/program_settings.h | 34 ++++++++ LINPACK/tests/CMakeLists.txt | 13 +-- ...nel_functionality_and_host_integration.cpp | 32 ++----- 8 files changed, 138 insertions(+), 160 deletions(-) create mode 100644 LINPACK/src/host/common_benchmark_io_implementation.cpp create mode 100644 LINPACK/src/host/program_settings.h diff --git a/LINPACK/src/host/CMakeLists.txt b/LINPACK/src/host/CMakeLists.txt index 2642ccde..8de7ee59 100755 --- a/LINPACK/src/host/CMakeLists.txt +++ b/LINPACK/src/host/CMakeLists.txt @@ -1,6 +1,7 @@ include_directories(../../../extern/cxxopts/include ../../../shared/) +include_directories(.) 
-set(HOST_SOURCE execution_blocked_pvt.cpp main.cpp ../../../shared/setup/fpga_setup.cpp linpack_functionality.cpp) +set(HOST_SOURCE execution_blocked_pvt.cpp common_benchmark_io_implementation.cpp main.cpp ../../../shared/setup/fpga_setup.cpp linpack_functionality.cpp) if (INTELFPGAOPENCL_FOUND) include_directories(${IntelFPGAOpenCL_INCLUDE_DIRS}) diff --git a/LINPACK/src/host/common_benchmark_io_implementation.cpp b/LINPACK/src/host/common_benchmark_io_implementation.cpp new file mode 100644 index 00000000..a243dc1c --- /dev/null +++ b/LINPACK/src/host/common_benchmark_io_implementation.cpp @@ -0,0 +1,86 @@ + +#include "cxxopts.hpp" +#include "parameters.h" +#include "setup/common_benchmark_io.hpp" + +/** +Parses and returns program options using the cxxopts library. +Supports the following parameters: + - file name of the FPGA kernel file (-f,--file) + - number of repetitions (-n) + - number of kernel replications (-r) + - data size (-d) + - use memory interleaving +@see https://github.com/jarro2783/cxxopts + +@return program settings that are created from the given program arguments +*/ +std::shared_ptr +parseProgramParameters(int argc, char *argv[]) { + // Defining and parsing program options + cxxopts::Options options(argv[0], PROGRAM_DESCRIPTION); + options.add_options() + ("f,file", "Kernel file name", cxxopts::value()) + ("n", "Number of repetitions", + cxxopts::value()->default_value(std::to_string(DEFAULT_REPETITIONS))) + ("s", "Size of the data arrays", + cxxopts::value()->default_value(std::to_string(DEFAULT_MATRIX_SIZE))) + ("device", "Index of the device that has to be used. If not given you "\ + "will be asked which device to use if there are multiple devices "\ + "available.", cxxopts::value()->default_value(std::to_string(DEFAULT_DEVICE))) + ("platform", "Index of the platform that has to be used. 
If not given "\ + "you will be asked which platform to use if there are multiple "\ + "platforms available.", + cxxopts::value()->default_value(std::to_string(DEFAULT_PLATFORM))) + ("h,help", "Print this help"); + cxxopts::ParseResult result = options.parse(argc, argv); + + if (result.count("h")) { + // Just print help when argument is given + std::cout << options.help() << std::endl; + exit(0); + } + // Check parsed options and handle special cases + if (result.count("f") <= 0) { + // Path to the kernel file is mandatory - exit if not given! + std::cerr << "Kernel file must be given! Aborting" << std::endl; + std::cout << options.help() << std::endl; + exit(1); + } + + // Create program settings from program arguments + std::shared_ptr sharedSettings( + new ProgramSettings{result["n"].as(), + result["s"].as(), + result["platform"].as(), + result["device"].as(), + result["f"].as()}); + return sharedSettings; +} + +/** + * Prints the used configuration to std out before starting the actual benchmark. + * + * @param programSettings The program settings retrieved from the command line + * @param device The device used for execution + */ +void printFinalConfiguration(const std::shared_ptr &programSettings, + const cl::Device &device) {// Give setup summary + std::cout << PROGRAM_DESCRIPTION << HLINE; + std::cout << "Summary:" << std::endl + << "Matrix Size: " << programSettings->matrixSize + << std::endl + << "Block Size: " << (1 << LOCAL_MEM_BLOCK_LOG) + << std::endl + << "Data Type: " << STR(HOST_DATA_TYPE) + << std::endl + << "Repetitions: " << programSettings->numRepetitions + << std::endl + << "Kernel file: " << programSettings->kernelFileName + << std::endl; + std::cout << "Device: " + << device.getInfo() << std::endl; + std::cout << HLINE + << "Start benchmark using the given configuration." 
<< std::endl + << HLINE; +} diff --git a/LINPACK/src/host/linpack_functionality.cpp b/LINPACK/src/host/linpack_functionality.cpp index e5c9c843..82aba0dc 100644 --- a/LINPACK/src/host/linpack_functionality.cpp +++ b/LINPACK/src/host/linpack_functionality.cpp @@ -34,63 +34,9 @@ SOFTWARE. #include "execution.h" #include "cxxopts.hpp" #include "setup/fpga_setup.hpp" +#include "setup/common_benchmark_io.hpp" #include "parameters.h" -/** -Parses and returns program options using the cxxopts library. -Supports the following parameters: - - file name of the FPGA kernel file (-f,--file) - - number of repetitions (-n) - - number of kernel replications (-r) - - data size (-d) - - use memory interleaving -@see https://github.com/jarro2783/cxxopts - -@return program settings that are created from the given program arguments -*/ -std::shared_ptr -parseProgramParameters(int argc, char *argv[]) { - // Defining and parsing program options - cxxopts::Options options(argv[0], PROGRAM_DESCRIPTION); - options.add_options() - ("f,file", "Kernel file name", cxxopts::value()) - ("n", "Number of repetitions", - cxxopts::value()->default_value(std::to_string(DEFAULT_REPETITIONS))) - ("s", "Size of the data arrays", - cxxopts::value()->default_value(std::to_string(DEFAULT_MATRIX_SIZE))) - ("device", "Index of the device that has to be used. If not given you "\ - "will be asked which device to use if there are multiple devices "\ - "available.", cxxopts::value()->default_value(std::to_string(DEFAULT_DEVICE))) - ("platform", "Index of the platform that has to be used. 
If not given "\ - "you will be asked which platform to use if there are multiple "\ - "platforms available.", - cxxopts::value()->default_value(std::to_string(DEFAULT_PLATFORM))) - ("h,help", "Print this help"); - cxxopts::ParseResult result = options.parse(argc, argv); - - if (result.count("h")) { - // Just print help when argument is given - std::cout << options.help() << std::endl; - exit(0); - } - // Check parsed options and handle special cases - if (result.count("f") <= 0) { - // Path to the kernel file is mandatory - exit if not given! - std::cerr << "Kernel file must be given! Aborting" << std::endl; - std::cout << options.help() << std::endl; - exit(1); - } - - // Create program settings from program arguments - std::shared_ptr sharedSettings( - new ProgramSettings{result["n"].as(), - result["s"].as(), - result["platform"].as(), - result["device"].as(), - result["f"].as()}); - return sharedSettings; -} - /** Prints the execution results to stdout @@ -130,33 +76,6 @@ printResults(std::shared_ptr results, unsigned m } -/** - * Prints the used configuration to std out before starting the actual benchmark. - * - * @param programSettings The program settings retrieved from the command line - * @param device The device used for execution - */ -void printFinalConfiguration(const std::shared_ptr &programSettings, - const cl::Device &device) {// Give setup summary - std::cout << PROGRAM_DESCRIPTION << HLINE; - std::cout << "Summary:" << std::endl - << "Matrix Size: " << programSettings->matrixSize - << std::endl - << "Block Size: " << (1 << LOCAL_MEM_BLOCK_LOG) - << std::endl - << "Data Type: " << STR(HOST_DATA_TYPE) - << std::endl - << "Repetitions: " << programSettings->numRepetitions - << std::endl - << "Kernel file: " << programSettings->kernelFileName - << std::endl; - std::cout << "Device: " - << device.getInfo() << std::endl; - std::cout << HLINE - << "Start benchmark using the given configuration." 
<< std::endl - << HLINE; -} - void generateInputData(HOST_DATA_TYPE* A, HOST_DATA_TYPE* b, cl_int* ipvt, unsigned matrix_size, HOST_DATA_TYPE* norma) { std::mt19937 gen(7); diff --git a/LINPACK/src/host/linpack_functionality.hpp b/LINPACK/src/host/linpack_functionality.hpp index 1db6ef2e..ef7fd358 100644 --- a/LINPACK/src/host/linpack_functionality.hpp +++ b/LINPACK/src/host/linpack_functionality.hpp @@ -32,43 +32,6 @@ SOFTWARE. #include "execution.h" #include "parameters.h" -/* -Short description of the program. -Moreover the version and build time is also compiled into the description. -*/ -#define STR_EXPAND(tok) #tok -#define STR(tok) STR_EXPAND(tok) - -#define PROGRAM_DESCRIPTION "Implementation of the LINPACK benchmark"\ - " proposed in the HPCC benchmark suite for FPGA.\n"\ - "Version: " VERSION "\n" - -#define ENTRY_SPACE 15 - -struct ProgramSettings { - uint numRepetitions; - uint matrixSize; - int defaultPlatform; - int defaultDevice; - std::string kernelFileName; -}; - - -/** -Parses and returns program options using the cxxopts library. -Supports the following parameters: - - file name of the FPGA kernel file (-f,--file) - - number of repetitions (-n) - - number of kernel replications (-r) - - data size (-d) - - use memory interleaving -@see https://github.com/jarro2783/cxxopts - -@return program settings that are created from the given program arguments -*/ -std::shared_ptr -parseProgramParameters(int argc, char *argv[]); - /** Prints the execution results to stdout @@ -77,15 +40,6 @@ Prints the execution results to stdout void printResults(std::shared_ptr results, unsigned matrix_size); -/** - * Prints the used configuration to std out before starting the actual benchmark. 
- * - * @param programSettings The program settings retrieved from the command line - * @param device The device used for execution - */ -void printFinalConfiguration(const std::shared_ptr &programSettings, - const cl::Device &device); - /** * Fill the data buffer with random number using the mersenne twister engine with diff --git a/LINPACK/src/host/main.cpp b/LINPACK/src/host/main.cpp index 9bea5953..f1009df9 100644 --- a/LINPACK/src/host/main.cpp +++ b/LINPACK/src/host/main.cpp @@ -4,6 +4,7 @@ #include "linpack_functionality.hpp" #include "setup/fpga_setup.hpp" +#include "setup/common_benchmark_io.hpp" #include "execution.h" /** diff --git a/LINPACK/src/host/program_settings.h b/LINPACK/src/host/program_settings.h new file mode 100644 index 00000000..f2f1e97e --- /dev/null +++ b/LINPACK/src/host/program_settings.h @@ -0,0 +1,34 @@ + +#ifndef SRC_HOST_PROGRAM_SETTINGS_H_ +#define SRC_HOST_PROGRAM_SETTINGS_H_ + +#include "parameters.h" + +/* C++ standard library headers */ +#include + +#include "CL/opencl.h" + +/* +Short description of the program. +Moreover the version and build time is also compiled into the description. 
+*/ + +/* +Short description of the program +*/ +#define PROGRAM_DESCRIPTION "Implementation of the LINPACK benchmark"\ + " proposed in the HPCC benchmark suite for FPGA.\n"\ + "Version: " VERSION "\n" + + +struct ProgramSettings { + uint numRepetitions; + uint matrixSize; + int defaultPlatform; + int defaultDevice; + std::string kernelFileName; +}; + + +#endif diff --git a/LINPACK/tests/CMakeLists.txt b/LINPACK/tests/CMakeLists.txt index baff3930..8f4569a5 100755 --- a/LINPACK/tests/CMakeLists.txt +++ b/LINPACK/tests/CMakeLists.txt @@ -2,10 +2,11 @@ add_subdirectory(../../extern/googletest ${CMAKE_CURRENT_BINARY_DIR}/lib) include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) include_directories(${CMAKE_BINARY_DIR}/src/common ../../extern/cxxopts/include ../../shared/) +include_directories(${CMAKE_SOURCE_DIR}/src/host) -set(PROJECT_SOURCES ../../shared/setup/fpga_setup.cpp ../src/host/execution_blocked_pvt.cpp ../src/host/linpack_functionality.cpp) -set(TEST_SOURCES ../../shared/setup/test_fpga_setup.cpp test_kernel_functionality_and_host_integration.cpp test_kernel_functionality_separate_cores.cpp) +set(PROJECT_SOURCES ../../shared/setup/fpga_setup.cpp ../src/host/common_benchmark_io_implementation.cpp ../src/host/execution_blocked_pvt.cpp ../src/host/linpack_functionality.cpp) +set(TEST_SOURCES ../../shared/testing/main.cpp ../../shared/setup/test_fpga_setup.cpp test_kernel_functionality_and_host_integration.cpp test_kernel_functionality_separate_cores.cpp) set(BLA_VENDOR Intel10_64lp) find_package(LAPACK) @@ -18,7 +19,7 @@ endif() if (INTELFPGAOPENCL_FOUND) include_directories(${IntelFPGAOpenCL_INCLUDE_DIRS}) add_executable(Test_intel ${TEST_SOURCES} ${PROJECT_SOURCES}) - target_link_libraries(Test_intel gtest gmock gtest_main ${IntelFPGAOpenCL_LIBRARIES} "${OpenMP_CXX_FLAGS}") + target_link_libraries(Test_intel gtest gmock ${IntelFPGAOpenCL_LIBRARIES} "${OpenMP_CXX_FLAGS}") target_compile_definitions(Test_intel PRIVATE -DINTEL_FPGA) if 
(LAPACK_FOUND) target_compile_definitions(Test_intel PRIVATE -D_INTEL_MKL_) @@ -27,13 +28,13 @@ if (INTELFPGAOPENCL_FOUND) endif() add_dependencies(Test_intel lu_blocked_pvt_emulate_intel lu_blocked_pvt_test_emulate_intel) target_compile_options(Test_intel PRIVATE "${OpenMP_CXX_FLAGS}") - add_test(NAME test_intel_unit COMMAND $ WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_intel_unit COMMAND $ -f lu_blocked_pvt_emulate.aocx WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() if (Vitis_FOUND) include_directories(${Vitis_INCLUDE_DIRS}) add_executable(Test_xilinx ${TEST_SOURCES} ${PROJECT_SOURCES}) - target_link_libraries(Test_xilinx gtest gmock gtest_main ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") + target_link_libraries(Test_xilinx gtest gmock ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") target_compile_definitions(Test_xilinx PRIVATE -DXILINX_FPGA) if (LAPACK_FOUND) target_compile_definitions(Test_xilinx PRIVATE -D_INTEL_MKL_) @@ -44,5 +45,5 @@ if (Vitis_FOUND) # Disabled since compilation is not possible #add_dependencies(Test_xilinx lu_blocked_pvt_test_emulate_xilinx) target_compile_options(Test_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") - add_test(NAME test_xilinx_unit COMMAND $ WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_xilinx_unit COMMAND $ -f lu_blocked_pvt_emulate.xclbin WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() diff --git a/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp b/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp index ec241188..c6a83de0 100644 --- a/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp +++ b/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp @@ -5,6 +5,7 @@ #include "parameters.h" #include "../src/host/execution.h" #include "setup/fpga_setup.hpp" +#include "testing/test_program_settings.h" #include "../src/host/linpack_functionality.hpp" #ifdef _INTEL_MKL_ #include "mkl.h" @@ -25,11 +26,12 @@ struct OpenCLKernelTest : 
testing::Test { sizeof(HOST_DATA_TYPE) * array_size); posix_memalign(reinterpret_cast(&ipvt), 64, sizeof(cl_int) * array_size); + setupFPGA(programSettings->kernelFileName); } void setupFPGA(std::string kernelFileName) { lastKernelFileName = kernelFileName; - std::vector device = fpga_setup::selectFPGADevice(DEFAULT_PLATFORM, DEFAULT_DEVICE); + std::vector device = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, programSettings->defaultDevice); cl::Context context(device[0]); cl::Program program = fpga_setup::fpgaSetup(&context, device, &kernelFileName); config = std::make_shared( @@ -49,19 +51,11 @@ struct OpenCLKernelTest : testing::Test { } }; -struct DifferentOpenCLKernelTest : OpenCLKernelTest, testing::WithParamInterface { - DifferentOpenCLKernelTest() { - auto params = GetParam(); - auto kernel_file = params; - setupFPGA(kernel_file); - } -}; - /** * Execution returns correct results for a single repetition */ -TEST_P(DifferentOpenCLKernelTest, FPGACorrectResultsOneRepetition) { +TEST_F(OpenCLKernelTest, FPGACorrectResultsOneRepetition) { auto result = bm_execution::calculate(config, A, b, ipvt); for (int i = 0; i < array_size; i++) { @@ -73,7 +67,7 @@ TEST_P(DifferentOpenCLKernelTest, FPGACorrectResultsOneRepetition) { /** * Execution returns correct results for a single repetition */ -TEST_P(DifferentOpenCLKernelTest, FPGASimilarResultsToLAPACKforSingleBlock) { +TEST_F(OpenCLKernelTest, FPGASimilarResultsToLAPACKforSingleBlock) { auto result = bm_execution::calculate(config, A, b, ipvt); int info; @@ -103,7 +97,7 @@ TEST_P(DifferentOpenCLKernelTest, FPGASimilarResultsToLAPACKforSingleBlock) { /** * Execution of reference implementation returns correct results for a single repetition */ -TEST_P(DifferentOpenCLKernelTest, FPGAReferenceImplSimilarToMKL) { +TEST_F(OpenCLKernelTest, FPGAReferenceImplSimilarToMKL) { gefa_ref(A, config->matrixSize, config->matrixSize, ipvt); gesl_ref(A, b, ipvt, config->matrixSize, config->matrixSize); @@ -138,7 
+132,7 @@ TEST_P(DifferentOpenCLKernelTest, FPGAReferenceImplSimilarToMKL) { // TODO this test fails most likely because of inreasing errors in C2. Use partial pivoting or other mechanisms // to make the calculation stable again! // Remove DISABLED_ from test name to enable the test again. -TEST_P(DifferentOpenCLKernelTest, DISABLED_FPGASimilarResultsToLAPACKforMultipleBlocks) { +TEST_F(OpenCLKernelTest, DISABLED_FPGASimilarResultsToLAPACKforMultipleBlocks) { free(A); free(b); free(ipvt); @@ -180,15 +174,3 @@ TEST_P(DifferentOpenCLKernelTest, DISABLED_FPGASimilarResultsToLAPACKforMultiple #endif - -#ifdef INTEL_FPGA -INSTANTIATE_TEST_CASE_P(Default, DifferentOpenCLKernelTest, - testing::Values("lu_blocked_pvt_emulate.aocx") -); -#endif - -#ifdef XILINX_FPGA -INSTANTIATE_TEST_CASE_P(Default, DifferentOpenCLKernelTest, - testing::Values("lu_blocked_pvt_emulate.xclbin") -); -#endif From 7cdd54233a23a7238905a1d386103a075d390aa0 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 12 May 2020 17:21:07 +0200 Subject: [PATCH 10/45] Port PTRANS to new test main --- PTRANS/src/host/CMakeLists.txt | 3 +- .../common_benchmark_io_implementation.cpp | 91 +++++++++++++++++++ PTRANS/src/host/main.cpp | 1 + PTRANS/src/host/program_settings.h | 33 +++++++ PTRANS/src/host/transpose_functionality.cpp | 87 ------------------ PTRANS/src/host/transpose_functionality.hpp | 46 ---------- PTRANS/tests/CMakeLists.txt | 13 +-- ...nel_functionality_and_host_integration.cpp | 35 ++----- 8 files changed, 143 insertions(+), 166 deletions(-) create mode 100644 PTRANS/src/host/common_benchmark_io_implementation.cpp create mode 100644 PTRANS/src/host/program_settings.h diff --git a/PTRANS/src/host/CMakeLists.txt b/PTRANS/src/host/CMakeLists.txt index d6261523..1bcf2935 100755 --- a/PTRANS/src/host/CMakeLists.txt +++ b/PTRANS/src/host/CMakeLists.txt @@ -1,8 +1,9 @@ include_directories(../../../extern/cxxopts/include ../../../shared) include_directories(${CMAKE_BINARY_DIR}/src/common) 
+include_directories(.) -set(HOST_SOURCE execution_default.cpp main.cpp ../../../shared/setup/fpga_setup.cpp transpose_functionality.cpp) +set(HOST_SOURCE execution_default.cpp main.cpp common_benchmark_io_implementation.cpp ../../../shared/setup/fpga_setup.cpp transpose_functionality.cpp) if (INTELFPGAOPENCL_FOUND) include_directories(${IntelFPGAOpenCL_INCLUDE_DIRS}) diff --git a/PTRANS/src/host/common_benchmark_io_implementation.cpp b/PTRANS/src/host/common_benchmark_io_implementation.cpp new file mode 100644 index 00000000..cc497608 --- /dev/null +++ b/PTRANS/src/host/common_benchmark_io_implementation.cpp @@ -0,0 +1,91 @@ + +#include "cxxopts.hpp" +#include "parameters.h" +#include "setup/common_benchmark_io.hpp" + +/** +Parses and returns program options using the cxxopts library. +Supports the following parameters: + - file name of the FPGA kernel file (-f,--file) + - number of repetitions (-n) + - matrix size in blocks (-m) and block size in values (-b) + - kernel name (--kernel), device and platform index (--device, --platform) + - disable memory interleaving (-i,--nointerleaving) +@see https://github.com/jarro2783/cxxopts + +@return program settings that are created from the given program arguments +*/ +std::shared_ptr +parseProgramParameters(int argc, char *argv[]) { + // Defining and parsing program options + cxxopts::Options options(argv[0], PROGRAM_DESCRIPTION); + options.add_options() + ("f,file", "Kernel file name", cxxopts::value()) + ("n", "Number of repetitions", + cxxopts::value()->default_value(std::to_string(DEFAULT_REPETITIONS))) + ("m", "Matrix size in number of blocks in one dimension", + cxxopts::value()->default_value(std::to_string(DEFAULT_MATRIX_SIZE))) + ("b", "Block size in number of values in one dimension", + cxxopts::value()->default_value(std::to_string(BLOCK_SIZE))) + ("kernel", "Name of the kernel", + cxxopts::value()->default_value(KERNEL_NAME)) + ("i,nointerleaving", "Disable memory interleaving") + ("device", "Index of the device that has to be used. 
If not given you "\ + "will be asked which device to use if there are multiple devices "\ + "available.", cxxopts::value()->default_value(std::to_string(DEFAULT_DEVICE))) + ("platform", "Index of the platform that has to be used. If not given "\ + "you will be asked which platform to use if there are multiple "\ + "platforms available.", + cxxopts::value()->default_value(std::to_string(DEFAULT_PLATFORM))) + ("h,help", "Print this help"); + cxxopts::ParseResult result = options.parse(argc, argv); + + if (result.count("h")) { + // Just print help when argument is given + std::cout << options.help() << std::endl; + exit(0); + } + // Check parsed options and handle special cases + if (result.count("f") <= 0) { + // Path to the kernel file is mandatory - exit if not given! + std::cerr << "Kernel file must be given! Aborting" << std::endl; + std::cout << options.help() << std::endl; + exit(1); + } + + // Create program settings from program arguments (initializer order must match the ProgramSettings field order) + std::shared_ptr sharedSettings( + new ProgramSettings{result["n"].as(), + result["m"].as(), result["b"].as(), // "b" fills blockSize, which follows matrixSize + result["platform"].as(), + result["device"].as(), + static_cast(result.count("i") <= 0), + result["f"].as(), + result["kernel"].as()}); + return sharedSettings; +} + +/** + * Prints the used configuration to std out before starting the actual benchmark. + * + * @param programSettings The program settings retrieved from the command line + * @param device The device used for execution + */ +void printFinalConfiguration(const std::shared_ptr &programSettings, + const cl::Device &device) {// Give setup summary + std::cout << PROGRAM_DESCRIPTION << std::endl << HLINE; + std::cout << "Summary:" << std::endl + << "Repetitions: " << programSettings->numRepetitions + << std::endl + << "Matrix Size: " << programSettings->matrixSize * programSettings->blockSize + << std::endl + << "Memory Interleaving: " << (programSettings->useMemInterleaving ? 
"Yes" : "No") + << std::endl + << "Kernel file: " << programSettings->kernelFileName + << std::endl; + std::cout << "Device: " + << device.getInfo() << std::endl; + std::cout << HLINE + << "Start benchmark using the given configuration." << std::endl + << HLINE; +} diff --git a/PTRANS/src/host/main.cpp b/PTRANS/src/host/main.cpp index 07d64b98..367fc9d7 100644 --- a/PTRANS/src/host/main.cpp +++ b/PTRANS/src/host/main.cpp @@ -3,6 +3,7 @@ // #include "transpose_functionality.hpp" +#include "setup/common_benchmark_io.hpp" /** The program entry point diff --git a/PTRANS/src/host/program_settings.h b/PTRANS/src/host/program_settings.h new file mode 100644 index 00000000..8284c74d --- /dev/null +++ b/PTRANS/src/host/program_settings.h @@ -0,0 +1,33 @@ + +#ifndef SRC_HOST_PROGRAM_SETTINGS_H_ +#define SRC_HOST_PROGRAM_SETTINGS_H_ + +#include "parameters.h" + +/* C++ standard library headers */ +#include + +#include "CL/opencl.h" + +/* +Short description of the program. +Moreover the version and build time is also compiled into the description. +*/ + +#define PROGRAM_DESCRIPTION "Implementation of the matrix transposition benchmark"\ + " proposed in the HPCC benchmark suite for FPGA.\n"\ + "Version: " VERSION "\n" + +struct ProgramSettings { + uint numRepetitions; + cl_uint matrixSize; + cl_uint blockSize; + int defaultPlatform; + int defaultDevice; + bool useMemInterleaving; + std::string kernelFileName; + std::string kernelName; +}; + + +#endif diff --git a/PTRANS/src/host/transpose_functionality.cpp b/PTRANS/src/host/transpose_functionality.cpp index f4d98494..250bf57f 100644 --- a/PTRANS/src/host/transpose_functionality.cpp +++ b/PTRANS/src/host/transpose_functionality.cpp @@ -36,68 +36,6 @@ SOFTWARE. #include "setup/fpga_setup.hpp" #include "parameters.h" -/** -Parses and returns program options using the cxxopts library. 
-Supports the following parameters: - - file name of the FPGA kernel file (-f,--file) - - number of repetitions (-n) - - number of kernel replications (-r) - - data size (-d) - - use memory interleaving -@see https://github.com/jarro2783/cxxopts - -@return program settings that are created from the given program arguments -*/ -std::shared_ptr -parseProgramParameters(int argc, char *argv[]) { - // Defining and parsing program options - cxxopts::Options options(argv[0], PROGRAM_DESCRIPTION); - options.add_options() - ("f,file", "Kernel file name", cxxopts::value()) - ("n", "Number of repetitions", - cxxopts::value()->default_value(std::to_string(DEFAULT_REPETITIONS))) - ("m", "Matrix size in number of blocks in one dimension", - cxxopts::value()->default_value(std::to_string(DEFAULT_MATRIX_SIZE))) - ("b", "Block size in number of values in one dimension", - cxxopts::value()->default_value(std::to_string(BLOCK_SIZE))) - ("kernel", "Name of the kernel", - cxxopts::value()->default_value(KERNEL_NAME)) - ("i,nointerleaving", "Disable memory interleaving") - ("device", "Index of the device that has to be used. If not given you "\ - "will be asked which device to use if there are multiple devices "\ - "available.", cxxopts::value()->default_value(std::to_string(DEFAULT_DEVICE))) - ("platform", "Index of the platform that has to be used. If not given "\ - "you will be asked which platform to use if there are multiple "\ - "platforms available.", - cxxopts::value()->default_value(std::to_string(DEFAULT_PLATFORM))) - ("h,help", "Print this help"); - cxxopts::ParseResult result = options.parse(argc, argv); - - if (result.count("h")) { - // Just print help when argument is given - std::cout << options.help() << std::endl; - exit(0); - } - // Check parsed options and handle special cases - if (result.count("f") <= 0) { - // Path to the kernel file is mandatory - exit if not given! - std::cerr << "Kernel file must be given! 
Aborting" << std::endl; - std::cout << options.help() << std::endl; - exit(1); - } - - // Create program settings from program arguments - std::shared_ptr sharedSettings( - new ProgramSettings{result["n"].as(), - result["m"].as(), - result["b"].as(), - result["platform"].as(), - result["device"].as(), - static_cast(result.count("i") <= 0), - result["f"].as(), - result["kernel"].as()}); - return sharedSettings; -} /** * Reference implementation that takes two matrices and calculates @@ -169,31 +107,6 @@ printResults(std::shared_ptr results, cl_uint ma << std::endl; } -/** - * Prints the used configuration to std out before starting the actual benchmark. - * - * @param programSettings The program settings retrieved from the command line - * @param device The device used for execution - */ -void printFinalConfiguration(const std::shared_ptr &programSettings, - const cl::Device &device) {// Give setup summary - std::cout << PROGRAM_DESCRIPTION << std::endl << HLINE; - std::cout << "Summary:" << std::endl - << "Repetitions: " << programSettings->numRepetitions - << std::endl - << "Matrix Size: " << programSettings->matrixSize * programSettings->blockSize - << std::endl - << "Memory Interleaving: " << (programSettings->useMemInterleaving ? "Yes" : "No") - << std::endl - << "Kernel file: " << programSettings->kernelFileName - << std::endl; - std::cout << "Device: " - << device.getInfo() << std::endl; - std::cout << HLINE - << "Start benchmark using the given configuration." << std::endl - << HLINE; -} - double printCalculationError(cl_uint matrixSize, const HOST_DATA_TYPE *result) { double max_error = 0.0; diff --git a/PTRANS/src/host/transpose_functionality.hpp b/PTRANS/src/host/transpose_functionality.hpp index 69c63375..e71bf11b 100644 --- a/PTRANS/src/host/transpose_functionality.hpp +++ b/PTRANS/src/host/transpose_functionality.hpp @@ -32,43 +32,6 @@ SOFTWARE. #include "setup/fpga_setup.hpp" #include "parameters.h" -/* -Short description of the program. 
-Moreover the version and build time is also compiled into the description. -*/ -#define STR_EXPAND(tok) #tok -#define STR(tok) STR_EXPAND(tok) - -#define PROGRAM_DESCRIPTION "Implementation of the matrix transposition benchmark"\ - " proposed in the HPCC benchmark suite for FPGA.\n"\ - "Version: " VERSION "\n" - -struct ProgramSettings { - uint numRepetitions; - cl_uint matrixSize; - cl_uint blockSize; - int defaultPlatform; - int defaultDevice; - bool useMemInterleaving; - std::string kernelFileName; - std::string kernelName; -}; - - -/** -Parses and returns program options using the cxxopts library. -Supports the following parameters: - - file name of the FPGA kernel file (-f,--file) - - number of repetitions (-n) - - number of kernel replications (-r) - - data size (-d) - - use memory interleaving -@see https://github.com/jarro2783/cxxopts - -@return program settings that are created from the given program arguments -*/ -std::shared_ptr -parseProgramParameters(int argc, char *argv[]); /** * Reference implementation that takes two matrices and calculates @@ -97,15 +60,6 @@ Prints the execution results to stdout void printResults(std::shared_ptr results, cl_uint matrixSize); -/** - * Prints the used configuration to std out before starting the actual benchmark. - * - * @param programSettings The program settings retrieved from the command line - * @param device The device used for execution - */ -void printFinalConfiguration(const std::shared_ptr &programSettings, - const cl::Device &device); - /** * Prints the aggregated error for the result matrix to stdout. 
diff --git a/PTRANS/tests/CMakeLists.txt b/PTRANS/tests/CMakeLists.txt index 1f1a7c5b..78534477 100755 --- a/PTRANS/tests/CMakeLists.txt +++ b/PTRANS/tests/CMakeLists.txt @@ -3,24 +3,25 @@ add_subdirectory(../../extern/googletest ${CMAKE_CURRENT_BINARY_DIR}/lib) include_directories(${gtest_SOURCE_DIR}/include ${gmock_SOURCE_DIR}/include) include_directories(${CMAKE_BINARY_DIR}/src/common ../../extern/cxxopts/include ../../shared) +include_directories(${CMAKE_SOURCE_DIR}/src/host) -set(PROJECT_SOURCES ../../shared/setup/fpga_setup.cpp ../src/host/execution_default.cpp ../src/host/transpose_functionality.cpp) -set(TEST_SOURCES test_host_functionality.cpp test_kernel_functionality_and_host_integration.cpp ../../shared/setup/test_fpga_setup.cpp) +set(PROJECT_SOURCES ../../shared/setup/fpga_setup.cpp ../src/host/common_benchmark_io_implementation.cpp ../src/host/execution_default.cpp ../src/host/transpose_functionality.cpp) +set(TEST_SOURCES ../../shared/testing/main.cpp test_host_functionality.cpp test_kernel_functionality_and_host_integration.cpp ../../shared/setup/test_fpga_setup.cpp) if (INTELFPGAOPENCL_FOUND) include_directories(${IntelFPGAOpenCL_INCLUDE_DIRS}) add_executable(Test_intel ${TEST_SOURCES} ${PROJECT_SOURCES}) - target_link_libraries(Test_intel gtest gmock gtest_main ${IntelFPGAOpenCL_LIBRARIES}) + target_link_libraries(Test_intel gtest gmock ${IntelFPGAOpenCL_LIBRARIES}) add_dependencies(Test_intel transpose_optimized_emulate_intel) target_compile_definitions(Test_intel PRIVATE -DINTEL_FPGA) - add_test(NAME test_intel_unit COMMAND $ WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_intel_unit COMMAND $ -f transpose_optimized_emulate.aocx WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() if (VITIS_FOUND) include_directories(${Vitis_INCLUDE_DIRS}) add_executable(Test_xilinx ${TEST_SOURCES} ${PROJECT_SOURCES}) - target_link_libraries(Test_xilinx gtest gmock gtest_main ${Vitis_LIBRARIES}) + target_link_libraries(Test_xilinx gtest gmock 
${Vitis_LIBRARIES}) add_dependencies(Test_xilinx transpose_optimized_emulate_xilinx) target_compile_definitions(Test_xilinx PRIVATE -DXILINX_FPGA) - add_test(NAME test_xilinx_unit COMMAND $ WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_xilinx_unit COMMAND $ -f transpose_optimized_emulate.xclbin WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() \ No newline at end of file diff --git a/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp b/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp index d442f2a6..0d1c347e 100644 --- a/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp +++ b/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp @@ -7,6 +7,7 @@ #include "../src/host/execution.h" #include "../src/host/transpose_functionality.hpp" #include "parameters.h" +#include "testing/test_program_settings.h" struct OpenCLKernelTest : testing::Test { @@ -21,7 +22,7 @@ struct OpenCLKernelTest : testing::Test { std::vector device; OpenCLKernelTest() { - kernelFileName = "transpose_default_emulate.aocx"; + kernelFileName = programSettings->kernelFileName; matrix_size = BLOCK_SIZE; posix_memalign(reinterpret_cast(&A), 64, sizeof(HOST_DATA_TYPE) * matrix_size * matrix_size); @@ -36,6 +37,7 @@ struct OpenCLKernelTest : testing::Test { A_out[i * matrix_size + j] = 0.0; } } + setupFPGA(); } void setupFPGA() { @@ -43,7 +45,7 @@ struct OpenCLKernelTest : testing::Test { // TODO: Workaround. File bug report to XRT? 
// This is done because of a bug in Xilix XRT that does not allow // to reprogram an FPGA twice which will crash with CL_OUT_OF_RESOURCES - device = fpga_setup::selectFPGADevice(DEFAULT_PLATFORM, DEFAULT_DEVICE); + device = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, programSettings->defaultDevice); context = cl::Context(device[0]); program = fpga_setup::fpgaSetup(&context, device, &kernelFileName); } @@ -67,18 +69,11 @@ struct OpenCLKernelTest : testing::Test { } }; -struct DifferentOpenCLKernelTest : OpenCLKernelTest, testing::WithParamInterface { - DifferentOpenCLKernelTest() { - kernelFileName = GetParam(); - setupFPGA(); - } -}; - /** * Tests if B will not be transposed */ -TEST_P(DifferentOpenCLKernelTest, FPGACorrectBStaysTheSame) { +TEST_F(OpenCLKernelTest, FPGACorrectBStaysTheSame) { for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { A[i * matrix_size + j] = 0.0; @@ -96,7 +91,7 @@ TEST_P(DifferentOpenCLKernelTest, FPGACorrectBStaysTheSame) { /** * Tests if a block of A will be correctly transposed */ -TEST_P(DifferentOpenCLKernelTest, FPGAABlockIsTransposed) { +TEST_F(OpenCLKernelTest, FPGAABlockIsTransposed) { for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { A[i * matrix_size + j] = i * matrix_size + j; @@ -114,7 +109,7 @@ TEST_P(DifferentOpenCLKernelTest, FPGAABlockIsTransposed) { /** * Tests if A will be transposed when it is bigger than one block */ -TEST_P(DifferentOpenCLKernelTest, FPGAAIsTransposed) { +TEST_F(OpenCLKernelTest, FPGAAIsTransposed) { // delete memory allocated in constructor free(A); free(B); @@ -150,7 +145,7 @@ TEST_P(DifferentOpenCLKernelTest, FPGAAIsTransposed) { /** * Tests if matrix A and B will be summed up in the result */ -TEST_P(DifferentOpenCLKernelTest, FPGAAAndBAreSummedUp) { +TEST_F(OpenCLKernelTest, FPGAAAndBAreSummedUp) { for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { A[i * matrix_size + j] = 1.0; @@ -169,7 
+164,7 @@ TEST_P(DifferentOpenCLKernelTest, FPGAAAndBAreSummedUp) { /** * Checks the size and values of the timing measurements that are retured by calculate. */ -TEST_P(DifferentOpenCLKernelTest, FPGATimingsMeasuredForEveryIteration) { +TEST_F(OpenCLKernelTest, FPGATimingsMeasuredForEveryIteration) { config->repetitons = 10; auto result = bm_execution::calculate(config, A, B, A_out); EXPECT_EQ(result->calculationTimings.size(), 10); @@ -180,18 +175,6 @@ TEST_P(DifferentOpenCLKernelTest, FPGATimingsMeasuredForEveryIteration) { } } -#ifdef INTEL_FPGA -INSTANTIATE_TEST_CASE_P(Default, DifferentOpenCLKernelTest, - testing::Values( - "transpose_optimized_emulate.aocx" - )); -#else -INSTANTIATE_TEST_CASE_P(Default, DifferentOpenCLKernelTest, - testing::Values( - "transpose_optimized_emulate.xclbin" - )); -#endif - /** * Check if the generated input data is in the specified range */ From a2adef2704f7140653e56ef8afaa24f432f9544f Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 12 May 2020 17:21:34 +0200 Subject: [PATCH 11/45] Port RandomAccess to new test main --- RandomAccess/src/host/CMakeLists.txt | 3 +- .../common_benchmark_io_implementation.cpp | 89 +++++++++++++++++++ RandomAccess/src/host/main.cpp | 1 + RandomAccess/src/host/program_settings.h | 32 +++++++ .../src/host/random_access_functionality.cpp | 82 +---------------- .../src/host/random_access_functionality.hpp | 49 ---------- RandomAccess/tests/CMakeLists.txt | 14 +-- 7 files changed, 132 insertions(+), 138 deletions(-) create mode 100644 RandomAccess/src/host/common_benchmark_io_implementation.cpp create mode 100644 RandomAccess/src/host/program_settings.h diff --git a/RandomAccess/src/host/CMakeLists.txt b/RandomAccess/src/host/CMakeLists.txt index 0dfb82d1..3e0f9bd0 100755 --- a/RandomAccess/src/host/CMakeLists.txt +++ b/RandomAccess/src/host/CMakeLists.txt @@ -1,6 +1,7 @@ include_directories(../../../extern/cxxopts/include ../../../shared) +include_directories(.) 
-set(HOST_SOURCE execution_single.cpp main.cpp ../../../shared/setup/fpga_setup.cpp random_access_functionality.cpp) +set(HOST_SOURCE execution_single.cpp main.cpp common_benchmark_io_implementation.cpp ../../../shared/setup/fpga_setup.cpp random_access_functionality.cpp) if (INTELFPGAOPENCL_FOUND) include_directories(${IntelFPGAOpenCL_INCLUDE_DIRS}) diff --git a/RandomAccess/src/host/common_benchmark_io_implementation.cpp b/RandomAccess/src/host/common_benchmark_io_implementation.cpp new file mode 100644 index 00000000..67e8aa65 --- /dev/null +++ b/RandomAccess/src/host/common_benchmark_io_implementation.cpp @@ -0,0 +1,89 @@ + +#include "cxxopts.hpp" +#include "parameters.h" +#include "setup/common_benchmark_io.hpp" + +/** +Parses and returns program options using the cxxopts library. +Supports the following parameters: + - file name of the FPGA kernel file (-f,--file) + - number of repetitions (-n) + - number of kernel replications (-r) + - data size (-d,--data) + - device and platform index (--device, --platform) +@see https://github.com/jarro2783/cxxopts + +@return program settings that are created from the given program arguments +*/ +std::shared_ptr +parseProgramParameters(int argc, char *argv[]) { + // Defining and parsing program options + cxxopts::Options options(argv[0], PROGRAM_DESCRIPTION); + options.add_options() + ("f,file", "Kernel file name", cxxopts::value()) + ("n", "Number of repetitions", + cxxopts::value()->default_value(std::to_string(DEFAULT_REPETITIONS))) + ("r", "Number of used kernel replications", + cxxopts::value()->default_value(std::to_string(NUM_KERNEL_REPLICATIONS))) + ("d,data", "Size of the used data array (Should be half of the "\ + "available global memory)", + cxxopts::value() + ->default_value(std::to_string(DEFAULT_ARRAY_LENGTH))) + ("device", "Index of the device that has to be used. 
If not given you "\ + "will be asked which device to use if there are multiple devices "\ + "available.", cxxopts::value()->default_value(std::to_string(-1))) + ("platform", "Index of the platform that has to be used. If not given "\ + "you will be asked which platform to use if there are multiple "\ + "platforms available.", + cxxopts::value()->default_value(std::to_string(-1))) + ("h,help", "Print this help"); + cxxopts::ParseResult result = options.parse(argc, argv); + + if (result.count("h")) { + // Just print help when argument is given + std::cout << options.help() << std::endl; + exit(0); + } + // Check parsed options and handle special cases + if (result.count("f") <= 0) { + // Path to the kernel file is mandatory - exit if not given! + std::cerr << "Kernel file must be given! Aborting" << std::endl; + std::cout << options.help() << std::endl; + exit(1); + } + + // Create program settings from program arguments + std::shared_ptr sharedSettings( + new ProgramSettings {result["n"].as(), result["r"].as(), + result["platform"].as(), + result["device"].as(), + result["d"].as(), + result["f"].as()}); + return sharedSettings; +} + +/** + * Prints the used configuration to std out before starting the actual benchmark. 
+ * + * @param programSettings The program settings retrieved from the command line + * @param device The device used for execution + */ +void printFinalConfiguration(const std::shared_ptr &programSettings, + const cl::Device &device) {// Give setup summary + std::cout << PROGRAM_DESCRIPTION << std::endl << HLINE; + std::cout << "Summary:" << std::endl + << "Kernel Replications: " << programSettings->numReplications + << std::endl + << "Repetitions: " << programSettings->numRepetitions + << std::endl + << "Total data size: " << (programSettings->dataSize + * sizeof(HOST_DATA_TYPE)) * 1.0 + << " Byte" << std::endl + << "Kernel file: " << programSettings->kernelFileName + << std::endl; + std::cout << "Device: " + << device.getInfo() << std::endl; + std::cout << HLINE + << "Start benchmark using the given configuration." << std::endl + << HLINE; +} diff --git a/RandomAccess/src/host/main.cpp b/RandomAccess/src/host/main.cpp index 184ed82f..ec9ef0cc 100644 --- a/RandomAccess/src/host/main.cpp +++ b/RandomAccess/src/host/main.cpp @@ -3,6 +3,7 @@ // #include "random_access_functionality.hpp" +#include "setup/common_benchmark_io.hpp" #include "execution.h" /** diff --git a/RandomAccess/src/host/program_settings.h b/RandomAccess/src/host/program_settings.h new file mode 100644 index 00000000..9c33fa92 --- /dev/null +++ b/RandomAccess/src/host/program_settings.h @@ -0,0 +1,32 @@ + +#ifndef SRC_HOST_PROGRAM_SETTINGS_H_ +#define SRC_HOST_PROGRAM_SETTINGS_H_ + +#include "parameters.h" + +/* C++ standard library headers */ +#include + +#include "CL/opencl.h" + +/* +Short description of the program. +Moreover the version and build time is also compiled into the description. 
+*/ + +#define PROGRAM_DESCRIPTION "Implementation of the random access benchmark"\ + " proposed in the HPCC benchmark suite for FPGA.\n"\ + "Version: " VERSION "\n" + + +struct ProgramSettings { + uint numRepetitions; + uint numReplications; + int defaultPlatform; + int defaultDevice; + size_t dataSize; + std::string kernelFileName; +}; + + +#endif diff --git a/RandomAccess/src/host/random_access_functionality.cpp b/RandomAccess/src/host/random_access_functionality.cpp index 8fdf446a..cb1fb82f 100644 --- a/RandomAccess/src/host/random_access_functionality.cpp +++ b/RandomAccess/src/host/random_access_functionality.cpp @@ -37,68 +37,9 @@ SOFTWARE. /* Project's headers */ #include "setup/fpga_setup.hpp" +#include "setup/common_benchmark_io.hpp" #include "execution.h" - -/** -Parses and returns program options using the cxxopts library. -Supports the following parameters: - - file name of the FPGA kernel file (-f,--file) - - number of repetitions (-n) - - number of kernel replications (-r) - - data size (-d) - - use memory interleaving -@see https://github.com/jarro2783/cxxopts - -@return program settings that are created from the given program arguments -*/ -std::shared_ptr -parseProgramParameters(int argc, char * argv[]) { - // Defining and parsing program options - cxxopts::Options options(argv[0], PROGRAM_DESCRIPTION); - options.add_options() - ("f,file", "Kernel file name", cxxopts::value()) - ("n", "Number of repetitions", - cxxopts::value()->default_value(std::to_string(DEFAULT_REPETITIONS))) - ("r", "Number of used kernel replications", - cxxopts::value()->default_value(std::to_string(NUM_KERNEL_REPLICATIONS))) - ("d,data", "Size of the used data array (Should be half of the "\ - "available global memory)", - cxxopts::value() - ->default_value(std::to_string(DEFAULT_ARRAY_LENGTH))) - ("device", "Index of the device that has to be used. 
If not given you "\ - "will be asked which device to use if there are multiple devices "\ - "available.", cxxopts::value()->default_value(std::to_string(-1))) - ("platform", "Index of the platform that has to be used. If not given "\ - "you will be asked which platform to use if there are multiple "\ - "platforms available.", - cxxopts::value()->default_value(std::to_string(-1))) - ("h,help", "Print this help"); - cxxopts::ParseResult result = options.parse(argc, argv); - - if (result.count("h")) { - // Just print help when argument is given - std::cout << options.help() << std::endl; - exit(0); - } - // Check parsed options and handle special cases - if (result.count("f") <= 0) { - // Path to the kernel file is mandatory - exit if not given! - std::cerr << "Kernel file must be given! Aborting" << std::endl; - std::cout << options.help() << std::endl; - exit(1); - } - - // Create program settings from program arguments - std::shared_ptr sharedSettings( - new ProgramSettings {result["n"].as(), result["r"].as(), - result["platform"].as(), - result["device"].as(), - result["d"].as(), - result["f"].as()}); - return sharedSettings; -} - /** Print the benchmark Results @@ -219,27 +160,6 @@ double checkRandomAccessResults(HOST_DATA_TYPE* result_array, size_t array_size) return errors / array_size; } -void printFinalConfiguration(const std::shared_ptr &programSettings, - const cl::Device &device) { - // Give setup summary - std::cout << PROGRAM_DESCRIPTION << std::endl << HLINE; - std::cout << "Summary:" << std::endl - << "Kernel Replications: " << programSettings->numReplications - << std::endl - << "Repetitions: " << programSettings->numRepetitions - << std::endl - << "Total data size: " << (programSettings->dataSize - * sizeof(HOST_DATA_TYPE)) * 1.0 - << " Byte" << std::endl - << "Kernel file: " << programSettings->kernelFileName - << std::endl; - std::cout << "Device: " - << device.getInfo() << std::endl; - std::cout << HLINE - << "Start benchmark using the given 
configuration." << std::endl - << HLINE; -} - void generateInputData(HOST_DATA_TYPE* data, size_t dataSize) { for (HOST_DATA_TYPE j=0; j < dataSize ; j++) { data[j] = j; diff --git a/RandomAccess/src/host/random_access_functionality.hpp b/RandomAccess/src/host/random_access_functionality.hpp index 2328b450..31619a30 100644 --- a/RandomAccess/src/host/random_access_functionality.hpp +++ b/RandomAccess/src/host/random_access_functionality.hpp @@ -31,17 +31,6 @@ SOFTWARE. #include "setup/fpga_setup.hpp" #include "parameters.h" -/* -Short description of the program. -Moreover the version and build time is also compiled into the description. -*/ -#define STR_EXPAND(tok) #tok -#define STR(tok) STR_EXPAND(tok) - -#define PROGRAM_DESCRIPTION "Implementation of the random access benchmark"\ - " proposed in the HPCC benchmark suite for FPGA.\n"\ - "Version: " VERSION "\n" - /** Prefix of the function name of the used kernel. It will be used to construct the full function name for the case of replications. @@ -57,34 +46,6 @@ Constants used to verify benchmark results #define BIT_SIZE (sizeof(HOST_DATA_TYPE) * 8) -#define ENTRY_SPACE 13 - -struct ProgramSettings { - uint numRepetitions; - uint numReplications; - int defaultPlatform; - int defaultDevice; - size_t dataSize; - std::string kernelFileName; -}; - - -/** -Parses and returns program options using the cxxopts library. 
-Supports the following parameters: - - file name of the FPGA kernel file (-f,--file) - - number of repetitions (-n) - - number of kernel replications (-r) - - data size (-d) - - use memory interleaving -@see https://github.com/jarro2783/cxxopts - -@return program settings that are created from the given program arguments -*/ -std::shared_ptr -parseProgramParameters(int argc, char * argv[]); - - /** Generates the value of the random number after a desired number of updates @@ -117,16 +78,6 @@ void printResults(std::shared_ptr results, */ double checkRandomAccessResults(HOST_DATA_TYPE* result_array, size_t array_size); -/** - * Prints the used configuration to std out before starting the actual benchmark. - * - * @param programSettings The program settings retrieved from the command line - * @param device The device used for execution - */ -void printFinalConfiguration(const std::shared_ptr &programSettings, - const cl::Device &device); - - void generateInputData(HOST_DATA_TYPE* data, size_t dataSize); diff --git a/RandomAccess/tests/CMakeLists.txt b/RandomAccess/tests/CMakeLists.txt index 7d595107..2790fbdf 100755 --- a/RandomAccess/tests/CMakeLists.txt +++ b/RandomAccess/tests/CMakeLists.txt @@ -3,27 +3,27 @@ add_subdirectory(../../extern/googletest ${CMAKE_CURRENT_BINARY_DIR}/lib) include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) include_directories(${CMAKE_BINARY_DIR}/src/common ../../extern/cxxopts/include ../../shared) +include_directories(${CMAKE_SOURCE_DIR}/src/host) - -set(PROJECT_SOURCES ../../shared/setup/fpga_setup.cpp ../src/host/execution_single.cpp ../src/host/random_access_functionality.cpp) -set(TEST_SOURCES ../../shared/setup/test_fpga_setup.cpp test_host_code.cpp test_kernel_functionality_and_host_integration.cpp) +set(PROJECT_SOURCES ../../shared/setup/fpga_setup.cpp ../src/host/common_benchmark_io_implementation.cpp ../src/host/execution_single.cpp ../src/host/random_access_functionality.cpp) +set(TEST_SOURCES 
../../shared/testing/main.cpp ../../shared/setup/test_fpga_setup.cpp test_host_code.cpp test_kernel_functionality_and_host_integration.cpp) if (INTELFPGAOPENCL_FOUND) include_directories(SYSTEM ${IntelFPGAOpenCL_INCLUDE_DIRS}) add_executable(Test_intel ${TEST_SOURCES} ${PROJECT_SOURCES}) - target_link_libraries(Test_intel gtest gmock gtest_main ${IntelFPGAOpenCL_LIBRARIES} "${OpenMP_CXX_FLAGS}") + target_link_libraries(Test_intel gtest gmock ${IntelFPGAOpenCL_LIBRARIES} "${OpenMP_CXX_FLAGS}") add_dependencies(Test_intel random_access_kernels_single_emulate_intel) target_compile_definitions(Test_intel PRIVATE -DINTEL_FPGA) target_compile_options(Test_intel PRIVATE "${OpenMP_CXX_FLAGS}") - add_test(NAME test_intel_unit COMMAND $ WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_intel_unit COMMAND $ -f random_access_kernels_single_emulate.aocx WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() if (Vitis_FOUND) include_directories(SYSTEM ${Vitis_INCLUDE_DIRS}) add_executable(Test_xilinx ${TEST_SOURCES} ${PROJECT_SOURCES}) - target_link_libraries(Test_xilinx gtest gmock gtest_main ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") + target_link_libraries(Test_xilinx gtest gmock ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") add_dependencies(Test_xilinx random_access_kernels_single_emulate_xilinx) target_compile_definitions(Test_xilinx PRIVATE -DXILINX_FPGA) target_compile_options(Test_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") - add_test(NAME test_xilinx_unit COMMAND $ WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_xilinx_unit COMMAND $ -f random_access_kernels_single_emulate.xclbin WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() \ No newline at end of file From 1a0f260036885b32b04b85d2e4adf155e7642a09 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 12 May 2020 17:40:00 +0200 Subject: [PATCH 12/45] Fix rebase error --- PTRANS/src/host/common_benchmark_io_implementation.cpp | 1 + 1 file changed, 1 insertion(+) diff --git 
a/PTRANS/src/host/common_benchmark_io_implementation.cpp b/PTRANS/src/host/common_benchmark_io_implementation.cpp index cc497608..4b631c29 100644 --- a/PTRANS/src/host/common_benchmark_io_implementation.cpp +++ b/PTRANS/src/host/common_benchmark_io_implementation.cpp @@ -57,6 +57,7 @@ parseProgramParameters(int argc, char *argv[]) { std::shared_ptr sharedSettings( new ProgramSettings{result["n"].as(), result["m"].as(), + result["b"].as(), result["platform"].as(), result["device"].as(), static_cast(result.count("i") <= 0), From 9303c6e5effa0772e87d2c82af70c282128ed5bd Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 14 May 2020 17:30:22 +0200 Subject: [PATCH 13/45] First step to OO conversion --- STREAM/src/common/parameters.h.in | 4 + STREAM/src/host/CMakeLists.txt | 3 +- STREAM/src/host/execution.h | 22 +- STREAM/src/host/execution_default.cpp | 133 +++++----- STREAM/src/host/main.cpp | 75 +----- STREAM/src/host/stream_functionality.cpp | 116 +++++++-- STREAM/src/host/stream_functionality.hpp | 95 ++++--- STREAM/tests/CMakeLists.txt | 2 +- shared/hpcc_benchmark.hpp | 245 ++++++++++++++++++ shared/setup/fpga_setup.cpp | 304 ----------------------- shared/setup/fpga_setup.hpp | 283 +++++++++++++++++---- 11 files changed, 717 insertions(+), 565 deletions(-) create mode 100644 shared/hpcc_benchmark.hpp delete mode 100644 shared/setup/fpga_setup.cpp diff --git a/STREAM/src/common/parameters.h.in b/STREAM/src/common/parameters.h.in index ba5ef0e5..4c3d036a 100644 --- a/STREAM/src/common/parameters.h.in +++ b/STREAM/src/common/parameters.h.in @@ -23,6 +23,10 @@ #cmakedefine INNER_LOOP_BUFFERS #cmakedefine USE_SVM +#define PROGRAM_DESCRIPTION "Implementation of the STREAM benchmark"\ + " proposed in the HPCC benchmark suite for FPGA.\n"\ + "Version: " VERSION "\n" + /** Output separator */ diff --git a/STREAM/src/host/CMakeLists.txt b/STREAM/src/host/CMakeLists.txt index 66fa6946..9be0036f 100755 --- a/STREAM/src/host/CMakeLists.txt +++ 
b/STREAM/src/host/CMakeLists.txt @@ -1,7 +1,6 @@ include_directories(../../../extern/cxxopts/include ../../../shared) -include_directories(./) -set(HOST_SOURCE execution_default.cpp main.cpp ../../../shared/setup/fpga_setup.cpp common_benchmark_io_implementation.cpp stream_functionality.cpp) +set(HOST_SOURCE execution_default.cpp main.cpp stream_functionality.cpp) if (INTELFPGAOPENCL_FOUND) include_directories(${IntelFPGAOpenCL_INCLUDE_DIRS}) diff --git a/STREAM/src/host/execution.h b/STREAM/src/host/execution.h index 52785382..1b78d928 100644 --- a/STREAM/src/host/execution.h +++ b/STREAM/src/host/execution.h @@ -31,6 +31,8 @@ SOFTWARE. /* External library headers */ #include "CL/cl.hpp" #include "parameters.h" +#include "hpcc_benchmark.hpp" +#include "stream_functionality.hpp" // Map keys for execution timings #define PCIE_WRITE_KEY "PCI write" @@ -42,22 +44,6 @@ SOFTWARE. namespace bm_execution { - struct ExecutionConfiguration { - cl::Context context; - cl::Device device; - cl::Program program; - uint repetitions; - uint replications; - unsigned arraySize; - bool useMemoryInterleaving; - bool useSingleKernel; - }; - - struct ExecutionTimings { - std::map> timings; - uint arraySize; - }; - static std::map multiplicatorMap = { {PCIE_WRITE_KEY, 3.0}, {PCIE_READ_KEY, 3.0}, @@ -77,8 +63,8 @@ simple exchange of the different calculation methods. @return The resulting matrix */ - std::shared_ptr - calculate(std::shared_ptr config, + std::shared_ptr + calculate(const hpcc_base::ExecutionSettings config, HOST_DATA_TYPE* A, HOST_DATA_TYPE* B, HOST_DATA_TYPE* C); diff --git a/STREAM/src/host/execution_default.cpp b/STREAM/src/host/execution_default.cpp index 25ce019e..3d79c845 100644 --- a/STREAM/src/host/execution_default.cpp +++ b/STREAM/src/host/execution_default.cpp @@ -36,15 +36,14 @@ SOFTWARE. 
#include "CL/cl_ext_intelfpga.h" #endif /* Project's headers */ -#include "setup/fpga_setup.hpp" namespace bm_execution { - void initialize_buffers(const std::shared_ptr &config, unsigned int data_per_kernel, + void initialize_buffers(const hpcc_base::ExecutionSettings &config, unsigned int data_per_kernel, std::vector &Buffers_A, std::vector &Buffers_B, std::vector &Buffers_C); - void initialize_queues_and_kernels(const std::shared_ptr &config, + void initialize_queues_and_kernels(const hpcc_base::ExecutionSettings &config, unsigned int data_per_kernel, const std::vector &Buffers_A, const std::vector &Buffers_B, const std::vector &Buffers_C, @@ -53,7 +52,7 @@ namespace bm_execution { std::vector &triad_kernels, std::vector &command_queues); - void initialize_queues_and_kernels_single(const std::shared_ptr &config, + void initialize_queues_and_kernels_single(const hpcc_base::ExecutionSettings &config, unsigned int data_per_kernel, const std::vector &Buffers_A, const std::vector &Buffers_B, const std::vector &Buffers_C, @@ -69,13 +68,13 @@ namespace bm_execution { Implementation for the single kernel. 
@copydoc bm_execution::calculate() */ - std::shared_ptr - calculate(std::shared_ptr config, + std::shared_ptr + calculate(const hpcc_base::ExecutionSettings config, HOST_DATA_TYPE* A, HOST_DATA_TYPE* B, HOST_DATA_TYPE* C) { - unsigned data_per_kernel = config->arraySize/config->replications; + unsigned data_per_kernel = config.programSettings->streamArraySize/config.programSettings->kernelReplications; std::vector Buffers_A; std::vector Buffers_B; @@ -95,7 +94,7 @@ namespace bm_execution { // // Setup kernels // - if (config->useSingleKernel) { + if (config.programSettings->useSingleKernel) { initialize_queues_and_kernels_single(config, data_per_kernel, Buffers_A, Buffers_B, Buffers_C, test_kernels, copy_kernels, scale_kernels, add_kernels, triad_kernels, A, B, C, command_queues); @@ -123,7 +122,7 @@ namespace bm_execution { std::chrono::time_point startExecution, endExecution; std::chrono::duration duration; // Time checking with test kernel - for (int i=0; ireplications; i++) { + for (int i=0; ikernelReplications; i++) { #ifdef USE_SVM ASSERT_CL(clEnqueueSVMMap(command_queues[i](), CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, @@ -135,14 +134,14 @@ namespace bm_execution { ASSERT_CL(command_queues[i].enqueueWriteBuffer(Buffers_A[i], CL_FALSE, 0, sizeof(HOST_DATA_TYPE)*data_per_kernel, &A[data_per_kernel*i])); #endif } - for (int i=0; ireplications; i++) { + for (int i=0; ikernelReplications; i++) { ASSERT_CL(command_queues[i].finish()); } startExecution = std::chrono::high_resolution_clock::now(); - for (int i=0; ireplications; i++) { + for (int i=0; ikernelReplications; i++) { ASSERT_CL(command_queues[i].enqueueTask(test_kernels[i])); } - for (int i=0; ireplications; i++) { + for (int i=0; ikernelReplications; i++) { ASSERT_CL(command_queues[i].finish()); } endExecution = std::chrono::high_resolution_clock::now(); @@ -157,7 +156,7 @@ namespace bm_execution { std::cout << "precision of your system timer." 
<< std::endl; std::cout << HLINE; - for (int i=0; ireplications; i++) { + for (int i=0; inumRepetitions; i++) { #ifdef USE_SVM ASSERT_CL(clEnqueueSVMUnmap(command_queues[i](), reinterpret_cast(A), 0, @@ -167,7 +166,7 @@ namespace bm_execution { ASSERT_CL(command_queues[i].enqueueReadBuffer(Buffers_A[i], CL_FALSE, 0, sizeof(HOST_DATA_TYPE)*data_per_kernel, &A[data_per_kernel*i])); #endif } - for (int i=0; ireplications; i++) { + for (int i=0; ikernelReplications; i++) { ASSERT_CL(command_queues[i].finish()); } @@ -175,13 +174,13 @@ namespace bm_execution { // // Do actual benchmark measurements // - for (uint r = 0; r < config->repetitions; r++) { + for (uint r = 0; r < config.programSettings->kernelReplications; r++) { #pragma omp parallel { #pragma omp single startExecution = std::chrono::high_resolution_clock::now(); #pragma omp for nowait - for (int i = 0; i < config->replications; i++) { + for (int i = 0; i < config.programSettings->kernelReplications; i++) { #ifdef USE_SVM clEnqueueSVMMap(command_queues[i](), CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, @@ -211,7 +210,7 @@ namespace bm_execution { #endif } #pragma omp for - for (int i = 0; i < config->replications; i++) { + for (int i = 0; i < config.programSettings->kernelReplications; i++) { command_queues[i].finish(); } #pragma omp single @@ -224,11 +223,11 @@ namespace bm_execution { startExecution = std::chrono::high_resolution_clock::now(); } #pragma omp for nowait - for (int i = 0; i < config->replications; i++) { + for (int i = 0; i < config.programSettings->kernelReplications; i++) { command_queues[i].enqueueTask(copy_kernels[i]); } #pragma omp for - for (int i = 0; i < config->replications; i++) { + for (int i = 0; i < config.programSettings->kernelReplications; i++) { command_queues[i].finish(); } #pragma omp single @@ -241,11 +240,11 @@ namespace bm_execution { startExecution = std::chrono::high_resolution_clock::now(); } #pragma omp for nowait - for (int i = 0; i < config->replications; i++) { + for (int 
i = 0; i < config.programSettings->kernelReplications; i++) { command_queues[i].enqueueTask(scale_kernels[i]); } #pragma omp for - for (int i = 0; i < config->replications; i++) { + for (int i = 0; i < config.programSettings->kernelReplications; i++) { command_queues[i].finish(); } #pragma omp single @@ -258,11 +257,11 @@ namespace bm_execution { startExecution = std::chrono::high_resolution_clock::now(); } #pragma omp for nowait - for (int i = 0; i < config->replications; i++) { + for (int i = 0; i < config.programSettings->kernelReplications; i++) { command_queues[i].enqueueTask(add_kernels[i]); } #pragma omp for - for (int i = 0; i < config->replications; i++) { + for (int i = 0; i < config.programSettings->kernelReplications; i++) { command_queues[i].finish(); } #pragma omp single @@ -275,11 +274,11 @@ namespace bm_execution { startExecution = std::chrono::high_resolution_clock::now(); } #pragma omp for nowait - for (int i = 0; i < config->replications; i++) { + for (int i = 0; i < config.programSettings->kernelReplications; i++) { command_queues[i].enqueueTask(triad_kernels[i]); } #pragma omp for - for (int i = 0; i < config->replications; i++) { + for (int i = 0; i < config.programSettings->kernelReplications; i++) { command_queues[i].finish(); } #pragma omp single @@ -292,7 +291,7 @@ namespace bm_execution { startExecution = std::chrono::high_resolution_clock::now(); } #pragma omp for nowait - for (int i = 0; i < config->replications; i++) { + for (int i = 0; i < config.programSettings->kernelReplications; i++) { #ifdef USE_SVM clEnqueueSVMUnmap(command_queues[i](), reinterpret_cast(&A[data_per_kernel * i]), 0, @@ -316,7 +315,7 @@ namespace bm_execution { #endif } #pragma omp for - for (int i = 0; i < config->replications; i++) { + for (int i = 0; i < config.programSettings->kernelReplications; i++) { command_queues[i].finish(); } #pragma omp single @@ -329,14 +328,14 @@ namespace bm_execution { } } - std::shared_ptr result(new ExecutionTimings{ + 
std::shared_ptr result(new StreamExecutionTimings{ timingMap, - config->arraySize + config.programSettings->streamArraySize }); return result; } - void initialize_queues_and_kernels(const std::shared_ptr &config, + void initialize_queues_and_kernels(const hpcc_base::ExecutionSettings &config, unsigned int data_per_kernel, const std::vector &Buffers_A, const std::vector &Buffers_B, const std::vector &Buffers_C, @@ -345,17 +344,17 @@ namespace bm_execution { std::vector &triad_kernels, std::vector &command_queues) { int err; - for (int i=0; i < config->replications; i++) { + for (int i=0; i < config.programSettings->kernelReplications; i++) { // create the kernels - cl::Kernel testkernel(config->program, ("scale_" + std::to_string(i)).c_str(), &err); + cl::Kernel testkernel(config.program, ("scale_" + std::to_string(i)).c_str(), &err); ASSERT_CL(err); - cl::Kernel copykernel(config->program, ("copy_" + std::to_string(i)).c_str(), &err); + cl::Kernel copykernel(config.program, ("copy_" + std::to_string(i)).c_str(), &err); ASSERT_CL(err); - cl::Kernel scalekernel(config->program, ("scale_" + std::to_string(i)).c_str(), &err); + cl::Kernel scalekernel(config.program, ("scale_" + std::to_string(i)).c_str(), &err); ASSERT_CL(err); - cl::Kernel addkernel(config->program, ("add_" + std::to_string(i)).c_str(), &err); + cl::Kernel addkernel(config.program, ("add_" + std::to_string(i)).c_str(), &err); ASSERT_CL(err); - cl::Kernel triadkernel(config->program, ("triad_" + std::to_string(i)).c_str(), &err); + cl::Kernel triadkernel(config.program, ("triad_" + std::to_string(i)).c_str(), &err); ASSERT_CL(err); HOST_DATA_TYPE scalar = 3.0; @@ -406,7 +405,7 @@ namespace bm_execution { err = triadkernel.setArg(4, data_per_kernel); ASSERT_CL(err); - command_queues.push_back(cl::CommandQueue(config->context)); + command_queues.push_back(cl::CommandQueue(config.context)); test_kernels.push_back(testkernel); copy_kernels.push_back(copykernel); scale_kernels.push_back(scalekernel); @@ 
-415,7 +414,7 @@ namespace bm_execution { } } - void initialize_queues_and_kernels_single(const std::shared_ptr &config, + void initialize_queues_and_kernels_single(const hpcc_base::ExecutionSettings &config, unsigned int data_per_kernel, const std::vector &Buffers_A, const std::vector &Buffers_B, const std::vector &Buffers_C, @@ -427,31 +426,31 @@ namespace bm_execution { HOST_DATA_TYPE* C, std::vector &command_queues) { int err; - for (int i=0; i < config->replications; i++) { + for (int i=0; i < config.programSettings->kernelReplications; i++) { #ifdef INTEL_FPGA // create the kernels - cl::Kernel testkernel(config->program, ("calc_" + std::to_string(i)).c_str(), &err); + cl::Kernel testkernel(config.program, ("calc_" + std::to_string(i)).c_str(), &err); ASSERT_CL(err); - cl::Kernel copykernel(config->program, ("calc_" + std::to_string(i)).c_str(), &err); + cl::Kernel copykernel(config.program, ("calc_" + std::to_string(i)).c_str(), &err); ASSERT_CL(err); - cl::Kernel scalekernel(config->program, ("calc_" + std::to_string(i)).c_str(), &err); + cl::Kernel scalekernel(config.program, ("calc_" + std::to_string(i)).c_str(), &err); ASSERT_CL(err); - cl::Kernel addkernel(config->program, ("calc_" + std::to_string(i)).c_str(), &err); + cl::Kernel addkernel(config.program, ("calc_" + std::to_string(i)).c_str(), &err); ASSERT_CL(err); - cl::Kernel triadkernel(config->program, ("calc_" + std::to_string(i)).c_str(), &err); + cl::Kernel triadkernel(config.program, ("calc_" + std::to_string(i)).c_str(), &err); ASSERT_CL(err); #endif #ifdef XILINX_FPGA // create the kernels - cl::Kernel testkernel(config->program, ("calc_0:{calc_0_" + std::to_string(i+1) + "}").c_str(), &err); + cl::Kernel testkernel(config.program, ("calc_0:{calc_0_" + std::to_string(i+1) + "}").c_str(), &err); ASSERT_CL(err); - cl::Kernel copykernel(config->program, ("calc_0:{calc_0_" + std::to_string(i+1) + "}").c_str(), &err); + cl::Kernel copykernel(config.program, ("calc_0:{calc_0_" + 
std::to_string(i+1) + "}").c_str(), &err); ASSERT_CL(err); - cl::Kernel scalekernel(config->program, ("calc_0:{calc_0_" + std::to_string(i+1) + "}").c_str(), &err); + cl::Kernel scalekernel(config.program, ("calc_0:{calc_0_" + std::to_string(i+1) + "}").c_str(), &err); ASSERT_CL(err); - cl::Kernel addkernel(config->program, ("calc_0:{calc_0_" + std::to_string(i+1) + "}").c_str(), &err); + cl::Kernel addkernel(config.program, ("calc_0:{calc_0_" + std::to_string(i+1) + "}").c_str(), &err); ASSERT_CL(err); - cl::Kernel triadkernel(config->program, ("calc_0:{calc_0_" + std::to_string(i+1) + "}").c_str(), &err); + cl::Kernel triadkernel(config.program, ("calc_0:{calc_0_" + std::to_string(i+1) + "}").c_str(), &err); ASSERT_CL(err); #endif HOST_DATA_TYPE scalar = 3.0; @@ -583,7 +582,7 @@ namespace bm_execution { err = triadkernel.setArg(5, TRIAD_KERNEL_TYPE); ASSERT_CL(err); - command_queues.push_back(cl::CommandQueue(config->context)); + command_queues.push_back(cl::CommandQueue(config.context)); test_kernels.push_back(testkernel); copy_kernels.push_back(copykernel); scale_kernels.push_back(scalekernel); @@ -592,39 +591,39 @@ namespace bm_execution { } } - void initialize_buffers(const std::shared_ptr &config, unsigned int data_per_kernel, + void initialize_buffers(const hpcc_base::ExecutionSettings &config, unsigned int data_per_kernel, std::vector &Buffers_A, std::vector &Buffers_B, std::vector &Buffers_C) { - if (!config->useMemoryInterleaving) { + if (!config.programSettings->useMemoryInterleaving) { //Create Buffers for input and output - for (int i=0; i < config->replications; i++) { + for (int i=0; i < config.programSettings->kernelReplications; i++) { #ifdef INTEL_FPGA - if (config->useSingleKernel) { + if (config.programSettings->useSingleKernel) { //Create Buffers for input and output - Buffers_A.push_back(cl::Buffer(config->context, CL_MEM_READ_WRITE | ((i + 1) << 16), sizeof(HOST_DATA_TYPE)*data_per_kernel)); - Buffers_B.push_back(cl::Buffer(config->context, 
CL_MEM_READ_WRITE | ((i + 1) << 16), sizeof(HOST_DATA_TYPE)*data_per_kernel)); - Buffers_C.push_back(cl::Buffer(config->context, CL_MEM_READ_WRITE | ((i + 1) << 16), sizeof(HOST_DATA_TYPE)*data_per_kernel)); + Buffers_A.push_back(cl::Buffer(config.context, CL_MEM_READ_WRITE | ((i + 1) << 16), sizeof(HOST_DATA_TYPE)*data_per_kernel)); + Buffers_B.push_back(cl::Buffer(config.context, CL_MEM_READ_WRITE | ((i + 1) << 16), sizeof(HOST_DATA_TYPE)*data_per_kernel)); + Buffers_C.push_back(cl::Buffer(config.context, CL_MEM_READ_WRITE | ((i + 1) << 16), sizeof(HOST_DATA_TYPE)*data_per_kernel)); } else { //Create Buffers for input and output - Buffers_A.push_back(cl::Buffer(config->context, CL_MEM_READ_WRITE | CL_CHANNEL_1_INTELFPGA, sizeof(HOST_DATA_TYPE)*data_per_kernel)); - Buffers_B.push_back(cl::Buffer(config->context, CL_MEM_READ_WRITE | CL_CHANNEL_3_INTELFPGA, sizeof(HOST_DATA_TYPE)*data_per_kernel)); - Buffers_C.push_back(cl::Buffer(config->context, CL_MEM_READ_WRITE | CL_CHANNEL_2_INTELFPGA, sizeof(HOST_DATA_TYPE)*data_per_kernel)); + Buffers_A.push_back(cl::Buffer(config.context, CL_MEM_READ_WRITE | CL_CHANNEL_1_INTELFPGA, sizeof(HOST_DATA_TYPE)*data_per_kernel)); + Buffers_B.push_back(cl::Buffer(config.context, CL_MEM_READ_WRITE | CL_CHANNEL_3_INTELFPGA, sizeof(HOST_DATA_TYPE)*data_per_kernel)); + Buffers_C.push_back(cl::Buffer(config.context, CL_MEM_READ_WRITE | CL_CHANNEL_2_INTELFPGA, sizeof(HOST_DATA_TYPE)*data_per_kernel)); } #endif #ifdef XILINX_FPGA - Buffers_A.push_back(cl::Buffer(config->context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE)*data_per_kernel)); - Buffers_B.push_back(cl::Buffer(config->context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE)*data_per_kernel)); - Buffers_C.push_back(cl::Buffer(config->context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE)*data_per_kernel)); + Buffers_A.push_back(cl::Buffer(config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE)*data_per_kernel)); + Buffers_B.push_back(cl::Buffer(config.context, CL_MEM_READ_WRITE, 
sizeof(HOST_DATA_TYPE)*data_per_kernel)); + Buffers_C.push_back(cl::Buffer(config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE)*data_per_kernel)); #endif } } else { - for (int i=0; i < config->replications; i++) { + for (int i=0; i < config.programSettings->kernelReplications; i++) { //Create Buffers for input and output - Buffers_A.push_back(cl::Buffer(config->context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE)*data_per_kernel)); - Buffers_B.push_back(cl::Buffer(config->context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE)*data_per_kernel)); - Buffers_C.push_back(cl::Buffer(config->context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE)*data_per_kernel)); + Buffers_A.push_back(cl::Buffer(config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE)*data_per_kernel)); + Buffers_B.push_back(cl::Buffer(config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE)*data_per_kernel)); + Buffers_C.push_back(cl::Buffer(config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE)*data_per_kernel)); } } } diff --git a/STREAM/src/host/main.cpp b/STREAM/src/host/main.cpp index 493e692b..1aa2896a 100644 --- a/STREAM/src/host/main.cpp +++ b/STREAM/src/host/main.cpp @@ -3,9 +3,6 @@ // #include "stream_functionality.hpp" -#include "program_settings.h" -#include "setup/common_benchmark_io.hpp" -#include "CL/opencl.h" /** The program entry point @@ -13,69 +10,13 @@ The program entry point int main(int argc, char *argv[]) { // Setup benchmark - std::shared_ptr programSettings = - parseProgramParameters(argc, argv); - fpga_setup::setupEnvironmentAndClocks(); - std::vector usedDevice = - fpga_setup::selectFPGADevice(programSettings->defaultPlatform, - programSettings->defaultDevice); - cl::Context context = cl::Context(usedDevice); - cl::Program program = fpga_setup::fpgaSetup(&context, usedDevice, - &programSettings->kernelFileName); - - printFinalConfiguration(programSettings, usedDevice[0]); - - std::shared_ptr config( - new bm_execution::ExecutionConfiguration { - context, usedDevice[0], 
program, - programSettings->numRepetitions, - programSettings->kernelReplications, - programSettings->streamArraySize, - programSettings->useMemoryInterleaving, - programSettings->useSingleKernel - }); - - HOST_DATA_TYPE *A, *B, *C; -#ifdef INTEL_FPGA -#ifdef USE_SVM - A = reinterpret_cast( - clSVMAlloc(context(), 0 , - programSettings->streamArraySize * sizeof(HOST_DATA_TYPE), 1024)); - B = reinterpret_cast( - clSVMAlloc(context(), 0 , - programSettings->streamArraySize * sizeof(HOST_DATA_TYPE), 1024)); - C = reinterpret_cast( - clSVMAlloc(context(), 0 , - programSettings->streamArraySize * sizeof(HOST_DATA_TYPE), 1024)); -#else - posix_memalign(reinterpret_cast(&A), 64, programSettings->streamArraySize * sizeof(HOST_DATA_TYPE)); - posix_memalign(reinterpret_cast(&B), 64, programSettings->streamArraySize * sizeof(HOST_DATA_TYPE)); - posix_memalign(reinterpret_cast(&C), 64, programSettings->streamArraySize * sizeof(HOST_DATA_TYPE)); -#endif -#endif -#ifdef XILINX_FPGA - posix_memalign(reinterpret_cast(&A), 4096, programSettings->streamArraySize * sizeof(HOST_DATA_TYPE)); - posix_memalign(reinterpret_cast(&B), 4096, programSettings->streamArraySize * sizeof(HOST_DATA_TYPE)); - posix_memalign(reinterpret_cast(&C), 4096, programSettings->streamArraySize * sizeof(HOST_DATA_TYPE)); -#endif - generateInputData(A, B, C, programSettings->streamArraySize); - - auto timing = bm_execution::calculate(config, A, B, C); - - double error = checkSTREAMResult(A, B, C, programSettings->numRepetitions, programSettings->streamArraySize); - -#ifdef USE_SVM - clSVMFree(context(), reinterpret_cast(A)); - clSVMFree(context(), reinterpret_cast(B)); - clSVMFree(context(), reinterpret_cast(C)); -#else - free(A); - free(B); - free(C); -#endif - - printResults(timing); - - return error < 1 ? 
0 : 1; + auto bm = StreamBenchmark(argc, argv); + bool success = bm.executeBenchmark(); + if (success) { + return 0; + } + else { + return 1; + } } diff --git a/STREAM/src/host/stream_functionality.cpp b/STREAM/src/host/stream_functionality.cpp index 705c72fa..fc5f5370 100644 --- a/STREAM/src/host/stream_functionality.cpp +++ b/STREAM/src/host/stream_functionality.cpp @@ -32,10 +32,47 @@ SOFTWARE. /* Project's headers */ #include "execution.h" -#include "setup/fpga_setup.hpp" #include "parameters.h" -#include "program_settings.h" -#include "setup/common_benchmark_io.hpp" + +StreamProgramSettings::StreamProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), + streamArraySize(results["s"].as()), + kernelReplications(results["r"].as()), + useSingleKernel(static_cast(results.count("single-kernel"))) { + +} + +std::ostream& operator<<(std::ostream& os, StreamProgramSettings const& printedSettings) { + return os << "Array Size: " + << static_cast(printedSettings.streamArraySize * sizeof(HOST_DATA_TYPE)) << " Byte" + << std::endl + << "Data Type: " << STR(HOST_DATA_TYPE) + << std::endl + << "Kernel Replications: " << printedSettings.kernelReplications + << std::endl + << "Kernel Type: " << (printedSettings.useSingleKernel ? 
"Single" : "Separate") + << std::endl; +} + +StreamBenchmark::StreamBenchmark(int argc, char* argv[]) : HpccFpgaBenchmark(argc, argv) { +} + +void +StreamBenchmark::addAdditionalParseOptions(cxxopts::Options &options) { + options.add_options() + ("s", "Size of the data arrays", + cxxopts::value()->default_value(std::to_string(DEFAULT_ARRAY_LENGTH))) + ("r", "Number of kernel replications used", + cxxopts::value()->default_value(std::to_string(NUM_KERNEL_REPLICATIONS))) + ("single-kernel", "Use the single kernel implementation"); +} + +std::shared_ptr +StreamBenchmark::executeKernel(const hpcc_base::ExecutionSettings &settings, StreamData &data) { + return bm_execution::calculate(settings, + data.A, + data.B, + data.C); +} /** Prints the execution results to stdout @@ -43,7 +80,7 @@ Prints the execution results to stdout @param results The execution results */ void -printResults(std::shared_ptr results) { +StreamBenchmark::printResults(const hpcc_base::ExecutionSettings &settings, const StreamExecutionTimings &output) { std::cout << std::setw(ENTRY_SPACE) << "Function"; std::cout << std::setw(ENTRY_SPACE) << "Best Rate MB/s"; @@ -51,7 +88,7 @@ printResults(std::shared_ptr results) { std::cout << std::setw(ENTRY_SPACE) << "Min time" ; std::cout << std::setw(ENTRY_SPACE) << "Max time" << std::endl; - for (auto v : results->timings) { + for (auto v : output.timings) { double minTime = *min_element(v.second.begin(), v.second.end()); double avgTime = accumulate(v.second.begin(), v.second.end(), 0.0) / v.second.size(); @@ -59,7 +96,7 @@ printResults(std::shared_ptr results) { std::cout << std::setw(ENTRY_SPACE) << v.first; std::cout << std::setw(ENTRY_SPACE) - << (static_cast(sizeof(HOST_DATA_TYPE)) * results->arraySize * bm_execution::multiplicatorMap[v.first] / minTime) * 1.0e-6 + << (static_cast(sizeof(HOST_DATA_TYPE)) * output.arraySize * bm_execution::multiplicatorMap[v.first] / minTime) * 1.0e-6 << std::setw(ENTRY_SPACE) << avgTime << std::setw(ENTRY_SPACE) << 
minTime << std::setw(ENTRY_SPACE) << maxTime << std::endl; @@ -67,17 +104,41 @@ printResults(std::shared_ptr results) { } - -void generateInputData(HOST_DATA_TYPE* A, HOST_DATA_TYPE* B, HOST_DATA_TYPE* C, unsigned array_size) { - for (int i=0; i< array_size; i++) { +std::shared_ptr +StreamBenchmark::generateInputData(const hpcc_base::ExecutionSettings &settings) { + HOST_DATA_TYPE *A, *B, *C; +#ifdef INTEL_FPGA +#ifdef USE_SVM + A = reinterpret_cast( + clSVMAlloc(context(), 0 , + settings.programSettings->streamArraySize * sizeof(HOST_DATA_TYPE), 1024)); + B = reinterpret_cast( + clSVMAlloc(context(), 0 , + settings.programSettings->streamArraySize * sizeof(HOST_DATA_TYPE), 1024)); + C = reinterpret_cast( + clSVMAlloc(context(), 0 , + settings.programSettings->streamArraySize * sizeof(HOST_DATA_TYPE), 1024)); +#else + posix_memalign(reinterpret_cast(&A), 64, settings.programSettings->streamArraySize * sizeof(HOST_DATA_TYPE)); + posix_memalign(reinterpret_cast(&B), 64, settings.programSettings->streamArraySize * sizeof(HOST_DATA_TYPE)); + posix_memalign(reinterpret_cast(&C), 64, settings.programSettings->streamArraySize * sizeof(HOST_DATA_TYPE)); +#endif +#endif +#ifdef XILINX_FPGA + posix_memalign(reinterpret_cast(&A), 4096, settings.programSettings->streamArraySize * sizeof(HOST_DATA_TYPE)); + posix_memalign(reinterpret_cast(&B), 4096, settings.programSettings->streamArraySize * sizeof(HOST_DATA_TYPE)); + posix_memalign(reinterpret_cast(&C), 4096, settings.programSettings->streamArraySize * sizeof(HOST_DATA_TYPE)); +#endif + for (int i=0; i< settings.programSettings->streamArraySize; i++) { A[i] = 1.0; B[i] = 2.0; C[i] = 0.0; } + + return std::make_shared(new StreamData{A, B, C}); } -double checkSTREAMResult(const HOST_DATA_TYPE* A, const HOST_DATA_TYPE* B, const HOST_DATA_TYPE* C, unsigned repetitions, - unsigned array_size) { +bool StreamBenchmark::validateOutputAndPrintError(const hpcc_base::ExecutionSettings &settings ,StreamData &data, const 
StreamExecutionTimings &output) { HOST_DATA_TYPE aj,bj,cj,scalar; HOST_DATA_TYPE aSumErr,bSumErr,cSumErr; HOST_DATA_TYPE aAvgErr,bAvgErr,cAvgErr; @@ -93,7 +154,7 @@ double checkSTREAMResult(const HOST_DATA_TYPE* A, const HOST_DATA_TYPE* B, const aj = 2.0E0 * aj; /* now execute timing loop */ scalar = 3.0; - for (k=0; knumRepetitions; k++) { cj = aj; bj = scalar*cj; @@ -105,15 +166,15 @@ double checkSTREAMResult(const HOST_DATA_TYPE* A, const HOST_DATA_TYPE* B, const aSumErr = 0.0; bSumErr = 0.0; cSumErr = 0.0; - for (j=0; j< array_size; j++) { - aSumErr += abs(A[j] - aj); - bSumErr += abs(B[j] - bj); - cSumErr += abs(C[j] - cj); + for (j=0; j< settings.programSettings->streamArraySize; j++) { + aSumErr += abs(data.A[j] - aj); + bSumErr += abs(data.B[j] - bj); + cSumErr += abs(data.C[j] - cj); // if (j == 417) printf("Index 417: c[j]: %f, cj: %f\n",c[j],cj); // MCCALPIN } - aAvgErr = aSumErr / (HOST_DATA_TYPE) array_size; - bAvgErr = bSumErr / (HOST_DATA_TYPE) array_size; - cAvgErr = cSumErr / (HOST_DATA_TYPE) array_size; + aAvgErr = aSumErr / (HOST_DATA_TYPE) settings.programSettings->streamArraySize; + bAvgErr = bSumErr / (HOST_DATA_TYPE) settings.programSettings->streamArraySize; + cAvgErr = cSumErr / (HOST_DATA_TYPE) settings.programSettings->streamArraySize; if (sizeof(HOST_DATA_TYPE) == 4) { epsilon = 1.e-6; @@ -122,7 +183,7 @@ double checkSTREAMResult(const HOST_DATA_TYPE* A, const HOST_DATA_TYPE* B, const epsilon = 1.e-13; } else { - printf("WEIRD: sizeof(STREAM_TYPE) = %lu\n",sizeof(array_size)); + printf("WEIRD: sizeof(STREAM_TYPE) = %lu\n",sizeof(settings.programSettings->streamArraySize)); epsilon = 1.e-6; } @@ -132,8 +193,8 @@ double checkSTREAMResult(const HOST_DATA_TYPE* A, const HOST_DATA_TYPE* B, const printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon); printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj); ierr = 0; - for (j=0; j epsilon) { + for (j=0; jstreamArraySize; j++) { + 
if (abs(data.A[j]/aj-1.0) > epsilon) { ierr++; } } @@ -145,8 +206,8 @@ double checkSTREAMResult(const HOST_DATA_TYPE* A, const HOST_DATA_TYPE* B, const printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj); printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); ierr = 0; - for (j=0; j epsilon) { + for (j=0; jstreamArraySize; j++) { + if (abs(data.B[j]/bj-1.0) > epsilon) { ierr++; } } @@ -158,8 +219,8 @@ double checkSTREAMResult(const HOST_DATA_TYPE* A, const HOST_DATA_TYPE* B, const printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj); printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); ierr = 0; - for (j=0; j epsilon) { + for (j=0; jstreamArraySize; j++) { + if (abs(data.C[j]/cj-1.0) > epsilon) { ierr++; } } @@ -167,6 +228,7 @@ double checkSTREAMResult(const HOST_DATA_TYPE* A, const HOST_DATA_TYPE* B, const } if (err == 0) { printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon); + return true; } - return err; + return false; } \ No newline at end of file diff --git a/STREAM/src/host/stream_functionality.hpp b/STREAM/src/host/stream_functionality.hpp index 41e7e066..64b0fa09 100644 --- a/STREAM/src/host/stream_functionality.hpp +++ b/STREAM/src/host/stream_functionality.hpp @@ -20,50 +20,71 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#ifndef SRC_HOST_NETWORK_FUNCTIONALITY_H_ -#define SRC_HOST_NETWORK_FUNCTIONALITY_H_ +#ifndef SRC_HOST_STREAM_BENCHMARK_H_ +#define SRC_HOST_STREAM_BENCHMARK_H_ /* C++ standard library headers */ #include #include /* Project's headers */ -#include "execution.h" -#include "cxxopts.hpp" -#include "setup/fpga_setup.hpp" +#include "hpcc_benchmark.hpp" #include "parameters.h" -/** -Prints the execution results to stdout +class StreamProgramSettings : public hpcc_base::BaseSettings { -@param results The execution results -*/ -void -printResults(std::shared_ptr results); - - -/** - * Fill the data buffer with random number using the mersenne twister engine with - * seed 0. - * - * @param data Data array that has to be filled - * @param size Size of the data array that has to be filled - */ -void generateInputData(HOST_DATA_TYPE* A, HOST_DATA_TYPE* B, HOST_DATA_TYPE* C, unsigned array_size); - - -/** - * Checks the calculation error of an FFt calculation by calculating the inverse FFT on the result data - * and calculating the residual with abs(x - x')/(epsilon * log(FFT_SIZE)). 
- * - * @param verify_data The input data of the FFT calculation - * @param result_data Result of the FFT calculation - * @param iterations Number data iterations (total data size should be iterations * FFT_SIZE) - * @return the residual error of the calculation - */ -double checkSTREAMResult(const HOST_DATA_TYPE* A, const HOST_DATA_TYPE* B, const HOST_DATA_TYPE* C, unsigned repetitions, - unsigned array_size); - - -#endif // SRC_HOST_NETWORK_FUNCTIONALITY_H_ +public: + uint streamArraySize; + uint kernelReplications; + bool useSingleKernel; + + StreamProgramSettings(cxxopts::ParseResult &results); + +}; + +std::ostream& operator<<(std::ostream& os, StreamProgramSettings const& printedSettings); + + +class StreamData { + +public: + HOST_DATA_TYPE *A, *B, *C; + StreamData(HOST_DATA_TYPE *A,HOST_DATA_TYPE *B,HOST_DATA_TYPE *C) : A(A), B(B), C(C) {} + StreamData(StreamData *d) : A(d->A), B(d->B), C(d->C) {} + +}; + +class StreamExecutionTimings { +public: + std::map> timings; + uint arraySize; +}; + +class StreamBenchmark : public hpcc_base::HpccFpgaBenchmark { + +protected: + + std::shared_ptr + generateInputData(const hpcc_base::ExecutionSettings &settings) override; + + std::shared_ptr + executeKernel(const hpcc_base::ExecutionSettings &settings, StreamData &data) override; + + bool + validateOutputAndPrintError(const hpcc_base::ExecutionSettings &settings ,StreamData &data, const StreamExecutionTimings &output) override; + + void + printResults(const hpcc_base::ExecutionSettings &settings, const StreamExecutionTimings &output) override; + + void + addAdditionalParseOptions(cxxopts::Options &options) override; + +public: + + StreamBenchmark(int argc, char* argv[]); + +}; + + +#endif // SRC_HOST_STREAM_BENCHMARK_H_ diff --git a/STREAM/tests/CMakeLists.txt b/STREAM/tests/CMakeLists.txt index df89235e..81dba6a3 100755 --- a/STREAM/tests/CMakeLists.txt +++ b/STREAM/tests/CMakeLists.txt @@ -5,7 +5,7 @@ include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) 
include_directories(${CMAKE_BINARY_DIR}/src/common ../../extern/cxxopts/include ../../shared) include_directories(${CMAKE_SOURCE_DIR}/src/host/) -set(PROJECT_SOURCES ../../shared/setup/fpga_setup.cpp ../src/host/execution_default.cpp ../src/host/common_benchmark_io_implementation.cpp ../src/host/stream_functionality.cpp) +set(PROJECT_SOURCES ../src/host/execution_default.cpp ../src/host/stream_functionality.cpp) set(TEST_SOURCES ../../shared/setup/test_fpga_setup.cpp test_kernel_functionality_and_host_integration.cpp ../../shared/testing/main.cpp) if (INTELFPGAOPENCL_FOUND) diff --git a/shared/hpcc_benchmark.hpp b/shared/hpcc_benchmark.hpp new file mode 100644 index 00000000..f5023fd6 --- /dev/null +++ b/shared/hpcc_benchmark.hpp @@ -0,0 +1,245 @@ +/* +Copyright (c) 2020 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ +#ifndef SHARED_HPCC_BENCHMAKR_HPP_ +#define SHARED_HPCC_BENCHMAKR_HPP_ + +/* Project's headers */ +#include "setup/fpga_setup.hpp" +#include "cxxopts.hpp" +#include "parameters.h" + +/* External library headers */ +#include "CL/cl.hpp" + +#define STR_EXPAND(tok) #tok +#define STR(tok) STR_EXPAND(tok) + +#define ENTRY_SPACE 15 + + +namespace hpcc_base { + +class BaseSettings { + +public: + + uint numRepetitions; + bool useMemoryInterleaving; + int defaultPlatform; + int defaultDevice; + std::string kernelFileName; + + BaseSettings(cxxopts::ParseResult &results) : numRepetitions(results["n"].as()), + useMemoryInterleaving(static_cast(results.count("i"))), + defaultPlatform(results["platform"].as()), + defaultDevice(results["device"].as()), + kernelFileName(results["f"].as()) {} + +}; + +std::ostream& operator<<(std::ostream& os, BaseSettings const& printedBaseSettings){ + return (os << "Data Type: " << STR(HOST_DATA_TYPE) + << std::endl + << "Repetitions: " << printedBaseSettings.numRepetitions + << std::endl + << "Kernel File: " << printedBaseSettings.kernelFileName + << std::endl); + } + +template +class ExecutionSettings { +public: + cl::Device device; + cl::Context context; + cl::Program program; + std::shared_ptr programSettings; + + ExecutionSettings(const std::shared_ptr programSettings_, cl::Device device_,cl::Context context_,cl::Program program_): + programSettings(programSettings_), device(device_), context(context_), program(program_) {} + +}; + +template +std::ostream& operator<<(std::ostream& os, ExecutionSettings const& printedExecutionSettings){ + return os << &(printedExecutionSettings.programSettings) + << "Device: " + //<< printedExecutionSettings.device.getInfo() + << std::endl; + } + +template +class HpccFpgaBenchmark { + +private: + bool isSetupExecuted = false; + ExecutionSettings executionSettings; + +protected: + + virtual std::shared_ptr + generateInputData(const ExecutionSettings &settings); + + virtual std::shared_ptr + 
executeKernel(const ExecutionSettings &settings, TData &data); + + virtual bool + validateOutputAndPrintError(const ExecutionSettings &settings ,TData &data, const TOutput &output); + + virtual void + printResults(const ExecutionSettings &settings, const TOutput &output); + + virtual void + addAdditionalParseOptions(cxxopts::Options &options) {} + + /** + * Parses and returns program options using the cxxopts library. + * The parsed parameters are depending on the benchmark that is implementing + * function. + * The header file is used to specify a unified interface so it can also be used + * in the testing binary. + * cxxopts is used to parse the parameters. + * @see https://github.com/jarro2783/cxxopts + * + * @param argc Number of input parameters as it is provided by the main function + * @param argv Strings containing the input parameters as provided by the main function + * + * @return program settings that are created from the given program arguments + */ + std::shared_ptr + parseProgramParameters(int argc, char *argv[]) { + // Defining and parsing program options + cxxopts::Options options(argv[0], PROGRAM_DESCRIPTION); + options.add_options() + ("f,file", "Kernel file name", cxxopts::value()) + ("n", "Number of repetitions", + cxxopts::value()->default_value(std::to_string(DEFAULT_REPETITIONS))) + ("i", "Use memory Interleaving") + ("device", "Index of the device that has to be used. If not given you "\ + "will be asked which device to use if there are multiple devices "\ + "available.", cxxopts::value()->default_value(std::to_string(DEFAULT_DEVICE))) + ("platform", "Index of the platform that has to be used. 
If not given "\ + "you will be asked which platform to use if there are multiple "\ + "platforms available.", + cxxopts::value()->default_value(std::to_string(DEFAULT_PLATFORM))) + ("h,help", "Print this help"); + + + addAdditionalParseOptions(&options); + cxxopts::ParseResult result = options.parse(argc, argv); + + if (result.count("h")) { + // Just print help when argument is given + std::cout << options.help() << std::endl; + exit(0); + } + // Check parsed options and handle special cases + if (result.count("f") <= 0) { + // Path to the kernel file is mandatory - exit if not given! + std::cerr << "Kernel file must be given! Aborting" << std::endl; + std::cout << options.help() << std::endl; + exit(1); + } + + // Create program settings from program arguments + std::shared_ptr sharedSettings( + new TSettings(result)); + return sharedSettings; + } + + /** + * Prints the used configuration to std out before starting the actual benchmark. + * + * @param executionSettings The program settings that are parsed from the command line + * using parseProgramParameters and extended by the used OpenCL + * context, program and device + */ + void + printFinalConfiguration(const ExecutionSettings executionSettings) { + std::cout << PROGRAM_DESCRIPTION << std::endl; + std::cout << "Summary:" << std::endl; + std::cout << executionSettings << std::endl; + } + +public: + + /** + * @brief Selects and prepares the target device and prints the final configuration + * before executing the benchmark + * + * @param argc Number of input parameters as it is provided by the main function + * @param argv Strings containing the input parameters as provided by the main function + */ + void + setupBenchmark(int argc, char *argv[]) { + std::shared_ptr programSettings = parseProgramParameters(argc, argv); + fpga_setup::setupEnvironmentAndClocks(); + cl::Device usedDevice = + fpga_setup::selectFPGADevice(programSettings->defaultPlatform, + programSettings->defaultDevice); + cl::Context context = 
cl::Context(usedDevice); + cl::Program program = fpga_setup::fpgaSetup(&context, {usedDevice}, + &programSettings->kernelFileName); + + executionSettings = ExecutionSettings(programSettings, usedDevice, context, program); + + printFinalConfiguration(executionSettings); + isSetupExecuted = true; + } + + bool + executeBenchmark() { + if (!isSetupExecuted) { + std::cerr << "Benchmark execution started without running the benchmark setup!" << std::endl; + exit(1); + } + std::cout << HLINE << "Start benchmark using the given configuration. Generating data..." << std::endl + << HLINE; + std::shared_ptr data = generateInputData(&executionSettings); + std::cout << HLINE << "Execute benchmar kernel..." << std::endl + << HLINE; + std::shared_ptr output = executeKernel(&executionSettings, &data); + + std::cout << HLINE << "Validate output..." << std::endl + << HLINE; + + bool validateSuccess = validateOutputAndPrintError(&executionSettings , &data, &output); + + printResults(&executionSettings, &output); + + std::cout << HLINE << "Cleaning up." << std::endl + << HLINE; + + delete data; + delete output; + + return validateSuccess; + } + + HpccFpgaBenchmark(int argc, char *argv[]) { + setupBenchmark(argc, argv); + } + +}; + +} // namespace hpcc_base + +#endif diff --git a/shared/setup/fpga_setup.cpp b/shared/setup/fpga_setup.cpp deleted file mode 100644 index 41a8739e..00000000 --- a/shared/setup/fpga_setup.cpp +++ /dev/null @@ -1,304 +0,0 @@ -// -// Created by Marius Meyer on 04.12.19. 
-// - -#include "fpga_setup.hpp" - -#include -#include -#include -#include -#include -#include - -/* External libraries */ -#include "parameters.h" - -#ifdef _USE_MPI_ -#include "mpi.h" -#endif - -namespace fpga_setup { - -/** -Converts the reveived OpenCL error to a string - -@param err The OpenCL error code - -@return The string representation of the OpenCL error code -*/ - std::string - getCLErrorString(cl_int const err) { - switch (err) { - CL_ERR_TO_STR(CL_DEVICE_NOT_FOUND); - CL_ERR_TO_STR(CL_DEVICE_NOT_AVAILABLE); - CL_ERR_TO_STR(CL_COMPILER_NOT_AVAILABLE); - CL_ERR_TO_STR(CL_MEM_OBJECT_ALLOCATION_FAILURE); - CL_ERR_TO_STR(CL_OUT_OF_RESOURCES); - CL_ERR_TO_STR(CL_OUT_OF_HOST_MEMORY); - CL_ERR_TO_STR(CL_PROFILING_INFO_NOT_AVAILABLE); - CL_ERR_TO_STR(CL_MEM_COPY_OVERLAP); - CL_ERR_TO_STR(CL_IMAGE_FORMAT_MISMATCH); - CL_ERR_TO_STR(CL_IMAGE_FORMAT_NOT_SUPPORTED); - CL_ERR_TO_STR(CL_BUILD_PROGRAM_FAILURE); - CL_ERR_TO_STR(CL_MAP_FAILURE); - CL_ERR_TO_STR(CL_MISALIGNED_SUB_BUFFER_OFFSET); - CL_ERR_TO_STR(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST); - CL_ERR_TO_STR(CL_KERNEL_ARG_INFO_NOT_AVAILABLE); - CL_ERR_TO_STR(CL_INVALID_VALUE); - CL_ERR_TO_STR(CL_INVALID_DEVICE_TYPE); - CL_ERR_TO_STR(CL_INVALID_PLATFORM); - CL_ERR_TO_STR(CL_INVALID_DEVICE); - CL_ERR_TO_STR(CL_INVALID_CONTEXT); - CL_ERR_TO_STR(CL_INVALID_QUEUE_PROPERTIES); - CL_ERR_TO_STR(CL_INVALID_COMMAND_QUEUE); - CL_ERR_TO_STR(CL_INVALID_HOST_PTR); - CL_ERR_TO_STR(CL_INVALID_MEM_OBJECT); - CL_ERR_TO_STR(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR); - CL_ERR_TO_STR(CL_INVALID_IMAGE_SIZE); - CL_ERR_TO_STR(CL_INVALID_SAMPLER); - CL_ERR_TO_STR(CL_INVALID_BINARY); - CL_ERR_TO_STR(CL_INVALID_BUILD_OPTIONS); - CL_ERR_TO_STR(CL_INVALID_PROGRAM); - CL_ERR_TO_STR(CL_INVALID_PROGRAM_EXECUTABLE); - CL_ERR_TO_STR(CL_INVALID_KERNEL_NAME); - CL_ERR_TO_STR(CL_INVALID_KERNEL_DEFINITION); - CL_ERR_TO_STR(CL_INVALID_KERNEL); - CL_ERR_TO_STR(CL_INVALID_ARG_INDEX); - CL_ERR_TO_STR(CL_INVALID_ARG_VALUE); - 
CL_ERR_TO_STR(CL_INVALID_ARG_SIZE); - CL_ERR_TO_STR(CL_INVALID_KERNEL_ARGS); - CL_ERR_TO_STR(CL_INVALID_WORK_DIMENSION); - CL_ERR_TO_STR(CL_INVALID_WORK_GROUP_SIZE); - CL_ERR_TO_STR(CL_INVALID_WORK_ITEM_SIZE); - CL_ERR_TO_STR(CL_INVALID_GLOBAL_OFFSET); - CL_ERR_TO_STR(CL_INVALID_EVENT_WAIT_LIST); - CL_ERR_TO_STR(CL_INVALID_EVENT); - CL_ERR_TO_STR(CL_INVALID_OPERATION); - CL_ERR_TO_STR(CL_INVALID_GL_OBJECT); - CL_ERR_TO_STR(CL_INVALID_BUFFER_SIZE); - CL_ERR_TO_STR(CL_INVALID_MIP_LEVEL); - CL_ERR_TO_STR(CL_INVALID_GLOBAL_WORK_SIZE); - CL_ERR_TO_STR(CL_COMPILE_PROGRAM_FAILURE); - CL_ERR_TO_STR(CL_LINKER_NOT_AVAILABLE); - CL_ERR_TO_STR(CL_LINK_PROGRAM_FAILURE); - CL_ERR_TO_STR(CL_DEVICE_PARTITION_FAILED); - CL_ERR_TO_STR(CL_INVALID_PROPERTY); - CL_ERR_TO_STR(CL_INVALID_IMAGE_DESCRIPTOR); - CL_ERR_TO_STR(CL_INVALID_COMPILER_OPTIONS); - CL_ERR_TO_STR(CL_INVALID_LINKER_OPTIONS); - CL_ERR_TO_STR(CL_INVALID_DEVICE_PARTITION_COUNT); - - default: - return "UNKNOWN ERROR CODE"; - } - } - -/** -Check the OpenCL return code for errors. -If an error is detected, it will be printed and the programm execution is -stopped. - -@param err The OpenCL error code -*/ - void - handleClReturnCode(cl_int const err, std::string const file, - int const line) { - if (err != CL_SUCCESS) { - std::string err_string = getCLErrorString(err); - std::cerr << "ERROR in OpenCL library detected! Aborting." - << std::endl << file << ":" << line << ": " << err_string - << std::endl; - exit(err); - } - } - -/** -Sets up the given FPGA with the kernel in the provided file. 
- -@param context The context used for the program -@param program The devices used for the program -@param usedKernelFile The path to the kernel file -@return The program that is used to create the benchmark kernels -*/ - cl::Program - fpgaSetup(const cl::Context *context, std::vector deviceList, - const std::string *usedKernelFile) { - int err; - int world_rank = 0; - int world_size = 0; - -#ifdef _USE_MPI_ - MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); - MPI_Comm_size(MPI_COMM_WORLD, &world_size); -#endif - - if (world_rank == 0) { - std::cout << HLINE; - std::cout << "FPGA Setup:" << usedKernelFile->c_str() << std::endl; - } - - // Open file stream if possible - std::ifstream aocxStream(usedKernelFile->c_str(), std::ifstream::binary); - if (!aocxStream.is_open()) { - std::cerr << "Not possible to open from given file!" << std::endl; - } - - // Read in file contents and create program from binaries - std::string prog(std::istreambuf_iterator(aocxStream), - (std::istreambuf_iterator())); - aocxStream.seekg(0, aocxStream.end); - unsigned file_size = aocxStream.tellg(); - aocxStream.seekg(0, aocxStream.beg); - char *buf = new char[file_size]; - aocxStream.read(buf, file_size); - - cl::Program::Binaries mybinaries; - mybinaries.push_back({buf, file_size}); - - // Create the Program from the AOCX file. - cl::Program program(*context, deviceList, mybinaries, NULL, &err); - ASSERT_CL(err); - if (world_rank == 0) { - std::cout << "Prepared FPGA successfully for global Execution!" << - std::endl; - std::cout << HLINE; - } - return program; - } - -/** -Sets up the C++ environment by configuring std::cout and checking the clock -granularity using bm_helper::checktick() -*/ - void - setupEnvironmentAndClocks() { - std::cout << std::setprecision(5) << std::scientific; - - std::cout << HLINE; - std::cout << "General setup:" << std::endl; - - // Check clock granularity and output result - std::cout << "C++ high resolution clock is used." 
<< std::endl; - std::cout << "The clock precision seems to be " - << static_cast - (std::chrono::high_resolution_clock::period::num) / - std::chrono::high_resolution_clock::period::den * 10e9 - << "ns" << std::endl; - - std::cout << HLINE; - } - - -/** -Searches an selects an FPGA device using the CL library functions. -If multiple platforms or devices are given, the user will be prompted to -choose a device. - -@param defaultPlatform The index of the platform that has to be used. If a - value < 0 is given, the platform can be chosen - interactively -@param defaultDevice The index of the device that has to be used. If a - value < 0 is given, the device can be chosen - interactively - -@return A list containing a single selected device -*/ - std::vector - selectFPGADevice(int defaultPlatform, int defaultDevice) { - // Integer used to store return codes of OpenCL library calls - int err; - - int world_rank = 0; - int world_size = 0; - -#ifdef _USE_MPI_ - MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); - MPI_Comm_size(MPI_COMM_WORLD, &world_size); -#endif - - std::vector platformList; - err = cl::Platform::get(&platformList); - ASSERT_CL(err); - - // Choose the target platform - int chosenPlatformId = 0; - if (defaultPlatform >= 0) { - if (defaultPlatform < platformList.size()) { - chosenPlatformId = defaultPlatform; - } else { - std::cerr << "Default platform " << defaultPlatform - << " can not be used. Available platforms: " - << platformList.size() << std::endl; - exit(1); - } - } else if (platformList.size() > 1 && world_size == 1) { - std::cout << - "Multiple platforms have been found. 
Select the platform by"\ - " typing a number:" << std::endl; - for (int platformId = 0; - platformId < platformList.size(); platformId++) { - std::cout << platformId << ") " << - platformList[platformId].getInfo() << - std::endl; - } - std::cout << "Enter platform id [0-" << platformList.size() - 1 - << "]:"; - std::cin >> chosenPlatformId; - } - cl::Platform platform = platformList[chosenPlatformId]; - if (world_rank == 0) { - std::cout << "Selected Platform: " - << platform.getInfo() << std::endl; - } - std::vector deviceList; - err = platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &deviceList); - ASSERT_CL(err); - - // Choose taget device - int chosenDeviceId = 0; - if (defaultDevice >= 0) { - if (defaultDevice < deviceList.size()) { - chosenDeviceId = defaultDevice; - } else { - std::cerr << "Default device " << defaultDevice - << " can not be used. Available devices: " - << deviceList.size() << std::endl; - exit(1); - } - } else if (deviceList.size() > 1) { - if (world_size == 1) { - std::cout << - "Multiple devices have been found. 
Select the platform by"\ - " typing a number:" << std::endl; - - for (int deviceId = 0; - deviceId < deviceList.size(); deviceId++) { - std::cout << deviceId << ") " << - deviceList[deviceId].getInfo() << - std::endl; - } - std::cout << "Enter device id [0-" << deviceList.size() - 1 << "]:"; - std::cin >> chosenDeviceId; - } else { - chosenDeviceId = static_cast(world_rank % deviceList.size()); - } - } - std::vector chosenDeviceList; - chosenDeviceList.push_back(deviceList[chosenDeviceId]); - - if (world_rank == 0) { - // Give selection summary - std::cout << HLINE; - std::cout << "Selection summary:" << std::endl; - std::cout << "Platform Name: " << - platform.getInfo() << std::endl; - std::cout << "Device Name: " << - chosenDeviceList[0].getInfo() << std::endl; - std::cout << HLINE; - } - - return chosenDeviceList; - } - -} // namespace fpga_setup \ No newline at end of file diff --git a/shared/setup/fpga_setup.hpp b/shared/setup/fpga_setup.hpp index c1f78b0c..5318dae2 100644 --- a/shared/setup/fpga_setup.hpp +++ b/shared/setup/fpga_setup.hpp @@ -1,26 +1,8 @@ -/* -Copyright (c) 2019 Marius Meyer - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ -#ifndef SRC_HOST_FPGA_SETUP_H_ -#define SRC_HOST_FPGA_SETUP_H_ +// +// Created by Marius Meyer on 04.12.19. +// + +#include "fpga_setup.hpp" #include #include @@ -29,18 +11,19 @@ SOFTWARE. #include #include + /* External libraries */ #include "CL/cl.hpp" -/** -Makro to convert the error integer representation to its string representation -Source: https://gist.github.com/allanmac/9328bb2d6a99b86883195f8f78fd1b93 -*/ -#define CL_ERR_TO_STR(err) case err: return #err +#include "parameters.h" + +#ifdef _USE_MPI_ +#include "mpi.h" +#endif namespace fpga_setup { - /** +/** Converts the reveived OpenCL error to a string @param err The OpenCL error code @@ -48,7 +31,71 @@ Converts the reveived OpenCL error to a string @return The string representation of the OpenCL error code */ std::string - getCLErrorString(cl_int const err); + getCLErrorString(cl_int const err) { + switch (err) { + CL_ERR_TO_STR(CL_DEVICE_NOT_FOUND); + CL_ERR_TO_STR(CL_DEVICE_NOT_AVAILABLE); + CL_ERR_TO_STR(CL_COMPILER_NOT_AVAILABLE); + CL_ERR_TO_STR(CL_MEM_OBJECT_ALLOCATION_FAILURE); + CL_ERR_TO_STR(CL_OUT_OF_RESOURCES); + CL_ERR_TO_STR(CL_OUT_OF_HOST_MEMORY); + CL_ERR_TO_STR(CL_PROFILING_INFO_NOT_AVAILABLE); + CL_ERR_TO_STR(CL_MEM_COPY_OVERLAP); + CL_ERR_TO_STR(CL_IMAGE_FORMAT_MISMATCH); + CL_ERR_TO_STR(CL_IMAGE_FORMAT_NOT_SUPPORTED); + CL_ERR_TO_STR(CL_BUILD_PROGRAM_FAILURE); + CL_ERR_TO_STR(CL_MAP_FAILURE); + CL_ERR_TO_STR(CL_MISALIGNED_SUB_BUFFER_OFFSET); + CL_ERR_TO_STR(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST); + CL_ERR_TO_STR(CL_KERNEL_ARG_INFO_NOT_AVAILABLE); + CL_ERR_TO_STR(CL_INVALID_VALUE); + CL_ERR_TO_STR(CL_INVALID_DEVICE_TYPE); + CL_ERR_TO_STR(CL_INVALID_PLATFORM); + CL_ERR_TO_STR(CL_INVALID_DEVICE); + 
CL_ERR_TO_STR(CL_INVALID_CONTEXT); + CL_ERR_TO_STR(CL_INVALID_QUEUE_PROPERTIES); + CL_ERR_TO_STR(CL_INVALID_COMMAND_QUEUE); + CL_ERR_TO_STR(CL_INVALID_HOST_PTR); + CL_ERR_TO_STR(CL_INVALID_MEM_OBJECT); + CL_ERR_TO_STR(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR); + CL_ERR_TO_STR(CL_INVALID_IMAGE_SIZE); + CL_ERR_TO_STR(CL_INVALID_SAMPLER); + CL_ERR_TO_STR(CL_INVALID_BINARY); + CL_ERR_TO_STR(CL_INVALID_BUILD_OPTIONS); + CL_ERR_TO_STR(CL_INVALID_PROGRAM); + CL_ERR_TO_STR(CL_INVALID_PROGRAM_EXECUTABLE); + CL_ERR_TO_STR(CL_INVALID_KERNEL_NAME); + CL_ERR_TO_STR(CL_INVALID_KERNEL_DEFINITION); + CL_ERR_TO_STR(CL_INVALID_KERNEL); + CL_ERR_TO_STR(CL_INVALID_ARG_INDEX); + CL_ERR_TO_STR(CL_INVALID_ARG_VALUE); + CL_ERR_TO_STR(CL_INVALID_ARG_SIZE); + CL_ERR_TO_STR(CL_INVALID_KERNEL_ARGS); + CL_ERR_TO_STR(CL_INVALID_WORK_DIMENSION); + CL_ERR_TO_STR(CL_INVALID_WORK_GROUP_SIZE); + CL_ERR_TO_STR(CL_INVALID_WORK_ITEM_SIZE); + CL_ERR_TO_STR(CL_INVALID_GLOBAL_OFFSET); + CL_ERR_TO_STR(CL_INVALID_EVENT_WAIT_LIST); + CL_ERR_TO_STR(CL_INVALID_EVENT); + CL_ERR_TO_STR(CL_INVALID_OPERATION); + CL_ERR_TO_STR(CL_INVALID_GL_OBJECT); + CL_ERR_TO_STR(CL_INVALID_BUFFER_SIZE); + CL_ERR_TO_STR(CL_INVALID_MIP_LEVEL); + CL_ERR_TO_STR(CL_INVALID_GLOBAL_WORK_SIZE); + CL_ERR_TO_STR(CL_COMPILE_PROGRAM_FAILURE); + CL_ERR_TO_STR(CL_LINKER_NOT_AVAILABLE); + CL_ERR_TO_STR(CL_LINK_PROGRAM_FAILURE); + CL_ERR_TO_STR(CL_DEVICE_PARTITION_FAILED); + CL_ERR_TO_STR(CL_INVALID_PROPERTY); + CL_ERR_TO_STR(CL_INVALID_IMAGE_DESCRIPTOR); + CL_ERR_TO_STR(CL_INVALID_COMPILER_OPTIONS); + CL_ERR_TO_STR(CL_INVALID_LINKER_OPTIONS); + CL_ERR_TO_STR(CL_INVALID_DEVICE_PARTITION_COUNT); + + default: + return "UNKNOWN ERROR CODE"; + } + } /** Check the OpenCL return code for errors. @@ -59,13 +106,15 @@ stopped. */ void handleClReturnCode(cl_int const err, std::string const file, - int const line); - -/** -Makro that enables checks for OpenCL errors with handling of the file and -line number. 
-*/ -#define ASSERT_CL(err) fpga_setup::handleClReturnCode(err, __FILE__, __LINE__) + int const line) { + if (err != CL_SUCCESS) { + std::string err_string = getCLErrorString(err); + std::cerr << "ERROR in OpenCL library detected! Aborting." + << std::endl << file << ":" << line << ": " << err_string + << std::endl; + exit(err); + } + } /** Sets up the given FPGA with the kernel in the provided file. @@ -77,14 +126,71 @@ Sets up the given FPGA with the kernel in the provided file. */ cl::Program fpgaSetup(const cl::Context *context, std::vector deviceList, - const std::string *usedKernelFile); + const std::string *usedKernelFile) { + int err; + int world_rank = 0; + int world_size = 0; + +#ifdef _USE_MPI_ + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + MPI_Comm_size(MPI_COMM_WORLD, &world_size); +#endif + + if (world_rank == 0) { + std::cout << HLINE; + std::cout << "FPGA Setup:" << usedKernelFile->c_str() << std::endl; + } + + // Open file stream if possible + std::ifstream aocxStream(usedKernelFile->c_str(), std::ifstream::binary); + if (!aocxStream.is_open()) { + std::cerr << "Not possible to open from given file!" << std::endl; + } + + // Read in file contents and create program from binaries + std::string prog(std::istreambuf_iterator(aocxStream), + (std::istreambuf_iterator())); + aocxStream.seekg(0, aocxStream.end); + unsigned file_size = aocxStream.tellg(); + aocxStream.seekg(0, aocxStream.beg); + char *buf = new char[file_size]; + aocxStream.read(buf, file_size); + + cl::Program::Binaries mybinaries; + mybinaries.push_back({buf, file_size}); + + // Create the Program from the AOCX file. + cl::Program program(*context, deviceList, mybinaries, NULL, &err); + ASSERT_CL(err); + if (world_rank == 0) { + std::cout << "Prepared FPGA successfully for global Execution!" 
<< + std::endl; + std::cout << HLINE; + } + return program; + } /** Sets up the C++ environment by configuring std::cout and checking the clock granularity using bm_helper::checktick() */ void - setupEnvironmentAndClocks(); + setupEnvironmentAndClocks() { + std::cout << std::setprecision(5) << std::scientific; + + std::cout << HLINE; + std::cout << "General setup:" << std::endl; + + // Check clock granularity and output result + std::cout << "C++ high resolution clock is used." << std::endl; + std::cout << "The clock precision seems to be " + << static_cast + (std::chrono::high_resolution_clock::period::num) / + std::chrono::high_resolution_clock::period::den * 10e9 + << "ns" << std::endl; + + std::cout << HLINE; + } /** @@ -102,7 +208,100 @@ choose a device. @return A list containing a single selected device */ std::vector - selectFPGADevice(int defaultPlatform, int defaultDevice); + selectFPGADevice(int defaultPlatform, int defaultDevice) { + // Integer used to store return codes of OpenCL library calls + int err; + + int world_rank = 0; + int world_size = 0; + +#ifdef _USE_MPI_ + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + MPI_Comm_size(MPI_COMM_WORLD, &world_size); +#endif + + std::vector platformList; + err = cl::Platform::get(&platformList); + ASSERT_CL(err); + + // Choose the target platform + int chosenPlatformId = 0; + if (defaultPlatform >= 0) { + if (defaultPlatform < platformList.size()) { + chosenPlatformId = defaultPlatform; + } else { + std::cerr << "Default platform " << defaultPlatform + << " can not be used. Available platforms: " + << platformList.size() << std::endl; + exit(1); + } + } else if (platformList.size() > 1 && world_size == 1) { + std::cout << + "Multiple platforms have been found. 
Select the platform by"\ + " typing a number:" << std::endl; + for (int platformId = 0; + platformId < platformList.size(); platformId++) { + std::cout << platformId << ") " << + platformList[platformId].getInfo() << + std::endl; + } + std::cout << "Enter platform id [0-" << platformList.size() - 1 + << "]:"; + std::cin >> chosenPlatformId; + } + cl::Platform platform = platformList[chosenPlatformId]; + if (world_rank == 0) { + std::cout << "Selected Platform: " + << platform.getInfo() << std::endl; + } + std::vector deviceList; + err = platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &deviceList); + ASSERT_CL(err); + + // Choose taget device + int chosenDeviceId = 0; + if (defaultDevice >= 0) { + if (defaultDevice < deviceList.size()) { + chosenDeviceId = defaultDevice; + } else { + std::cerr << "Default device " << defaultDevice + << " can not be used. Available devices: " + << deviceList.size() << std::endl; + exit(1); + } + } else if (deviceList.size() > 1) { + if (world_size == 1) { + std::cout << + "Multiple devices have been found. 
Select the platform by"\ + " typing a number:" << std::endl; + + for (int deviceId = 0; + deviceId < deviceList.size(); deviceId++) { + std::cout << deviceId << ") " << + deviceList[deviceId].getInfo() << + std::endl; + } + std::cout << "Enter device id [0-" << deviceList.size() - 1 << "]:"; + std::cin >> chosenDeviceId; + } else { + chosenDeviceId = static_cast(world_rank % deviceList.size()); + } + } + std::vector chosenDeviceList; + chosenDeviceList.push_back(deviceList[chosenDeviceId]); + + if (world_rank == 0) { + // Give selection summary + std::cout << HLINE; + std::cout << "Selection summary:" << std::endl; + std::cout << "Platform Name: " << + platform.getInfo() << std::endl; + std::cout << "Device Name: " << + chosenDeviceList[0].getInfo() << std::endl; + std::cout << HLINE; + } + + return chosenDeviceList; + } -} // namespace fpga_setup -#endif // SRC_HOST_FPGA_SETUP_H_ +} // namespace fpga_setup \ No newline at end of file From 5cba5d6089f72ee226f96567c14d7c037f410a62 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 14 May 2020 21:02:37 +0200 Subject: [PATCH 14/45] Finalize first version of the base library --- shared/CMakeLists.txt | 9 ++ shared/hpcc_benchmark.cpp | 9 ++ shared/{ => include}/hpcc_benchmark.hpp | 80 ++++++------- shared/include/setup/fpga_setup.hpp | 108 ++++++++++++++++++ .../setup/{fpga_setup.hpp => fpga_setup.cpp} | 17 +-- 5 files changed, 167 insertions(+), 56 deletions(-) create mode 100644 shared/CMakeLists.txt create mode 100644 shared/hpcc_benchmark.cpp rename shared/{ => include}/hpcc_benchmark.hpp (81%) create mode 100644 shared/include/setup/fpga_setup.hpp rename shared/setup/{fpga_setup.hpp => fpga_setup.cpp} (96%) diff --git a/shared/CMakeLists.txt b/shared/CMakeLists.txt new file mode 100644 index 00000000..0466dcea --- /dev/null +++ b/shared/CMakeLists.txt @@ -0,0 +1,9 @@ +project(HPCCBaseLibrary VERSION 1.0.0) + +add_library(hpcc_fpga_base STATIC ${CMAKE_CURRENT_SOURCE_DIR}/hpcc_benchmark.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp) + +target_include_directories(hpcc_fpga_base PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/../extern/cxxopts/include) + +install(TARGETS hpcc_fpga_base +PUBLIC_HEADER DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" +ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}") \ No newline at end of file diff --git a/shared/hpcc_benchmark.cpp b/shared/hpcc_benchmark.cpp new file mode 100644 index 00000000..464b6c9a --- /dev/null +++ b/shared/hpcc_benchmark.cpp @@ -0,0 +1,9 @@ + +#include "hpcc_benchmark.hpp" + +std::ostream& hpcc_base::operator<<(std::ostream& os, hpcc_base::BaseSettings const& printedBaseSettings){ + return (os << "Repetitions: " << printedBaseSettings.numRepetitions + << std::endl + << "Kernel File: " << printedBaseSettings.kernelFileName + << std::endl); +} diff --git a/shared/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp similarity index 81% rename from shared/hpcc_benchmark.hpp rename to shared/include/hpcc_benchmark.hpp index f5023fd6..4c904524 100644 --- a/shared/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -19,8 +19,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#ifndef SHARED_HPCC_BENCHMAKR_HPP_ -#define SHARED_HPCC_BENCHMAKR_HPP_ +#ifndef SHARED_HPCC_BENCHMARK_HPP_ +#define SHARED_HPCC_BENCHMARK_HPP_ /* Project's headers */ #include "setup/fpga_setup.hpp" @@ -56,15 +56,6 @@ class BaseSettings { }; -std::ostream& operator<<(std::ostream& os, BaseSettings const& printedBaseSettings){ - return (os << "Data Type: " << STR(HOST_DATA_TYPE) - << std::endl - << "Repetitions: " << printedBaseSettings.numRepetitions - << std::endl - << "Kernel File: " << printedBaseSettings.kernelFileName - << std::endl); - } - template class ExecutionSettings { public: @@ -75,40 +66,37 @@ class ExecutionSettings { ExecutionSettings(const std::shared_ptr programSettings_, cl::Device device_,cl::Context context_,cl::Program program_): programSettings(programSettings_), device(device_), context(context_), program(program_) {} + + ExecutionSettings(ExecutionSettings *s) : ExecutionSettings(s->programSettings, s->device, s->context, s->program) {} }; template -std::ostream& operator<<(std::ostream& os, ExecutionSettings const& printedExecutionSettings){ - return os << &(printedExecutionSettings.programSettings) - << "Device: " - //<< printedExecutionSettings.device.getInfo() - << std::endl; - } +std::ostream& operator<<(std::ostream& os, ExecutionSettings const& printedExecutionSettings); template class HpccFpgaBenchmark { private: bool isSetupExecuted = false; - ExecutionSettings executionSettings; + std::shared_ptr> executionSettings; protected: virtual std::shared_ptr - generateInputData(const ExecutionSettings &settings); + generateInputData(const ExecutionSettings &settings) = 0; virtual std::shared_ptr - executeKernel(const ExecutionSettings &settings, TData &data); + executeKernel(const ExecutionSettings &settings, TData &data) = 0; virtual bool - validateOutputAndPrintError(const ExecutionSettings &settings ,TData &data, const TOutput &output); + validateOutputAndPrintError(const ExecutionSettings &settings ,TData &data, const TOutput 
&output) = 0; virtual void - printResults(const ExecutionSettings &settings, const TOutput &output); + printResults(const ExecutionSettings &settings, const TOutput &output) = 0; virtual void - addAdditionalParseOptions(cxxopts::Options &options) {} + addAdditionalParseOptions(cxxopts::Options &options) = 0; /** * Parses and returns program options using the cxxopts library. @@ -143,7 +131,7 @@ class HpccFpgaBenchmark { ("h,help", "Print this help"); - addAdditionalParseOptions(&options); + addAdditionalParseOptions(options); cxxopts::ParseResult result = options.parse(argc, argv); if (result.count("h")) { @@ -173,7 +161,7 @@ class HpccFpgaBenchmark { * context, program and device */ void - printFinalConfiguration(const ExecutionSettings executionSettings) { + printFinalConfiguration(ExecutionSettings const& executionSettings) { std::cout << PROGRAM_DESCRIPTION << std::endl; std::cout << "Summary:" << std::endl; std::cout << executionSettings << std::endl; @@ -190,18 +178,18 @@ class HpccFpgaBenchmark { */ void setupBenchmark(int argc, char *argv[]) { - std::shared_ptr programSettings = parseProgramParameters(argc, argv); fpga_setup::setupEnvironmentAndClocks(); + std::shared_ptr programSettings = parseProgramParameters(argc, argv); cl::Device usedDevice = - fpga_setup::selectFPGADevice(programSettings->defaultPlatform, - programSettings->defaultDevice); + cl::Device(fpga_setup::selectFPGADevice(programSettings->defaultPlatform, + programSettings->defaultDevice)); cl::Context context = cl::Context(usedDevice); cl::Program program = fpga_setup::fpgaSetup(&context, {usedDevice}, &programSettings->kernelFileName); - executionSettings = ExecutionSettings(programSettings, usedDevice, context, program); + executionSettings = std::make_shared>(new ExecutionSettings(programSettings, usedDevice, context, program)); - printFinalConfiguration(executionSettings); + printFinalConfiguration(*executionSettings); isSetupExecuted = true; } @@ -213,33 +201,35 @@ class 
HpccFpgaBenchmark { } std::cout << HLINE << "Start benchmark using the given configuration. Generating data..." << std::endl << HLINE; - std::shared_ptr data = generateInputData(&executionSettings); - std::cout << HLINE << "Execute benchmar kernel..." << std::endl + std::shared_ptr data = generateInputData(*executionSettings); + std::cout << HLINE << "Execute benchmark kernel..." << std::endl << HLINE; - std::shared_ptr output = executeKernel(&executionSettings, &data); + std::shared_ptr output = executeKernel(*executionSettings, *data); std::cout << HLINE << "Validate output..." << std::endl << HLINE; - bool validateSuccess = validateOutputAndPrintError(&executionSettings , &data, &output); + bool validateSuccess = validateOutputAndPrintError(*executionSettings , *data, *output); - printResults(&executionSettings, &output); - - std::cout << HLINE << "Cleaning up." << std::endl - << HLINE; - - delete data; - delete output; + printResults(*executionSettings, *output); return validateSuccess; } - HpccFpgaBenchmark(int argc, char *argv[]) { - setupBenchmark(argc, argv); - } - }; +std::ostream& operator<<(std::ostream& os, BaseSettings const& printedBaseSettings); + +template +std::ostream& operator<<(std::ostream& os, ExecutionSettings const& printedExecutionSettings){ + std::string device_name; + printedExecutionSettings.device.getInfo(CL_DEVICE_NAME, &device_name); + return os << *printedExecutionSettings.programSettings + << "Device: " + << device_name + << std::endl; +} + } // namespace hpcc_base #endif diff --git a/shared/include/setup/fpga_setup.hpp b/shared/include/setup/fpga_setup.hpp new file mode 100644 index 00000000..92d24882 --- /dev/null +++ b/shared/include/setup/fpga_setup.hpp @@ -0,0 +1,108 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation 
the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_FPGA_SETUP_H_ +#define SRC_HOST_FPGA_SETUP_H_ + +#include +#include +#include +#include +#include +#include + +/* External libraries */ +#include "CL/cl.hpp" + +/** +Makro to convert the error integer representation to its string representation +Source: https://gist.github.com/allanmac/9328bb2d6a99b86883195f8f78fd1b93 +*/ +#define CL_ERR_TO_STR(err) case err: return #err + +namespace fpga_setup { + + /** +Converts the reveived OpenCL error to a string + +@param err The OpenCL error code + +@return The string representation of the OpenCL error code +*/ + std::string + getCLErrorString(cl_int const err); + +/** +Check the OpenCL return code for errors. +If an error is detected, it will be printed and the programm execution is +stopped. + +@param err The OpenCL error code +*/ + void + handleClReturnCode(cl_int const err, std::string const file, + int const line); + +/** +Makro that enables checks for OpenCL errors with handling of the file and +line number. +*/ +#define ASSERT_CL(err) fpga_setup::handleClReturnCode(err, __FILE__, __LINE__) + +/** +Sets up the given FPGA with the kernel in the provided file. 
+ +@param context The context used for the program +@param program The devices used for the program +@param usedKernelFile The path to the kernel file +@return The program that is used to create the benchmark kernels +*/ + cl::Program + fpgaSetup(const cl::Context *context, std::vector deviceList, + const std::string *usedKernelFile); + +/** +Sets up the C++ environment by configuring std::cout and checking the clock +granularity using bm_helper::checktick() +*/ + void + setupEnvironmentAndClocks(); + + +/** +Searches an selects an FPGA device using the CL library functions. +If multiple platforms or devices are given, the user will be prompted to +choose a device. + +@param defaultPlatform The index of the platform that has to be used. If a + value < 0 is given, the platform can be chosen + interactively +@param defaultDevice The index of the device that has to be used. If a + value < 0 is given, the device can be chosen + interactively + +@return A list containing a single selected device +*/ + cl::Device + selectFPGADevice(int defaultPlatform, int defaultDevice); + +} // namespace fpga_setup +#endif // SRC_HOST_FPGA_SETUP_H_ diff --git a/shared/setup/fpga_setup.hpp b/shared/setup/fpga_setup.cpp similarity index 96% rename from shared/setup/fpga_setup.hpp rename to shared/setup/fpga_setup.cpp index 5318dae2..6d95789f 100644 --- a/shared/setup/fpga_setup.hpp +++ b/shared/setup/fpga_setup.cpp @@ -2,7 +2,7 @@ // Created by Marius Meyer on 04.12.19. // -#include "fpga_setup.hpp" +#include "setup/fpga_setup.hpp" #include #include @@ -11,10 +11,7 @@ #include #include - /* External libraries */ -#include "CL/cl.hpp" - #include "parameters.h" #ifdef _USE_MPI_ @@ -207,7 +204,7 @@ choose a device. @return A list containing a single selected device */ - std::vector + cl::Device selectFPGADevice(int defaultPlatform, int defaultDevice) { // Integer used to store return codes of OpenCL library calls int err; @@ -264,8 +261,8 @@ choose a device. 
if (defaultDevice < deviceList.size()) { chosenDeviceId = defaultDevice; } else { - std::cerr << "Default device " << defaultDevice - << " can not be used. Available devices: " + std::cerr << "Default platform " << defaultDevice + << " can not be used. Available platforms: " << deviceList.size() << std::endl; exit(1); } @@ -287,8 +284,6 @@ choose a device. chosenDeviceId = static_cast(world_rank % deviceList.size()); } } - std::vector chosenDeviceList; - chosenDeviceList.push_back(deviceList[chosenDeviceId]); if (world_rank == 0) { // Give selection summary @@ -297,11 +292,11 @@ choose a device. std::cout << "Platform Name: " << platform.getInfo() << std::endl; std::cout << "Device Name: " << - chosenDeviceList[0].getInfo() << std::endl; + deviceList[chosenDeviceId].getInfo() << std::endl; std::cout << HLINE; } - return chosenDeviceList; + return deviceList[chosenDeviceId]; } } // namespace fpga_setup \ No newline at end of file From ed522e9e2709472a8e22a327dba23e14f0584eb4 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 14 May 2020 21:03:04 +0200 Subject: [PATCH 15/45] Adapt STREAM to new base library --- STREAM/src/host/CMakeLists.txt | 5 +- .../common_benchmark_io_implementation.cpp | 102 ------------------ .../src/host/{execution.h => execution.hpp} | 1 - STREAM/src/host/execution_default.cpp | 6 +- STREAM/src/host/program_settings.h | 35 ------ STREAM/src/host/stream_functionality.cpp | 12 ++- STREAM/src/host/stream_functionality.hpp | 3 - 7 files changed, 14 insertions(+), 150 deletions(-) delete mode 100644 STREAM/src/host/common_benchmark_io_implementation.cpp rename STREAM/src/host/{execution.h => execution.hpp} (98%) delete mode 100644 STREAM/src/host/program_settings.h diff --git a/STREAM/src/host/CMakeLists.txt b/STREAM/src/host/CMakeLists.txt index 9be0036f..25094b12 100755 --- a/STREAM/src/host/CMakeLists.txt +++ b/STREAM/src/host/CMakeLists.txt @@ -1,4 +1,5 @@ -include_directories(../../../extern/cxxopts/include ../../../shared) 
+add_subdirectory(../../../shared ${CMAKE_BINARY_DIR}/lib/hpccbase) +include_directories(${HPCCBaseLibrary_INCLUDE_DIRS}) set(HOST_SOURCE execution_default.cpp main.cpp stream_functionality.cpp) @@ -7,6 +8,7 @@ if (INTELFPGAOPENCL_FOUND) include_directories(${CMAKE_BINARY_DIR}/src/common) add_executable(STREAM_FPGA_intel ${HOST_SOURCE}) target_link_libraries(STREAM_FPGA_intel "${IntelFPGAOpenCL_LIBRARIES}" "${OpenMP_CXX_FLAGS}") + target_link_libraries(STREAM_FPGA_intel hpcc_fpga_base) if (USE_SVM) target_compile_definitions(STREAM_FPGA_intel PRIVATE -DCL_VERSION_2_0) endif() @@ -20,6 +22,7 @@ if (Vitis_FOUND) include_directories(${CMAKE_BINARY_DIR}/src/common) add_executable(STREAM_FPGA_xilinx ${HOST_SOURCE}) target_link_libraries(STREAM_FPGA_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") + target_link_libraries(STREAM_FPGA_xilinx hpcc_fpga_base) target_compile_definitions(STREAM_FPGA_xilinx PRIVATE -DXILINX_FPGA) target_compile_options(STREAM_FPGA_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") add_test(NAME test_xilinx_host_executable COMMAND $ -h) diff --git a/STREAM/src/host/common_benchmark_io_implementation.cpp b/STREAM/src/host/common_benchmark_io_implementation.cpp deleted file mode 100644 index 0c149f31..00000000 --- a/STREAM/src/host/common_benchmark_io_implementation.cpp +++ /dev/null @@ -1,102 +0,0 @@ - -#include "cxxopts.hpp" -#include "parameters.h" -#include "setup/common_benchmark_io.hpp" - -/** -Parses and returns program options using the cxxopts library. 
-Supports the following parameters: - - file name of the FPGA kernel file (-f,--file) - - number of repetitions (-n) - - number of kernel replications (-r) - - data size (-d) - - use memory interleaving -@see https://github.com/jarro2783/cxxopts - -@return program settings that are created from the given program arguments -*/ -std::shared_ptr -parseProgramParameters(int argc, char *argv[]) { - // Defining and parsing program options - cxxopts::Options options(argv[0], PROGRAM_DESCRIPTION); - options.add_options() - ("f,file", "Kernel file name", cxxopts::value()) - ("n", "Number of repetitions", - cxxopts::value()->default_value(std::to_string(DEFAULT_REPETITIONS))) - ("s", "Size of the data arrays", - cxxopts::value()->default_value(std::to_string(DEFAULT_ARRAY_LENGTH))) - ("r", "Number of kernel replications used", - cxxopts::value()->default_value(std::to_string(NUM_KERNEL_REPLICATIONS))) -#ifdef INTEL_FPGA - ("i", "Use memory Interleaving") -#endif - ("single-kernel", "Use the single kernel implementation") - ("device", "Index of the device that has to be used. If not given you "\ - "will be asked which device to use if there are multiple devices "\ - "available.", cxxopts::value()->default_value(std::to_string(DEFAULT_DEVICE))) - ("platform", "Index of the platform that has to be used. If not given "\ - "you will be asked which platform to use if there are multiple "\ - "platforms available.", - cxxopts::value()->default_value(std::to_string(DEFAULT_PLATFORM))) - ("h,help", "Print this help"); - cxxopts::ParseResult result = options.parse(argc, argv); - - if (result.count("h")) { - // Just print help when argument is given - std::cout << options.help() << std::endl; - exit(0); - } - // Check parsed options and handle special cases - if (result.count("f") <= 0) { - // Path to the kernel file is mandatory - exit if not given! - std::cerr << "Kernel file must be given! 
Aborting" << std::endl; - std::cout << options.help() << std::endl; - exit(1); - } - - // Create program settings from program arguments - std::shared_ptr sharedSettings( - new ProgramSettings{result["n"].as(), - result["s"].as(), - result["r"].as(), -#ifdef INTEL_FPGA - static_cast(result.count("i")), -#else - false, -#endif - result["platform"].as(), - result["device"].as(), - result["f"].as(), - static_cast(result.count("single-kernel"))}); - return sharedSettings; -} - -/** - * Prints the used configuration to std out before starting the actual benchmark. - * - * @param programSettings The program settings retrieved from the command line - * @param device The device used for execution - */ -void printFinalConfiguration(const std::shared_ptr &programSettings, - const cl::Device &device) {// Give setup summary - std::cout << PROGRAM_DESCRIPTION << std::endl; - std::cout << "Summary:" << std::endl - << "Array Size: " - << static_cast(programSettings->streamArraySize * sizeof(HOST_DATA_TYPE)) << " Byte" - << std::endl - << "Data Type: " << STR(HOST_DATA_TYPE) - << std::endl - << "Repetitions: " << programSettings->numRepetitions - << std::endl - << "Kernel Replications: " << programSettings->kernelReplications - << std::endl - << "Kernel Type: " << (programSettings->useSingleKernel ? "Single" : "Separate") - << std::endl - << "Kernel File: " << programSettings->kernelFileName - << std::endl; - std::cout << "Device: " - << device.getInfo() << std::endl; - std::cout << HLINE - << "Start benchmark using the given configuration." << std::endl - << HLINE; -} diff --git a/STREAM/src/host/execution.h b/STREAM/src/host/execution.hpp similarity index 98% rename from STREAM/src/host/execution.h rename to STREAM/src/host/execution.hpp index 1b78d928..e15b506d 100644 --- a/STREAM/src/host/execution.h +++ b/STREAM/src/host/execution.hpp @@ -31,7 +31,6 @@ SOFTWARE. 
/* External library headers */ #include "CL/cl.hpp" #include "parameters.h" -#include "hpcc_benchmark.hpp" #include "stream_functionality.hpp" // Map keys for execution timings diff --git a/STREAM/src/host/execution_default.cpp b/STREAM/src/host/execution_default.cpp index 3d79c845..9c2f209f 100644 --- a/STREAM/src/host/execution_default.cpp +++ b/STREAM/src/host/execution_default.cpp @@ -21,7 +21,7 @@ SOFTWARE. */ /* Related header files */ -#include "execution.h" +#include "execution.hpp" /* C++ standard library headers */ #include @@ -156,7 +156,7 @@ namespace bm_execution { std::cout << "precision of your system timer." << std::endl; std::cout << HLINE; - for (int i=0; inumRepetitions; i++) { + for (int i=0; ikernelReplications; i++) { #ifdef USE_SVM ASSERT_CL(clEnqueueSVMUnmap(command_queues[i](), reinterpret_cast(A), 0, @@ -174,7 +174,7 @@ namespace bm_execution { // // Do actual benchmark measurements // - for (uint r = 0; r < config.programSettings->kernelReplications; r++) { + for (uint r = 0; r < config.programSettings->numRepetitions; r++) { #pragma omp parallel { #pragma omp single diff --git a/STREAM/src/host/program_settings.h b/STREAM/src/host/program_settings.h deleted file mode 100644 index 53f8f34e..00000000 --- a/STREAM/src/host/program_settings.h +++ /dev/null @@ -1,35 +0,0 @@ - -#ifndef SRC_HOST_PROGRAM_SETTINGS_H_ -#define SRC_HOST_PROGRAM_SETTINGS_H_ - -#include "parameters.h" - -/* C++ standard library headers */ -#include - -#include "CL/opencl.h" - - - -#define PROGRAM_DESCRIPTION "Implementation of the STREAM benchmark"\ - " proposed in the HPCC benchmark suite for FPGA.\n"\ - "Version: " VERSION "\n" - - -/** -* A struct that is used to store the porgram settings -* provided by command line arguments. 
-*/ -struct ProgramSettings { - uint numRepetitions; - uint streamArraySize; - uint kernelReplications; - bool useMemoryInterleaving; - int defaultPlatform; - int defaultDevice; - std::string kernelFileName; - bool useSingleKernel; -}; - - -#endif diff --git a/STREAM/src/host/stream_functionality.cpp b/STREAM/src/host/stream_functionality.cpp index fc5f5370..40636785 100644 --- a/STREAM/src/host/stream_functionality.cpp +++ b/STREAM/src/host/stream_functionality.cpp @@ -31,7 +31,7 @@ SOFTWARE. #include /* Project's headers */ -#include "execution.h" +#include "execution.hpp" #include "parameters.h" StreamProgramSettings::StreamProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), @@ -42,10 +42,11 @@ StreamProgramSettings::StreamProgramSettings(cxxopts::ParseResult &results) : hp } std::ostream& operator<<(std::ostream& os, StreamProgramSettings const& printedSettings) { - return os << "Array Size: " - << static_cast(printedSettings.streamArraySize * sizeof(HOST_DATA_TYPE)) << " Byte" + return os << static_cast(printedSettings) + << "Data Type: " << STR(HOST_DATA_TYPE) << std::endl - << "Data Type: " << STR(HOST_DATA_TYPE) + << "Array Size: " << printedSettings.streamArraySize << " (" + << static_cast(printedSettings.streamArraySize * sizeof(HOST_DATA_TYPE)) <<" Byte )" << std::endl << "Kernel Replications: " << printedSettings.kernelReplications << std::endl @@ -53,7 +54,8 @@ std::ostream& operator<<(std::ostream& os, StreamProgramSettings const& printedS << std::endl; } -StreamBenchmark::StreamBenchmark(int argc, char* argv[]) : HpccFpgaBenchmark(argc, argv) { +StreamBenchmark::StreamBenchmark(int argc, char* argv[]) { + setupBenchmark(argc, argv); } void diff --git a/STREAM/src/host/stream_functionality.hpp b/STREAM/src/host/stream_functionality.hpp index 64b0fa09..6030449d 100644 --- a/STREAM/src/host/stream_functionality.hpp +++ b/STREAM/src/host/stream_functionality.hpp @@ -43,9 +43,6 @@ class StreamProgramSettings : public 
hpcc_base::BaseSettings { }; -std::ostream& operator<<(std::ostream& os, StreamProgramSettings const& printedSettings); - - class StreamData { public: From 68e67e57d397911f2efac4e1e7917e6fe9339095 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 15 May 2020 09:32:53 +0200 Subject: [PATCH 16/45] Work on STREAM testing --- .gitignore | 1 + STREAM/src/host/CMakeLists.txt | 27 +-- STREAM/src/host/execution.hpp | 6 +- STREAM/src/host/execution_default.cpp | 12 +- STREAM/src/host/main.cpp | 2 +- ...functionality.cpp => stream_benchmark.cpp} | 33 +-- ...functionality.hpp => stream_benchmark.hpp} | 111 +++++++++- STREAM/tests/CMakeLists.txt | 11 +- STREAM/tests/main.cpp | 68 ++++++ ...nel_functionality_and_host_integration.cpp | 82 ++------ STREAM/tests/test_program_settings.h | 27 +++ shared/include/hpcc_benchmark.hpp | 198 ++++++++++++++++-- 12 files changed, 452 insertions(+), 126 deletions(-) rename STREAM/src/host/{stream_functionality.cpp => stream_benchmark.cpp} (86%) rename STREAM/src/host/{stream_functionality.hpp => stream_benchmark.hpp} (55%) create mode 100644 STREAM/tests/main.cpp create mode 100644 STREAM/tests/test_program_settings.h diff --git a/.gitignore b/.gitignore index 204e1da6..3f889e86 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ cmake-* build/* .idea .venv +docs/* \ No newline at end of file diff --git a/STREAM/src/host/CMakeLists.txt b/STREAM/src/host/CMakeLists.txt index 25094b12..239c1bb6 100755 --- a/STREAM/src/host/CMakeLists.txt +++ b/STREAM/src/host/CMakeLists.txt @@ -1,29 +1,32 @@ add_subdirectory(../../../shared ${CMAKE_BINARY_DIR}/lib/hpccbase) include_directories(${HPCCBaseLibrary_INCLUDE_DIRS}) +include_directories(${CMAKE_BINARY_DIR}/src/common) -set(HOST_SOURCE execution_default.cpp main.cpp stream_functionality.cpp) +set(HOST_SOURCE execution_default.cpp stream_benchmark.cpp) if (INTELFPGAOPENCL_FOUND) include_directories(${IntelFPGAOpenCL_INCLUDE_DIRS}) - include_directories(${CMAKE_BINARY_DIR}/src/common) - 
add_executable(STREAM_FPGA_intel ${HOST_SOURCE}) - target_link_libraries(STREAM_FPGA_intel "${IntelFPGAOpenCL_LIBRARIES}" "${OpenMP_CXX_FLAGS}") + add_library(stream_intel STATIC ${HOST_SOURCE}) + add_executable(STREAM_FPGA_intel main.cpp) + target_link_libraries(stream_intel "${IntelFPGAOpenCL_LIBRARIES}" "${OpenMP_CXX_FLAGS}") target_link_libraries(STREAM_FPGA_intel hpcc_fpga_base) + target_link_libraries(STREAM_FPGA_intel stream_intel) if (USE_SVM) - target_compile_definitions(STREAM_FPGA_intel PRIVATE -DCL_VERSION_2_0) + target_compile_definitions(stream_intel PRIVATE -DCL_VERSION_2_0) endif() - target_compile_definitions(STREAM_FPGA_intel PRIVATE -DINTEL_FPGA) - target_compile_options(STREAM_FPGA_intel PRIVATE "${OpenMP_CXX_FLAGS}") + target_compile_definitions(stream_intel PRIVATE -DINTEL_FPGA) + target_compile_options(stream_intel PRIVATE "${OpenMP_CXX_FLAGS}") add_test(NAME test_intel_host_executable COMMAND $ -h) endif() if (Vitis_FOUND) include_directories(${Vitis_INCLUDE_DIRS}) - include_directories(${CMAKE_BINARY_DIR}/src/common) - add_executable(STREAM_FPGA_xilinx ${HOST_SOURCE}) - target_link_libraries(STREAM_FPGA_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") + add_library(stream_xilinx STATIC ${HOST_SOURCE}) + add_executable(STREAM_FPGA_xilinx main.cpp) + target_link_libraries(stream_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") target_link_libraries(STREAM_FPGA_xilinx hpcc_fpga_base) - target_compile_definitions(STREAM_FPGA_xilinx PRIVATE -DXILINX_FPGA) - target_compile_options(STREAM_FPGA_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") + target_link_libraries(STREAM_FPGA_xilinx stream_ixilinx) + target_compile_definitions(stream_xilinx PRIVATE -DXILINX_FPGA) + target_compile_options(stream_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") add_test(NAME test_xilinx_host_executable COMMAND $ -h) endif() diff --git a/STREAM/src/host/execution.hpp b/STREAM/src/host/execution.hpp index e15b506d..a46fa687 100644 --- a/STREAM/src/host/execution.hpp +++ 
b/STREAM/src/host/execution.hpp @@ -31,7 +31,7 @@ SOFTWARE. /* External library headers */ #include "CL/cl.hpp" #include "parameters.h" -#include "stream_functionality.hpp" +#include "stream_benchmark.hpp" // Map keys for execution timings #define PCIE_WRITE_KEY "PCI write" @@ -62,8 +62,8 @@ simple exchange of the different calculation methods. @return The resulting matrix */ - std::shared_ptr - calculate(const hpcc_base::ExecutionSettings config, + std::shared_ptr + calculate(const hpcc_base::ExecutionSettings config, HOST_DATA_TYPE* A, HOST_DATA_TYPE* B, HOST_DATA_TYPE* C); diff --git a/STREAM/src/host/execution_default.cpp b/STREAM/src/host/execution_default.cpp index 9c2f209f..744d8029 100644 --- a/STREAM/src/host/execution_default.cpp +++ b/STREAM/src/host/execution_default.cpp @@ -39,11 +39,11 @@ SOFTWARE. namespace bm_execution { - void initialize_buffers(const hpcc_base::ExecutionSettings &config, unsigned int data_per_kernel, + void initialize_buffers(const hpcc_base::ExecutionSettings &config, unsigned int data_per_kernel, std::vector &Buffers_A, std::vector &Buffers_B, std::vector &Buffers_C); - void initialize_queues_and_kernels(const hpcc_base::ExecutionSettings &config, + void initialize_queues_and_kernels(const hpcc_base::ExecutionSettings &config, unsigned int data_per_kernel, const std::vector &Buffers_A, const std::vector &Buffers_B, const std::vector &Buffers_C, @@ -52,7 +52,7 @@ namespace bm_execution { std::vector &triad_kernels, std::vector &command_queues); - void initialize_queues_and_kernels_single(const hpcc_base::ExecutionSettings &config, + void initialize_queues_and_kernels_single(const hpcc_base::ExecutionSettings &config, unsigned int data_per_kernel, const std::vector &Buffers_A, const std::vector &Buffers_B, const std::vector &Buffers_C, @@ -68,8 +68,8 @@ namespace bm_execution { Implementation for the single kernel. 
@copydoc bm_execution::calculate() */ - std::shared_ptr - calculate(const hpcc_base::ExecutionSettings config, + std::shared_ptr + calculate(const hpcc_base::ExecutionSettings config, HOST_DATA_TYPE* A, HOST_DATA_TYPE* B, HOST_DATA_TYPE* C) { @@ -328,7 +328,7 @@ namespace bm_execution { } } - std::shared_ptr result(new StreamExecutionTimings{ + std::shared_ptr result(new stream::StreamExecutionTimings{ timingMap, config.programSettings->streamArraySize }); diff --git a/STREAM/src/host/main.cpp b/STREAM/src/host/main.cpp index 1aa2896a..fadd4f88 100644 --- a/STREAM/src/host/main.cpp +++ b/STREAM/src/host/main.cpp @@ -2,7 +2,7 @@ // Created by Marius Meyer on 04.12.19. // -#include "stream_functionality.hpp" +#include "stream_benchmark.hpp" /** The program entry point diff --git a/STREAM/src/host/stream_functionality.cpp b/STREAM/src/host/stream_benchmark.cpp similarity index 86% rename from STREAM/src/host/stream_functionality.cpp rename to STREAM/src/host/stream_benchmark.cpp index 40636785..758cde3e 100644 --- a/STREAM/src/host/stream_functionality.cpp +++ b/STREAM/src/host/stream_benchmark.cpp @@ -24,7 +24,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "stream_functionality.hpp" +#include "stream_benchmark.hpp" /* C++ standard library headers */ #include @@ -34,16 +34,22 @@ SOFTWARE. 
#include "execution.hpp" #include "parameters.h" -StreamProgramSettings::StreamProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), +stream::StreamProgramSettings::StreamProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), streamArraySize(results["s"].as()), kernelReplications(results["r"].as()), useSingleKernel(static_cast(results.count("single-kernel"))) { } -std::ostream& operator<<(std::ostream& os, StreamProgramSettings const& printedSettings) { - return os << static_cast(printedSettings) - << "Data Type: " << STR(HOST_DATA_TYPE) +/** + * @brief Print method for the stream specific program settings + * + * @param os + * @param printedSettings + * @return std::ostream& + */ +std::ostream& operator<<(std::ostream& os, stream::StreamProgramSettings const& printedSettings) { + return os << "Data Type: " << STR(HOST_DATA_TYPE) << std::endl << "Array Size: " << printedSettings.streamArraySize << " (" << static_cast(printedSettings.streamArraySize * sizeof(HOST_DATA_TYPE)) <<" Byte )" @@ -54,12 +60,14 @@ std::ostream& operator<<(std::ostream& os, StreamProgramSettings const& printedS << std::endl; } -StreamBenchmark::StreamBenchmark(int argc, char* argv[]) { +stream::StreamBenchmark::StreamBenchmark(int argc, char* argv[]) { setupBenchmark(argc, argv); } +stream::StreamBenchmark::StreamBenchmark() {} + void -StreamBenchmark::addAdditionalParseOptions(cxxopts::Options &options) { +stream::StreamBenchmark::addAdditionalParseOptions(cxxopts::Options &options) { options.add_options() ("s", "Size of the data arrays", cxxopts::value()->default_value(std::to_string(DEFAULT_ARRAY_LENGTH))) @@ -69,7 +77,7 @@ StreamBenchmark::addAdditionalParseOptions(cxxopts::Options &options) { } std::shared_ptr -StreamBenchmark::executeKernel(const hpcc_base::ExecutionSettings &settings, StreamData &data) { +stream::StreamBenchmark::executeKernel(const hpcc_base::ExecutionSettings &settings, StreamData &data) { return 
bm_execution::calculate(settings, data.A, data.B, @@ -82,7 +90,7 @@ Prints the execution results to stdout @param results The execution results */ void -StreamBenchmark::printResults(const hpcc_base::ExecutionSettings &settings, const StreamExecutionTimings &output) { +stream::StreamBenchmark::printResults(const hpcc_base::ExecutionSettings &settings, const stream::StreamExecutionTimings &output) { std::cout << std::setw(ENTRY_SPACE) << "Function"; std::cout << std::setw(ENTRY_SPACE) << "Best Rate MB/s"; @@ -107,7 +115,7 @@ StreamBenchmark::printResults(const hpcc_base::ExecutionSettings -StreamBenchmark::generateInputData(const hpcc_base::ExecutionSettings &settings) { +stream::StreamBenchmark::generateInputData(const hpcc_base::ExecutionSettings &settings) { HOST_DATA_TYPE *A, *B, *C; #ifdef INTEL_FPGA #ifdef USE_SVM @@ -137,10 +145,11 @@ StreamBenchmark::generateInputData(const hpcc_base::ExecutionSettings(new StreamData{A, B, C}); + return std::make_shared(new stream::StreamData{A, B, C}); } -bool StreamBenchmark::validateOutputAndPrintError(const hpcc_base::ExecutionSettings &settings ,StreamData &data, const StreamExecutionTimings &output) { +bool +stream::StreamBenchmark::validateOutputAndPrintError(const hpcc_base::ExecutionSettings &settings ,stream::StreamData &data, const stream::StreamExecutionTimings &output) { HOST_DATA_TYPE aj,bj,cj,scalar; HOST_DATA_TYPE aSumErr,bSumErr,cSumErr; HOST_DATA_TYPE aAvgErr,bAvgErr,cAvgErr; diff --git a/STREAM/src/host/stream_functionality.hpp b/STREAM/src/host/stream_benchmark.hpp similarity index 55% rename from STREAM/src/host/stream_functionality.hpp rename to STREAM/src/host/stream_benchmark.hpp index 6030449d..b52b3b49 100644 --- a/STREAM/src/host/stream_functionality.hpp +++ b/STREAM/src/host/stream_benchmark.hpp @@ -31,18 +31,50 @@ SOFTWARE. 
#include "hpcc_benchmark.hpp" #include "parameters.h" - +/** + * @brief Contains all classes and methods needed by the STREAM benchmark + * + */ +namespace stream { + +/** + * @brief The STREAM specific program settings + * + */ class StreamProgramSettings : public hpcc_base::BaseSettings { public: + /** + * @brief The size of each stream array in number of values + * + */ uint streamArraySize; + + /** + * @brief The number of used kernel replications + * + */ uint kernelReplications; + + /** + * @brief Indicator if the single kernel or the legacy kernel are used for execution + * + */ bool useSingleKernel; + /** + * @brief Construct a new Stream Program Settings object + * + * @param results the result map from parsing the program input parameters + */ StreamProgramSettings(cxxopts::ParseResult &results); }; +/** + * @brief Data class cotnaining the data the kernel is exeucted with + * + */ class StreamData { public: @@ -52,36 +84,99 @@ class StreamData { }; +/** + * @brief Measured execution timing from the kernel execution + * + */ class StreamExecutionTimings { public: - std::map> timings; - uint arraySize; + /** + * @brief A map containing the timings for all stream operation types + * + */ + std::map> timings; + + /** + * @brief The used array size + * + */ + uint arraySize; }; +/** + * @brief Implementation of the Sream benchmark + * + */ class StreamBenchmark : public hpcc_base::HpccFpgaBenchmark { protected: + /** + * @brief Additional input parameters of the strema benchmark + * + * @param options + */ + void + addAdditionalParseOptions(cxxopts::Options &options) override; + +public: + + /** + * @brief Stream specific implementation of the data generation + * + * @param settings + * @return std::shared_ptr + */ std::shared_ptr generateInputData(const hpcc_base::ExecutionSettings &settings) override; + /** + * @brief Stream specific implementation of the kernel execution + * + * @param settings + * @param data + * @return std::shared_ptr + */ 
std::shared_ptr executeKernel(const hpcc_base::ExecutionSettings &settings, StreamData &data) override; + /** + * @brief Stream specific implementation of the execution validation + * + * @param settings + * @param data + * @param output + * @return true + * @return false + */ bool validateOutputAndPrintError(const hpcc_base::ExecutionSettings &settings ,StreamData &data, const StreamExecutionTimings &output) override; + /** + * @brief Stream specific implementation of printing the execution results + * + * @param settings + * @param output + */ void printResults(const hpcc_base::ExecutionSettings &settings, const StreamExecutionTimings &output) override; - void - addAdditionalParseOptions(cxxopts::Options &options) override; - -public: - + /** + * @brief Construct a new Stream Benchmark object + * + * @param argc the number of program input parameters + * @param argv the program input parameters as array of strings + */ StreamBenchmark(int argc, char* argv[]); + /** + * @brief Construct a new Stream Benchmark object + */ + StreamBenchmark(); + }; +} // namespace stream + #endif // SRC_HOST_STREAM_BENCHMARK_H_ diff --git a/STREAM/tests/CMakeLists.txt b/STREAM/tests/CMakeLists.txt index 81dba6a3..db1bffa6 100755 --- a/STREAM/tests/CMakeLists.txt +++ b/STREAM/tests/CMakeLists.txt @@ -2,16 +2,15 @@ # 'lib' is the folder with Google Test sources add_subdirectory(../../extern/googletest ${CMAKE_CURRENT_BINARY_DIR}/lib) include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) -include_directories(${CMAKE_BINARY_DIR}/src/common ../../extern/cxxopts/include ../../shared) -include_directories(${CMAKE_SOURCE_DIR}/src/host/) +include_directories(${CMAKE_BINARY_DIR}/src/common) +add_subdirectory(${CMAKE_SOURCE_DIR}/src/host/) -set(PROJECT_SOURCES ../src/host/execution_default.cpp ../src/host/stream_functionality.cpp) -set(TEST_SOURCES ../../shared/setup/test_fpga_setup.cpp test_kernel_functionality_and_host_integration.cpp ../../shared/testing/main.cpp) 
+set(TEST_SOURCES ../../shared/setup/test_fpga_setup.cpp test_kernel_functionality_and_host_integration.cpp main.cpp) if (INTELFPGAOPENCL_FOUND) include_directories(SYSTEM ${IntelFPGAOpenCL_INCLUDE_DIRS}) add_executable(Test_intel ${TEST_SOURCES} ${PROJECT_SOURCES}) - target_link_libraries(Test_intel gtest gmock ${IntelFPGAOpenCL_LIBRARIES} "${OpenMP_CXX_FLAGS}") + target_link_libraries(Test_intel gtest gmock stream_intel ${IntelFPGAOpenCL_LIBRARIES} "${OpenMP_CXX_FLAGS}") add_dependencies(Test_intel stream_kernels_emulate_intel stream_kernels_single_emulate_intel) target_compile_definitions(Test_intel PRIVATE -DINTEL_FPGA) target_compile_options(Test_intel PRIVATE "${OpenMP_CXX_FLAGS}") @@ -22,7 +21,7 @@ endif() if (Vitis_FOUND) include_directories(SYSTEM ${Vitis_INCLUDE_DIRS}) add_executable(Test_xilinx ${TEST_SOURCES} ${PROJECT_SOURCES}) - target_link_libraries(Test_xilinx gtest gmock ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") + target_link_libraries(Test_xilinx gtest gmock stream_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") add_dependencies(Test_xilinx stream_kernels_single_emulate_xilinx) target_compile_definitions(Test_xilinx PRIVATE -DXILINX_FPGA) target_compile_options(Test_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") diff --git a/STREAM/tests/main.cpp b/STREAM/tests/main.cpp new file mode 100644 index 00000000..d364302f --- /dev/null +++ b/STREAM/tests/main.cpp @@ -0,0 +1,68 @@ +/* +Copyright (c) 2020 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions 
of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +/* Project's headers */ +#include "stream_benchmark.hpp" + +#include "gtest/gtest.h" +#include "CL/cl.hpp" + +#ifdef _USE_MPI_ +#include "mpi.h" + +class MPIEnvironment : public ::testing::Environment { +public: + MPIEnvironment(int* argc, char** argv[]) { + MPI_Init(argc, argv); + } + + ~MPIEnvironment() override { + MPI_Finalize(); + } +}; +#endif + +std::shared_ptr bm; + +/** +The program entry point for the unit tests +*/ +int +main(int argc, char *argv[]) { + + std::cout << "THIS BINARY EXECUTES UNIT TESTS FOR THE FOLLOWING BENCHMARK:" << std::endl << std::endl; + + ::testing::InitGoogleTest(&argc, argv); + +#ifdef _USE_MPI_ + ::testing::Environment* const mpi_env = + ::testing::AddGlobalTestEnvironment(new MPIEnvironment(&argc, &argv)); +#endif + + bm = std::shared_ptr(StreamBenchmark()); + + bm->setupBenchmark(argc, argv); + + return RUN_ALL_TESTS(); + +} + diff --git a/STREAM/tests/test_kernel_functionality_and_host_integration.cpp b/STREAM/tests/test_kernel_functionality_and_host_integration.cpp index 74d52b1b..85c1c69b 100644 --- a/STREAM/tests/test_kernel_functionality_and_host_integration.cpp +++ b/STREAM/tests/test_kernel_functionality_and_host_integration.cpp @@ -3,89 +3,43 @@ // #include "gtest/gtest.h" #include "parameters.h" -#include "../src/host/execution.h" -#include "setup/fpga_setup.hpp" -#include "testing/test_program_settings.h" -#include "../src/host/stream_functionality.hpp" +#include "test_program_settings.h" +#include "stream_benchmark.hpp" -struct 
OpenCLKernelTest :public ::testing::Test { - HOST_DATA_TYPE *A; - HOST_DATA_TYPE *B; - HOST_DATA_TYPE *C; - std::shared_ptr config; - cl_uint array_size; - - OpenCLKernelTest() { - array_size = VECTOR_COUNT * UNROLL_COUNT * NUM_KERNEL_REPLICATIONS * BUFFER_SIZE; - posix_memalign(reinterpret_cast(&A), 64, - sizeof(HOST_DATA_TYPE) * array_size); - posix_memalign(reinterpret_cast(&B), 64, - sizeof(HOST_DATA_TYPE) * array_size); - posix_memalign(reinterpret_cast(&C), 64, - sizeof(HOST_DATA_TYPE) * array_size); - } +struct StreamKernelTest :public ::testing::Test { + std::shared_ptr data; void SetUp( ) { - std::cout << programSettings << std::endl; - setupFPGA(programSettings); + bm->getExecutionSettings->streamArraySize = VECTOR_COUNT * UNROLL_COUNT * NUM_KERNEL_REPLICATIONS * BUFFER_SIZE; + data = bm->generateInputData(); } - void setupFPGA(std::shared_ptr settings) { - // Redirect stout buffer to local buffer to make checks possible - std::stringstream newStdOutBuffer; - std::streambuf *oldStdOutBuffer = std::cout.rdbuf(); - std::cout.rdbuf(newStdOutBuffer.rdbuf()); - - std::vector device = fpga_setup::selectFPGADevice(settings->defaultPlatform, settings->defaultDevice); - cl::Context context(device[0]); - cl::Program program = fpga_setup::fpgaSetup(&context, device, &settings->kernelFileName); - config = std::make_shared( - bm_execution::ExecutionConfiguration{ - context, device[0], program, - 1, - NUM_KERNEL_REPLICATIONS, - array_size, - false, - settings->useSingleKernel - }); - HOST_DATA_TYPE norm; - generateInputData(A, B, C, array_size); - - // Redirect stdout to old buffer - std::cout.rdbuf(oldStdOutBuffer); - } - - ~OpenCLKernelTest() override { - free(A); - free(B); - free(C); - } }; /** * Execution returns correct results for a single repetition */ -TEST_F(OpenCLKernelTest, FPGACorrectResultsOneRepetition) { - - auto result = bm_execution::calculate(config, A, B, C); +TEST_F(StreamKernelTest, FPGACorrectResultsOneRepetition) { + 
bm->getExecutionSettings().programSettings->numRepetitions = 1; + auto result = bm->executeKernel(bm->getExecutionSettings(), *data); for (int i = 0; i < array_size; i++) { - EXPECT_FLOAT_EQ(A[i], 30.0); - EXPECT_FLOAT_EQ(B[i], 6.0); - EXPECT_FLOAT_EQ(C[i], 8.0); + EXPECT_FLOAT_EQ(data->A[i], 30.0); + EXPECT_FLOAT_EQ(data->B[i], 6.0); + EXPECT_FLOAT_EQ(data->C[i], 8.0); } } /** * Execution returns correct results for three repetitions */ -TEST_F(OpenCLKernelTest, FPGACorrectResultsThreeRepetition) { - config->repetitions = 3; - auto result = bm_execution::calculate(config, A, B, C); +TEST_F(StreamKernelTest, FPGACorrectResultsThreeRepetition) { + bm->getExecutionSettings().programSettings->numRepetitions = 3; + auto result = bm->executeKernel(bm->getExecutionSettings(), *data); for (int i = 0; i < array_size; i++) { - EXPECT_FLOAT_EQ(A[i], 6750.0); - EXPECT_FLOAT_EQ(B[i], 1350.0); - EXPECT_FLOAT_EQ(C[i], 1800.0); + EXPECT_FLOAT_EQ(data->A[i], 6750.0); + EXPECT_FLOAT_EQ(data->B[i], 1350.0); + EXPECT_FLOAT_EQ(data->C[i], 1800.0); } } diff --git a/STREAM/tests/test_program_settings.h b/STREAM/tests/test_program_settings.h new file mode 100644 index 00000000..c2e0476b --- /dev/null +++ b/STREAM/tests/test_program_settings.h @@ -0,0 +1,27 @@ +/* +Copyright (c) 2020 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +/* Project's headers */ +#include "stream_benchmark.hpp" + + +extern std::shared_ptr bm; diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 4c904524..256d4620 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -35,19 +35,63 @@ SOFTWARE. #define ENTRY_SPACE 15 - +/** + * @brief Contains all classes and functions that are used as basis + * for all benchmarks. + * + */ namespace hpcc_base { +/** + * @brief This class should be derived and extended for every benchmark. + * It is a pure data object containing the benchmark settings that are + * used to execute the benchmark kernel. + * + * DERIVED CLASSES NEED TO IMPLEMENT A operator<< METHOD! + * + */ class BaseSettings { public: + /** + * @brief Number of times the kernel execution will be repeated + * + */ uint numRepetitions; + + /** + * @brief Boolean showing if memory interleaving is used that is + * triggered from the host side (Intel specific) + * + */ bool useMemoryInterleaving; + + /** + * @brief The default platform that should be used for execution. + * A number representing the index in the list of available platforms + * + */ int defaultPlatform; + + /** + * @brief The default device that should be used for execution. 
+ * A number representing the index in the list of available devices + * + */ int defaultDevice; + + /** + * @brief Path to the kernel file that is used for execution + * + */ std::string kernelFileName; + /** + * @brief Construct a new Base Settings object + * + * @param results The resulting map from parsing the program input parameters + */ BaseSettings(cxxopts::ParseResult &results) : numRepetitions(results["n"].as()), useMemoryInterleaving(static_cast(results.count("i"))), defaultPlatform(results["platform"].as()), @@ -56,48 +100,137 @@ class BaseSettings { }; + +/** + * @brief Settings class that is containing the program settings together with + * additional information about the OpenCL runtime + * + * @tparam TSettings The program settings class that should be used (Must derive from BaseSettings) + */ template class ExecutionSettings { public: + + /** + * @brief The OpenCL device that should be used for execution + * + */ cl::Device device; + + /** + * @brief The OpenCL context that should be used for execution + * + */ cl::Context context; + + /** + * @brief The OpenCL program that contains the benchmark kernel + * + */ cl::Program program; + + /** + * @brief Pointer to the additional program settings + * + */ std::shared_ptr programSettings; + /** + * @brief Construct a new Execution Settings object + * + * @param programSettings_ Pointer to an existing program settings object that is derived from BaseSettings + * @param device_ Used OpenCL device + * @param context_ Used OpenCL context + * @param program_ Used OpenCL program + */ ExecutionSettings(const std::shared_ptr programSettings_, cl::Device device_,cl::Context context_,cl::Program program_): programSettings(programSettings_), device(device_), context(context_), program(program_) {} + /** + * @brief Construct a new Execution Settings object from an Execution Settings object + * + * @param s The object to copy + */ ExecutionSettings(ExecutionSettings *s) : ExecutionSettings(s->programSettings, 
s->device, s->context, s->program) {} }; -template -std::ostream& operator<<(std::ostream& os, ExecutionSettings const& printedExecutionSettings); - +/** + * @brief Base benchmark class. Every benchmark should be derived from this class and implement its abstract methods. + * + * @tparam TSettings Class used to represent the program settings of the benchmark + * @tparam TData Class used to represent the benchmark input and output data + * @tparam TOutput Class representing the measurements like timings etc + */ template class HpccFpgaBenchmark { private: + /** + * @brief Is set by setupBenchmark() to make sure the benchmark is not + * executed before the setup is run + * + */ bool isSetupExecuted = false; + + /** + * @brief The used execution settings that will be generated by setupBenchmark() + * + */ std::shared_ptr> executionSettings; protected: + /** + * @brief Add additional options to the program parameter parser + * + * @param options The options object that will be used to parse the input parameters + */ + virtual void + addAdditionalParseOptions(cxxopts::Options &options) {} + +public: + + /** + * @brief Allocate and initiate the input data for the kernel + * + * @param settings The used execution settings + * @return std::shared_ptr A data class containing the initialized data + */ virtual std::shared_ptr generateInputData(const ExecutionSettings &settings) = 0; + /** + * @brief Execute the benchmark kernel and measure performance + * + * @param settings The used execution settings + * @param data The initialized data for the kernel. 
It will be replaced by the kernel output for validation + * @return std::shared_ptr A data class containing the measurement results of the execution + */ virtual std::shared_ptr executeKernel(const ExecutionSettings &settings, TData &data) = 0; + /** + * @brief Validate the output of the execution + * + * @param settings The used execution settings + * @param data The output data after kernel execution + * @param output The measurement data of the kernel execution + * @return true If the validation is a success. + * @return false If the validation failed + */ virtual bool validateOutputAndPrintError(const ExecutionSettings &settings ,TData &data, const TOutput &output) = 0; + /** + * @brief Prints the measurement results of the benchmark to std::cout + * + * @param settings The used execution settings + * @param output The measurement data of the kernel execution + */ virtual void printResults(const ExecutionSettings &settings, const TOutput &output) = 0; - virtual void - addAdditionalParseOptions(cxxopts::Options &options) = 0; - /** * Parses and returns program options using the cxxopts library. * The parsed parameters are depending on the benchmark that is implementing @@ -167,8 +300,6 @@ class HpccFpgaBenchmark { std::cout << executionSettings << std::endl; } -public: - /** * @brief Selects and prepares the target device and prints the final configuration * before executing the benchmark @@ -178,7 +309,6 @@ class HpccFpgaBenchmark { */ void setupBenchmark(int argc, char *argv[]) { - fpga_setup::setupEnvironmentAndClocks(); std::shared_ptr programSettings = parseProgramParameters(argc, argv); cl::Device usedDevice = cl::Device(fpga_setup::selectFPGADevice(programSettings->defaultPlatform, @@ -193,6 +323,13 @@ class HpccFpgaBenchmark { isSetupExecuted = true; } + /** + * @brief Execute the benchmark. 
This includes the initialization of the + * input data, exectuon of the kernel, validation and printing the result + * + * @return true If the validation is a success + * @return false If the validation fails + */ bool executeBenchmark() { if (!isSetupExecuted) { @@ -216,18 +353,51 @@ class HpccFpgaBenchmark { return validateSuccess; } + /** + * @brief Get the Execution Settings object for further modifications (Mainly for testing purposes) + * + * @return ExecutionSettings& The execution settings object + */ + ExecutionSettings& getExecutionSettings() { + return *executionSettings; + } + + HpccFpgaBenchmark() { + fpga_setup::setupEnvironmentAndClocks(); + } + }; +/** + * @brief Prints the base settings to an output stream. + * + * @param os The output stream + * @param printedBaseSettings The base settings that should be printed + * @return std::ostream& The output stream after the base settings are piped in + */ std::ostream& operator<<(std::ostream& os, BaseSettings const& printedBaseSettings); +/** + * @brief Prints the execution settings to an output stream + * + * @tparam TSettings The program settings class used to create the execution settings + * @param os The output stream + * @param printedExecutionSettings The execution settings that have to be printed to the stream + * @return std::ostream& The output stream after the execution settings are piped in + */ template std::ostream& operator<<(std::ostream& os, ExecutionSettings const& printedExecutionSettings){ std::string device_name; printedExecutionSettings.device.getInfo(CL_DEVICE_NAME, &device_name); - return os << *printedExecutionSettings.programSettings - << "Device: " - << device_name - << std::endl; + std::string platform_name; + printedExecutionSettings.device.getInfo(CL_PLATFORM_NAME, &platform_name); + os << static_cast(*printedExecutionSettings.programSettings); + if (typeid(printedExecutionSettings.programSettings) != typeid(BaseSettings)) { + os << *printedExecutionSettings.programSettings; 
+ } + os << "Platform: " << platform_name << std::endl; + os << "Device: " << device_name << std::endl; + return os; } } // namespace hpcc_base From a55827f08af0bb5392219fd9d859235d53b2f8f3 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 15 May 2020 11:33:10 +0200 Subject: [PATCH 17/45] Fix unit tests for STREAM --- STREAM/src/host/CMakeLists.txt | 17 ++-- STREAM/src/host/execution_default.cpp | 6 +- STREAM/src/host/main.cpp | 2 + STREAM/src/host/stream_benchmark.cpp | 16 +++- STREAM/src/host/stream_benchmark.hpp | 2 + STREAM/tests/CMakeLists.txt | 5 +- STREAM/tests/main.cpp | 4 +- ...nel_functionality_and_host_integration.cpp | 10 +-- STREAM/tests/test_program_settings.h | 2 +- shared/CMakeLists.txt | 2 +- shared/include/hpcc_benchmark.hpp | 25 ++---- shared/setup/fpga_setup.cpp | 6 +- shared/setup/test_fpga_setup.cpp | 17 ++-- shared/testing/main.cpp | 78 ------------------- shared/testing/test_program_settings.h | 27 ------- 15 files changed, 57 insertions(+), 162 deletions(-) delete mode 100644 shared/testing/main.cpp delete mode 100644 shared/testing/test_program_settings.h diff --git a/STREAM/src/host/CMakeLists.txt b/STREAM/src/host/CMakeLists.txt index 239c1bb6..99ade190 100755 --- a/STREAM/src/host/CMakeLists.txt +++ b/STREAM/src/host/CMakeLists.txt @@ -1,15 +1,13 @@ -add_subdirectory(../../../shared ${CMAKE_BINARY_DIR}/lib/hpccbase) -include_directories(${HPCCBaseLibrary_INCLUDE_DIRS}) -include_directories(${CMAKE_BINARY_DIR}/src/common) - +add_subdirectory(../../../shared ${CMAKE_CURRENT_BINARY_DIR}/lib/hpccbase) set(HOST_SOURCE execution_default.cpp stream_benchmark.cpp) if (INTELFPGAOPENCL_FOUND) - include_directories(${IntelFPGAOpenCL_INCLUDE_DIRS}) add_library(stream_intel STATIC ${HOST_SOURCE}) + target_include_directories(stream_intel PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${IntelFPGAOpenCL_INCLUDE_DIRS}) + target_include_directories(stream_intel PUBLIC ${CMAKE_SOURCE_DIR}/src/host) 
add_executable(STREAM_FPGA_intel main.cpp) target_link_libraries(stream_intel "${IntelFPGAOpenCL_LIBRARIES}" "${OpenMP_CXX_FLAGS}") - target_link_libraries(STREAM_FPGA_intel hpcc_fpga_base) + target_link_libraries(stream_intel hpcc_fpga_base) target_link_libraries(STREAM_FPGA_intel stream_intel) if (USE_SVM) target_compile_definitions(stream_intel PRIVATE -DCL_VERSION_2_0) @@ -20,12 +18,13 @@ if (INTELFPGAOPENCL_FOUND) endif() if (Vitis_FOUND) - include_directories(${Vitis_INCLUDE_DIRS}) add_library(stream_xilinx STATIC ${HOST_SOURCE}) + target_include_directories(stream_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS}) + target_include_directories(stream_xilinx PUBLIC ${CMAKE_SOURCE_DIR}/src/host) add_executable(STREAM_FPGA_xilinx main.cpp) target_link_libraries(stream_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") - target_link_libraries(STREAM_FPGA_xilinx hpcc_fpga_base) - target_link_libraries(STREAM_FPGA_xilinx stream_ixilinx) + target_link_libraries(stream_xilinx hpcc_fpga_base) + target_link_libraries(STREAM_FPGA_xilinx stream_xilinx) target_compile_definitions(stream_xilinx PRIVATE -DXILINX_FPGA) target_compile_options(stream_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") add_test(NAME test_xilinx_host_executable COMMAND $ -h) diff --git a/STREAM/src/host/execution_default.cpp b/STREAM/src/host/execution_default.cpp index 744d8029..8228b56e 100644 --- a/STREAM/src/host/execution_default.cpp +++ b/STREAM/src/host/execution_default.cpp @@ -335,7 +335,7 @@ namespace bm_execution { return result; } - void initialize_queues_and_kernels(const hpcc_base::ExecutionSettings &config, + void initialize_queues_and_kernels(const hpcc_base::ExecutionSettings &config, unsigned int data_per_kernel, const std::vector &Buffers_A, const std::vector &Buffers_B, const std::vector &Buffers_C, @@ -414,7 +414,7 @@ namespace bm_execution { } } - void initialize_queues_and_kernels_single(const hpcc_base::ExecutionSettings &config, + void 
initialize_queues_and_kernels_single(const hpcc_base::ExecutionSettings &config, unsigned int data_per_kernel, const std::vector &Buffers_A, const std::vector &Buffers_B, const std::vector &Buffers_C, @@ -591,7 +591,7 @@ namespace bm_execution { } } - void initialize_buffers(const hpcc_base::ExecutionSettings &config, unsigned int data_per_kernel, + void initialize_buffers(const hpcc_base::ExecutionSettings &config, unsigned int data_per_kernel, std::vector &Buffers_A, std::vector &Buffers_B, std::vector &Buffers_C) { if (!config.programSettings->useMemoryInterleaving) { diff --git a/STREAM/src/host/main.cpp b/STREAM/src/host/main.cpp index fadd4f88..0a114582 100644 --- a/STREAM/src/host/main.cpp +++ b/STREAM/src/host/main.cpp @@ -4,6 +4,8 @@ #include "stream_benchmark.hpp" +using namespace stream; + /** The program entry point */ diff --git a/STREAM/src/host/stream_benchmark.cpp b/STREAM/src/host/stream_benchmark.cpp index 758cde3e..007a97ba 100644 --- a/STREAM/src/host/stream_benchmark.cpp +++ b/STREAM/src/host/stream_benchmark.cpp @@ -41,6 +41,18 @@ stream::StreamProgramSettings::StreamProgramSettings(cxxopts::ParseResult &resul } +std::map +stream::StreamProgramSettings::getSettingsMap() { + auto map = hpcc_base::BaseSettings::getSettingsMap(); + map["Data Type"] = STR(HOST_DATA_TYPE); + std::stringstream ss; + ss << streamArraySize << " (" << static_cast(streamArraySize * sizeof(HOST_DATA_TYPE)) << " Byte )"; + map["Array Size"] = ss.str(); + map["Kernel Replications"] = std::to_string(kernelReplications); + map["Kernel Type"] = (useSingleKernel ? 
"Single" : "Separate"); + return map; +} + /** * @brief Print method for the stream specific program settings * @@ -76,7 +88,7 @@ stream::StreamBenchmark::addAdditionalParseOptions(cxxopts::Options &options) { ("single-kernel", "Use the single kernel implementation"); } -std::shared_ptr +std::shared_ptr stream::StreamBenchmark::executeKernel(const hpcc_base::ExecutionSettings &settings, StreamData &data) { return bm_execution::calculate(settings, data.A, @@ -114,7 +126,7 @@ stream::StreamBenchmark::printResults(const hpcc_base::ExecutionSettings +std::shared_ptr stream::StreamBenchmark::generateInputData(const hpcc_base::ExecutionSettings &settings) { HOST_DATA_TYPE *A, *B, *C; #ifdef INTEL_FPGA diff --git a/STREAM/src/host/stream_benchmark.hpp b/STREAM/src/host/stream_benchmark.hpp index b52b3b49..a941289e 100644 --- a/STREAM/src/host/stream_benchmark.hpp +++ b/STREAM/src/host/stream_benchmark.hpp @@ -69,6 +69,8 @@ class StreamProgramSettings : public hpcc_base::BaseSettings { */ StreamProgramSettings(cxxopts::ParseResult &results); + std::map getSettingsMap() override; + }; /** diff --git a/STREAM/tests/CMakeLists.txt b/STREAM/tests/CMakeLists.txt index db1bffa6..5aa48294 100755 --- a/STREAM/tests/CMakeLists.txt +++ b/STREAM/tests/CMakeLists.txt @@ -1,9 +1,8 @@ # 'lib' is the folder with Google Test sources -add_subdirectory(../../extern/googletest ${CMAKE_CURRENT_BINARY_DIR}/lib) +add_subdirectory(../../extern/googletest ${CMAKE_CURRENT_BINARY_DIR}/lib/googletest) include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) -include_directories(${CMAKE_BINARY_DIR}/src/common) -add_subdirectory(${CMAKE_SOURCE_DIR}/src/host/) +include_directories(${CMAKE_BINARY_DIR}/src/common .) 
set(TEST_SOURCES ../../shared/setup/test_fpga_setup.cpp test_kernel_functionality_and_host_integration.cpp main.cpp) diff --git a/STREAM/tests/main.cpp b/STREAM/tests/main.cpp index d364302f..d43b0a40 100644 --- a/STREAM/tests/main.cpp +++ b/STREAM/tests/main.cpp @@ -41,6 +41,8 @@ class MPIEnvironment : public ::testing::Environment { }; #endif +using namespace stream; + std::shared_ptr bm; /** @@ -58,7 +60,7 @@ main(int argc, char *argv[]) { ::testing::AddGlobalTestEnvironment(new MPIEnvironment(&argc, &argv)); #endif - bm = std::shared_ptr(StreamBenchmark()); + bm = std::shared_ptr(new StreamBenchmark()); bm->setupBenchmark(argc, argv); diff --git a/STREAM/tests/test_kernel_functionality_and_host_integration.cpp b/STREAM/tests/test_kernel_functionality_and_host_integration.cpp index 85c1c69b..92f967df 100644 --- a/STREAM/tests/test_kernel_functionality_and_host_integration.cpp +++ b/STREAM/tests/test_kernel_functionality_and_host_integration.cpp @@ -8,11 +8,11 @@ struct StreamKernelTest :public ::testing::Test { - std::shared_ptr data; + std::shared_ptr data; void SetUp( ) { - bm->getExecutionSettings->streamArraySize = VECTOR_COUNT * UNROLL_COUNT * NUM_KERNEL_REPLICATIONS * BUFFER_SIZE; - data = bm->generateInputData(); + bm->getExecutionSettings().programSettings->streamArraySize = VECTOR_COUNT * UNROLL_COUNT * NUM_KERNEL_REPLICATIONS * BUFFER_SIZE; + data = bm->generateInputData(bm->getExecutionSettings()); } }; @@ -24,7 +24,7 @@ struct StreamKernelTest :public ::testing::Test { TEST_F(StreamKernelTest, FPGACorrectResultsOneRepetition) { bm->getExecutionSettings().programSettings->numRepetitions = 1; auto result = bm->executeKernel(bm->getExecutionSettings(), *data); - for (int i = 0; i < array_size; i++) { + for (int i = 0; i < bm->getExecutionSettings().programSettings->streamArraySize; i++) { EXPECT_FLOAT_EQ(data->A[i], 30.0); EXPECT_FLOAT_EQ(data->B[i], 6.0); EXPECT_FLOAT_EQ(data->C[i], 8.0); @@ -37,7 +37,7 @@ TEST_F(StreamKernelTest, 
FPGACorrectResultsOneRepetition) { TEST_F(StreamKernelTest, FPGACorrectResultsThreeRepetition) { bm->getExecutionSettings().programSettings->numRepetitions = 3; auto result = bm->executeKernel(bm->getExecutionSettings(), *data); - for (int i = 0; i < array_size; i++) { + for (int i = 0; i < bm->getExecutionSettings().programSettings->streamArraySize; i++) { EXPECT_FLOAT_EQ(data->A[i], 6750.0); EXPECT_FLOAT_EQ(data->B[i], 1350.0); EXPECT_FLOAT_EQ(data->C[i], 1800.0); diff --git a/STREAM/tests/test_program_settings.h b/STREAM/tests/test_program_settings.h index c2e0476b..b8d13110 100644 --- a/STREAM/tests/test_program_settings.h +++ b/STREAM/tests/test_program_settings.h @@ -24,4 +24,4 @@ SOFTWARE. #include "stream_benchmark.hpp" -extern std::shared_ptr bm; +extern std::shared_ptr bm; diff --git a/shared/CMakeLists.txt b/shared/CMakeLists.txt index 0466dcea..2fbe052e 100644 --- a/shared/CMakeLists.txt +++ b/shared/CMakeLists.txt @@ -1,6 +1,6 @@ project(HPCCBaseLibrary VERSION 1.0.0) -add_library(hpcc_fpga_base STATIC ${CMAKE_CURRENT_SOURCE_DIR}/hpcc_benchmark.cpp ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp) +add_library(hpcc_fpga_base STATIC ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp) target_include_directories(hpcc_fpga_base PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/../extern/cxxopts/include) diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 256d4620..c15819ed 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -98,6 +98,10 @@ class BaseSettings { defaultDevice(results["device"].as()), kernelFileName(results["f"].as()) {} + virtual std::map getSettingsMap() { + return {{"Repetitions", std::to_string(numRepetitions)}, {"Kernel File", kernelFileName}}; + } + }; @@ -358,7 +362,7 @@ class HpccFpgaBenchmark { * * @return ExecutionSettings& The execution settings object */ - ExecutionSettings& getExecutionSettings() { + ExecutionSettings& 
getExecutionSettings() { return *executionSettings; } @@ -368,15 +372,6 @@ class HpccFpgaBenchmark { }; -/** - * @brief Prints the base settings to an output stream. - * - * @param os The output stream - * @param printedBaseSettings The base settings that should be printed - * @return std::ostream& The output stream after the base settings are piped in - */ -std::ostream& operator<<(std::ostream& os, BaseSettings const& printedBaseSettings); - /** * @brief Prints the execution settings to an output stream * @@ -389,14 +384,10 @@ template std::ostream& operator<<(std::ostream& os, ExecutionSettings const& printedExecutionSettings){ std::string device_name; printedExecutionSettings.device.getInfo(CL_DEVICE_NAME, &device_name); - std::string platform_name; - printedExecutionSettings.device.getInfo(CL_PLATFORM_NAME, &platform_name); - os << static_cast(*printedExecutionSettings.programSettings); - if (typeid(printedExecutionSettings.programSettings) != typeid(BaseSettings)) { - os << *printedExecutionSettings.programSettings; + for (auto k : printedExecutionSettings.programSettings->getSettingsMap()) { + os << std::setw(2 * ENTRY_SPACE) << std::left<< k.first << k.second << std::endl; } - os << "Platform: " << platform_name << std::endl; - os << "Device: " << device_name << std::endl; + os << std::setw(2 * ENTRY_SPACE) << std::left << "Device" << device_name << std::endl; return os; } diff --git a/shared/setup/fpga_setup.cpp b/shared/setup/fpga_setup.cpp index 6d95789f..d73bcf04 100644 --- a/shared/setup/fpga_setup.cpp +++ b/shared/setup/fpga_setup.cpp @@ -261,15 +261,15 @@ choose a device. if (defaultDevice < deviceList.size()) { chosenDeviceId = defaultDevice; } else { - std::cerr << "Default platform " << defaultDevice - << " can not be used. Available platforms: " + std::cerr << "Default device " << defaultDevice + << " can not be used. 
Available devices: " << deviceList.size() << std::endl; exit(1); } } else if (deviceList.size() > 1) { if (world_size == 1) { std::cout << - "Multiple devices have been found. Select the platform by"\ + "Multiple devices have been found. Select the device by"\ " typing a number:" << std::endl; for (int deviceId = 0; diff --git a/shared/setup/test_fpga_setup.cpp b/shared/setup/test_fpga_setup.cpp index c1842351..297b49e3 100644 --- a/shared/setup/test_fpga_setup.cpp +++ b/shared/setup/test_fpga_setup.cpp @@ -6,25 +6,18 @@ #include "gtest/gtest.h" #include "setup/fpga_setup.hpp" #include "parameters.h" -#include "testing/test_program_settings.h" +#include "test_program_settings.h" #include "gmock/gmock.h" -/** - * Check if it is possible to find the platform and device that are given as default - */ -TEST (FPGASetup, FindValidPlatformAndDevice) { - EXPECT_EQ (1, fpga_setup::selectFPGADevice(programSettings->defaultPlatform, programSettings->defaultDevice).size()); -} - /** * Checks if non existing platform leads to an error */ TEST (FPGASetup, FindNonExistingPlatform) { testing::FLAGS_gtest_death_test_style="threadsafe"; std::stringstream fmt; - fmt << "Default platform " << programSettings->defaultPlatform + 100 << " can not be used. Available platforms: " ; - EXPECT_EXIT(fpga_setup::selectFPGADevice(programSettings->defaultPlatform + 100, programSettings->defaultDevice), + fmt << "Default platform " << bm->getExecutionSettings().programSettings->defaultPlatform + 100 << " can not be used. 
Available platforms: " ; + EXPECT_EXIT(fpga_setup::selectFPGADevice(bm->getExecutionSettings().programSettings->defaultPlatform + 100, bm->getExecutionSettings().programSettings->defaultDevice), ::testing::ExitedWithCode(1), ::testing::StartsWith(fmt.str())); } @@ -35,8 +28,8 @@ TEST (FPGASetup, FindNonExistingPlatform) { TEST (FPGASetup, FindNonExistingDevice) { testing::FLAGS_gtest_death_test_style="threadsafe"; std::stringstream fmt; - fmt << "Default device " << programSettings->defaultDevice + 100 << " can not be used. Available devices: " ; - EXPECT_EXIT(fpga_setup::selectFPGADevice(programSettings->defaultPlatform, programSettings->defaultDevice + 100), + fmt << "Default device " << bm->getExecutionSettings().programSettings->defaultDevice + 100 << " can not be used. Available devices: " ; + EXPECT_EXIT(fpga_setup::selectFPGADevice(bm->getExecutionSettings().programSettings->defaultPlatform, bm->getExecutionSettings().programSettings->defaultDevice + 100), ::testing::ExitedWithCode(1), ::testing::StartsWith(fmt.str())); } diff --git a/shared/testing/main.cpp b/shared/testing/main.cpp deleted file mode 100644 index 9cbe8554..00000000 --- a/shared/testing/main.cpp +++ /dev/null @@ -1,78 +0,0 @@ -/* -Copyright (c) 2020 Marius Meyer - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -/* Project's headers */ -#include "program_settings.h" -#include "setup/common_benchmark_io.hpp" -#include "setup/fpga_setup.hpp" -#include "test_program_settings.h" - -#include "gtest/gtest.h" -#include "CL/cl.hpp" - -#ifdef _USE_MPI_ -#include "mpi.h" - -class MPIEnvironment : public ::testing::Environment { -public: - MPIEnvironment(int* argc, char** argv[]) { - MPI_Init(argc, argv); - } - - ~MPIEnvironment() override { - MPI_Finalize(); - } -}; -#endif - -std::shared_ptr programSettings; - -/** -The program entry point for the unit tests -*/ -int -main(int argc, char *argv[]) { - - std::cout << "THIS BINARY EXECUTES UNIT TESTS FOR THE FOLLOWING BENCHMARK:" << std::endl << std::endl; - - ::testing::InitGoogleTest(&argc, argv); - -#ifdef _USE_MPI_ - ::testing::Environment* const mpi_env = - ::testing::AddGlobalTestEnvironment(new MPIEnvironment(&argc, &argv)); -#endif - - // Parse input parameters - programSettings = parseProgramParameters(argc, argv); - fpga_setup::setupEnvironmentAndClocks(); - - std::vector usedDevice = - fpga_setup::selectFPGADevice(programSettings->defaultPlatform, - programSettings->defaultDevice); - - // Print input parameters - printFinalConfiguration(programSettings, usedDevice[0]); - - return RUN_ALL_TESTS(); - -} - diff --git a/shared/testing/test_program_settings.h b/shared/testing/test_program_settings.h deleted file mode 100644 index a9a70991..00000000 --- a/shared/testing/test_program_settings.h +++ /dev/null @@ -1,27 +0,0 @@ -/* -Copyright (c) 2020 Marius Meyer - 
-Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
-*/ - -/* Project's headers */ -#include "program_settings.h" - - -extern std::shared_ptr programSettings; From 5c6b0e2ce286477048a176038a05187d9d2a5925 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 15 May 2020 12:37:27 +0200 Subject: [PATCH 18/45] Adjust output parser to new STREAM output --- STREAM/src/host/stream_benchmark.cpp | 19 ------------------- scripts/evaluation/parse_raw_to_csv.py | 2 +- shared/include/hpcc_benchmark.hpp | 6 ++++-- 3 files changed, 5 insertions(+), 22 deletions(-) diff --git a/STREAM/src/host/stream_benchmark.cpp b/STREAM/src/host/stream_benchmark.cpp index 007a97ba..b73d58de 100644 --- a/STREAM/src/host/stream_benchmark.cpp +++ b/STREAM/src/host/stream_benchmark.cpp @@ -53,25 +53,6 @@ stream::StreamProgramSettings::getSettingsMap() { return map; } -/** - * @brief Print method for the stream specific program settings - * - * @param os - * @param printedSettings - * @return std::ostream& - */ -std::ostream& operator<<(std::ostream& os, stream::StreamProgramSettings const& printedSettings) { - return os << "Data Type: " << STR(HOST_DATA_TYPE) - << std::endl - << "Array Size: " << printedSettings.streamArraySize << " (" - << static_cast(printedSettings.streamArraySize * sizeof(HOST_DATA_TYPE)) <<" Byte )" - << std::endl - << "Kernel Replications: " << printedSettings.kernelReplications - << std::endl - << "Kernel Type: " << (printedSettings.useSingleKernel ? 
"Single" : "Separate") - << std::endl; -} - stream::StreamBenchmark::StreamBenchmark(int argc, char* argv[]) { setupBenchmark(argc, argv); } diff --git a/scripts/evaluation/parse_raw_to_csv.py b/scripts/evaluation/parse_raw_to_csv.py index c10263e0..1a7b59f8 100755 --- a/scripts/evaluation/parse_raw_to_csv.py +++ b/scripts/evaluation/parse_raw_to_csv.py @@ -13,7 +13,7 @@ gemm_regex = "Version:\\s+(?P.+)\n(.*\n)+Total\\smatrix\\ssize:\\s+(?P\d+)(.*\n)+Device:\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+best\\s+mean\\s+GFLOPS\n\\s+(?P.+)\\s+(?P.+)\\s+(?P.+)" ra_regex = "Version:\\s+(?P.+)\n(.*\n)+Kernel\\sReplications:\\s+(?P\d+)(.*\n)+Total\\sdata\\ssize:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+Device:\\s+(?P.+)\n(.*\n)+\\s+best\\s+mean\\s+GUOPS\\s+error\n\\s+(?P.+)\\s+(?P.+)\\s+(?P.+)\\s+(?P.+)" trans_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize:\\s+(?P\d+)(.*\n)+Device:\\s+(?P.+)\n(.*\n)+\\s+trans\\s+calc\\s+calc\\s+FLOPS\\s+total\\s+FLOPS\n\\s*avg:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s*best:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s*Maximum\\serror:\\s+(?P(\d|\.|\+|-|e)+)" -stream_regex = 
"Version:\\s+(?P.+)\n(.*\n)+Array\\sSize:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+Data\\sType:\\s+(?P.+)\n(.*\n)+Kernel\\sReplications:\\s+(?P\d+)(.*\n)+Kernel\\sType:\\s+(?P.+)\n(.*\n)+Device:\\s+(?P.+)\n(.*\n)+\\s+Function\\s+Best\\sRate\\sMB/s\\s+Avg\\stime\\ss\\s+Min\\stime\\s+Max\\stime\n\\s+Add\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Copy\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+PCI\\sread\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+PCI\\swrite\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Scale\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Triad\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" +stream_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+\\d+\\s+\\((?P(\d|\.|\+|-|e)+)(.*\n)+Data\\sType\\s+(?P.+)\n(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Kernel\\sType\\s+(?P.+)\n(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+Function\\s+Best\\sRate\\sMB/s\\s+Avg\\stime\\ss\\s+Min\\stime\\s+Max\\stime\n\\s+Add\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Copy\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+PCI\\sread\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+PCI\\swrite\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Scale\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Triad\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" linpack_regex = 
"Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize:\\s+(?P\d+)(.*\n)+Data\\sType:\\s+(?P.+)\n(.*\n)+Device:\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep.+\n\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GFLOPS(.*\n)+\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index c15819ed..31339ba0 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -383,11 +383,13 @@ class HpccFpgaBenchmark { template std::ostream& operator<<(std::ostream& os, ExecutionSettings const& printedExecutionSettings){ std::string device_name; + os << std::left; printedExecutionSettings.device.getInfo(CL_DEVICE_NAME, &device_name); for (auto k : printedExecutionSettings.programSettings->getSettingsMap()) { - os << std::setw(2 * ENTRY_SPACE) << std::left<< k.first << k.second << std::endl; + os << std::setw(2 * ENTRY_SPACE) << k.first << k.second << std::endl; } - os << std::setw(2 * ENTRY_SPACE) << std::left << "Device" << device_name << std::endl; + os << std::setw(2 * ENTRY_SPACE) << "Device" << device_name << std::endl; + os << std::right; return os; } From 1dcef64c0a32648d3b770d9f5d2aba26ed521c34 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 15 May 2020 13:33:05 +0200 Subject: [PATCH 19/45] Fix build errors --- STREAM/src/host/CMakeLists.txt | 2 +- STREAM/tests/CMakeLists.txt | 2 +- shared/CMakeLists.txt | 4 ---- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/STREAM/src/host/CMakeLists.txt b/STREAM/src/host/CMakeLists.txt index 99ade190..558f0f12 100755 --- a/STREAM/src/host/CMakeLists.txt +++ b/STREAM/src/host/CMakeLists.txt @@ -1,4 +1,4 @@ -add_subdirectory(../../../shared ${CMAKE_CURRENT_BINARY_DIR}/lib/hpccbase) +add_subdirectory(../../../shared ${CMAKE_BINARY_DIR}/lib/hpccbase) set(HOST_SOURCE execution_default.cpp stream_benchmark.cpp) if (INTELFPGAOPENCL_FOUND) 
diff --git a/STREAM/tests/CMakeLists.txt b/STREAM/tests/CMakeLists.txt index 5aa48294..b308f0f9 100755 --- a/STREAM/tests/CMakeLists.txt +++ b/STREAM/tests/CMakeLists.txt @@ -1,6 +1,6 @@ # 'lib' is the folder with Google Test sources -add_subdirectory(../../extern/googletest ${CMAKE_CURRENT_BINARY_DIR}/lib/googletest) +add_subdirectory(../../extern/googletest ${CMAKE_BINARY_DIR}/lib/googletest) include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) include_directories(${CMAKE_BINARY_DIR}/src/common .) diff --git a/shared/CMakeLists.txt b/shared/CMakeLists.txt index 2fbe052e..ac6789bd 100644 --- a/shared/CMakeLists.txt +++ b/shared/CMakeLists.txt @@ -3,7 +3,3 @@ project(HPCCBaseLibrary VERSION 1.0.0) add_library(hpcc_fpga_base STATIC ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp) target_include_directories(hpcc_fpga_base PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/../extern/cxxopts/include) - -install(TARGETS hpcc_fpga_base -PUBLIC_HEADER DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" -ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}") \ No newline at end of file From 67e1d67c27cee0a7e58c8671160c4f5627ff4f81 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 15 May 2020 17:48:40 +0200 Subject: [PATCH 20/45] Start converting random access -host is working - tests have to be converted - parsing has to be adapted! 
--- RandomAccess/src/common/parameters.h.in | 24 +++ RandomAccess/src/host/CMakeLists.txt | 39 ++-- RandomAccess/src/host/execution.h | 24 +-- RandomAccess/src/host/execution_single.cpp | 52 +++-- RandomAccess/src/host/main.cpp | 60 +----- .../src/host/random_access_benchmark.cpp | 145 ++++++++++++++ .../src/host/random_access_benchmark.hpp | 177 ++++++++++++++++++ RandomAccess/tests/CMakeLists.txt | 36 ++-- RandomAccess/tests/main.cpp | 64 +++++++ 9 files changed, 490 insertions(+), 131 deletions(-) create mode 100644 RandomAccess/src/host/random_access_benchmark.cpp create mode 100644 RandomAccess/src/host/random_access_benchmark.hpp create mode 100644 RandomAccess/tests/main.cpp diff --git a/RandomAccess/src/common/parameters.h.in b/RandomAccess/src/common/parameters.h.in index d0a5ea2a..5bf27f6a 100644 --- a/RandomAccess/src/common/parameters.h.in +++ b/RandomAccess/src/common/parameters.h.in @@ -25,6 +25,30 @@ #cmakedefine COMBINE_LOOPS #cmakedefine USE_SVM +/* +Short description of the program. +Moreover the version and build time is also compiled into the description. +*/ + +#define PROGRAM_DESCRIPTION "Implementation of the random access benchmark"\ + " proposed in the HPCC benchmark suite for FPGA.\n"\ + "Version: " VERSION "\n" + +/** +Prefix of the function name of the used kernel. +It will be used to construct the full function name for the case of replications. +The full name will be +*/ +#define RANDOM_ACCESS_KERNEL "accessMemory_" + +/** +Constants used to verify benchmark results +*/ +#define POLY 7 +#define PERIOD 1317624576693539401L + +#define BIT_SIZE (sizeof(HOST_DATA_TYPE) * 8) + /** Output separator */ diff --git a/RandomAccess/src/host/CMakeLists.txt b/RandomAccess/src/host/CMakeLists.txt index 3e0f9bd0..374b6d1d 100755 --- a/RandomAccess/src/host/CMakeLists.txt +++ b/RandomAccess/src/host/CMakeLists.txt @@ -1,23 +1,34 @@ -include_directories(../../../extern/cxxopts/include ../../../shared) -include_directories(.) 
+add_subdirectory(../../../shared ${CMAKE_BINARY_DIR}/lib/hpccbase) +set(HOST_SOURCE execution_single.cpp random_access_benchmark.cpp) -set(HOST_SOURCE execution_single.cpp main.cpp common_benchmark_io_implementation.cpp ../../../shared/setup/fpga_setup.cpp random_access_functionality.cpp) +set(HOST_EXE_NAME RandomAccess) +set(LIB_NAME ra) if (INTELFPGAOPENCL_FOUND) - include_directories(${IntelFPGAOpenCL_INCLUDE_DIRS}) - include_directories(${CMAKE_BINARY_DIR}/src/common) - add_executable(RandomAccess_intel ${HOST_SOURCE}) - target_link_libraries(RandomAccess_intel ${IntelFPGAOpenCL_LIBRARIES}) + add_library(${LIB_NAME}_intel STATIC ${HOST_SOURCE}) + target_include_directories(${LIB_NAME}_intel PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${IntelFPGAOpenCL_INCLUDE_DIRS}) + target_include_directories(${LIB_NAME}_intel PUBLIC ${CMAKE_SOURCE_DIR}/src/host) + add_executable(${HOST_EXE_NAME}_intel main.cpp) + target_link_libraries(${LIB_NAME}_intel "${IntelFPGAOpenCL_LIBRARIES}" "${OpenMP_CXX_FLAGS}") + target_link_libraries(${LIB_NAME}_intel hpcc_fpga_base) + target_link_libraries(${HOST_EXE_NAME}_intel ${LIB_NAME}_intel) if (USE_SVM) - target_compile_definitions(RandomAccess_intel PRIVATE -DCL_VERSION_2_0) + target_compile_definitions(${LIB_NAME}_intel PRIVATE -DCL_VERSION_2_0) endif() - target_compile_definitions(RandomAccess_intel PRIVATE -DINTEL_FPGA) + target_compile_definitions(${LIB_NAME}_intel PRIVATE -DINTEL_FPGA) + target_compile_options(${LIB_NAME}_intel PRIVATE "${OpenMP_CXX_FLAGS}") + add_test(NAME test_intel_host_executable COMMAND $ -h) endif() if (Vitis_FOUND) - include_directories(${Vitis_INCLUDE_DIRS}) - include_directories(${CMAKE_BINARY_DIR}/src/common) - add_executable(RandomAccess_xilinx ${HOST_SOURCE}) - target_link_libraries(RandomAccess_xilinx ${Vitis_LIBRARIES}) - target_compile_definitions(RandomAccess_xilinx PRIVATE -DXILINX_FPGA) + add_library(${LIB_NAME}_xilinx STATIC ${HOST_SOURCE}) + 
target_include_directories(${LIB_NAME}_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS}) + target_include_directories(${LIB_NAME}_xilinx PUBLIC ${CMAKE_SOURCE_DIR}/src/host) + add_executable(${HOST_EXE_NAME}_xilinx main.cpp) + target_link_libraries(${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") + target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base) + target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx) + target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA) + target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") + add_test(NAME test_xilinx_host_executable COMMAND $ -h) endif() diff --git a/RandomAccess/src/host/execution.h b/RandomAccess/src/host/execution.h index 2a691fc0..d494b321 100644 --- a/RandomAccess/src/host/execution.h +++ b/RandomAccess/src/host/execution.h @@ -30,29 +30,11 @@ SOFTWARE. #include "CL/cl.hpp" #include "parameters.h" +#include "random_access_benchmark.hpp" namespace bm_execution { -struct ExecutionConfiguration { - cl::Context context; - cl::Device device; - cl::Program program; - uint repetitions; - uint replications; - size_t arraySize; -}; - -/** -This struct is returned by the calculate call and contains the measured -runtimes and the error rate in the data set after the updates. - -@see bm_execution::calculate() -*/ -struct ExecutionResults { - std::vector times; -}; - /** The actual execution of the benchmark. This method can be implemented in multiple *.cpp files. This header enables @@ -71,8 +53,8 @@ simple exchange of the different calculation methods. 
@return The time measurements and the error rate counted from the executions */ -std::shared_ptr -calculate(std::shared_ptr config, HOST_DATA_TYPE * data); +std::shared_ptr +calculate(hpcc_base::ExecutionSettings const& config, HOST_DATA_TYPE * data); } // namespace bm_execution diff --git a/RandomAccess/src/host/execution_single.cpp b/RandomAccess/src/host/execution_single.cpp index 44a22265..94ba4574 100644 --- a/RandomAccess/src/host/execution_single.cpp +++ b/RandomAccess/src/host/execution_single.cpp @@ -32,9 +32,6 @@ SOFTWARE. /* External library headers */ #include "CL/cl.hpp" -/* Project's headers */ -#include "setup/fpga_setup.hpp" -#include "random_access_functionality.hpp" namespace bm_execution { @@ -42,8 +39,8 @@ namespace bm_execution { Implementation for the single kernel. @copydoc bm_execution::calculate() */ - std::shared_ptr - calculate(std::shared_ptr config, HOST_DATA_TYPE * data) { + std::shared_ptr + calculate(hpcc_base::ExecutionSettings const& config, HOST_DATA_TYPE * data) { // int used to check for OpenCL errors int err; @@ -54,23 +51,23 @@ namespace bm_execution { /* --- Prepare kernels --- */ - for (int r=0; r < config->replications; r++) { - compute_queue.push_back(cl::CommandQueue(config->context, config->device)); + for (int r=0; r < config.programSettings->kernelReplications; r++) { + compute_queue.push_back(cl::CommandQueue(config.context, config.device)); int memory_bank_info = 0; #ifdef INTEL_FPGA memory_bank_info = ((r + 1) << 16); #endif - Buffer_data.push_back(cl::Buffer(config->context, + Buffer_data.push_back(cl::Buffer(config.context, CL_MEM_READ_WRITE | memory_bank_info, - sizeof(HOST_DATA_TYPE)*(config->arraySize / config->replications))); + sizeof(HOST_DATA_TYPE)*(config.programSettings->dataSize / config.programSettings->kernelReplications))); #ifdef INTEL_FPGA - accesskernel.push_back(cl::Kernel(config->program, + accesskernel.push_back(cl::Kernel(config.program, (RANDOM_ACCESS_KERNEL + std::to_string(r)).c_str() , 
&err)); #endif #ifdef XILINX_FPGA - accesskernel.push_back(cl::Kernel(config->program, + accesskernel.push_back(cl::Kernel(config.program, (std::string(RANDOM_ACCESS_KERNEL) + "0:{" + RANDOM_ACCESS_KERNEL + "0_" + std::to_string(r + 1) + "}").c_str() , &err)); #endif @@ -79,16 +76,16 @@ namespace bm_execution { // prepare kernels #ifdef USE_SVM err = clSetKernelArgSVMPointer(accesskernel[r](), 0, - reinterpret_cast(&data[r * (config->arraySize / config->replications)])); + reinterpret_cast(&data[r * (config.programSettings->dataSize / config.programSettings->kernelReplications)])); #else err = accesskernel[r].setArg(0, Buffer_data[r]); #endif ASSERT_CL(err); - err = accesskernel[r].setArg(1, HOST_DATA_TYPE(config->arraySize)); + err = accesskernel[r].setArg(1, HOST_DATA_TYPE(config.programSettings->dataSize)); ASSERT_CL(err); err = accesskernel[r].setArg(2, - HOST_DATA_TYPE((config->arraySize / config->replications))); + HOST_DATA_TYPE((config.programSettings->dataSize / config.programSettings->kernelReplications))); ASSERT_CL(err); err = accesskernel[r].setArg(3, cl_uint(r)); @@ -98,24 +95,24 @@ namespace bm_execution { /* --- Execute actual benchmark kernels --- */ std::vector executionTimes; - for (int i = 0; i < config->repetitions; i++) { + for (int i = 0; i < config.programSettings->numRepetitions; i++) { std::chrono::time_point t1; #pragma omp parallel default(shared) { #pragma omp for - for (int r = 0; r < config->replications; r++) { + for (int r = 0; r < config.programSettings->kernelReplications; r++) { #ifdef USE_SVM clEnqueueSVMMap(compute_queue[r](), CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, - reinterpret_cast(&data[r * (config->arraySize / config->replications)]), + reinterpret_cast(&data[r * (config.programSettings->dataSize / config.programSettings->kernelReplications)]), sizeof(HOST_DATA_TYPE) * - (config->arraySize / config->replications), 0, + (config.programSettings->dataSize / config.programSettings->kernelReplications), 0, NULL, NULL); #else 
compute_queue[r].enqueueWriteBuffer(Buffer_data[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * - (config->arraySize / config->replications), - &data[r * (config->arraySize / config->replications)]); + (config.programSettings->dataSize / config.programSettings->kernelReplications), + &data[r * (config.programSettings->dataSize / config.programSettings->kernelReplications)]); #endif } #pragma omp master @@ -125,11 +122,11 @@ namespace bm_execution { } #pragma omp barrier #pragma omp for nowait - for (int r = 0; r < config->replications; r++) { + for (int r = 0; r < config.programSettings->kernelReplications; r++) { compute_queue[r].enqueueTask(accesskernel[r]); } #pragma omp for - for (int r = 0; r < config->replications; r++) { + for (int r = 0; r < config.programSettings->kernelReplications; r++) { compute_queue[r].finish(); } #pragma omp master @@ -144,20 +141,19 @@ namespace bm_execution { } /* --- Read back results from Device --- */ - for (int r=0; r < config->replications; r++) { + for (int r=0; r < config.programSettings->kernelReplications; r++) { #ifdef USE_SVM clEnqueueSVMUnmap(compute_queue[r](), - reinterpret_cast(&data[r * (config->arraySize / config->replications)]), 0, + reinterpret_cast(&data[r * (config.programSettings->dataSize / config.programSettings->kernelReplications)]), 0, NULL, NULL); #else compute_queue[r].enqueueReadBuffer(Buffer_data[r], CL_TRUE, 0, - sizeof(HOST_DATA_TYPE)*(config->arraySize / config->replications), &data[r * (config->arraySize / config->replications)]); + sizeof(HOST_DATA_TYPE)*(config.programSettings->dataSize / config.programSettings->kernelReplications), + &data[r * (config.programSettings->dataSize / config.programSettings->kernelReplications)]); #endif } - std::shared_ptr results( - new ExecutionResults{executionTimes}); - return results; + return std::shared_ptr(new random_access::RandomAccessExecutionTimings{executionTimes}); } } // namespace bm_execution diff --git a/RandomAccess/src/host/main.cpp 
b/RandomAccess/src/host/main.cpp index ec9ef0cc..f67d1e58 100644 --- a/RandomAccess/src/host/main.cpp +++ b/RandomAccess/src/host/main.cpp @@ -1,10 +1,6 @@ -// -// Created by Marius Meyer on 04.12.19. -// +#include "random_access_benchmark.hpp" -#include "random_access_functionality.hpp" -#include "setup/common_benchmark_io.hpp" -#include "execution.h" +using namespace random_access; /** The program entry point @@ -12,49 +8,13 @@ The program entry point int main(int argc, char *argv[]) { // Setup benchmark - std::shared_ptr programSettings = - parseProgramParameters(argc, argv); - fpga_setup::setupEnvironmentAndClocks(); - std::vector usedDevice = - fpga_setup::selectFPGADevice(programSettings->defaultPlatform, - programSettings->defaultDevice); - cl::Context context = cl::Context(usedDevice); - cl::Program program = fpga_setup::fpgaSetup(&context, usedDevice, - &programSettings->kernelFileName); - - printFinalConfiguration(programSettings, usedDevice[0]); - - std::shared_ptr config( - new bm_execution::ExecutionConfiguration { - context, usedDevice[0], program, - programSettings->numRepetitions, - programSettings->numReplications, - programSettings->dataSize - }); - - HOST_DATA_TYPE *data; -#ifdef USE_SVM - data = reinterpret_cast( - clSVMAlloc(context(), 0 , - programSettings->dataSize * sizeof(HOST_DATA_TYPE), 1024)); -#else - posix_memalign(reinterpret_cast(&data), 4096, programSettings->dataSize * sizeof(HOST_DATA_TYPE)); -#endif - - generateInputData(data, programSettings->dataSize); - - auto timing = bm_execution::calculate(config, data); - - double error = checkRandomAccessResults(data, programSettings->dataSize); - -#ifdef USE_SVM - clSVMFree(context(), reinterpret_cast(data)); -#else - free(data); -#endif - - printResults(timing, programSettings->dataSize, error); - - return error < 0.01 ? 
0 : 1; + auto bm = RandomAccessBenchmark(argc, argv); + bool success = bm.executeBenchmark(); + if (success) { + return 0; + } + else { + return 1; + } } diff --git a/RandomAccess/src/host/random_access_benchmark.cpp b/RandomAccess/src/host/random_access_benchmark.cpp new file mode 100644 index 00000000..8b99e11b --- /dev/null +++ b/RandomAccess/src/host/random_access_benchmark.cpp @@ -0,0 +1,145 @@ +// +// Created by Marius Meyer on 04.12.19. +// + +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "random_access_benchmark.hpp" + +/* C++ standard library headers */ +#include +#include + +/* Project's headers */ +#include "execution.h" +#include "parameters.h" + +random_access::RandomAccessProgramSettings::RandomAccessProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), + dataSize(results["d"].as()), + kernelReplications(results["r"].as()) { + +} + +std::map +random_access::RandomAccessProgramSettings::getSettingsMap() { + auto map = hpcc_base::BaseSettings::getSettingsMap(); + std::stringstream ss; + ss << dataSize << " (" << static_cast(dataSize * sizeof(HOST_DATA_TYPE)) << " Byte )"; + map["Array Size"] = ss.str(); + map["Kernel Replications"] = std::to_string(kernelReplications); + return map; +} + +random_access::RandomAccessBenchmark::RandomAccessBenchmark(int argc, char* argv[]) { + setupBenchmark(argc, argv); +} + +random_access::RandomAccessBenchmark::RandomAccessBenchmark() {} + +void +random_access::RandomAccessBenchmark::addAdditionalParseOptions(cxxopts::Options &options) { + options.add_options() + ("d", "Size of the data array", + cxxopts::value()->default_value(std::to_string(DEFAULT_ARRAY_LENGTH))) + ("r", "Number of kernel replications used", + cxxopts::value()->default_value(std::to_string(NUM_KERNEL_REPLICATIONS))) + ("single-kernel", "Use the single kernel implementation"); +} + +std::shared_ptr +random_access::RandomAccessBenchmark::executeKernel(const hpcc_base::ExecutionSettings &settings, RandomAccessData &data) { + return bm_execution::calculate(settings, data.data); +} + +/** +Prints the execution results to stdout + +@param results The execution results +*/ +void +random_access::RandomAccessBenchmark::printResults(const hpcc_base::ExecutionSettings &settings, const random_access::RandomAccessExecutionTimings &output) { + std::cout << std::setw(ENTRY_SPACE) + << "best" << std::setw(ENTRY_SPACE) << "mean" + << std::setw(ENTRY_SPACE) << "GUOPS" << std::endl; + + // Calculate performance for 
kernel execution plus data transfer + double tmean = 0; + double tmin = std::numeric_limits::max(); + double gups = static_cast(4 * settings.programSettings->dataSize) / 1000000000; + for (double currentTime : output.times) { + tmean += currentTime; + if (currentTime < tmin) { + tmin = currentTime; + } + } + tmean = tmean / output.times.size(); + + std::cout << std::setw(ENTRY_SPACE) + << tmin << std::setw(ENTRY_SPACE) << tmean + << std::setw(ENTRY_SPACE) << gups / tmin + << std::endl; + +} + +std::shared_ptr +random_access::RandomAccessBenchmark::generateInputData(const hpcc_base::ExecutionSettings &settings) { + HOST_DATA_TYPE *data; +#ifdef USE_SVM + data = reinterpret_cast( + clSVMAlloc(context(), 0 , + settings.programSettings->dataSize * sizeof(HOST_DATA_TYPE), 1024)); +#else + posix_memalign(reinterpret_cast(&data), 4096, settings.programSettings->dataSize * sizeof(HOST_DATA_TYPE)); +#endif + + for (HOST_DATA_TYPE j=0; j < settings.programSettings->dataSize ; j++) { + data[j] = j; + } + + return std::shared_ptr(new RandomAccessData{data}); +} + +bool +random_access::RandomAccessBenchmark::validateOutputAndPrintError(const hpcc_base::ExecutionSettings &settings ,random_access::RandomAccessData &data, const random_access::RandomAccessExecutionTimings &output) { + HOST_DATA_TYPE temp = 1; + for (HOST_DATA_TYPE i=0; i < 4L*settings.programSettings->dataSize; i++) { + HOST_DATA_TYPE_SIGNED v = 0; + if (((HOST_DATA_TYPE_SIGNED)temp) < 0) { + v = POLY; + } + temp = (temp << 1) ^ v; + data.data[(temp >> 3) & (settings.programSettings->dataSize - 1)] ^= temp; + } + + double errors = 0; +#pragma omp parallel for reduction(+:errors) + for (HOST_DATA_TYPE i=0; i< settings.programSettings->dataSize; i++) { + if (data.data[i] != i) { + errors++; + } + } + std::cout << "Error: " << (static_cast(errors) / settings.programSettings->dataSize) * 100 + << "%" << std::endl; + + return (static_cast(errors) / settings.programSettings->dataSize) < 0.01; +} \ No newline at end of 
file diff --git a/RandomAccess/src/host/random_access_benchmark.hpp b/RandomAccess/src/host/random_access_benchmark.hpp new file mode 100644 index 00000000..2f39294f --- /dev/null +++ b/RandomAccess/src/host/random_access_benchmark.hpp @@ -0,0 +1,177 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#ifndef SRC_HOST_RANDOM_ACCESS_BENCHMARK_H_ +#define SRC_HOST_RANDOM_ACCESS_BENCHMARK_H_ + +/* C++ standard library headers */ +#include +#include + +/* Project's headers */ +#include "hpcc_benchmark.hpp" +#include "parameters.h" + +/** + * @brief Contains all classes and methods needed by the STREAM benchmark + * + */ +namespace random_access { + +/** + * @brief The random access specific program settings + * + */ +class RandomAccessProgramSettings : public hpcc_base::BaseSettings { + +public: + /** + * @brief The size of the data array + * + */ + size_t dataSize; + + /** + * @brief The number of used kernel replications + * + */ + uint kernelReplications; + + /** + * @brief Construct a new random access Program Settings object + * + * @param results the result map from parsing the program input parameters + */ + RandomAccessProgramSettings(cxxopts::ParseResult &results); + + /** + * @brief Construct a new random access Program Settings object + * + * @return a map of program parameters. keys are the name of the parameter. 
+ */ + std::map getSettingsMap() override; + +}; + +/** + * @brief Data class cotnaining the data the kernel is exeucted with + * + */ +class RandomAccessData { + +public: + HOST_DATA_TYPE *data; + RandomAccessData(HOST_DATA_TYPE *data_) : data(data_) {} + +}; + +/** + * @brief Measured execution timing from the kernel execution + * + */ +class RandomAccessExecutionTimings { +public: + /** + * @brief A vector containing the timings for all repetitions + * + */ + std::vector times; + +}; + +/** + * @brief Implementation of the random access benchmark + * + */ +class RandomAccessBenchmark : public hpcc_base::HpccFpgaBenchmark { + +protected: + + /** + * @brief Additional input parameters of the random access benchmark + * + * @param options + */ + void + addAdditionalParseOptions(cxxopts::Options &options) override; + +public: + + /** + * @brief Random access specific implementation of the data generation + * + * @param settings + * @return std::shared_ptr + */ + std::shared_ptr + generateInputData(const hpcc_base::ExecutionSettings &settings) override; + + /** + * @brief RandomAccess specific implementation of the kernel execution + * + * @param settings + * @param data + * @return std::shared_ptr + */ + std::shared_ptr + executeKernel(const hpcc_base::ExecutionSettings &settings, RandomAccessData &data) override; + + /** + * @brief RandomAccess specific implementation of the execution validation + * + * @param settings + * @param data + * @param output + * @return true + * @return false + */ + bool + validateOutputAndPrintError(const hpcc_base::ExecutionSettings &settings ,RandomAccessData &data, const RandomAccessExecutionTimings &output) override; + + /** + * @brief RandomAccess specific implementation of printing the execution results + * + * @param settings + * @param output + */ + void + printResults(const hpcc_base::ExecutionSettings &settings, const RandomAccessExecutionTimings &output) override; + + /** + * @brief Construct a new RandomAccess Benchmark 
object + * + * @param argc the number of program input parameters + * @param argv the program input parameters as array of strings + */ + RandomAccessBenchmark(int argc, char* argv[]); + + /** + * @brief Construct a new RandomAccess Benchmark object + */ + RandomAccessBenchmark(); + +}; + +} // namespace stream + + +#endif // SRC_HOST_STREAM_BENCHMARK_H_ diff --git a/RandomAccess/tests/CMakeLists.txt b/RandomAccess/tests/CMakeLists.txt index 2790fbdf..2782e9de 100755 --- a/RandomAccess/tests/CMakeLists.txt +++ b/RandomAccess/tests/CMakeLists.txt @@ -1,29 +1,29 @@ -# 'lib' is the folder with Google Test sources -add_subdirectory(../../extern/googletest ${CMAKE_CURRENT_BINARY_DIR}/lib) +add_subdirectory(../../extern/googletest ${CMAKE_BINARY_DIR}/lib/googletest) include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) -include_directories(${CMAKE_BINARY_DIR}/src/common ../../extern/cxxopts/include ../../shared) -include_directories(${CMAKE_SOURCE_DIR}/src/host) +include_directories(${CMAKE_BINARY_DIR}/src/common) -set(PROJECT_SOURCES ../../shared/setup/fpga_setup.cpp ../src/host/common_benchmark_io_implementation.cpp ../src/host/execution_single.cpp ../src/host/random_access_functionality.cpp) -set(TEST_SOURCES ../../shared/testing/main.cpp ../../shared/setup/test_fpga_setup.cpp test_host_code.cpp test_kernel_functionality_and_host_integration.cpp) +set(HOST_EXE_NAME RandomAccess) +set(LIB_NAME ra) + +set(TEST_SOURCES main.cpp test_host_code.cpp ) #test_kernel_functionality_and_host_integration.cpp) if (INTELFPGAOPENCL_FOUND) include_directories(SYSTEM ${IntelFPGAOpenCL_INCLUDE_DIRS}) - add_executable(Test_intel ${TEST_SOURCES} ${PROJECT_SOURCES}) - target_link_libraries(Test_intel gtest gmock ${IntelFPGAOpenCL_LIBRARIES} "${OpenMP_CXX_FLAGS}") - add_dependencies(Test_intel random_access_kernels_single_emulate_intel) - target_compile_definitions(Test_intel PRIVATE -DINTEL_FPGA) - target_compile_options(Test_intel PRIVATE "${OpenMP_CXX_FLAGS}") - 
add_test(NAME test_intel_unit COMMAND $ -f random_access_kernels_single_emulate.aocx WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_executable(${HOST_EXE_NAME}_test_intel ${TEST_SOURCES} ${PROJECT_SOURCES}) + target_link_libraries(${HOST_EXE_NAME}_test_intel gtest gmock ${LIB_NAME}_intel ${IntelFPGAOpenCL_LIBRARIES} "${OpenMP_CXX_FLAGS}") + add_dependencies(${HOST_EXE_NAME}_test_intel random_access_kernels_single_emulate_intel) + target_compile_definitions(${HOST_EXE_NAME}_test_intel PRIVATE -DINTEL_FPGA) + target_compile_options(${HOST_EXE_NAME}_test_intel PRIVATE "${OpenMP_CXX_FLAGS}") + add_test(NAME ${HOST_EXE_NAME}_test_intel_single_unit COMMAND $ -f random_access_kernels_single_emulate.aocx WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() if (Vitis_FOUND) include_directories(SYSTEM ${Vitis_INCLUDE_DIRS}) - add_executable(Test_xilinx ${TEST_SOURCES} ${PROJECT_SOURCES}) - target_link_libraries(Test_xilinx gtest gmock ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") - add_dependencies(Test_xilinx random_access_kernels_single_emulate_xilinx) - target_compile_definitions(Test_xilinx PRIVATE -DXILINX_FPGA) - target_compile_options(Test_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") - add_test(NAME test_xilinx_unit COMMAND $ -f random_access_kernels_single_emulate.xclbin WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_executable(${HOST_EXE_NAME}_test_xilinx ${TEST_SOURCES} ${PROJECT_SOURCES}) + target_link_libraries(${HOST_EXE_NAME}_test_xilinx gtest gmock ${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") + add_dependencies(${HOST_EXE_NAME}_test_xilinx random_access_kernels_single_emulate_xilinx) + target_compile_definitions(${HOST_EXE_NAME}_test_xilinx PRIVATE -DXILINX_FPGA) + target_compile_options(${HOST_EXE_NAME}_test_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") + add_test(NAME ${HOST_EXE_NAME}_test_xilinx_single_unit COMMAND $ -f random_access_kernels_single_emulate.xclbin WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() \ No newline at end of file diff 
--git a/RandomAccess/tests/main.cpp b/RandomAccess/tests/main.cpp new file mode 100644 index 00000000..58b36c40 --- /dev/null +++ b/RandomAccess/tests/main.cpp @@ -0,0 +1,64 @@ +/* +Copyright (c) 2020 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +/* Project's headers */ +#include "random_access_benchmark.hpp" + +#include "gtest/gtest.h" +#include "CL/cl.hpp" + +#ifdef _USE_MPI_ +#include "mpi.h" + +class MPIEnvironment : public ::testing::Environment { +public: + MPIEnvironment(int* argc, char** argv[]) { + MPI_Init(argc, argv); + } + + ~MPIEnvironment() override { + MPI_Finalize(); + } +}; +#endif + +using namespace random_access; + +/** +The program entry point for the unit tests +*/ +int +main(int argc, char *argv[]) { + + std::cout << "THIS BINARY EXECUTES UNIT TESTS FOR THE FOLLOWING BENCHMARK:" << std::endl << std::endl; + + ::testing::InitGoogleTest(&argc, argv); + +#ifdef _USE_MPI_ + ::testing::Environment* const mpi_env = + ::testing::AddGlobalTestEnvironment(new MPIEnvironment(&argc, &argv)); +#endif + + return RUN_ALL_TESTS(); + +} + From 4f1595a888154ace41476fd417807b9d46265160 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 18 May 2020 09:25:26 +0200 Subject: [PATCH 21/45] Modify STREAM test cmake file --- STREAM/tests/CMakeLists.txt | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/STREAM/tests/CMakeLists.txt b/STREAM/tests/CMakeLists.txt index b308f0f9..bd73b663 100755 --- a/STREAM/tests/CMakeLists.txt +++ b/STREAM/tests/CMakeLists.txt @@ -6,23 +6,26 @@ include_directories(${CMAKE_BINARY_DIR}/src/common .) 
set(TEST_SOURCES ../../shared/setup/test_fpga_setup.cpp test_kernel_functionality_and_host_integration.cpp main.cpp) +set(HOST_EXE_NAME STREAM_FPGA) +set(LIB_NAME stream) + if (INTELFPGAOPENCL_FOUND) include_directories(SYSTEM ${IntelFPGAOpenCL_INCLUDE_DIRS}) - add_executable(Test_intel ${TEST_SOURCES} ${PROJECT_SOURCES}) - target_link_libraries(Test_intel gtest gmock stream_intel ${IntelFPGAOpenCL_LIBRARIES} "${OpenMP_CXX_FLAGS}") - add_dependencies(Test_intel stream_kernels_emulate_intel stream_kernels_single_emulate_intel) - target_compile_definitions(Test_intel PRIVATE -DINTEL_FPGA) - target_compile_options(Test_intel PRIVATE "${OpenMP_CXX_FLAGS}") - add_test(NAME test_intel_unit COMMAND $ -f stream_kernels_emulate.aocx WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) - add_test(NAME test_intel_single_unit COMMAND $ -f stream_kernels_single_emulate.aocx --single-kernel WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_executable(${HOST_EXE_NAME}_test_intel ${TEST_SOURCES} ${PROJECT_SOURCES}) + target_link_libraries(${HOST_EXE_NAME}_test_intel gtest gmock ${LIB_NAME}_intel ${IntelFPGAOpenCL_LIBRARIES} "${OpenMP_CXX_FLAGS}") + add_dependencies(${HOST_EXE_NAME}_test_intel stream_kernels_emulate_intel stream_kernels_single_emulate_intel) + target_compile_definitions(${HOST_EXE_NAME}_test_intel PRIVATE -DINTEL_FPGA) + target_compile_options(${HOST_EXE_NAME}_test_intel PRIVATE "${OpenMP_CXX_FLAGS}") + add_test(NAME ${HOST_EXE_NAME}_test_intel_unit COMMAND $ -f stream_kernels_emulate.aocx WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME ${HOST_EXE_NAME}_test_intel_single_unit COMMAND $ -f stream_kernels_single_emulate.aocx --single-kernel WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() if (Vitis_FOUND) include_directories(SYSTEM ${Vitis_INCLUDE_DIRS}) - add_executable(Test_xilinx ${TEST_SOURCES} ${PROJECT_SOURCES}) - target_link_libraries(Test_xilinx gtest gmock stream_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") - add_dependencies(Test_xilinx 
stream_kernels_single_emulate_xilinx) - target_compile_definitions(Test_xilinx PRIVATE -DXILINX_FPGA) - target_compile_options(Test_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") - add_test(NAME test_xilinx_single_unit COMMAND $ -f stream_kernels_single_emulate.xclbin --single-kernel WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) -endif() \ No newline at end of file + add_executable(${HOST_EXE_NAME}_test_xilinx ${TEST_SOURCES} ${PROJECT_SOURCES}) + target_link_libraries(${HOST_EXE_NAME}_test_xilinx gtest gmock ${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") + add_dependencies(${HOST_EXE_NAME}_test_xilinx stream_kernels_single_emulate_xilinx) + target_compile_definitions(${HOST_EXE_NAME}_test_xilinx PRIVATE -DXILINX_FPGA) + target_compile_options(${HOST_EXE_NAME}_test_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") + add_test(NAME ${HOST_EXE_NAME}_test_xilinx_single_unit COMMAND $ -f stream_kernels_single_emulate.xclbin --single-kernel WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) +endif() From e6d33237083df8ebba671e13c5ffbd20c94b965b Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 18 May 2020 09:25:57 +0200 Subject: [PATCH 22/45] Fix tests for RandomAccess --- RandomAccess/tests/CMakeLists.txt | 2 +- RandomAccess/tests/main.cpp | 4 + RandomAccess/tests/test_host_code.cpp | 32 +++++--- ...nel_functionality_and_host_integration.cpp | 75 +++++-------------- RandomAccess/tests/test_program_settings.h | 27 +++++++ scripts/evaluation/parse_raw_to_csv.py | 2 +- 6 files changed, 71 insertions(+), 71 deletions(-) create mode 100644 RandomAccess/tests/test_program_settings.h diff --git a/RandomAccess/tests/CMakeLists.txt b/RandomAccess/tests/CMakeLists.txt index 2782e9de..eadba306 100755 --- a/RandomAccess/tests/CMakeLists.txt +++ b/RandomAccess/tests/CMakeLists.txt @@ -6,7 +6,7 @@ include_directories(${CMAKE_BINARY_DIR}/src/common) set(HOST_EXE_NAME RandomAccess) set(LIB_NAME ra) -set(TEST_SOURCES main.cpp test_host_code.cpp ) 
#test_kernel_functionality_and_host_integration.cpp) +set(TEST_SOURCES main.cpp test_host_code.cpp test_kernel_functionality_and_host_integration.cpp) if (INTELFPGAOPENCL_FOUND) include_directories(SYSTEM ${IntelFPGAOpenCL_INCLUDE_DIRS}) diff --git a/RandomAccess/tests/main.cpp b/RandomAccess/tests/main.cpp index 58b36c40..fcbbdb6e 100644 --- a/RandomAccess/tests/main.cpp +++ b/RandomAccess/tests/main.cpp @@ -43,6 +43,8 @@ class MPIEnvironment : public ::testing::Environment { using namespace random_access; +std::shared_ptr bm; + /** The program entry point for the unit tests */ @@ -53,6 +55,8 @@ main(int argc, char *argv[]) { ::testing::InitGoogleTest(&argc, argv); + bm = std::shared_ptr(new RandomAccessBenchmark(argc, argv)); + #ifdef _USE_MPI_ ::testing::Environment* const mpi_env = ::testing::AddGlobalTestEnvironment(new MPIEnvironment(&argc, &argv)); diff --git a/RandomAccess/tests/test_host_code.cpp b/RandomAccess/tests/test_host_code.cpp index e7437bb9..daebafce 100644 --- a/RandomAccess/tests/test_host_code.cpp +++ b/RandomAccess/tests/test_host_code.cpp @@ -2,30 +2,38 @@ // Created by Marius Meyer on 04.12.19 // #include "gtest/gtest.h" -#include "../src/host/random_access_functionality.hpp" #include "parameters.h" +#include "random_access_benchmark.hpp" +#include "test_program_settings.h" + + +struct RandomAccessHostCodeTest : testing::Test { + + RandomAccessHostCodeTest() { + bm->getExecutionSettings().programSettings->dataSize = 1024; + } + +}; /** * Check if the correctness test gives correct results for correct array */ -TEST (FPGASetup, ResultValidationWorksForCorrectUpdates) { - HOST_DATA_TYPE data[1024]; - generateInputData(data, 1024); +TEST_F(RandomAccessHostCodeTest, ResultValidationWorksForCorrectUpdates) { + auto data = bm->generateInputData(bm->getExecutionSettings()); // do random accesses - checkRandomAccessResults(data, 1024); + bm->validateOutputAndPrintError(bm->getExecutionSettings(), *data, 
random_access::RandomAccessExecutionTimings{{}}); // check correctness of random accesses - double error = checkRandomAccessResults(data, 1024); - ASSERT_FLOAT_EQ(error, 0.0); + bool success = bm->validateOutputAndPrintError(bm->getExecutionSettings(), *data, random_access::RandomAccessExecutionTimings{{}}); + EXPECT_TRUE(success); } /** * Check if the correctness test gives correct results for not updated array */ -TEST (FPGASetup, ResultValidationWorksForWrongUpdates) { - HOST_DATA_TYPE data[1024]; - generateInputData(data, 1024); +TEST_F(RandomAccessHostCodeTest, ResultValidationWorksForWrongUpdates) { + auto data = bm->generateInputData(bm->getExecutionSettings()); // check correctness of random accesses - double error = checkRandomAccessResults(data, 1024); - ASSERT_GT(error, 0.3); + bool success = bm->validateOutputAndPrintError(bm->getExecutionSettings(), *data, random_access::RandomAccessExecutionTimings{{}}); + EXPECT_FALSE(success); } diff --git a/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp b/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp index b605c287..f32974e2 100644 --- a/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp +++ b/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp @@ -3,86 +3,47 @@ // #include "gtest/gtest.h" #include "parameters.h" -#include "../src/host/execution.h" -#include "setup/fpga_setup.hpp" -#include "../src/host/random_access_functionality.hpp" +#include "random_access_benchmark.hpp" +#include "test_program_settings.h" -struct OpenCLKernelTest : testing::Test { - HOST_DATA_TYPE *data; - std::shared_ptr config; - cl_uint array_size; +struct RandomAccessKernelTest : testing::Test { + std::shared_ptr data; - OpenCLKernelTest() { - array_size = 128 * NUM_KERNEL_REPLICATIONS * BUFFER_SIZE; - posix_memalign(reinterpret_cast(&data), 4096, - sizeof(HOST_DATA_TYPE) * array_size); + RandomAccessKernelTest() { + 
bm->getExecutionSettings().programSettings->dataSize = 128 * NUM_KERNEL_REPLICATIONS * BUFFER_SIZE; + bm->getExecutionSettings().programSettings->numRepetitions = 1; } - void setupFPGA(std::string kernelFileName) { - std::vector device = fpga_setup::selectFPGADevice(DEFAULT_PLATFORM, DEFAULT_DEVICE); - cl::Context context(device[0]); - cl::Program program = fpga_setup::fpgaSetup(&context, device, &kernelFileName); - config = std::make_shared( - bm_execution::ExecutionConfiguration{ - context, device[0], program, - 1, - NUM_KERNEL_REPLICATIONS, - array_size - }); - generateInputData(data, array_size); + void SetUp() override { + data = bm->generateInputData(bm->getExecutionSettings()); } - ~OpenCLKernelTest() override { - free(data); - } }; -struct DifferentOpenCLKernelTest : OpenCLKernelTest, testing::WithParamInterface { - DifferentOpenCLKernelTest() { - auto params = GetParam(); - auto kernel_file = params; - setupFPGA(kernel_file); - } -}; /** * Check if the number of measurements from the calculation matches the number of repetitions */ -TEST_P(DifferentOpenCLKernelTest, FPGACorrectNumberOfMeasurements1Rep) { - - auto result = bm_execution::calculate(config, data); +TEST_F(RandomAccessKernelTest, FPGACorrectNumberOfMeasurements1Rep) { + auto result = bm->executeKernel(bm->getExecutionSettings(), *data); EXPECT_EQ(result->times.size(), 1); } /** * Check if the number of measurements from the calculation matches the number of repetitions */ -TEST_P(DifferentOpenCLKernelTest, FPGACorrectNumberOfMeasurements3Rep) { - config->repetitions = 3; - auto result = bm_execution::calculate(config, data); +TEST_F(RandomAccessKernelTest, FPGACorrectNumberOfMeasurements3Rep) { + bm->getExecutionSettings().programSettings->numRepetitions = 3; + auto result = bm->executeKernel(bm->getExecutionSettings(), *data); EXPECT_EQ(result->times.size(), 3); } /** * Execution returns correct results for a single repetition */ -TEST_P(DifferentOpenCLKernelTest, FPGAErrorBelow1Percent) { - - 
auto result = bm_execution::calculate(config, data); - double errors = checkRandomAccessResults(data, array_size); - EXPECT_LT(errors, 0.01); +TEST_F(RandomAccessKernelTest, FPGAErrorBelow1Percent) { + auto result = bm->executeKernel(bm->getExecutionSettings(), *data); + bool success = bm->validateOutputAndPrintError(bm->getExecutionSettings(), *data, *result); + EXPECT_TRUE(success); } - - -#ifdef INTEL_FPGA -INSTANTIATE_TEST_CASE_P(Default, DifferentOpenCLKernelTest, - testing::Values("random_access_kernels_single_emulate.aocx") -); -#endif - -#ifdef XILINX_FPGA -INSTANTIATE_TEST_CASE_P(Default, DifferentOpenCLKernelTest, - testing::Values("random_access_kernels_single_emulate.xclbin") -); -#endif \ No newline at end of file diff --git a/RandomAccess/tests/test_program_settings.h b/RandomAccess/tests/test_program_settings.h new file mode 100644 index 00000000..fbae376d --- /dev/null +++ b/RandomAccess/tests/test_program_settings.h @@ -0,0 +1,27 @@ +/* +Copyright (c) 2020 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +/* Project's headers */ +#include "random_access_benchmark.hpp" + + +extern std::shared_ptr bm; diff --git a/scripts/evaluation/parse_raw_to_csv.py b/scripts/evaluation/parse_raw_to_csv.py index 1a7b59f8..75024fb1 100755 --- a/scripts/evaluation/parse_raw_to_csv.py +++ b/scripts/evaluation/parse_raw_to_csv.py @@ -11,7 +11,7 @@ # Regular expressions for the raw output of all fft_regex = "Version:\\s+(?P.+)\n(.*\n)+FFT\\sSize:\\s+(?P\d+)\nData\\sSize:\\s+(?P\d+)(.*\n)+Device:\\s+(?P.+)\n(.*\n)+\\s+res\.\\serror\\s+mach\.\\seps\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+avg\\s+best\n\\s+Time\\s+in\\s+s:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+GFLOPS:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" gemm_regex = "Version:\\s+(?P.+)\n(.*\n)+Total\\smatrix\\ssize:\\s+(?P\d+)(.*\n)+Device:\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+best\\s+mean\\s+GFLOPS\n\\s+(?P.+)\\s+(?P.+)\\s+(?P.+)" -ra_regex = "Version:\\s+(?P.+)\n(.*\n)+Kernel\\sReplications:\\s+(?P\d+)(.*\n)+Total\\sdata\\ssize:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+Device:\\s+(?P.+)\n(.*\n)+\\s+best\\s+mean\\s+GUOPS\\s+error\n\\s+(?P.+)\\s+(?P.+)\\s+(?P.+)\\s+(?P.+)" +ra_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+Error:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GUOPS\n\\s+(?P.+)\\s+(?P.+)\\s+(?P.+)" trans_regex = 
"Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize:\\s+(?P\d+)(.*\n)+Device:\\s+(?P.+)\n(.*\n)+\\s+trans\\s+calc\\s+calc\\s+FLOPS\\s+total\\s+FLOPS\n\\s*avg:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s*best:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s*Maximum\\serror:\\s+(?P(\d|\.|\+|-|e)+)" stream_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+\\d+\\s+\\((?P(\d|\.|\+|-|e)+)(.*\n)+Data\\sType\\s+(?P.+)\n(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Kernel\\sType\\s+(?P.+)\n(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+Function\\s+Best\\sRate\\sMB/s\\s+Avg\\stime\\ss\\s+Min\\stime\\s+Max\\stime\n\\s+Add\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Copy\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+PCI\\sread\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+PCI\\swrite\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Scale\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Triad\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" linpack_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize:\\s+(?P\d+)(.*\n)+Data\\sType:\\s+(?P.+)\n(.*\n)+Device:\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep.+\n\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GFLOPS(.*\n)+\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" From 85b62b49dc048e8b475c193e611a246d9eb9bf6e Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 18 May 2020 09:30:23 +0200 Subject: [PATCH 23/45] Update STREAM version --- STREAM/CHANGELOG | 5 +++++ STREAM/CMakeLists.txt | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/STREAM/CHANGELOG 
b/STREAM/CHANGELOG index 34008765..de7933f2 100644 --- a/STREAM/CHANGELOG +++ b/STREAM/CHANGELOG @@ -2,6 +2,11 @@ This file contains all changes made to the source code for each release. +## 2.1.5 + +#### Changed: +- Converted host code to new OO code + ## 2.1.4.1 #### Added: diff --git a/STREAM/CMakeLists.txt b/STREAM/CMakeLists.txt index 61609c0b..87998322 100755 --- a/STREAM/CMakeLists.txt +++ b/STREAM/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.1) -project(STREAM VERSION 2.1.4.1) +project(STREAM VERSION 2.1.5) # Additional benchmark specific build parameters set(DEFAULT_ARRAY_LENGTH 134217728 CACHE STRING "Default size of the data arrays") From 57ffe8c03dcab73aa9000ae4e1438f4e89fe4bee Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 18 May 2020 09:30:47 +0200 Subject: [PATCH 24/45] Remove old random access host files --- .../common_benchmark_io_implementation.cpp | 89 -------- RandomAccess/src/host/execution_ndrange.cpp | 191 ----------------- .../src/host/execution_single_rnd.cpp | 195 ------------------ RandomAccess/src/host/program_settings.h | 32 --- .../src/host/random_access_functionality.cpp | 167 --------------- .../src/host/random_access_functionality.hpp | 84 -------- 6 files changed, 758 deletions(-) delete mode 100644 RandomAccess/src/host/common_benchmark_io_implementation.cpp delete mode 100644 RandomAccess/src/host/execution_ndrange.cpp delete mode 100644 RandomAccess/src/host/execution_single_rnd.cpp delete mode 100644 RandomAccess/src/host/program_settings.h delete mode 100644 RandomAccess/src/host/random_access_functionality.cpp delete mode 100644 RandomAccess/src/host/random_access_functionality.hpp diff --git a/RandomAccess/src/host/common_benchmark_io_implementation.cpp b/RandomAccess/src/host/common_benchmark_io_implementation.cpp deleted file mode 100644 index 67e8aa65..00000000 --- a/RandomAccess/src/host/common_benchmark_io_implementation.cpp +++ /dev/null @@ -1,89 +0,0 @@ - -#include "cxxopts.hpp" -#include 
"parameters.h" -#include "setup/common_benchmark_io.hpp" - -/** -Parses and returns program options using the cxxopts library. -Supports the following parameters: - - file name of the FPGA kernel file (-f,--file) - - number of repetitions (-n) - - number of kernel replications (-r) - - data size (-d) - - use memory interleaving -@see https://github.com/jarro2783/cxxopts - -@return program settings that are created from the given program arguments -*/ -std::shared_ptr -parseProgramParameters(int argc, char *argv[]) { - // Defining and parsing program options - cxxopts::Options options(argv[0], PROGRAM_DESCRIPTION); - options.add_options() - ("f,file", "Kernel file name", cxxopts::value()) - ("n", "Number of repetitions", - cxxopts::value()->default_value(std::to_string(DEFAULT_REPETITIONS))) - ("r", "Number of used kernel replications", - cxxopts::value()->default_value(std::to_string(NUM_KERNEL_REPLICATIONS))) - ("d,data", "Size of the used data array (Should be half of the "\ - "available global memory)", - cxxopts::value() - ->default_value(std::to_string(DEFAULT_ARRAY_LENGTH))) - ("device", "Index of the device that has to be used. If not given you "\ - "will be asked which device to use if there are multiple devices "\ - "available.", cxxopts::value()->default_value(std::to_string(-1))) - ("platform", "Index of the platform that has to be used. If not given "\ - "you will be asked which platform to use if there are multiple "\ - "platforms available.", - cxxopts::value()->default_value(std::to_string(-1))) - ("h,help", "Print this help"); - cxxopts::ParseResult result = options.parse(argc, argv); - - if (result.count("h")) { - // Just print help when argument is given - std::cout << options.help() << std::endl; - exit(0); - } - // Check parsed options and handle special cases - if (result.count("f") <= 0) { - // Path to the kernel file is mandatory - exit if not given! - std::cerr << "Kernel file must be given! 
Aborting" << std::endl; - std::cout << options.help() << std::endl; - exit(1); - } - - // Create program settings from program arguments - std::shared_ptr sharedSettings( - new ProgramSettings {result["n"].as(), result["r"].as(), - result["platform"].as(), - result["device"].as(), - result["d"].as(), - result["f"].as()}); - return sharedSettings; -} - -/** - * Prints the used configuration to std out before starting the actual benchmark. - * - * @param programSettings The program settings retrieved from the command line - * @param device The device used for execution - */ -void printFinalConfiguration(const std::shared_ptr &programSettings, - const cl::Device &device) {// Give setup summary - std::cout << PROGRAM_DESCRIPTION << std::endl << HLINE; - std::cout << "Summary:" << std::endl - << "Kernel Replications: " << programSettings->numReplications - << std::endl - << "Repetitions: " << programSettings->numRepetitions - << std::endl - << "Total data size: " << (programSettings->dataSize - * sizeof(HOST_DATA_TYPE)) * 1.0 - << " Byte" << std::endl - << "Kernel file: " << programSettings->kernelFileName - << std::endl; - std::cout << "Device: " - << device.getInfo() << std::endl; - std::cout << HLINE - << "Start benchmark using the given configuration." 
<< std::endl - << HLINE; -} diff --git a/RandomAccess/src/host/execution_ndrange.cpp b/RandomAccess/src/host/execution_ndrange.cpp deleted file mode 100644 index 3b8b0d67..00000000 --- a/RandomAccess/src/host/execution_ndrange.cpp +++ /dev/null @@ -1,191 +0,0 @@ -/* -Copyright (c) 2019 Marius Meyer - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -/* Related header files */ -#include "src/host/execution.h" - -/* C++ standard library headers */ -#include -#include -#include -#include - -/* External library headers */ -#include "CL/cl.hpp" -#if QUARTUS_MAJOR_VERSION > 18 -#include "CL/cl_ext_intelfpga.h" -#endif - -/* Project's headers */ -#include "src/host/fpga_setup.h" -#include "src/host/random_access_functionality.h" - -namespace bm_execution { - - /* - Implementation for the ndrange kernel. 
- @copydoc bm_execution::calculate() - */ - std::shared_ptr - calculate(cl::Context context, cl::Device device, cl::Program program, - uint repetitions, uint replications, size_t dataSize, - bool useMemInterleaving) { - // int used to check for OpenCL errors - int err; - DATA_TYPE_UNSIGNED* random; - posix_memalign(reinterpret_cast(&random), 64, - sizeof(DATA_TYPE_UNSIGNED)*UPDATE_SPLIT); - - for (DATA_TYPE i=0; i < UPDATE_SPLIT; i++) { - random[i] = starts((4 * DATA_LENGTH) / UPDATE_SPLIT * i); - } - - std::vector compute_queue; - std::vector Buffer_data; - std::vector Buffer_random; - std::vector accesskernel; - std::vector data_sets; - - /* --- Prepare kernels --- */ - - for (int r=0; r < replications; r++) { - DATA_TYPE_UNSIGNED* data; - posix_memalign(reinterpret_cast(&data), 64, - sizeof(DATA_TYPE)*(dataSize / replications)); - data_sets.push_back(data); - - compute_queue.push_back(cl::CommandQueue(context, device)); - - // Select memory bank to place data replication - int channel = 0; - if (!useMemInterleaving) { - switch ((r % replications) + 1) { - case 1: channel = CL_CHANNEL_1_INTELFPGA; break; - case 2: channel = CL_CHANNEL_2_INTELFPGA; break; - case 3: channel = CL_CHANNEL_3_INTELFPGA; break; - case 4: channel = CL_CHANNEL_4_INTELFPGA; break; - case 5: channel = CL_CHANNEL_5_INTELFPGA; break; - case 6: channel = CL_CHANNEL_6_INTELFPGA; break; - case 7: channel = CL_CHANNEL_7_INTELFPGA; break; - } - } - - Buffer_data.push_back(cl::Buffer(context, channel | - CL_MEM_READ_WRITE, - sizeof(DATA_TYPE_UNSIGNED)*(dataSize / replications))); - Buffer_random.push_back(cl::Buffer(context, channel | - CL_MEM_WRITE_ONLY, - sizeof(DATA_TYPE_UNSIGNED) * UPDATE_SPLIT)); - accesskernel.push_back(cl::Kernel(program, - RANDOM_ACCESS_KERNEL, &err)); - ASSERT_CL(err); - - // prepare kernels - err = accesskernel[r].setArg(0, Buffer_data[r]); - ASSERT_CL(err); - err = accesskernel[r].setArg(1, Buffer_random[r]); - ASSERT_CL(err); - err = accesskernel[r].setArg(2, 
DATA_TYPE_UNSIGNED(dataSize)); - ASSERT_CL(err); - } - - /* --- Execute actual benchmark kernels --- */ - - double t; - std::vector executionTimes; - for (int i = 0; i < repetitions; i++) { - // prepare data and send them to device - for (DATA_TYPE_UNSIGNED r =0; r < replications; r++) { - for (DATA_TYPE_UNSIGNED j=0; - j < (dataSize / replications); j++) { - data_sets[r][j] = r*(dataSize / replications) + j; - } - } - for (int r=0; r < replications; r++) { - compute_queue[r].enqueueWriteBuffer(Buffer_data[r], CL_TRUE, 0, - sizeof(DATA_TYPE)*(dataSize / replications), data_sets[r]); - compute_queue[r].enqueueWriteBuffer(Buffer_random[r], CL_TRUE, - 0, sizeof(DATA_TYPE_UNSIGNED) * UPDATE_SPLIT, random); - } - - // Execute benchmark kernels - auto t1 = std::chrono::high_resolution_clock::now(); - for (int r=0; r < replications; r++) { - compute_queue[r].enqueueNDRangeKernel(accesskernel[r], - cl::NullRange, - cl::NDRange(UPDATE_SPLIT), - cl::NDRange(1), NULL, NULL); - } - for (int r=0; r < replications; r++) { - compute_queue[r].finish(); - } - auto t2 = std::chrono::high_resolution_clock::now(); - std::chrono::duration timespan = - std::chrono::duration_cast> - (t2 - t1); - executionTimes.push_back(timespan.count()); - } - - /* --- Read back results from Device --- */ - - for (int r=0; r < replications; r++) { - compute_queue[r].enqueueReadBuffer(Buffer_data[r], CL_TRUE, 0, - sizeof(DATA_TYPE)*(dataSize / replications), data_sets[r]); - } - DATA_TYPE_UNSIGNED* data; - posix_memalign(reinterpret_cast(&data), 64, - (sizeof(DATA_TYPE)*dataSize)); - for (size_t r =0; r < replications; r++) { - for (size_t j=0; j < (dataSize / replications); j++) { - data[r*(dataSize / replications) + j] = data_sets[r][j]; - } - free(reinterpret_cast(data_sets[r])); - } - - /* --- Check Results --- */ - - DATA_TYPE_UNSIGNED temp = 1; - for (DATA_TYPE_UNSIGNED i=0; i < 4L*dataSize; i++) { - DATA_TYPE v = 0; - if (((DATA_TYPE)temp) < 0) { - v = POLY; - } - temp = (temp << 1) ^ v; - 
data[temp & (dataSize - 1)] ^= temp; - } - - double errors = 0; - for (DATA_TYPE_UNSIGNED i=0; i< dataSize; i++) { - if (data[i] != i) { - errors++; - } - } - free(reinterpret_cast(data)); - free(reinterpret_cast(random)); - - std::shared_ptr results( - new ExecutionResults{executionTimes, - errors / dataSize}); - return results; - } - -} // namespace bm_execution diff --git a/RandomAccess/src/host/execution_single_rnd.cpp b/RandomAccess/src/host/execution_single_rnd.cpp deleted file mode 100644 index 7802bfa8..00000000 --- a/RandomAccess/src/host/execution_single_rnd.cpp +++ /dev/null @@ -1,195 +0,0 @@ -/* -Copyright (c) 2019 Marius Meyer - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
-*/ -#ifndef UPDATE_SPLIT -#define UPDATE_SPLIT 8 -#endif - -/* Related header files */ -#include "src/host/execution.h" - -/* C++ standard library headers */ -#include -#include -#include -#include - -/* External library headers */ -#include "CL/cl.hpp" -#if QUARTUS_MAJOR_VERSION > 18 -#include "CL/cl_ext_intelfpga.h" -#endif - -/* Project's headers */ -#include "src/host/fpga_setup.h" -#include "src/host/random_access_functionality.h" - -namespace bm_execution { - - /* - Implementation for the single_rnd kernel. - @copydoc bm_execution::calculate() - */ - std::shared_ptr - calculate(cl::Context context, cl::Device device, cl::Program program, - uint repetitions, uint replications, size_t dataSize, - bool useMemInterleaving) { - // int used to check for OpenCL errors - int err; - DATA_TYPE_UNSIGNED* random; - posix_memalign(reinterpret_cast(&random), 64, - sizeof(DATA_TYPE_UNSIGNED)*UPDATE_SPLIT); - - for (DATA_TYPE i=0; i < UPDATE_SPLIT; i++) { - random[i] = starts((4 * DATA_LENGTH) / UPDATE_SPLIT * i); - } - - std::vector compute_queue; - std::vector Buffer_data; - std::vector Buffer_random; - std::vector accesskernel; - std::vector data_sets; - - /* --- Prepare kernels --- */ - - for (int r=0; r < replications; r++) { - DATA_TYPE_UNSIGNED* data; - posix_memalign(reinterpret_cast(&data), 64, - sizeof(DATA_TYPE)*(dataSize / replications)); - data_sets.push_back(data); - - compute_queue.push_back(cl::CommandQueue(context, device)); - - // Select memory bank to place data replication - int channel = 0; - if (!useMemInterleaving) { - switch ((r % replications) + 1) { - case 1: channel = CL_CHANNEL_1_INTELFPGA; break; - case 2: channel = CL_CHANNEL_2_INTELFPGA; break; - case 3: channel = CL_CHANNEL_3_INTELFPGA; break; - case 4: channel = CL_CHANNEL_4_INTELFPGA; break; - case 5: channel = CL_CHANNEL_5_INTELFPGA; break; - case 6: channel = CL_CHANNEL_6_INTELFPGA; break; - case 7: channel = CL_CHANNEL_7_INTELFPGA; break; - } - } - - 
Buffer_data.push_back(cl::Buffer(context, channel | - CL_MEM_READ_WRITE, - sizeof(DATA_TYPE_UNSIGNED)*(dataSize / replications))); - Buffer_random.push_back(cl::Buffer(context, channel | - CL_MEM_WRITE_ONLY, - sizeof(DATA_TYPE_UNSIGNED) * UPDATE_SPLIT)); - accesskernel.push_back(cl::Kernel(program, - (RANDOM_ACCESS_KERNEL + std::to_string(r)).c_str() , - &err)); - ASSERT_CL(err); - - // prepare kernels - err = accesskernel[r].setArg(0, Buffer_data[r]); - ASSERT_CL(err); - err = accesskernel[r].setArg(1, Buffer_random[r]); - ASSERT_CL(err); - err = accesskernel[r].setArg(2, DATA_TYPE_UNSIGNED(dataSize)); - ASSERT_CL(err); - err = accesskernel[r].setArg(3, - DATA_TYPE_UNSIGNED(dataSize / replications)); - ASSERT_CL(err); - } - - /* --- Execute actual benchmark kernels --- */ - - double t; - std::vector executionTimes; - for (int i = 0; i < repetitions; i++) { - // prepare data and send them to device - for (DATA_TYPE_UNSIGNED r =0; r < replications; r++) { - for (DATA_TYPE_UNSIGNED j=0; - j < (dataSize / replications); j++) { - data_sets[r][j] = r*(dataSize / replications) + j; - } - } - for (int r=0; r < replications; r++) { - compute_queue[r].enqueueWriteBuffer(Buffer_data[r], CL_TRUE, 0, - sizeof(DATA_TYPE)*(dataSize / replications), data_sets[r]); - compute_queue[r].enqueueWriteBuffer(Buffer_random[r], CL_TRUE, - 0, sizeof(DATA_TYPE_UNSIGNED) * UPDATE_SPLIT, random); - } - - // Execute benchmark kernels - auto t1 = std::chrono::high_resolution_clock::now(); - for (int r=0; r < replications; r++) { - compute_queue[r].enqueueTask(accesskernel[r]); - } - for (int r=0; r < replications; r++) { - compute_queue[r].finish(); - } - auto t2 = std::chrono::high_resolution_clock::now(); - std::chrono::duration timespan = - std::chrono::duration_cast> - (t2 - t1); - executionTimes.push_back(timespan.count()); - } - - /* --- Read back results from Device --- */ - - for (int r=0; r < replications; r++) { - compute_queue[r].enqueueReadBuffer(Buffer_data[r], CL_TRUE, 0, - 
sizeof(DATA_TYPE)*(dataSize / replications), data_sets[r]); - } - DATA_TYPE_UNSIGNED* data; - posix_memalign(reinterpret_cast(&data), 64, - (sizeof(DATA_TYPE)*dataSize)); - for (size_t r =0; r < replications; r++) { - for (size_t j=0; j < (dataSize / replications); j++) { - data[r*(dataSize / replications) + j] = data_sets[r][j]; - } - free(reinterpret_cast(data_sets[r])); - } - - /* --- Check Results --- */ - - DATA_TYPE_UNSIGNED temp = 1; - for (DATA_TYPE_UNSIGNED i=0; i < 4L*dataSize; i++) { - DATA_TYPE v = 0; - if (((DATA_TYPE)temp) < 0) { - v = POLY; - } - temp = (temp << 1) ^ v; - data[temp & (dataSize - 1)] ^= temp; - } - - double errors = 0; - for (DATA_TYPE_UNSIGNED i=0; i< dataSize; i++) { - if (data[i] != i) { - errors++; - } - } - free(reinterpret_cast(data)); - free(reinterpret_cast(random)); - - std::shared_ptr results( - new ExecutionResults{executionTimes, - errors / dataSize}); - return results; - } - -} // namespace bm_execution diff --git a/RandomAccess/src/host/program_settings.h b/RandomAccess/src/host/program_settings.h deleted file mode 100644 index 9c33fa92..00000000 --- a/RandomAccess/src/host/program_settings.h +++ /dev/null @@ -1,32 +0,0 @@ - -#ifndef SRC_HOST_PROGRAM_SETTINGS_H_ -#define SRC_HOST_PROGRAM_SETTINGS_H_ - -#include "parameters.h" - -/* C++ standard library headers */ -#include - -#include "CL/opencl.h" - -/* -Short description of the program. -Moreover the version and build time is also compiled into the description. 
-*/ - -#define PROGRAM_DESCRIPTION "Implementation of the random access benchmark"\ - " proposed in the HPCC benchmark suite for FPGA.\n"\ - "Version: " VERSION "\n" - - -struct ProgramSettings { - uint numRepetitions; - uint numReplications; - int defaultPlatform; - int defaultDevice; - size_t dataSize; - std::string kernelFileName; -}; - - -#endif diff --git a/RandomAccess/src/host/random_access_functionality.cpp b/RandomAccess/src/host/random_access_functionality.cpp deleted file mode 100644 index cb1fb82f..00000000 --- a/RandomAccess/src/host/random_access_functionality.cpp +++ /dev/null @@ -1,167 +0,0 @@ -/* -Copyright (c) 2019 Marius Meyer - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
-*/ - -/* Related header files */ -#include "random_access_functionality.hpp" - -/* C++ standard library headers */ -#include -#include -#include -#include -#include -#include - -/* External library headers */ -#include "CL/cl.hpp" -#include "cxxopts.hpp" - -/* Project's headers */ -#include "setup/fpga_setup.hpp" -#include "setup/common_benchmark_io.hpp" -#include "execution.h" - -/** -Print the benchmark Results - -@param results The result struct provided by the calculation call -@param dataSize The size of the used data array - -*/ -void -printResults(std::shared_ptr results, - size_t dataSize, double error) { - std::cout << std::setw(ENTRY_SPACE) - << "best" << std::setw(ENTRY_SPACE) << "mean" - << std::setw(ENTRY_SPACE) << "GUOPS" - << std::setw(ENTRY_SPACE) << "error" << std::endl; - - // Calculate performance for kernel execution plus data transfer - double tmean = 0; - double tmin = std::numeric_limits::max(); - double gups = static_cast(4 * dataSize) / 1000000000; - for (double currentTime : results->times) { - tmean += currentTime; - if (currentTime < tmin) { - tmin = currentTime; - } - } - tmean = tmean / results->times.size(); - - std::cout << std::setw(ENTRY_SPACE) - << tmin << std::setw(ENTRY_SPACE) << tmean - << std::setw(ENTRY_SPACE) << gups / tmin - << std::setw(ENTRY_SPACE) << (100.0 * error) - << std::endl; -} - -/** - Generates the value of the random number after a desired number of updates - - @param n number of random number updates - - @return The random number after n number of updates - */ -HOST_DATA_TYPE -starts(HOST_DATA_TYPE_SIGNED n) { - HOST_DATA_TYPE m2[BIT_SIZE]; - - while (n < 0) { - n += PERIOD; - } - while (n > 0) { - n -= PERIOD; - } - - if (n == 0) { - return 1; - } - - HOST_DATA_TYPE temp = 1; - for (int i=0; i < BIT_SIZE; i++) { - m2[i] = temp; - for (int j=0; j < 2; j++) { - HOST_DATA_TYPE_SIGNED v = 0; - if (((HOST_DATA_TYPE_SIGNED) temp) < 0) { - v = POLY; - } - temp = (temp << 1) ^ v; - } - } - // DATA_TYPE i = BIT_SIZE 
- 2; - // while (i >= 0 && !((n >> i) & 1)) { - // i--; - // } - int i = 0; - for (i=BIT_SIZE - 2; i >= 0; i--) { - if ((n >> i) & 1) { - break; - } - } - - HOST_DATA_TYPE ran = 2; - while (i > 0) { - temp = 0; - for (int j=0; j < BIT_SIZE; j++) { - if ((ran >> j) & 1) { - temp ^= m2[j]; - } - } - ran = temp; - i--; - if ((n >> i) & 1) { - HOST_DATA_TYPE_SIGNED v = 0; - if (((HOST_DATA_TYPE_SIGNED) ran) < 0) { - v = POLY; - } - ran = (ran << 1) ^v; - } - } - return ran; -} - -double checkRandomAccessResults(HOST_DATA_TYPE* result_array, size_t array_size){ - HOST_DATA_TYPE temp = 1; - for (HOST_DATA_TYPE i=0; i < 4L*array_size; i++) { - HOST_DATA_TYPE_SIGNED v = 0; - if (((HOST_DATA_TYPE_SIGNED)temp) < 0) { - v = POLY; - } - temp = (temp << 1) ^ v; - result_array[(temp >> 3) & (array_size - 1)] ^= temp; - } - - double errors = 0; -#pragma omp parallel for reduction(+:errors) - for (HOST_DATA_TYPE i=0; i< array_size; i++) { - if (result_array[i] != i) { - errors++; - } - } - return errors / array_size; -} - -void generateInputData(HOST_DATA_TYPE* data, size_t dataSize) { - for (HOST_DATA_TYPE j=0; j < dataSize ; j++) { - data[j] = j; - } -} diff --git a/RandomAccess/src/host/random_access_functionality.hpp b/RandomAccess/src/host/random_access_functionality.hpp deleted file mode 100644 index 31619a30..00000000 --- a/RandomAccess/src/host/random_access_functionality.hpp +++ /dev/null @@ -1,84 +0,0 @@ -/* -Copyright (c) 2019 Marius Meyer - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or 
substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -#ifndef SRC_HOST_RANDOM_ACCESS_FUNCTIONALITY_H_ -#define SRC_HOST_RANDOM_ACCESS_FUNCTIONALITY_H_ - -/* C++ standard library headers */ -#include - -/* Project's headers */ -#include "execution.h" -#include "setup/fpga_setup.hpp" -#include "parameters.h" - -/** -Prefix of the function name of the used kernel. -It will be used to construct the full function name for the case of replications. -The full name will be -*/ -#define RANDOM_ACCESS_KERNEL "accessMemory_" - -/** -Constants used to verify benchmark results -*/ -#define POLY 7 -#define PERIOD 1317624576693539401L - -#define BIT_SIZE (sizeof(HOST_DATA_TYPE) * 8) - -/** - Generates the value of the random number after a desired number of updates - - @param n number of random number updates - - @return The random number after n number of updates - */ -HOST_DATA_TYPE -starts(HOST_DATA_TYPE_SIGNED n); - -/** -Prints the execution results to stdout - -@param results The execution results -@param dataSize Size of the used data array. Needed to calculate GUOP/s from - timings -*/ -void printResults(std::shared_ptr results, - size_t dataSize, double error); - - -/** - * Checks the correctness of the updates by recalculating all updated addresses and apply the same update again. - * Since the update is a xor, the original values of the array can be recalculated with this method. - * If a value differs from the original value it means there was an error during the benchmark execution. 
- * 1% errors in the array are allowed. - * - * @param result_array The data array containing the data from the benchmark execution. It will be modified for the evaluation! - * @param array_size The size of the data array - */ -double checkRandomAccessResults(HOST_DATA_TYPE* result_array, size_t array_size); - - -void generateInputData(HOST_DATA_TYPE* data, size_t dataSize); - -#endif // SRC_HOST_RANDOM_ACCESS_FUNCTIONALITY_H_ From 180073a78b4bf493911ce39ed37c918a2aecfabc Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 18 May 2020 09:30:59 +0200 Subject: [PATCH 25/45] Update random access version --- RandomAccess/CHANGELOG | 5 +++++ RandomAccess/CMakeLists.txt | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/RandomAccess/CHANGELOG b/RandomAccess/CHANGELOG index 9ee25fef..5891758a 100644 --- a/RandomAccess/CHANGELOG +++ b/RandomAccess/CHANGELOG @@ -2,6 +2,11 @@ This file contains all changes made to the source code for each release. +## 2.0.3 + +#### Changed: +- Converted host code to new OO code + ## 2.0.2.1 ##### Added: diff --git a/RandomAccess/CMakeLists.txt b/RandomAccess/CMakeLists.txt index 4180da75..0569be21 100755 --- a/RandomAccess/CMakeLists.txt +++ b/RandomAccess/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.1) -project(RandomAccess VERSION 2.0.2.1) +project(RandomAccess VERSION 2.0.3) # Additional benchmark specific build parameters set(DEFAULT_ARRAY_LENGTH 536870912 CACHE STRING "Default size of the data arrays") From 4020ac6ee53719612cca68510c8513fd9c84c06d Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 18 May 2020 18:59:39 +0200 Subject: [PATCH 26/45] Remove settings from base implementation calls --- shared/include/hpcc_benchmark.hpp | 93 +++++++++++++------------------ 1 file changed, 39 insertions(+), 54 deletions(-) diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 31339ba0..c93b7482 100644 --- a/shared/include/hpcc_benchmark.hpp +++ 
b/shared/include/hpcc_benchmark.hpp @@ -22,6 +22,8 @@ SOFTWARE. #ifndef SHARED_HPCC_BENCHMARK_HPP_ #define SHARED_HPCC_BENCHMARK_HPP_ +#include + /* Project's headers */ #include "setup/fpga_setup.hpp" #include "cxxopts.hpp" @@ -119,25 +121,25 @@ class ExecutionSettings { * @brief The OpenCL device that should be used for execution * */ - cl::Device device; + std::unique_ptr device; /** * @brief The OpenCL context that should be used for execution * */ - cl::Context context; + std::unique_ptr context; /** * @brief The OpenCL program that contains the benchmark kernel * */ - cl::Program program; + std::unique_ptr program; /** * @brief Pointer to the additional program settings * */ - std::shared_ptr programSettings; + std::unique_ptr programSettings; /** * @brief Construct a new Execution Settings object @@ -147,15 +149,10 @@ class ExecutionSettings { * @param context_ Used OpenCL context * @param program_ Used OpenCL program */ - ExecutionSettings(const std::shared_ptr programSettings_, cl::Device device_,cl::Context context_,cl::Program program_): - programSettings(programSettings_), device(device_), context(context_), program(program_) {} - - /** - * @brief Construct a new Execution Settings object from an Execution Settings object - * - * @param s The object to copy - */ - ExecutionSettings(ExecutionSettings *s) : ExecutionSettings(s->programSettings, s->device, s->context, s->program) {} + ExecutionSettings(std::unique_ptr programSettings_, std::unique_ptr device_, + std::unique_ptr context_, std::unique_ptr program_): + programSettings(std::move(programSettings_)), device(std::move(device_)), + context(std::move(context_)), program(std::move(program_)) {} }; @@ -169,21 +166,14 @@ class ExecutionSettings { template class HpccFpgaBenchmark { -private: - /** - * @brief Is set by setupBenchmark() to make sure the benchmark is not - * executed before the setup is run - * - */ - bool isSetupExecuted = false; +protected: /** - * @brief The used execution settings 
that will be generated by setupBenchmark() + * @brief The used execution settings that will be generated by setupBenchmark(). + * It should be laos used by all other methods to read the current benchmark settings. * */ - std::shared_ptr> executionSettings; - -protected: + std::unique_ptr> executionSettings; /** * @brief Add additional options to the program parameter parser @@ -198,42 +188,37 @@ class HpccFpgaBenchmark { /** * @brief Allocate and initiate the input data for the kernel * - * @param settings The used execution settings - * @return std::shared_ptr A data class containing the initialized data + * @return std::unique_ptr A data class containing the initialized data */ - virtual std::shared_ptr - generateInputData(const ExecutionSettings &settings) = 0; + virtual std::unique_ptr + generateInputData() = 0; /** * @brief Execute the benchmark kernel and measure performance * - * @param settings The used execution settings * @param data The initialized data for the kernel. It will be replaced by the kernel output for validation - * @return std::shared_ptr A data class containing the measurement results of the execution + * @return std::unique_ptr A data class containing the measurement results of the execution */ - virtual std::shared_ptr - executeKernel(const ExecutionSettings &settings, TData &data) = 0; + virtual std::unique_ptr + executeKernel(TData &data) = 0; /** * @brief Validate the output of the execution * - * @param settings The used execution settings * @param data The output data after kernel execution - * @param output The measurement data of the kernel execution * @return true If the validation is a success. 
* @return false If the validation failed */ virtual bool - validateOutputAndPrintError(const ExecutionSettings &settings ,TData &data, const TOutput &output) = 0; + validateOutputAndPrintError(TData &data) = 0; /** * @brief Prints the measurement results of the benchmark to std::cout * - * @param settings The used execution settings * @param output The measurement data of the kernel execution */ virtual void - printResults(const ExecutionSettings &settings, const TOutput &output) = 0; + printResults(const TOutput &output) = 0; /** * Parses and returns program options using the cxxopts library. @@ -249,7 +234,7 @@ class HpccFpgaBenchmark { * * @return program settings that are created from the given program arguments */ - std::shared_ptr + std::unique_ptr parseProgramParameters(int argc, char *argv[]) { // Defining and parsing program options cxxopts::Options options(argv[0], PROGRAM_DESCRIPTION); @@ -285,7 +270,7 @@ class HpccFpgaBenchmark { } // Create program settings from program arguments - std::shared_ptr sharedSettings( + std::unique_ptr sharedSettings( new TSettings(result)); return sharedSettings; } @@ -313,18 +298,18 @@ class HpccFpgaBenchmark { */ void setupBenchmark(int argc, char *argv[]) { - std::shared_ptr programSettings = parseProgramParameters(argc, argv); - cl::Device usedDevice = - cl::Device(fpga_setup::selectFPGADevice(programSettings->defaultPlatform, - programSettings->defaultDevice)); - cl::Context context = cl::Context(usedDevice); - cl::Program program = fpga_setup::fpgaSetup(&context, {usedDevice}, - &programSettings->kernelFileName); + std::unique_ptr programSettings = parseProgramParameters(argc, argv); + auto usedDevice = + std::unique_ptr(new cl::Device(fpga_setup::selectFPGADevice(programSettings->defaultPlatform, + programSettings->defaultDevice))); + auto context = std::unique_ptr(new cl::Context(*usedDevice)); + auto program = std::unique_ptr(new cl::Program(fpga_setup::fpgaSetup(context.get(), {*usedDevice}, + 
&programSettings->kernelFileName))); - executionSettings = std::make_shared>(new ExecutionSettings(programSettings, usedDevice, context, program)); + executionSettings = std::unique_ptr>(new ExecutionSettings(std::move(programSettings), std::move(usedDevice), + std::move(context), std::move(program))); printFinalConfiguration(*executionSettings); - isSetupExecuted = true; } /** @@ -336,23 +321,23 @@ class HpccFpgaBenchmark { */ bool executeBenchmark() { - if (!isSetupExecuted) { + if (!executionSettings.get()) { std::cerr << "Benchmark execution started without running the benchmark setup!" << std::endl; exit(1); } std::cout << HLINE << "Start benchmark using the given configuration. Generating data..." << std::endl << HLINE; - std::shared_ptr data = generateInputData(*executionSettings); + std::unique_ptr data = generateInputData(); std::cout << HLINE << "Execute benchmark kernel..." << std::endl << HLINE; - std::shared_ptr output = executeKernel(*executionSettings, *data); + std::unique_ptr output = executeKernel(*data); std::cout << HLINE << "Validate output..." 
<< std::endl << HLINE; - bool validateSuccess = validateOutputAndPrintError(*executionSettings , *data, *output); + bool validateSuccess = validateOutputAndPrintError(*data); - printResults(*executionSettings, *output); + printResults(*output); return validateSuccess; } @@ -384,7 +369,7 @@ template std::ostream& operator<<(std::ostream& os, ExecutionSettings const& printedExecutionSettings){ std::string device_name; os << std::left; - printedExecutionSettings.device.getInfo(CL_DEVICE_NAME, &device_name); + printedExecutionSettings.device->getInfo(CL_DEVICE_NAME, &device_name); for (auto k : printedExecutionSettings.programSettings->getSettingsMap()) { os << std::setw(2 * ENTRY_SPACE) << k.first << k.second << std::endl; } From e291d5ab564ace9bf1d656a8fb9e7a90cf3972ef Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 18 May 2020 19:00:09 +0200 Subject: [PATCH 27/45] Update STREAM to new base implementation --- STREAM/src/host/execution.hpp | 4 +- STREAM/src/host/execution_default.cpp | 64 +++++++++--------- STREAM/src/host/stream_benchmark.cpp | 66 +++++++------------ STREAM/src/host/stream_benchmark.hpp | 62 ++++++++++++----- STREAM/tests/CMakeLists.txt | 2 +- STREAM/tests/main.cpp | 10 +-- ...nel_functionality_and_host_integration.cpp | 7 +- STREAM/tests/test_program_settings.h | 3 +- 8 files changed, 114 insertions(+), 104 deletions(-) diff --git a/STREAM/src/host/execution.hpp b/STREAM/src/host/execution.hpp index a46fa687..b2935d37 100644 --- a/STREAM/src/host/execution.hpp +++ b/STREAM/src/host/execution.hpp @@ -62,8 +62,8 @@ simple exchange of the different calculation methods. 
@return The resulting matrix */ - std::shared_ptr - calculate(const hpcc_base::ExecutionSettings config, + std::unique_ptr + calculate(const hpcc_base::ExecutionSettings& config, HOST_DATA_TYPE* A, HOST_DATA_TYPE* B, HOST_DATA_TYPE* C); diff --git a/STREAM/src/host/execution_default.cpp b/STREAM/src/host/execution_default.cpp index 8228b56e..fb6a6c44 100644 --- a/STREAM/src/host/execution_default.cpp +++ b/STREAM/src/host/execution_default.cpp @@ -68,8 +68,8 @@ namespace bm_execution { Implementation for the single kernel. @copydoc bm_execution::calculate() */ - std::shared_ptr - calculate(const hpcc_base::ExecutionSettings config, + std::unique_ptr + calculate(const hpcc_base::ExecutionSettings& config, HOST_DATA_TYPE* A, HOST_DATA_TYPE* B, HOST_DATA_TYPE* C) { @@ -328,7 +328,7 @@ namespace bm_execution { } } - std::shared_ptr result(new stream::StreamExecutionTimings{ + std::unique_ptr result(new stream::StreamExecutionTimings{ timingMap, config.programSettings->streamArraySize }); @@ -346,15 +346,15 @@ namespace bm_execution { int err; for (int i=0; i < config.programSettings->kernelReplications; i++) { // create the kernels - cl::Kernel testkernel(config.program, ("scale_" + std::to_string(i)).c_str(), &err); + cl::Kernel testkernel(*config.program, ("scale_" + std::to_string(i)).c_str(), &err); ASSERT_CL(err); - cl::Kernel copykernel(config.program, ("copy_" + std::to_string(i)).c_str(), &err); + cl::Kernel copykernel(*config.program, ("copy_" + std::to_string(i)).c_str(), &err); ASSERT_CL(err); - cl::Kernel scalekernel(config.program, ("scale_" + std::to_string(i)).c_str(), &err); + cl::Kernel scalekernel(*config.program, ("scale_" + std::to_string(i)).c_str(), &err); ASSERT_CL(err); - cl::Kernel addkernel(config.program, ("add_" + std::to_string(i)).c_str(), &err); + cl::Kernel addkernel(*config.program, ("add_" + std::to_string(i)).c_str(), &err); ASSERT_CL(err); - cl::Kernel triadkernel(config.program, ("triad_" + std::to_string(i)).c_str(), &err); + 
cl::Kernel triadkernel(*config.program, ("triad_" + std::to_string(i)).c_str(), &err); ASSERT_CL(err); HOST_DATA_TYPE scalar = 3.0; @@ -405,7 +405,7 @@ namespace bm_execution { err = triadkernel.setArg(4, data_per_kernel); ASSERT_CL(err); - command_queues.push_back(cl::CommandQueue(config.context)); + command_queues.push_back(cl::CommandQueue(*config.context)); test_kernels.push_back(testkernel); copy_kernels.push_back(copykernel); scale_kernels.push_back(scalekernel); @@ -429,28 +429,28 @@ namespace bm_execution { for (int i=0; i < config.programSettings->kernelReplications; i++) { #ifdef INTEL_FPGA // create the kernels - cl::Kernel testkernel(config.program, ("calc_" + std::to_string(i)).c_str(), &err); + cl::Kernel testkernel(*config.program, ("calc_" + std::to_string(i)).c_str(), &err); ASSERT_CL(err); - cl::Kernel copykernel(config.program, ("calc_" + std::to_string(i)).c_str(), &err); + cl::Kernel copykernel(*config.program, ("calc_" + std::to_string(i)).c_str(), &err); ASSERT_CL(err); - cl::Kernel scalekernel(config.program, ("calc_" + std::to_string(i)).c_str(), &err); + cl::Kernel scalekernel(*config.program, ("calc_" + std::to_string(i)).c_str(), &err); ASSERT_CL(err); - cl::Kernel addkernel(config.program, ("calc_" + std::to_string(i)).c_str(), &err); + cl::Kernel addkernel(*config.program, ("calc_" + std::to_string(i)).c_str(), &err); ASSERT_CL(err); - cl::Kernel triadkernel(config.program, ("calc_" + std::to_string(i)).c_str(), &err); + cl::Kernel triadkernel(*config.program, ("calc_" + std::to_string(i)).c_str(), &err); ASSERT_CL(err); #endif #ifdef XILINX_FPGA // create the kernels - cl::Kernel testkernel(config.program, ("calc_0:{calc_0_" + std::to_string(i+1) + "}").c_str(), &err); + cl::Kernel testkernel(*config.program, ("calc_0:{calc_0_" + std::to_string(i+1) + "}").c_str(), &err); ASSERT_CL(err); - cl::Kernel copykernel(config.program, ("calc_0:{calc_0_" + std::to_string(i+1) + "}").c_str(), &err); + cl::Kernel copykernel(*config.program, 
("calc_0:{calc_0_" + std::to_string(i+1) + "}").c_str(), &err); ASSERT_CL(err); - cl::Kernel scalekernel(config.program, ("calc_0:{calc_0_" + std::to_string(i+1) + "}").c_str(), &err); + cl::Kernel scalekernel(*config.program, ("calc_0:{calc_0_" + std::to_string(i+1) + "}").c_str(), &err); ASSERT_CL(err); - cl::Kernel addkernel(config.program, ("calc_0:{calc_0_" + std::to_string(i+1) + "}").c_str(), &err); + cl::Kernel addkernel(*config.program, ("calc_0:{calc_0_" + std::to_string(i+1) + "}").c_str(), &err); ASSERT_CL(err); - cl::Kernel triadkernel(config.program, ("calc_0:{calc_0_" + std::to_string(i+1) + "}").c_str(), &err); + cl::Kernel triadkernel(*config.program, ("calc_0:{calc_0_" + std::to_string(i+1) + "}").c_str(), &err); ASSERT_CL(err); #endif HOST_DATA_TYPE scalar = 3.0; @@ -582,7 +582,7 @@ namespace bm_execution { err = triadkernel.setArg(5, TRIAD_KERNEL_TYPE); ASSERT_CL(err); - command_queues.push_back(cl::CommandQueue(config.context)); + command_queues.push_back(cl::CommandQueue(*config.context)); test_kernels.push_back(testkernel); copy_kernels.push_back(copykernel); scale_kernels.push_back(scalekernel); @@ -600,30 +600,30 @@ namespace bm_execution { #ifdef INTEL_FPGA if (config.programSettings->useSingleKernel) { //Create Buffers for input and output - Buffers_A.push_back(cl::Buffer(config.context, CL_MEM_READ_WRITE | ((i + 1) << 16), sizeof(HOST_DATA_TYPE)*data_per_kernel)); - Buffers_B.push_back(cl::Buffer(config.context, CL_MEM_READ_WRITE | ((i + 1) << 16), sizeof(HOST_DATA_TYPE)*data_per_kernel)); - Buffers_C.push_back(cl::Buffer(config.context, CL_MEM_READ_WRITE | ((i + 1) << 16), sizeof(HOST_DATA_TYPE)*data_per_kernel)); + Buffers_A.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE | ((i + 1) << 16), sizeof(HOST_DATA_TYPE)*data_per_kernel)); + Buffers_B.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE | ((i + 1) << 16), sizeof(HOST_DATA_TYPE)*data_per_kernel)); + Buffers_C.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE | 
((i + 1) << 16), sizeof(HOST_DATA_TYPE)*data_per_kernel)); } else { //Create Buffers for input and output - Buffers_A.push_back(cl::Buffer(config.context, CL_MEM_READ_WRITE | CL_CHANNEL_1_INTELFPGA, sizeof(HOST_DATA_TYPE)*data_per_kernel)); - Buffers_B.push_back(cl::Buffer(config.context, CL_MEM_READ_WRITE | CL_CHANNEL_3_INTELFPGA, sizeof(HOST_DATA_TYPE)*data_per_kernel)); - Buffers_C.push_back(cl::Buffer(config.context, CL_MEM_READ_WRITE | CL_CHANNEL_2_INTELFPGA, sizeof(HOST_DATA_TYPE)*data_per_kernel)); + Buffers_A.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE | CL_CHANNEL_1_INTELFPGA, sizeof(HOST_DATA_TYPE)*data_per_kernel)); + Buffers_B.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE | CL_CHANNEL_3_INTELFPGA, sizeof(HOST_DATA_TYPE)*data_per_kernel)); + Buffers_C.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE | CL_CHANNEL_2_INTELFPGA, sizeof(HOST_DATA_TYPE)*data_per_kernel)); } #endif #ifdef XILINX_FPGA - Buffers_A.push_back(cl::Buffer(config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE)*data_per_kernel)); - Buffers_B.push_back(cl::Buffer(config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE)*data_per_kernel)); - Buffers_C.push_back(cl::Buffer(config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE)*data_per_kernel)); + Buffers_A.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE)*data_per_kernel)); + Buffers_B.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE)*data_per_kernel)); + Buffers_C.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE)*data_per_kernel)); #endif } } else { for (int i=0; i < config.programSettings->kernelReplications; i++) { //Create Buffers for input and output - Buffers_A.push_back(cl::Buffer(config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE)*data_per_kernel)); - Buffers_B.push_back(cl::Buffer(config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE)*data_per_kernel)); - 
Buffers_C.push_back(cl::Buffer(config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE)*data_per_kernel)); + Buffers_A.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE)*data_per_kernel)); + Buffers_B.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE)*data_per_kernel)); + Buffers_C.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE)*data_per_kernel)); } } } diff --git a/STREAM/src/host/stream_benchmark.cpp b/STREAM/src/host/stream_benchmark.cpp index b73d58de..e3448f51 100644 --- a/STREAM/src/host/stream_benchmark.cpp +++ b/STREAM/src/host/stream_benchmark.cpp @@ -69,9 +69,9 @@ stream::StreamBenchmark::addAdditionalParseOptions(cxxopts::Options &options) { ("single-kernel", "Use the single kernel implementation"); } -std::shared_ptr -stream::StreamBenchmark::executeKernel(const hpcc_base::ExecutionSettings &settings, StreamData &data) { - return bm_execution::calculate(settings, +std::unique_ptr +stream::StreamBenchmark::executeKernel(StreamData &data) { + return bm_execution::calculate(*executionSettings, data.A, data.B, data.C); @@ -83,7 +83,7 @@ Prints the execution results to stdout @param results The execution results */ void -stream::StreamBenchmark::printResults(const hpcc_base::ExecutionSettings &settings, const stream::StreamExecutionTimings &output) { +stream::StreamBenchmark::printResults(const stream::StreamExecutionTimings &output) { std::cout << std::setw(ENTRY_SPACE) << "Function"; std::cout << std::setw(ENTRY_SPACE) << "Best Rate MB/s"; @@ -107,42 +107,20 @@ stream::StreamBenchmark::printResults(const hpcc_base::ExecutionSettings -stream::StreamBenchmark::generateInputData(const hpcc_base::ExecutionSettings &settings) { - HOST_DATA_TYPE *A, *B, *C; -#ifdef INTEL_FPGA -#ifdef USE_SVM - A = reinterpret_cast( - clSVMAlloc(context(), 0 , - settings.programSettings->streamArraySize * sizeof(HOST_DATA_TYPE), 1024)); - B = reinterpret_cast( - clSVMAlloc(context(), 0 , - 
settings.programSettings->streamArraySize * sizeof(HOST_DATA_TYPE), 1024)); - C = reinterpret_cast( - clSVMAlloc(context(), 0 , - settings.programSettings->streamArraySize * sizeof(HOST_DATA_TYPE), 1024)); -#else - posix_memalign(reinterpret_cast(&A), 64, settings.programSettings->streamArraySize * sizeof(HOST_DATA_TYPE)); - posix_memalign(reinterpret_cast(&B), 64, settings.programSettings->streamArraySize * sizeof(HOST_DATA_TYPE)); - posix_memalign(reinterpret_cast(&C), 64, settings.programSettings->streamArraySize * sizeof(HOST_DATA_TYPE)); -#endif -#endif -#ifdef XILINX_FPGA - posix_memalign(reinterpret_cast(&A), 4096, settings.programSettings->streamArraySize * sizeof(HOST_DATA_TYPE)); - posix_memalign(reinterpret_cast(&B), 4096, settings.programSettings->streamArraySize * sizeof(HOST_DATA_TYPE)); - posix_memalign(reinterpret_cast(&C), 4096, settings.programSettings->streamArraySize * sizeof(HOST_DATA_TYPE)); -#endif - for (int i=0; i< settings.programSettings->streamArraySize; i++) { - A[i] = 1.0; - B[i] = 2.0; - C[i] = 0.0; +std::unique_ptr +stream::StreamBenchmark::generateInputData() { + auto d = std::unique_ptr(new StreamData(*executionSettings->context, executionSettings->programSettings->streamArraySize)); + for (int i=0; i< executionSettings->programSettings->streamArraySize; i++) { + d->A[i] = 1.0; + d->B[i] = 2.0; + d->C[i] = 0.0; } - return std::make_shared(new stream::StreamData{A, B, C}); + return d; } bool -stream::StreamBenchmark::validateOutputAndPrintError(const hpcc_base::ExecutionSettings &settings ,stream::StreamData &data, const stream::StreamExecutionTimings &output) { +stream::StreamBenchmark::validateOutputAndPrintError(stream::StreamData &data) { HOST_DATA_TYPE aj,bj,cj,scalar; HOST_DATA_TYPE aSumErr,bSumErr,cSumErr; HOST_DATA_TYPE aAvgErr,bAvgErr,cAvgErr; @@ -158,7 +136,7 @@ stream::StreamBenchmark::validateOutputAndPrintError(const hpcc_base::ExecutionS aj = 2.0E0 * aj; /* now execute timing loop */ scalar = 3.0; - for (k=0; 
knumRepetitions; k++) + for (k=0; kprogramSettings->numRepetitions; k++) { cj = aj; bj = scalar*cj; @@ -170,15 +148,15 @@ stream::StreamBenchmark::validateOutputAndPrintError(const hpcc_base::ExecutionS aSumErr = 0.0; bSumErr = 0.0; cSumErr = 0.0; - for (j=0; j< settings.programSettings->streamArraySize; j++) { + for (j=0; j< executionSettings->programSettings->streamArraySize; j++) { aSumErr += abs(data.A[j] - aj); bSumErr += abs(data.B[j] - bj); cSumErr += abs(data.C[j] - cj); // if (j == 417) printf("Index 417: c[j]: %f, cj: %f\n",c[j],cj); // MCCALPIN } - aAvgErr = aSumErr / (HOST_DATA_TYPE) settings.programSettings->streamArraySize; - bAvgErr = bSumErr / (HOST_DATA_TYPE) settings.programSettings->streamArraySize; - cAvgErr = cSumErr / (HOST_DATA_TYPE) settings.programSettings->streamArraySize; + aAvgErr = aSumErr / (HOST_DATA_TYPE) executionSettings->programSettings->streamArraySize; + bAvgErr = bSumErr / (HOST_DATA_TYPE) executionSettings->programSettings->streamArraySize; + cAvgErr = cSumErr / (HOST_DATA_TYPE) executionSettings->programSettings->streamArraySize; if (sizeof(HOST_DATA_TYPE) == 4) { epsilon = 1.e-6; @@ -187,7 +165,7 @@ stream::StreamBenchmark::validateOutputAndPrintError(const hpcc_base::ExecutionS epsilon = 1.e-13; } else { - printf("WEIRD: sizeof(STREAM_TYPE) = %lu\n",sizeof(settings.programSettings->streamArraySize)); + printf("WEIRD: sizeof(STREAM_TYPE) = %lu\n",sizeof(executionSettings->programSettings->streamArraySize)); epsilon = 1.e-6; } @@ -197,7 +175,7 @@ stream::StreamBenchmark::validateOutputAndPrintError(const hpcc_base::ExecutionS printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon); printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj); ierr = 0; - for (j=0; jstreamArraySize; j++) { + for (j=0; jprogramSettings->streamArraySize; j++) { if (abs(data.A[j]/aj-1.0) > epsilon) { ierr++; } @@ -210,7 +188,7 @@ stream::StreamBenchmark::validateOutputAndPrintError(const 
hpcc_base::ExecutionS printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj); printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); ierr = 0; - for (j=0; jstreamArraySize; j++) { + for (j=0; jprogramSettings->streamArraySize; j++) { if (abs(data.B[j]/bj-1.0) > epsilon) { ierr++; } @@ -223,7 +201,7 @@ stream::StreamBenchmark::validateOutputAndPrintError(const hpcc_base::ExecutionS printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj); printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); ierr = 0; - for (j=0; jstreamArraySize; j++) { + for (j=0; jprogramSettings->streamArraySize; j++) { if (abs(data.C[j]/cj-1.0) > epsilon) { ierr++; } diff --git a/STREAM/src/host/stream_benchmark.hpp b/STREAM/src/host/stream_benchmark.hpp index a941289e..caa36cfe 100644 --- a/STREAM/src/host/stream_benchmark.hpp +++ b/STREAM/src/host/stream_benchmark.hpp @@ -81,8 +81,42 @@ class StreamData { public: HOST_DATA_TYPE *A, *B, *C; - StreamData(HOST_DATA_TYPE *A,HOST_DATA_TYPE *B,HOST_DATA_TYPE *C) : A(A), B(B), C(C) {} - StreamData(StreamData *d) : A(d->A), B(d->B), C(d->C) {} + StreamData(const cl::Context& context, size_t size) { + #ifdef INTEL_FPGA + #ifdef USE_SVM + A = reinterpret_cast( + clSVMAlloc(context(), 0 , + size * sizeof(HOST_DATA_TYPE), 1024)); + B = reinterpret_cast( + clSVMAlloc(context(), 0 , + size * sizeof(HOST_DATA_TYPE), 1024)); + C = reinterpret_cast( + clSVMAlloc(context(), 0 , + size * sizeof(HOST_DATA_TYPE), 1024)); + #else + posix_memalign(reinterpret_cast(&A), 64, size * sizeof(HOST_DATA_TYPE)); + posix_memalign(reinterpret_cast(&B), 64, size * sizeof(HOST_DATA_TYPE)); + posix_memalign(reinterpret_cast(&C), 64, size * sizeof(HOST_DATA_TYPE)); + #endif + #endif + #ifdef XILINX_FPGA + posix_memalign(reinterpret_cast(&A), 4096, size * sizeof(HOST_DATA_TYPE)); + posix_memalign(reinterpret_cast(&B), 4096, size * sizeof(HOST_DATA_TYPE)); + posix_memalign(reinterpret_cast(&C), 4096, size * 
sizeof(HOST_DATA_TYPE)); + #endif + } + + ~StreamData() { + #ifdef USE_SVM + clSVMFree(A); + clSVMFree(B); + clSVMFree(C); + #else + free(A); + free(B); + free(C); + #endif + } }; @@ -125,43 +159,39 @@ class StreamBenchmark : public hpcc_base::HpccFpgaBenchmark + * + * @return std::unique_ptr */ - std::shared_ptr - generateInputData(const hpcc_base::ExecutionSettings &settings) override; + std::unique_ptr + generateInputData() override; /** * @brief Stream specific implementation of the kernel execution * - * @param settings * @param data - * @return std::shared_ptr + * @return std::unique_ptr */ - std::shared_ptr - executeKernel(const hpcc_base::ExecutionSettings &settings, StreamData &data) override; + std::unique_ptr + executeKernel( StreamData &data) override; /** * @brief Stream specific implementation of the execution validation - * - * @param settings + * * @param data * @param output * @return true * @return false */ bool - validateOutputAndPrintError(const hpcc_base::ExecutionSettings &settings ,StreamData &data, const StreamExecutionTimings &output) override; + validateOutputAndPrintError(StreamData &data) override; /** * @brief Stream specific implementation of printing the execution results * - * @param settings * @param output */ void - printResults(const hpcc_base::ExecutionSettings &settings, const StreamExecutionTimings &output) override; + printResults(const StreamExecutionTimings &output) override; /** * @brief Construct a new Stream Benchmark object diff --git a/STREAM/tests/CMakeLists.txt b/STREAM/tests/CMakeLists.txt index bd73b663..9097411a 100755 --- a/STREAM/tests/CMakeLists.txt +++ b/STREAM/tests/CMakeLists.txt @@ -4,7 +4,7 @@ add_subdirectory(../../extern/googletest ${CMAKE_BINARY_DIR}/lib/googletest) include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) include_directories(${CMAKE_BINARY_DIR}/src/common .) 
-set(TEST_SOURCES ../../shared/setup/test_fpga_setup.cpp test_kernel_functionality_and_host_integration.cpp main.cpp) +set(TEST_SOURCES main.cpp test_kernel_functionality_and_host_integration.cpp) set(HOST_EXE_NAME STREAM_FPGA) set(LIB_NAME stream) diff --git a/STREAM/tests/main.cpp b/STREAM/tests/main.cpp index d43b0a40..8827866a 100644 --- a/STREAM/tests/main.cpp +++ b/STREAM/tests/main.cpp @@ -43,7 +43,7 @@ class MPIEnvironment : public ::testing::Environment { using namespace stream; -std::shared_ptr bm; +std::unique_ptr bm; /** The program entry point for the unit tests @@ -60,11 +60,13 @@ main(int argc, char *argv[]) { ::testing::AddGlobalTestEnvironment(new MPIEnvironment(&argc, &argv)); #endif - bm = std::shared_ptr(new StreamBenchmark()); + bm = std::unique_ptr(new StreamBenchmark(argc, argv)); - bm->setupBenchmark(argc, argv); + bool result = RUN_ALL_TESTS(); - return RUN_ALL_TESTS(); + bm = nullptr; + + return result; } diff --git a/STREAM/tests/test_kernel_functionality_and_host_integration.cpp b/STREAM/tests/test_kernel_functionality_and_host_integration.cpp index 92f967df..e796c4fb 100644 --- a/STREAM/tests/test_kernel_functionality_and_host_integration.cpp +++ b/STREAM/tests/test_kernel_functionality_and_host_integration.cpp @@ -12,9 +12,10 @@ struct StreamKernelTest :public ::testing::Test { void SetUp( ) { bm->getExecutionSettings().programSettings->streamArraySize = VECTOR_COUNT * UNROLL_COUNT * NUM_KERNEL_REPLICATIONS * BUFFER_SIZE; - data = bm->generateInputData(bm->getExecutionSettings()); + data = bm->generateInputData(); } + }; @@ -23,7 +24,7 @@ struct StreamKernelTest :public ::testing::Test { */ TEST_F(StreamKernelTest, FPGACorrectResultsOneRepetition) { bm->getExecutionSettings().programSettings->numRepetitions = 1; - auto result = bm->executeKernel(bm->getExecutionSettings(), *data); + auto result = bm->executeKernel(*data); for (int i = 0; i < bm->getExecutionSettings().programSettings->streamArraySize; i++) { EXPECT_FLOAT_EQ(data->A[i], 
30.0); EXPECT_FLOAT_EQ(data->B[i], 6.0); @@ -36,7 +37,7 @@ TEST_F(StreamKernelTest, FPGACorrectResultsOneRepetition) { */ TEST_F(StreamKernelTest, FPGACorrectResultsThreeRepetition) { bm->getExecutionSettings().programSettings->numRepetitions = 3; - auto result = bm->executeKernel(bm->getExecutionSettings(), *data); + auto result = bm->executeKernel(*data); for (int i = 0; i < bm->getExecutionSettings().programSettings->streamArraySize; i++) { EXPECT_FLOAT_EQ(data->A[i], 6750.0); EXPECT_FLOAT_EQ(data->B[i], 1350.0); diff --git a/STREAM/tests/test_program_settings.h b/STREAM/tests/test_program_settings.h index b8d13110..dce434f2 100644 --- a/STREAM/tests/test_program_settings.h +++ b/STREAM/tests/test_program_settings.h @@ -23,5 +23,4 @@ SOFTWARE. /* Project's headers */ #include "stream_benchmark.hpp" - -extern std::shared_ptr bm; +extern std::unique_ptr bm; From 797779faf4b33761120a98ce1ea425b6ea25fea2 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 18 May 2020 19:00:36 +0200 Subject: [PATCH 28/45] Adapt random access to new base implementation --- RandomAccess/src/host/execution.h | 2 +- RandomAccess/src/host/execution_single.cpp | 12 ++--- .../src/host/random_access_benchmark.cpp | 46 ++++++++----------- .../src/host/random_access_benchmark.hpp | 36 +++++++++++---- RandomAccess/tests/main.cpp | 5 +- RandomAccess/tests/test_host_code.cpp | 10 ++-- ...nel_functionality_and_host_integration.cpp | 12 ++--- 7 files changed, 66 insertions(+), 57 deletions(-) diff --git a/RandomAccess/src/host/execution.h b/RandomAccess/src/host/execution.h index d494b321..030544d9 100644 --- a/RandomAccess/src/host/execution.h +++ b/RandomAccess/src/host/execution.h @@ -53,7 +53,7 @@ simple exchange of the different calculation methods. 
@return The time measurements and the error rate counted from the executions */ -std::shared_ptr +std::unique_ptr calculate(hpcc_base::ExecutionSettings const& config, HOST_DATA_TYPE * data); } // namespace bm_execution diff --git a/RandomAccess/src/host/execution_single.cpp b/RandomAccess/src/host/execution_single.cpp index 94ba4574..67bbd7bf 100644 --- a/RandomAccess/src/host/execution_single.cpp +++ b/RandomAccess/src/host/execution_single.cpp @@ -39,7 +39,7 @@ namespace bm_execution { Implementation for the single kernel. @copydoc bm_execution::calculate() */ - std::shared_ptr + std::unique_ptr calculate(hpcc_base::ExecutionSettings const& config, HOST_DATA_TYPE * data) { // int used to check for OpenCL errors int err; @@ -52,22 +52,22 @@ namespace bm_execution { /* --- Prepare kernels --- */ for (int r=0; r < config.programSettings->kernelReplications; r++) { - compute_queue.push_back(cl::CommandQueue(config.context, config.device)); + compute_queue.push_back(cl::CommandQueue(*config.context, *config.device)); int memory_bank_info = 0; #ifdef INTEL_FPGA memory_bank_info = ((r + 1) << 16); #endif - Buffer_data.push_back(cl::Buffer(config.context, + Buffer_data.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE | memory_bank_info, sizeof(HOST_DATA_TYPE)*(config.programSettings->dataSize / config.programSettings->kernelReplications))); #ifdef INTEL_FPGA - accesskernel.push_back(cl::Kernel(config.program, + accesskernel.push_back(cl::Kernel(*config.program, (RANDOM_ACCESS_KERNEL + std::to_string(r)).c_str() , &err)); #endif #ifdef XILINX_FPGA - accesskernel.push_back(cl::Kernel(config.program, + accesskernel.push_back(cl::Kernel(*config.program, (std::string(RANDOM_ACCESS_KERNEL) + "0:{" + RANDOM_ACCESS_KERNEL + "0_" + std::to_string(r + 1) + "}").c_str() , &err)); #endif @@ -153,7 +153,7 @@ namespace bm_execution { #endif } - return std::shared_ptr(new random_access::RandomAccessExecutionTimings{executionTimes}); + return std::unique_ptr(new 
random_access::RandomAccessExecutionTimings{executionTimes}); } } // namespace bm_execution diff --git a/RandomAccess/src/host/random_access_benchmark.cpp b/RandomAccess/src/host/random_access_benchmark.cpp index 8b99e11b..fb72c45c 100644 --- a/RandomAccess/src/host/random_access_benchmark.cpp +++ b/RandomAccess/src/host/random_access_benchmark.cpp @@ -62,13 +62,12 @@ random_access::RandomAccessBenchmark::addAdditionalParseOptions(cxxopts::Options ("d", "Size of the data array", cxxopts::value()->default_value(std::to_string(DEFAULT_ARRAY_LENGTH))) ("r", "Number of kernel replications used", - cxxopts::value()->default_value(std::to_string(NUM_KERNEL_REPLICATIONS))) - ("single-kernel", "Use the single kernel implementation"); + cxxopts::value()->default_value(std::to_string(NUM_KERNEL_REPLICATIONS))); } -std::shared_ptr -random_access::RandomAccessBenchmark::executeKernel(const hpcc_base::ExecutionSettings &settings, RandomAccessData &data) { - return bm_execution::calculate(settings, data.data); +std::unique_ptr +random_access::RandomAccessBenchmark::executeKernel(RandomAccessData &data) { + return bm_execution::calculate(*executionSettings, data.data); } /** @@ -77,7 +76,7 @@ Prints the execution results to stdout @param results The execution results */ void -random_access::RandomAccessBenchmark::printResults(const hpcc_base::ExecutionSettings &settings, const random_access::RandomAccessExecutionTimings &output) { +random_access::RandomAccessBenchmark::printResults(const random_access::RandomAccessExecutionTimings &output) { std::cout << std::setw(ENTRY_SPACE) << "best" << std::setw(ENTRY_SPACE) << "mean" << std::setw(ENTRY_SPACE) << "GUOPS" << std::endl; @@ -85,7 +84,7 @@ random_access::RandomAccessBenchmark::printResults(const hpcc_base::ExecutionSet // Calculate performance for kernel execution plus data transfer double tmean = 0; double tmin = std::numeric_limits::max(); - double gups = static_cast(4 * settings.programSettings->dataSize) / 1000000000; + 
double gups = static_cast(4 * executionSettings->programSettings->dataSize) / 1000000000; for (double currentTime : output.times) { tmean += currentTime; if (currentTime < tmin) { @@ -101,45 +100,36 @@ random_access::RandomAccessBenchmark::printResults(const hpcc_base::ExecutionSet } -std::shared_ptr -random_access::RandomAccessBenchmark::generateInputData(const hpcc_base::ExecutionSettings &settings) { - HOST_DATA_TYPE *data; -#ifdef USE_SVM - data = reinterpret_cast( - clSVMAlloc(context(), 0 , - settings.programSettings->dataSize * sizeof(HOST_DATA_TYPE), 1024)); -#else - posix_memalign(reinterpret_cast(&data), 4096, settings.programSettings->dataSize * sizeof(HOST_DATA_TYPE)); -#endif - - for (HOST_DATA_TYPE j=0; j < settings.programSettings->dataSize ; j++) { - data[j] = j; +std::unique_ptr +random_access::RandomAccessBenchmark::generateInputData() { + auto d = std::unique_ptr(new RandomAccessData(*executionSettings->context, executionSettings->programSettings->dataSize)); + for (HOST_DATA_TYPE j=0; j < executionSettings->programSettings->dataSize ; j++) { + d->data[j] = j; } - - return std::shared_ptr(new RandomAccessData{data}); + return d; } bool -random_access::RandomAccessBenchmark::validateOutputAndPrintError(const hpcc_base::ExecutionSettings &settings ,random_access::RandomAccessData &data, const random_access::RandomAccessExecutionTimings &output) { +random_access::RandomAccessBenchmark::validateOutputAndPrintError(random_access::RandomAccessData &data) { HOST_DATA_TYPE temp = 1; - for (HOST_DATA_TYPE i=0; i < 4L*settings.programSettings->dataSize; i++) { + for (HOST_DATA_TYPE i=0; i < 4L*executionSettings->programSettings->dataSize; i++) { HOST_DATA_TYPE_SIGNED v = 0; if (((HOST_DATA_TYPE_SIGNED)temp) < 0) { v = POLY; } temp = (temp << 1) ^ v; - data.data[(temp >> 3) & (settings.programSettings->dataSize - 1)] ^= temp; + data.data[(temp >> 3) & (executionSettings->programSettings->dataSize - 1)] ^= temp; } double errors = 0; #pragma omp parallel for 
reduction(+:errors) - for (HOST_DATA_TYPE i=0; i< settings.programSettings->dataSize; i++) { + for (HOST_DATA_TYPE i=0; i< executionSettings->programSettings->dataSize; i++) { if (data.data[i] != i) { errors++; } } - std::cout << "Error: " << (static_cast(errors) / settings.programSettings->dataSize) * 100 + std::cout << "Error: " << (static_cast(errors) / executionSettings->programSettings->dataSize) * 100 << "%" << std::endl; - return (static_cast(errors) / settings.programSettings->dataSize) < 0.01; + return (static_cast(errors) / executionSettings->programSettings->dataSize) < 0.01; } \ No newline at end of file diff --git a/RandomAccess/src/host/random_access_benchmark.hpp b/RandomAccess/src/host/random_access_benchmark.hpp index 2f39294f..4fd4a8c6 100644 --- a/RandomAccess/src/host/random_access_benchmark.hpp +++ b/RandomAccess/src/host/random_access_benchmark.hpp @@ -32,7 +32,7 @@ SOFTWARE. #include "parameters.h" /** - * @brief Contains all classes and methods needed by the STREAM benchmark + * @brief Contains all classes and methods needed by the RandomAccess benchmark * */ namespace random_access { @@ -80,7 +80,23 @@ class RandomAccessData { public: HOST_DATA_TYPE *data; - RandomAccessData(HOST_DATA_TYPE *data_) : data(data_) {} + RandomAccessData(cl::Context& context, size_t size) { + #ifdef USE_SVM + data = reinterpret_cast( + clSVMAlloc(context(), 0 , + size * sizeof(HOST_DATA_TYPE), 1024)); + #else + posix_memalign(reinterpret_cast(&data), 4096, size * sizeof(HOST_DATA_TYPE)); + #endif + } + + ~RandomAccessData() { + #ifdef USE_SVM + clSVMFree(data); + #else + free(data); + #endif + } }; @@ -120,20 +136,20 @@ class RandomAccessBenchmark : public hpcc_base::HpccFpgaBenchmark + * @return std::unique_ptr */ - std::shared_ptr - generateInputData(const hpcc_base::ExecutionSettings &settings) override; + std::unique_ptr + generateInputData() override; /** * @brief RandomAccess specific implementation of the kernel execution * * @param settings * @param data 
- * @return std::shared_ptr + * @return std::unique_ptr */ - std::shared_ptr - executeKernel(const hpcc_base::ExecutionSettings &settings, RandomAccessData &data) override; + std::unique_ptr + executeKernel(RandomAccessData &data) override; /** * @brief RandomAccess specific implementation of the execution validation @@ -145,7 +161,7 @@ class RandomAccessBenchmark : public hpcc_base::HpccFpgaBenchmark &settings ,RandomAccessData &data, const RandomAccessExecutionTimings &output) override; + validateOutputAndPrintError(RandomAccessData &data) override; /** * @brief RandomAccess specific implementation of printing the execution results @@ -154,7 +170,7 @@ class RandomAccessBenchmark : public hpcc_base::HpccFpgaBenchmark &settings, const RandomAccessExecutionTimings &output) override; + printResults(const RandomAccessExecutionTimings &output) override; /** * @brief Construct a new RandomAccess Benchmark object diff --git a/RandomAccess/tests/main.cpp b/RandomAccess/tests/main.cpp index fcbbdb6e..ffe5dbbc 100644 --- a/RandomAccess/tests/main.cpp +++ b/RandomAccess/tests/main.cpp @@ -62,7 +62,10 @@ main(int argc, char *argv[]) { ::testing::AddGlobalTestEnvironment(new MPIEnvironment(&argc, &argv)); #endif - return RUN_ALL_TESTS(); + bool result = RUN_ALL_TESTS(); + bm = nullptr; + + return result; } diff --git a/RandomAccess/tests/test_host_code.cpp b/RandomAccess/tests/test_host_code.cpp index daebafce..98822854 100644 --- a/RandomAccess/tests/test_host_code.cpp +++ b/RandomAccess/tests/test_host_code.cpp @@ -19,11 +19,11 @@ struct RandomAccessHostCodeTest : testing::Test { * Check if the correctness test gives correct results for correct array */ TEST_F(RandomAccessHostCodeTest, ResultValidationWorksForCorrectUpdates) { - auto data = bm->generateInputData(bm->getExecutionSettings()); + auto data = bm->generateInputData(); // do random accesses - bm->validateOutputAndPrintError(bm->getExecutionSettings(), *data, random_access::RandomAccessExecutionTimings{{}}); + 
bm->validateOutputAndPrintError(*data); // check correctness of random accesses - bool success = bm->validateOutputAndPrintError(bm->getExecutionSettings(), *data, random_access::RandomAccessExecutionTimings{{}}); + bool success = bm->validateOutputAndPrintError(*data); EXPECT_TRUE(success); } @@ -32,8 +32,8 @@ TEST_F(RandomAccessHostCodeTest, ResultValidationWorksForCorrectUpdates) { * Check if the correctness test gives correct results for not updated array */ TEST_F(RandomAccessHostCodeTest, ResultValidationWorksForWrongUpdates) { - auto data = bm->generateInputData(bm->getExecutionSettings()); + auto data = bm->generateInputData(); // check correctness of random accesses - bool success = bm->validateOutputAndPrintError(bm->getExecutionSettings(), *data, random_access::RandomAccessExecutionTimings{{}}); + bool success = bm->validateOutputAndPrintError( *data); EXPECT_FALSE(success); } diff --git a/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp b/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp index f32974e2..e275d10b 100644 --- a/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp +++ b/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp @@ -8,7 +8,7 @@ struct RandomAccessKernelTest : testing::Test { - std::shared_ptr data; + std::unique_ptr data; RandomAccessKernelTest() { bm->getExecutionSettings().programSettings->dataSize = 128 * NUM_KERNEL_REPLICATIONS * BUFFER_SIZE; @@ -16,7 +16,7 @@ struct RandomAccessKernelTest : testing::Test { } void SetUp() override { - data = bm->generateInputData(bm->getExecutionSettings()); + data = bm->generateInputData(); } }; @@ -26,7 +26,7 @@ struct RandomAccessKernelTest : testing::Test { * Check if the number of measurements from the calculation matches the number of repetitions */ TEST_F(RandomAccessKernelTest, FPGACorrectNumberOfMeasurements1Rep) { - auto result = bm->executeKernel(bm->getExecutionSettings(), *data); + auto result = 
bm->executeKernel( *data); EXPECT_EQ(result->times.size(), 1); } @@ -35,7 +35,7 @@ TEST_F(RandomAccessKernelTest, FPGACorrectNumberOfMeasurements1Rep) { */ TEST_F(RandomAccessKernelTest, FPGACorrectNumberOfMeasurements3Rep) { bm->getExecutionSettings().programSettings->numRepetitions = 3; - auto result = bm->executeKernel(bm->getExecutionSettings(), *data); + auto result = bm->executeKernel(*data); EXPECT_EQ(result->times.size(), 3); } @@ -43,7 +43,7 @@ TEST_F(RandomAccessKernelTest, FPGACorrectNumberOfMeasurements3Rep) { * Execution returns correct results for a single repetition */ TEST_F(RandomAccessKernelTest, FPGAErrorBelow1Percent) { - auto result = bm->executeKernel(bm->getExecutionSettings(), *data); - bool success = bm->validateOutputAndPrintError(bm->getExecutionSettings(), *data, *result); + auto result = bm->executeKernel(*data); + bool success = bm->validateOutputAndPrintError(*data); EXPECT_TRUE(success); } From e71c1b4a413921a69f937c4053ca0fbc16e40a93 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 18 May 2020 19:03:27 +0200 Subject: [PATCH 29/45] Adapt PTRANS to OO code --- PTRANS/CHANGELOG | 5 + PTRANS/CMakeLists.txt | 2 +- PTRANS/src/common/parameters.h.in | 8 + PTRANS/src/device/CMakeLists.txt | 8 +- PTRANS/src/host/CMakeLists.txt | 42 ++-- .../common_benchmark_io_implementation.cpp | 92 --------- PTRANS/src/host/execution.h | 20 +- PTRANS/src/host/execution_default.cpp | 32 +-- PTRANS/src/host/main.cpp | 64 +----- PTRANS/src/host/program_settings.h | 33 ---- PTRANS/src/host/transpose_benchmark.cpp | 151 ++++++++++++++ PTRANS/src/host/transpose_benchmark.hpp | 185 ++++++++++++++++++ PTRANS/src/host/transpose_functionality.cpp | 120 ------------ PTRANS/src/host/transpose_functionality.hpp | 76 ------- PTRANS/tests/CMakeLists.txt | 44 +++-- PTRANS/tests/main.cpp | 72 +++++++ PTRANS/tests/test_host_functionality.cpp | 154 ++++----------- ...nel_functionality_and_host_integration.cpp | 156 ++++----------- PTRANS/tests/test_program_settings.h | 
27 +++ scripts/evaluation/parse_raw_to_csv.py | 2 +- 20 files changed, 614 insertions(+), 679 deletions(-) delete mode 100644 PTRANS/src/host/common_benchmark_io_implementation.cpp delete mode 100644 PTRANS/src/host/program_settings.h create mode 100644 PTRANS/src/host/transpose_benchmark.cpp create mode 100644 PTRANS/src/host/transpose_benchmark.hpp delete mode 100644 PTRANS/src/host/transpose_functionality.cpp delete mode 100644 PTRANS/src/host/transpose_functionality.hpp create mode 100644 PTRANS/tests/main.cpp create mode 100644 PTRANS/tests/test_program_settings.h diff --git a/PTRANS/CHANGELOG b/PTRANS/CHANGELOG index e0c91f51..7ef7d89e 100644 --- a/PTRANS/CHANGELOG +++ b/PTRANS/CHANGELOG @@ -2,6 +2,11 @@ This file contains all changes made to the source code for each release. +## 1.1.1 + +#### Changed: +- Converted host code to new OO code + ## 1.1 #### Added: diff --git a/PTRANS/CMakeLists.txt b/PTRANS/CMakeLists.txt index 4af8b3ab..2faf1a62 100755 --- a/PTRANS/CMakeLists.txt +++ b/PTRANS/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.1) -project(PTRANS VERSION 1.1) +project(PTRANS VERSION 1.1.1) set(KERNEL_NAME transpose CACHE STRING "Name of the OpenCL kernel") set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices") diff --git a/PTRANS/src/common/parameters.h.in b/PTRANS/src/common/parameters.h.in index 93cfdd06..423a96ca 100644 --- a/PTRANS/src/common/parameters.h.in +++ b/PTRANS/src/common/parameters.h.in @@ -17,6 +17,14 @@ #define HOST_DATA_TYPE @HOST_DATA_TYPE@ #define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@ +/* +Short description of the program. +Moreover the version and build time is also compiled into the description. 
+*/ +#define PROGRAM_DESCRIPTION "Implementation of the matrix transposition benchmark"\ + " proposed in the HPCC benchmark suite for FPGA.\n"\ + "Version: " VERSION "\n" + /** Output separator */ diff --git a/PTRANS/src/device/CMakeLists.txt b/PTRANS/src/device/CMakeLists.txt index 4bf2e7da..1d94289f 100644 --- a/PTRANS/src/device/CMakeLists.txt +++ b/PTRANS/src/device/CMakeLists.txt @@ -3,15 +3,15 @@ include(${CMAKE_SOURCE_DIR}/../cmake/kernelTargets.cmake) if (INTELFPGAOPENCL_FOUND) generate_kernel_targets_intel(transpose_optimized) - add_test(NAME test_emulation_optimized_intel COMMAND trans_intel -f transpose_optimized_emulate.aocx -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) - add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./trans_intel -f transpose_optimized_emulate.aocx -n 1 -m 1 + add_test(NAME test_emulation_optimized_intel COMMAND Transpose_intel -f transpose_optimized_emulate.aocx -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Transpose_intel -f transpose_optimized_emulate.aocx -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() if (VITIS_FOUND) generate_kernel_targets_xilinx(transpose_optimized) - add_test(NAME test_emulation_optimized_xilinx COMMAND trans_xilinx -f transpose_optimized_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) - add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./trans_xilinx -f transpose_optimized_emulate.xclbin -n 1 -m 1 + add_test(NAME test_emulation_optimized_xilinx COMMAND Transpose_xilinx -f transpose_optimized_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Transpose_xilinx -f 
transpose_optimized_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() diff --git a/PTRANS/src/host/CMakeLists.txt b/PTRANS/src/host/CMakeLists.txt index 1bcf2935..f576ab85 100755 --- a/PTRANS/src/host/CMakeLists.txt +++ b/PTRANS/src/host/CMakeLists.txt @@ -1,20 +1,36 @@ -include_directories(../../../extern/cxxopts/include ../../../shared) -include_directories(${CMAKE_BINARY_DIR}/src/common) -include_directories(.) +add_subdirectory(../../../shared ${CMAKE_BINARY_DIR}/lib/hpccbase) +set(HOST_SOURCE execution_default.cpp transpose_benchmark.cpp) -set(HOST_SOURCE execution_default.cpp main.cpp common_benchmark_io_implementation.cpp ../../../shared/setup/fpga_setup.cpp transpose_functionality.cpp) +set(HOST_EXE_NAME Transpose) +set(LIB_NAME trans) if (INTELFPGAOPENCL_FOUND) - include_directories(${IntelFPGAOpenCL_INCLUDE_DIRS}) - add_executable(trans_intel ${HOST_SOURCE} ) - target_link_libraries(trans_intel ${IntelFPGAOpenCL_LIBRARIES}) - target_compile_definitions(trans_intel PRIVATE -DINTEL_FPGA) + add_library(${LIB_NAME}_intel STATIC ${HOST_SOURCE}) + target_include_directories(${LIB_NAME}_intel PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${IntelFPGAOpenCL_INCLUDE_DIRS}) + target_include_directories(${LIB_NAME}_intel PUBLIC ${CMAKE_SOURCE_DIR}/src/host) + add_executable(${HOST_EXE_NAME}_intel main.cpp) + target_link_libraries(${LIB_NAME}_intel "${IntelFPGAOpenCL_LIBRARIES}" "${OpenMP_CXX_FLAGS}") + target_link_libraries(${LIB_NAME}_intel hpcc_fpga_base) + target_link_libraries(${HOST_EXE_NAME}_intel ${LIB_NAME}_intel) + if (USE_SVM) + target_compile_definitions(${LIB_NAME}_intel PRIVATE -DCL_VERSION_2_0) + endif() + target_compile_definitions(${LIB_NAME}_intel PRIVATE -DINTEL_FPGA) + target_compile_options(${LIB_NAME}_intel PRIVATE "${OpenMP_CXX_FLAGS}") + add_test(NAME test_intel_host_executable COMMAND $ -h) endif() -if (VITIS_FOUND) - include_directories(${Vitis_INCLUDE_DIRS}) - add_executable(trans_xilinx 
${HOST_SOURCE}) - target_link_libraries(trans_xilinx ${Vitis_LIBRARIES}) - target_compile_definitions(trans_xilinx PRIVATE -DXILINX_FPGA) +if (Vitis_FOUND) + add_library(${LIB_NAME}_xilinx STATIC ${HOST_SOURCE}) + target_include_directories(${LIB_NAME}_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS}) + target_include_directories(${LIB_NAME}_xilinx PUBLIC ${CMAKE_SOURCE_DIR}/src/host) + add_executable(${HOST_EXE_NAME}_xilinx main.cpp) + target_link_libraries(${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") + target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base) + target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx) + target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA) + target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") + add_test(NAME test_xilinx_host_executable COMMAND $ -h) endif() + diff --git a/PTRANS/src/host/common_benchmark_io_implementation.cpp b/PTRANS/src/host/common_benchmark_io_implementation.cpp deleted file mode 100644 index 4b631c29..00000000 --- a/PTRANS/src/host/common_benchmark_io_implementation.cpp +++ /dev/null @@ -1,92 +0,0 @@ - -#include "cxxopts.hpp" -#include "parameters.h" -#include "setup/common_benchmark_io.hpp" - -/** -Parses and returns program options using the cxxopts library. 
-Supports the following parameters: - - file name of the FPGA kernel file (-f,--file) - - number of repetitions (-n) - - number of kernel replications (-r) - - data size (-d) - - use memory interleaving -@see https://github.com/jarro2783/cxxopts - -@return program settings that are created from the given program arguments -*/ -std::shared_ptr -parseProgramParameters(int argc, char *argv[]) { - // Defining and parsing program options - cxxopts::Options options(argv[0], PROGRAM_DESCRIPTION); - options.add_options() - ("f,file", "Kernel file name", cxxopts::value()) - ("n", "Number of repetitions", - cxxopts::value()->default_value(std::to_string(DEFAULT_REPETITIONS))) - ("m", "Matrix size in number of blocks in one dimension", - cxxopts::value()->default_value(std::to_string(DEFAULT_MATRIX_SIZE))) - ("b", "Block size in number of values in one dimension", - cxxopts::value()->default_value(std::to_string(BLOCK_SIZE))) - ("kernel", "Name of the kernel", - cxxopts::value()->default_value(KERNEL_NAME)) - ("i,nointerleaving", "Disable memory interleaving") - ("device", "Index of the device that has to be used. If not given you "\ - "will be asked which device to use if there are multiple devices "\ - "available.", cxxopts::value()->default_value(std::to_string(DEFAULT_DEVICE))) - ("platform", "Index of the platform that has to be used. If not given "\ - "you will be asked which platform to use if there are multiple "\ - "platforms available.", - cxxopts::value()->default_value(std::to_string(DEFAULT_PLATFORM))) - ("h,help", "Print this help"); - cxxopts::ParseResult result = options.parse(argc, argv); - - if (result.count("h")) { - // Just print help when argument is given - std::cout << options.help() << std::endl; - exit(0); - } - // Check parsed options and handle special cases - if (result.count("f") <= 0) { - // Path to the kernel file is mandatory - exit if not given! - std::cerr << "Kernel file must be given! 
Aborting" << std::endl; - std::cout << options.help() << std::endl; - exit(1); - } - - // Create program settings from program arguments - std::shared_ptr sharedSettings( - new ProgramSettings{result["n"].as(), - result["m"].as(), - result["b"].as(), - result["platform"].as(), - result["device"].as(), - static_cast(result.count("i") <= 0), - result["f"].as(), - result["kernel"].as()}); - return sharedSettings; -} - -/** - * Prints the used configuration to std out before starting the actual benchmark. - * - * @param programSettings The program settings retrieved from the command line - * @param device The device used for execution - */ -void printFinalConfiguration(const std::shared_ptr &programSettings, - const cl::Device &device) {// Give setup summary - std::cout << PROGRAM_DESCRIPTION << std::endl << HLINE; - std::cout << "Summary:" << std::endl - << "Repetitions: " << programSettings->numRepetitions - << std::endl - << "Matrix Size: " << programSettings->matrixSize * programSettings->blockSize - << std::endl - << "Memory Interleaving: " << (programSettings->useMemInterleaving ? "Yes" : "No") - << std::endl - << "Kernel file: " << programSettings->kernelFileName - << std::endl; - std::cout << "Device: " - << device.getInfo() << std::endl; - std::cout << HLINE - << "Start benchmark using the given configuration." << std::endl - << HLINE; -} diff --git a/PTRANS/src/host/execution.h b/PTRANS/src/host/execution.h index de460742..30536e3e 100644 --- a/PTRANS/src/host/execution.h +++ b/PTRANS/src/host/execution.h @@ -29,25 +29,11 @@ SOFTWARE. 
/* External library headers */ #include "CL/cl.hpp" #include "parameters.h" +#include "transpose_benchmark.hpp" namespace bm_execution { - struct ExecutionConfiguration { - cl::Context context; - cl::Device device; - cl::Program program; - std::string kernelName; - uint repetitons; - cl_uint matrixSize; - cl_uint blockSize; - bool useMemInterleaving; - }; - - struct ExecutionTimings { - std::vector transferTimings; - std::vector calculationTimings; - }; /** The actual execution of the benchmark. @@ -59,8 +45,8 @@ simple exchange of the different calculation methods. @return The resulting matrix */ - std::shared_ptr - calculate(std::shared_ptr config, HOST_DATA_TYPE *const A, + std::unique_ptr + calculate(const hpcc_base::ExecutionSettings& config, HOST_DATA_TYPE *const A, HOST_DATA_TYPE *const B, HOST_DATA_TYPE *A_out); } // namespace bm_execution diff --git a/PTRANS/src/host/execution_default.cpp b/PTRANS/src/host/execution_default.cpp index b5267d2e..83948912 100644 --- a/PTRANS/src/host/execution_default.cpp +++ b/PTRANS/src/host/execution_default.cpp @@ -39,36 +39,36 @@ namespace bm_execution { Implementation for the single kernel. 
@copydoc bm_execution::calculate() */ - std::shared_ptr - calculate(std::shared_ptr config, HOST_DATA_TYPE *const A, + std::unique_ptr + calculate(const hpcc_base::ExecutionSettings& config, HOST_DATA_TYPE *const A, HOST_DATA_TYPE *const B, HOST_DATA_TYPE *A_out) { - cl::Buffer bufferA(config->context, CL_MEM_READ_ONLY, - sizeof(HOST_DATA_TYPE) * config->matrixSize * config->matrixSize); - cl::Buffer bufferB(config->context, CL_MEM_READ_ONLY, - sizeof(HOST_DATA_TYPE) * config->matrixSize * config->matrixSize); - cl::Buffer bufferA_out(config->context, CL_MEM_WRITE_ONLY, - sizeof(HOST_DATA_TYPE) * config->matrixSize * config->matrixSize); + cl::Buffer bufferA(*config.context, CL_MEM_READ_ONLY, + sizeof(HOST_DATA_TYPE) * config.programSettings->matrixSize * config.programSettings->matrixSize); + cl::Buffer bufferB(*config.context, CL_MEM_READ_ONLY, + sizeof(HOST_DATA_TYPE) * config.programSettings->matrixSize * config.programSettings->matrixSize); + cl::Buffer bufferA_out(*config.context, CL_MEM_WRITE_ONLY, + sizeof(HOST_DATA_TYPE) * config.programSettings->matrixSize * config.programSettings->matrixSize); - cl::Kernel transposeKernel(config->program, config->kernelName.c_str()); + cl::Kernel transposeKernel(*config.program, KERNEL_NAME); transposeKernel.setArg(0, bufferA); transposeKernel.setArg(1, bufferB); transposeKernel.setArg(2, bufferA_out); - transposeKernel.setArg(3, config->matrixSize / config->blockSize); + transposeKernel.setArg(3, config.programSettings->matrixSize / config.programSettings->blockSize); - cl::CommandQueue queue(config->context); + cl::CommandQueue queue(*config.context); std::vector transferTimings; std::vector calculationTimings; - for (int repetition = 0; repetition < config->repetitons; repetition++) { + for (int repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) { auto startTransfer = std::chrono::high_resolution_clock::now(); queue.enqueueWriteBuffer(bufferA, CL_FALSE, 0, - sizeof(HOST_DATA_TYPE) * 
config->matrixSize * config->matrixSize, A); + sizeof(HOST_DATA_TYPE) * config.programSettings->matrixSize * config.programSettings->matrixSize, A); queue.enqueueWriteBuffer(bufferB, CL_FALSE, 0, - sizeof(HOST_DATA_TYPE) * config->matrixSize * config->matrixSize, B); + sizeof(HOST_DATA_TYPE) * config.programSettings->matrixSize * config.programSettings->matrixSize, B); queue.finish(); auto endTransfer = std::chrono::high_resolution_clock::now(); std::chrono::duration transferTime = @@ -86,7 +86,7 @@ namespace bm_execution { startTransfer = std::chrono::high_resolution_clock::now(); queue.enqueueReadBuffer(bufferA_out, CL_TRUE, 0, - sizeof(HOST_DATA_TYPE) * config->matrixSize * config->matrixSize, A_out); + sizeof(HOST_DATA_TYPE) * config.programSettings->matrixSize * config.programSettings->matrixSize, A_out); endTransfer = std::chrono::high_resolution_clock::now(); transferTime += std::chrono::duration_cast> @@ -94,7 +94,7 @@ namespace bm_execution { transferTimings.push_back(transferTime.count()); } - std::shared_ptr result(new ExecutionTimings{ + std::unique_ptr result(new transpose::TransposeExecutionTimings{ transferTimings, calculationTimings }); diff --git a/PTRANS/src/host/main.cpp b/PTRANS/src/host/main.cpp index 367fc9d7..d438f3cd 100644 --- a/PTRANS/src/host/main.cpp +++ b/PTRANS/src/host/main.cpp @@ -1,9 +1,6 @@ -// -// Created by Marius Meyer on 04.12.19. 
-// +#include "transpose_benchmark.hpp" -#include "transpose_functionality.hpp" -#include "setup/common_benchmark_io.hpp" +using namespace transpose; /** The program entry point @@ -11,58 +8,13 @@ The program entry point int main(int argc, char *argv[]) { // Setup benchmark - std::shared_ptr programSettings = - parseProgramParameters(argc, argv); - fpga_setup::setupEnvironmentAndClocks(); - - std::vector usedDevice; - cl::Context context; - cl::Program program; - cl::Device device; - - if (programSettings->kernelFileName != "CPU") { - usedDevice = fpga_setup::selectFPGADevice( - programSettings->defaultPlatform, - programSettings->defaultDevice); - context = cl::Context(usedDevice); - program = fpga_setup::fpgaSetup(&context, usedDevice, &programSettings->kernelFileName); + auto bm = TransposeBenchmark(argc, argv); + bool success = bm.executeBenchmark(); + if (success) { + return 0; } - - if (usedDevice.size() > 0) { - device = usedDevice[0]; - printFinalConfiguration(programSettings, device); + else { + return 1; } - - std::shared_ptr config( - new bm_execution::ExecutionConfiguration{ - context, device, program, - programSettings->kernelName, - programSettings->numRepetitions, - programSettings->matrixSize * programSettings->blockSize, - programSettings->blockSize, - programSettings->useMemInterleaving - }); - - HOST_DATA_TYPE* A; - HOST_DATA_TYPE* B; - HOST_DATA_TYPE* result; - - posix_memalign(reinterpret_cast(&A), 64, - sizeof(HOST_DATA_TYPE) * config->matrixSize * config->matrixSize); - posix_memalign(reinterpret_cast(&B), 64, - sizeof(HOST_DATA_TYPE) * config->matrixSize * config->matrixSize); - posix_memalign(reinterpret_cast(&result), 64, - sizeof(HOST_DATA_TYPE) * config->matrixSize * config->matrixSize); - - generateInputData(config->matrixSize, A, B); - // Start actual benchmark - auto results = bm_execution::calculate(config, A, B, result); - - printResults(results, config->matrixSize); - - double error = printCalculationError(config->matrixSize, 
result); - - - return (error < 1.0e-5) ? 0 : 1; } diff --git a/PTRANS/src/host/program_settings.h b/PTRANS/src/host/program_settings.h deleted file mode 100644 index 8284c74d..00000000 --- a/PTRANS/src/host/program_settings.h +++ /dev/null @@ -1,33 +0,0 @@ - -#ifndef SRC_HOST_PROGRAM_SETTINGS_H_ -#define SRC_HOST_PROGRAM_SETTINGS_H_ - -#include "parameters.h" - -/* C++ standard library headers */ -#include - -#include "CL/opencl.h" - -/* -Short description of the program. -Moreover the version and build time is also compiled into the description. -*/ - -#define PROGRAM_DESCRIPTION "Implementation of the matrix transposition benchmark"\ - " proposed in the HPCC benchmark suite for FPGA.\n"\ - "Version: " VERSION "\n" - -struct ProgramSettings { - uint numRepetitions; - cl_uint matrixSize; - cl_uint blockSize; - int defaultPlatform; - int defaultDevice; - bool useMemInterleaving; - std::string kernelFileName; - std::string kernelName; -}; - - -#endif diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp new file mode 100644 index 00000000..26cdd4ec --- /dev/null +++ b/PTRANS/src/host/transpose_benchmark.cpp @@ -0,0 +1,151 @@ +// +// Created by Marius Meyer on 04.12.19. +// + +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "transpose_benchmark.hpp" + +/* C++ standard library headers */ +#include +#include + +/* Project's headers */ +#include "execution.h" +#include "parameters.h" + +transpose::TransposeProgramSettings::TransposeProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), + matrixSize(results["m"].as() * results["b"].as()), + blockSize(results["b"].as()) { + +} + +std::map +transpose::TransposeProgramSettings::getSettingsMap() { + auto map = hpcc_base::BaseSettings::getSettingsMap(); + map["Matrix Size"] = std::to_string(matrixSize); + map["Block Size"] = std::to_string(blockSize); + return map; +} + +transpose::TransposeBenchmark::TransposeBenchmark(int argc, char* argv[]) { + setupBenchmark(argc, argv); +} + +transpose::TransposeBenchmark::TransposeBenchmark() {} + +void +transpose::TransposeBenchmark::addAdditionalParseOptions(cxxopts::Options &options) { + options.add_options() + ("m", "Matrix size in number of blocks in one dimension", + cxxopts::value()->default_value(std::to_string(DEFAULT_MATRIX_SIZE))) + ("b", "Block size in number of values in one dimension", + cxxopts::value()->default_value(std::to_string(BLOCK_SIZE))); +} + +std::unique_ptr +transpose::TransposeBenchmark::executeKernel(TransposeData &data) { + return bm_execution::calculate(*executionSettings, data.A, data.B, data.result); +} + +/** +Prints the execution results to stdout + +@param results The execution results +*/ +void +transpose::TransposeBenchmark::printResults(const 
transpose::TransposeExecutionTimings &output) { + double flops = executionSettings->programSettings->matrixSize * executionSettings->programSettings->matrixSize; + + double avgTransferTime = accumulate(output.transferTimings.begin(), output.transferTimings.end(), 0.0) + / output.transferTimings.size(); + double minTransferTime = *min_element(output.transferTimings.begin(), output.transferTimings.end()); + + + double avgCalculationTime = accumulate(output.calculationTimings.begin(), output.calculationTimings.end(), 0.0) + / output.calculationTimings.size(); + double minCalculationTime = *min_element(output.calculationTimings.begin(), output.calculationTimings.end()); + + double avgCalcFLOPS = flops / avgCalculationTime; + double avgTotalFLOPS = flops / (avgCalculationTime + avgTransferTime); + double minCalcFLOPS = flops / minCalculationTime; + double minTotalFLOPS = flops / (minCalculationTime + minTransferTime); + + std::cout << " trans calc calc FLOPS total FLOPS" << std::endl; + std::cout << "avg: " << avgTransferTime + << " " << avgCalculationTime + << " " << avgCalcFLOPS + << " " << avgTotalFLOPS + << std::endl; + std::cout << "best: " << minTransferTime + << " " << minCalculationTime + << " " << minCalcFLOPS + << " " << minTotalFLOPS + << std::endl; + +} + +std::unique_ptr +transpose::TransposeBenchmark::generateInputData() { + HOST_DATA_TYPE* A; + HOST_DATA_TYPE* B; + HOST_DATA_TYPE* result; + + posix_memalign(reinterpret_cast(&A), 64, + sizeof(HOST_DATA_TYPE) * executionSettings->programSettings->matrixSize * executionSettings->programSettings->matrixSize); + posix_memalign(reinterpret_cast(&B), 64, + sizeof(HOST_DATA_TYPE) * executionSettings->programSettings->matrixSize * executionSettings->programSettings->matrixSize); + posix_memalign(reinterpret_cast(&result), 64, + sizeof(HOST_DATA_TYPE) * executionSettings->programSettings->matrixSize * executionSettings->programSettings->matrixSize); + + std::mt19937 gen(7); + std::uniform_real_distribution<> 
dis(-100.0, 100.0); + for (int i = 0; i < executionSettings->programSettings->matrixSize; i++) { + for (int j = 0; j < executionSettings->programSettings->matrixSize; j++) { + A[i * executionSettings->programSettings->matrixSize + j] = dis(gen); + B[j * executionSettings->programSettings->matrixSize + i] = dis(gen); + result[j * executionSettings->programSettings->matrixSize + i] = 0.0; + } + } + + return std::unique_ptr(new TransposeData{A, B, result}); +} + +bool +transpose::TransposeBenchmark::validateOutputAndPrintError(transpose::TransposeData &data) { + for (int i = 0; i < executionSettings->programSettings->matrixSize; i++) { + for (int j = 0; j < executionSettings->programSettings->matrixSize; j++) { + data.A[j * executionSettings->programSettings->matrixSize + i] -= data.result[i * executionSettings->programSettings->matrixSize + j] + - data.B[i * executionSettings->programSettings->matrixSize + j]; + } + } + + double max_error = 0.0; + for (int i = 0; i < executionSettings->programSettings->matrixSize * executionSettings->programSettings->matrixSize; i++) { + max_error = std::max(fabs(data.A[i]), max_error); + } + + std::cout << "Maximum error: " << max_error << std::endl; + + return (static_cast(max_error) / executionSettings->programSettings->matrixSize) < 1.0e-6; +} \ No newline at end of file diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp new file mode 100644 index 00000000..f42d09b0 --- /dev/null +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -0,0 +1,185 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to 
the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#ifndef SRC_HOST_TRANSPOSE_BENCHMARK_H_ +#define SRC_HOST_TRANSPOSE_BENCHMARK_H_ + +/* C++ standard library headers */ +#include +#include + +/* Project's headers */ +#include "hpcc_benchmark.hpp" +#include "parameters.h" + +/** + * @brief Contains all classes and methods needed by the Transpose benchmark + * + */ +namespace transpose { + +/** + * @brief The Transpose specific program settings + * + */ +class TransposeProgramSettings : public hpcc_base::BaseSettings { + +public: + /** + * @brief The size of the whole matrix + * + */ + uint matrixSize; + + /** + * @brief The size of a matrix block + * + */ + uint blockSize; + + /** + * @brief Construct a new Transpose Program Settings object + * + * @param results the result map from parsing the program input parameters + */ + TransposeProgramSettings(cxxopts::ParseResult &results); + + /** + * @brief Construct a new Transpose Program Settings object + * + * @return a map of program parameters. keys are the name of the parameter. 
+ */ + std::map getSettingsMap() override; + +}; + +/** + * @brief Data class cotnaining the data the kernel is exeucted with + * + */ +class TransposeData { + +public: + HOST_DATA_TYPE *A, *B, *result; + TransposeData(HOST_DATA_TYPE *A_, HOST_DATA_TYPE *B_, HOST_DATA_TYPE *result_) : A(A_), B(B_), result(result_) {} + + ~TransposeData() { + free(A); + free(B); + free(result); + } + +}; + +/** + * @brief Measured execution timing from the kernel execution + * + */ +class TransposeExecutionTimings { +public: + /** + * @brief A vector containing the timings for all repetitions for the data transfer + * + */ + std::vector transferTimings; + + /** + * @brief A vector containing the timings for all repetitions for the calculation + * + */ + std::vector calculationTimings; + +}; + +/** + * @brief Implementation of the transpose benchmark + * + */ +class TransposeBenchmark : public hpcc_base::HpccFpgaBenchmark { + +protected: + + /** + * @brief Additional input parameters of the transpose benchmark + * + * @param options + */ + void + addAdditionalParseOptions(cxxopts::Options &options) override; + +public: + + /** + * @brief Random access specific implementation of the data generation + * + * @return std::unique_ptr + */ + std::unique_ptr + generateInputData() override; + + /** + * @brief Transpose specific implementation of the kernel execution + * + * @param data + * @return std::unique_ptr + */ + std::unique_ptr + executeKernel(TransposeData &data) override; + + /** + * @brief Transpose specific implementation of the execution validation + * + * @param data + * @param output + * @return true + * @return false + */ + bool + validateOutputAndPrintError(TransposeData &data) override; + + /** + * @brief Transpose specific implementation of printing the execution results + * + * @param output + */ + void + printResults(const TransposeExecutionTimings &output) override; + + /** + * @brief Construct a new Transpose Benchmark object + * + * @param argc the number of program 
input parameters + * @param argv the program input parameters as array of strings + */ + TransposeBenchmark(int argc, char* argv[]); + + /** + * @brief Construct a new Transpose Benchmark object + */ + TransposeBenchmark(); + +}; + +} // namespace stream + + +#endif // SRC_HOST_STREAM_BENCHMARK_H_ diff --git a/PTRANS/src/host/transpose_functionality.cpp b/PTRANS/src/host/transpose_functionality.cpp deleted file mode 100644 index 250bf57f..00000000 --- a/PTRANS/src/host/transpose_functionality.cpp +++ /dev/null @@ -1,120 +0,0 @@ -// -// Created by Marius Meyer on 04.12.19. -// - -/* -Copyright (c) 2019 Marius Meyer - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
-*/ - -#include "transpose_functionality.hpp" - -/* C++ standard library headers */ -#include -#include - -/* Project's headers */ -#include "execution.h" -#include "cxxopts.hpp" -#include "setup/fpga_setup.hpp" -#include "parameters.h" - - -/** - * Reference implementation that takes two matrices and calculates - * A_out = trans(A) + B - * where A, B and A_out are matrices of size n*n. - * - * @param A matrix that has to be transposed - * @param B matrix that will be added to the transposed matrix - * @param A_out matrix where the result of the calculation is stored - * @param n dimension of the matrices - */ -void -transposeReference(HOST_DATA_TYPE *const A, HOST_DATA_TYPE *const B, - HOST_DATA_TYPE *A_out, cl_uint n) { - for (int i = 0; i < n; i++) { - for (int j = 0; j < n; j++) { - A_out[i * n + j] = A[j * n + i] + B[i * n + j]; - } - } -} - - -void -generateInputData(cl_uint matrix_size, HOST_DATA_TYPE *A, HOST_DATA_TYPE *B) { - std::mt19937 gen(7); - std::uniform_real_distribution<> dis(-100.0, 100.0); - for (int i = 0; i < matrix_size; i++) { - for (int j = 0; j < matrix_size; j++) { - A[i * matrix_size + j] = dis(gen); - B[j * matrix_size + i] = -A[i * matrix_size + j] + 1.0; - } - } -} - - -/** -Prints the execution results to stdout - -@param results The execution results -*/ -void -printResults(std::shared_ptr results, cl_uint matrixSize) { - double flops = matrixSize * matrixSize; - - double avgTransferTime = accumulate(results->transferTimings.begin(), results->transferTimings.end(), 0.0) - / results->transferTimings.size(); - double minTransferTime = *min_element(results->transferTimings.begin(), results->transferTimings.end()); - - - double avgCalculationTime = accumulate(results->calculationTimings.begin(), results->calculationTimings.end(), 0.0) - / results->calculationTimings.size(); - double minCalculationTime = *min_element(results->calculationTimings.begin(), results->calculationTimings.end()); - - double avgCalcFLOPS = flops / 
avgCalculationTime; - double avgTotalFLOPS = flops / (avgCalculationTime + avgTransferTime); - double minCalcFLOPS = flops / minCalculationTime; - double minTotalFLOPS = flops / (minCalculationTime + minTransferTime); - - std::cout << " trans calc calc FLOPS total FLOPS" << std::endl; - std::cout << "avg: " << avgTransferTime - << " " << avgCalculationTime - << " " << avgCalcFLOPS - << " " << avgTotalFLOPS - << std::endl; - std::cout << "best: " << minTransferTime - << " " << minCalculationTime - << " " << minCalcFLOPS - << " " << minTotalFLOPS - << std::endl; -} - -double -printCalculationError(cl_uint matrixSize, const HOST_DATA_TYPE *result) { - double max_error = 0.0; - for (int i = 0; i < matrixSize * matrixSize; i++) { - max_error = std::max(fabs(result[i] - 1.0), max_error); - } - - std::cout << "Maximum error: " << max_error << std::endl; - - return max_error; -} \ No newline at end of file diff --git a/PTRANS/src/host/transpose_functionality.hpp b/PTRANS/src/host/transpose_functionality.hpp deleted file mode 100644 index e71bf11b..00000000 --- a/PTRANS/src/host/transpose_functionality.hpp +++ /dev/null @@ -1,76 +0,0 @@ -/* -Copyright (c) 2019 Marius Meyer - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -#ifndef SRC_HOST_TRANSPOSE_FUNCTIONALITY_H_ -#define SRC_HOST_TRANSPOSE_FUNCTIONALITY_H_ - -/* C++ standard library headers */ -#include - -/* Project's headers */ -#include "execution.h" -#include "cxxopts.hpp" -#include "setup/fpga_setup.hpp" -#include "parameters.h" - - -/** - * Reference implementation that takes two matrices and calculates - * A_out = trans(A) + B - * where A, B and A_out are matrices of size n*n. - * - * @param A matrix that has to be transposed - * @param B matrix that will be added to the transposed matrix - * @param A_out matrix where the result of the calculation is stored - * @param n dimension of the matrices - */ -void -transposeReference(HOST_DATA_TYPE *const A, HOST_DATA_TYPE *const B, - HOST_DATA_TYPE *A_out, cl_uint n); - - -void -generateInputData(cl_uint matrix_size, HOST_DATA_TYPE *A, HOST_DATA_TYPE *B); - - -/** -Prints the execution results to stdout - -@param results The execution results -*/ -void -printResults(std::shared_ptr results, cl_uint matrixSize); - - -/** - * Prints the aggregated error for the result matrix to stdout. - * All values in the result matrix should be 0 because of the used matrix generation scheme. - * All values of the matrix are aggregated to get to total aggregated error. - * - * @param matrixSize Size of the result matrix in one dimension (i. e. 
N for matrix size N*N) - * @param result The buffer containing the result matrix - * @returns the aggregated normalized error - */ -double -printCalculationError(cl_uint matrixSize, const HOST_DATA_TYPE *result); - -#endif // SRC_HOST_TRANSPOSE_FUNCTIONALITY_H_ diff --git a/PTRANS/tests/CMakeLists.txt b/PTRANS/tests/CMakeLists.txt index 78534477..ad471137 100755 --- a/PTRANS/tests/CMakeLists.txt +++ b/PTRANS/tests/CMakeLists.txt @@ -1,27 +1,29 @@ -# 'lib' is the folder with Google Test sources -add_subdirectory(../../extern/googletest ${CMAKE_CURRENT_BINARY_DIR}/lib) -include_directories(${gtest_SOURCE_DIR}/include ${gmock_SOURCE_DIR}/include) -include_directories(${CMAKE_BINARY_DIR}/src/common ../../extern/cxxopts/include ../../shared) -include_directories(${CMAKE_SOURCE_DIR}/src/host) +add_subdirectory(../../extern/googletest ${CMAKE_BINARY_DIR}/lib/googletest) +include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) +include_directories(${CMAKE_BINARY_DIR}/src/common) -set(PROJECT_SOURCES ../../shared/setup/fpga_setup.cpp ../src/host/common_benchmark_io_implementation.cpp ../src/host/execution_default.cpp ../src/host/transpose_functionality.cpp) -set(TEST_SOURCES ../../shared/testing/main.cpp test_host_functionality.cpp test_kernel_functionality_and_host_integration.cpp ../../shared/setup/test_fpga_setup.cpp) +set(HOST_EXE_NAME Transpose) +set(LIB_NAME trans) + +set(TEST_SOURCES main.cpp test_host_functionality.cpp test_kernel_functionality_and_host_integration.cpp) if (INTELFPGAOPENCL_FOUND) - include_directories(${IntelFPGAOpenCL_INCLUDE_DIRS}) - add_executable(Test_intel ${TEST_SOURCES} ${PROJECT_SOURCES}) - target_link_libraries(Test_intel gtest gmock ${IntelFPGAOpenCL_LIBRARIES}) - add_dependencies(Test_intel transpose_optimized_emulate_intel) - target_compile_definitions(Test_intel PRIVATE -DINTEL_FPGA) - add_test(NAME test_intel_unit COMMAND $ -f transpose_optimized_emulate.aocx WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + 
include_directories(SYSTEM ${IntelFPGAOpenCL_INCLUDE_DIRS}) + add_executable(${HOST_EXE_NAME}_test_intel ${TEST_SOURCES} ${PROJECT_SOURCES}) + target_link_libraries(${HOST_EXE_NAME}_test_intel gtest gmock ${LIB_NAME}_intel ${IntelFPGAOpenCL_LIBRARIES} "${OpenMP_CXX_FLAGS}") + add_dependencies(${HOST_EXE_NAME}_test_intel transpose_optimized_emulate_intel) + target_compile_definitions(${HOST_EXE_NAME}_test_intel PRIVATE -DINTEL_FPGA) + target_compile_options(${HOST_EXE_NAME}_test_intel PRIVATE "${OpenMP_CXX_FLAGS}") + add_test(NAME ${HOST_EXE_NAME}_test_intel_single_unit COMMAND $ -f transpose_optimized_emulate.aocx WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() -if (VITIS_FOUND) - include_directories(${Vitis_INCLUDE_DIRS}) - add_executable(Test_xilinx ${TEST_SOURCES} ${PROJECT_SOURCES}) - target_link_libraries(Test_xilinx gtest gmock ${Vitis_LIBRARIES}) - add_dependencies(Test_xilinx transpose_optimized_emulate_xilinx) - target_compile_definitions(Test_xilinx PRIVATE -DXILINX_FPGA) - add_test(NAME test_xilinx_unit COMMAND $ -f transpose_optimized_emulate.xclbin WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) -endif() \ No newline at end of file +if (Vitis_FOUND) + include_directories(SYSTEM ${Vitis_INCLUDE_DIRS}) + add_executable(${HOST_EXE_NAME}_test_xilinx ${TEST_SOURCES} ${PROJECT_SOURCES}) + target_link_libraries(${HOST_EXE_NAME}_test_xilinx gtest gmock ${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") + add_dependencies(${HOST_EXE_NAME}_test_xilinx transpose_optimized_emulate_xilinx) + target_compile_definitions(${HOST_EXE_NAME}_test_xilinx PRIVATE -DXILINX_FPGA) + target_compile_options(${HOST_EXE_NAME}_test_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") + add_test(NAME ${HOST_EXE_NAME}_test_xilinx_single_unit COMMAND $ -f transpose_optimized_emulate.xclbin WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) +endif() diff --git a/PTRANS/tests/main.cpp b/PTRANS/tests/main.cpp new file mode 100644 index 00000000..02c9b27f --- /dev/null +++ b/PTRANS/tests/main.cpp 
@@ -0,0 +1,72 @@ +/* +Copyright (c) 2020 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +/* Project's headers */ +#include "transpose_benchmark.hpp" + +#include "gtest/gtest.h" +#include "CL/cl.hpp" + +#ifdef _USE_MPI_ +#include "mpi.h" + +class MPIEnvironment : public ::testing::Environment { +public: + MPIEnvironment(int* argc, char** argv[]) { + MPI_Init(argc, argv); + } + + ~MPIEnvironment() override { + MPI_Finalize(); + } +}; +#endif + +using namespace transpose; + +std::unique_ptr bm; + +/** +The program entry point for the unit tests +*/ +int +main(int argc, char *argv[]) { + + std::cout << "THIS BINARY EXECUTES UNIT TESTS FOR THE FOLLOWING BENCHMARK:" << std::endl << std::endl; + + ::testing::InitGoogleTest(&argc, argv); + + bm = std::unique_ptr(new TransposeBenchmark(argc, argv)); + +#ifdef _USE_MPI_ + ::testing::Environment* const mpi_env = + ::testing::AddGlobalTestEnvironment(new MPIEnvironment(&argc, &argv)); +#endif + + bool result = RUN_ALL_TESTS(); + + bm = nullptr; + + return result; + +} + diff --git a/PTRANS/tests/test_host_functionality.cpp b/PTRANS/tests/test_host_functionality.cpp index 2dfccb58..ee7df2fa 100644 --- a/PTRANS/tests/test_host_functionality.cpp +++ b/PTRANS/tests/test_host_functionality.cpp @@ -4,101 +4,9 @@ #include "gtest/gtest.h" #include "parameters.h" #include "CL/cl.hpp" -#include "../src/host/transpose_functionality.hpp" +#include "test_program_settings.h" #include "gmock/gmock-matchers.h" -/** - * Check correctness of reference matrix transposition implementation for 2x2 - */ -TEST(TransposeFunctionality, make2x2MatrixTranspose) { - - size_t used_size = 2; - auto A_test = new HOST_DATA_TYPE[used_size * used_size]; - auto B_test = new HOST_DATA_TYPE[used_size * used_size]; - auto result = new HOST_DATA_TYPE[used_size * used_size]; - for (int i = 0; i < used_size; i++) { - for (int j = 0; j < used_size; j++) { - A_test[i * used_size + j] = (HOST_DATA_TYPE) (i * used_size + j); - B_test[i * used_size + j] = 0.0; - } - } - transposeReference(A_test, B_test, result, used_size); - for (int i = 0; i < 
used_size; i++) { - for (int j = 0; j < used_size; j++) { - EXPECT_FLOAT_EQ(A_test[i * used_size + j], result[j * used_size + i]); - } - } -} - -/** - * Check correctness of reference matrix transposition implementation for 9x9 - */ -TEST(TransposeFunctionality, make9x9MatrixTranspose) { - - size_t used_size = 9; - auto A_test = new HOST_DATA_TYPE[used_size * used_size]; - auto B_test = new HOST_DATA_TYPE[used_size * used_size]; - auto result = new HOST_DATA_TYPE[used_size * used_size]; - for (int i = 0; i < used_size; i++) { - for (int j = 0; j < used_size; j++) { - A_test[i * used_size + j] = (HOST_DATA_TYPE) (i * used_size + j); - B_test[i * used_size + j] = 0.0; - } - } - transposeReference(A_test, B_test, result, used_size); - for (int i = 0; i < used_size; i++) { - for (int j = 0; j < used_size; j++) { - EXPECT_FLOAT_EQ(A_test[i * used_size + j], result[j * used_size + i]); - } - } -} - -/** - * Check that B is not transposed - */ -TEST(TransposeFunctionality, BStaysTheSame) { - - size_t used_size = 10; - auto A_test = new HOST_DATA_TYPE[used_size * used_size]; - auto B_test = new HOST_DATA_TYPE[used_size * used_size]; - auto result = new HOST_DATA_TYPE[used_size * used_size]; - for (int i = 0; i < used_size; i++) { - for (int j = 0; j < used_size; j++) { - B_test[i * used_size + j] = (HOST_DATA_TYPE) (i * used_size + j); - A_test[i * used_size + j] = 0.0; - } - } - transposeReference(A_test, B_test, result, used_size); - for (int i = 0; i < used_size; i++) { - for (int j = 0; j < used_size; j++) { - EXPECT_FLOAT_EQ(B_test[i * used_size + j], result[i * used_size + j]); - } - } -} - -/** - * Check if addition is done for A and B - */ -TEST(TransposeFunctionality, BAndAAreAddedUp) { - - size_t used_size = 10; - auto A_test = new HOST_DATA_TYPE[used_size * used_size]; - auto B_test = new HOST_DATA_TYPE[used_size * used_size]; - auto result = new HOST_DATA_TYPE[used_size * used_size]; - for (int i = 0; i < used_size; i++) { - for (int j = 0; j < used_size; j++) 
{ - B_test[i * used_size + j] = (HOST_DATA_TYPE) (i * used_size + j); - A_test[i * used_size + j] = 1.0; - } - } - transposeReference(A_test, B_test, result, used_size); - for (int i = 0; i < used_size; i++) { - for (int j = 0; j < used_size; j++) { - EXPECT_FLOAT_EQ(B_test[i * used_size + j] + 1.0, result[i * used_size + j]); - } - } -} - /** * Check if the output has the correct structure */ @@ -107,17 +15,16 @@ TEST(ResultOutput, OutputsCorrectFormatHeader) { std::vector calculateTimings; transferTimings.push_back(1.0); calculateTimings.push_back(1.0); - std::shared_ptr results( - new bm_execution::ExecutionTimings{transferTimings, calculateTimings}); + std::shared_ptr results( + new transpose::TransposeExecutionTimings{transferTimings, calculateTimings}); - fpga_setup::setupEnvironmentAndClocks(); // Redirect stout buffer to local buffer to make checks possible std::stringstream newStdOutBuffer; std::streambuf *oldStdOutBuffer = std::cout.rdbuf(); std::cout.rdbuf(newStdOutBuffer.rdbuf()); - printResults(results, 10); + bm->printResults(*results); // Redirect stdout to old buffer std::cout.rdbuf(oldStdOutBuffer); @@ -134,17 +41,16 @@ TEST(ResultOutput, OutputsCorrectFormatValues) { std::vector calculateTimings; transferTimings.push_back(1.0); calculateTimings.push_back(1.0); - std::shared_ptr results( - new bm_execution::ExecutionTimings{transferTimings, calculateTimings}); + std::shared_ptr results( + new transpose::TransposeExecutionTimings{transferTimings, calculateTimings}); - fpga_setup::setupEnvironmentAndClocks(); // Redirect stout buffer to local buffer to make checks possible std::stringstream newStdOutBuffer; std::streambuf *oldStdOutBuffer = std::cout.rdbuf(); std::cout.rdbuf(newStdOutBuffer.rdbuf()); - printResults(results, 10); + bm->printResults(*results); // Redirect stdout to old buffer std::cout.rdbuf(oldStdOutBuffer); @@ -157,9 +63,14 @@ TEST(ResultOutput, OutputsCorrectFormatValues) { * Checks if the error is printed to stdout and the error is 
aggregated over the whole matrix. */ TEST(ErrorOutput, AggregatedErrorIsPrinted) { - HOST_DATA_TYPE *results = new HOST_DATA_TYPE[4 * 4]; - for (int i = 0; i < 4 * 4; i++) { - results[i] = 0.0; + bm->getExecutionSettings().programSettings->matrixSize = 4; + bm->executeBenchmark(); + auto data = bm->generateInputData(); + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + data->A[i * 4 + j] = i * 4 + j; + data->B[i * 4 + j] = i * 4 + j; + } } // Redirect stout buffer to local buffer to make checks possible @@ -167,26 +78,43 @@ TEST(ErrorOutput, AggregatedErrorIsPrinted) { std::streambuf *oldStdOutBuffer = std::cout.rdbuf(); std::cout.rdbuf(newStdOutBuffer.rdbuf()); - printCalculationError(4, results); + bool success = bm->validateOutputAndPrintError(*data); // Redirect stdout to old buffer std::cout.rdbuf(oldStdOutBuffer); EXPECT_THAT(newStdOutBuffer.str(), - ::testing::MatchesRegex("Maximum error:\\s+1\\.00000e\\+00\n")); + ::testing::MatchesRegex("Maximum error:\\s+3\\.00000e\\+01\n")); + EXPECT_FALSE(success); } /** - * Checks if the error is returned as an integer by the error calculation function. + * Checks if the error is printed to stdout and validation can be success. 
*/ -TEST(ErrorOutput, AggregatedErrorIsReturned) { - HOST_DATA_TYPE *results = new HOST_DATA_TYPE[4 * 4]; - for (int i = 0; i < 4 * 4; i++) { - results[i] = 0.0; +TEST(ErrorOutput, ValidationIsSuccess) { + bm->getExecutionSettings().programSettings->matrixSize = 4; + bm->executeBenchmark(); + auto data = bm->generateInputData(); + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + data->A[i * 4 + j] = 0.0; + data->B[i * 4 + j] = 0.0; + } } - int error = printCalculationError(4, results); + // Redirect stout buffer to local buffer to make checks possible + std::stringstream newStdOutBuffer; + std::streambuf *oldStdOutBuffer = std::cout.rdbuf(); + std::cout.rdbuf(newStdOutBuffer.rdbuf()); + + bool success = bm->validateOutputAndPrintError(*data); - EXPECT_EQ(error, 1); + // Redirect stdout to old buffer + std::cout.rdbuf(oldStdOutBuffer); + + EXPECT_THAT(newStdOutBuffer.str(), + ::testing::MatchesRegex("Maximum error:\\s+0\\.00000e\\+00\n")); + EXPECT_TRUE(success); } + diff --git a/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp b/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp index 0d1c347e..55e8889b 100644 --- a/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp +++ b/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp @@ -4,68 +4,19 @@ #include #include "gtest/gtest.h" -#include "../src/host/execution.h" -#include "../src/host/transpose_functionality.hpp" #include "parameters.h" -#include "testing/test_program_settings.h" +#include "test_program_settings.h" -struct OpenCLKernelTest : testing::Test { - std::string kernelFileName; - HOST_DATA_TYPE *A; - HOST_DATA_TYPE *B; - HOST_DATA_TYPE *A_out; - std::shared_ptr config; - cl_uint matrix_size; - cl::Program program; - cl::Context context; - std::vector device; +struct TransposeKernelTest : testing::Test { + std::shared_ptr data; + uint matrix_size = BLOCK_SIZE; - OpenCLKernelTest() { - kernelFileName = programSettings->kernelFileName; + void 
SetUp() override { matrix_size = BLOCK_SIZE; - posix_memalign(reinterpret_cast(&A), 64, - sizeof(HOST_DATA_TYPE) * matrix_size * matrix_size); - posix_memalign(reinterpret_cast(&B), 64, - sizeof(HOST_DATA_TYPE) * matrix_size * matrix_size); - posix_memalign(reinterpret_cast(&A_out), 64, - sizeof(HOST_DATA_TYPE) * matrix_size * matrix_size); - for (int i = 0; i < matrix_size; i++) { - for (int j = 0; j < matrix_size; j++) { - A[i * matrix_size + j] = 0.0; - B[i * matrix_size + j] = 0.0; - A_out[i * matrix_size + j] = 0.0; - } - } - setupFPGA(); - } - - void setupFPGA() { - if (!config.get()) { - // TODO: Workaround. File bug report to XRT? - // This is done because of a bug in Xilix XRT that does not allow - // to reprogram an FPGA twice which will crash with CL_OUT_OF_RESOURCES - device = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, programSettings->defaultDevice); - context = cl::Context(device[0]); - program = fpga_setup::fpgaSetup(&context, device, &kernelFileName); - } - - config = std::make_shared( - bm_execution::ExecutionConfiguration{ - context, device[0], program, - KERNEL_NAME, - 1, - matrix_size, - BLOCK_SIZE, - false - }); - generateInputData(matrix_size, A, B); - } - - ~OpenCLKernelTest() override { - free(A); - free(B); - free(A_out); + bm->getExecutionSettings().programSettings->matrixSize = matrix_size; + bm->getExecutionSettings().programSettings->numRepetitions = 1; + data = bm->generateInputData(); } }; @@ -73,17 +24,17 @@ struct OpenCLKernelTest : testing::Test { /** * Tests if B will not be transposed */ -TEST_F(OpenCLKernelTest, FPGACorrectBStaysTheSame) { +TEST_F(TransposeKernelTest, FPGACorrectBStaysTheSame) { for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { - A[i * matrix_size + j] = 0.0; - B[i * matrix_size + j] = i * matrix_size + j; + data->A[i * matrix_size + j] = 0.0; + data->B[i * matrix_size + j] = i * matrix_size + j; } } - auto result = bm_execution::calculate(config, A, B, A_out); + 
bm->executeKernel(*data); for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { - EXPECT_FLOAT_EQ(A_out[i * matrix_size + j], B[i * matrix_size + j]); + EXPECT_FLOAT_EQ(data->result[i * matrix_size + j], data->B[i * matrix_size + j]); } } } @@ -91,17 +42,17 @@ TEST_F(OpenCLKernelTest, FPGACorrectBStaysTheSame) { /** * Tests if a block of A will be correctly transposed */ -TEST_F(OpenCLKernelTest, FPGAABlockIsTransposed) { +TEST_F(TransposeKernelTest, FPGAABlockIsTransposed) { for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { - A[i * matrix_size + j] = i * matrix_size + j; - B[i * matrix_size + j] = 0.0; + data->A[i * matrix_size + j] = i * matrix_size + j; + data->B[i * matrix_size + j] = 0.0; } } - auto result = bm_execution::calculate(config, A, B, A_out); + bm->executeKernel(*data); for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { - EXPECT_FLOAT_EQ(A_out[i * matrix_size + j], A[j * matrix_size + i]); + EXPECT_FLOAT_EQ(data->result[i * matrix_size + j], data->A[j * matrix_size + i]); } } } @@ -109,35 +60,24 @@ TEST_F(OpenCLKernelTest, FPGAABlockIsTransposed) { /** * Tests if A will be transposed when it is bigger than one block */ -TEST_F(OpenCLKernelTest, FPGAAIsTransposed) { +TEST_F(TransposeKernelTest, FPGAAIsTransposed) { // delete memory allocated in constructor - free(A); - free(B); - free(A_out); - - // allocate more memory for test with multiple blocks matrix_size = 2 * BLOCK_SIZE; - posix_memalign(reinterpret_cast(&A), 64, - sizeof(HOST_DATA_TYPE) * matrix_size * matrix_size); - posix_memalign(reinterpret_cast(&B), 64, - sizeof(HOST_DATA_TYPE) * matrix_size * matrix_size); - posix_memalign(reinterpret_cast(&A_out), 64, - sizeof(HOST_DATA_TYPE) * matrix_size * matrix_size); - - setupFPGA(); + bm->getExecutionSettings().programSettings->matrixSize = matrix_size; + data = bm->generateInputData(); // Do actual test for (int i = 0; i < matrix_size; i++) { for (int j = 0; j 
< matrix_size; j++) { - A[i * matrix_size + j] = i * matrix_size + j; - B[i * matrix_size + j] = 0.0; + data->A[i * matrix_size + j] = i * matrix_size + j; + data->B[i * matrix_size + j] = 0.0; } } - auto result = bm_execution::calculate(config, A, B, A_out); + bm->executeKernel(*data); for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { - EXPECT_FLOAT_EQ(A_out[i * matrix_size + j], A[j * matrix_size + i]); + EXPECT_FLOAT_EQ(data->result[i * matrix_size + j], data->A[j * matrix_size + i]); } } } @@ -145,17 +85,17 @@ TEST_F(OpenCLKernelTest, FPGAAIsTransposed) { /** * Tests if matrix A and B will be summed up in the result */ -TEST_F(OpenCLKernelTest, FPGAAAndBAreSummedUp) { +TEST_F(TransposeKernelTest, FPGAAAndBAreSummedUp) { for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { - A[i * matrix_size + j] = 1.0; - B[i * matrix_size + j] = i * matrix_size + j; + data->A[i * matrix_size + j] = 1.0; + data->B[i * matrix_size + j] = i * matrix_size + j; } } - auto result = bm_execution::calculate(config, A, B, A_out); + bm->executeKernel(*data); for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { - EXPECT_FLOAT_EQ(A_out[i * matrix_size + j], B[i * matrix_size + j] + 1.0); + EXPECT_FLOAT_EQ(data->result[i * matrix_size + j], data->B[i * matrix_size + j] + 1.0); } } } @@ -164,9 +104,9 @@ TEST_F(OpenCLKernelTest, FPGAAAndBAreSummedUp) { /** * Checks the size and values of the timing measurements that are retured by calculate. 
*/ -TEST_F(OpenCLKernelTest, FPGATimingsMeasuredForEveryIteration) { - config->repetitons = 10; - auto result = bm_execution::calculate(config, A, B, A_out); +TEST_F(TransposeKernelTest, FPGATimingsMeasuredForEveryIteration) { + bm->getExecutionSettings().programSettings->numRepetitions = 10; + auto result = bm->executeKernel(*data); EXPECT_EQ(result->calculationTimings.size(), 10); EXPECT_EQ(result->transferTimings.size(), 10); for (int t = 0; t < 10; t++) { @@ -178,32 +118,16 @@ TEST_F(OpenCLKernelTest, FPGATimingsMeasuredForEveryIteration) { /** * Check if the generated input data is in the specified range */ -TEST(ExecutionDefault, GenerateInputDataRange) { - HOST_DATA_TYPE *A = new HOST_DATA_TYPE[25]; - HOST_DATA_TYPE *B = new HOST_DATA_TYPE[25]; - generateInputData(5, A, B); +TEST_F(TransposeKernelTest, GenerateInputDataRange) { + bm->getExecutionSettings().programSettings->matrixSize = 5; + auto data = bm->generateInputData(); for (int i = 0; i < 5; i++) { for (int j = 0; j < 5; j++) { - EXPECT_LT(A[i * 5 + j], 100); - EXPECT_GT(A[i * 5 + j], -100); - EXPECT_LT(B[i * 5 + j], 101); - EXPECT_GT(B[i * 5 + j], -99); + EXPECT_LT(data->A[i * 5 + j], 100); + EXPECT_GT(data->A[i * 5 + j], -100); + EXPECT_LT(data->B[i * 5 + j], 101); + EXPECT_GT(data->B[i * 5 + j], -99); } } } -/** - * Check if the input data is generated correctly - */ -TEST(ExecutionDefault, GenerateInputDataCorrectness) { - HOST_DATA_TYPE *A = new HOST_DATA_TYPE[25]; - HOST_DATA_TYPE *B = new HOST_DATA_TYPE[25]; - HOST_DATA_TYPE *result = new HOST_DATA_TYPE[25]; - generateInputData(5, A, B); - transposeReference(A, B, result, 5); - for (int i = 0; i < 5; i++) { - for (int j = 0; j < 5; j++) { - EXPECT_NEAR(result[i * 5 + j], 1.0, std::numeric_limits::epsilon()); - } - } -} diff --git a/PTRANS/tests/test_program_settings.h b/PTRANS/tests/test_program_settings.h new file mode 100644 index 00000000..720f61e5 --- /dev/null +++ b/PTRANS/tests/test_program_settings.h @@ -0,0 +1,27 @@ +/* +Copyright (c) 
2020 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +/* Project's headers */ +#include "transpose_benchmark.hpp" + + +extern std::unique_ptr bm; diff --git a/scripts/evaluation/parse_raw_to_csv.py b/scripts/evaluation/parse_raw_to_csv.py index 75024fb1..ba8b4ecf 100755 --- a/scripts/evaluation/parse_raw_to_csv.py +++ b/scripts/evaluation/parse_raw_to_csv.py @@ -12,7 +12,7 @@ fft_regex = "Version:\\s+(?P.+)\n(.*\n)+FFT\\sSize:\\s+(?P\d+)\nData\\sSize:\\s+(?P\d+)(.*\n)+Device:\\s+(?P.+)\n(.*\n)+\\s+res\.\\serror\\s+mach\.\\seps\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+avg\\s+best\n\\s+Time\\s+in\\s+s:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+GFLOPS:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" gemm_regex = "Version:\\s+(?P.+)\n(.*\n)+Total\\smatrix\\ssize:\\s+(?P\d+)(.*\n)+Device:\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+best\\s+mean\\s+GFLOPS\n\\s+(?P.+)\\s+(?P.+)\\s+(?P.+)" ra_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+Error:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GUOPS\n\\s+(?P.+)\\s+(?P.+)\\s+(?P.+)" -trans_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize:\\s+(?P\d+)(.*\n)+Device:\\s+(?P.+)\n(.*\n)+\\s+trans\\s+calc\\s+calc\\s+FLOPS\\s+total\\s+FLOPS\n\\s*avg:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s*best:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s*Maximum\\serror:\\s+(?P(\d|\.|\+|-|e)+)" +trans_regex = 
"Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s*Maximum\\serror:\\s+(?P(\d|\.|\+|-|e)+)\n\\s+trans\\s+calc\\s+calc\\s+FLOPS\\s+total\\s+FLOPS\n\\s*avg:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s*best:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" stream_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+\\d+\\s+\\((?P(\d|\.|\+|-|e)+)(.*\n)+Data\\sType\\s+(?P.+)\n(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Kernel\\sType\\s+(?P.+)\n(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+Function\\s+Best\\sRate\\sMB/s\\s+Avg\\stime\\ss\\s+Min\\stime\\s+Max\\stime\n\\s+Add\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Copy\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+PCI\\sread\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+PCI\\swrite\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Scale\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Triad\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" linpack_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize:\\s+(?P\d+)(.*\n)+Data\\sType:\\s+(?P.+)\n(.*\n)+Device:\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep.+\n\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GFLOPS(.*\n)+\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" From 96d9624b29880b0a9c5652ac231356c77c0d9154 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 19 May 2020 08:54:39 +0200 Subject: [PATCH 30/45] Clean up and add documentation --- PTRANS/README.md | 14 +- PTRANS/src/host/execution.h | 17 +- PTRANS/src/host/transpose_benchmark.cpp | 19 +-- 
PTRANS/src/host/transpose_benchmark.hpp | 36 +++-- RandomAccess/README.md | 11 +- .../device/random_access_kernels_ndrange.cl | 54 ------- .../random_access_kernels_single_rnd.cl | 147 ------------------ RandomAccess/src/host/execution.h | 23 +-- .../src/host/random_access_benchmark.hpp | 32 ++-- STREAM/README.md | 4 +- STREAM/src/host/execution.hpp | 19 ++- STREAM/src/host/stream_benchmark.hpp | 7 +- shared/include/hpcc_benchmark.hpp | 8 +- 13 files changed, 100 insertions(+), 291 deletions(-) delete mode 100644 RandomAccess/src/device/random_access_kernels_ndrange.cl delete mode 100644 RandomAccess/src/device/random_access_kernels_single_rnd.cl diff --git a/PTRANS/README.md b/PTRANS/README.md index 584c401a..9fbf8cb9 100644 --- a/PTRANS/README.md +++ b/PTRANS/README.md @@ -21,8 +21,8 @@ The targets below can be used to build the benchmark and its kernels: | Target | Description | | -------- | ---------------------------------------------- | - | trans_VENDOR | Builds the host application | - | Test_VENDOR | Compile the tests and its dependencies | + | Transpose_VENDOR | Builds the host application | + | Transpose_test_VENDOR | Compile the tests and its dependencies | `VENDOR` can be `intel` or `xilinx`. @@ -45,7 +45,7 @@ The currently supported values for KERNEL_FILE_NAME are listed below where `tran mkdir build && cd build cmake .. - make trans_intel + make Transpose_intel You will find all executables and kernel files in the `bin` folder of your build directory. @@ -65,16 +65,16 @@ of the Intel FPGA SDK installation. For execution of the benchmark run: - ./trans_intel -f path_to_kernel.aocx + ./Transpose_intel -f path_to_kernel.aocx For more information on available input parameters run - $./trans_xilinx -h + $./Transpose_xilinx -h Implementation of the matrix transposition benchmark proposed in the HPCC benchmark suite for FPGA. Version: 1.0.1 Usage: - ./trans_xilinx [OPTION...] + ./Transpose_xilinx [OPTION...] 
-f, --file arg Kernel file name -n, arg Number of repetitions (default: 10) @@ -96,7 +96,7 @@ For more information on available input parameters run To execute the unit and integration tests run - ./Test_intel + ./Transpose_test_intel -f KERNEL_FILE_NAME in the `bin` folder within the build directory. It will run an emulation of the kernel and execute some functionality tests. diff --git a/PTRANS/src/host/execution.h b/PTRANS/src/host/execution.h index 30536e3e..64711347 100644 --- a/PTRANS/src/host/execution.h +++ b/PTRANS/src/host/execution.h @@ -36,15 +36,14 @@ namespace bm_execution { /** -The actual execution of the benchmark. -This method can be implemented in multiple *.cpp files. This header enables -simple exchange of the different calculation methods. - -@param config struct that contains all necessary information to execute the kernel on the FPGA - - -@return The resulting matrix -*/ + * @brief Transpose and add the matrices using the OpenCL kernel + * + * @param config The program configuration + * @param A The input matrix A + * @param B The input matrix B + * @param A_out The matrix storing the result of the calculation + * @return std::unique_ptr The measured execution times + */ std::unique_ptr calculate(const hpcc_base::ExecutionSettings& config, HOST_DATA_TYPE *const A, HOST_DATA_TYPE *const B, HOST_DATA_TYPE *A_out); diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp index 26cdd4ec..d243829f 100644 --- a/PTRANS/src/host/transpose_benchmark.cpp +++ b/PTRANS/src/host/transpose_benchmark.cpp @@ -107,28 +107,19 @@ transpose::TransposeBenchmark::printResults(const transpose::TransposeExecutionT std::unique_ptr transpose::TransposeBenchmark::generateInputData() { - HOST_DATA_TYPE* A; - HOST_DATA_TYPE* B; - HOST_DATA_TYPE* result; - - posix_memalign(reinterpret_cast(&A), 64, - sizeof(HOST_DATA_TYPE) * executionSettings->programSettings->matrixSize * executionSettings->programSettings->matrixSize); - 
posix_memalign(reinterpret_cast(&B), 64, - sizeof(HOST_DATA_TYPE) * executionSettings->programSettings->matrixSize * executionSettings->programSettings->matrixSize); - posix_memalign(reinterpret_cast(&result), 64, - sizeof(HOST_DATA_TYPE) * executionSettings->programSettings->matrixSize * executionSettings->programSettings->matrixSize); + auto d = std::unique_ptr(new transpose::TransposeData(executionSettings->programSettings->matrixSize)); std::mt19937 gen(7); std::uniform_real_distribution<> dis(-100.0, 100.0); for (int i = 0; i < executionSettings->programSettings->matrixSize; i++) { for (int j = 0; j < executionSettings->programSettings->matrixSize; j++) { - A[i * executionSettings->programSettings->matrixSize + j] = dis(gen); - B[j * executionSettings->programSettings->matrixSize + i] = dis(gen); - result[j * executionSettings->programSettings->matrixSize + i] = 0.0; + d->A[i * executionSettings->programSettings->matrixSize + j] = dis(gen); + d->B[j * executionSettings->programSettings->matrixSize + i] = dis(gen); + d->result[j * executionSettings->programSettings->matrixSize + i] = 0.0; } } - return std::unique_ptr(new TransposeData{A, B, result}); + return d; } bool diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index f42d09b0..fb7666be 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -64,7 +64,7 @@ class TransposeProgramSettings : public hpcc_base::BaseSettings { TransposeProgramSettings(cxxopts::ParseResult &results); /** - * @brief Construct a new Transpose Program Settings object + * @brief Get a map of the settings. This map will be used to print the final configuration. * * @return a map of program parameters. keys are the name of the parameter. 
*/ @@ -80,8 +80,25 @@ class TransposeData { public: HOST_DATA_TYPE *A, *B, *result; - TransposeData(HOST_DATA_TYPE *A_, HOST_DATA_TYPE *B_, HOST_DATA_TYPE *result_) : A(A_), B(B_), result(result_) {} + /** + * @brief Construct a new Transpose Data object + * + * @param size Size of the allocated square matrices + */ + TransposeData(uint size) { + posix_memalign(reinterpret_cast(&A), 64, + sizeof(HOST_DATA_TYPE) * size * size); + posix_memalign(reinterpret_cast(&B), 64, + sizeof(HOST_DATA_TYPE) * size * size); + posix_memalign(reinterpret_cast(&result), 64, + sizeof(HOST_DATA_TYPE) * size * size); + } + + /** + * @brief Destroy the Transpose Data object. Free the allocated memory + * + */ ~TransposeData() { free(A); free(B); @@ -131,7 +148,7 @@ class TransposeBenchmark : public hpcc_base::HpccFpgaBenchmark + * @return std::unique_ptr The input and output data of the benchmark */ std::unique_ptr generateInputData() override; @@ -139,8 +156,8 @@ class TransposeBenchmark : public hpcc_base::HpccFpgaBenchmark + * @param data The input and output data of the benchmark + * @return std::unique_ptr Measured runtimes of the kernel execution */ std::unique_ptr executeKernel(TransposeData &data) override; @@ -148,10 +165,9 @@ class TransposeBenchmark : public hpcc_base::HpccFpgaBenchmark The measured runtimes of the kernel + */ std::unique_ptr calculate(hpcc_base::ExecutionSettings const& config, HOST_DATA_TYPE * data); diff --git a/RandomAccess/src/host/random_access_benchmark.hpp b/RandomAccess/src/host/random_access_benchmark.hpp index 4fd4a8c6..6eb6470c 100644 --- a/RandomAccess/src/host/random_access_benchmark.hpp +++ b/RandomAccess/src/host/random_access_benchmark.hpp @@ -64,7 +64,7 @@ class RandomAccessProgramSettings : public hpcc_base::BaseSettings { RandomAccessProgramSettings(cxxopts::ParseResult &results); /** - * @brief Construct a new random access Program Settings object + * @brief Get a map of the settings. 
This map will be used to print the final configuration. * * @return a map of program parameters. keys are the name of the parameter. */ @@ -80,6 +80,13 @@ class RandomAccessData { public: HOST_DATA_TYPE *data; + + /** + * @brief Construct a new Random Access Data object + * + * @param context The OpenCL context that will be used to allocate SVM memory + * @param size The size of the allocated memory in number of values + */ RandomAccessData(cl::Context& context, size_t size) { #ifdef USE_SVM data = reinterpret_cast( @@ -90,6 +97,10 @@ class RandomAccessData { #endif } + /** + * @brief Destroy the Random Access Data object and free the memory allocated in the constructor + * + */ ~RandomAccessData() { #ifdef USE_SVM clSVMFree(data); @@ -135,8 +146,7 @@ class RandomAccessBenchmark : public hpcc_base::HpccFpgaBenchmark + * @return std::unique_ptr pointer to the object storing the benchmark input and output data */ std::unique_ptr generateInputData() override; @@ -144,8 +154,7 @@ class RandomAccessBenchmark : public hpcc_base::HpccFpgaBenchmark */ std::unique_ptr @@ -154,11 +163,9 @@ class RandomAccessBenchmark : public hpcc_base::HpccFpgaBenchmark The measured timings for all stream operations + */ std::unique_ptr calculate(const hpcc_base::ExecutionSettings& config, HOST_DATA_TYPE* A, diff --git a/STREAM/src/host/stream_benchmark.hpp b/STREAM/src/host/stream_benchmark.hpp index caa36cfe..71287e44 100644 --- a/STREAM/src/host/stream_benchmark.hpp +++ b/STREAM/src/host/stream_benchmark.hpp @@ -69,6 +69,11 @@ class StreamProgramSettings : public hpcc_base::BaseSettings { */ StreamProgramSettings(cxxopts::ParseResult &results); + /** + * @brief Get a map of the settings. This map will be used to print the final configuration. + * + * @return a map of program parameters. keys are the name of the parameter. 
+ */ std::map getSettingsMap() override; }; @@ -201,7 +206,7 @@ class StreamBenchmark : public hpcc_base::HpccFpgaBenchmark()), kernelFileName(results["f"].as()) {} + /** + * @brief Get a map of the settings. This map will be used to print the final configuration. + * Derived classes should override it to add additional configuration options + * + * @return std::map + */ virtual std::map getSettingsMap() { return {{"Repetitions", std::to_string(numRepetitions)}, {"Kernel File", kernelFileName}}; } From 737b7160e58130ea1f62a969fda209fed2db22c2 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 19 May 2020 10:40:02 +0200 Subject: [PATCH 31/45] Start linpack adaption --- LINPACK/src/common/parameters.h.in | 8 + LINPACK/src/host/CMakeLists.txt | 46 ++-- LINPACK/src/host/execution.h | 18 +- LINPACK/src/host/execution_blocked_pvt.cpp | 36 ++- LINPACK/src/host/linpack_benchmark.cpp | 258 +++++++++++++++++++++ LINPACK/src/host/linpack_benchmark.hpp | 224 ++++++++++++++++++ LINPACK/tests/CMakeLists.txt | 58 ++--- LINPACK/tests/main.cpp | 72 ++++++ LINPACK/tests/test_program_settings.h | 27 +++ 9 files changed, 667 insertions(+), 80 deletions(-) create mode 100644 LINPACK/src/host/linpack_benchmark.cpp create mode 100644 LINPACK/src/host/linpack_benchmark.hpp create mode 100644 LINPACK/tests/main.cpp create mode 100644 LINPACK/tests/test_program_settings.h diff --git a/LINPACK/src/common/parameters.h.in b/LINPACK/src/common/parameters.h.in index b50af517..1b43c6ae 100644 --- a/LINPACK/src/common/parameters.h.in +++ b/LINPACK/src/common/parameters.h.in @@ -19,6 +19,14 @@ #define UNROLL_COUNT @GLOBAL_MEM_UNROLL@ #define LOCAL_MEM_BLOCK_LOG @LOCAL_MEM_BLOCK_LOG@ #define REGISTER_BLOCK_LOG @REGISTER_BLOCK_LOG@ + +/* +Short description of the program +*/ +#define PROGRAM_DESCRIPTION "Implementation of the LINPACK benchmark"\ + " proposed in the HPCC benchmark suite for FPGA.\n"\ + "Version: " VERSION "\n" + /** Output separator */ diff --git a/LINPACK/src/host/CMakeLists.txt 
b/LINPACK/src/host/CMakeLists.txt index 8de7ee59..00eeeb36 100755 --- a/LINPACK/src/host/CMakeLists.txt +++ b/LINPACK/src/host/CMakeLists.txt @@ -1,24 +1,36 @@ -include_directories(../../../extern/cxxopts/include ../../../shared/) -include_directories(.) -set(HOST_SOURCE execution_blocked_pvt.cpp common_benchmark_io_implementation.cpp main.cpp ../../../shared/setup/fpga_setup.cpp linpack_functionality.cpp) +add_subdirectory(../../../shared ${CMAKE_BINARY_DIR}/lib/hpccbase) +set(HOST_SOURCE execution_blocked_pvt.cpp linpack_benchmark.cpp) + +set(HOST_EXE_NAME Linpack) +set(LIB_NAME lp) if (INTELFPGAOPENCL_FOUND) - include_directories(${IntelFPGAOpenCL_INCLUDE_DIRS}) - include_directories(${CMAKE_BINARY_DIR}/src/common) - add_executable(LINPACK_intel ${HOST_SOURCE}) - target_link_libraries(LINPACK_intel ${IntelFPGAOpenCL_LIBRARIES} "${OpenMP_CXX_FLAGS}") - target_compile_definitions(LINPACK_intel PRIVATE -DINTEL_FPGA) - target_compile_options(LINPACK_intel PRIVATE "${OpenMP_CXX_FLAGS}") - add_test(NAME test_intel_host_executable COMMAND $ -h) + add_library(${LIB_NAME}_intel STATIC ${HOST_SOURCE}) + target_include_directories(${LIB_NAME}_intel PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${IntelFPGAOpenCL_INCLUDE_DIRS}) + target_include_directories(${LIB_NAME}_intel PUBLIC ${CMAKE_SOURCE_DIR}/src/host) + add_executable(${HOST_EXE_NAME}_intel main.cpp) + target_link_libraries(${LIB_NAME}_intel "${IntelFPGAOpenCL_LIBRARIES}" "${OpenMP_CXX_FLAGS}") + target_link_libraries(${LIB_NAME}_intel hpcc_fpga_base) + target_link_libraries(${HOST_EXE_NAME}_intel ${LIB_NAME}_intel) + if (USE_SVM) + target_compile_definitions(${LIB_NAME}_intel PRIVATE -DCL_VERSION_2_0) + endif() + target_compile_definitions(${LIB_NAME}_intel PRIVATE -DINTEL_FPGA) + target_compile_options(${LIB_NAME}_intel PRIVATE "${OpenMP_CXX_FLAGS}") + add_test(NAME test_intel_host_executable COMMAND $ -h) endif() if (Vitis_FOUND) - include_directories(${Vitis_INCLUDE_DIRS}) - 
include_directories(${CMAKE_BINARY_DIR}/src/common) - add_executable(LINPACK_xilinx ${HOST_SOURCE}) - target_link_libraries(LINPACK_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") - target_compile_definitions(LINPACK_xilinx PRIVATE -DXILINX_FPGA) - target_compile_options(LINPACK_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") - add_test(NAME test_xilinx_host_executable COMMAND $ -h) + add_library(${LIB_NAME}_xilinx STATIC ${HOST_SOURCE}) + target_include_directories(${LIB_NAME}_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS}) + target_include_directories(${LIB_NAME}_xilinx PUBLIC ${CMAKE_SOURCE_DIR}/src/host) + add_executable(${HOST_EXE_NAME}_xilinx main.cpp) + target_link_libraries(${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") + target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base) + target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx) + target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA) + target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") + add_test(NAME test_xilinx_host_executable COMMAND $ -h) endif() + diff --git a/LINPACK/src/host/execution.h b/LINPACK/src/host/execution.h index d5c669ee..c702f719 100644 --- a/LINPACK/src/host/execution.h +++ b/LINPACK/src/host/execution.h @@ -29,23 +29,11 @@ SOFTWARE. /* External library headers */ #include "CL/cl.hpp" #include "parameters.h" +#include "linpack_benchmark.hpp" namespace bm_execution { - struct ExecutionConfiguration { - cl::Context context; - cl::Device device; - cl::Program program; - uint repetitions; - unsigned matrixSize; - - }; - - struct ExecutionTimings { - std::vector timings; - }; - /** The actual execution of the benchmark. This method can be implemented in multiple *.cpp files. This header enables @@ -56,8 +44,8 @@ simple exchange of the different calculation methods. 
@return The resulting matrix */ - std::shared_ptr - calculate(std::shared_ptr config, + std::unique_ptr + calculate(const hpcc_base::ExecutionConfiguration& config, HOST_DATA_TYPE* A, HOST_DATA_TYPE* b, cl_int* ipvt); diff --git a/LINPACK/src/host/execution_blocked_pvt.cpp b/LINPACK/src/host/execution_blocked_pvt.cpp index f77ff4b5..e5065745 100644 --- a/LINPACK/src/host/execution_blocked_pvt.cpp +++ b/LINPACK/src/host/execution_blocked_pvt.cpp @@ -35,10 +35,6 @@ SOFTWARE. #include "CL/cl_ext_intelfpga.h" #endif -/* Project's headers */ -#include "setup/fpga_setup.hpp" -#include "linpack_functionality.hpp" - namespace bm_execution { /* @@ -46,8 +42,8 @@ namespace bm_execution { @copydoc bm_execution::calculate() */ -std::shared_ptr -calculate(std::shared_ptr config, +std::unique_ptr +calculate(const hpcc_base::ExecutionConfiguration&config, HOST_DATA_TYPE* A, HOST_DATA_TYPE* b, cl_int* ipvt) { @@ -55,16 +51,16 @@ calculate(std::shared_ptr config, int err; // Create Command queue - cl::CommandQueue compute_queue(config->context, config->device); + cl::CommandQueue compute_queue(*config.context, *config.device); // Create Buffers for input and output - cl::Buffer Buffer_a(config->context, CL_MEM_READ_WRITE, - sizeof(HOST_DATA_TYPE)*config->matrixSize*config->matrixSize); - cl::Buffer Buffer_pivot(config->context, CL_MEM_READ_WRITE, - sizeof(cl_int)*config->matrixSize); + cl::Buffer Buffer_a(*config.context, CL_MEM_READ_WRITE, + sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize); + cl::Buffer Buffer_pivot(*config.context, CL_MEM_READ_WRITE, + sizeof(cl_int)*config.programSettings->matrixSize); // create the kernels - cl::Kernel gefakernel(config->program, "gefa", + cl::Kernel gefakernel(*config.program, "gefa", &err); ASSERT_CL(err); @@ -74,16 +70,16 @@ calculate(std::shared_ptr config, ASSERT_CL(err); err = gefakernel.setArg(1, Buffer_pivot); ASSERT_CL(err); - err = gefakernel.setArg(2, static_cast(config->matrixSize >> 
LOCAL_MEM_BLOCK_LOG)); + err = gefakernel.setArg(2, static_cast(config.programSettings->matrixSize >> LOCAL_MEM_BLOCK_LOG)); ASSERT_CL(err); /* --- Execute actual benchmark kernels --- */ double t; std::vector executionTimes; - for (int i = 0; i < config->repetitions; i++) { + for (int i = 0; i < config.repetitions; i++) { compute_queue.enqueueWriteBuffer(Buffer_a, CL_TRUE, 0, - sizeof(HOST_DATA_TYPE)*config->matrixSize*config->matrixSize, A); + sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize, A); compute_queue.finish(); auto t1 = std::chrono::high_resolution_clock::now(); compute_queue.enqueueTask(gefakernel); @@ -98,16 +94,16 @@ calculate(std::shared_ptr config, /* --- Read back results from Device --- */ compute_queue.enqueueReadBuffer(Buffer_a, CL_TRUE, 0, - sizeof(HOST_DATA_TYPE)*config->matrixSize*config->matrixSize, A); + sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize, A); compute_queue.enqueueReadBuffer(Buffer_pivot, CL_TRUE, 0, - sizeof(cl_int)*config->matrixSize, ipvt); + sizeof(cl_int)*config.programSettings->matrixSize, ipvt); // Solve linear equations on CPU // TODO: This has to be done on FPGA - gesl_ref(A, b, ipvt, config->matrixSize, config->matrixSize); + linpack::gesl_ref(A, b, ipvt, config.programSettings->matrixSize, config.programSettings->matrixSize); - std::shared_ptr results( - new ExecutionTimings{executionTimes}); + std::unique_ptr results( + new LinpackExecutionTimings{executionTimes}); return results; } diff --git a/LINPACK/src/host/linpack_benchmark.cpp b/LINPACK/src/host/linpack_benchmark.cpp new file mode 100644 index 00000000..9a5a6a9c --- /dev/null +++ b/LINPACK/src/host/linpack_benchmark.cpp @@ -0,0 +1,258 @@ +// +// Created by Marius Meyer on 04.12.19. 
+// + +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "linpack_benchmark.hpp" + +/* C++ standard library headers */ +#include +#include + +/* Project's headers */ +#include "execution.h" +#include "parameters.h" + +linpack::LinpackProgramSettings::LinpackProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), + matrixSize(results["m"].as()) { + +} + +std::map +linpack::LinpackProgramSettings::getSettingsMap() { + auto map = hpcc_base::BaseSettings::getSettingsMap(); + map["Matrix Size"] = std::to_string(matrixSize); + return map; +} + +linpack::LinpackBenchmark::LinpackBenchmark(int argc, char* argv[]) { + setupBenchmark(argc, argv); +} + +linpack::LinpackBenchmark::LinpackBenchmark() {} + +void +linpack::LinpackBenchmark::addAdditionalParseOptions(cxxopts::Options &options) { + options.add_options() + ("m", "Matrix size in number of values in one dimension", + cxxopts::value()->default_value(std::to_string(DEFAULT_MATRIX_SIZE))); +} + +std::unique_ptr +linpack::LinpackBenchmark::executeKernel(LinpackData &data) { + return bm_execution::calculate(*executionSettings, data.A, data.b, data.ipvt); +} + +void +linpack::LinpackBenchmark::printResults(const linpack::LinpackExecutionTimings &output) { + std::cout << std::setw(ENTRY_SPACE) + << "best" << std::setw(ENTRY_SPACE) << "mean" + << std::setw(ENTRY_SPACE) << "GFLOPS" << std::endl; + + // Calculate performance for kernel execution plus data transfer + double tmean = 0; + double tmin = std::numeric_limits::max(); + + // GFLOPs for calculation of both GEFA and GESL. + // Currently only GEFA is calculated on the FPGA so GFLOPS have to be + // reduced. 
+ // double gflops = ((2.0e0*(dataSize*dataSize*dataSize))/3.0 + // + 2.0*(dataSize*dataSize)) / 1.0e9; + // TODO: Change this when GESL is also calculated on FPGA + double gflops = (2.0e0*(static_cast(executionSettings->programSettings->matrixSize) + *static_cast(executionSettings->programSettings->matrixSize) + *static_cast(executionSettings->programSettings->matrixSize)))/3.0/1.0e9; + for (double currentTime : output.timings) { + tmean += currentTime; + if (currentTime < tmin) { + tmin = currentTime; + } + } + tmean = tmean / output.timings.size(); + + std::cout << std::setw(ENTRY_SPACE) + << tmin << std::setw(ENTRY_SPACE) << tmean + << std::setw(ENTRY_SPACE) << (gflops / tmin) + << std::endl; + +} + +std::unique_ptr +linpack::LinpackBenchmark::generateInputData() { + auto d = std::unique_ptr(new linpack::LinpackData(executionSettings->programSettings->matrixSize)); + std::mt19937 gen(7); + std::uniform_real_distribution<> dis(-1.0, 1.0); + *norma = 0.0; + for (int j = 0; j < executionSettings->programSettings->matrixSize; j++) { + for (int i = 0; i < executionSettings->programSettings->matrixSize; i++) { + d->A[executionSettings->programSettings->matrixSize*i+j] = dis(gen); + *norma = (d->A[executionSettings->programSettings->matrixSize*i+j] > *norma) ? 
d->A[executionSettings->programSettings->matrixSize*i+j] : *norma; + } + } + for (int i = 0; i < executionSettings->programSettings->matrixSize; i++) { + d->b[i] = 0.0; + d->ipvt[i] = i; + } + for (int j = 0; j < executionSettings->programSettings->matrixSize; j++) { + for (int i = 0; i < executionSettings->programSettings->matrixSize; i++) { + d->b[j] += d->A[executionSettings->programSettings->matrixSize*j+i]; + } + } + return d; +} + +bool +linpack::LinpackBenchmark::validateOutputAndPrintError(linpack::LinpackData &data) { + uint n= executionSettings->programSettings->matrixSize; + auto newdata = generateInputData(); + for (int i = 0; i < n; i++) { + newdata->b[i] = -newdata->b[i]; + } + linpack::dmxpy(n, newdata->b, n, n, data.b, newdata->A); + HOST_DATA_TYPE resid = 0.0; + HOST_DATA_TYPE normx = 0.0; + + for (int i = 0; i < n; i++) { + resid = (resid > fabs(newdata->b[i])) ? resid : fabs(newdata->b[i]); + normx = (normx > fabs(data.b[i])) ? normx : fabs(data.b[i]); + } + + HOST_DATA_TYPE eps = std::numeric_limits::epsilon(); + HOST_DATA_TYPE residn = resid / (n*norma*normx*eps); + //std::cout << resid << ", " << norma << ", " << normx << std::endl; + std::cout << " norm. 
resid resid "\ + "machep x[0]-1 x[n-1]-1" << std::endl; + std::cout << std::setw(ENTRY_SPACE) << residn << std::setw(ENTRY_SPACE) + << resid << std::setw(ENTRY_SPACE) << eps + << std::setw(ENTRY_SPACE) << data.b[0]-1 << std::setw(ENTRY_SPACE) + << data.b[n-1]-1 << std::endl; + + return residn < 100; +} + +/** +Standard LU factorization with row pivoting on a block with fixed size (in place, see linpack::gefa_ref in the header) + +Case 1 of Zhang's description +*/ +void +linpack::gefa_ref(HOST_DATA_TYPE* a, unsigned n, unsigned lda, cl_int* ipvt) { + for (int i = 0; i < n; i++) { + ipvt[i] = i; + } + // For each diagonal element + for (int k = 0; k < n - 1; k++) { + HOST_DATA_TYPE max_val = fabs(a[k * lda + k]); + int pvt_index = k; + for (int i = k + 1; i < n; i++) { + if (max_val < fabs(a[i * lda + k])) { + pvt_index = i; + max_val = fabs(a[i * lda + k]); + } + } + + for (int i = k; i < n; i++) { + HOST_DATA_TYPE tmp_val = a[k * lda + i]; + a[k * lda + i] = a[pvt_index * lda + i]; + a[pvt_index * lda + i] = tmp_val; + } + ipvt[k] = pvt_index; + + // For each element below it + for (int i = k + 1; i < n; i++) { + a[i * lda + k] *= -1.0 / a[k * lda + k]; + } + // For each column right of current diagonal element + for (int j = k + 1; j < n; j++) { + // For each element below it + for (int i = k+1; i < n; i++) { + a[i * lda + j] += a[i * lda + k] * a[k * lda + j]; + } + } + +#ifdef DEBUG + std::cout << "A(k=" << k <<"): " << std::endl; + for (int i= 0; i < n; i++) { + for (int j=0; j < n; j++) { + std::cout << a[i*lda + j] << ", "; + } + std::cout << std::endl; + } + std::cout << std::endl; +#endif + + } +} + +void +linpack::gesl_ref(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, cl_int* ipvt, unsigned n, unsigned lda) { /* solves A*x = b from the LU factors in a; b is overwritten with x */ + auto b_tmp = new HOST_DATA_TYPE[n]; +#pragma omp parallel default(shared) + { +#pragma omp for + for (int k = 0; k < n; k++) { + b_tmp[k] = b[k]; + } + + // solve l*y = b + // For each row in matrix +#pragma omp single + for (int k = 0; k < n - 1; k++) { + if (ipvt[k] != k) { + HOST_DATA_TYPE tmp = b_tmp[k]; + b_tmp[k] = b_tmp[ipvt[k]]; + 
b_tmp[ipvt[k]] = tmp; + } + // For each row below add +#pragma omp parallel for + for (int i = k + 1; i < n; i++) { + // add solved upper row to current row + b_tmp[i] += b_tmp[k] * a[lda * i + k]; + } + } + + // now solve u*x = y +#pragma omp single + for (int k = n - 1; k >= 0; k--) { + b_tmp[k] = b_tmp[k] / a[lda * k + k]; +#pragma omp parallel for + for (int i = 0; i < k; i++) { + b_tmp[i] -= b_tmp[k] * a[lda * i + k]; + } + } +#pragma omp for + for (int k = 0; k < n; k++) { + b[k] = b_tmp[k]; + } + } + delete [] b_tmp; +} + +void linpack::dmxpy(unsigned n1, HOST_DATA_TYPE* y, unsigned n2, unsigned ldm, HOST_DATA_TYPE* x, HOST_DATA_TYPE* m) { /* y = y + m * x with m[ldm*i + j] as element (i, j) */ + for (int i=0; i < n1; i++) { + for (int j=0; j < n2; j++) { + y[i] = y[i] + x[j] * m[ldm*i + j]; + } + } +} diff --git a/LINPACK/src/host/linpack_benchmark.hpp b/LINPACK/src/host/linpack_benchmark.hpp new file mode 100644 index 00000000..3a64dbf7 --- /dev/null +++ b/LINPACK/src/host/linpack_benchmark.hpp @@ -0,0 +1,224 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#ifndef SRC_HOST_LINPACK_BENCHMARK_H_ +#define SRC_HOST_LINPACK_BENCHMARK_H_ + +/* C++ standard library headers */ +#include +#include + +/* Project's headers */ +#include "hpcc_benchmark.hpp" +#include "parameters.h" + +/** + * @brief Contains all classes and methods needed by the LINPACK benchmark + * + */ +namespace linpack { + +/** + * @brief The Linpack specific program settings + * + */ +class LinpackProgramSettings : public hpcc_base::BaseSettings { + +public: + /** + * @brief The size of the whole matrix + * + */ + uint matrixSize; + + /** + * @brief Construct a new Linpack Program Settings object + * + * @param results the result map from parsing the program input parameters + */ + LinpackProgramSettings(cxxopts::ParseResult &results); + + /** + * @brief Get a map of the settings. This map will be used to print the final configuration. + * + * @return a map of program parameters. keys are the name of the parameter. + */ + std::map getSettingsMap() override; + +}; + +/** + * @brief Data class containing the data the kernel is executed with + * + */ +class LinpackData { + +public: + HOST_DATA_TYPE *A, *b; /* A: size*size matrix, b: vector with size entries */ + cl_int* ipvt; /* pivot vector with size entries */ + + /** + * @brief Construct a new Linpack Data object + * + * @param size Size of the allocated square matrix and vectors + */ + LinpackData(uint size) { + posix_memalign(reinterpret_cast<void**>(&A), 4096, size * size * sizeof(HOST_DATA_TYPE)); + posix_memalign(reinterpret_cast<void**>(&b), 4096, size * sizeof(HOST_DATA_TYPE)); + posix_memalign(reinterpret_cast<void**>(&ipvt), 4096, size * sizeof(cl_int)); + } + + /** + * @brief Destroy the Linpack Data object.
Free the allocated memory + * + */ + ~LinpackData() { + free(A); + free(b); + free(ipvt); + } + +}; + +/** + * @brief Measured execution timing from the kernel execution + * + */ +class LinpackExecutionTimings { +public: + /** + * @brief A vector containing the timings for all repetitions for the kernel execution + * + */ + std::vector timings; + +}; + +/** + * @brief Implementation of the Linpack benchmark + * + */ +class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark { + +protected: + + /** + * @brief Additional input parameters of the Linpack benchmark + * + * @param options + */ + void + addAdditionalParseOptions(cxxopts::Options &options) override; + +public: + + /** + * @brief LINPACK specific implementation of the data generation + * + * @return std::unique_ptr The input and output data of the benchmark + */ + std::unique_ptr + generateInputData() override; + + /** + * @brief Linpack specific implementation of the kernel execution + * + * @param data The input and output data of the benchmark + * @return std::unique_ptr Measured runtimes of the kernel execution + */ + std::unique_ptr + executeKernel(LinpackData &data) override; + + /** + * @brief Linpack specific implementation of the execution validation + * + * @param data The input and output data of the benchmark + * @return true If validation is successful + * @return false otherwise + */ + bool + validateOutputAndPrintError(LinpackData &data) override; + + /** + * @brief Linpack specific implementation of printing the execution results + * + * @param output Measured runtimes of the kernel execution + */ + void + printResults(const LinpackExecutionTimings &output) override; + + /** + * @brief Construct a new Linpack Benchmark object + * + * @param argc the number of program input parameters + * @param argv the program input parameters as array of strings + */ + LinpackBenchmark(int argc, char* argv[]); + + /** + * @brief Construct a new Linpack Benchmark object + */ + LinpackBenchmark(); + +}; + 
+/** + * + * + * @param n1 + * @param y + * @param n2 + * @param ldm + * @param x + * @param m + */ +void dmxpy(unsigned n1, HOST_DATA_TYPE* y, unsigned n2, unsigned ldm, HOST_DATA_TYPE* x, HOST_DATA_TYPE* m); + +/** +Gaussian elimination reference implementation without pivoting. +Can be used in exchange with kernel functions for functionality testing + +@param a the matrix with size of n*n +@param n size of matrix A +@param lda row width of the matrix. must be >=n + +*/ +void gefa_ref(HOST_DATA_TYPE* a, unsigned n, unsigned lda, cl_int* ipvt); + +/** +Solve linear equations using its LU decomposition. +Therefore solves A*x = b by solving L*y = b and then U*x = y with A = LU +where A is a matrix of size n*n + +@param a the matrix a in LU representation calculated by gefa call +@param b vector b of the given equation +@param ipvt vector containing pivoting information +@param n size of matrix A +@param lda row width of the matrix. must be >=n + +*/ +void gesl_ref(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, cl_int* ipvt, unsigned n, unsigned lda); + +} // namespace linpack + + +#endif // SRC_HOST_STREAM_BENCHMARK_H_ diff --git a/LINPACK/tests/CMakeLists.txt b/LINPACK/tests/CMakeLists.txt index 8f4569a5..b2f6adc6 100755 --- a/LINPACK/tests/CMakeLists.txt +++ b/LINPACK/tests/CMakeLists.txt @@ -1,12 +1,3 @@ -# 'lib' is the folder with Google Test sources -add_subdirectory(../../extern/googletest ${CMAKE_CURRENT_BINARY_DIR}/lib) -include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) -include_directories(${CMAKE_BINARY_DIR}/src/common ../../extern/cxxopts/include ../../shared/) -include_directories(${CMAKE_SOURCE_DIR}/src/host) - - -set(PROJECT_SOURCES ../../shared/setup/fpga_setup.cpp ../src/host/common_benchmark_io_implementation.cpp ../src/host/execution_blocked_pvt.cpp ../src/host/linpack_functionality.cpp) -set(TEST_SOURCES ../../shared/testing/main.cpp ../../shared/setup/test_fpga_setup.cpp test_kernel_functionality_and_host_integration.cpp
test_kernel_functionality_separate_cores.cpp) set(BLA_VENDOR Intel10_64lp) find_package(LAPACK) @@ -16,34 +7,45 @@ if (NOT LAPACK_FOUND) endif() +add_subdirectory(../../extern/googletest ${CMAKE_BINARY_DIR}/lib/googletest) +include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) +include_directories(${CMAKE_BINARY_DIR}/src/common) + +set(HOST_EXE_NAME Linpack) +set(LIB_NAME lp) + +set(TEST_SOURCES main.cpp test_kernel_functionality_and_host_integration.cpp test_kernel_functionality_separate_cores.cpp) + if (INTELFPGAOPENCL_FOUND) - include_directories(${IntelFPGAOpenCL_INCLUDE_DIRS}) - add_executable(Test_intel ${TEST_SOURCES} ${PROJECT_SOURCES}) - target_link_libraries(Test_intel gtest gmock ${IntelFPGAOpenCL_LIBRARIES} "${OpenMP_CXX_FLAGS}") - target_compile_definitions(Test_intel PRIVATE -DINTEL_FPGA) + include_directories(SYSTEM ${IntelFPGAOpenCL_INCLUDE_DIRS}) + add_executable(${HOST_EXE_NAME}_test_intel ${TEST_SOURCES} ${PROJECT_SOURCES}) + target_link_libraries(${HOST_EXE_NAME}_test_intel gtest gmock ${LIB_NAME}_intel ${IntelFPGAOpenCL_LIBRARIES} "${OpenMP_CXX_FLAGS}") if (LAPACK_FOUND) - target_compile_definitions(Test_intel PRIVATE -D_INTEL_MKL_) - target_link_libraries(Test_intel ${LAPACK_LIBRARIES}) + target_compile_definitions(${HOST_EXE_NAME}_test_intel PRIVATE -D_INTEL_MKL_) + target_link_libraries(${HOST_EXE_NAME}_test_intel ${LAPACK_LIBRARIES}) include_directories(SYSTEM $ENV{MKLROOT}/include) endif() - add_dependencies(Test_intel lu_blocked_pvt_emulate_intel lu_blocked_pvt_test_emulate_intel) - target_compile_options(Test_intel PRIVATE "${OpenMP_CXX_FLAGS}") - add_test(NAME test_intel_unit COMMAND $ -f lu_blocked_pvt_emulate.aocx WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_dependencies(${HOST_EXE_NAME}_test_intel lu_blocked_pvt_emulate_intel) + add_dependencies(${HOST_EXE_NAME}_test_intel lu_blocked_pvt_test_emulate_intel) + target_compile_definitions(${HOST_EXE_NAME}_test_intel PRIVATE -DINTEL_FPGA) + 
target_compile_options(${HOST_EXE_NAME}_test_intel PRIVATE "${OpenMP_CXX_FLAGS}") + add_test(NAME ${HOST_EXE_NAME}_test_intel_unit COMMAND $ -f lu_blocked_pvt_emulate.aocx WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() if (Vitis_FOUND) - include_directories(${Vitis_INCLUDE_DIRS}) - add_executable(Test_xilinx ${TEST_SOURCES} ${PROJECT_SOURCES}) - target_link_libraries(Test_xilinx gtest gmock ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") - target_compile_definitions(Test_xilinx PRIVATE -DXILINX_FPGA) + include_directories(SYSTEM ${Vitis_INCLUDE_DIRS}) + add_executable(${HOST_EXE_NAME}_test_xilinx ${TEST_SOURCES} ${PROJECT_SOURCES}) + target_link_libraries(${HOST_EXE_NAME}_test_xilinx gtest gmock ${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") if (LAPACK_FOUND) - target_compile_definitions(Test_xilinx PRIVATE -D_INTEL_MKL_) - target_link_libraries(Test_xilinx ${LAPACK_LIBRARIES}) + target_compile_definitions(${HOST_EXE_NAME}_test_xilinx PRIVATE -D_INTEL_MKL_) + target_link_libraries(${HOST_EXE_NAME}_test_xilinx ${LAPACK_LIBRARIES}) include_directories(SYSTEM $ENV{MKLROOT}/include) endif() - add_dependencies(Test_xilinx lu_blocked_pvt_emulate_xilinx) + add_dependencies(${HOST_EXE_NAME}_test_xilinx lu_blocked_pvt_emulate_xilinx) # Disabled since compilation is not possible - #add_dependencies(Test_xilinx lu_blocked_pvt_test_emulate_xilinx) - target_compile_options(Test_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") - add_test(NAME test_xilinx_unit COMMAND $ -f lu_blocked_pvt_emulate.xclbin WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + #add_dependencies(${HOST_EXE_NAME}_test_xilinx lu_blocked_pvt_test_emulate_xilinx) + target_compile_definitions(${HOST_EXE_NAME}_test_xilinx PRIVATE -DXILINX_FPGA) + target_compile_options(${HOST_EXE_NAME}_test_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") + add_test(NAME ${HOST_EXE_NAME}_test_xilinx_unit COMMAND $ -f lu_blocked_pvt_emulate.xclbin WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() + diff --git a/LINPACK/tests/main.cpp 
b/LINPACK/tests/main.cpp new file mode 100644 index 00000000..d9a9b221 --- /dev/null +++ b/LINPACK/tests/main.cpp @@ -0,0 +1,72 @@ +/* +Copyright (c) 2020 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +/* Project's headers */ +#include "linpack_benchmark.hpp" + +#include "gtest/gtest.h" +#include "CL/cl.hpp" + +#ifdef _USE_MPI_ +#include "mpi.h" + +class MPIEnvironment : public ::testing::Environment { +public: + MPIEnvironment(int* argc, char** argv[]) { + MPI_Init(argc, argv); + } + + ~MPIEnvironment() override { + MPI_Finalize(); + } +}; +#endif + +using namespace linpack; + +std::unique_ptr bm; + +/** +The program entry point for the unit tests +*/ +int +main(int argc, char *argv[]) { + + std::cout << "THIS BINARY EXECUTES UNIT TESTS FOR THE FOLLOWING BENCHMARK:" << std::endl << std::endl; + + ::testing::InitGoogleTest(&argc, argv); + + bm = std::unique_ptr(new LinpackBenchmark(argc, argv)); + +#ifdef _USE_MPI_ + ::testing::Environment* const mpi_env = + ::testing::AddGlobalTestEnvironment(new MPIEnvironment(&argc, &argv)); +#endif + + bool result = RUN_ALL_TESTS(); + + bm = nullptr; + + return result; + +} + diff --git a/LINPACK/tests/test_program_settings.h b/LINPACK/tests/test_program_settings.h new file mode 100644 index 00000000..7c0338a8 --- /dev/null +++ b/LINPACK/tests/test_program_settings.h @@ -0,0 +1,27 @@ +/* +Copyright (c) 2020 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +/* Project's headers */ +#include "linpack_benchmark.hpp" + + +extern std::unique_ptr bm; From 8c58016a38a9db383e6fe7ace2973019f9189e67 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 19 May 2020 14:16:32 +0200 Subject: [PATCH 32/45] Fix Linpack xilinx tests --- LINPACK/Readme.md | 12 +- LINPACK/src/device/CMakeLists.txt | 8 +- LINPACK/src/host/execution.h | 2 +- LINPACK/src/host/execution_blocked_pvt.cpp | 6 +- LINPACK/src/host/linpack_benchmark.cpp | 16 +- LINPACK/src/host/linpack_benchmark.hpp | 7 +- LINPACK/src/host/main.cpp | 57 ++----- LINPACK/tests/main.cpp | 8 +- ...nel_functionality_and_host_integration.cpp | 158 +++--------------- ...st_kernel_functionality_separate_cores.cpp | 104 +++++------- LINPACK/tests/test_program_settings.h | 3 +- scripts/evaluation/parse_raw_to_csv.py | 2 +- 12 files changed, 113 insertions(+), 270 deletions(-) diff --git a/LINPACK/Readme.md b/LINPACK/Readme.md index b6507fa6..17982bdd 100644 --- a/LINPACK/Readme.md +++ b/LINPACK/Readme.md @@ -21,8 +21,8 @@ The targets below can be used to build the benchmark and its kernels, where `VEN | Target | Description | | --------------------- | ---------------------------------------------- | - | LINPACK_`VENDOR` | Builds the host application linking with the Intel SDK| - | Test_`VENDOR` | Compile the tests and its dependencies linking with the Intel SDK | + | Linpack_`VENDOR` | Builds the host application linking with the Intel SDK| + | Linpack_test_`VENDOR` | Compile the tests and its dependencies linking with the Intel SDK | More over there are additional targets to generate kernel reports and bitstreams. The provided kernel is optimized for the Bittware 520N board equipped with Stratix 10. 
@@ -72,15 +72,15 @@ in the `CMakeCache.txt` located in the build directory after running cmake. For execution of the benchmark run: - ./LINPACK_intel -f path_to_kernel.aocx + ./Linpack_intel -f path_to_kernel.aocx For more information on available input parameters run - ./LINPACK_intel -h + ./Linpack_intel -h Implementation of the LINPACK benchmark proposed in the HPCC benchmark suite for FPGA. Usage: - ./LINPACK_intel [OPTION...] + ./Linpack_intel [OPTION...] -f, --file arg Kernel file name -n, arg Number of repetitions (default: 10) @@ -96,7 +96,7 @@ For more information on available input parameters run To execute the unit and integration tests for Intel devices run - CL_CONTEXT_EMULATOR_DEVICE=1 ./Test_intel + CL_CONTEXT_EMULATOR_DEVICE=1 ./Linpack_test_intel -f KERNEL_FILE_NAME in the `bin` folder within the build directory. It will run an emulation of the kernel and execute some functionality tests. diff --git a/LINPACK/src/device/CMakeLists.txt b/LINPACK/src/device/CMakeLists.txt index 2bc96d03..cadaaf42 100644 --- a/LINPACK/src/device/CMakeLists.txt +++ b/LINPACK/src/device/CMakeLists.txt @@ -3,17 +3,17 @@ include(${CMAKE_SOURCE_DIR}/../cmake/kernelTargets.cmake) if (INTELFPGAOPENCL_FOUND) generate_kernel_targets_intel(lu_blocked_pvt lu_blocked_pvt_test) - add_test(NAME test_emulation_intel COMMAND LINPACK_intel -f lu_blocked_pvt_emulate.aocx -s 128 -n 1 + add_test(NAME test_emulation_intel COMMAND Linpack_intel -f lu_blocked_pvt_emulate.aocx -s 128 -n 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) - add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./LINPACK_intel -f lu_blocked_pvt_emulate.aocx -s 128 -n 1 + add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Linpack_intel -f lu_blocked_pvt_emulate.aocx -s 128 -n 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() if (VITIS_FOUND) generate_kernel_targets_xilinx(lu_blocked_pvt) - 
add_test(NAME test_emulation_xilinx COMMAND LINPACK_xilinx -f lu_blocked_pvt_emulate.xclbin -s 128 -n 1 + add_test(NAME test_emulation_xilinx COMMAND Linpack_xilinx -f lu_blocked_pvt_emulate.xclbin -s 128 -n 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) - add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./LINPACK_xilinx -f lu_blocked_pvt_emulate.xclbin -s 128 -n 1 + add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Linpack_xilinx -f lu_blocked_pvt_emulate.xclbin -s 128 -n 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() diff --git a/LINPACK/src/host/execution.h b/LINPACK/src/host/execution.h index c702f719..34fd5882 100644 --- a/LINPACK/src/host/execution.h +++ b/LINPACK/src/host/execution.h @@ -45,7 +45,7 @@ simple exchange of the different calculation methods. @return The resulting matrix */ std::unique_ptr - calculate(const hpcc_base::ExecutionConfiguration& config, + calculate(const hpcc_base::ExecutionSettings& config, HOST_DATA_TYPE* A, HOST_DATA_TYPE* b, cl_int* ipvt); diff --git a/LINPACK/src/host/execution_blocked_pvt.cpp b/LINPACK/src/host/execution_blocked_pvt.cpp index e5065745..c6ea99d4 100644 --- a/LINPACK/src/host/execution_blocked_pvt.cpp +++ b/LINPACK/src/host/execution_blocked_pvt.cpp @@ -43,7 +43,7 @@ namespace bm_execution { @copydoc bm_execution::calculate() */ std::unique_ptr -calculate(const hpcc_base::ExecutionConfiguration&config, +calculate(const hpcc_base::ExecutionSettings&config, HOST_DATA_TYPE* A, HOST_DATA_TYPE* b, cl_int* ipvt) { @@ -77,7 +77,7 @@ calculate(const hpcc_base::ExecutionConfiguration executionTimes; - for (int i = 0; i < config.repetitions; i++) { + for (int i = 0; i < config.programSettings->numRepetitions; i++) { compute_queue.enqueueWriteBuffer(Buffer_a, CL_TRUE, 0, sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize, A); compute_queue.finish(); 
@@ -103,7 +103,7 @@ calculate(const hpcc_base::ExecutionConfigurationmatrixSize, config.programSettings->matrixSize); std::unique_ptr results( - new LinpackExecutionTimings{executionTimes}); + new linpack::LinpackExecutionTimings{executionTimes}); return results; } diff --git a/LINPACK/src/host/linpack_benchmark.cpp b/LINPACK/src/host/linpack_benchmark.cpp index 9a5a6a9c..deab852b 100644 --- a/LINPACK/src/host/linpack_benchmark.cpp +++ b/LINPACK/src/host/linpack_benchmark.cpp @@ -35,7 +35,7 @@ SOFTWARE. #include "parameters.h" linpack::LinpackProgramSettings::LinpackProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), - matrixSize(results["m"].as()) { + matrixSize(results["s"].as()) { } @@ -55,7 +55,7 @@ linpack::LinpackBenchmark::LinpackBenchmark() {} void linpack::LinpackBenchmark::addAdditionalParseOptions(cxxopts::Options &options) { options.add_options() - ("m", "Matrix size in number of values in one dimension", + ("s", "Matrix size in number of values in one dimension", cxxopts::value()->default_value(std::to_string(DEFAULT_MATRIX_SIZE))); } @@ -103,11 +103,11 @@ linpack::LinpackBenchmark::generateInputData() { auto d = std::unique_ptr(new linpack::LinpackData(executionSettings->programSettings->matrixSize)); std::mt19937 gen(7); std::uniform_real_distribution<> dis(-1.0, 1.0); - *norma = 0.0; + d->norma = 0.0; for (int j = 0; j < executionSettings->programSettings->matrixSize; j++) { for (int i = 0; i < executionSettings->programSettings->matrixSize; i++) { d->A[executionSettings->programSettings->matrixSize*i+j] = dis(gen); - *norma = (d->A[executionSettings->programSettings->matrixSize*i+j] > *norma) ? d->A[executionSettings->programSettings->matrixSize*i+j] : *norma; + d->norma = (d->A[executionSettings->programSettings->matrixSize*i+j] > d->norma) ? 
d->A[executionSettings->programSettings->matrixSize*i+j] : d->norma; } } for (int i = 0; i < executionSettings->programSettings->matrixSize; i++) { @@ -139,7 +139,7 @@ linpack::LinpackBenchmark::validateOutputAndPrintError(linpack::LinpackData &dat } HOST_DATA_TYPE eps = std::numeric_limits::epsilon(); - HOST_DATA_TYPE residn = resid / (n*norma*normx*eps); + HOST_DATA_TYPE residn = resid / (n*newdata->norma*normx*eps); //std::cout << resid << ", " << norma << ", " << normx << std::endl; std::cout << " norm. resid resid "\ "machep x[0]-1 x[n-1]-1" << std::endl; @@ -157,7 +157,7 @@ Standard LU factorization on a block with fixed size Case 1 of Zhangs description */ void -gefa_ref(HOST_DATA_TYPE* a, unsigned n, unsigned lda, cl_int* ipvt) { +linpack::gefa_ref(HOST_DATA_TYPE* a, unsigned n, unsigned lda, cl_int* ipvt) { for (int i = 0; i < n; i++) { ipvt[i] = i; } @@ -206,7 +206,7 @@ gefa_ref(HOST_DATA_TYPE* a, unsigned n, unsigned lda, cl_int* ipvt) { } void -gesl_ref(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, cl_int* ipvt, unsigned n, unsigned lda) { +linpack::gesl_ref(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, cl_int* ipvt, unsigned n, unsigned lda) { auto b_tmp = new HOST_DATA_TYPE[n]; #pragma omp parallel default(shared) { @@ -249,7 +249,7 @@ gesl_ref(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, cl_int* ipvt, unsigned n, unsigne delete [] b_tmp; } -void dmxpy(unsigned n1, HOST_DATA_TYPE* y, unsigned n2, unsigned ldm, HOST_DATA_TYPE* x, HOST_DATA_TYPE* m) { +void linpack::dmxpy(unsigned n1, HOST_DATA_TYPE* y, unsigned n2, unsigned ldm, HOST_DATA_TYPE* x, HOST_DATA_TYPE* m) { for (int i=0; i < n1; i++) { for (int j=0; j < n2; j++) { y[i] = y[i] + x[j] * m[ldm*i + j]; diff --git a/LINPACK/src/host/linpack_benchmark.hpp b/LINPACK/src/host/linpack_benchmark.hpp index 3a64dbf7..355fa7c6 100644 --- a/LINPACK/src/host/linpack_benchmark.hpp +++ b/LINPACK/src/host/linpack_benchmark.hpp @@ -73,15 +73,16 @@ class LinpackProgramSettings : public hpcc_base::BaseSettings { class LinpackData { 
public: - HOST_DATA_TYPE *A, *B; + HOST_DATA_TYPE *A, *b; cl_int* ipvt; + HOST_DATA_TYPE norma; /** * @brief Construct a new Linpack Data object * * @param size Size of the allocated square matrix and vectors */ - LinpackData(uint size) { + LinpackData(uint size) : norma(0.0) { posix_memalign(reinterpret_cast(&A), 4096, size * size * sizeof(HOST_DATA_TYPE)); posix_memalign(reinterpret_cast(&b), 4096, size * sizeof(HOST_DATA_TYPE)); posix_memalign(reinterpret_cast(&ipvt), 4096, size * sizeof(cl_int)); @@ -93,7 +94,7 @@ class LinpackData { */ ~LinpackData() { free(A); - free(B); + free(b); free(ipvt); } diff --git a/LINPACK/src/host/main.cpp b/LINPACK/src/host/main.cpp index f1009df9..dbc50c9f 100644 --- a/LINPACK/src/host/main.cpp +++ b/LINPACK/src/host/main.cpp @@ -2,10 +2,9 @@ // Created by Marius Meyer on 04.12.19. // -#include "linpack_functionality.hpp" -#include "setup/fpga_setup.hpp" -#include "setup/common_benchmark_io.hpp" -#include "execution.h" +#include "linpack_benchmark.hpp" + +using namespace linpack; /** The program entry point @@ -13,47 +12,13 @@ The program entry point int main(int argc, char *argv[]) { // Setup benchmark - std::shared_ptr programSettings = - parseProgramParameters(argc, argv); - fpga_setup::setupEnvironmentAndClocks(); - std::vector usedDevice = - fpga_setup::selectFPGADevice(programSettings->defaultPlatform, - programSettings->defaultDevice); - cl::Context context = cl::Context(usedDevice); - cl::Program program = fpga_setup::fpgaSetup(&context, usedDevice, - &programSettings->kernelFileName); - - printFinalConfiguration(programSettings, usedDevice[0]); - - std::shared_ptr config( - new bm_execution::ExecutionConfiguration { - context, usedDevice[0], program, - programSettings->numRepetitions, - programSettings->matrixSize - }); - - HOST_DATA_TYPE *A, *b; - cl_int* ipvt; - - posix_memalign(reinterpret_cast(&A), 4096, programSettings->matrixSize * - programSettings->matrixSize * sizeof(HOST_DATA_TYPE)); - 
posix_memalign(reinterpret_cast(&b), 4096, programSettings->matrixSize * sizeof(HOST_DATA_TYPE)); - posix_memalign(reinterpret_cast(&ipvt), 4096, programSettings->matrixSize * sizeof(cl_int)); - - - HOST_DATA_TYPE norma; - generateInputData(A, b, ipvt, programSettings->matrixSize, &norma); - - auto timing = bm_execution::calculate(config, A, b, ipvt); - - double error = checkLINPACKresults(b, programSettings->matrixSize); - - free(A); - free(b); - free(ipvt); - - printResults(timing, programSettings->matrixSize); - - return error < 100 ? 0 : 1; + auto bm = LinpackBenchmark(argc, argv); + bool success = bm.executeBenchmark(); + if (success) { + return 0; + } + else { + return 1; + } } diff --git a/LINPACK/tests/main.cpp b/LINPACK/tests/main.cpp index d9a9b221..a61f0623 100644 --- a/LINPACK/tests/main.cpp +++ b/LINPACK/tests/main.cpp @@ -43,7 +43,8 @@ class MPIEnvironment : public ::testing::Environment { using namespace linpack; -std::unique_ptr bm; +int global_argc; +char** global_argv; /** The program entry point for the unit tests @@ -55,7 +56,8 @@ main(int argc, char *argv[]) { ::testing::InitGoogleTest(&argc, argv); - bm = std::unique_ptr(new LinpackBenchmark(argc, argv)); + global_argc = argc; + global_argv = argv; #ifdef _USE_MPI_ ::testing::Environment* const mpi_env = @@ -64,8 +66,6 @@ main(int argc, char *argv[]) { bool result = RUN_ALL_TESTS(); - bm = nullptr; - return result; } diff --git a/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp b/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp index c6a83de0..a2722188 100644 --- a/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp +++ b/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp @@ -3,63 +3,40 @@ // #include "gtest/gtest.h" #include "parameters.h" -#include "../src/host/execution.h" -#include "setup/fpga_setup.hpp" -#include "testing/test_program_settings.h" -#include "../src/host/linpack_functionality.hpp" +#include "test_program_settings.h" 
#ifdef _INTEL_MKL_ #include "mkl.h" #endif -struct OpenCLKernelTest : testing::Test { - HOST_DATA_TYPE *A; - HOST_DATA_TYPE *b; - cl_int *ipvt; - std::shared_ptr config; - cl_uint array_size; - std::string lastKernelFileName; - OpenCLKernelTest() { - array_size = (1 << LOCAL_MEM_BLOCK_LOG); - posix_memalign(reinterpret_cast(&A), 64, - sizeof(HOST_DATA_TYPE) * array_size * array_size); - posix_memalign(reinterpret_cast(&b), 64, - sizeof(HOST_DATA_TYPE) * array_size); - posix_memalign(reinterpret_cast(&ipvt), 64, - sizeof(cl_int) * array_size); - setupFPGA(programSettings->kernelFileName); +struct LinpackKernelTest : testing::Test { + + std::unique_ptr bm; + std::unique_ptr data; + uint array_size = 0; + + void SetUp() override { + bm = std::unique_ptr(new linpack::LinpackBenchmark(global_argc, global_argv)); + bm->getExecutionSettings().programSettings->matrixSize = 1 << LOCAL_MEM_BLOCK_LOG; + data = bm->generateInputData(); + array_size = bm->getExecutionSettings().programSettings->matrixSize; } - void setupFPGA(std::string kernelFileName) { - lastKernelFileName = kernelFileName; - std::vector device = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, programSettings->defaultDevice); - cl::Context context(device[0]); - cl::Program program = fpga_setup::fpgaSetup(&context, device, &kernelFileName); - config = std::make_shared( - bm_execution::ExecutionConfiguration{ - context, device[0], program, - 1, - array_size, - }); - HOST_DATA_TYPE norm; - generateInputData(A, b, ipvt, array_size, &norm); + void TearDown() override { + bm = nullptr; + data = nullptr; } - ~OpenCLKernelTest() override { - free(A); - free(b); - free(ipvt); - } }; /** * Execution returns correct results for a single repetition */ -TEST_F(OpenCLKernelTest, FPGACorrectResultsOneRepetition) { +TEST_F(LinpackKernelTest, FPGACorrectResultsOneRepetition) { - auto result = bm_execution::calculate(config, A, b, ipvt); + auto result = bm->executeKernel(*data); for (int i = 0; i < array_size; 
i++) { - EXPECT_NEAR(b[i], 1.0, 1.0e-3); + EXPECT_NEAR(data->b[i], 1.0, 1.0e-3); } } @@ -67,109 +44,24 @@ TEST_F(OpenCLKernelTest, FPGACorrectResultsOneRepetition) { /** * Execution returns correct results for a single repetition */ -TEST_F(OpenCLKernelTest, FPGASimilarResultsToLAPACKforSingleBlock) { +TEST_F(OpenCLKernelTest, ValidationWorksForMKL) { - auto result = bm_execution::calculate(config, A, b, ipvt); int info; - HOST_DATA_TYPE* bcpu = new HOST_DATA_TYPE[array_size]; - HOST_DATA_TYPE* Acpu = new HOST_DATA_TYPE[array_size * array_size]; - HOST_DATA_TYPE norm; - generateInputData(A, bcpu, ipvt, array_size, &norm); + auto data_cpu = bm->generateInputData(); for (int i=0; iA[i * array_size + j] = data->A[j* array_size + i]; } } int s = static_cast(array_size); int lrhs = 1; #ifndef _DP - sgesv(&s, &lrhs, Acpu, &s, ipvt, bcpu, &s, &info); + sgesv(&s, &lrhs, data_cpu->A, &s, data_cpu->ipvt, data_cpu->b, &s, &info); #else - dgesv(&s, &lrhs, Acpu, &s, ipvt, bcpu, &s, &info); + dgesv(&s, &lrhs, data_cpu->A, &s, data_cpu->ipvt, data_cpu->b, &s, &info); #endif - double error_emu = checkLINPACKresults(b, array_size); - double error_cpu = checkLINPACKresults(bcpu, array_size); - EXPECT_LE(error_emu, error_cpu+ 1.0); - delete [] Acpu; - delete [] bcpu; -} - -/** - * Execution of reference implementation returns correct results for a single repetition - */ -TEST_F(OpenCLKernelTest, FPGAReferenceImplSimilarToMKL) { - - gefa_ref(A, config->matrixSize, config->matrixSize, ipvt); - gesl_ref(A, b, ipvt, config->matrixSize, config->matrixSize); - int info; - HOST_DATA_TYPE* bcpu = new HOST_DATA_TYPE[array_size]; - HOST_DATA_TYPE* Acpu = new HOST_DATA_TYPE[array_size * array_size]; - HOST_DATA_TYPE norm; - generateInputData(A, bcpu, ipvt, array_size, &norm); - for (int i=0; i(array_size); - int lrhs = 1; -#ifndef _DP - sgesv(&s, &lrhs, Acpu, &s, ipvt, bcpu, &s, &info); -#else - dgesv(&s, &lrhs, Acpu, &s, ipvt, bcpu, &s, &info); -#endif - double error_emu = 
checkLINPACKresults(b, array_size); - double error_cpu = checkLINPACKresults(bcpu, array_size); - EXPECT_LE(error_emu, error_cpu+ 1.0); - delete [] Acpu; - delete [] bcpu; -} - - -/** - * Execution returns correct results for a single repetition - */ -// TODO this test fails most likely because of inreasing errors in C2. Use partial pivoting or other mechanisms -// to make the calculation stable again! -// Remove DISABLED_ from test name to enable the test again. -TEST_F(OpenCLKernelTest, DISABLED_FPGASimilarResultsToLAPACKforMultipleBlocks) { - free(A); - free(b); - free(ipvt); - array_size = 4 * (1 << LOCAL_MEM_BLOCK_LOG); - posix_memalign(reinterpret_cast(&A), 64, - sizeof(HOST_DATA_TYPE) * array_size * array_size); - posix_memalign(reinterpret_cast(&b), 64, - sizeof(HOST_DATA_TYPE) * array_size); - posix_memalign(reinterpret_cast(&ipvt), 64, - sizeof(cl_int) * array_size); - setupFPGA(lastKernelFileName); - auto result = bm_execution::calculate(config, A, b, ipvt); - - int info; - HOST_DATA_TYPE* bcpu = new HOST_DATA_TYPE[array_size]; - HOST_DATA_TYPE* Acpu = new HOST_DATA_TYPE[array_size * array_size]; - HOST_DATA_TYPE norm; - generateInputData(A, bcpu, ipvt, array_size, &norm); - for (int i=0; i(array_size); - int lrhs = 1; -#ifndef _DP - sgesv(&s, &lrhs, Acpu, &s, ipvt, bcpu, &s, &info); -#else - dgesv(&s, &lrhs, Acpu, &s, ipvt, bcpu, &s, &info); -#endif - double error_emu = checkLINPACKresults(b, array_size); - double error_cpu = checkLINPACKresults(bcpu, array_size); - - EXPECT_LE(error_emu, error_cpu + 1.0); - - delete [] Acpu; - delete [] bcpu; + bool success = bm->validateOutputAndPrintError(*data_cpu); + EXPECT_TRUE(success); } diff --git a/LINPACK/tests/test_kernel_functionality_separate_cores.cpp b/LINPACK/tests/test_kernel_functionality_separate_cores.cpp index b2f875c5..fb761b7c 100644 --- a/LINPACK/tests/test_kernel_functionality_separate_cores.cpp +++ b/LINPACK/tests/test_kernel_functionality_separate_cores.cpp @@ -3,23 +3,27 @@ // #include 
"gtest/gtest.h" #include "parameters.h" -#include "../src/host/execution.h" -#include "setup/fpga_setup.hpp" -#include "../src/host/linpack_functionality.hpp" +#include "linpack_benchmark.hpp" #include #ifdef _INTEL_MKL_ #include "mkl.h" #endif -struct OpenCLKernelSeparateTest : testing::Test { +struct LinpackKernelSeparateTest : testing::Test, testing::WithParamInterface { + HOST_DATA_TYPE *A, *B, *C, *scale; cl_int *ipvt; - std::shared_ptr config; - cl_uint array_size; - std::string lastKernelFileName; + uint array_size; + std::unique_ptr bm; - OpenCLKernelSeparateTest() { + LinpackKernelSeparateTest() { + int argc = 3; + std::string str_param = GetParam(); + std::vector param(str_param.c_str(), str_param.c_str() + str_param.size() + 1); + char* argv[3] = {"Test", "-f", reinterpret_cast(¶m)}; array_size = (1 << LOCAL_MEM_BLOCK_LOG); + bm->getExecutionSettings().programSettings->matrixSize = array_size; + bm = std::unique_ptr(new linpack::LinpackBenchmark(argc, argv)); posix_memalign(reinterpret_cast(&A), 4096, sizeof(HOST_DATA_TYPE) * array_size * array_size); posix_memalign(reinterpret_cast(&B), 4096, @@ -29,20 +33,7 @@ struct OpenCLKernelSeparateTest : testing::Test { posix_memalign(reinterpret_cast(&scale), 4096, sizeof(HOST_DATA_TYPE) * array_size ); posix_memalign(reinterpret_cast(&ipvt), 4096, - sizeof(cl_int) * array_size); - } - - void setupFPGA(std::string kernelFileName) { - lastKernelFileName = kernelFileName; - std::vector device = fpga_setup::selectFPGADevice(DEFAULT_PLATFORM, DEFAULT_DEVICE); - cl::Context context(device[0]); - cl::Program program = fpga_setup::fpgaSetup(&context, device, &kernelFileName); - config = std::make_shared( - bm_execution::ExecutionConfiguration{ - context, device[0], program, - 1, - array_size, - }); + sizeof(cl_int) * array_size); } void initializeData() { @@ -69,22 +60,22 @@ struct OpenCLKernelSeparateTest : testing::Test { int err; // Create Command queue - cl::CommandQueue compute_queue(config->context, 
config->device); + cl::CommandQueue compute_queue(*bm->getExecutionSettings().context, *bm->getExecutionSettings().device); // Create Buffers for input and output - cl::Buffer Buffer_a(config->context, CL_MEM_READ_WRITE, - sizeof(HOST_DATA_TYPE)*config->matrixSize*config->matrixSize); - cl::Buffer Buffer_b(config->context, CL_MEM_READ_WRITE, - sizeof(HOST_DATA_TYPE)*config->matrixSize*config->matrixSize); - cl::Buffer Buffer_c(config->context, CL_MEM_READ_WRITE, - sizeof(HOST_DATA_TYPE)*config->matrixSize*config->matrixSize); - cl::Buffer Buffer_scale(config->context, CL_MEM_READ_WRITE, - sizeof(HOST_DATA_TYPE)*config->matrixSize); - cl::Buffer Buffer_pivot(config->context, CL_MEM_READ_WRITE, - sizeof(cl_int)*config->matrixSize); + cl::Buffer Buffer_a(*bm->getExecutionSettings().context, CL_MEM_READ_WRITE, + sizeof(HOST_DATA_TYPE)*array_size*array_size); + cl::Buffer Buffer_b(*bm->getExecutionSettings().context, CL_MEM_READ_WRITE, + sizeof(HOST_DATA_TYPE)*array_size*array_size); + cl::Buffer Buffer_c(*bm->getExecutionSettings().context, CL_MEM_READ_WRITE, + sizeof(HOST_DATA_TYPE)*array_size*array_size); + cl::Buffer Buffer_scale(*bm->getExecutionSettings().context, CL_MEM_READ_WRITE, + sizeof(HOST_DATA_TYPE)*array_size); + cl::Buffer Buffer_pivot(*bm->getExecutionSettings().context, CL_MEM_READ_WRITE, + sizeof(cl_int)*array_size); // create the kernels - cl::Kernel test_c4_kernel(config->program, kernel_name.c_str(), + cl::Kernel test_c4_kernel(*bm->getExecutionSettings().program, kernel_name.c_str(), &err); ASSERT_CL(err); @@ -100,24 +91,24 @@ struct OpenCLKernelSeparateTest : testing::Test { ASSERT_CL(err); err = test_c4_kernel.setArg(4, Buffer_pivot); ASSERT_CL(err); - err = test_c4_kernel.setArg(5, static_cast(config->matrixSize >> LOCAL_MEM_BLOCK_LOG)); + err = test_c4_kernel.setArg(5, static_cast(array_size >> LOCAL_MEM_BLOCK_LOG)); ASSERT_CL(err); /* --- Execute actual benchmark kernels --- */ double t; std::vector executionTimes; - for (int i = 0; i < 
config->repetitions; i++) { + for (int i = 0; i < bm->getExecutionSettings().programSettings->numRepetitions; i++) { compute_queue.enqueueWriteBuffer(Buffer_a, CL_TRUE, 0, - sizeof(HOST_DATA_TYPE)*config->matrixSize*config->matrixSize, A); + sizeof(HOST_DATA_TYPE)*array_size*array_size, A); compute_queue.enqueueWriteBuffer(Buffer_b, CL_TRUE, 0, - sizeof(HOST_DATA_TYPE)*config->matrixSize*config->matrixSize, B); + sizeof(HOST_DATA_TYPE)*array_size*array_size, B); compute_queue.enqueueWriteBuffer(Buffer_c, CL_TRUE, 0, - sizeof(HOST_DATA_TYPE)*config->matrixSize*config->matrixSize, C); + sizeof(HOST_DATA_TYPE)*array_size*array_size, C); compute_queue.enqueueWriteBuffer(Buffer_scale, CL_TRUE, 0, - sizeof(HOST_DATA_TYPE)*config->matrixSize, scale); + sizeof(HOST_DATA_TYPE)*array_size, scale); compute_queue.enqueueWriteBuffer(Buffer_pivot, CL_TRUE, 0, - sizeof(cl_int)*config->matrixSize, ipvt); + sizeof(cl_int)*array_size, ipvt); compute_queue.finish(); auto t1 = std::chrono::high_resolution_clock::now(); compute_queue.enqueueTask(test_c4_kernel); @@ -131,33 +122,26 @@ struct OpenCLKernelSeparateTest : testing::Test { /* --- Read back results from Device --- */ compute_queue.enqueueReadBuffer(Buffer_a, CL_TRUE, 0, - sizeof(HOST_DATA_TYPE)*config->matrixSize*config->matrixSize, A); + sizeof(HOST_DATA_TYPE)*array_size*array_size, A); compute_queue.enqueueReadBuffer(Buffer_b, CL_TRUE, 0, - sizeof(HOST_DATA_TYPE)*config->matrixSize*config->matrixSize, B); + sizeof(HOST_DATA_TYPE)*array_size*array_size, B); compute_queue.enqueueReadBuffer(Buffer_c, CL_TRUE, 0, - sizeof(HOST_DATA_TYPE)*config->matrixSize*config->matrixSize, C); + sizeof(HOST_DATA_TYPE)*array_size*array_size, C); } - ~OpenCLKernelSeparateTest() override { + ~LinpackKernelSeparateTest() override { free(A); free(B); free(C); free(ipvt); - } -}; - -struct DifferentOpenCLKernelSeparateTest : OpenCLKernelSeparateTest, testing::WithParamInterface { - DifferentOpenCLKernelSeparateTest() { - auto params = GetParam(); - 
auto kernel_file = params; - setupFPGA(kernel_file); + free(scale); } }; /** * Execution returns correct results for a single repetition */ -TEST_P(DifferentOpenCLKernelSeparateTest, FPGACorrectResultsForC1) { +TEST_P(LinpackKernelSeparateTest, FPGACorrectResultsForC1) { auto a_result = new HOST_DATA_TYPE[array_size * array_size]; initializeData(); executeTest("test_c1"); @@ -167,7 +151,7 @@ TEST_P(DifferentOpenCLKernelSeparateTest, FPGACorrectResultsForC1) { } } initializeData(); - gefa_ref(A, array_size, array_size, ipvt); + linpack::gefa_ref(A, array_size, array_size, ipvt); double error = 0.0; for (int i=0; i bm; +extern int global_argc; +extern char** global_argv; \ No newline at end of file diff --git a/scripts/evaluation/parse_raw_to_csv.py b/scripts/evaluation/parse_raw_to_csv.py index ba8b4ecf..bc6ba6e3 100755 --- a/scripts/evaluation/parse_raw_to_csv.py +++ b/scripts/evaluation/parse_raw_to_csv.py @@ -14,7 +14,7 @@ ra_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+Error:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GUOPS\n\\s+(?P.+)\\s+(?P.+)\\s+(?P.+)" trans_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s*Maximum\\serror:\\s+(?P(\d|\.|\+|-|e)+)\n\\s+trans\\s+calc\\s+calc\\s+FLOPS\\s+total\\s+FLOPS\n\\s*avg:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s*best:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" stream_regex = 
"Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+\\d+\\s+\\((?P(\d|\.|\+|-|e)+)(.*\n)+Data\\sType\\s+(?P.+)\n(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Kernel\\sType\\s+(?P.+)\n(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+Function\\s+Best\\sRate\\sMB/s\\s+Avg\\stime\\ss\\s+Min\\stime\\s+Max\\stime\n\\s+Add\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Copy\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+PCI\\sread\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+PCI\\swrite\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Scale\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Triad\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" -linpack_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize:\\s+(?P\d+)(.*\n)+Data\\sType:\\s+(?P.+)\n(.*\n)+Device:\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep.+\n\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GFLOPS(.*\n)+\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" +linpack_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep.+\n\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GFLOPS(.*\n)+\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" def parse_network(file_content): From cba02fd4525c82c92dc013108b239e4c65739ff0 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 19 May 2020 14:20:56 +0200 Subject: [PATCH 33/45] Delete old Linpack files --- .../common_benchmark_io_implementation.cpp | 86 ------ LINPACK/src/host/execution_blocked.cpp | 163 ------------ LINPACK/src/host/linpack_functionality.cpp | 249 
------------------ LINPACK/src/host/linpack_functionality.hpp | 95 ------- LINPACK/src/host/program_settings.h | 34 --- 5 files changed, 627 deletions(-) delete mode 100644 LINPACK/src/host/common_benchmark_io_implementation.cpp delete mode 100644 LINPACK/src/host/execution_blocked.cpp delete mode 100644 LINPACK/src/host/linpack_functionality.cpp delete mode 100644 LINPACK/src/host/linpack_functionality.hpp delete mode 100644 LINPACK/src/host/program_settings.h diff --git a/LINPACK/src/host/common_benchmark_io_implementation.cpp b/LINPACK/src/host/common_benchmark_io_implementation.cpp deleted file mode 100644 index a243dc1c..00000000 --- a/LINPACK/src/host/common_benchmark_io_implementation.cpp +++ /dev/null @@ -1,86 +0,0 @@ - -#include "cxxopts.hpp" -#include "parameters.h" -#include "setup/common_benchmark_io.hpp" - -/** -Parses and returns program options using the cxxopts library. -Supports the following parameters: - - file name of the FPGA kernel file (-f,--file) - - number of repetitions (-n) - - number of kernel replications (-r) - - data size (-d) - - use memory interleaving -@see https://github.com/jarro2783/cxxopts - -@return program settings that are created from the given program arguments -*/ -std::shared_ptr -parseProgramParameters(int argc, char *argv[]) { - // Defining and parsing program options - cxxopts::Options options(argv[0], PROGRAM_DESCRIPTION); - options.add_options() - ("f,file", "Kernel file name", cxxopts::value()) - ("n", "Number of repetitions", - cxxopts::value()->default_value(std::to_string(DEFAULT_REPETITIONS))) - ("s", "Size of the data arrays", - cxxopts::value()->default_value(std::to_string(DEFAULT_MATRIX_SIZE))) - ("device", "Index of the device that has to be used. If not given you "\ - "will be asked which device to use if there are multiple devices "\ - "available.", cxxopts::value()->default_value(std::to_string(DEFAULT_DEVICE))) - ("platform", "Index of the platform that has to be used. 
If not given "\ - "you will be asked which platform to use if there are multiple "\ - "platforms available.", - cxxopts::value()->default_value(std::to_string(DEFAULT_PLATFORM))) - ("h,help", "Print this help"); - cxxopts::ParseResult result = options.parse(argc, argv); - - if (result.count("h")) { - // Just print help when argument is given - std::cout << options.help() << std::endl; - exit(0); - } - // Check parsed options and handle special cases - if (result.count("f") <= 0) { - // Path to the kernel file is mandatory - exit if not given! - std::cerr << "Kernel file must be given! Aborting" << std::endl; - std::cout << options.help() << std::endl; - exit(1); - } - - // Create program settings from program arguments - std::shared_ptr sharedSettings( - new ProgramSettings{result["n"].as(), - result["s"].as(), - result["platform"].as(), - result["device"].as(), - result["f"].as()}); - return sharedSettings; -} - -/** - * Prints the used configuration to std out before starting the actual benchmark. - * - * @param programSettings The program settings retrieved from the command line - * @param device The device used for execution - */ -void printFinalConfiguration(const std::shared_ptr &programSettings, - const cl::Device &device) {// Give setup summary - std::cout << PROGRAM_DESCRIPTION << HLINE; - std::cout << "Summary:" << std::endl - << "Matrix Size: " << programSettings->matrixSize - << std::endl - << "Block Size: " << (1 << LOCAL_MEM_BLOCK_LOG) - << std::endl - << "Data Type: " << STR(HOST_DATA_TYPE) - << std::endl - << "Repetitions: " << programSettings->numRepetitions - << std::endl - << "Kernel file: " << programSettings->kernelFileName - << std::endl; - std::cout << "Device: " - << device.getInfo() << std::endl; - std::cout << HLINE - << "Start benchmark using the given configuration." 
<< std::endl - << HLINE; -} diff --git a/LINPACK/src/host/execution_blocked.cpp b/LINPACK/src/host/execution_blocked.cpp deleted file mode 100644 index e0ff4a9b..00000000 --- a/LINPACK/src/host/execution_blocked.cpp +++ /dev/null @@ -1,163 +0,0 @@ -/* -Copyright (c) 2019 Marius Meyer - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
-*/ - -/* Related header files */ -#include "src/host/execution.h" - -/* C++ standard library headers */ -#include -#include -#include -#include - -#ifdef DEBUG -#include -#endif - -/* External library headers */ -#include "CL/cl.hpp" -#if QUARTUS_MAJOR_VERSION > 18 -#include "CL/cl_ext_intelfpga.h" -#endif - -/* Project's headers */ -#include "src/host/fpga_setup.h" -#include "src/host/linpack_functionality.h" - -namespace bm_execution { - -/* - Prepare kernels and execute benchmark for the blocked approach - - @copydoc bm_execution::calculate() -*/ -std::shared_ptr -calculate(cl::Context context, cl::Device device, cl::Program program, - uint repetitions, ulong matrixSize, uint blockSize) { - uint lda = matrixSize; - DATA_TYPE* a; - posix_memalign(reinterpret_cast(&a), 64, - sizeof(DATA_TYPE)*lda*matrixSize); - DATA_TYPE* b; - posix_memalign(reinterpret_cast(&b), 64, - sizeof(DATA_TYPE)* matrixSize); - cl_int* ipvt; - posix_memalign(reinterpret_cast(&ipvt), 64, - sizeof(cl_int) * matrixSize); - - for (int i = 0; i < matrixSize; i++) { - ipvt[i] = i; - } - - DATA_TYPE norma = 0; - double ops = (2.0e0*(matrixSize*matrixSize*matrixSize))/ - 3.0 + 2.0*(matrixSize*matrixSize); - int err; - - // Create Command queue - cl::CommandQueue compute_queue(context, device); - - // Create Buffers for input and output - cl::Buffer Buffer_a(context, CL_MEM_READ_WRITE, - sizeof(DATA_TYPE)*lda*matrixSize); - - // create the kernels - cl::Kernel gefakernel(program, GEFA_KERNEL, - &err); - ASSERT_CL(err); - - - // prepare kernels - err = gefakernel.setArg(0, Buffer_a); - ASSERT_CL(err); - err = gefakernel.setArg(1, static_cast(matrixSize / blockSize)); - ASSERT_CL(err); - - /* --- Execute actual benchmark kernels --- */ - - double t; - std::vector executionTimes; - for (int i = 0; i < repetitions; i++) { - matgen(a, lda, matrixSize, b, &norma); - compute_queue.enqueueWriteBuffer(Buffer_a, CL_TRUE, 0, - sizeof(DATA_TYPE)*lda*matrixSize, a); - compute_queue.finish(); - auto t1 = 
std::chrono::high_resolution_clock::now(); - compute_queue.enqueueTask(gefakernel); - compute_queue.finish(); - auto t2 = std::chrono::high_resolution_clock::now(); - std::chrono::duration timespan = - std::chrono::duration_cast> - (t2 - t1); - executionTimes.push_back(timespan.count()); - } - - /* --- Read back results from Device --- */ - - compute_queue.enqueueReadBuffer(Buffer_a, CL_TRUE, 0, - sizeof(DATA_TYPE)*lda*matrixSize, a); - -#ifdef DEBUG - for (int i= 0; i < matrixSize; i++) { - for (int j=0; j < matrixSize; j++) { - std::cout << a[i*lda + j] << ", "; - } - std::cout << std::endl; - } - std::cout << std::endl; -#endif - - gesl_ref(a, b, ipvt, matrixSize, matrixSize); - - /* --- Check Results --- */ - - double error = checkLINPACKresults(b, matrixSize, matrixSize); - - /* Check CPU reference results */ - - matgen(a, lda, matrixSize, b, &norma); - gefa_ref(a, matrixSize, lda, ipvt); - -#ifdef DEBUG - for (int i= 0; i < matrixSize; i++) { - for (int j=0; j < matrixSize; j++) { - std::cout << a[i*lda + j] << ", "; - } - std::cout << std::endl; - } - std::cout << std::endl; -#endif - - gesl_ref(a, b, ipvt, matrixSize, matrixSize); - checkLINPACKresults(b, matrixSize, matrixSize); - - free(reinterpret_cast(a)); - free(reinterpret_cast(b)); - free(reinterpret_cast(ipvt)); - - std::shared_ptr results( - new ExecutionResults{executionTimes, - error}); - return results; -} - -} // namespace bm_execution diff --git a/LINPACK/src/host/linpack_functionality.cpp b/LINPACK/src/host/linpack_functionality.cpp deleted file mode 100644 index 82aba0dc..00000000 --- a/LINPACK/src/host/linpack_functionality.cpp +++ /dev/null @@ -1,249 +0,0 @@ -// -// Created by Marius Meyer on 04.12.19. 
-// - -/* -Copyright (c) 2019 Marius Meyer - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -#include "linpack_functionality.hpp" - -/* C++ standard library headers */ -#include -#include - -/* Project's headers */ -#include "execution.h" -#include "cxxopts.hpp" -#include "setup/fpga_setup.hpp" -#include "setup/common_benchmark_io.hpp" -#include "parameters.h" - - -/** -Prints the execution results to stdout - -@param results The execution results -*/ -void -printResults(std::shared_ptr results, unsigned matrix_size) { - - std::cout << std::setw(ENTRY_SPACE) - << "best" << std::setw(ENTRY_SPACE) << "mean" - << std::setw(ENTRY_SPACE) << "GFLOPS" << std::endl; - - // Calculate performance for kernel execution plus data transfer - double tmean = 0; - double tmin = std::numeric_limits::max(); - - // GFLOPs for calculation of both GEFA and GESL. - // Currently only GEFA is calculated on the FPGA so GFLOPS have to be - // reduced. 
- // double gflops = ((2.0e0*(dataSize*dataSize*dataSize))/3.0 - // + 2.0*(dataSize*dataSize)) / 1.0e9; - // TODO: Change this when GESL is also calculated on FPGA - double gflops = (2.0e0*(static_cast(matrix_size)*static_cast(matrix_size)*static_cast(matrix_size)))/3.0/1.0e9; - for (double currentTime : results->timings) { - tmean += currentTime; - if (currentTime < tmin) { - tmin = currentTime; - } - } - tmean = tmean / results->timings.size(); - - std::cout << std::setw(ENTRY_SPACE) - << tmin << std::setw(ENTRY_SPACE) << tmean - << std::setw(ENTRY_SPACE) << (gflops / tmin) - << std::endl; - -} - - -void generateInputData(HOST_DATA_TYPE* A, HOST_DATA_TYPE* b, cl_int* ipvt, unsigned matrix_size, HOST_DATA_TYPE* norma) { - std::mt19937 gen(7); - std::uniform_real_distribution<> dis(-1.0, 1.0); - *norma = 0.0; - for (int j = 0; j < matrix_size; j++) { - for (int i = 0; i < matrix_size; i++) { - A[matrix_size*i+j] = dis(gen); - *norma = (A[matrix_size*i+j] > *norma) ? A[matrix_size*i+j] : *norma; - } - } - for (int i = 0; i < matrix_size; i++) { - b[i] = 0.0; - } - for (int j = 0; j < matrix_size; j++) { - for (int i = 0; i < matrix_size; i++) { - b[j] += A[matrix_size*j+i]; - } - } - -} - -double -checkLINPACKresults(const HOST_DATA_TYPE* b_res, unsigned n) { - auto a = new HOST_DATA_TYPE[n*n]; - HOST_DATA_TYPE norma; - auto x = new HOST_DATA_TYPE[n]; - auto b = new HOST_DATA_TYPE[n]; - /* compute a residual to verify results. */ - - for (int i = 0; i < n; i++) { - x[i] = b_res[i]; - } - - auto ipvt = new cl_int[n]; - generateInputData(a,b,ipvt,n, &norma); - for (int i = 0; i < n; i++) { - b[i] = -b[i]; - } - dmxpy(n, b, n, n, x, a); - HOST_DATA_TYPE resid = 0.0; - HOST_DATA_TYPE normx = 0.0; - - for (int i = 0; i < n; i++) { - resid = (resid > fabs(b[i])) ? resid : fabs(b[i]); - normx = (normx > fabs(x[i])) ? 
normx : fabs(x[i]); - } - - HOST_DATA_TYPE eps = std::numeric_limits::epsilon(); - HOST_DATA_TYPE residn = resid / (n*norma*normx*eps); - //std::cout << resid << ", " << norma << ", " << normx << std::endl; - std::cout << " norm. resid resid "\ - "machep x[0]-1 x[n-1]-1" << std::endl; - std::cout << std::setw(ENTRY_SPACE) << residn << std::setw(ENTRY_SPACE) - << resid << std::setw(ENTRY_SPACE) << eps - << std::setw(ENTRY_SPACE) << x[0]-1 << std::setw(ENTRY_SPACE) - << x[n-1]-1 << std::endl; - - delete [] a; - delete [] x; - delete [] b; - delete [] ipvt; - return residn; -} - -/** -Standard LU factorization on a block with fixed size - -Case 1 of Zhangs description -*/ -void -gefa_ref(HOST_DATA_TYPE* a, unsigned n, unsigned lda, cl_int* ipvt) { - for (int i = 0; i < n; i++) { - ipvt[i] = i; - } - // For each diagnonal element - for (int k = 0; k < n - 1; k++) { - HOST_DATA_TYPE max_val = fabs(a[k * lda + k]); - int pvt_index = k; - for (int i = k + 1; i < n; i++) { - if (max_val < fabs(a[i * lda + k])) { - pvt_index = i; - max_val = fabs(a[i * lda + k]); - } - } - - for (int i = k; i < n; i++) { - HOST_DATA_TYPE tmp_val = a[k * lda + i]; - a[k * lda + i] = a[pvt_index * lda + i]; - a[pvt_index * lda + i] = tmp_val; - } - ipvt[k] = pvt_index; - - // For each element below it - for (int i = k + 1; i < n; i++) { - a[i * lda + k] *= -1.0 / a[k * lda + k]; - } - // For each column right of current diagonal element - for (int j = k + 1; j < n; j++) { - // For each element below it - for (int i = k+1; i < n; i++) { - a[i * lda + j] += a[i * lda + k] * a[k * lda + j]; - } - } - -#ifdef DEBUG - std::cout << "A(k=" << k <<"): " << std::endl; - for (int i= 0; i < n; i++) { - for (int j=0; j < n; j++) { - std::cout << a[i*lda + j] << ", "; - } - std::cout << std::endl; - } - std::cout << std::endl; -#endif - - } -} - -void -gesl_ref(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, cl_int* ipvt, unsigned n, unsigned lda) { - auto b_tmp = new HOST_DATA_TYPE[n]; -#pragma omp parallel 
default(shared) - { -#pragma omp for - for (int k = 0; k < n; k++) { - b_tmp[k] = b[k]; - } - - // solve l*y = b - // For each row in matrix -#pragma omp single - for (int k = 0; k < n - 1; k++) { - if (ipvt[k] != k) { - HOST_DATA_TYPE tmp = b_tmp[k]; - b_tmp[k] = b_tmp[ipvt[k]]; - b_tmp[ipvt[k]] = tmp; - } - // For each row below add -#pragma omp parallel for - for (int i = k + 1; i < n; i++) { - // add solved upper row to current row - b_tmp[i] += b_tmp[k] * a[lda * i + k]; - } - } - - // now solve u*x = y -#pragma omp single - for (int k = n - 1; k >= 0; k--) { - b_tmp[k] = b_tmp[k] / a[lda * k + k]; -#pragma omp parallel for - for (int i = 0; i < k; i++) { - b_tmp[i] -= b_tmp[k] * a[lda * i + k]; - } - } -#pragma omp for - for (int k = 0; k < n; k++) { - b[k] = b_tmp[k]; - } - } - delete [] b_tmp; -} - -void dmxpy(unsigned n1, HOST_DATA_TYPE* y, unsigned n2, unsigned ldm, HOST_DATA_TYPE* x, HOST_DATA_TYPE* m) { - for (int i=0; i < n1; i++) { - for (int j=0; j < n2; j++) { - y[i] = y[i] + x[j] * m[ldm*i + j]; - } - } -} - diff --git a/LINPACK/src/host/linpack_functionality.hpp b/LINPACK/src/host/linpack_functionality.hpp deleted file mode 100644 index ef7fd358..00000000 --- a/LINPACK/src/host/linpack_functionality.hpp +++ /dev/null @@ -1,95 +0,0 @@ -/* -Copyright (c) 2019 Marius Meyer - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -#ifndef SRC_HOST_NETWORK_FUNCTIONALITY_H_ -#define SRC_HOST_NETWORK_FUNCTIONALITY_H_ - -/* C++ standard library headers */ -#include -#include - -/* Project's headers */ -#include "linpack_functionality.hpp" -#include "execution.h" -#include "parameters.h" - -/** -Prints the execution results to stdout - -@param results The execution results -*/ -void -printResults(std::shared_ptr results, unsigned matrix_size); - - -/** - * Fill the data buffer with random number using the mersenne twister engine with - * seed 0. - * - * @param data Data array that has to be filled - * @param size Size of the data array that has to be filled - */ -void generateInputData(HOST_DATA_TYPE* A, HOST_DATA_TYPE* b, cl_int* ipvt, unsigned matrix_size, HOST_DATA_TYPE* norma); - - - -double checkLINPACKresults(const HOST_DATA_TYPE* b_res, unsigned n); - -/** - * - * - * @param n1 - * @param y - * @param n2 - * @param ldm - * @param x - * @param m - */ -void dmxpy(unsigned n1, HOST_DATA_TYPE* y, unsigned n2, unsigned ldm, HOST_DATA_TYPE* x, HOST_DATA_TYPE* m); - -/** -Gaussian elemination reference implementation without pivoting. -Can be used in exchange with kernel functions for functionality testing - -@param a the matrix with size of n*n -@param n size of matrix A -@param lda row with of the matrix. must be >=n - -*/ -void gefa_ref(HOST_DATA_TYPE* a, unsigned n, unsigned lda, cl_int* ipvt); - -/** -Solve linear equations using its LU decomposition. 
-Therefore solves A*x = b by solving L*y = b and then U*x = y with A = LU -where A is a matrix of size n*n - -@param a the matrix a in LU representation calculated by gefa call -@param b vector b of the given equation -@param ipvt vector containing pivoting information -@param n size of matrix A -@param lda row with of the matrix. must be >=n - -*/ -void gesl_ref(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, cl_int* ipvt, unsigned n, unsigned lda); - - -#endif // SRC_HOST_NETWORK_FUNCTIONALITY_H_ diff --git a/LINPACK/src/host/program_settings.h b/LINPACK/src/host/program_settings.h deleted file mode 100644 index f2f1e97e..00000000 --- a/LINPACK/src/host/program_settings.h +++ /dev/null @@ -1,34 +0,0 @@ - -#ifndef SRC_HOST_PROGRAM_SETTINGS_H_ -#define SRC_HOST_PROGRAM_SETTINGS_H_ - -#include "parameters.h" - -/* C++ standard library headers */ -#include - -#include "CL/opencl.h" - -/* -Short description of the program. -Moreover the version and build time is also compiled into the description. -*/ - -/* -Short description of the program -*/ -#define PROGRAM_DESCRIPTION "Implementation of the LINPACK benchmark"\ - " proposed in the HPCC benchmark suite for FPGA.\n"\ - "Version: " VERSION "\n" - - -struct ProgramSettings { - uint numRepetitions; - uint matrixSize; - int defaultPlatform; - int defaultDevice; - std::string kernelFileName; -}; - - -#endif From 2bcd7c0362382a1648f77d62c454f1fc48660f31 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 19 May 2020 15:49:01 +0200 Subject: [PATCH 34/45] Adapt GEMM to OO code --- GEMM/CHANGELOG | 5 + GEMM/CMakeLists.txt | 2 +- GEMM/Readme.md | 6 +- GEMM/src/common/parameters.h.in | 7 + GEMM/src/host/CMakeLists.txt | 56 +++-- GEMM/src/host/execution.h | 25 +-- GEMM/src/host/execution_cannon.cpp | 59 +++-- GEMM/src/host/gemm_benchmark.cpp | 163 ++++++++++++++ GEMM/src/host/gemm_benchmark.hpp | 211 ++++++++++++++++++ GEMM/src/host/main.cpp | 75 ++----- GEMM/tests/CMakeLists.txt | 39 ++-- GEMM/tests/main.cpp | 69 ++++++ 
...nel_functionality_and_host_integration.cpp | 154 +++++-------- GEMM/tests/test_program_settings.h | 25 +++ scripts/evaluation/parse_raw_to_csv.py | 2 +- 15 files changed, 641 insertions(+), 257 deletions(-) create mode 100644 GEMM/src/host/gemm_benchmark.cpp create mode 100644 GEMM/src/host/gemm_benchmark.hpp create mode 100644 GEMM/tests/main.cpp create mode 100644 GEMM/tests/test_program_settings.h diff --git a/GEMM/CHANGELOG b/GEMM/CHANGELOG index d3052cfd..ab546246 100644 --- a/GEMM/CHANGELOG +++ b/GEMM/CHANGELOG @@ -2,6 +2,11 @@ This file contains all changes made to the source code for each release. +## 0.2.2 + +#### Changed: +- Converted host code to new OO code + ## 0.2.1 #### Added: diff --git a/GEMM/CMakeLists.txt b/GEMM/CMakeLists.txt index 772fe318..9af84c0d 100755 --- a/GEMM/CMakeLists.txt +++ b/GEMM/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.1) -project(GEMM VERSION 0.2.1 ) +project(GEMM VERSION 0.2.2 ) set(KERNEL_NAME gemm CACHE STRING "Name of the OpenCL kernel") set(DEFAULT_MATRIX_SIZE 4096 CACHE STRING "Default size of the used matrices") diff --git a/GEMM/Readme.md b/GEMM/Readme.md index 6c1c339d..957f3691 100755 --- a/GEMM/Readme.md +++ b/GEMM/Readme.md @@ -30,7 +30,7 @@ The targets below can be used to build the benchmark and its kernels, where `VEN | Target | Description | | -------- | ---------------------------------------------- | | GEMM_`VENDOR` | Builds the host application | - | Test_`VENDOR` | Compile the tests and its dependencies | + | GEMM_test_`VENDOR` | Compile the tests and its dependencies | More over the are additional targets to generate kernel reports and bitstreams. They are generated for every kernel code in the `src/device` folder: @@ -46,7 +46,7 @@ The targets below can be used to build the benchmark and its kernels, where `VEN mkdir build && cd build cmake .. - make fgemm + make GEMM_intel You will find all executables and kernel files in the `bin` folder of your build directory. 
@@ -93,7 +93,7 @@ For more information on available input parameters run To execute the unit and integration tests run - ./Test_intel + ./GEMM_test_intel -f KERNEL_FILE_NAME in the `bin` folder within the build directory. It will run an emulation of the kernel and execute some functionality tests. diff --git a/GEMM/src/common/parameters.h.in b/GEMM/src/common/parameters.h.in index c865e09c..275d63be 100644 --- a/GEMM/src/common/parameters.h.in +++ b/GEMM/src/common/parameters.h.in @@ -18,6 +18,13 @@ #define HOST_DATA_TYPE @HOST_DATA_TYPE@ #define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@ +/* +Short description of the program +*/ +#define PROGRAM_DESCRIPTION "Implementation of the GEMM benchmark"\ + " proposed in the HPCC benchmark adapted for FPGA\n"\ + "Version: " VERSION "\n" + /** Output separator */ diff --git a/GEMM/src/host/CMakeLists.txt b/GEMM/src/host/CMakeLists.txt index 68c00da8..f1668352 100755 --- a/GEMM/src/host/CMakeLists.txt +++ b/GEMM/src/host/CMakeLists.txt @@ -1,36 +1,52 @@ -include_directories(../../../extern/cxxopts/include ../../../shared/) -include_directories(${CMAKE_BINARY_DIR}/src/common) -include_directories(${CMAKE_SOURCE_DIR}/../shared/setup .) - -set(HOST_SOURCE execution_cannon.cpp main.cpp common_benchmark_io_implementation.cpp ../../../shared/setup/fpga_setup.cpp gemm_functionality.cpp) - find_package(BLAS) if (NOT BLAS_FOUND) message(WARNING "No BLAS Library found. 
Slow reference implementation will be used for verification!") endif() +add_subdirectory(../../../shared ${CMAKE_BINARY_DIR}/lib/hpccbase) +set(HOST_SOURCE execution_cannon.cpp gemm_benchmark.cpp) + +set(HOST_EXE_NAME GEMM) +set(LIB_NAME ge) + if (INTELFPGAOPENCL_FOUND) - include_directories(${IntelFPGAOpenCL_INCLUDE_DIRS}) - add_executable(GEMM_intel ${HOST_SOURCE}) - target_link_libraries(GEMM_intel ${IntelFPGAOpenCL_LIBRARIES}) - target_compile_definitions(GEMM_intel PRIVATE -DINTEL_FPGA) + add_library(${LIB_NAME}_intel STATIC ${HOST_SOURCE}) + target_include_directories(${LIB_NAME}_intel PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${IntelFPGAOpenCL_INCLUDE_DIRS}) + target_include_directories(${LIB_NAME}_intel PUBLIC ${CMAKE_SOURCE_DIR}/src/host) + add_executable(${HOST_EXE_NAME}_intel main.cpp) + target_link_libraries(${LIB_NAME}_intel "${IntelFPGAOpenCL_LIBRARIES}" "${OpenMP_CXX_FLAGS}") + target_link_libraries(${LIB_NAME}_intel hpcc_fpga_base) + target_link_libraries(${HOST_EXE_NAME}_intel ${LIB_NAME}_intel) + if (USE_SVM) + target_compile_definitions(${LIB_NAME}_intel PRIVATE -DCL_VERSION_2_0) + endif() if (BLAS_FOUND) - target_compile_definitions(GEMM_intel PRIVATE -D_USE_BLAS_) - target_link_libraries(GEMM_intel ${BLAS_LIBRARIES}) - target_link_options(GEMM_intel PRIVATE ${BLAS_LINKER_FLAGS}) + target_compile_definitions(${LIB_NAME}_intel PRIVATE -D_USE_BLAS_) + target_link_libraries(${LIB_NAME}_intel ${BLAS_LIBRARIES}) + target_link_libraries(${LIB_NAME}_intel PRIVATE ${BLAS_LINKER_FLAGS}) endif() + target_compile_definitions(${LIB_NAME}_intel PRIVATE -DINTEL_FPGA) + target_compile_options(${LIB_NAME}_intel PRIVATE "${OpenMP_CXX_FLAGS}") + add_test(NAME test_intel_host_executable COMMAND $ -h) endif() if (Vitis_FOUND) - include_directories(${Vitis_INCLUDE_DIRS}) - add_executable(GEMM_xilinx ${HOST_SOURCE}) - target_link_libraries(GEMM_xilinx ${Vitis_LIBRARIES}) - target_compile_definitions(GEMM_xilinx PRIVATE -DXILINX_FPGA) + 
add_library(${LIB_NAME}_xilinx STATIC ${HOST_SOURCE}) + target_include_directories(${LIB_NAME}_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS}) + target_include_directories(${LIB_NAME}_xilinx PUBLIC ${CMAKE_SOURCE_DIR}/src/host) + add_executable(${HOST_EXE_NAME}_xilinx main.cpp) + target_link_libraries(${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") + target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base) if (BLAS_FOUND) - target_compile_definitions(GEMM_xilinx PRIVATE -D_USE_BLAS_) - target_link_libraries(GEMM_xilinx ${BLAS_LIBRARIES}) - target_link_options(GEMM_xilinx PRIVATE ${BLAS_LINKER_FLAGS}) + target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -D_USE_BLAS_) + target_link_libraries(${LIB_NAME}_xilinx ${BLAS_LIBRARIES}) + target_link_libraries(${LIB_NAME}_xilinx PRIVATE ${BLAS_LINKER_FLAGS}) + message(STATUS ${BLAS_LINKER_FLAGS} ${BLAS_LIBRARIES}) endif() + target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx) + target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA) + target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") + add_test(NAME test_xilinx_host_executable COMMAND $ -h) endif() diff --git a/GEMM/src/host/execution.h b/GEMM/src/host/execution.h index a93a3e74..4fc11709 100644 --- a/GEMM/src/host/execution.h +++ b/GEMM/src/host/execution.h @@ -30,29 +30,10 @@ SOFTWARE. #include "CL/cl.hpp" #include "parameters.h" - -#ifndef BLOCK_SIZE -#define BLOCK_SIZE 32 -#endif - +#include "gemm_benchmark.hpp" namespace bm_execution { - struct ExecutionConfiguration { - cl::Context context; - cl::Device device; - cl::Program program; - std::string kernelName; - uint repetitions; - cl_uint matrixSize; - bool useMemInterleaving; - }; - - struct ExecutionTimings { - std::vector transferTimings; - std::vector calculationTimings; - }; - /** The actual execution of the benchmark. @@ -72,8 +53,8 @@ simple exchange of the different calculation methods. 
@return The time measurements and the error rate counted from the executions */ -std::shared_ptr -calculate(std::shared_ptr config, HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, HOST_DATA_TYPE* c, +std::unique_ptr +calculate(hpcc_base::ExecutionSettings const& config, HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, HOST_DATA_TYPE* c, HOST_DATA_TYPE* c_out, HOST_DATA_TYPE alpha, HOST_DATA_TYPE beta); } // namespace bm_execution diff --git a/GEMM/src/host/execution_cannon.cpp b/GEMM/src/host/execution_cannon.cpp index d92678bc..7e4ded52 100644 --- a/GEMM/src/host/execution_cannon.cpp +++ b/GEMM/src/host/execution_cannon.cpp @@ -35,9 +35,6 @@ SOFTWARE. #include "CL/cl_ext_intelfpga.h" #endif -/* Project's headers */ -#include "setup/fpga_setup.hpp" -#include "gemm_functionality.hpp" namespace bm_execution { @@ -46,37 +43,37 @@ namespace bm_execution { @copydoc bm_execution::calculate() */ -std::shared_ptr -calculate(std::shared_ptr config, HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, HOST_DATA_TYPE* c, HOST_DATA_TYPE* c_out, +std::unique_ptr +calculate(hpcc_base::ExecutionSettings const& config, HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, HOST_DATA_TYPE* c, HOST_DATA_TYPE* c_out, HOST_DATA_TYPE alpha, HOST_DATA_TYPE beta) { int err; // Create Command queue - cl::CommandQueue compute_queue(config->context, config->device); + cl::CommandQueue compute_queue(*config.context, *config.device); #ifdef INTEL_FPGA - cl::Buffer Buffer_a(config->context, CL_MEM_READ_WRITE | (config->useMemInterleaving ? 0 :CL_CHANNEL_1_INTELFPGA), - sizeof(HOST_DATA_TYPE)*config->matrixSize*config->matrixSize); - cl::Buffer Buffer_b(config->context, CL_MEM_READ_WRITE | (config->useMemInterleaving ? 0 :CL_CHANNEL_2_INTELFPGA), - sizeof(cl_int)*config->matrixSize*config->matrixSize); - cl::Buffer Buffer_c_in(config->context, CL_MEM_READ_WRITE | (config->useMemInterleaving ? 
0 :CL_CHANNEL_3_INTELFPGA), - sizeof(cl_int)*config->matrixSize*config->matrixSize); - cl::Buffer Buffer_c_out(config->context, CL_MEM_READ_WRITE | (config->useMemInterleaving ? 0 :CL_CHANNEL_4_INTELFPGA), - sizeof(cl_int)*config->matrixSize*config->matrixSize); + cl::Buffer Buffer_a(*config.context, CL_MEM_READ_WRITE | (config.programSettings->useMemoryInterleaving ? 0 :CL_CHANNEL_1_INTELFPGA), + sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize); + cl::Buffer Buffer_b(*config.context, CL_MEM_READ_WRITE | (config.programSettings->useMemoryInterleaving ? 0 :CL_CHANNEL_2_INTELFPGA), + sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize); + cl::Buffer Buffer_c_in(*config.context, CL_MEM_READ_WRITE | (config.programSettings->useMemoryInterleaving ? 0 :CL_CHANNEL_3_INTELFPGA), + sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize); + cl::Buffer Buffer_c_out(*config.context, CL_MEM_READ_WRITE | (config.programSettings->useMemoryInterleaving ? 0 :CL_CHANNEL_4_INTELFPGA), + sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize); #else - cl::Buffer Buffer_a(config->context, CL_MEM_READ_WRITE, - sizeof(HOST_DATA_TYPE)*config->matrixSize*config->matrixSize); - cl::Buffer Buffer_b(config->context, CL_MEM_READ_WRITE, - sizeof(cl_int)*config->matrixSize*config->matrixSize); - cl::Buffer Buffer_c_in(config->context, CL_MEM_READ_WRITE, - sizeof(cl_int)*config->matrixSize*config->matrixSize); - cl::Buffer Buffer_c_out(config->context, CL_MEM_READ_WRITE, - sizeof(cl_int)*config->matrixSize*config->matrixSize); + cl::Buffer Buffer_a(*config.context, CL_MEM_READ_WRITE, + sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize); + cl::Buffer Buffer_b(*config.context, CL_MEM_READ_WRITE, + sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize); + cl::Buffer Buffer_c_in(*config.context, CL_MEM_READ_WRITE, + 
sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize); + cl::Buffer Buffer_c_out(*config.context, CL_MEM_READ_WRITE, + sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize); #endif // create the kernels - cl::Kernel gemmkernel(config->program, GEMM_KERNEL, + cl::Kernel gemmkernel(*config.program, KERNEL_NAME, &err); ASSERT_CL(err); @@ -94,20 +91,20 @@ calculate(std::shared_ptr config, HOST_DATA_TYPE* a, HOS ASSERT_CL(err); err = gemmkernel.setArg(5, beta); ASSERT_CL(err); - err = gemmkernel.setArg(6, config->matrixSize); + err = gemmkernel.setArg(6, config.programSettings->matrixSize); ASSERT_CL(err); /* --- Execute actual benchmark kernels --- */ double t; std::vector executionTimes; - for (int i = 0; i < config->repetitions; i++) { + for (int i = 0; i < config.programSettings->numRepetitions; i++) { compute_queue.enqueueWriteBuffer(Buffer_a, CL_TRUE, 0, - sizeof(HOST_DATA_TYPE)*config->matrixSize*config->matrixSize, a); + sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize, a); compute_queue.enqueueWriteBuffer(Buffer_b, CL_TRUE, 0, - sizeof(HOST_DATA_TYPE)*config->matrixSize*config->matrixSize, b); + sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize, b); compute_queue.enqueueWriteBuffer(Buffer_c_in, CL_TRUE, 0, - sizeof(HOST_DATA_TYPE)*config->matrixSize*config->matrixSize, c); + sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize, c); compute_queue.finish(); auto t1 = std::chrono::high_resolution_clock::now(); compute_queue.enqueueTask(gemmkernel); @@ -122,11 +119,11 @@ calculate(std::shared_ptr config, HOST_DATA_TYPE* a, HOS /* --- Read back results from Device --- */ compute_queue.enqueueReadBuffer(Buffer_c_out, CL_TRUE, 0, - sizeof(HOST_DATA_TYPE)*config->matrixSize*config->matrixSize, c_out); + 
sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize, c_out); - std::shared_ptr results( - new ExecutionTimings{executionTimes, executionTimes}); + std::unique_ptr results( + new gemm::GEMMExecutionTimings{executionTimes}); return results; } diff --git a/GEMM/src/host/gemm_benchmark.cpp b/GEMM/src/host/gemm_benchmark.cpp new file mode 100644 index 00000000..a5fe009a --- /dev/null +++ b/GEMM/src/host/gemm_benchmark.cpp @@ -0,0 +1,163 @@ +// +// Created by Marius Meyer on 04.12.19. +// + +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "gemm_benchmark.hpp" + +/* C++ standard library headers */ +#include +#include + +/* Project's headers */ +#include "execution.h" +#include "parameters.h" + +gemm::GEMMProgramSettings::GEMMProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), + matrixSize(results["m"].as()) { + +} + +std::map +gemm::GEMMProgramSettings::getSettingsMap() { + auto map = hpcc_base::BaseSettings::getSettingsMap(); + map["Matrix Size"] = std::to_string(matrixSize); + return map; +} + +gemm::GEMMBenchmark::GEMMBenchmark(int argc, char* argv[]) { + setupBenchmark(argc, argv); +} + +gemm::GEMMBenchmark::GEMMBenchmark() {} + +void +gemm::GEMMBenchmark::addAdditionalParseOptions(cxxopts::Options &options) { + options.add_options() + ("m", "Matrix size", + cxxopts::value()->default_value(std::to_string(DEFAULT_MATRIX_SIZE))); +} + +std::unique_ptr +gemm::GEMMBenchmark::executeKernel(GEMMData &data) { + return bm_execution::calculate(*executionSettings, data.A, data.B, data.C, data.C_out, data.alpha, data.beta); +} + +void +gemm::GEMMBenchmark::printResults(const gemm::GEMMExecutionTimings &output) { + std::cout << std::setw(ENTRY_SPACE) + << "best" << std::setw(ENTRY_SPACE) << "mean" + << std::setw(ENTRY_SPACE) << "GFLOPS" << std::endl; + + // Calculate performance for kernel execution plus data transfer + double tmean = 0; + double tmin = std::numeric_limits::max(); + + double gflops = 2.0 * static_cast<double>(executionSettings->programSettings->matrixSize) + *static_cast<double>(executionSettings->programSettings->matrixSize) + *static_cast<double>(executionSettings->programSettings->matrixSize)/1.0e9; + for (double currentTime : output.timings) { + tmean += currentTime; + if (currentTime < tmin) { + tmin = currentTime; + } + } + tmean = tmean / output.timings.size(); + + std::cout << std::setw(ENTRY_SPACE) + << tmin << std::setw(ENTRY_SPACE) << tmean + << std::setw(ENTRY_SPACE) << gflops / tmin + << std::endl; + +} + +std::unique_ptr +gemm::GEMMBenchmark::generateInputData() { + auto d = 
std::unique_ptr(new gemm::GEMMData(executionSettings->programSettings->matrixSize)); + std::mt19937 gen(7); + std::uniform_real_distribution<> dis(-1.0, 1.0); + for (int j = 0; j < executionSettings->programSettings->matrixSize; j++) { + for (int i = 0; i < executionSettings->programSettings->matrixSize; i++) { + d->A[executionSettings->programSettings->matrixSize*i+j] = dis(gen); + d->B[executionSettings->programSettings->matrixSize*i+j] = dis(gen); + d->C[executionSettings->programSettings->matrixSize*i+j] = dis(gen); + d->C_out[executionSettings->programSettings->matrixSize*i+j] = 0.0; + d->normtotal = std::max(std::max(d->normtotal, d->A[executionSettings->programSettings->matrixSize*i+j]), + std::max(d->B[executionSettings->programSettings->matrixSize*i+j], + d->C[executionSettings->programSettings->matrixSize*i+j])); + } + } + return d; +} + +bool +gemm::GEMMBenchmark::validateOutputAndPrintError(gemm::GEMMData &data) { + auto ref_data = generateInputData(); + + gemm_ref(ref_data->A, ref_data->B, ref_data->C, executionSettings->programSettings->matrixSize, 0.5, 2.0); + + HOST_DATA_TYPE resid = 0.0; + HOST_DATA_TYPE normx = 0.0; + + for (int i = 0; i < executionSettings->programSettings->matrixSize * executionSettings->programSettings->matrixSize; i++) { + resid = (resid > fabs(data.C_out[i] - ref_data->C[i])) ? resid : fabs(data.C_out[i] - ref_data->C[i]); + normx = (normx > fabs(data.C_out[i])) ? normx : fabs(data.C_out[i]); + } + + HOST_DATA_TYPE eps = std::numeric_limits::epsilon(); + HOST_DATA_TYPE residn = resid / (executionSettings->programSettings->matrixSize*executionSettings->programSettings->matrixSize*ref_data->normtotal*normx*eps); + + std::cout << " norm. 
resid resid "\ + "machep" << std::endl; + std::cout << std::setw(ENTRY_SPACE) << residn << std::setw(ENTRY_SPACE) + << resid << std::setw(ENTRY_SPACE) << eps + << std::endl; + + return residn < 1.0; +} + +void +gemm::gemm_ref(HOST_DATA_TYPE* a,HOST_DATA_TYPE* b, HOST_DATA_TYPE* c, + int n, HOST_DATA_TYPE alpha, HOST_DATA_TYPE beta) { +#ifdef _USE_BLAS_ + char ta = 'N'; + char tb = 'N'; + + sgemm_(&ta, &tb, &n, &n, &n, &alpha, b, &n, a, &n, &beta, c, &n); +#else + for (int i=0; i < n; i++) { + for (int j=0; j < n; j++) { + c[i * n + j] = beta * c[i*n + j]; + } + } + + for (int i=0; i < n; i++) { + for (int j=0; j < n; j++) { + for (int k=0; k < n; k++) { + c[i*n + j] += alpha * a[i*n + k] * b[k*n + j]; + } + } + } +#endif +} diff --git a/GEMM/src/host/gemm_benchmark.hpp b/GEMM/src/host/gemm_benchmark.hpp new file mode 100644 index 00000000..71b37f8a --- /dev/null +++ b/GEMM/src/host/gemm_benchmark.hpp @@ -0,0 +1,211 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#ifndef SRC_HOST_GEMM_BENCHMARK_H_ +#define SRC_HOST_GEMM_BENCHMARK_H_ + +/* C++ standard library headers */ +#include +#include + +/* Project's headers */ +#include "hpcc_benchmark.hpp" +#include "parameters.h" + +#ifdef _USE_BLAS_ + +extern "C" void sgemm_(char*, char*, int*, int*,int*, float*, float*, int*, float*, int*, float*, float*, int*); + +#endif + + +/** + * @brief Contains all classes and methods needed by the GEMM benchmark + * + */ +namespace gemm { + +/** + * @brief The GEMM specific program settings + * + */ +class GEMMProgramSettings : public hpcc_base::BaseSettings { + +public: + /** + * @brief The size of the whole matrix + * + */ + uint matrixSize; + + /** + * @brief Construct a new GEMM Program Settings object + * + * @param results the result map from parsing the program input parameters + */ + GEMMProgramSettings(cxxopts::ParseResult &results); + + /** + * @brief Get a map of the settings. This map will be used to print the final configuration. + * + * @return a map of program parameters. keys are the name of the parameter. 
+ */ + std::map getSettingsMap() override; + +}; + +/** + * @brief Data class containing the data the kernel is executed with + * @note Owns its four buffers through raw pointers that the destructor frees; copying an instance would free them twice. + */ +class GEMMData { + +public: + HOST_DATA_TYPE *A, *B, *C, *C_out; + HOST_DATA_TYPE normtotal, alpha, beta; + + /** + * @brief Construct a new GEMM Data object + * @note Each buffer holds size * size elements aligned to 4096 bytes; the return values of posix_memalign are not checked. + * @param size Width and height of the allocated square matrices in elements + */ + GEMMData(uint size) : normtotal(0.0), alpha(0.5), beta(2.0) { + posix_memalign(reinterpret_cast(&A), 4096, size * size * sizeof(HOST_DATA_TYPE)); + posix_memalign(reinterpret_cast(&B), 4096, size * size * sizeof(HOST_DATA_TYPE)); + posix_memalign(reinterpret_cast(&C), 4096, size * size * sizeof(HOST_DATA_TYPE)); + posix_memalign(reinterpret_cast(&C_out), 4096, size * size * sizeof(HOST_DATA_TYPE)); + } + + /** + * @brief Destroy the GEMM Data object. Free the allocated memory + * + */ + ~GEMMData() { + free(A); + free(B); + free(C); + free(C_out); + } + +}; + +/** + * @brief Measured execution timing from the kernel execution + * + */ +class GEMMExecutionTimings { +public: + /** + * @brief A vector containing the timings for all repetitions for the kernel execution + * + */ + std::vector timings; + +}; + +/** + * @brief Implementation of the GEMM benchmark + * + */ +class GEMMBenchmark : public hpcc_base::HpccFpgaBenchmark { + +protected: + + /** + * @brief Additional input parameters of the GEMM benchmark + * + * @param options + */ + void + addAdditionalParseOptions(cxxopts::Options &options) override; + +public: + + /** + * @brief GEMM specific implementation of the data generation + * + * @return std::unique_ptr The input and output data of the benchmark + */ + std::unique_ptr + generateInputData() override; + + /** + * @brief GEMM specific implementation of the kernel execution + * + * @param data The input and output data of the benchmark + * @return std::unique_ptr Measured runtimes of the kernel execution + */ + std::unique_ptr + executeKernel(GEMMData &data) override; + + /** + * @brief GEMM specific 
implementation of the execution validation + * + * @param data The input and output data of the benchmark + * @return true If validation is successful + * @return false otherwise + */ + bool + validateOutputAndPrintError(GEMMData &data) override; + + /** + * @brief GEMM specific implementation of printing the execution results + * + * @param output Measured runtimes of the kernel execution + */ + void + printResults(const GEMMExecutionTimings &output) override; + + /** + * @brief Construct a new GEMM Benchmark object + * + * @param argc the number of program input parameters + * @param argv the program input parameters as array of strings + */ + GEMMBenchmark(int argc, char* argv[]); + + /** + * @brief Construct a new GEMM Benchmark object + */ + GEMMBenchmark(); + +}; + +/** +Multiply two square matrices, scale the product and add it to a scaled third matrix. + +C = alpha * A * B + beta * C + +@param a matrix A +@param b matrix B +@param c matrix C that will also be the result matrix +@param n number of rows and columns of the square matrices +@param alpha scalar value used to scale A * B +@param beta scalar value used to scale C +*/ +void gemm_ref( HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, HOST_DATA_TYPE* c, + int n, HOST_DATA_TYPE alpha, HOST_DATA_TYPE beta); + +} // namespace gemm + + +#endif // SRC_HOST_GEMM_BENCHMARK_H_ diff --git a/GEMM/src/host/main.cpp b/GEMM/src/host/main.cpp index 596aabd6..f6cc7831 100755 --- a/GEMM/src/host/main.cpp +++ b/GEMM/src/host/main.cpp @@ -2,72 +2,23 @@ // Created by Marius Meyer on 04.12.19. 
// -#include "parameters.h" -#include "gemm_functionality.hpp" -#include "setup/common_benchmark_io.hpp" -#include "setup/fpga_setup.hpp" +#include "gemm_benchmark.hpp" + +using namespace gemm; /** The program entry point */ -int main(int argc, char * argv[]) { +int +main(int argc, char *argv[]) { // Setup benchmark - std::shared_ptr programSettings = - parseProgramParameters(argc, argv); - fpga_setup::setupEnvironmentAndClocks(); - std::vector usedDevice = - fpga_setup::selectFPGADevice(programSettings->defaultPlatform, - programSettings->defaultDevice); - cl::Context context = cl::Context(usedDevice); - cl::Program program = fpga_setup::fpgaSetup(&context, usedDevice, - &programSettings->kernelFileName); - - printFinalConfiguration(programSettings, usedDevice[0]); - - std::shared_ptr config( - new bm_execution::ExecutionConfiguration{ - context, usedDevice[0], program, - programSettings->kernelName, - programSettings->numRepetitions, - programSettings->matrixSize, - programSettings->useMemInterleaving - }); - - uint lda = config->matrixSize; - HOST_DATA_TYPE* a; - posix_memalign(reinterpret_cast(&a), 64, - sizeof(HOST_DATA_TYPE)*lda*config->matrixSize); - HOST_DATA_TYPE* b; - posix_memalign(reinterpret_cast(&b), 64, - sizeof(HOST_DATA_TYPE)*lda*config->matrixSize); - HOST_DATA_TYPE* c; - posix_memalign(reinterpret_cast(&c), 64, - sizeof(HOST_DATA_TYPE)*lda*config->matrixSize); - HOST_DATA_TYPE* c_out; - posix_memalign(reinterpret_cast(&c_out), 64, - sizeof(HOST_DATA_TYPE)*lda*config->matrixSize); - - HOST_DATA_TYPE alpha = 0.5; - HOST_DATA_TYPE beta = 2.0; - - HOST_DATA_TYPE norma; - matgen(a, 1, config->matrixSize, config->matrixSize, &norma); - matgen(b, 2, config->matrixSize, config->matrixSize, &norma); - matgen(c, 3, config->matrixSize, config->matrixSize, &norma); - - // Start actual benchmark - auto results = bm_execution::calculate(config, a, b, c , c_out, alpha, beta); - - /* --- Check Results --- */ - - double error = checkGEMMresults(c_out, 
config->matrixSize, config->matrixSize); - - free(reinterpret_cast(a)); - free(reinterpret_cast(b)); - free(reinterpret_cast(c)); - - printResults(results, programSettings->matrixSize); - - return error > 0.1 ? 1 : 0; + auto bm = GEMMBenchmark(argc, argv); + bool success = bm.executeBenchmark(); + if (success) { + return 0; + } + else { + return 1; + } } diff --git a/GEMM/tests/CMakeLists.txt b/GEMM/tests/CMakeLists.txt index 6ea2a4f4..a048e000 100755 --- a/GEMM/tests/CMakeLists.txt +++ b/GEMM/tests/CMakeLists.txt @@ -1,27 +1,30 @@ -# 'lib' is the folder with Google Test sources -add_subdirectory(../../extern/googletest ${CMAKE_CURRENT_BINARY_DIR}/lib) +add_subdirectory(../../extern/googletest ${CMAKE_BINARY_DIR}/lib/googletest) include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) -include_directories(${CMAKE_BINARY_DIR}/src/common ../../extern/cxxopts/include ../../shared/) -include_directories(${CMAKE_SOURCE_DIR}/src/host) +include_directories(${CMAKE_BINARY_DIR}/src/common) -set(PROJECT_SOURCES ../../shared/setup/fpga_setup.cpp ../src/host/common_benchmark_io_implementation.cpp ../src/host/gemm_functionality.cpp ../src/host/execution_cannon.cpp) -set(TEST_SOURCES ../../shared/testing/main.cpp ../../shared/setup/test_fpga_setup.cpp test_kernel_functionality_and_host_integration.cpp) +set(HOST_EXE_NAME GEMM) +set(LIB_NAME ge) + +set(TEST_SOURCES main.cpp test_kernel_functionality_and_host_integration.cpp) if (INTELFPGAOPENCL_FOUND) - include_directories(${IntelFPGAOpenCL_INCLUDE_DIRS}) - add_executable(Test_intel ${TEST_SOURCES} ${PROJECT_SOURCES}) - target_link_libraries(Test_intel gtest gmock ${IntelFPGAOpenCL_LIBRARIES}) - target_compile_definitions(GEMM_intel PRIVATE -DINTEL_FPGA) - add_dependencies(Test_intel gemm_cannon_emulate_intel) - add_test(NAME test_intel_unit COMMAND $ -f gemm_cannon_emulate.aocx WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + include_directories(SYSTEM ${IntelFPGAOpenCL_INCLUDE_DIRS}) + 
add_executable(${HOST_EXE_NAME}_test_intel ${TEST_SOURCES} ${PROJECT_SOURCES}) + target_link_libraries(${HOST_EXE_NAME}_test_intel gtest gmock ${LIB_NAME}_intel ${IntelFPGAOpenCL_LIBRARIES} "${OpenMP_CXX_FLAGS}") + add_dependencies(${HOST_EXE_NAME}_test_intel gemm_cannon_emulate_intel) + target_compile_definitions(${HOST_EXE_NAME}_test_intel PRIVATE -DINTEL_FPGA) + target_compile_options(${HOST_EXE_NAME}_test_intel PRIVATE "${OpenMP_CXX_FLAGS}") + add_test(NAME ${HOST_EXE_NAME}_test_intel_unit COMMAND $ -f gemm_cannon_emulate.aocx WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() if (Vitis_FOUND) - include_directories(${Vitis_INCLUDE_DIRS}) - add_executable(Test_xilinx ${TEST_SOURCES} ${PROJECT_SOURCES}) - target_link_libraries(Test_xilinx gtest gmock ${Vitis_LIBRARIES}) - target_compile_definitions(Test_xilinx PRIVATE -DXILINX_FPGA) - add_dependencies(Test_xilinx gemm_cannon_emulate_xilinx) - add_test(NAME test_xilinx_unit COMMAND $ -f gemm_cannon_emulate.xclbin WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + include_directories(SYSTEM ${Vitis_INCLUDE_DIRS}) + add_executable(${HOST_EXE_NAME}_test_xilinx ${TEST_SOURCES} ${PROJECT_SOURCES}) + target_link_libraries(${HOST_EXE_NAME}_test_xilinx gtest gmock ${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") + + add_dependencies(${HOST_EXE_NAME}_test_xilinx gemm_cannon_emulate_xilinx) + target_compile_definitions(${HOST_EXE_NAME}_test_xilinx PRIVATE -DXILINX_FPGA) + target_compile_options(${HOST_EXE_NAME}_test_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") + add_test(NAME ${HOST_EXE_NAME}_test_xilinx_unit COMMAND $ -f gemm_cannon_emulate.xclbin WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() diff --git a/GEMM/tests/main.cpp b/GEMM/tests/main.cpp new file mode 100644 index 00000000..61bb1701 --- /dev/null +++ b/GEMM/tests/main.cpp @@ -0,0 +1,69 @@ +/* +Copyright (c) 2020 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation 
files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +/* Project's headers */ +#include "test_program_settings.h" +#include "gtest/gtest.h" +#include "CL/cl.hpp" + +#ifdef _USE_MPI_ +#include "mpi.h" + +class MPIEnvironment : public ::testing::Environment { +public: + MPIEnvironment(int* argc, char** argv[]) { + MPI_Init(argc, argv); + } + + ~MPIEnvironment() override { + MPI_Finalize(); + } +}; +#endif + +int global_argc; +char** global_argv; + +/** +The program entry point for the unit tests +*/ +int +main(int argc, char *argv[]) { + + std::cout << "THIS BINARY EXECUTES UNIT TESTS FOR THE FOLLOWING BENCHMARK:" << std::endl << std::endl; + + ::testing::InitGoogleTest(&argc, argv); + + global_argc = argc; + global_argv = argv; + +#ifdef _USE_MPI_ + ::testing::Environment* const mpi_env = + ::testing::AddGlobalTestEnvironment(new MPIEnvironment(&argc, &argv)); +#endif + + bool result = RUN_ALL_TESTS(); + + return result; + +} + diff --git a/GEMM/tests/test_kernel_functionality_and_host_integration.cpp b/GEMM/tests/test_kernel_functionality_and_host_integration.cpp index af4b34a0..515eab3d 100755 --- 
a/GEMM/tests/test_kernel_functionality_and_host_integration.cpp +++ b/GEMM/tests/test_kernel_functionality_and_host_integration.cpp @@ -4,11 +4,9 @@ #include #include "gtest/gtest.h" -#include "../src/host/execution.h" -#include "../src/host/gemm_functionality.hpp" +#include "gemm_benchmark.hpp" #include "parameters.h" -#include "setup/fpga_setup.hpp" -#include "testing/test_program_settings.h" +#include "test_program_settings.h" void ref_matmul(HOST_DATA_TYPE* A, HOST_DATA_TYPE* B, HOST_DATA_TYPE* C, int size) { @@ -17,74 +15,23 @@ ref_matmul(HOST_DATA_TYPE* A, HOST_DATA_TYPE* B, HOST_DATA_TYPE* C, int size) { C[i * size + j] = 0; } } - gemm_ref(A,B,C,size,1.0,0.0); + gemm::gemm_ref(A,B,C,size,1.0,0.0); } -struct OpenCLKernelTest : testing::Test { - std::string kernelFileName; - HOST_DATA_TYPE *A; - HOST_DATA_TYPE *B; - HOST_DATA_TYPE *C; - HOST_DATA_TYPE *C_out; - std::shared_ptr config; - cl_uint matrix_size; - - OpenCLKernelTest() { - kernelFileName = "gemm_cannon_emulate.aocx"; - matrix_size = 2 * BLOCK_SIZE; - posix_memalign(reinterpret_cast(&A), 64, - sizeof(HOST_DATA_TYPE) * matrix_size * matrix_size); - posix_memalign(reinterpret_cast(&B), 64, - sizeof(HOST_DATA_TYPE) * matrix_size * matrix_size); - posix_memalign(reinterpret_cast(&C), 64, - sizeof(HOST_DATA_TYPE) * matrix_size * matrix_size); - posix_memalign(reinterpret_cast(&C_out), 64, - sizeof(HOST_DATA_TYPE) * matrix_size * matrix_size); - setupFPGA(); - } - - - void setupFPGA() { - std::vector device = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, programSettings->defaultDevice); - cl::Context context(device[0]); - cl::Program program = fpga_setup::fpgaSetup(&context, device, &kernelFileName); - config = std::make_shared( - bm_execution::ExecutionConfiguration{ - context, device[0], program, - KERNEL_NAME, - 1, - matrix_size, - false - }); - HOST_DATA_TYPE norm; - matgen(A,1,matrix_size, matrix_size, &norm); - matgen(B,2,matrix_size, matrix_size, &norm); - matgen(C,3,matrix_size, 
matrix_size, &norm); - } +struct GEMMKernelTest : testing::Test, testing::WithParamInterface { + std::unique_ptr bm; + std::unique_ptr data; + unsigned matrix_size; - ~OpenCLKernelTest() override { - free(A); - free(B); - free(C); - free(C_out); + GEMMKernelTest() { + bm = std::unique_ptr(new gemm::GEMMBenchmark(global_argc, global_argv)); + matrix_size = GetParam() * BLOCK_SIZE; + bm->getExecutionSettings().programSettings->matrixSize = matrix_size; } -}; -struct DifferentOpenCLKernelTest : OpenCLKernelTest, testing::WithParamInterface { - DifferentOpenCLKernelTest() { - auto params = GetParam(); - kernelFileName = programSettings->kernelFileName; - matrix_size = params * BLOCK_SIZE; - posix_memalign(reinterpret_cast(&A), 64, - sizeof(HOST_DATA_TYPE) * matrix_size * matrix_size); - posix_memalign(reinterpret_cast(&B), 64, - sizeof(HOST_DATA_TYPE) * matrix_size * matrix_size); - posix_memalign(reinterpret_cast(&C), 64, - sizeof(HOST_DATA_TYPE) * matrix_size * matrix_size); - posix_memalign(reinterpret_cast(&C_out), 64, - sizeof(HOST_DATA_TYPE) * matrix_size * matrix_size); - setupFPGA(); + void SetUp() { + data = bm->generateInputData(); } }; @@ -92,18 +39,18 @@ struct DifferentOpenCLKernelTest : OpenCLKernelTest, testing::WithParamInterface /** * Tests if C will be multiplied by beta */ -TEST_P(DifferentOpenCLKernelTest, FPGACorrectCtimesBeta) { +TEST_P(GEMMKernelTest, FPGACorrectCtimesBeta) { for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { - A[i * matrix_size + j] = 0.0; - B[i * matrix_size + j] = 0.0; - C[i * matrix_size + j] = 1.0; + data->A[i * matrix_size + j] = 0.0; + data->B[i * matrix_size + j] = 0.0; + data->C[i * matrix_size + j] = 1.0; } } - auto result = bm_execution::calculate(config, A, B, C, C_out,0.0,2.0); + auto result = bm->executeKernel(*data); for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { - EXPECT_FLOAT_EQ(C_out[i * matrix_size + j], 2.0 * C[i * matrix_size + j]); + 
EXPECT_FLOAT_EQ(data->C_out[i * matrix_size + j], 2.0 * data->C[i * matrix_size + j]); } } } @@ -111,17 +58,20 @@ TEST_P(DifferentOpenCLKernelTest, FPGACorrectCtimesBeta) { /** * Tests if A will be multiplied by alpha */ -TEST_P(DifferentOpenCLKernelTest, FPGACorrectAtimesAlpha) { +TEST_P(GEMMKernelTest, FPGACorrectAtimesAlpha) { for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { - B[i * matrix_size + j] = i == j ? 1.0 : 0.0; - C[i * matrix_size + j] = 0.0; + data->B[i * matrix_size + j] = i == j ? 1.0 : 0.0; + data->C[i * matrix_size + j] = 0.0; } } - auto result = bm_execution::calculate(config, A, B, C, C_out,2.0,0.0); + data->alpha = 2.0; + data->beta = 0.0; + + auto result = bm->executeKernel(*data); for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { - EXPECT_FLOAT_EQ(C_out[i * matrix_size + j], 2.0 * A[i * matrix_size + j]); + EXPECT_FLOAT_EQ(data->C_out[i * matrix_size + j], 2.0 * data->A[i * matrix_size + j]); } } } @@ -129,17 +79,19 @@ TEST_P(DifferentOpenCLKernelTest, FPGACorrectAtimesAlpha) { /** * Tests if B will be multiplied by alpha */ -TEST_P(DifferentOpenCLKernelTest, FPGACorrectBtimesAlpha) { +TEST_P(GEMMKernelTest, FPGACorrectBtimesAlpha) { for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { - A[i * matrix_size + j] = i == j ? 1.0 : 0.0; - C[i * matrix_size + j] = 0.0; + data->A[i * matrix_size + j] = i == j ? 
1.0 : 0.0; + data->C[i * matrix_size + j] = 0.0; } } - auto result = bm_execution::calculate(config, A, B, C, C_out,2.0,0.0); + data->alpha = 2.0; + data->beta = 0.0; + auto result = bm->executeKernel(*data); for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { - EXPECT_FLOAT_EQ(C_out[i * matrix_size + j], 2.0 * B[i * matrix_size + j]); + EXPECT_FLOAT_EQ(data->C_out[i * matrix_size + j], 2.0 * data->B[i * matrix_size + j]); } } } @@ -147,21 +99,23 @@ TEST_P(DifferentOpenCLKernelTest, FPGACorrectBtimesAlpha) { /** * Tests if A will be multiplied with B */ -TEST_P(DifferentOpenCLKernelTest, FPGACorrectAmulB) { +TEST_P(GEMMKernelTest, FPGACorrectAmulB) { for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { - C[i * matrix_size + j] = 0.0; - A[i * matrix_size + j] = j; - B[i * matrix_size + j] = i; + data->C[i * matrix_size + j] = 0.0; + data->A[i * matrix_size + j] = j; + data->B[i * matrix_size + j] = i; } } - auto result = bm_execution::calculate(config, A, B, C, C_out,1.0,0.0); + data->alpha = 1.0; + data->beta = 1.0; + auto result = bm->executeKernel(*data); HOST_DATA_TYPE c_ref_out[matrix_size * matrix_size]; - ref_matmul(A,B,c_ref_out,matrix_size); + ref_matmul(data->A,data->B,c_ref_out,matrix_size); for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { - EXPECT_NEAR(C_out[i * matrix_size + j], c_ref_out[i * matrix_size + j], 0.001); + EXPECT_NEAR(data->C_out[i * matrix_size + j], c_ref_out[i * matrix_size + j], 0.001); } } } @@ -169,18 +123,20 @@ TEST_P(DifferentOpenCLKernelTest, FPGACorrectAmulB) { /** * Tests if C will be added to A */ -TEST_P(DifferentOpenCLKernelTest, FPGACorrectCplusA) { +TEST_P(GEMMKernelTest, FPGACorrectCplusA) { for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { - B[i * matrix_size + j] = i == j ? 1.0 : 0.0; + data->B[i * matrix_size + j] = i == j ? 
1.0 : 0.0; } } + data->alpha = 1.0; + data->beta = 1.0; - auto result = bm_execution::calculate(config, A, B, C, C_out,1.0, 1.0); + auto result = bm->executeKernel(*data); for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { - EXPECT_FLOAT_EQ(C_out[i * matrix_size + j], A[i * matrix_size + j] + C[i * matrix_size + j]); + EXPECT_FLOAT_EQ(data->C_out[i * matrix_size + j], data->A[i * matrix_size + j] + data->C[i * matrix_size + j]); } } } @@ -190,22 +146,22 @@ TEST_P(DifferentOpenCLKernelTest, FPGACorrectCplusA) { * Tests full multiply add */ -TEST_P(DifferentOpenCLKernelTest, FPGACorrectbetaCplusalphaAB) { +TEST_P(GEMMKernelTest, FPGACorrectbetaCplusalphaAB) { HOST_DATA_TYPE c_ref_out[matrix_size * matrix_size]; - auto result = bm_execution::calculate(config, A, B, C, C_out,0.5, 2.0); + auto result = bm->executeKernel(*data); for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { - c_ref_out[i * matrix_size + j] = C[i * matrix_size + j]; + c_ref_out[i * matrix_size + j] = data->C[i * matrix_size + j]; } } - gemm_ref(A,B,c_ref_out,matrix_size,0.5,2.0); + gemm::gemm_ref(data->A,data->B,c_ref_out,matrix_size,0.5,2.0); for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { - EXPECT_NEAR(C_out[i * matrix_size + j], c_ref_out[i * matrix_size + j], 0.001); + EXPECT_NEAR(data->C_out[i * matrix_size + j], c_ref_out[i * matrix_size + j], 0.001); } } } -INSTANTIATE_TEST_CASE_P(Default, DifferentOpenCLKernelTest, +INSTANTIATE_TEST_CASE_P(Default, GEMMKernelTest, testing::Values(1,2)); diff --git a/GEMM/tests/test_program_settings.h b/GEMM/tests/test_program_settings.h new file mode 100644 index 00000000..89e2070f --- /dev/null +++ b/GEMM/tests/test_program_settings.h @@ -0,0 +1,25 @@ +/* +Copyright (c) 2020 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without 
restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + + +extern int global_argc; +extern char** global_argv; \ No newline at end of file diff --git a/scripts/evaluation/parse_raw_to_csv.py b/scripts/evaluation/parse_raw_to_csv.py index bc6ba6e3..daf7f0c4 100755 --- a/scripts/evaluation/parse_raw_to_csv.py +++ b/scripts/evaluation/parse_raw_to_csv.py @@ -10,7 +10,7 @@ # Regular expressions for the raw output of all fft_regex = "Version:\\s+(?P.+)\n(.*\n)+FFT\\sSize:\\s+(?P\d+)\nData\\sSize:\\s+(?P\d+)(.*\n)+Device:\\s+(?P.+)\n(.*\n)+\\s+res\.\\serror\\s+mach\.\\seps\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+avg\\s+best\n\\s+Time\\s+in\\s+s:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+GFLOPS:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" -gemm_regex = "Version:\\s+(?P.+)\n(.*\n)+Total\\smatrix\\ssize:\\s+(?P\d+)(.*\n)+Device:\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+best\\s+mean\\s+GFLOPS\n\\s+(?P.+)\\s+(?P.+)\\s+(?P.+)" +gemm_regex = 
"Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+best\\s+mean\\s+GFLOPS\n\\s+(?P.+)\\s+(?P.+)\\s+(?P.+)" ra_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+Error:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GUOPS\n\\s+(?P.+)\\s+(?P.+)\\s+(?P.+)" trans_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s*Maximum\\serror:\\s+(?P(\d|\.|\+|-|e)+)\n\\s+trans\\s+calc\\s+calc\\s+FLOPS\\s+total\\s+FLOPS\n\\s*avg:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s*best:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" stream_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+\\d+\\s+\\((?P(\d|\.|\+|-|e)+)(.*\n)+Data\\sType\\s+(?P.+)\n(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Kernel\\sType\\s+(?P.+)\n(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+Function\\s+Best\\sRate\\sMB/s\\s+Avg\\stime\\ss\\s+Min\\stime\\s+Max\\stime\n\\s+Add\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Copy\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+PCI\\sread\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+PCI\\swrite\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Scale\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Triad\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" From 720b6e1555449cf55092f52204337224319bb0cc Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 19 May 2020 16:13:12 +0200 Subject: [PATCH 35/45] Remove old GEMM files 
--- .../common_benchmark_io_implementation.cpp | 101 ---------- GEMM/src/host/gemm_functionality.cpp | 172 ------------------ GEMM/src/host/gemm_functionality.hpp | 95 ---------- GEMM/src/host/program_settings.h | 36 ---- 4 files changed, 404 deletions(-) delete mode 100644 GEMM/src/host/common_benchmark_io_implementation.cpp delete mode 100755 GEMM/src/host/gemm_functionality.cpp delete mode 100755 GEMM/src/host/gemm_functionality.hpp delete mode 100644 GEMM/src/host/program_settings.h diff --git a/GEMM/src/host/common_benchmark_io_implementation.cpp b/GEMM/src/host/common_benchmark_io_implementation.cpp deleted file mode 100644 index 227975ef..00000000 --- a/GEMM/src/host/common_benchmark_io_implementation.cpp +++ /dev/null @@ -1,101 +0,0 @@ - -#include "cxxopts.hpp" -#include "parameters.h" -#include "setup/common_benchmark_io.hpp" - -/** -Parses and returns program options using the cxxopts library. -Supports the following parameters: - - file name of the FPGA kernel file (-f,--file) - - number of repetitions (-n) - - number of kernel replications (-r) - - data size (-d) - - use memory interleaving -@see https://github.com/jarro2783/cxxopts - -@return program settings that are created from the given program arguments -*/ -std::shared_ptr -parseProgramParameters(int argc, char *argv[]) { - cxxopts::Options options(argv[0], PROGRAM_DESCRIPTION); - options.add_options() - ("f,file", "Kernel file name", cxxopts::value()) - ("n", "Number of repetitions", - cxxopts::value()->default_value(std::to_string(DEFAULT_REPETITIONS))) - ("m", "Matrix size", - cxxopts::value()->default_value(std::to_string(DEFAULT_MATRIX_SIZE))) - ("kernel", "Name of the kernel", - cxxopts::value()->default_value(KERNEL_NAME)) -#ifdef INTEL_FPGA - ("i,interleaving", "Use memory interleaving on the FPGA") -#endif - ("device", "Index of the device that has to be used. 
If not given you "\ - "will be asked which device to use if there are multiple devices "\ - "available.", cxxopts::value()->default_value(std::to_string(DEFAULT_DEVICE))) - ("platform", "Index of the platform that has to be used. If not given "\ - "you will be asked which platform to use if there are multiple "\ - "platforms available.", - cxxopts::value()->default_value(std::to_string(DEFAULT_PLATFORM))) - ("h,help", "Print this help"); - cxxopts::ParseResult result = options.parse(argc, argv); - - if (result.count("h")) { - // Just print help when argument is given - std::cout << options.help() << std::endl; - exit(0); - } - // Check parsed options and handle special cases - if (result.count("f") <= 0) { - // Path to the kernel file is mandatory - exit if not given! - std::cerr << "Kernel file must be given! Aborting" << std::endl; - std::cout << options.help() << std::endl; - exit(1); - } - - // Create program settings from program arguments - std::shared_ptr sharedSettings( - new ProgramSettings{result["n"].as(), - result["m"].as(), - result["platform"].as(), - result["device"].as(), -#ifdef INTEL_FPGA - static_cast(result.count("i") > 0), -#else - false, -#endif - result["f"].as(), - result["kernel"].as()}); - return sharedSettings; -} - -/** - * Prints the used configuration to std out before starting the actual benchmark. 
- * - * @param programSettings The program settings retrieved from the command line - * @param device The device used for execution - */ -void printFinalConfiguration(const std::shared_ptr &programSettings, - const cl::Device &device) {// Give setup summary - std::cout << PROGRAM_DESCRIPTION << std::endl << HLINE; - std::cout << "Summary:" << std::endl - << "Kernel Repetitions: " << programSettings->numRepetitions - << std::endl - << "Total matrix size: " << programSettings->matrixSize - << std::endl - << "Memory Interleaving: " << programSettings->useMemInterleaving - << " (Intel only)" << std::endl - << "Kernel file: " << programSettings->kernelFileName - << std::endl - << "Device: " - << device.getInfo() << std::endl - << "Verification: " - #ifdef _USE_BLAS_ - << "external library" - #else - << "internal ref. implementation" - #endif - << std::endl - << HLINE - << "Start benchmark using the given configuration." << std::endl - << HLINE; -} diff --git a/GEMM/src/host/gemm_functionality.cpp b/GEMM/src/host/gemm_functionality.cpp deleted file mode 100755 index f9802f36..00000000 --- a/GEMM/src/host/gemm_functionality.cpp +++ /dev/null @@ -1,172 +0,0 @@ -/* -Copyright (c) 2019 Marius Meyer - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -/* Related header files */ -#include "gemm_functionality.hpp" - -/* C++ standard library headers */ -#include -#include -#include -#include -#include -#include -#include -#include - -/* External library headers */ -#include "CL/cl.hpp" -#if QUARTUS_MAJOR_VERSION > 18 -#include "CL/cl_ext_intelfpga.h" -#endif -#include "cxxopts.hpp" - -/* Project's headers */ -#include "parameters.h" -#include "execution.h" -#include "setup/common_benchmark_io.hpp" - - -/** -Print the benchmark Results - -@param results The result struct provided by the calculation call -@param dataSize The size of the used data array - -*/ -void printResults(std::shared_ptr results, - size_t dataSize) { - std::cout << std::setw(ENTRY_SPACE) - << "best" << std::setw(ENTRY_SPACE) << "mean" - << std::setw(ENTRY_SPACE) << "GFLOPS" << std::endl; - - // Calculate performance for kernel execution plus data transfer - double tmean = 0; - double tmin = std::numeric_limits::max(); - - double gflops = 2.0 * static_cast(dataSize*dataSize*dataSize)/1.0e9; - for (double currentTime : results->calculationTimings) { - tmean += currentTime; - if (currentTime < tmin) { - tmin = currentTime; - } - } - tmean = tmean / results->calculationTimings.size(); - - std::cout << std::setw(ENTRY_SPACE) - << tmin << std::setw(ENTRY_SPACE) << tmean - << std::setw(ENTRY_SPACE) << gflops / tmin - << std::endl; -} - -void matgen(HOST_DATA_TYPE* a, int seed, cl_int lda, cl_int n, - HOST_DATA_TYPE* norma) { - std::mt19937 gen(seed); - std::uniform_real_distribution<> dis(-1.0, 1.0); - for (int j = 0; j < n; j++) { - for (int i = 0; i < n; i++) { - a[lda*i+j] = dis(gen); - *norma = (a[lda*i+j] > *norma) ? 
a[lda*i+j] : *norma; - } - for (int i = n; i < lda; i++) { - a[lda*j+i] = 0; - } - } -} - -void gemm_ref(HOST_DATA_TYPE* a,HOST_DATA_TYPE* b, HOST_DATA_TYPE* c, - int n, HOST_DATA_TYPE alpha, HOST_DATA_TYPE beta) { -#ifdef _USE_BLAS_ - char ta = 'N'; - char tb = 'N'; - - sgemm_(&ta, &tb, &n, &n, &n, &alpha, b, &n, a, &n, &beta, c, &n); -#else - for (int i=0; i < n; i++) { - for (int j=0; j < n; j++) { - c[i * n + j] = beta * c[i*n + j]; - } - } - - for (int i=0; i < n; i++) { - for (int j=0; j < n; j++) { - for (int k=0; k < n; k++) { - c[i*n + j] += alpha * a[i*n + k] * b[k*n + j]; - } - } - } -#endif -} - -double -checkGEMMresults(HOST_DATA_TYPE* c_res, cl_int lda, cl_int n) { - HOST_DATA_TYPE* a = new HOST_DATA_TYPE[lda*n]; - HOST_DATA_TYPE* b = new HOST_DATA_TYPE[lda*n]; - HOST_DATA_TYPE* c = new HOST_DATA_TYPE[lda*n]; - HOST_DATA_TYPE totalnorm = 0.0; - - /* compute a residual to verify results. */ - - - matgen(a, 1, lda, n, &totalnorm); - matgen(b, 2, lda, n, &totalnorm); - matgen(c, 3, lda, n, &totalnorm); - - gemm_ref(a, b, c, n, 0.5, 2.0); - - HOST_DATA_TYPE resid = 0.0; - HOST_DATA_TYPE normx = 0.0; - - for (int i = 0; i < n * n; i++) { - resid = (resid > fabs(c_res[i] - c[i])) ? resid : fabs(c_res[i] - c[i]); - normx = (normx > fabs(c_res[i])) ? normx : fabs(c_res[i]); - } - - HOST_DATA_TYPE eps = epslon(static_cast(1.0)); - HOST_DATA_TYPE residn = resid / (lda*n*totalnorm*normx*eps); - - std::cout << " norm. 
resid resid "\ - "machep" << std::endl; - std::cout << std::setw(ENTRY_SPACE) << residn << std::setw(ENTRY_SPACE) - << resid << std::setw(ENTRY_SPACE) << eps - << std::endl; - - delete a; - delete b; - delete c; - return residn; -} - -HOST_DATA_TYPE epslon(HOST_DATA_TYPE x) { - HOST_DATA_TYPE a, b, c, eps; - - a = 4.0e0/3.0e0; - eps = 0.0; - while (eps == 0.0) { - b = a - 1.0; - c = b + b + b; - eps = fabs(static_cast(c-1.0)); - } - return (eps*fabs(static_cast(x))); -} - - diff --git a/GEMM/src/host/gemm_functionality.hpp b/GEMM/src/host/gemm_functionality.hpp deleted file mode 100755 index 4b09d2e0..00000000 --- a/GEMM/src/host/gemm_functionality.hpp +++ /dev/null @@ -1,95 +0,0 @@ -/* -Copyright (c) 2019 Marius Meyer - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
-*/ -#ifndef COMMON_FUNCTIONALITY_H -#define COMMON_FUNCTIONALITY_H - -/* C++ standard library headers */ -#include - -/* Project's headers */ -#include "execution.h" -#include "parameters.h" - -/* -Number of times the execution of the benchmark will be repeated. -*/ -#ifndef NTIMES -#define NTIMES 1 -#endif - -/* -Prefix of the function name of the used kernel. -It will be used to construct the full function name for the case of replications. -The full name will be -*/ -#define GEMM_KERNEL "gemm" - - -#ifdef _USE_BLAS_ - -extern "C" void sgemm_(char*, char*, int*, int*,int*, float*, float*, int*, float*, int*, float*, float*, int*); - -#endif - -double -checkGEMMresults(HOST_DATA_TYPE* c_res, cl_int lda, cl_int n); - -/** -Print the benchmark results to stdout - -@param results the struct containing the results of the benchmark execution -@param matrixSize size of the calculated matrix -*/ -void printResults(std::shared_ptr results, - size_t matrixSize); - -/** -Generate a matrix using pseudo random numbers with fixed seed. -Generates inuts for th - - -@param a pointer to the matrix -@param seed Seed for the pseudo random number generation -@param lda width of a row in the matrix -@param n number of rows in the matrix -@param norma the maximum value in the matrix A that can be used to calculate the residual error -*/ -void matgen(HOST_DATA_TYPE* a, int seed, cl_int lda, cl_int n, HOST_DATA_TYPE* norma); - -/** -Multiply matrix with a vector and add it to another vector. 
- -C = alpha * A * B + beta * C - -@param a matrix A -@param b matrix B -@param c matrix C that will also be the result matrix -@param n size of all quadratic matrices -@param alpha scalar value used to scale A * B -@param beta scalar value used to scale C -*/ -void gemm_ref( HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, HOST_DATA_TYPE* c, - int n, HOST_DATA_TYPE alpha, HOST_DATA_TYPE beta); - -HOST_DATA_TYPE epslon (HOST_DATA_TYPE x); - -#endif // SRC_HOST_COMMON_FUNCTIONALITY_H_ diff --git a/GEMM/src/host/program_settings.h b/GEMM/src/host/program_settings.h deleted file mode 100644 index 91f9bf36..00000000 --- a/GEMM/src/host/program_settings.h +++ /dev/null @@ -1,36 +0,0 @@ - -#ifndef SRC_HOST_PROGRAM_SETTINGS_H_ -#define SRC_HOST_PROGRAM_SETTINGS_H_ - -#include "parameters.h" - -/* C++ standard library headers */ -#include - -#include "CL/opencl.h" - -/* -Short description of the program. -Moreover the version and build time is also compiled into the description. -*/ - -/* -Short description of the program -*/ -#define PROGRAM_DESCRIPTION "Implementation of the GEMM benchmark"\ - " proposed in the HPCC benchmark adapted for FPGA\n"\ - "Version: " VERSION "\n" - - -struct ProgramSettings { - uint numRepetitions; - cl_uint matrixSize; - int defaultPlatform; - int defaultDevice; - bool useMemInterleaving; - std::string kernelFileName; - std::string kernelName; -}; - - -#endif From ecf138032c3d67a69ca7d29e352cbde8a527c6b4 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 19 May 2020 16:14:26 +0200 Subject: [PATCH 36/45] Change linpack code version --- LINPACK/CHANGELOG | 5 +++++ LINPACK/CMakeLists.txt | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/LINPACK/CHANGELOG b/LINPACK/CHANGELOG index bbd7985a..2cd2c946 100644 --- a/LINPACK/CHANGELOG +++ b/LINPACK/CHANGELOG @@ -2,6 +2,11 @@ This file contains all changes made to the source code for each release. 
+## 2.0.2 + +#### Changed: +- Converted host code to new OO code + ## 2.0.1 #### Added: diff --git a/LINPACK/CMakeLists.txt b/LINPACK/CMakeLists.txt index 4f6d6d84..becda685 100755 --- a/LINPACK/CMakeLists.txt +++ b/LINPACK/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.1) -project(LINPACK VERSION 2.0.1) +project(LINPACK VERSION 2.0.2) set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size") set(GLOBAL_MEM_UNROLL 16 CACHE STRING "Unrolling factor that is used for all loops in the kernels") From 22e531268536d190055f3dd63c89ec933f6a7bad Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 19 May 2020 17:46:25 +0200 Subject: [PATCH 37/45] Adapt FFT to OO code --- FFT/CHANGELOG | 6 + FFT/CMakeLists.txt | 2 +- FFT/README.md | 8 +- FFT/src/common/parameters.h.in | 8 + FFT/src/device/CMakeLists.txt | 4 +- FFT/src/host/CMakeLists.txt | 36 ++- .../common_benchmark_io_implementation.cpp | 88 ------- FFT/src/host/execution.h | 17 +- FFT/src/host/execution_default.cpp | 22 +- ...ft_functionality.cpp => fft_benchmark.cpp} | 95 +++++--- FFT/src/host/fft_benchmark.hpp | 225 ++++++++++++++++++ FFT/src/host/fft_functionality.hpp | 103 -------- FFT/src/host/main.cpp | 48 +--- FFT/src/host/program_settings.h | 30 --- FFT/tests/CMakeLists.txt | 38 +-- FFT/tests/main.cpp | 69 ++++++ FFT/tests/test_execution_functionality.cpp | 214 ++++++----------- FFT/tests/test_fft_functionality.cpp | 98 ++++---- FFT/tests/test_program_settings.h | 25 ++ scripts/evaluation/parse_raw_to_csv.py | 2 +- 20 files changed, 588 insertions(+), 550 deletions(-) delete mode 100644 FFT/src/host/common_benchmark_io_implementation.cpp rename FFT/src/host/{fft_functionality.cpp => fft_benchmark.cpp} (69%) create mode 100644 FFT/src/host/fft_benchmark.hpp delete mode 100644 FFT/src/host/fft_functionality.hpp delete mode 100644 FFT/src/host/program_settings.h create mode 100644 FFT/tests/main.cpp create mode 100644 FFT/tests/test_program_settings.h diff --git a/FFT/CHANGELOG 
b/FFT/CHANGELOG index eda6effe..2274c1c3 100644 --- a/FFT/CHANGELOG +++ b/FFT/CHANGELOG @@ -2,6 +2,12 @@ This file contains all changes made to the source code for each release. +## 1.0.2 + +#### Changed: +- Converted host code to new OO code +- Unit tests and emulation kernels work now: Fail for Xilinx + ## 1.0.1 #### Added: diff --git a/FFT/CMakeLists.txt b/FFT/CMakeLists.txt index 516f56cc..85a899ca 100755 --- a/FFT/CMakeLists.txt +++ b/FFT/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.1) -project(FFT VERSION 1.0.1) +project(FFT VERSION 1.0.2) set(DEFAULT_ITERATIONS 100 CACHE STRING "Default number of iterations that is done with a single kernel execution") set(HOST_DATA_TYPE cl_float CACHE STRING "Data type used by the host code. Should match the data type of the used FFT") diff --git a/FFT/README.md b/FFT/README.md index 3d340551..16fe3454 100644 --- a/FFT/README.md +++ b/FFT/README.md @@ -19,7 +19,7 @@ The targets below can be used to build the benchmark and its kernels, where `VEN | Target | Description | | -------- | ---------------------------------------------- | | FFT_`VENDOR` | Builds the host application | - | Test_`VENDOR` | Compile the tests and its dependencies | + | FFT_test_`VENDOR` | Compile the tests and its dependencies | More over the are additional targets to generate kernel reports and bitstreams. The provided kernel is optimized for Stratix 10 with 512bit LSUs. @@ -37,7 +37,7 @@ The targets below can be used to build the benchmark and its kernels, where `VEN mkdir build && cd build cmake .. - make fFFT + make FFT_intel You will find all executables and kernel files in the `bin` folder of your build directory. 
@@ -68,7 +68,7 @@ For more information on available input parameters run -f, --file arg Kernel file name -n, arg Number of repetitions (default: 10) - -i, arg Multiplier for the used data size that will be i * + -b, arg Multiplier for the used data size that will be i * FFT_SIZE (default: 100) --inverse If set, the inverse FFT is calculated instead --device arg Index of the device that has to be used. If not given @@ -81,7 +81,7 @@ For more information on available input parameters run To execute the unit and integration tests run - ./Test_intel + ./FFT_test_intel -f KERNEL_FILE_NAME in the `bin` folder within the build directory. It will run an emulation of the kernel and execute some functionality tests. diff --git a/FFT/src/common/parameters.h.in b/FFT/src/common/parameters.h.in index bf1f850f..2e11cc4c 100644 --- a/FFT/src/common/parameters.h.in +++ b/FFT/src/common/parameters.h.in @@ -19,6 +19,14 @@ #define LOG_FFT_SIZE @LOG_FFT_SIZE@ #define FFT_UNROLL @FFT_UNROLL@ +/* +Short description of the program. +Moreover the version and build time is also compiled into the description. 
+*/ +#define PROGRAM_DESCRIPTION "Implementation of the FFT benchmark"\ + " proposed in the HPCC benchmark suite for FPGA.\n"\ + "Version: " VERSION "\n" + /** Output separator */ diff --git a/FFT/src/device/CMakeLists.txt b/FFT/src/device/CMakeLists.txt index 290b923b..22de4a86 100644 --- a/FFT/src/device/CMakeLists.txt +++ b/FFT/src/device/CMakeLists.txt @@ -5,7 +5,7 @@ if (INTELFPGAOPENCL_FOUND) generate_kernel_targets_intel(fft1d_float_8) add_test(NAME test_emulation_intel COMMAND ./FFT_intel -f fft1d_float_8_emulate.aocx -i 1 -n 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) - add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./FFT_intel -f fft1d_float_8_emulate.aocx -i 1 -n 1 + add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./FFT_intel -f fft1d_float_8_emulate.aocx -b 1 -n 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() @@ -13,6 +13,6 @@ if (Vitis_FOUND) generate_kernel_targets_xilinx(fft1d_float_8) add_test(NAME test_emulation_xilinx COMMAND ./FFT_xilinx -f fft1d_float_8_emulate.xclbin -i 1 -n 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) - add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./FFT_xilinx -f fft1d_float_8_emulate.xclbin -i 1 -n 1 + add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./FFT_xilinx -f fft1d_float_8_emulate.xclbin -b 1 -n 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() diff --git a/FFT/src/host/CMakeLists.txt b/FFT/src/host/CMakeLists.txt index 20a5124f..0c505452 100755 --- a/FFT/src/host/CMakeLists.txt +++ b/FFT/src/host/CMakeLists.txt @@ -1,20 +1,32 @@ -include_directories(../../../extern/cxxopts/include ../../../shared) -include_directories(${CMAKE_BINARY_DIR}/src/common) -include_directories(${CMAKE_SOURCE_DIR}/../shared/setup .) 
+add_subdirectory(../../../shared ${CMAKE_BINARY_DIR}/lib/hpccbase) +set(HOST_SOURCE execution_default.cpp fft_benchmark.cpp) -set(HOST_SOURCE execution_default.cpp main.cpp common_benchmark_io_implementation.cpp ../../../shared/setup/fpga_setup.cpp fft_functionality.cpp) +set(HOST_EXE_NAME FFT) +set(LIB_NAME fft_lib) if (INTELFPGAOPENCL_FOUND) - include_directories(${IntelFPGAOpenCL_INCLUDE_DIRS}) - add_executable(FFT_intel ${HOST_SOURCE}) - target_compile_definitions(FFT_intel PRIVATE -DINTEL_FPGA) - target_link_libraries(FFT_intel ${IntelFPGAOpenCL_LIBRARIES}) + add_library(${LIB_NAME}_intel STATIC ${HOST_SOURCE}) + target_include_directories(${LIB_NAME}_intel PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${IntelFPGAOpenCL_INCLUDE_DIRS}) + target_include_directories(${LIB_NAME}_intel PUBLIC ${CMAKE_SOURCE_DIR}/src/host) + add_executable(${HOST_EXE_NAME}_intel main.cpp) + target_link_libraries(${LIB_NAME}_intel "${IntelFPGAOpenCL_LIBRARIES}" "${OpenMP_CXX_FLAGS}") + target_link_libraries(${LIB_NAME}_intel hpcc_fpga_base) + target_link_libraries(${HOST_EXE_NAME}_intel ${LIB_NAME}_intel) + target_compile_definitions(${LIB_NAME}_intel PRIVATE -DINTEL_FPGA) + target_compile_options(${LIB_NAME}_intel PRIVATE "${OpenMP_CXX_FLAGS}") + add_test(NAME test_intel_host_executable COMMAND $ -h) endif() if (Vitis_FOUND) - include_directories(${Vitis_INCLUDE_DIRS}) - add_executable(FFT_xilinx ${HOST_SOURCE}) - target_compile_definitions(FFT_xilinx PRIVATE -DXILINX_FPGA) - target_link_libraries(FFT_xilinx ${Vitis_LIBRARIES}) + add_library(${LIB_NAME}_xilinx STATIC ${HOST_SOURCE}) + target_include_directories(${LIB_NAME}_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS}) + target_include_directories(${LIB_NAME}_xilinx PUBLIC ${CMAKE_SOURCE_DIR}/src/host) + add_executable(${HOST_EXE_NAME}_xilinx main.cpp) + target_link_libraries(${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") + 
target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base) + target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx) + target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA) + target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") + add_test(NAME test_xilinx_host_executable COMMAND $ -h) endif() diff --git a/FFT/src/host/common_benchmark_io_implementation.cpp b/FFT/src/host/common_benchmark_io_implementation.cpp deleted file mode 100644 index 843c9d88..00000000 --- a/FFT/src/host/common_benchmark_io_implementation.cpp +++ /dev/null @@ -1,88 +0,0 @@ - -#include "cxxopts.hpp" -#include "parameters.h" -#include "setup/common_benchmark_io.hpp" - -/** -Parses and returns program options using the cxxopts library. -Supports the following parameters: - - file name of the FPGA kernel file (-f,--file) - - number of repetitions (-n) - - number of kernel replications (-r) - - data size (-d) - - use memory interleaving -@see https://github.com/jarro2783/cxxopts - -@return program settings that are created from the given program arguments -*/ -std::shared_ptr -parseProgramParameters(int argc, char *argv[]) { - // Defining and parsing program options - cxxopts::Options options(argv[0], PROGRAM_DESCRIPTION); - options.add_options() - ("f,file", "Kernel file name", cxxopts::value()) - ("n", "Number of repetitions", - cxxopts::value()->default_value(std::to_string(DEFAULT_REPETITIONS))) - ("i", "Multiplier for the used data size that will be i * FFT_SIZE", - cxxopts::value()->default_value(std::to_string(DEFAULT_ITERATIONS))) - ("inverse", "If set, the inverse FFT is calculated instead") - ("device", "Index of the device that has to be used. If not given you "\ - "will be asked which device to use if there are multiple devices "\ - "available.", cxxopts::value()->default_value(std::to_string(DEFAULT_DEVICE))) - ("platform", "Index of the platform that has to be used. 
If not given "\ - "you will be asked which platform to use if there are multiple "\ - "platforms available.", - cxxopts::value()->default_value(std::to_string(DEFAULT_PLATFORM))) - ("h,help", "Print this help"); - cxxopts::ParseResult result = options.parse(argc, argv); - - if (result.count("h")) { - // Just print help when argument is given - std::cout << options.help() << std::endl; - exit(0); - } - // Check parsed options and handle special cases - if (result.count("f") <= 0) { - // Path to the kernel file is mandatory - exit if not given! - std::cerr << "Kernel file must be given! Aborting" << std::endl; - std::cout << options.help() << std::endl; - exit(1); - } - - // Create program settings from program arguments - std::shared_ptr sharedSettings( - new ProgramSettings{result["n"].as(), - result["i"].as(), - static_cast(result.count("inverse")), - result["platform"].as(), - result["device"].as(), - result["f"].as()}); - return sharedSettings; -} - -/** - * Prints the used configuration to std out before starting the actual benchmark. - * - * @param programSettings The program settings retrieved from the command line - * @param device The device used for execution - */ -void printFinalConfiguration(const std::shared_ptr &programSettings, - const cl::Device &device) {// Give setup summary - std::cout << PROGRAM_DESCRIPTION << std::endl << HLINE; - std::cout << "Summary:" << std::endl - << "FFT Size: " << (1 << LOG_FFT_SIZE) - << std::endl - << "Data Size: " << programSettings->iterations << " * FFT Size * sizeof(" - << STR(HOST_DATA_TYPE) - << ") = " << static_cast((1 << LOG_FFT_SIZE) * programSettings->iterations * sizeof(HOST_DATA_TYPE)) << " Byte" - << std::endl - << "Repetitions: " << programSettings->numRepetitions - << std::endl - << "Kernel file: " << programSettings->kernelFileName - << std::endl; - std::cout << "Device: " - << device.getInfo() << std::endl; - std::cout << HLINE - << "Start benchmark using the given configuration." 
<< std::endl - << HLINE; -} diff --git a/FFT/src/host/execution.h b/FFT/src/host/execution.h index 27c092ce..65800bf7 100644 --- a/FFT/src/host/execution.h +++ b/FFT/src/host/execution.h @@ -30,22 +30,11 @@ SOFTWARE. /* External library headers */ #include "CL/cl.hpp" #include "parameters.h" +#include "fft_benchmark.hpp" namespace bm_execution { - struct ExecutionConfiguration { - cl::Context context; - cl::Device device; - cl::Program program; - uint repetitions; - }; - - struct ExecutionTimings { - unsigned iterations; - bool inverse; - std::vector calculationTimings; - }; /** The actual execution of the benchmark. @@ -57,8 +46,8 @@ simple exchange of the different calculation methods. @return The resulting matrix */ - std::shared_ptr - calculate(std::shared_ptr config, std::complex* data, unsigned iterations, bool inverse); + std::unique_ptr + calculate(hpcc_base::ExecutionSettings const& config, std::complex* data, unsigned iterations, bool inverse); } // namespace bm_execution diff --git a/FFT/src/host/execution_default.cpp b/FFT/src/host/execution_default.cpp index eaeefae2..53a693d1 100644 --- a/FFT/src/host/execution_default.cpp +++ b/FFT/src/host/execution_default.cpp @@ -39,32 +39,32 @@ namespace bm_execution { Implementation for the single kernel. 
@copydoc bm_execution::calculate() */ - std::shared_ptr - calculate(std::shared_ptr config, + std::unique_ptr + calculate(hpcc_base::ExecutionSettings const& config, std::complex* data, unsigned iterations, bool inverse) { - cl::Buffer inBuffer = cl::Buffer(config->context, CL_MEM_WRITE_ONLY, (1 << LOG_FFT_SIZE) * iterations * 2 * sizeof(HOST_DATA_TYPE)); - cl::Buffer outBuffer = cl::Buffer(config->context, CL_MEM_READ_ONLY, (1 << LOG_FFT_SIZE) * iterations * 2 * sizeof(HOST_DATA_TYPE)); + cl::Buffer inBuffer = cl::Buffer(*config.context, CL_MEM_WRITE_ONLY, (1 << LOG_FFT_SIZE) * iterations * 2 * sizeof(HOST_DATA_TYPE)); + cl::Buffer outBuffer = cl::Buffer(*config.context, CL_MEM_READ_ONLY, (1 << LOG_FFT_SIZE) * iterations * 2 * sizeof(HOST_DATA_TYPE)); - cl::Kernel fetchKernel(config->program, FETCH_KERNEL_NAME); + cl::Kernel fetchKernel(*config.program, FETCH_KERNEL_NAME); fetchKernel.setArg(0, inBuffer); - cl::Kernel fftKernel(config->program, FFT_KERNEL_NAME); + cl::Kernel fftKernel(*config.program, FFT_KERNEL_NAME); fftKernel.setArg(0, outBuffer); fftKernel.setArg(1, iterations); fftKernel.setArg(2, static_cast(inverse)); - cl::CommandQueue fetchQueue(config->context); - cl::CommandQueue fftQueue(config->context); + cl::CommandQueue fetchQueue(*config.context); + cl::CommandQueue fftQueue(*config.context); fetchQueue.enqueueWriteBuffer(inBuffer,CL_TRUE,0, (1 << LOG_FFT_SIZE) * iterations * 2 * sizeof(HOST_DATA_TYPE), data); std::vector calculationTimings; - for (uint r =0; r < config->repetitions; r++) { + for (uint r =0; r < config.programSettings->numRepetitions; r++) { auto startCalculation = std::chrono::high_resolution_clock::now(); fetchQueue.enqueueNDRangeKernel(fetchKernel, cl::NullRange, cl::NDRange((1 << LOG_FFT_SIZE)/ FFT_UNROLL * iterations), cl::NDRange((1 << LOG_FFT_SIZE)/ FFT_UNROLL)); @@ -80,9 +80,7 @@ namespace bm_execution { fetchQueue.enqueueReadBuffer(outBuffer,CL_TRUE,0, (1 << LOG_FFT_SIZE) * iterations * 2 * sizeof(HOST_DATA_TYPE), data); 
- std::shared_ptr result(new ExecutionTimings{ - iterations, - inverse, + std::unique_ptr result(new fft::FFTExecutionTimings{ calculationTimings }); return result; diff --git a/FFT/src/host/fft_functionality.cpp b/FFT/src/host/fft_benchmark.cpp similarity index 69% rename from FFT/src/host/fft_functionality.cpp rename to FFT/src/host/fft_benchmark.cpp index 06d9a71f..2a84e492 100644 --- a/FFT/src/host/fft_functionality.cpp +++ b/FFT/src/host/fft_benchmark.cpp @@ -24,7 +24,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "fft_functionality.hpp" +#include "fft_benchmark.hpp" /* C++ standard library headers */ #include @@ -32,60 +32,86 @@ SOFTWARE. /* Project's headers */ #include "execution.h" -#include "cxxopts.hpp" -#include "setup/fpga_setup.hpp" -#include "setup/common_benchmark_io.hpp" #include "parameters.h" +fft::FFTProgramSettings::FFTProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), + iterations(results["b"].as()), inverse(results.count("inverse")) { -/** -Prints the execution results to stdout +} + +std::map +fft::FFTProgramSettings::getSettingsMap() { + auto map = hpcc_base::BaseSettings::getSettingsMap(); + map["FFT Size"] = std::to_string(1 << LOG_FFT_SIZE); + map["Batch Size"] = std::to_string(iterations); + return map; +} + +fft::FFTBenchmark::FFTBenchmark(int argc, char* argv[]) { + setupBenchmark(argc, argv); +} + +fft::FFTBenchmark::FFTBenchmark() {} -@param results The execution results -*/ void -printResults(std::shared_ptr results) { +fft::FFTBenchmark::addAdditionalParseOptions(cxxopts::Options &options) { + options.add_options() + ("b", "Number of batched FFT calculations (iterations)", + cxxopts::value()->default_value(std::to_string(DEFAULT_ITERATIONS))) + ("inverse", "If set, the inverse FFT is calculated instead"); +} + +std::unique_ptr +fft::FFTBenchmark::executeKernel(FFTData &data) { + return bm_execution::calculate(*executionSettings, 
data.data,executionSettings->programSettings->iterations, + executionSettings->programSettings->inverse); +} - double gflop = 5 * (1 << LOG_FFT_SIZE) * LOG_FFT_SIZE * results->iterations * 1.0e-9; +void +fft::FFTBenchmark::printResults(const fft::FFTExecutionTimings &output) { + double gflop = 5 * (1 << LOG_FFT_SIZE) * LOG_FFT_SIZE * executionSettings->programSettings->iterations * 1.0e-9; - double minTime = *min_element(results->calculationTimings.begin(), results->calculationTimings.end()); - double avgTime = accumulate(results->calculationTimings.begin(), results->calculationTimings.end(), 0.0) - / results->calculationTimings.size(); + double minTime = *min_element(output.timings.begin(), output.timings.end()); + double avgTime = accumulate(output.timings.begin(), output.timings.end(), 0.0) + / output.timings.size(); std::cout << std::setw(ENTRY_SPACE) << " " << std::setw(ENTRY_SPACE) << "avg" << std::setw(ENTRY_SPACE) << "best" << std::endl; - std::cout << std::setw(ENTRY_SPACE) << "Time in s:" << std::setw(ENTRY_SPACE) << avgTime / results->iterations - << std::setw(ENTRY_SPACE) << minTime / results->iterations << std::endl; + std::cout << std::setw(ENTRY_SPACE) << "Time in s:" << std::setw(ENTRY_SPACE) << avgTime / executionSettings->programSettings->iterations + << std::setw(ENTRY_SPACE) << minTime / executionSettings->programSettings->iterations << std::endl; std::cout << std::setw(ENTRY_SPACE) << "GFLOPS:" << std::setw(ENTRY_SPACE) << gflop / avgTime << std::setw(ENTRY_SPACE) << gflop / minTime << std::endl; - } - -void generateInputData(std::complex* data, unsigned iterations) { +std::unique_ptr +fft::FFTBenchmark::generateInputData() { + auto d = std::unique_ptr(new fft::FFTData(executionSettings->programSettings->iterations)); std::mt19937 gen(0); auto dis = std::uniform_real_distribution(-1.0, 1.0); - for (int i=0; i< iterations * (1 << LOG_FFT_SIZE); i++) { - data[i].real(dis(gen)); - data[i].imag(dis(gen)); + for (int i=0; i< 
executionSettings->programSettings->iterations * (1 << LOG_FFT_SIZE); i++) { + d->data[i].real(dis(gen)); + d->data[i].imag(dis(gen)); } + return d; } -double checkFFTResult(std::complex* verify_data, std::complex* result_data, unsigned iterations) { +bool +fft::FFTBenchmark::validateOutputAndPrintError(fft::FFTData &data) { + auto verify_data = generateInputData(); double residual_max = 0; - for (int i = 0; i < iterations; i++) { + for (int i = 0; i < executionSettings->programSettings->iterations; i++) { // we have to bit reverse the output data of the FPGA kernel, since it will be provided in bit-reversed order. // Directly applying iFFT on the data would thus not form the identity function we want to have for verification. // TODO: This might need to be changed for other FPGA implementations that return the data in correct order - bit_reverse(&result_data[i * (1 << LOG_FFT_SIZE)], 1); - fourier_transform_gold(true, LOG_FFT_SIZE, &result_data[i * (1 << LOG_FFT_SIZE)]); + fft::bit_reverse(&data.data[i * (1 << LOG_FFT_SIZE)], 1); + fft::fourier_transform_gold(true, LOG_FFT_SIZE, &data.data[i * (1 << LOG_FFT_SIZE)]); // Normalize the data after applying iFFT for (int j = 0; j < (1 << LOG_FFT_SIZE); j++) { - result_data[i * (1 << LOG_FFT_SIZE) + j] /= (1 << LOG_FFT_SIZE); + data.data[i * (1 << LOG_FFT_SIZE) + j] /= (1 << LOG_FFT_SIZE); } for (int j = 0; j < (1 << LOG_FFT_SIZE); j++) { - double tmp_error = std::abs(verify_data[i * (1 << LOG_FFT_SIZE) + j] - result_data[i * (1 << LOG_FFT_SIZE) + j]); + double tmp_error = std::abs(verify_data->data[i * (1 << LOG_FFT_SIZE) + j] - data.data[i * (1 << LOG_FFT_SIZE) + j]); residual_max = residual_max > tmp_error ? 
residual_max : tmp_error; } } @@ -97,10 +123,11 @@ double checkFFTResult(std::complex* verify_data, std::complex::epsilon() << std::endl << std::endl; // Calculate residual according to paper considering also the used iterations - return error; + return error < 1.0; } -void bit_reverse(std::complex *data, unsigned iterations) { +void +fft::bit_reverse(std::complex *data, unsigned iterations) { auto *tmp = new std::complex[(1 << LOG_FFT_SIZE)]; for (int k=0; k < iterations; k++) { for (int i = 0; i < (1 << LOG_FFT_SIZE); i++) { @@ -146,7 +173,8 @@ void bit_reverse(std::complex *data, unsigned iterations) { // by the laws of the United States of America. -void fourier_transform_gold(bool inverse, const int lognr_points, std::complex *data_sp) { +void +fft::fourier_transform_gold(bool inverse, const int lognr_points, std::complex *data_sp) { const int nr_points = 1 << lognr_points; auto *data = new std::complex[nr_points]; @@ -165,7 +193,7 @@ void fourier_transform_gold(bool inverse, const int lognr_points, std::complex *data) { +void +fft::fourier_stage(int lognr_points, std::complex *data) { int nr_points = 1 << lognr_points; if (nr_points == 1) return; auto *half1 = new std::complex[nr_points / 2]; @@ -204,4 +233,4 @@ void fourier_stage(int lognr_points, std::complex *data) { delete [] half1; delete [] half2; -} \ No newline at end of file +} diff --git a/FFT/src/host/fft_benchmark.hpp b/FFT/src/host/fft_benchmark.hpp new file mode 100644 index 00000000..da4f7c39 --- /dev/null +++ b/FFT/src/host/fft_benchmark.hpp @@ -0,0 +1,225 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, 
subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#ifndef SRC_HOST_FFT_BENCHMARK_H_ +#define SRC_HOST_FFT_BENCHMARK_H_ + +/* C++ standard library headers */ +#include +#include + +/* Project's headers */ +#include "hpcc_benchmark.hpp" +#include "parameters.h" + +/** + * @brief Contains all classes and methods needed by the FFT benchmark + * + */ +namespace fft { + +/** + * @brief The FFT specific program settings + * + */ +class FFTProgramSettings : public hpcc_base::BaseSettings { + +public: + /** + * @brief Number of batched FFTs + * + */ + uint iterations; + + /** + * @brief Calculate inverse FFT + */ + bool inverse; + + /** + * @brief Construct a new FFT Program Settings object + * + * @param results the result map from parsing the program input parameters + */ + FFTProgramSettings(cxxopts::ParseResult &results); + + /** + * @brief Get a map of the settings. This map will be used to print the final configuration. + * + * @return a map of program parameters. keys are the name of the parameter. 
+ */ + std::map getSettingsMap() override; + +}; + +/** + * @brief Data class containing the data the kernel is executed with + * + */ +class FFTData { + +public: + + std::complex* data; + + /** + * @brief Construct a new FFT Data object + * + * @param iterations Number of FFT data that will be stored sequentially in the array + */ + FFTData(uint iterations) { + posix_memalign(reinterpret_cast(&data), 64, iterations * (1 << LOG_FFT_SIZE) * sizeof(std::complex)); + } + + /** + * @brief Destroy the FFT Data object. Free the allocated memory + * + */ + ~FFTData() { + free(data); + } + +}; + +/** + * @brief Measured execution timing from the kernel execution + * + */ +class FFTExecutionTimings { +public: + /** + * @brief A vector containing the timings for all repetitions for the kernel execution + * + */ + std::vector timings; + +}; + +/** + * @brief Implementation of the FFT benchmark + * + */ +class FFTBenchmark : public hpcc_base::HpccFpgaBenchmark { + +protected: + + /** + * @brief Additional input parameters of the FFT benchmark + * + * @param options + */ + void + addAdditionalParseOptions(cxxopts::Options &options) override; + +public: + + /** + * @brief FFT specific implementation of the data generation + * + * @return std::unique_ptr The input and output data of the benchmark + */ + std::unique_ptr + generateInputData() override; + + /** + * @brief FFT specific implementation of the kernel execution + * + * @param data The input and output data of the benchmark + * @return std::unique_ptr Measured runtimes of the kernel execution + */ + std::unique_ptr + executeKernel(FFTData &data) override; + + /** + * @brief FFT specific implementation of the execution validation + * + * @param data The input and output data of the benchmark + * @return true If validation is successful + * @return false otherwise + */ + bool + validateOutputAndPrintError(FFTData &data) override; + + /** + * @brief FFT specific implementation of printing the execution results + * + * @param 
output Measured runtimes of the kernel execution + */ + void + printResults(const FFTExecutionTimings &output) override; + + /** + * @brief Construct a new FFT Benchmark object + * + * @param argc the number of program input parameters + * @param argv the program input parameters as array of strings + */ + FFTBenchmark(int argc, char* argv[]); + + /** + * @brief Construct a new FFT Benchmark object + */ + FFTBenchmark(); + +}; + +/** + * Bit reverses the order of the given FFT data in place + * + * @param data Array of complex numbers that will be sorted in bit reversed order + * @param iterations Length of the data array will be calculated with iterations * FFT Size + */ +void bit_reverse(std::complex *data, unsigned iterations); + +// The function definitions and implementations below this comment are taken from the +// FFT1D example implementation of the Intel FPGA SDK for OpenCL 19.4 +// They are licensed under the following conditions: +// +// Copyright (C) 2013-2019 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +void fourier_transform_gold(bool inverse, const int lognr_points, std::complex *data); + +void fourier_stage(int lognr_points, std::complex *data); + +} // namespace fft + + +#endif // SRC_HOST_FFT_BENCHMARK_H_ diff --git a/FFT/src/host/fft_functionality.hpp b/FFT/src/host/fft_functionality.hpp deleted file mode 100644 index c791eadf..00000000 --- a/FFT/src/host/fft_functionality.hpp +++ /dev/null @@ -1,103 +0,0 @@ -/* -Copyright (c) 2019 Marius Meyer - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
-*/ - -#ifndef SRC_HOST_NETWORK_FUNCTIONALITY_H_ -#define SRC_HOST_NETWORK_FUNCTIONALITY_H_ - -/* C++ standard library headers */ -#include -#include - -/* Project's headers */ -#include "execution.h" -#include "cxxopts.hpp" -#include "setup/fpga_setup.hpp" -#include "parameters.h" - -/** -Prints the execution results to stdout - -@param results The execution results -*/ -void -printResults(std::shared_ptr results); - -/** - * Fill the data buffer with random number using the mersenne twister engine with - * seed 0. - * - * @param data Data array that has to be filled - * @param size Size of the data array that has to be filled - */ -void generateInputData(std::complex* data, unsigned iterations); - - -/** - * Checks the calculation error of an FFt calculation by calculating the inverse FFT on the result data - * and calculating the residual with abs(x - x')/(epsilon * log(FFT_SIZE)). - * - * @param verify_data The input data of the FFT calculation - * @param result_data Result of the FFT calculation - * @param iterations Number data iterations (total data size should be iterations * FFT_SIZE) - * @return the residual error of the calculation - */ -double checkFFTResult(std::complex* verify_data, std::complex* result_data, unsigned iterations); - -/** - * Bit reverses the order of the given FFT data in place - * - * @param data Array of complex numbers that will be sorted in bit reversed order - * @param iterations Length of the data array will be calculated with iterations * FFT Size - */ -void bit_reverse(std::complex *data, unsigned iterations); - -// The function definitions and implementations below this comment are taken from the -// FFT1D example implementation of the Intel FPGA SDK for OpenCL 19.4 -// They are licensed under the following conditions: -// -// Copyright (C) 2013-2019 Altera Corporation, San Jose, California, USA. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this -// software and associated documentation files (the "Software"), to deal in the Software -// without restriction, including without limitation the rights to use, copy, modify, merge, -// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to -// whom the Software is furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in all copies or -// substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. -// -// This agreement shall be governed in all respects by the laws of the State of California and -// by the laws of the United States of America. - -void fourier_transform_gold(bool inverse, const int lognr_points, std::complex *data); - -void fourier_stage(int lognr_points, std::complex *data); - - -#endif // SRC_HOST_NETWORK_FUNCTIONALITY_H_ diff --git a/FFT/src/host/main.cpp b/FFT/src/host/main.cpp index baedf3e5..ce7db149 100644 --- a/FFT/src/host/main.cpp +++ b/FFT/src/host/main.cpp @@ -2,8 +2,9 @@ // Created by Marius Meyer on 04.12.19. 
// -#include "fft_functionality.hpp" -#include "setup/common_benchmark_io.hpp" +#include "fft_benchmark.hpp" + +using namespace fft; /** The program entry point @@ -11,39 +12,12 @@ The program entry point int main(int argc, char *argv[]) { // Setup benchmark - std::shared_ptr programSettings = - parseProgramParameters(argc, argv); - fpga_setup::setupEnvironmentAndClocks(); - std::vector usedDevice = - fpga_setup::selectFPGADevice(programSettings->defaultPlatform, - programSettings->defaultDevice); - cl::Context context = cl::Context(usedDevice); - cl::Program program = fpga_setup::fpgaSetup(&context, usedDevice, - &programSettings->kernelFileName); - - printFinalConfiguration(programSettings, usedDevice[0]); - - std::shared_ptr config( - new bm_execution::ExecutionConfiguration { - context, usedDevice[0], program, - programSettings->numRepetitions - }); - - //TODO implement actual benchmark execution - std::complex* data; - posix_memalign(reinterpret_cast(&data), 64, programSettings->iterations * (1 << LOG_FFT_SIZE) * sizeof(std::complex)); - generateInputData(data, programSettings->iterations); - - auto timing = bm_execution::calculate(config, data, programSettings->iterations, programSettings->inverse); - - auto* verify_data = new std::complex[programSettings->iterations * (1 << LOG_FFT_SIZE)]; - generateInputData(verify_data, programSettings->iterations); - double error = checkFFTResult(verify_data, data, timing->iterations); - - delete [] verify_data; - - printResults(timing); - - return error < 1 ? 
0 : 1; + auto bm = FFTBenchmark(argc, argv); + bool success = bm.executeBenchmark(); + if (success) { + return 0; + } + else { + return 1; + } } - diff --git a/FFT/src/host/program_settings.h b/FFT/src/host/program_settings.h deleted file mode 100644 index 02b3817f..00000000 --- a/FFT/src/host/program_settings.h +++ /dev/null @@ -1,30 +0,0 @@ - -#ifndef SRC_HOST_PROGRAM_SETTINGS_H_ -#define SRC_HOST_PROGRAM_SETTINGS_H_ - -#include "parameters.h" - -/* C++ standard library headers */ -#include - -#include "CL/opencl.h" - -/* -Short description of the program. -Moreover the version and build time is also compiled into the description. -*/ -#define PROGRAM_DESCRIPTION "Implementation of the FFT benchmark"\ - " proposed in the HPCC benchmark suite for FPGA.\n"\ - "Version: " VERSION "\n" - -struct ProgramSettings { - uint numRepetitions; - unsigned iterations; - bool inverse; - int defaultPlatform; - int defaultDevice; - std::string kernelFileName; -}; - - -#endif diff --git a/FFT/tests/CMakeLists.txt b/FFT/tests/CMakeLists.txt index 0c72e04b..519dc96e 100755 --- a/FFT/tests/CMakeLists.txt +++ b/FFT/tests/CMakeLists.txt @@ -1,27 +1,29 @@ -# 'lib' is the folder with Google Test sources -add_subdirectory(../../extern/googletest ${CMAKE_CURRENT_BINARY_DIR}/lib) +add_subdirectory(../../extern/googletest ${CMAKE_BINARY_DIR}/lib/googletest) include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) -include_directories(${CMAKE_BINARY_DIR}/src/common ../../extern/cxxopts/include ../../shared) -include_directories(${CMAKE_SOURCE_DIR}/src/host) +include_directories(${CMAKE_BINARY_DIR}/src/common) -set(PROJECT_SOURCES ../../shared/setup/fpga_setup.cpp ../src/host/common_benchmark_io_implementation.cpp ../src/host/execution_default.cpp ../src/host/fft_functionality.cpp) -set(TEST_SOURCES ../../shared/testing/main.cpp ../../shared/setup/test_fpga_setup.cpp test_fft_functionality.cpp test_execution_functionality.cpp) +set(HOST_EXE_NAME FFT) +set(LIB_NAME fft_lib) + 
+set(TEST_SOURCES main.cpp test_fft_functionality.cpp test_execution_functionality.cpp) if (INTELFPGAOPENCL_FOUND) - include_directories(${IntelFPGAOpenCL_INCLUDE_DIRS}) - add_executable(Test_intel ${TEST_SOURCES} ${PROJECT_SOURCES}) - target_link_libraries(Test_intel gtest gmock ${IntelFPGAOpenCL_LIBRARIES}) - target_compile_definitions(Test_intel PRIVATE -DINTEL_FPGA) - add_dependencies(Test_intel fft1d_float_8_emulate_intel) - add_test(NAME test_intel_unit COMMAND $ -f fft1d_float_8_emulate.aocx WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + include_directories(SYSTEM ${IntelFPGAOpenCL_INCLUDE_DIRS}) + add_executable(${HOST_EXE_NAME}_test_intel ${TEST_SOURCES} ${PROJECT_SOURCES}) + target_link_libraries(${HOST_EXE_NAME}_test_intel gtest gmock ${LIB_NAME}_intel ${IntelFPGAOpenCL_LIBRARIES} "${OpenMP_CXX_FLAGS}") + add_dependencies(${HOST_EXE_NAME}_test_intel fft1d_float_8_emulate_intel) + target_compile_definitions(${HOST_EXE_NAME}_test_intel PRIVATE -DINTEL_FPGA) + target_compile_options(${HOST_EXE_NAME}_test_intel PRIVATE "${OpenMP_CXX_FLAGS}") + add_test(NAME ${HOST_EXE_NAME}_test_intel_unit COMMAND $ -f fft1d_float_8_emulate.aocx WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() if (Vitis_FOUND) - include_directories(${Vitis_INCLUDE_DIRS}) - add_executable(Test_xilinx ${TEST_SOURCES} ${PROJECT_SOURCES}) - target_link_libraries(Test_xilinx gtest gmock ${Vitis_LIBRARIES}) - target_compile_definitions(Test_xilinx PRIVATE -DXILINX_FPGA) - add_dependencies(Test_xilinx fft1d_float_8_emulate_xilinx) - add_test(NAME test_xilinx_unit COMMAND $ -f fft1d_float_8_emulate.xclbin WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + include_directories(SYSTEM ${Vitis_INCLUDE_DIRS}) + add_executable(${HOST_EXE_NAME}_test_xilinx ${TEST_SOURCES} ${PROJECT_SOURCES}) + target_link_libraries(${HOST_EXE_NAME}_test_xilinx gtest gmock ${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") + add_dependencies(${HOST_EXE_NAME}_test_xilinx fft1d_float_8_emulate_xilinx) + 
target_compile_definitions(${HOST_EXE_NAME}_test_xilinx PRIVATE -DXILINX_FPGA) + target_compile_options(${HOST_EXE_NAME}_test_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") + add_test(NAME ${HOST_EXE_NAME}_test_xilinx_unit COMMAND $ -f fft1d_float_8_emulate.xclbin WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() diff --git a/FFT/tests/main.cpp b/FFT/tests/main.cpp new file mode 100644 index 00000000..61bb1701 --- /dev/null +++ b/FFT/tests/main.cpp @@ -0,0 +1,69 @@ +/* +Copyright (c) 2020 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +/* Project's headers */ +#include "test_program_settings.h" +#include "gtest/gtest.h" +#include "CL/cl.hpp" + +#ifdef _USE_MPI_ +#include "mpi.h" + +class MPIEnvironment : public ::testing::Environment { +public: + MPIEnvironment(int* argc, char** argv[]) { + MPI_Init(argc, argv); + } + + ~MPIEnvironment() override { + MPI_Finalize(); + } +}; +#endif + +int global_argc; +char** global_argv; + +/** +The program entry point for the unit tests +*/ +int +main(int argc, char *argv[]) { + + std::cout << "THIS BINARY EXECUTES UNIT TESTS FOR THE FOLLOWING BENCHMARK:" << std::endl << std::endl; + + ::testing::InitGoogleTest(&argc, argv); + + global_argc = argc; + global_argv = argv; + +#ifdef _USE_MPI_ + ::testing::Environment* const mpi_env = + ::testing::AddGlobalTestEnvironment(new MPIEnvironment(&argc, &argv)); +#endif + + bool result = RUN_ALL_TESTS(); + + return result; + +} + diff --git a/FFT/tests/test_execution_functionality.cpp b/FFT/tests/test_execution_functionality.cpp index fe2cb27e..8b64d0a1 100644 --- a/FFT/tests/test_execution_functionality.cpp +++ b/FFT/tests/test_execution_functionality.cpp @@ -4,241 +4,161 @@ #include #include "gtest/gtest.h" -#include "../src/host/execution.h" +#include "fft_benchmark.hpp" #include "parameters.h" -#include "setup/fpga_setup.hpp" -#include "../src/host/fft_functionality.hpp" -#include "testing/test_program_settings.h" +#include "test_program_settings.h" -struct OpenCLKernelTest : testing::Test { - std::string kernelFileName = "fft1d_float_8_emulate.aocx"; - std::shared_ptr config; - unsigned repetitions = 10; +struct FFTKernelTest : testing::Test { + std::unique_ptr bm; + std::unique_ptr data; - OpenCLKernelTest() { - kernelFileName = programSettings->kernelFileName; - setupFPGA(); + FFTKernelTest() { + bm = std::unique_ptr(new fft::FFTBenchmark(global_argc, global_argv)); + bm->getExecutionSettings().programSettings->numRepetitions = 1; + bm->getExecutionSettings().programSettings->inverse = false; + data = 
bm->generateInputData(); } -// TODO fix test - void setupFPGA() { - std::vector device = fpga_setup::selectFPGADevice(DEFAULT_PLATFORM, DEFAULT_DEVICE); - cl::Context context(device[0]); - cl::Program program = fpga_setup::fpgaSetup(&context, device, &kernelFileName); - config = std::make_shared( - bm_execution::ExecutionConfiguration{ - context, device[0], program, - repetitions - }); + ~FFTKernelTest() override { + bm = nullptr; + data = nullptr; } -}; -/** - * Parametrized test takes a tuple of 1 parameter: - * - name of the emulation bitstream - */ -struct DifferentOpenCLKernelTest : OpenCLKernelTest, testing::WithParamInterface { - DifferentOpenCLKernelTest() { - auto params = GetParam(); - kernelFileName = params; - setupFPGA(); - } }; /** * Tests if calculate returns the correct execution results */ -TEST_P(DifferentOpenCLKernelTest, CalculateReturnsCorrectExecutionResultFor11False) { - config->repetitions = 1; - std::complex * data; - posix_memalign(reinterpret_cast(&data), 64, sizeof(std::complex) * (1 << LOG_FFT_SIZE)); - auto result = bm_execution::calculate(config, data, 1, false); - EXPECT_EQ(1, result->iterations); - EXPECT_EQ(false, result->inverse); - EXPECT_EQ(1, result->calculationTimings.size()); +TEST_F(FFTKernelTest, CalculateReturnsCorrectExecutionResultFor11False) { + bm->getExecutionSettings().programSettings->numRepetitions = 1; + data = bm->generateInputData(); + auto result = bm->executeKernel(*data); + EXPECT_EQ(1, result->timings.size()); } /** * Tests if calculate returns the correct execution results for multiple repetitions */ -TEST_P(DifferentOpenCLKernelTest, CalculateReturnsCorrectExecutionResultFor24True) { - config->repetitions = 2; - std::complex * data; - posix_memalign(reinterpret_cast(&data), 64, sizeof(std::complex) * (1 << LOG_FFT_SIZE) * 4); - auto result = bm_execution::calculate(config, data, 4, true); - EXPECT_EQ(4, result->iterations); - EXPECT_EQ(true, result->inverse); - EXPECT_EQ(2, 
result->calculationTimings.size()); +TEST_F(FFTKernelTest, CalculateReturnsCorrectExecutionResultFor24True) { + bm->getExecutionSettings().programSettings->numRepetitions = 2; + data = bm->generateInputData(); + auto result = bm->executeKernel(*data); + EXPECT_EQ(2, result->timings.size()); } /** * Check if FFT of zeros returns just zeros */ -TEST_P (DifferentOpenCLKernelTest, FFTReturnsZero) { - std::complex * data; - posix_memalign(reinterpret_cast(&data), 64, sizeof(std::complex) * (1 << LOG_FFT_SIZE)); +TEST_F(FFTKernelTest, FFTReturnsZero) { for (int i=0; i<(1 << LOG_FFT_SIZE); i++) { - data[i].real(0.0); - data[i].imag(0.0); + data->data[i].real(0.0); + data->data[i].imag(0.0); } - auto result = bm_execution::calculate(config, data, 1, false); + auto result = bm->executeKernel(*data); for (int i=0; i<(1 << LOG_FFT_SIZE); i++) { - EXPECT_FLOAT_EQ(std::abs(data[i]), 0.0); + EXPECT_FLOAT_EQ(std::abs(data->data[i]), 0.0); } - free(data); } /** * Check if FFT calculates the correct result for all number being 1.0,1.0i */ -TEST_P (DifferentOpenCLKernelTest, FFTCloseToZeroForAll1And1) { - std::complex * data; - posix_memalign(reinterpret_cast(&data), 64, sizeof(std::complex) * (1 << LOG_FFT_SIZE)); +TEST_F(FFTKernelTest, FFTCloseToZeroForAll1And1) { for (int i=0; i<(1 << LOG_FFT_SIZE); i++) { - data[i].real(1.0); - data[i].imag(1.0); + data->data[i].real(1.0); + data->data[i].imag(1.0); } - auto result = bm_execution::calculate(config, data, 1, false); - EXPECT_NEAR(data[0].real(), (1 << LOG_FFT_SIZE), 0.00001); - EXPECT_NEAR(data[0].imag(), (1 << LOG_FFT_SIZE), 0.00001); + auto result = bm->executeKernel(*data); + EXPECT_NEAR(data->data[0].real(), (1 << LOG_FFT_SIZE), 0.00001); + EXPECT_NEAR(data->data[0].imag(), (1 << LOG_FFT_SIZE), 0.00001); for (int i=1; i < (1 << LOG_FFT_SIZE); i++) { - EXPECT_NEAR(data[i].real(), 0.0, 0.00001); - EXPECT_NEAR(data[i].imag(), 0.0, 0.00001); + EXPECT_NEAR(data->data[i].real(), 0.0, 0.00001); + EXPECT_NEAR(data->data[i].imag(), 
0.0, 0.00001); } - free(data); } /** * Check if iFFT calculates the correct result for all number being 1.0,1.0i */ -TEST_P (DifferentOpenCLKernelTest, IFFTCloseToZeroForAll1And1) { - std::complex * data; - posix_memalign(reinterpret_cast(&data), 64, sizeof(std::complex) * (1 << LOG_FFT_SIZE)); +TEST_F(FFTKernelTest, IFFTCloseToZeroForAll1And1) { for (int i=0; i<(1 << LOG_FFT_SIZE); i++) { - data[i].real(1.0); - data[i].imag(0.0); + data->data[i].real(1.0); + data->data[i].imag(0.0); } - auto result = bm_execution::calculate(config, data, 1, true); - EXPECT_NEAR(data[0].real(), static_cast(1 << LOG_FFT_SIZE), 0.00001); - EXPECT_NEAR(data[0].imag(), 0.0, 0.00001); + auto result = bm->executeKernel(*data); + EXPECT_NEAR(data->data[0].real(), static_cast(1 << LOG_FFT_SIZE), 0.00001); + EXPECT_NEAR(data->data[0].imag(), 0.0, 0.00001); for (int i=1; i < (1 << LOG_FFT_SIZE); i++) { - EXPECT_NEAR(data[i].real(), 0.0, 0.00001); - EXPECT_NEAR(data[i].imag(), 0.0, 0.00001); + EXPECT_NEAR(data->data[i].real(), 0.0, 0.00001); + EXPECT_NEAR(data->data[i].imag(), 0.0, 0.00001); } - free(data); } /** * Check if calling FFt and iFFT result in data that is close to the original data with small error */ -TEST_P (DifferentOpenCLKernelTest, FFTandiFFTProduceResultCloseToSource) { - std::complex * data; - posix_memalign(reinterpret_cast(&data), 64, sizeof(std::complex) * (1 << LOG_FFT_SIZE)); - std::complex * verify_data; - posix_memalign(reinterpret_cast(&verify_data), 64, sizeof(std::complex) * (1 << LOG_FFT_SIZE)); - - generateInputData(data, 1); - generateInputData(verify_data, 1); +TEST_F(FFTKernelTest, FFTandiFFTProduceResultCloseToSource) { + auto verify_data = bm->generateInputData(); - auto result = bm_execution::calculate(config, data, 1, false); + auto result = bm->executeKernel(*data); // Normalize iFFT result for (int i=0; i<(1 << LOG_FFT_SIZE); i++) { - data[i] /= (1 << LOG_FFT_SIZE); + data->data[i] /= (1 << LOG_FFT_SIZE); } // Need to again bit reverse input for iFFT - 
bit_reverse(data, 1); - auto result2 = bm_execution::calculate(config, data, 1, true); + fft::bit_reverse(data->data, 1); + auto result2 = bm->executeKernel(*data); // Since data was already sorted by iFFT the bit reversal of the kernel has t be undone - bit_reverse(data, 1); + fft::bit_reverse(data->data, 1); for (int i=1; i < (1 << LOG_FFT_SIZE); i++) { - EXPECT_NEAR(std::abs(data[i]), std::abs(verify_data[i]), 0.001); + EXPECT_NEAR(std::abs(data->data[i]), std::abs(verify_data->data[i]), 0.001); } - free(data); - free(verify_data); -} - -/** - * Check the included FFT error function on the host code on data produced by FFT - */ -TEST_P (DifferentOpenCLKernelTest, FFTErrorCheck) { - std::complex * data; - posix_memalign(reinterpret_cast(&data), 64, sizeof(std::complex) * (1 << LOG_FFT_SIZE)); - std::complex * verify_data; - posix_memalign(reinterpret_cast(&verify_data), 64, sizeof(std::complex) * (1 << LOG_FFT_SIZE)); - - generateInputData(data, 1); - generateInputData(verify_data, 1); - - auto result = bm_execution::calculate(config, data, 1, false); - - // Need to again bit reverse input for iFFT - double error = checkFFTResult(verify_data, data, 1); - - EXPECT_NEAR(error, 0.0, 1.0); - - free(data); - free(verify_data); } /** * Check if FPGA FFT and reference FFT give the same results */ -TEST_P (DifferentOpenCLKernelTest, FPGAFFTAndCPUFFTGiveSameResults) { - std::complex * data; - posix_memalign(reinterpret_cast(&data), 64, sizeof(std::complex) * (1 << LOG_FFT_SIZE)); - std::complex * data2; - posix_memalign(reinterpret_cast(&data2), 64, sizeof(std::complex) * (1 << LOG_FFT_SIZE)); +TEST_F(FFTKernelTest, FPGAFFTAndCPUFFTGiveSameResults) { + auto verify_data = bm->generateInputData(); - generateInputData(data, 1); - generateInputData(data2, 1); + auto result = bm->executeKernel(*data); - auto result = bm_execution::calculate(config, data, 1, false); - - fourier_transform_gold(false,LOG_FFT_SIZE,data2); - bit_reverse(data2, 1); + 
fft::fourier_transform_gold(false,LOG_FFT_SIZE,verify_data->data); + fft::bit_reverse(verify_data->data, 1); // Normalize iFFT result for (int i=0; i<(1 << LOG_FFT_SIZE); i++) { - data[i] -= data2[i]; + data->data[i] -= verify_data->data[i]; } for (int i=1; i < (1 << LOG_FFT_SIZE); i++) { - EXPECT_NEAR(std::abs(data[i]), 0.0, 0.001); + EXPECT_NEAR(std::abs(data->data[i]), 0.0, 0.001); } - free(data); - free(data2); } /** * Check if FPGA iFFT and reference iFFT give the same results */ -TEST_P (DifferentOpenCLKernelTest, FPGAiFFTAndCPUiFFTGiveSameResults) { - std::complex * data; - posix_memalign(reinterpret_cast(&data), 64, sizeof(std::complex) * (1 << LOG_FFT_SIZE)); - std::complex * data2; - posix_memalign(reinterpret_cast(&data2), 64, sizeof(std::complex) * (1 << LOG_FFT_SIZE)); - - generateInputData(data, 1); - generateInputData(data2, 1); +TEST_F(FFTKernelTest, FPGAiFFTAndCPUiFFTGiveSameResults) { + auto verify_data = bm->generateInputData(); - auto result = bm_execution::calculate(config, data, 1, true); + bm->getExecutionSettings().programSettings->inverse = true; + auto result = bm->executeKernel(*data); - fourier_transform_gold(true,LOG_FFT_SIZE,data2); - bit_reverse(data2, 1); + fft::fourier_transform_gold(true,LOG_FFT_SIZE,verify_data->data); + fft::bit_reverse(verify_data->data, 1); // Normalize iFFT result for (int i=0; i<(1 << LOG_FFT_SIZE); i++) { - data[i] -= data2[i]; + data->data[i] -= verify_data->data[i]; } for (int i=1; i < (1 << LOG_FFT_SIZE); i++) { - EXPECT_NEAR(std::abs(data[i]), 0.0, 0.001); + EXPECT_NEAR(std::abs(data->data[i]), 0.0, 0.001); } - free(data); - free(data2); } diff --git a/FFT/tests/test_fft_functionality.cpp b/FFT/tests/test_fft_functionality.cpp index 1d3f2e83..75e00d60 100644 --- a/FFT/tests/test_fft_functionality.cpp +++ b/FFT/tests/test_fft_functionality.cpp @@ -3,103 +3,105 @@ // #include "gtest/gtest.h" -#include "../src/host/fft_functionality.hpp" +#include "fft_benchmark.hpp" #include "parameters.h" +#include 
"test_program_settings.h" +struct FFTHostTest : testing::Test { + std::unique_ptr bm; + std::unique_ptr data; + + FFTHostTest() { + bm = std::unique_ptr(new fft::FFTBenchmark(global_argc, global_argv)); + bm->getExecutionSettings().programSettings->numRepetitions = 1; + bm->getExecutionSettings().programSettings->inverse = false; + data = bm->generateInputData(); + } + + ~FFTHostTest() override { + bm = nullptr; + data = nullptr; + } + +}; /** * Check if data generator generates reproducable inputs */ -TEST (FPGASetup, DataInputReproducible) { - auto *data1 = new std::complex[(1 << LOG_FFT_SIZE)]; - auto *data2 = new std::complex[(1 << LOG_FFT_SIZE)]; - generateInputData(data1, 1); - generateInputData(data2, 1); +TEST_F(FFTHostTest, DataInputReproducible) { + auto data2 = bm->generateInputData(); for (int i=0; i < (1 << LOG_FFT_SIZE); i++) { - EXPECT_FLOAT_EQ(data1[i].real(), data2[i].real()); - EXPECT_FLOAT_EQ(data1[i].imag(), data2[i].imag()); + EXPECT_FLOAT_EQ(data->data[i].real(), data2->data[i].real()); + EXPECT_FLOAT_EQ(data->data[i].imag(), data2->data[i].imag()); } - delete [] data1; - delete [] data2; } /** * Check if FFT of zeros returns just zeros */ -TEST (FPGASetup, FFTReturnsZero) { - auto *data = new std::complex[(1 << LOG_FFT_SIZE)]; +TEST_F(FFTHostTest, FFTReturnsZero) { for (int i=0; i<(1 << LOG_FFT_SIZE); i++) { - data[i].real(0.0); - data[i].imag(0.0); + data->data[i].real(0.0); + data->data[i].imag(0.0); } - fourier_transform_gold(false, LOG_FFT_SIZE, data); + fft::fourier_transform_gold(false, LOG_FFT_SIZE, data->data); for (int i=0; i<(1 << LOG_FFT_SIZE); i++) { - EXPECT_FLOAT_EQ(std::abs(data[i]), 0.0); + EXPECT_FLOAT_EQ(std::abs(data->data[i]), 0.0); } - delete [] data; } /** * Check if FFT calculates the correct result for all number being 1.0,1.0i */ -TEST (FPGASetup, FFTCloseToZeroForAll1And1) { - auto *data = new std::complex[(1 << LOG_FFT_SIZE)]; +TEST_F(FFTHostTest, FFTCloseToZeroForAll1And1) { for (int i=0; i<(1 << LOG_FFT_SIZE); 
i++) { - data[i].real(1.0); - data[i].imag(1.0); + data->data[i].real(1.0); + data->data[i].imag(1.0); } - fourier_transform_gold(false, LOG_FFT_SIZE, data); - EXPECT_NEAR(data[0].real(), (1 << LOG_FFT_SIZE), 0.00001); - EXPECT_NEAR(data[0].imag(), (1 << LOG_FFT_SIZE), 0.00001); + fft::fourier_transform_gold(false, LOG_FFT_SIZE, data->data); + EXPECT_NEAR(data->data[0].real(), (1 << LOG_FFT_SIZE), 0.00001); + EXPECT_NEAR(data->data[0].imag(), (1 << LOG_FFT_SIZE), 0.00001); for (int i=1; i < (1 << LOG_FFT_SIZE); i++) { - EXPECT_NEAR(data[i].real(), 0.0, 0.00001); - EXPECT_NEAR(data[i].imag(), 0.0, 0.00001); + EXPECT_NEAR(data->data[i].real(), 0.0, 0.00001); + EXPECT_NEAR(data->data[i].imag(), 0.0, 0.00001); } - delete [] data; } /** * Check if iFFT calculates the correct result for all number being 1.0,1.0i */ -TEST (FPGASetup, IFFTCloseToZeroForAll1And1) { - auto *data = new std::complex[(1 << LOG_FFT_SIZE)]; +TEST_F(FFTHostTest, IFFTCloseToZeroForAll1And1) { for (int i=0; i<(1 << LOG_FFT_SIZE); i++) { - data[i].real(1.0); - data[i].imag(1.0); + data->data[i].real(1.0); + data->data[i].imag(1.0); } - fourier_transform_gold(true, LOG_FFT_SIZE, data); - EXPECT_NEAR(data[0].real(), (1 << LOG_FFT_SIZE), 0.00001); - EXPECT_NEAR(data[0].imag(), (1 << LOG_FFT_SIZE), 0.00001); + fft::fourier_transform_gold(true, LOG_FFT_SIZE, data->data); + EXPECT_NEAR(data->data[0].real(), (1 << LOG_FFT_SIZE), 0.00001); + EXPECT_NEAR(data->data[0].imag(), (1 << LOG_FFT_SIZE), 0.00001); for (int i=1; i < (1 << LOG_FFT_SIZE); i++) { - EXPECT_NEAR(data[i].real(), 0.0, 0.00001); - EXPECT_NEAR(data[i].imag(), 0.0, 0.00001); + EXPECT_NEAR(data->data[i].real(), 0.0, 0.00001); + EXPECT_NEAR(data->data[i].imag(), 0.0, 0.00001); } - delete [] data; } /** * Check if FFT and FFT check give low error when FFT is calculated directly */ -TEST (FPGASetup, FFTandiFFTProduceResultCloseToSource) { - auto *data = new std::complex[(1 << LOG_FFT_SIZE)]; - auto *verify_data = new std::complex[(1 << 
LOG_FFT_SIZE)]; - generateInputData(data, 1); - generateInputData(verify_data, 1); +TEST_F(FFTHostTest, FFTandiFFTProduceResultCloseToSource) { + auto verify_data = bm->generateInputData(); - fourier_transform_gold(false, LOG_FFT_SIZE, data); - fourier_transform_gold(true, LOG_FFT_SIZE, data); + fft::fourier_transform_gold(false, LOG_FFT_SIZE, data->data); + fft::fourier_transform_gold(true, LOG_FFT_SIZE, data->data); // Normalize iFFT result for (int i=1; i < (1 << LOG_FFT_SIZE); i++) { - data[i] /= (1 << LOG_FFT_SIZE); + data->data[i] /= (1 << LOG_FFT_SIZE); } for (int i=1; i < (1 << LOG_FFT_SIZE); i++) { - EXPECT_NEAR(std::abs(data[i]), std::abs(verify_data[i]), 0.001); + EXPECT_NEAR(std::abs(data->data[i]), std::abs(verify_data->data[i]), 0.001); } - delete [] data; - delete [] verify_data; } \ No newline at end of file diff --git a/FFT/tests/test_program_settings.h b/FFT/tests/test_program_settings.h new file mode 100644 index 00000000..89e2070f --- /dev/null +++ b/FFT/tests/test_program_settings.h @@ -0,0 +1,25 @@ +/* +Copyright (c) 2020 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + + +extern int global_argc; +extern char** global_argv; \ No newline at end of file diff --git a/scripts/evaluation/parse_raw_to_csv.py b/scripts/evaluation/parse_raw_to_csv.py index daf7f0c4..23b7d542 100755 --- a/scripts/evaluation/parse_raw_to_csv.py +++ b/scripts/evaluation/parse_raw_to_csv.py @@ -9,7 +9,7 @@ import sys # Regular expressions for the raw output of all -fft_regex = "Version:\\s+(?P.+)\n(.*\n)+FFT\\sSize:\\s+(?P\d+)\nData\\sSize:\\s+(?P\d+)(.*\n)+Device:\\s+(?P.+)\n(.*\n)+\\s+res\.\\serror\\s+mach\.\\seps\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+avg\\s+best\n\\s+Time\\s+in\\s+s:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+GFLOPS:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" +fft_regex = "Version:\\s+(?P.+)\n(.*\n)+Batch\\sSize\\s+(?P\d+)\nFFT\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+res\.\\serror\\s+mach\.\\seps\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+avg\\s+best\n\\s+Time\\s+in\\s+s:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+GFLOPS:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" gemm_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+best\\s+mean\\s+GFLOPS\n\\s+(?P.+)\\s+(?P.+)\\s+(?P.+)" ra_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+Error:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GUOPS\n\\s+(?P.+)\\s+(?P.+)\\s+(?P.+)" trans_regex = 
"Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s*Maximum\\serror:\\s+(?P(\d|\.|\+|-|e)+)\n\\s+trans\\s+calc\\s+calc\\s+FLOPS\\s+total\\s+FLOPS\n\\s*avg:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s*best:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" From d63f3c15331f676eb2ceefc84b6fc988b95d2aed Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 19 May 2020 18:34:46 +0200 Subject: [PATCH 38/45] Small fixes for Intel SDK and Nocuta --- GEMM/src/host/execution_cannon.cpp | 8 ++++---- shared/CMakeLists.txt | 14 +++++++++++++- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/GEMM/src/host/execution_cannon.cpp b/GEMM/src/host/execution_cannon.cpp index 7e4ded52..927997a8 100644 --- a/GEMM/src/host/execution_cannon.cpp +++ b/GEMM/src/host/execution_cannon.cpp @@ -52,13 +52,13 @@ calculate(hpcc_base::ExecutionSettings const& config, // Create Command queue cl::CommandQueue compute_queue(*config.context, *config.device); #ifdef INTEL_FPGA - cl::Buffer Buffer_a(*config.context, CL_MEM_READ_WRITE | (config->useMemInterleaving ? 0 :CL_CHANNEL_1_INTELFPGA), + cl::Buffer Buffer_a(*config.context, CL_MEM_READ_WRITE | (config.programSettings->useMemoryInterleaving ? 0 :CL_CHANNEL_1_INTELFPGA), sizeof(HOST_DATA_TYPE)*config.programSettings->matrixSize*config.programSettings->matrixSize); - cl::Buffer Buffer_b(*config.context, CL_MEM_READ_WRITE | (config->useMemInterleaving ? 0 :CL_CHANNEL_2_INTELFPGA), + cl::Buffer Buffer_b(*config.context, CL_MEM_READ_WRITE | (config.programSettings->useMemoryInterleaving ? 0 :CL_CHANNEL_2_INTELFPGA), sizeof(cl_int)*config.programSettings->matrixSize*config.programSettings->matrixSize); - cl::Buffer Buffer_c_in(*config.context, CL_MEM_READ_WRITE | (config->useMemInterleaving ? 
0 :CL_CHANNEL_3_INTELFPGA), + cl::Buffer Buffer_c_in(*config.context, CL_MEM_READ_WRITE | (config.programSettings->useMemoryInterleaving ? 0 :CL_CHANNEL_3_INTELFPGA), sizeof(cl_int)*config.programSettings->matrixSize*config.programSettings->matrixSize); - cl::Buffer Buffer_c_out(*config.context, CL_MEM_READ_WRITE | (config->useMemInterleaving ? 0 :CL_CHANNEL_4_INTELFPGA), + cl::Buffer Buffer_c_out(*config.context, CL_MEM_READ_WRITE | (config.programSettings->useMemoryInterleaving ? 0 :CL_CHANNEL_4_INTELFPGA), sizeof(cl_int)*config.programSettings->matrixSize*config.programSettings->matrixSize); #else cl::Buffer Buffer_a(*config.context, CL_MEM_READ_WRITE, diff --git a/shared/CMakeLists.txt b/shared/CMakeLists.txt index ac6789bd..3821b1a9 100644 --- a/shared/CMakeLists.txt +++ b/shared/CMakeLists.txt @@ -1,5 +1,17 @@ -project(HPCCBaseLibrary VERSION 1.0.0) +project(HPCCBaseLibrary VERSION 1.0.1) add_library(hpcc_fpga_base STATIC ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp) +find_package(OpenCL) + +if (INTELFPGAOPENCL_FOUND) + target_include_directories(hpcc_fpga_base PUBLIC ${IntelFPGAOpenCL_INCLUDE_DIRS}) +elseif(Vitis_FOUND) + target_include_directories(hpcc_fpga_base PUBLIC ${Vitis_INCLUDE_DIRS}) +elseif(OpenCL_FOUND) + target_include_directories(hpcc_fpga_base PUBLIC ${OpenCL_INCLUDE_DIRS}) +else() + message(ERROR "No OpenCL header found on system!") +endif() + target_include_directories(hpcc_fpga_base PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/../extern/cxxopts/include) From 83ae6cae0dc9710d532b9ca883afb948fd6328e0 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 20 May 2020 09:40:20 +0200 Subject: [PATCH 39/45] Fix LINPACK unit tests --- .../test_kernel_functionality_and_host_integration.cpp | 2 +- .../tests/test_kernel_functionality_separate_cores.cpp | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp 
b/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp index a2722188..86e56185 100644 --- a/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp +++ b/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp @@ -44,7 +44,7 @@ TEST_F(LinpackKernelTest, FPGACorrectResultsOneRepetition) { /** * Execution returns correct results for a single repetition */ -TEST_F(OpenCLKernelTest, ValidationWorksForMKL) { +TEST_F(LinpackKernelTest, ValidationWorksForMKL) { int info; auto data_cpu = bm->generateInputData(); diff --git a/LINPACK/tests/test_kernel_functionality_separate_cores.cpp b/LINPACK/tests/test_kernel_functionality_separate_cores.cpp index fb761b7c..ec580769 100644 --- a/LINPACK/tests/test_kernel_functionality_separate_cores.cpp +++ b/LINPACK/tests/test_kernel_functionality_separate_cores.cpp @@ -15,15 +15,17 @@ struct LinpackKernelSeparateTest : testing::Test, testing::WithParamInterface bm; + char* kernelFileName; LinpackKernelSeparateTest() { int argc = 3; std::string str_param = GetParam(); - std::vector param(str_param.c_str(), str_param.c_str() + str_param.size() + 1); - char* argv[3] = {"Test", "-f", reinterpret_cast(¶m)}; + kernelFileName = new char[str_param.length() + 1]; + std::strcpy(kernelFileName, str_param.c_str()); + char* argv[3] = {"Test", "-f", kernelFileName}; + bm = std::unique_ptr(new linpack::LinpackBenchmark(argc, argv)); array_size = (1 << LOCAL_MEM_BLOCK_LOG); bm->getExecutionSettings().programSettings->matrixSize = array_size; - bm = std::unique_ptr(new linpack::LinpackBenchmark(argc, argv)); posix_memalign(reinterpret_cast(&A), 4096, sizeof(HOST_DATA_TYPE) * array_size * array_size); posix_memalign(reinterpret_cast(&B), 4096, @@ -135,6 +137,7 @@ struct LinpackKernelSeparateTest : testing::Test, testing::WithParamInterface Date: Wed, 20 May 2020 14:39:35 +0200 Subject: [PATCH 40/45] Enable MPI support for base implementation --- shared/include/hpcc_benchmark.hpp | 45 ++++++++++++++++++++++++------- 1 file 
changed, 36 insertions(+), 9 deletions(-) diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 7e5f0125..75f6a2fe 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -32,6 +32,11 @@ SOFTWARE. /* External library headers */ #include "CL/cl.hpp" +#ifdef _USE_MPI_ +#include "mpi.h" +#endif + + #define STR_EXPAND(tok) #tok #define STR(tok) STR_EXPAND(tok) @@ -312,8 +317,15 @@ class HpccFpgaBenchmark { executionSettings = std::unique_ptr>(new ExecutionSettings(std::move(programSettings), std::move(usedDevice), std::move(context), std::move(program))); + // Get the rank of the process + int world_rank = 0; - printFinalConfiguration(*executionSettings); +#ifdef _USE_MPI_ + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); +#endif + if (world_rank == 0) { + printFinalConfiguration(*executionSettings); + } } /** @@ -325,23 +337,38 @@ class HpccFpgaBenchmark { */ bool executeBenchmark() { + // Get the rank of the process + int world_rank = 0; + +#ifdef _USE_MPI_ + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); +#endif + if (!executionSettings.get()) { std::cerr << "Benchmark execution started without running the benchmark setup!" << std::endl; exit(1); } - std::cout << HLINE << "Start benchmark using the given configuration. Generating data..." << std::endl - << HLINE; + if (world_rank == 0) { + std::cout << HLINE << "Start benchmark using the given configuration. Generating data..." << std::endl + << HLINE; + } std::unique_ptr data = generateInputData(); - std::cout << HLINE << "Execute benchmark kernel..." << std::endl - << HLINE; + if (world_rank == 0) { + std::cout << HLINE << "Execute benchmark kernel..." << std::endl + << HLINE; + } std::unique_ptr output = executeKernel(*data); - std::cout << HLINE << "Validate output..." << std::endl - << HLINE; - + if (world_rank == 0) { + std::cout << HLINE << "Validate output..." 
<< std::endl + << HLINE; + } + bool validateSuccess = validateOutputAndPrintError(*data); - printResults(*output); + if (world_rank == 0) { + printResults(*output); + } return validateSuccess; } From 60fa299efea160e0f5738583d74242651e88cb47 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 20 May 2020 14:40:35 +0200 Subject: [PATCH 41/45] Adapt b_eff to OO code --- b_eff/CHANGELOG | 5 + b_eff/CMakeLists.txt | 2 +- b_eff/README.md | 14 +- b_eff/src/common/parameters.h.in | 9 + b_eff/src/device/CMakeLists.txt | 4 +- b_eff/src/host/CMakeLists.txt | 25 +- .../common_benchmark_io_implementation.cpp | 80 ------- b_eff/src/host/execution.h | 20 +- b_eff/src/host/execution_default.cpp | 16 +- b_eff/src/host/main.cpp | 110 +-------- b_eff/src/host/network_benchmark.cpp | 196 ++++++++++++++++ b_eff/src/host/network_benchmark.hpp | 218 ++++++++++++++++++ b_eff/src/host/network_functionality.cpp | 104 --------- b_eff/src/host/program_settings.h | 31 --- b_eff/tests/CMakeLists.txt | 28 +-- b_eff/tests/main.cpp | 69 ++++++ ...nel_functionality_and_host_integration.cpp | 112 +++++---- .../test_program_settings.h} | 30 +-- 18 files changed, 626 insertions(+), 447 deletions(-) delete mode 100644 b_eff/src/host/common_benchmark_io_implementation.cpp create mode 100644 b_eff/src/host/network_benchmark.cpp create mode 100644 b_eff/src/host/network_benchmark.hpp delete mode 100644 b_eff/src/host/network_functionality.cpp delete mode 100644 b_eff/src/host/program_settings.h create mode 100644 b_eff/tests/main.cpp rename b_eff/{src/host/network_functionality.hpp => tests/test_program_settings.h} (65%) diff --git a/b_eff/CHANGELOG b/b_eff/CHANGELOG index 48a3cac4..4017002c 100755 --- a/b_eff/CHANGELOG +++ b/b_eff/CHANGELOG @@ -2,6 +2,11 @@ This file contains all changes made to the source code for each release. 
+## 1.1.1 + +#### Changed: +- Converted host code to new OO code + ## 1.1 #### Fixes: diff --git a/b_eff/CMakeLists.txt b/b_eff/CMakeLists.txt index 26d7724a..e0eed444 100755 --- a/b_eff/CMakeLists.txt +++ b/b_eff/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.1) -project(b_eff VERSION 1.1) +project(b_eff VERSION 1.1.1) set(SEND_KERNEL_NAME send) set(RECV_KERNEL_NAME recv) diff --git a/b_eff/README.md b/b_eff/README.md index 23bb4213..3c905605 100644 --- a/b_eff/README.md +++ b/b_eff/README.md @@ -24,8 +24,8 @@ The targets below can be used to build the benchmark and its kernels: | Target | Description | | -------- | ---------------------------------------------- | - | fnet | Builds the host application | - | Google_Tests_run| Compile the tests and its dependencies | + | Network_intel | Builds the host application | + | Network_test_intel| Compile the tests and its dependencies | More over the are additional targets to generate kernel reports and bitstreams. The provided kernel is optimized for the Bittware 520N board with four external @@ -46,7 +46,7 @@ The targets below can be used to build the benchmark and its kernels: mkdir build && cd build cmake .. - make fnet + make Network_intel You will find all executables and kernel files in the `bin` folder of your build directory. @@ -64,16 +64,16 @@ of the Intel FPGA SDK installation. All binaries and FPGA bitstreams can be found in the `bin` directory with in the build directory. For execution of the benchmark run: - ./fnet -f path_to_kernel.aocx + ./Network_intel -f path_to_kernel.aocx For more information on available input parameters run - $./fnet -h + $./Network_intel -h Implementation of the effective bandwidth benchmark proposed in the HPCC benchmark suite for FPGA. Version: "1.1" Usage: - ./fnet [OPTION...] + ./Network_intel [OPTION...] 
-f, --file arg Kernel file name -n, arg Number of repetitions (default: 10) @@ -89,7 +89,7 @@ For more information on available input parameters run To execute the unit and integration tests run - ./Google_Tests_run + ./Network_test_intel -f KERNEL_FILE_NAME in the `bin` folder within the build directory. It will run an emulation of the kernel and execute some functionality tests. diff --git a/b_eff/src/common/parameters.h.in b/b_eff/src/common/parameters.h.in index b34757a8..4037ba64 100644 --- a/b_eff/src/common/parameters.h.in +++ b/b_eff/src/common/parameters.h.in @@ -16,6 +16,15 @@ #define HOST_DATA_TYPE @HOST_DATA_TYPE@ #define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@ +/* +Short description of the program. +Moreover the version and build time is also compiled into the description. +*/ + +#define PROGRAM_DESCRIPTION "Implementation of the effective bandwidth benchmark"\ + " proposed in the HPCC benchmark suite for FPGA.\n"\ + "Version: " VERSION "\n" + /** Output separator */ diff --git a/b_eff/src/device/CMakeLists.txt b/b_eff/src/device/CMakeLists.txt index 2697865b..61a33a85 100644 --- a/b_eff/src/device/CMakeLists.txt +++ b/b_eff/src/device/CMakeLists.txt @@ -2,7 +2,7 @@ include(${CMAKE_SOURCE_DIR}/../cmake/kernelTargets.cmake) generate_kernel_targets_intel(communication_bw520n communication_bw520n_disable_pipelining communication_bw520n_combined_loops) -add_test(NAME test_emulation_intel COMMAND ./fnet -f communication_bw520n_emulate.aocx -l 32 -n 1 +add_test(NAME test_emulation_intel COMMAND ./Network_intel -f communication_bw520n_emulate.aocx -l 1 -n 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) -add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./fnet -f communication_bw520n_emulate.aocx -l 32 -n 1 +add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Network_intel -f communication_bw520n_emulate.aocx -l 1 -n 1 WORKING_DIRECTORY 
${EXECUTABLE_OUTPUT_PATH}) diff --git a/b_eff/src/host/CMakeLists.txt b/b_eff/src/host/CMakeLists.txt index 65fae838..44bb1832 100755 --- a/b_eff/src/host/CMakeLists.txt +++ b/b_eff/src/host/CMakeLists.txt @@ -1,12 +1,19 @@ - -include_directories(../../../extern/cxxopts/include ../../../shared) -include_directories(${IntelFPGAOpenCL_INCLUDE_DIRS}) +add_subdirectory(../../../shared ${CMAKE_BINARY_DIR}/lib/hpccbase) +set(HOST_SOURCE execution_default.cpp network_benchmark.cpp) include_directories(${MPI_CXX_INCLUDE_PATH}) -include_directories(${CMAKE_BINARY_DIR}/src/common) -include_directories(${CMAKE_SOURCE_DIR}/../shared/setup .) -set(HOST_SOURCE execution_default.cpp main.cpp common_benchmark_io_implementation.cpp ../../../shared/setup/fpga_setup.cpp network_functionality.cpp) +set(HOST_EXE_NAME Network) +set(LIB_NAME net_lib) -add_executable(fnet ${HOST_SOURCE}) -target_link_libraries(fnet ${IntelFPGAOpenCL_LIBRARIES} ${MPI_LIBRARIES}) -target_compile_options(fnet PRIVATE -D_USE_MPI_) +if (INTELFPGAOPENCL_FOUND) + add_library(${LIB_NAME}_intel STATIC ${HOST_SOURCE}) + target_include_directories(${LIB_NAME}_intel PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${IntelFPGAOpenCL_INCLUDE_DIRS}) + target_include_directories(${LIB_NAME}_intel PUBLIC ${CMAKE_SOURCE_DIR}/src/host) + add_executable(${HOST_EXE_NAME}_intel main.cpp) + target_link_libraries(${LIB_NAME}_intel "${IntelFPGAOpenCL_LIBRARIES}" "${OpenMP_CXX_FLAGS}" ${MPI_LIBRARIES}) + target_link_libraries(${LIB_NAME}_intel hpcc_fpga_base) + target_link_libraries(${HOST_EXE_NAME}_intel ${LIB_NAME}_intel) + target_compile_definitions(${LIB_NAME}_intel PRIVATE -DINTEL_FPGA -D_USE_MPI_) + target_compile_options(${LIB_NAME}_intel PRIVATE "${OpenMP_CXX_FLAGS}") + add_test(NAME test_intel_host_executable COMMAND $ -h) +endif() diff --git a/b_eff/src/host/common_benchmark_io_implementation.cpp b/b_eff/src/host/common_benchmark_io_implementation.cpp deleted file mode 100644 index 
20d8354c..00000000 --- a/b_eff/src/host/common_benchmark_io_implementation.cpp +++ /dev/null @@ -1,80 +0,0 @@ - -#include "cxxopts.hpp" -#include "parameters.h" -#include "setup/common_benchmark_io.hpp" - -/** -Parses and returns program options using the cxxopts library. -Supports the following parameters: - - file name of the FPGA kernel file (-f,--file) - - number of repetitions (-n) - - number of kernel replications (-r) - - data size (-d) - - use memory interleaving -@see https://github.com/jarro2783/cxxopts - -@return program settings that are created from the given program arguments -*/ -std::shared_ptr -parseProgramParameters(int argc, char *argv[]) { - // Defining and parsing program options - cxxopts::Options options(argv[0], PROGRAM_DESCRIPTION); - options.add_options() - ("f,file", "Kernel file name", cxxopts::value()) - ("n", "Number of repetitions", - cxxopts::value()->default_value(std::to_string(DEFAULT_REPETITIONS))) - ("l", "Inital looplength of Kernel", - cxxopts::value()->default_value(std::to_string(1u << 15u))) - ("device", "Index of the device that has to be used. If not given you "\ - "will be asked which device to use if there are multiple devices "\ - "available.", cxxopts::value()->default_value(std::to_string(DEFAULT_DEVICE))) - ("platform", "Index of the platform that has to be used. If not given "\ - "you will be asked which platform to use if there are multiple "\ - "platforms available.", - cxxopts::value()->default_value(std::to_string(DEFAULT_PLATFORM))) - ("h,help", "Print this help"); - cxxopts::ParseResult result = options.parse(argc, argv); - - if (result.count("h")) { - // Just print help when argument is given - std::cout << options.help() << std::endl; - exit(0); - } - // Check parsed options and handle special cases - if (result.count("f") <= 0) { - // Path to the kernel file is mandatory - exit if not given! - std::cerr << "Kernel file must be given! 
Aborting" << std::endl; - std::cout << options.help() << std::endl; - exit(1); - } - - // Create program settings from program arguments - std::shared_ptr sharedSettings( - new ProgramSettings{result["n"].as(), - result["l"].as(), - result["platform"].as(), - result["device"].as(), - result["f"].as()}); - return sharedSettings; -} - -/** - * Prints the used configuration to std out before starting the actual benchmark. - * - * @param programSettings The program settings retrieved from the command line - * @param device The device used for execution - */ -void printFinalConfiguration(const std::shared_ptr &programSettings, - const cl::Device &device) {// Give setup summary - std::cout << PROGRAM_DESCRIPTION << std::endl << HLINE; - std::cout << "Summary:" << std::endl - << "Repetitions: " << programSettings->numRepetitions - << std::endl - << "Kernel file: " << programSettings->kernelFileName - << std::endl; - std::cout << "Device: " - << device.getInfo() << std::endl; - std::cout << HLINE - << "Start benchmark using the given configuration." << std::endl - << HLINE; -} diff --git a/b_eff/src/host/execution.h b/b_eff/src/host/execution.h index 34db801a..6d3eb6db 100644 --- a/b_eff/src/host/execution.h +++ b/b_eff/src/host/execution.h @@ -30,25 +30,11 @@ SOFTWARE. /* External library headers */ #include "CL/cl.hpp" #include "parameters.h" +#include "network_benchmark.hpp" namespace bm_execution { - struct ExecutionConfiguration { - cl::Context context; - cl::Device device; - cl::Program program; - uint repetitions; - }; - - struct ExecutionTimings { - cl_uint looplength; - cl_uint messageSize; - std::vector calculationTimings; - }; - - typedef std::map>>> CollectedResultMap; - /** The actual execution of the benchmark. This method can be implemented in multiple *.cpp files. This header enables @@ -59,8 +45,8 @@ simple exchange of the different calculation methods. 
@return The resulting matrix */ - std::shared_ptr - calculate(std::shared_ptr config, cl_uint messageSize, cl_uint looplength); + std::shared_ptr + calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength); } // namespace bm_execution diff --git a/b_eff/src/host/execution_default.cpp b/b_eff/src/host/execution_default.cpp index 1ce4640d..e610c261 100644 --- a/b_eff/src/host/execution_default.cpp +++ b/b_eff/src/host/execution_default.cpp @@ -41,25 +41,25 @@ namespace bm_execution { Implementation for the single kernel. @copydoc bm_execution::calculate() */ - std::shared_ptr - calculate(std::shared_ptr config, cl_uint messageSize, cl_uint looplength) { + std::shared_ptr + calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength) { - cl::Kernel sendKernel(config->program, SEND_KERNEL_NAME); + cl::Kernel sendKernel(*config.program, SEND_KERNEL_NAME); sendKernel.setArg(0, messageSize); sendKernel.setArg(1, looplength); - cl::Kernel recvKernel(config->program, RECV_KERNEL_NAME); + cl::Kernel recvKernel(*config.program, RECV_KERNEL_NAME); recvKernel.setArg(0, messageSize); recvKernel.setArg(1, looplength); - cl::CommandQueue sendQueue(config->context); - cl::CommandQueue recvQueue(config->context); + cl::CommandQueue sendQueue(*config.context); + cl::CommandQueue recvQueue(*config.context); std::vector calculationTimings; - for (uint r =0; r < config->repetitions; r++) { + for (uint r =0; r < config.programSettings->numRepetitions; r++) { MPI_Barrier(MPI_COMM_WORLD); auto startCalculation = std::chrono::high_resolution_clock::now(); sendQueue.enqueueTask(sendKernel); @@ -73,7 +73,7 @@ namespace bm_execution { calculationTimings.push_back(calculationTime.count()); } - std::shared_ptr result(new ExecutionTimings{ + std::shared_ptr result(new network::ExecutionTimings{ looplength, messageSize, calculationTimings diff --git a/b_eff/src/host/main.cpp b/b_eff/src/host/main.cpp index c3876dcb..57067eca 
100644 --- a/b_eff/src/host/main.cpp +++ b/b_eff/src/host/main.cpp @@ -2,116 +2,28 @@ // Created by Marius Meyer on 04.12.19. // -#include - -#include "network_functionality.hpp" -#include "setup/common_benchmark_io.hpp" -#include "program_settings.h" #include "mpi.h" +#include "network_benchmark.hpp" + +using namespace network; /** The program entry point */ int main(int argc, char *argv[]) { - // Initialize the MPI environment MPI_Init(&argc, &argv); - - // Get the number of processes - int world_size; - MPI_Comm_size(MPI_COMM_WORLD, &world_size); - - // Get the rank of the process - int world_rank; - MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); - // Setup benchmark - std::shared_ptr programSettings = - parseProgramParameters(argc, argv); - if (world_rank == 0) { - fpga_setup::setupEnvironmentAndClocks(); - } - std::vector usedDevice = - fpga_setup::selectFPGADevice(programSettings->defaultPlatform, - programSettings->defaultDevice); - cl::Context context = cl::Context(usedDevice); - cl::Program program = fpga_setup::fpgaSetup(&context, usedDevice, - &programSettings->kernelFileName); - - if (world_rank == 0) { - printFinalConfiguration(programSettings, usedDevice[0]); - } - - std::shared_ptr config( - new bm_execution::ExecutionConfiguration { - context, usedDevice[0], program, - programSettings->numRepetitions - }); - - auto msgSizes = getMessageSizes(); - std::vector> timing_results; - - for (cl_uint size : msgSizes) { - if (world_rank == 0) { - std::cout << "Measure for " << size << " Byte" << std::endl; - } - cl_uint looplength = std::max((programSettings->looplength) / ((size + (CHANNEL_WIDTH)) / (CHANNEL_WIDTH)), 1u); - timing_results.push_back(bm_execution::calculate(config, size,looplength)); + auto bm = NetworkBenchmark(argc, argv); + bool success = bm.executeBenchmark(); + MPI_Finalize(); + if (success) { + return 0; } - - // Collect the measurement results from all other nodes - bm_execution::CollectedResultMap collected_results; - if (world_rank > 0) 
{ - for (const auto& t : timing_results) { - MPI_Send(&(t->messageSize), - 1, - MPI_UNSIGNED, 0, 0, MPI_COMM_WORLD); - MPI_Send(&(t->looplength), - 1, - MPI_UNSIGNED, 0, 1, MPI_COMM_WORLD); - MPI_Send(&(t->calculationTimings.front()), - programSettings->numRepetitions, - MPI_DOUBLE, 0, 2, MPI_COMM_WORLD); - } - } else { - std::cout << "Collect results over MPI."; - int k = 0; - for (int size : msgSizes) { - std::vector> tmp_timings; - std::cout << "."; - for (int i=1; i < world_size; i++) { - auto execution_result = std::make_shared( bm_execution::ExecutionTimings { - 0,0,std::vector(programSettings->numRepetitions) - }); - MPI_Status status; - MPI_Recv(&(execution_result->messageSize), - 1, - MPI_UNSIGNED, i, 0, MPI_COMM_WORLD, &status); - MPI_Recv(&(execution_result->looplength), - 1, - MPI_UNSIGNED, i, 1, MPI_COMM_WORLD, &status); - MPI_Recv(&(execution_result->calculationTimings.front()), - programSettings->numRepetitions, - MPI_DOUBLE, i, 2, MPI_COMM_WORLD, &status); - tmp_timings.push_back(execution_result); - if (execution_result->messageSize != size) { - std::cerr << "Wrong message size: " << execution_result->messageSize << " != " << size << " from rank " << i << std::endl; - exit(2); - } - } - tmp_timings.push_back(timing_results[k]); - k++; - collected_results.emplace(size, std::make_shared>>(tmp_timings)); - } - std::cout << " done!" << std::endl; + else { + return 1; } - - if (world_rank == 0) { - printResults(collected_results); - } - - MPI_Finalize(); - return 0; } + diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp new file mode 100644 index 00000000..6ddf3c6a --- /dev/null +++ b/b_eff/src/host/network_benchmark.cpp @@ -0,0 +1,196 @@ +// +// Created by Marius Meyer on 04.12.19. 
+// + +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "network_benchmark.hpp" + +/* C++ standard library headers */ +#include +#include + +/* Project's headers */ +#include "execution.h" +#include "parameters.h" + +network::NetworkProgramSettings::NetworkProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), + looplength(results["l"].as()) { + +} + +std::map +network::NetworkProgramSettings::getSettingsMap() { + auto map = hpcc_base::BaseSettings::getSettingsMap(); + map["Loop Length"] = std::to_string(looplength); + return map; +} + +network::NetworkBenchmark::NetworkBenchmark(int argc, char* argv[]) { + setupBenchmark(argc, argv); +} + +network::NetworkBenchmark::NetworkBenchmark() {} + +void +network::NetworkBenchmark::addAdditionalParseOptions(cxxopts::Options &options) { + options.add_options() + ("l", "Inital looplength of Kernel", + cxxopts::value()->default_value(std::to_string(1u << 15u))); +} + +std::unique_ptr +network::NetworkBenchmark::executeKernel(NetworkData &data) { + // Get the number of processes + int world_size; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + + // Get the rank of the process + int world_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + std::vector> timing_results; + + for (cl_uint size : data.messageSizes) { + if (world_rank == 0) { + std::cout << "Measure for " << size << " Byte" << std::endl; + } + cl_uint looplength = std::max((executionSettings->programSettings->looplength) / ((size + (CHANNEL_WIDTH)) / (CHANNEL_WIDTH)), 1u); + timing_results.push_back(bm_execution::calculate(*executionSettings, size, looplength)); + } + + std::unique_ptr collected_results = std::unique_ptr (new network::NetworkExecutionTimings()); + if (world_rank > 0) { + for (const auto& t : timing_results) { + MPI_Send(&(t->messageSize), + 1, + MPI_UNSIGNED, 0, 0, MPI_COMM_WORLD); + MPI_Send(&(t->looplength), + 1, + MPI_UNSIGNED, 0, 1, MPI_COMM_WORLD); + MPI_Send(&(t->calculationTimings.front()), + executionSettings->programSettings->numRepetitions, + 
MPI_DOUBLE, 0, 2, MPI_COMM_WORLD); + } + } else { + std::cout << "Collect results over MPI."; + int k = 0; + for (int size : data.messageSizes) { + std::vector> tmp_timings; + std::cout << "."; + for (int i=1; i < world_size; i++) { + auto execution_result = std::shared_ptr( new network::ExecutionTimings { + 0,0,std::vector(executionSettings->programSettings->numRepetitions) + }); + MPI_Status status; + MPI_Recv(&(execution_result->messageSize), + 1, + MPI_UNSIGNED, i, 0, MPI_COMM_WORLD, &status); + MPI_Recv(&(execution_result->looplength), + 1, + MPI_UNSIGNED, i, 1, MPI_COMM_WORLD, &status); + MPI_Recv(&(execution_result->calculationTimings.front()), + executionSettings->programSettings->numRepetitions, + MPI_DOUBLE, i, 2, MPI_COMM_WORLD, &status); + tmp_timings.push_back(execution_result); + if (execution_result->messageSize != size) { + std::cerr << "Wrong message size: " << execution_result->messageSize << " != " << size << " from rank " << i << std::endl; + exit(2); + } + } + tmp_timings.push_back(timing_results[k]); + k++; + collected_results->timings.emplace(size, std::make_shared>>(tmp_timings)); + } + std::cout << " done!" 
<< std::endl; + } + + return collected_results; +} + +void +network::NetworkBenchmark::printResults(const network::NetworkExecutionTimings &output) { + std::vector maxBandwidths; + + std::cout << std::setw(ENTRY_SPACE) << "MSize" << " " + << std::setw(ENTRY_SPACE) << "looplength" << " " + << std::setw(ENTRY_SPACE) << "transfer" << " " + << std::setw(ENTRY_SPACE) << "B/s" << std::endl; + + std::vector totalMaxMinCalculationTime; + for (int i =0; i < output.timings.size(); i++) { + totalMaxMinCalculationTime.push_back(0.0); + } + int i = 0; + for (const auto& msgSizeResults : output.timings) { + for (const auto& r : *msgSizeResults.second) { + double localMinCalculationTime = *min_element(r->calculationTimings.begin(), r->calculationTimings.end()); + totalMaxMinCalculationTime[i] = std::max(totalMaxMinCalculationTime[i], localMinCalculationTime); + } + i++; + } + i = 0; + for (const auto& msgSizeResults : output.timings) { + int looplength = msgSizeResults.second->at(0)->looplength; + // The total sent data in bytes will be: + // #Nodes * message_size * looplength * 2 + // the * 2 is because we have two kernels per bitstream that will send and receive simultaneously. + // This will be divided by half of the maximum of the minimum measured runtime over all ranks. 
+ double maxCalcBW = static_cast(msgSizeResults.second->size() * 2 * msgSizeResults.first * looplength) + / (totalMaxMinCalculationTime[i]); + + maxBandwidths.push_back(maxCalcBW); + + std::cout << std::setw(ENTRY_SPACE) << msgSizeResults.first << " " + << std::setw(ENTRY_SPACE) << looplength << " " + << std::setw(ENTRY_SPACE) << totalMaxMinCalculationTime[i] << " " + << std::setw(ENTRY_SPACE) << maxCalcBW + << std::endl; + i++; + } + + + double b_eff = accumulate(maxBandwidths.begin(), maxBandwidths.end(), 0.0) / maxBandwidths.size(); + + std::cout << std::endl << "b_eff = " << b_eff << " B/s" << std::endl; +} + +std::unique_ptr +network::NetworkBenchmark::generateInputData() { + auto d = std::unique_ptr(new network::NetworkData()); + for (uint i = 0; i < 13; i++) { + d->messageSizes.push_back(1u << i); + } + cl_uint fourKB = 1u << 13u; + for (uint i=1; i <= 8; i++) { + d->messageSizes.push_back(fourKB * (1u << i)); + } + return d; +} + +bool +network::NetworkBenchmark::validateOutputAndPrintError(network::NetworkData &data) { + // TODO: No data returned from kernel to validate. Implement such a runtime validation! + return true; +} + diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp new file mode 100644 index 00000000..50840851 --- /dev/null +++ b/b_eff/src/host/network_benchmark.hpp @@ -0,0 +1,218 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#ifndef SRC_HOST_NETWORK_BENCHMARK_H_ +#define SRC_HOST_NETWORK_BENCHMARK_H_ + +/* C++ standard library headers */ +#include +#include + +/* Project's headers */ +#include "hpcc_benchmark.hpp" +#include "parameters.h" + +/** + * @brief Contains all classes and methods needed by the Network benchmark + * + */ +namespace network { + + struct ExecutionTimings { + cl_uint looplength; + cl_uint messageSize; + std::vector calculationTimings; + }; + + /** + * @brief The data structure used to store all measurement results + * + */ + typedef std::map>>> CollectedResultMap; + +/** + * @brief The Network benchmark specific program settings + * + */ +class NetworkProgramSettings : public hpcc_base::BaseSettings { + +public: + /** + * @brief Initial number of sent messages per message size + * + */ + uint looplength; + + /** + * @brief Construct a new Network Program Settings object + * + * @param results the result map from parsing the program input parameters + */ + NetworkProgramSettings(cxxopts::ParseResult &results); + + /** + * @brief Get a map of the settings. This map will be used to print the final configuration. + * + * @return a map of program parameters. keys are the name of the parameter. 
+ */ + std::map getSettingsMap() override; + +}; + +/** + * @brief Data class for the network benchmark + * + */ +class NetworkData { + +public: + /** + * @brief Used message sizes for the benchmark execution + * + */ + std::vector messageSizes; +}; + +/** + * @brief Measured execution timing from the kernel execution + * + */ +class NetworkExecutionTimings { +public: + + /** + * @brief A vector containing the timings for all repetitions for the kernel execution + * + */ + CollectedResultMap timings; + +}; + +/** + * @brief Implementation of the Network benchmark + * + */ +class NetworkBenchmark : public hpcc_base::HpccFpgaBenchmark { + +protected: + + /** + * @brief Additional input parameters of the Network benchmark + * + * @param options + */ + void + addAdditionalParseOptions(cxxopts::Options &options) override; + +public: + + /** + * @brief Network specific implementation of the data generation + * + * @return std::unique_ptr The input and output data of the benchmark + */ + std::unique_ptr + generateInputData() override; + + /** + * @brief Network specific implementation of the kernel execution + * + * @param data The input and output data of the benchmark + * @return std::unique_ptr Measured runtimes of the kernel execution + */ + std::unique_ptr + executeKernel(NetworkData &data) override; + + /** + * @brief Network specific implementation of the execution validation + * + * @param data The input and output data of the benchmark + * @return true If validation is successful + * @return false otherwise + */ + bool + validateOutputAndPrintError(NetworkData &data) override; + + /** + * @brief Network specific implementation of printing the execution results + * + * @param output Measured runtimes of the kernel execution + */ + void + printResults(const NetworkExecutionTimings &output) override; + + /** + * @brief Construct a new Network Benchmark object + * + * @param argc the number of program input parameters + * @param argv the program input parameters as 
array of strings + */ + NetworkBenchmark(int argc, char* argv[]); + + /** + * @brief Construct a new Network Benchmark object + */ + NetworkBenchmark(); + +}; + +/** + * Bit reverses the order of the given Network data in place + * + * @param data Array of complex numbers that will be sorted in bit reversed order + * @param iterations Length of the data array will be calculated with iterations * Network Size + */ +void bit_reverse(std::complex *data, unsigned iterations); + +// The function definitions and implementations below this comment are taken from the +// Network1D example implementation of the Intel FPGA SDK for OpenCL 19.4 +// They are licensed under the following conditions: +// +// Copyright (C) 2013-2019 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
+// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +void fourier_transform_gold(bool inverse, const int lognr_points, std::complex *data); + +void fourier_stage(int lognr_points, std::complex *data); + +} // namespace network + + +#endif // SRC_HOST_STREAM_BENCHMARK_H_ diff --git a/b_eff/src/host/network_functionality.cpp b/b_eff/src/host/network_functionality.cpp deleted file mode 100644 index 9f67cd60..00000000 --- a/b_eff/src/host/network_functionality.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// -// Created by Marius Meyer on 04.12.19. -// - -/* -Copyright (c) 2019 Marius Meyer - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
-*/ - -#include "network_functionality.hpp" - -/* C++ standard library headers */ -#include -#include - -/* Project's headers */ -#include "execution.h" -#include "cxxopts.hpp" -#include "setup/fpga_setup.hpp" -#include "setup/common_benchmark_io.hpp" -#include "parameters.h" - -/** -Prints the execution results to stdout - -@param results The execution results -*/ -void -printResults(bm_execution::CollectedResultMap results) { - - std::vector maxBandwidths; - - std::cout << std::setw(ENTRY_SPACE) << "MSize" << " " - << std::setw(ENTRY_SPACE) << "looplength" << " " - << std::setw(ENTRY_SPACE) << "transfer" << " " - << std::setw(ENTRY_SPACE) << "B/s" << std::endl; - - std::vector totalMaxMinCalculationTime; - for (int i =0; i < results.size(); i++) { - totalMaxMinCalculationTime.push_back(0.0); - } - int i = 0; - for (const auto& msgSizeResults : results) { - for (const auto& r : *msgSizeResults.second) { - double localMinCalculationTime = *min_element(r->calculationTimings.begin(), r->calculationTimings.end()); - totalMaxMinCalculationTime[i] = std::max(totalMaxMinCalculationTime[i], localMinCalculationTime); - } - i++; - } - i = 0; - for (const auto& msgSizeResults : results) { - int looplength = msgSizeResults.second->at(0)->looplength; - // The total sent data in bytes will be: - // #Nodes * message_size * looplength * 2 - // the * 2 is because we have two kernels per bitstream that will send and receive simultaneously. - // This will be divided by half of the maximum of the minimum measured runtime over all ranks. 
- double maxCalcBW = static_cast(msgSizeResults.second->size() * 2 * msgSizeResults.first * looplength) - / (totalMaxMinCalculationTime[i]); - - maxBandwidths.push_back(maxCalcBW); - - std::cout << std::setw(ENTRY_SPACE) << msgSizeResults.first << " " - << std::setw(ENTRY_SPACE) << looplength << " " - << std::setw(ENTRY_SPACE) << totalMaxMinCalculationTime[i] << " " - << std::setw(ENTRY_SPACE) << maxCalcBW - << std::endl; - i++; - } - - - double b_eff = accumulate(maxBandwidths.begin(), maxBandwidths.end(), 0.0) / maxBandwidths.size(); - - std::cout << std::endl << "b_eff = " << b_eff << " B/s" << std::endl; - -} - -std::vector getMessageSizes() { - std::vector sizes; - for (uint i = 0; i < 13; i++) { - sizes.push_back(1u << i); - } - cl_uint fourKB = 1u << 13u; - for (uint i=1; i <= 8; i++) { - sizes.push_back(fourKB * (1u << i)); - } - return sizes; -} \ No newline at end of file diff --git a/b_eff/src/host/program_settings.h b/b_eff/src/host/program_settings.h deleted file mode 100644 index b5e7c0e3..00000000 --- a/b_eff/src/host/program_settings.h +++ /dev/null @@ -1,31 +0,0 @@ - -#ifndef SRC_HOST_PROGRAM_SETTINGS_H_ -#define SRC_HOST_PROGRAM_SETTINGS_H_ - -#include "parameters.h" - -/* C++ standard library headers */ -#include - -#include "CL/opencl.h" - -/* -Short description of the program. -Moreover the version and build time is also compiled into the description. 
-*/ - -#define PROGRAM_DESCRIPTION "Implementation of the effective bandwidth benchmark"\ - " proposed in the HPCC benchmark suite for FPGA.\n"\ - "Version: " VERSION "\n" - - -struct ProgramSettings { - uint numRepetitions; - uint looplength; - int defaultPlatform; - int defaultDevice; - std::string kernelFileName; -}; - - -#endif diff --git a/b_eff/tests/CMakeLists.txt b/b_eff/tests/CMakeLists.txt index d423fb66..0ad3bd23 100755 --- a/b_eff/tests/CMakeLists.txt +++ b/b_eff/tests/CMakeLists.txt @@ -1,17 +1,19 @@ - -# 'lib' is the folder with Google Test sources -add_subdirectory(../../extern/googletest ${CMAKE_CURRENT_BINARY_DIR}/lib) +add_subdirectory(../../extern/googletest ${CMAKE_BINARY_DIR}/lib/googletest) include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) -include_directories(${CMAKE_BINARY_DIR}/src/common ../../extern/cxxopts/include ../../shared) -include_directories(SYSTEM ${IntelFPGAOpenCL_INCLUDE_DIRS}) +include_directories(${CMAKE_BINARY_DIR}/src/common) include_directories(${MPI_CXX_INCLUDE_PATH}) -include_directories(${CMAKE_SOURCE_DIR}/src/host) -set(PROJECT_SOURCES ../../shared/setup/fpga_setup.cpp ../src/host/common_benchmark_io_implementation.cpp ../src/host/execution_default.cpp) -set(TEST_SOURCES ../../shared/testing/main.cpp ../../shared/setup/test_fpga_setup.cpp test_kernel_functionality_and_host_integration.cpp) +set(HOST_EXE_NAME Network) +set(LIB_NAME net_lib) + +set(TEST_SOURCES main.cpp test_kernel_functionality_and_host_integration.cpp) -add_executable(Google_Tests_run ${TEST_SOURCES} ${PROJECT_SOURCES}) -target_link_libraries(Google_Tests_run gtest gmock ${IntelFPGAOpenCL_LIBRARIES} ${MPI_LIBRARIES}) -add_dependencies(Google_Tests_run communication_bw520n_emulate_intel) -target_compile_options(Google_Tests_run PRIVATE -D_USE_MPI_) -add_test(NAME test_intel_unit COMMAND $ -f communication_bw520n_emulate.aocx WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) \ No newline at end of file +if (INTELFPGAOPENCL_FOUND) + 
include_directories(SYSTEM ${IntelFPGAOpenCL_INCLUDE_DIRS}) + add_executable(${HOST_EXE_NAME}_test_intel ${TEST_SOURCES} ${PROJECT_SOURCES}) + target_link_libraries(${HOST_EXE_NAME}_test_intel gtest gmock ${LIB_NAME}_intel ${IntelFPGAOpenCL_LIBRARIES} "${OpenMP_CXX_FLAGS}" ${MPI_LIBRARIES}) + add_dependencies(${HOST_EXE_NAME}_test_intel communication_bw520n_emulate_intel) + target_compile_definitions(${HOST_EXE_NAME}_test_intel PRIVATE -DINTEL_FPGA -D_USE_MPI_) + target_compile_options(${HOST_EXE_NAME}_test_intel PRIVATE "${OpenMP_CXX_FLAGS}") + add_test(NAME ${HOST_EXE_NAME}_test_intel_unit COMMAND $ -f communication_bw520n_emulate.aocx WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) +endif() diff --git a/b_eff/tests/main.cpp b/b_eff/tests/main.cpp new file mode 100644 index 00000000..61bb1701 --- /dev/null +++ b/b_eff/tests/main.cpp @@ -0,0 +1,69 @@ +/* +Copyright (c) 2020 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +/* Project's headers */ +#include "test_program_settings.h" +#include "gtest/gtest.h" +#include "CL/cl.hpp" + +#ifdef _USE_MPI_ +#include "mpi.h" + +class MPIEnvironment : public ::testing::Environment { +public: + MPIEnvironment(int* argc, char** argv[]) { + MPI_Init(argc, argv); + } + + ~MPIEnvironment() override { + MPI_Finalize(); + } +}; +#endif + +int global_argc; +char** global_argv; + +/** +The program entry point for the unit tests +*/ +int +main(int argc, char *argv[]) { + + std::cout << "THIS BINARY EXECUTES UNIT TESTS FOR THE FOLLOWING BENCHMARK:" << std::endl << std::endl; + + ::testing::InitGoogleTest(&argc, argv); + + global_argc = argc; + global_argv = argv; + +#ifdef _USE_MPI_ + ::testing::Environment* const mpi_env = + ::testing::AddGlobalTestEnvironment(new MPIEnvironment(&argc, &argv)); +#endif + + bool result = RUN_ALL_TESTS(); + + return result; + +} + diff --git a/b_eff/tests/test_kernel_functionality_and_host_integration.cpp b/b_eff/tests/test_kernel_functionality_and_host_integration.cpp index 91c41472..de045cf9 100644 --- a/b_eff/tests/test_kernel_functionality_and_host_integration.cpp +++ b/b_eff/tests/test_kernel_functionality_and_host_integration.cpp @@ -4,81 +4,82 @@ #include #include "gtest/gtest.h" -#include "execution.h" +#include "network_benchmark.hpp" #include "parameters.h" -#include "setup/fpga_setup.hpp" -#include "unistd.h" #include "mpi.h" -#include "testing/test_program_settings.h" +#include "test_program_settings.h" #include -struct OpenCLKernelTest : testing::Test { - std::string kernelFileName = "communication_bw520n_emulate.aocx"; - std::shared_ptr config; - unsigned repetitions = 10; +struct NetworkKernelTest : testing::Test { + std::unique_ptr bm; + std::unique_ptr data; unsigned numberOfChannels = 4; std::string channelOutName = "kernel_output_ch"; std::string channelInName = "kernel_input_ch"; - OpenCLKernelTest() { - kernelFileName = programSettings->kernelFileName; - setupFPGA(); + NetworkKernelTest() { + 
bm = std::unique_ptr(new network::NetworkBenchmark(global_argc, global_argv)); + bm->getExecutionSettings().programSettings->numRepetitions = 1; + data = bm->generateInputData(); + } + + void SetUp() override { + createChannelFilesAndSymbolicLinks(); } void createChannelFilesAndSymbolicLinks() { for (int i=0; i < numberOfChannels; i++) { std::string fname = channelOutName + std::to_string(i); - std::fstream fs; - fs.open(fname, std::ios::out); + std::ofstream fs; + fs.open(fname, std::ofstream::out | std::ofstream::trunc); fs.close(); std::remove((channelInName + std::to_string(i%2 ? i-1 : i+1)).c_str()); symlink(fname.c_str(), (channelInName + std::to_string(i%2 ? i-1 : i+1)).c_str()); } } - - void setupFPGA() { - createChannelFilesAndSymbolicLinks(); - std::vector device = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, programSettings->defaultDevice); - cl::Context context(device[0]); - cl::Program program = fpga_setup::fpgaSetup(&context, device, &kernelFileName); - config = std::make_shared( - bm_execution::ExecutionConfiguration{ - context, device[0], program, - repetitions - }); - } }; /** * Tests if calculate returns the correct execution results */ -TEST_F(OpenCLKernelTest, CalculateReturnsCorrectExecutionResultFor111) { - config->repetitions = 1; - auto result = bm_execution::calculate(config, 1,1); - EXPECT_EQ(1, result->messageSize); - EXPECT_EQ(1, result->looplength); - EXPECT_EQ(1, result->calculationTimings.size()); +TEST_F(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor111) { + bm->getExecutionSettings().programSettings->numRepetitions = 1; + bm->getExecutionSettings().programSettings->looplength = 1; + data->messageSizes.clear(); + data->messageSizes.push_back(1); + auto result = bm->executeKernel(*data); + EXPECT_NE(result->timings.end(), result->timings.find(1)); + EXPECT_EQ(1, result->timings.find(1)->second->at(0)->looplength); + EXPECT_EQ(1, result->timings.find(1)->second->at(0)->calculationTimings.size()); } /** * 
Tests if calculate returns the correct execution results for multiple repetitions */ -TEST_F(OpenCLKernelTest, CalculateReturnsCorrectExecutionResultFor842) { - config->repetitions = 2; - auto result = bm_execution::calculate(config, 8,4); - EXPECT_EQ(8, result->messageSize); - EXPECT_EQ(4, result->looplength); - EXPECT_EQ(2, result->calculationTimings.size()); +TEST_F(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor842) { + bm->getExecutionSettings().programSettings->numRepetitions = 2; + bm->getExecutionSettings().programSettings->looplength = 4; + data->messageSizes.clear(); + data->messageSizes.push_back(8); + auto result = bm->executeKernel(*data); + EXPECT_NE(result->timings.end(), result->timings.find(8)); + EXPECT_EQ(4, result->timings.find(8)->second->at(0)->looplength); + EXPECT_EQ(2, result->timings.find(8)->second->at(0)->calculationTimings.size()); } /** * Tests if data is written to the channels for small message sizes */ -TEST_F(OpenCLKernelTest, DataIsWrittenToChannelForMessageSizeFillingOneChannel) { - config->repetitions = 1; +TEST_F(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingOneChannel) { const unsigned messageSize = CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE); const unsigned looplength = 4; - auto result = bm_execution::calculate(config, messageSize,looplength); + bm->getExecutionSettings().programSettings->numRepetitions = 1; + // looplength * 2 because the looplength will be recalculated in executeKernel! 
+ // This should end up in looplength again + bm->getExecutionSettings().programSettings->looplength = looplength * 2; + data->messageSizes.clear(); + data->messageSizes.push_back(messageSize); + auto result = bm->executeKernel(*data); HOST_DATA_TYPE* buffer = new HOST_DATA_TYPE[messageSize * looplength * 2]; for (int i=0; i < numberOfChannels; i++) { std::string ifname = channelOutName + std::to_string(i); @@ -97,11 +98,16 @@ TEST_F(OpenCLKernelTest, DataIsWrittenToChannelForMessageSizeFillingOneChannel) /** * Tests if data is written to the channels for small message sizes filling two channels */ -TEST_F(OpenCLKernelTest, DataIsWrittenToChannelForMessageSizeFillingTwoChannels) { - config->repetitions = 1; +TEST_F(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingTwoChannels) { const unsigned messageSize = 2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE); const unsigned looplength = 4; - auto result = bm_execution::calculate(config, messageSize,looplength); + bm->getExecutionSettings().programSettings->numRepetitions = 1; + // looplength * 3 because the looplength will be recalculated in executeKernel! 
+ // This should end up in looplength again + bm->getExecutionSettings().programSettings->looplength = looplength * 3; + data->messageSizes.clear(); + data->messageSizes.push_back(messageSize); + auto result = bm->executeKernel(*data); HOST_DATA_TYPE* buffer = new HOST_DATA_TYPE[messageSize * looplength * 2]; for (int i=0; i < numberOfChannels; i++) { std::string ifname = channelOutName + std::to_string(i); @@ -117,11 +123,14 @@ TEST_F(OpenCLKernelTest, DataIsWrittenToChannelForMessageSizeFillingTwoChannels) /** * Tests if data is written to the channels for message sizes filling more than two channels */ -TEST_F(OpenCLKernelTest, DataIsWrittenToChannelForMessageSizeFillingMoreThanTwoChannels) { - config->repetitions = 1; +TEST_F(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingMoreThanTwoChannels) { const unsigned messageSize = 4 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE); const unsigned looplength = 1; - auto result = bm_execution::calculate(config, messageSize,looplength); + bm->getExecutionSettings().programSettings->numRepetitions = 1; + bm->getExecutionSettings().programSettings->looplength = looplength; + data->messageSizes.clear(); + data->messageSizes.push_back(messageSize); + auto result = bm->executeKernel(*data); HOST_DATA_TYPE* buffer = new HOST_DATA_TYPE[messageSize * looplength * 2]; for (int i=0; i < numberOfChannels; i++) { std::string ifname = channelOutName + std::to_string(i); @@ -137,11 +146,16 @@ TEST_F(OpenCLKernelTest, DataIsWrittenToChannelForMessageSizeFillingMoreThanTwoC /** * Tests if correct data is written to the channels */ -TEST_F(OpenCLKernelTest, CorrectDataIsWrittenToChannel) { - config->repetitions = 1; +TEST_F(NetworkKernelTest, CorrectDataIsWrittenToChannel) { const unsigned messageSize = 2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE); const unsigned looplength = 4; - auto result = bm_execution::calculate(config, messageSize,looplength); + bm->getExecutionSettings().programSettings->numRepetitions = 1; + // looplength * 
3 because the looplength will be recalculated in executeKernel! + // This should end up in looplength again + bm->getExecutionSettings().programSettings->looplength = looplength * 3; + data->messageSizes.clear(); + data->messageSizes.push_back(messageSize); + auto result = bm->executeKernel(*data); HOST_DATA_TYPE* buffer = new HOST_DATA_TYPE[messageSize * looplength * 2]; for (int i=0; i < numberOfChannels; i++) { std::string ifname = channelOutName + std::to_string(i); diff --git a/b_eff/src/host/network_functionality.hpp b/b_eff/tests/test_program_settings.h similarity index 65% rename from b_eff/src/host/network_functionality.hpp rename to b_eff/tests/test_program_settings.h index cbcd3427..89e2070f 100644 --- a/b_eff/src/host/network_functionality.hpp +++ b/b_eff/tests/test_program_settings.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2019 Marius Meyer +Copyright (c) 2020 Marius Meyer Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -20,30 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#ifndef SRC_HOST_NETWORK_FUNCTIONALITY_H_ -#define SRC_HOST_NETWORK_FUNCTIONALITY_H_ -/* C++ standard library headers */ -#include - -/* Project's headers */ -#include "execution.h" -#include "cxxopts.hpp" -#include "setup/fpga_setup.hpp" -#include "parameters.h" - - -/** -Prints the execution results to stdout - -@param results The execution results -*/ -void -printResults(bm_execution::CollectedResultMap results); - - -std::vector getMessageSizes(); - - - -#endif // SRC_HOST_NETWORK_FUNCTIONALITY_H_ +extern int global_argc; +extern char** global_argv; \ No newline at end of file From c7578eb3858f1f8f4f5baba8c80d52f7a21d6a74 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 20 May 2020 15:07:35 +0200 Subject: [PATCH 42/45] Fix FFT unit tests --- FFT/tests/test_execution_functionality.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/FFT/tests/test_execution_functionality.cpp b/FFT/tests/test_execution_functionality.cpp index 8b64d0a1..3db9d624 100644 --- a/FFT/tests/test_execution_functionality.cpp +++ b/FFT/tests/test_execution_functionality.cpp @@ -113,6 +113,7 @@ TEST_F(FFTKernelTest, FFTandiFFTProduceResultCloseToSource) { // Need to again bit reverse input for iFFT fft::bit_reverse(data->data, 1); + bm->getExecutionSettings().programSettings->inverse = true; auto result2 = bm->executeKernel(*data); // Since data was already sorted by iFFT the bit reversal of the kernel has t be undone fft::bit_reverse(data->data, 1); From 60cac3fc32bd844b6f8c4fe736196cf1675154f5 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 20 May 2020 15:23:36 +0200 Subject: [PATCH 43/45] Update binary names in CI --- .gitlab-ci.yml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 220f181c..2370c6bc 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -22,7 +22,7 @@ build:STREAM: - build/bin/stream_kernels_single_emulate.aocx - build/bin/stream_kernels_emulate.aocx - build/bin/STREAM_FPGA_intel 
- - build/bin/Test_intel + - build/bin/STREAM_FPGA_test_intel build:RandomAccess: @@ -38,7 +38,7 @@ build:RandomAccess: paths: - build/bin/random_access_kernels_single_emulate.aocx - build/bin/RandomAccess_intel - - build/bin/Test_intel + - build/bin/RandomAccess_test_intel build:PTRANS: @@ -54,8 +54,8 @@ build:PTRANS: paths: - build/bin/transpose_optimized_emulate.aocx - build/bin/transpose_default_emulate.aocx - - build/bin/trans_intel - - build/bin/Test_intel + - build/bin/Transpose_intel + - build/bin/Transpose_test_intel build:LINPACK: stage: build @@ -70,8 +70,8 @@ build:LINPACK: paths: - build/bin/lu_blocked_pvt_emulate.aocx - build/bin/lu_blocked_pvt_test_emulate.aocx - - build/bin/LINPACK_intel - - build/bin/Test_intel + - build/bin/Linpack_intel + - build/bin/Linpack_test_intel build:GEMM: stage: build @@ -86,7 +86,7 @@ build:GEMM: paths: - build/bin/gemm_cannon_emulate.aocx - build/bin/GEMM_intel - - build/bin/Test_intel + - build/bin/GEMM_test_intel build:FFT: stage: build @@ -101,7 +101,7 @@ build:FFT: paths: - build/bin/fft1d_float_8_emulate.aocx - build/bin/FFT_intel - - build/bin/Test_intel + - build/bin/FFT_test_intel build:b_eff: stage: build @@ -115,8 +115,8 @@ build:b_eff: artifacts: paths: - build/bin/communication_bw520n_emulate.aocx - - build/bin/fnet - - build/bin/Google_Tests_run + - build/bin/Network_intel + - build/bin/Network_test_intel ### # From 28a2e27f4624c573874ceadb9e4b1989c5030968 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 20 May 2020 16:48:19 +0200 Subject: [PATCH 44/45] Resolve doxygen documentation warnings --- .gitlab-ci.yml | 4 +- FFT/src/host/fft_benchmark.hpp | 18 +++++ GEMM/src/host/gemm_benchmark.hpp | 48 ++++++++++++-- LINPACK/src/host/linpack_benchmark.hpp | 26 +++++++- PTRANS/src/host/transpose_benchmark.cpp | 5 -- PTRANS/src/host/transpose_benchmark.hpp | 18 ++++- README.md | 11 ++++ .../src/host/random_access_benchmark.cpp | 5 -- .../src/host/random_access_benchmark.hpp | 5 ++ 
STREAM/src/host/stream_benchmark.cpp | 5 -- STREAM/src/host/stream_benchmark.hpp | 30 ++++++++- b_eff/src/host/network_benchmark.hpp | 66 +++++++------------ shared/include/hpcc_benchmark.hpp | 6 +- 13 files changed, 178 insertions(+), 69 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2370c6bc..7d9db887 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -167,7 +167,7 @@ test:LINPACK: stage: test script: - cd build - - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 + - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test dependencies: - build:LINPACK @@ -180,7 +180,7 @@ test:GEMM: stage: test script: - cd build - - cmake ../GEMM -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 + - cmake ../GEMM -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test dependencies: - build:GEMM diff --git a/FFT/src/host/fft_benchmark.hpp b/FFT/src/host/fft_benchmark.hpp index da4f7c39..2f1cbd17 100644 --- a/FFT/src/host/fft_benchmark.hpp +++ b/FFT/src/host/fft_benchmark.hpp @@ -79,6 +79,10 @@ class FFTData { public: + /** + * @brief The data array used for the FFT calculation + * + */ std::complex* data; /** @@ -215,8 +219,22 @@ void bit_reverse(std::complex *data, unsigned iterations); // This agreement shall be governed in all respects by the laws of the State of California and // by the laws of the United States of America. +/** + * @brief Do an FFT with a reference implementation on the CPU + * + * @param inverse if false, the FFT will be calculated, else the iFFT + * @param lognr_points The log2 of the FFT size that should be calculated + * @param data The input data for the FFT + */ void fourier_transform_gold(bool inverse, const int lognr_points, std::complex *data); +/** + * @brief Calculate a single stage of the whole FFT calculation.
+ * This function will mainly be used by fourier_transform_gold() to calculate the FFT. + * + * @param lognr_points The log2 of the FFT size that should be calculated + * @param data The input data for the FFT stage + */ void fourier_stage(int lognr_points, std::complex *data); } // namespace fft diff --git a/GEMM/src/host/gemm_benchmark.hpp b/GEMM/src/host/gemm_benchmark.hpp index 71b37f8a..7a7ccd5c 100644 --- a/GEMM/src/host/gemm_benchmark.hpp +++ b/GEMM/src/host/gemm_benchmark.hpp @@ -39,7 +39,7 @@ extern "C" void sgemm_(char*, char*, int*, int*,int*, float*, float*, int*, floa /** - * @brief Contains all classes and methods needed by the LINPACK benchmark + * @brief Contains all classes and methods needed by the GEMM benchmark * */ namespace gemm { @@ -74,14 +74,54 @@ class GEMMProgramSettings : public hpcc_base::BaseSettings { }; /** - * @brief Data class cotnaining the data the kernel is exeucted with + * @brief Data class containing all data needed by the kernel to calculate + * \f$C\_out = \alpha * A * B + \beta * C\f$ * */ class GEMMData { public: - HOST_DATA_TYPE *A, *B, *C, *C_out; - HOST_DATA_TYPE normtotal, alpha, beta; + /** + * @brief Pointer to the matrix A of the calculation + * + */ + HOST_DATA_TYPE *A; + + /** + * @brief Pointer to the matrix B of the calculation + * + */ + HOST_DATA_TYPE *B; + + /** + * @brief Pointer to the matrix C of the calculation + * + */ + HOST_DATA_TYPE *C; + + /** + * @brief Pointer to the output matrix of the calculation + * + */ + HOST_DATA_TYPE *C_out; + + /** + * @brief Stores the maximum value of all input matrices for the error calculation + * + */ + HOST_DATA_TYPE normtotal; + + /** + * @brief The scalar value that will be used for \f$\alpha\f$ in the calculation + * + */ + HOST_DATA_TYPE alpha; + + /** + * @brief The scalar value that will be used for \f$\beta\f$ in the calculation + * + */ + HOST_DATA_TYPE beta; /** * @brief Construct a new GEMM Data object diff --git a/LINPACK/src/host/linpack_benchmark.hpp 
b/LINPACK/src/host/linpack_benchmark.hpp index 355fa7c6..68efe1f6 100644 --- a/LINPACK/src/host/linpack_benchmark.hpp +++ b/LINPACK/src/host/linpack_benchmark.hpp @@ -67,14 +67,35 @@ class LinpackProgramSettings : public hpcc_base::BaseSettings { }; /** - * @brief Data class cotnaining the data the kernel is exeucted with + * @brief Data class containing the data the kernel is executed with * */ class LinpackData { public: - HOST_DATA_TYPE *A, *b; + + /** + * @brief The input matrix representing the left side of the linear equation system + * + */ + HOST_DATA_TYPE *A; + + /** + * @brief The input vector representing the right side of the linear equation system + * + */ + HOST_DATA_TYPE *b; + + /** + * @brief A vector that can be used to store pivoting information + * + */ cl_int* ipvt; + + /** + * @brief The maximum value of A that will be used for the error calculation + * + */ HOST_DATA_TYPE norma; /** @@ -201,6 +222,7 @@ Can be used in exchange with kernel functions for functionality testing @param a the matrix with size of n*n @param n size of matrix A @param lda row with of the matrix.
must be >=n +@param ipvt array of pivoting indices */ void gefa_ref(HOST_DATA_TYPE* a, unsigned n, unsigned lda, cl_int* ipvt); diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp index d243829f..89debac9 100644 --- a/PTRANS/src/host/transpose_benchmark.cpp +++ b/PTRANS/src/host/transpose_benchmark.cpp @@ -68,11 +68,6 @@ transpose::TransposeBenchmark::executeKernel(TransposeData &data) { return bm_execution::calculate(*executionSettings, data.A, data.B, data.result); } -/** -Prints the execution results to stdout - -@param results The execution results -*/ void transpose::TransposeBenchmark::printResults(const transpose::TransposeExecutionTimings &output) { double flops = executionSettings->programSettings->matrixSize * executionSettings->programSettings->matrixSize; diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index fb7666be..929a1aa5 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -79,7 +79,23 @@ class TransposeProgramSettings : public hpcc_base::BaseSettings { class TransposeData { public: - HOST_DATA_TYPE *A, *B, *result; + /** + * @brief Input matrix A + * + */ + HOST_DATA_TYPE *A; + + /** + * @brief Input matrix B + * + */ + HOST_DATA_TYPE *B; + + /** + * @brief The result matrix + * + */ + HOST_DATA_TYPE *result; /** * @brief Construct a new Transpose Data object diff --git a/README.md b/README.md index 405ac454..f10b59bc 100755 --- a/README.md +++ b/README.md @@ -170,6 +170,17 @@ When major changes are made on the code the functionality should be checked by r To simplify this process the script `test_all.sh` can be used to build all benchmarks with the default configuration and run all tests. +## Code Documentation + +The benchmark suite supports the generation of code documentation using doxygen in HTML and Latex format. 
+To generate the documentation, execute the following commands: + + cd docs + doxygen doxy.config + +The generated documentation will be placed in `docs/html` and `docs/latex`. +To view the HTML documentation, open `docs/html/index.html` with an internet browser. + ## Notes on Xilinx Vitis Compatibility diff --git a/RandomAccess/src/host/random_access_benchmark.cpp b/RandomAccess/src/host/random_access_benchmark.cpp index fb72c45c..cf67db60 100644 --- a/RandomAccess/src/host/random_access_benchmark.cpp +++ b/RandomAccess/src/host/random_access_benchmark.cpp @@ -70,11 +70,6 @@ random_access::RandomAccessBenchmark::executeKernel(RandomAccessData &data) { return bm_execution::calculate(*executionSettings, data.data); } -/** -Prints the execution results to stdout - -@param results The execution results -*/ void random_access::RandomAccessBenchmark::printResults(const random_access::RandomAccessExecutionTimings &output) { std::cout << std::setw(ENTRY_SPACE) diff --git a/RandomAccess/src/host/random_access_benchmark.hpp b/RandomAccess/src/host/random_access_benchmark.hpp index 6eb6470c..d3c8bc79 100644 --- a/RandomAccess/src/host/random_access_benchmark.hpp +++ b/RandomAccess/src/host/random_access_benchmark.hpp @@ -79,6 +79,11 @@ class RandomAccessProgramSettings : public hpcc_base::BaseSettings { class RandomAccessData { public: + + /** + * @brief The input data array that will be updated using random accesses + * + */ HOST_DATA_TYPE *data; /** diff --git a/STREAM/src/host/stream_benchmark.cpp b/STREAM/src/host/stream_benchmark.cpp index e3448f51..82ce87c8 100644 --- a/STREAM/src/host/stream_benchmark.cpp +++ b/STREAM/src/host/stream_benchmark.cpp @@ -77,11 +77,6 @@ stream::StreamBenchmark::executeKernel(StreamData &data) { data.C); } -/** -Prints the execution results to stdout - -@param results The execution results -*/ void stream::StreamBenchmark::printResults(const stream::StreamExecutionTimings &output) { diff --git a/STREAM/src/host/stream_benchmark.hpp
b/STREAM/src/host/stream_benchmark.hpp index 71287e44..a38cb925 100644 --- a/STREAM/src/host/stream_benchmark.hpp +++ b/STREAM/src/host/stream_benchmark.hpp @@ -85,7 +85,30 @@ class StreamProgramSettings : public hpcc_base::BaseSettings { class StreamData { public: - HOST_DATA_TYPE *A, *B, *C; + /** + * @brief The input array A of the benchmark + * + */ + HOST_DATA_TYPE *A; + + /** + * @brief The input array B of the benchmark + * + */ + HOST_DATA_TYPE *B; + + /** + * @brief The input array C of the benchmark + * + */ + HOST_DATA_TYPE *C; + + /** + * @brief Construct a new Stream Data object + * + * @param context the context that will be used to allocate SVM memory + * @param size the size of the data arrays in number of values + */ StreamData(const cl::Context& context, size_t size) { #ifdef INTEL_FPGA #ifdef USE_SVM @@ -111,6 +134,10 @@ class StreamData { #endif } + /** + * @brief Destroy the Stream Data object + * + */ ~StreamData() { #ifdef USE_SVM clSVMFree(A); @@ -183,7 +210,6 @@ class StreamBenchmark : public hpcc_base::HpccFpgaBenchmark calculationTimings; }; @@ -146,8 +167,7 @@ class NetworkBenchmark : public hpcc_base::HpccFpgaBenchmark *data, unsigned iterations); - -// The function definitions and implementations below this comment are taken from the -// Network1D example implementation of the Intel FPGA SDK for OpenCL 19.4 -// They are licensed under the following conditions: -// -// Copyright (C) 2013-2019 Altera Corporation, San Jose, California, USA. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this -// software and associated documentation files (the "Software"), to deal in the Software -// without restriction, including without limitation the rights to use, copy, modify, merge, -// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to -// whom the Software is furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in all copies or -// substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. -// -// This agreement shall be governed in all respects by the laws of the State of California and -// by the laws of the United States of America. - -void fourier_transform_gold(bool inverse, const int lognr_points, std::complex *data); - -void fourier_stage(int lognr_points, std::complex *data); - } // namespace network diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 75f6a2fe..3d2df760 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -299,8 +299,10 @@ class HpccFpgaBenchmark { } /** - * @brief Selects and prepares the target device and prints the final configuration - * before executing the benchmark + * @brief Selects and prepares the target device and prints the final configuration. + * This method will initialize the executionSettings that are needed for the + * benchmark execution. 
+ * Thus, it has to be called before the executeBenchmark() or executeKernel() methods are called! * * @param argc Number of input parameters as it is provided by the main function * @param argv Strings containing the input parameters as provided by the main function From 34d2dbddd4dff186d861bbe51e2e86c46d5608a2 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 20 May 2020 17:42:17 +0200 Subject: [PATCH 45/45] Add build script for RandomAccess and s10mx --- RandomAccess/scripts/build_s10mx_hbm.sh | 29 +++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 RandomAccess/scripts/build_s10mx_hbm.sh diff --git a/RandomAccess/scripts/build_s10mx_hbm.sh b/RandomAccess/scripts/build_s10mx_hbm.sh new file mode 100644 index 00000000..17fef118 --- /dev/null +++ b/RandomAccess/scripts/build_s10mx_hbm.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# +# Synthesize the RandomAccess single kernel for the Stratix 10 MX HBM board on Noctua. +# Submit this script to sbatch in this folder! +# +#SBATCH -p fpgasyn + +module load intelFPGA_pro/19.4.0 +module load intel_s10mx/19.3.0 +module load lang/Python/3.7.0-foss-2018b +module load devel/CMake/3.15.3-GCCcore-8.3.0 + +SCRIPT_PATH=${SLURM_SUBMIT_DIR} + +BENCHMARK_DIR=${SCRIPT_PATH}/../ + +BUILD_DIR_4K=${SCRIPT_PATH}/../../build/synth/RA-s10xm_hbm + +mkdir -p ${BUILD_DIR_4K} +cd ${BUILD_DIR_4K} + +cmake ${BENCHMARK_DIR} -DDEVICE_BUFFER_SIZE=1024 -DNUM_REPLICATIONS=32 \ + -DAOC_FLAGS="-fpc -fp-relaxed -global-ring" \ + -DINTEL_CODE_GENERATION_SETTINGS=${BENCHMARK_DIR}/settings/settings.gen.intel.random_access_kernels_single.s10mxhbm.py + +make random_access_kernels_single_intel& + +wait +