diff --git a/.clang-format b/.clang-format new file mode 100644 index 00000000..656a3655 --- /dev/null +++ b/.clang-format @@ -0,0 +1,7 @@ +--- +BasedOnStyle: LLVM +Language: Cpp +BreakConstructorInitializersBeforeComma: 'true' +AllowShortFunctionsOnASingleLine: All +PointerAlignment: Left +ColumnLimit: 120 \ No newline at end of file diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 00000000..07e30621 --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,64 @@ +--- +# Configure clang-tidy for this project. + +# -bugprone-narrowing-conversions: too many false positives around +# `std::size_t` vs. `*::difference_type`. + +# -boost-use-ranges: crash of clangd https://github.com/llvm/llvm-project/issues/109037 + +# -readability-identifier-length length of at least 3 does not make sense for some variables + +# -cppcoreguidelines-avoid-magic-numbers +# -readability-magic-numbers currently we have too may numbers in this code + +# -bugprone-easily-swappable-parameters we are not using strong typedefs + +# -readability-function-cognitive-complexity allow big functions + +Checks: > + -*, + boost-*, + bugprone-*, + cert-*, + clang-analyzer-*, + concurrency-*, + cppcoreguidelines-*, + google-*, + misc-*, + modernize-*, + performance-*, + portability-*, + readability-*, + -bugprone-narrowing-conversions, + -cppcoreguidelines-special-member-functions, + -boost-use-ranges, + -readability-identifier-length, + -cppcoreguidelines-avoid-magic-numbers, + -readability-magic-numbers, + -bugprone-easily-swappable-parameters, + -readability-function-cognitive-complexity + +# Turn all the warnings from the checks above into errors. 
+WarningsAsErrors: "*" + +HeaderFilterRegex: "include/firestarter/.*\\.(h|hpp)$" + +CheckOptions: + - { key: readability-identifier-naming.NamespaceCase, value: lower_case } + - { key: readability-identifier-naming.ClassCase, value: CamelCase } + - { key: readability-identifier-naming.StructCase, value: CamelCase } + - { key: readability-identifier-naming.FunctionCase, value: camelBack } + - { key: readability-identifier-naming.MemberCase, value: CamelCase } + - { key: readability-identifier-naming.VariableCase, value: CamelCase } + - { key: readability-identifier-naming.EnumCase, value: CamelCase } + - { key: readability-identifier-naming.ParameterCase, value: CamelCase } + - { key: readability-identifier-naming.UnionCase, value: CamelCase } + - { key: readability-identifier-naming.IgnoreMainLikeFunctions, value: 1 } + - { key: readability-redundant-member-init.IgnoreBaseInCopyConstructors, value: 1 } + - { key: modernize-use-default-member-init.UseAssignment, value: 1 } + - { key: readability-implicit-bool-conversion.AllowIntegerConditions, value: 1 } + - { key: readability-implicit-bool-conversion.AllowPointerConditions, value: 1 } + - { key: readability-function-cognitive-complexity.IgnoreMacros, value: 1 } + - { key: misc-non-private-member-variables-in-classes.IgnoreClassesWithAllMemberVariablesBeingPublic, value: "true" } + # disable warnings is asmjit + - { key: 'clang-analyzer-optin.cplusplus.UninitializedObject:IgnoreRecordsWithField', value: 'asmjit::Operand_::Signature' } \ No newline at end of file diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 00000000..d1806bac --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1 @@ +9732bdb59717274f666e9c1497289d1f9a0d7858 diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml new file mode 100644 index 00000000..ef004c50 --- /dev/null +++ b/.github/workflows/clang-format.yml @@ -0,0 +1,19 @@ +name: clang-format-review + +# You can be more 
specific, but it currently only works on pull requests +on: [push, pull_request] + +jobs: + clang-format: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Install clang-tidy + run: | + sudo apt-get update + sudo apt-get install -y clang-tidy + - name: Analyze + run: | + clang-format --dry-run --Werror -style=file $(find ./src/ -name '*.cpp' -print) + clang-format --dry-run --Werror -style=file $(find ./include/ -name '*.hpp' -print) + clang-format --dry-run --Werror -style=file $(find ./include/ -name '*.h' -print) diff --git a/.github/workflows/clang-tidy.yml b/.github/workflows/clang-tidy.yml new file mode 100644 index 00000000..051d72f9 --- /dev/null +++ b/.github/workflows/clang-tidy.yml @@ -0,0 +1,50 @@ +name: clang-tidy-review + +on: [push, pull_request] + +env: + PYTHONUNBUFFERED: 1 + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + with: + submodules: 'true' + + # Ideally we would want to run the clang-tidy for every kind of build. + # This would make sure that we will check all platform dependent code parts. + # Here we only test the standard linux build. + - name: Install python3 and libraries + run: | + sudo apt update + sudo apt install python3 python3-pip + pip install click + + - name: Create build directory + run: | + mkdir build + + - name: Run CMake configure (default) + run: | + cd build + cmake ..
+ + - name: Build + run: | + cd build + make -j4 + + - name: Run clang-tidy + run: | + ./tooling/clang-tidy.py clang-tidy-report --build-root build --cores 4 + + - name: Print report + run: | + cat build/clang-tidy-report.txt + + - name: Check if report is empty + run: | + ./tooling/clang-tidy.py check --build-root build \ No newline at end of file diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 6b4c9178..b58b7e03 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -21,6 +21,8 @@ jobs: run: | sudo rm -rf /usr/local/lib/android sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf /usr/local/.ghcup - name: Install g++-9 (if needed) if: matrix.compiler == 'g++-9' diff --git a/.gitignore b/.gitignore index c4fde123..e157a461 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ result* *.swp *.swo build*/ +.cache/ diff --git a/CMakeLists.txt b/CMakeLists.txt index ec61b97f..c8f580e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,8 @@ cmake_minimum_required(VERSION 3.22) project(FIRESTARTER) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + include(cmake/GitSubmoduleUpdate.cmake) # set FIRESTARTER version diff --git a/include/firestarter/AlignedAlloc.hpp b/include/firestarter/AlignedAlloc.hpp new file mode 100644 index 00000000..7c5714fb --- /dev/null +++ b/include/firestarter/AlignedAlloc.hpp @@ -0,0 +1,78 @@ +/****************************************************************************** + * FIRESTARTER - A Processor Stress Test Utility + * Copyright (C) 2024 TU Dresden, Center for Information Services and High + * Performance Computing + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Contact: daniel.hackenberg@tu-dresden.de + *****************************************************************************/ + +#pragma once + +#include +#include +#include + +namespace firestarter { + +struct AlignedAlloc { +private: + /// Round the size up to the nearest multiple of the alignment + /// \arg Size The number to be rounded up. + /// \arg Alignment The number to whose multiple Size is rounded up. + /// \returns Size rounded up to the nearest multiple of the Alignment + static auto padSize(const std::size_t Size, const std::size_t Alignment) -> std::size_t { + return Alignment * static_cast(std::ceil(static_cast(Size) / static_cast(Alignment))); + }; + +public: + /// Allocate memory with a given alignment. The size will automatically be increased to the nearest multiple of the + /// alignment. + /// \arg Size The minimum required memory. + /// \arg Alignment describes to which boundary the memory should be aligned. The default is 64B which corresponds to + /// the size of a cache line on most systems. + /// \returns The pointer to the allocated memory.
+ static auto malloc(const std::size_t Size, const std::size_t Alignment = 64) -> void* { + // NOLINTBEGIN(cppcoreguidelines-owning-memory) +#if defined(__APPLE__) + return aligned_alloc(Alignment, padSize(Size, Alignment)); +#elif defined(__MINGW64__) + return _mm_malloc(padSize(Size, Alignment), Alignment); +#elif defined(_MSC_VER) + return _aligned_malloc(padSize(Size, Alignment), Alignment); +#else + return aligned_alloc(Alignment, padSize(Size, Alignment)); +#endif + // NOLINTEND(cppcoreguidelines-owning-memory) + }; + + /// Deallocate memory which has been allocated by the AlignedAlloc::malloc function. + /// \arg Ptr The pointer to the allocated memory. + static void free(void* Ptr) { + // NOLINTBEGIN(cppcoreguidelines-owning-memory,cppcoreguidelines-no-malloc) +#if defined(__APPLE__) + ::free(Ptr); +#elif defined(__MINGW64__) + _mm_free(Ptr); +#elif defined(_MSC_VER) + _aligned_free(Ptr); +#else + std::free(Ptr); +#endif + // NOLINTEND(cppcoreguidelines-owning-memory,cppcoreguidelines-no-malloc) + }; +}; + +} // namespace firestarter diff --git a/include/firestarter/Config.hpp b/include/firestarter/Config.hpp new file mode 100644 index 00000000..5c272401 --- /dev/null +++ b/include/firestarter/Config.hpp @@ -0,0 +1,124 @@ +/****************************************************************************** + * FIRESTARTER - A Processor Stress Test Utility + * Copyright (C) 2024 TU Dresden, Center for Information Services and High + * Performance Computing + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Contact: daniel.hackenberg@tu-dresden.de + *****************************************************************************/ + +#pragma once + +#include +#include +#include + +namespace firestarter { + +/// This struct contains the parsed config from the command line for Firestarter. +struct Config { + /// The argument vector from the command line. + const char** Argv; + + /// The timeout after which firestarter terminates. This is available in combination with optimization. + std::chrono::seconds Timeout{}; + /// The period after with which the low/high load routine is switched. + std::chrono::microseconds Period{}; + /// The load in the range of 0 < Load <= Period, which controls how long of the period the high-load loop runs. + std::chrono::microseconds Load{}; + + /// The interval every which the register will be dumped to the file. + std::chrono::seconds DumpRegistersTimeDelta = std::chrono::seconds(0); + /// The time to skip from the measurement start + std::chrono::milliseconds StartDelta = std::chrono::milliseconds(0); + /// The time to skip from the measurement stop + std::chrono::milliseconds StopDelta = std::chrono::milliseconds(0); + /// Metric values will be polled by the MeasurementInterval. + std::chrono::milliseconds MeasurementInterval = std::chrono::milliseconds(0); + /// The time how long the processor will be preheated before starting a measurement or optimization. + std::chrono::seconds Preheat{}; + /// The time how long a measurement should take. + std::chrono::seconds EvaluationDuration{}; + + /// The crossover probability used in the NSGA2 optimization algorithm. + double Nsga2Cr; + /// The mutation probability used in the NSGA2 optimization algorithm. + double Nsga2M; + + /// The name of the metrics that are read from stdin. 
+ std::vector StdinMetrics; + /// The paths to the metrics that are loaded using shared libraries. + std::vector MetricPaths; + /// The list of metrics that are used for maximization. If a metric is prefixed with '-' it will be minimized. + std::vector OptimizationMetrics; + + /// The optional cpu bind that allows pinning to specific cpus. + std::string CpuBind; + /// The optional selected instruction groups. If this is empty the default will be chosen. + std::string InstructionGroups; + /// The file where the dump register feature will save its output to. + std::string DumpRegistersOutpath; + /// The name of the optimization algorithm. + std::string OptimizationAlgorithm; + /// The file where the data collected during optimization is saved. + std::string OptimizeOutfile; + + /// The argument count from the command line. + int Argc; + /// The requested number of threads firestarter should run with. 0 means all threads. + unsigned RequestedNumThreads; + /// The selected function id. 0 means automatic selection. + unsigned FunctionId; + /// The line count of the payload. 0 means default. + unsigned LineCount = 0; + /// The number of gpus firestarter should stress. The default of -1 means all gpus. + int Gpus = 0; + /// The matrix size which should be used. 0 means automatic detection. + unsigned GpuMatrixSize = 0; + /// The number of individuals that should be used for the optimization. + unsigned Individuals; + /// The number of generations that should be used for the optimization. + unsigned Generations; + + /// If the function summary should be printed. + bool PrintFunctionSummary; + /// If the available instruction groups for a function should be printed. + bool ListInstructionGroups; + /// Allow payloads that are not supported on the current processor. + bool AllowUnavailablePayload = false; + /// Is the dump registers debug feature enabled? + bool DumpRegisters = false; + /// Is the error detection feature enabled?
+ bool ErrorDetection = false; + /// Should the GPUs use single precision? If neither GpuUseFloat nor GpuUseDouble is set, precision will be + /// chosen automatically. + bool GpuUseFloat = false; + /// Should the GPUs use double precision? If neither GpuUseFloat nor GpuUseDouble is set, precision will be + /// chosen automatically. + bool GpuUseDouble = false; + /// Should we print all available metrics? + bool ListMetrics = false; + /// Do we perform a measurement? + bool Measurement = false; + /// Do we perform optimization? + bool Optimize = false; + + Config() = delete; + + /// Parse the config from the command line arguments. + Config(int Argc, const char** Argv); +}; + +} // namespace firestarter \ No newline at end of file diff --git a/include/firestarter/Constants.hpp b/include/firestarter/Constants.hpp index 419d8b6a..71a0d992 100644 --- a/include/firestarter/Constants.hpp +++ b/include/firestarter/Constants.hpp @@ -21,16 +21,112 @@ #pragma once -#define THREAD_WAIT 1 -#define THREAD_WORK 2 -#define THREAD_INIT 3 -#define THREAD_STOP 4 -#define THREAD_SWITCH 5 -#define THREAD_INIT_FAILURE 0xffffffff - -/* DO NOT CHANGE! the asm load-loop tests if load-variable is == 0 */ -#define LOAD_LOW 0 -/* DO NOT CHANGE! the asm load-loop continues until the load-variable is != 1 */ -#define LOAD_HIGH 1 -#define LOAD_STOP 2 -#define LOAD_SWITCH 4 +#include + +namespace firestarter { + +using EightBytesType = uint64_t; + +// We want enum to have the size of 8B. Disable the warnings for bigger enum size than needed. +// NOLINTBEGIN(performance-enum-size) + +/// This enum describes the state of the load workers. +enum class LoadThreadState : EightBytesType { + /// Idle + ThreadWait = 1, + /// Work loop (both low and high load) + ThreadWork = 2, + /// Init the thread + ThreadInit = 3, + /// Tell the thread to recompile the payload and reinitialize the data. + ThreadSwitch = 4 +}; + +/// This enum describes the Load that should be applied by firestarter.
+enum class LoadThreadWorkType : EightBytesType { + /* DO NOT CHANGE! the asm load-loop tests if load-variable is == 0 */ + /// Apply low load + LoadLow = 0, + /* DO NOT CHANGE! the asm load-loop continues until the load-variable is != 1 */ + /// Apply high load + LoadHigh = 1, + /// Exit the load loop and stop the execution of firestarter. + LoadStop = 2, + /// Exit the load loop. + LoadSwitch = 4 +}; +// NOLINTEND(performance-enum-size) + +/// This struct holds information about enabled or disabled compile time features for FIRESTARTER. +struct FirestarterOptionalFeatures { + /// Do we have a build that enabled optimization? + bool OptimizationEnabled; + /// Do we have a build that enabled CUDA or HIP? + bool CudaEnabled; + /// Do we have a build that enabled OneAPI? + bool OneAPIEnabled; + /// Is error detection enabled? + bool ErrorDetectionEnabled; + /// Are debug features enabled? + bool DebugFeatureEnabled; + /// Is dumping registers enabled? + bool DumpRegisterEnabled; + /// Is the current build for X86? + bool IsX86; + /// Is the current build for Windows? + bool IsWin32; + /// Is the current build built with Windows MSC? + bool IsMsc; + + /// Is one of the GPU features enabled?
+ [[nodiscard]] constexpr auto gpuEnabled() const -> bool { return CudaEnabled || OneAPIEnabled; } +}; + +// MSC only supports designated initializers from C++20 +static constexpr const FirestarterOptionalFeatures OptionalFeatures { +#if defined(linux) || defined(__linux__) + /*OptimizationEnabled=*/true, +#else + /*OptimizationEnabled=*/false, +#endif + +#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_HIP) + /*CudaEnabled=*/true, +#else + /*CudaEnabled=*/false, +#endif + +#ifdef FIRESTARTER_BUILD_ONEAPI + /*OneAPIEnabled=*/true, +#else + /*OneAPIEnabled=*/false, +#endif + + /*ErrorDetectionEnabled=*/true, + +#ifdef FIRESTARTER_DEBUG_FEATURES + /*DebugFeatureEnabled=*/true, /*DumpRegisterEnabled =*/true, +#else + /*DebugFeatureEnabled=*/false, /*DumpRegisterEnabled =*/false, +#endif + +#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) + /*IsX86=*/true, +#else +#error "FIRESTARTER is not implemented for this ISA" +#endif + +#ifdef _WIN32 + /*IsWin32=*/true, +#else + /*IsWin32=*/false, +#endif + +#ifdef _MSC_VER + /*IsMsc=*/true, +#else + /*IsMsc=*/false, +#endif +}; + +} // namespace firestarter \ No newline at end of file diff --git a/include/firestarter/Cuda/Cuda.hpp b/include/firestarter/Cuda/Cuda.hpp index a2f281d9..396654c7 100644 --- a/include/firestarter/Cuda/Cuda.hpp +++ b/include/firestarter/Cuda/Cuda.hpp @@ -21,30 +21,61 @@ #pragma once +#include "firestarter/Constants.hpp" + #include -#include #include -#include namespace firestarter::cuda { +/// This class handles the workload on CUDA and HIP compatible GPUs. A gemm routine is used to stress them with a +/// constant high load. This header does not include any CUDA or HIP specific headers to allow us to not guard the +/// include of this header in other parts of the program. class Cuda { private: - std::thread _initThread; - std::condition_variable _waitForInitCv; - std::mutex _waitForInitCvMutex; + /// The thread that is used to initialize the gpus.
This thread will wait until each thread that runs the gemm routine + /// joins. + std::thread InitThread; - static void initGpus(std::condition_variable &cv, - volatile unsigned long long *loadVar, bool useFloat, - bool useDouble, unsigned matrixSize, int gpus); + /// Spawns a thread for each of the selected gpus, initializes them and starts the execution of the gemm in parallel. + /// \arg WaitForInitCv The condition variable used to signal that all gpus are initialized. + /// \arg LoadVar A reference to the variable that controls the current load of Firestarter. + /// \arg UseFloat Set to true if we want to stress using single precision floating point. + /// \arg UseDouble Set to true if we want to stress using double precision floating point. If neither UseFloat nor + /// UseDouble is set the precision will be chosen automatically. + /// \arg MatrixSize Set to a specific matrix size which will be chosen for the gemm operation or set to 0 for + /// automatic selection. + /// \arg Gpus Select the number of gpus to stress or -1 for all. + static void initGpus(std::condition_variable& WaitForInitCv, const volatile firestarter::LoadThreadWorkType& LoadVar, + bool UseFloat, bool UseDouble, unsigned MatrixSize, int Gpus); public: - Cuda(volatile unsigned long long *loadVar, bool useFloat, bool useDouble, - unsigned matrixSize, int gpus); + /// Initialize the cuda class. This will start a thread running the Cuda::initGpus function and wait until all gpus + /// are initialized. + /// \arg LoadVar A reference to the variable that controls the current load of Firestarter. + /// \arg UseFloat Set to true if we want to stress using single precision floating point. + /// \arg UseDouble Set to true if we want to stress using double precision floating point. If neither UseFloat nor + /// UseDouble is set the precision will be chosen automatically.
+ /// \arg MatrixSize Set to a specific matrix size which will be choosen for the gemm operation or set to 0 for + /// automatic selection. + /// \arg Gpus Select the number of gpus to stress or -1 for all. + Cuda(const volatile firestarter::LoadThreadWorkType& LoadVar, bool UseFloat, bool UseDouble, unsigned MatrixSize, + int Gpus) +#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_HIP) + ; +#else + { + (void)&LoadVar; + (void)UseFloat; + (void)UseDouble; + (void)MatrixSize; + (void)Gpus; + } +#endif ~Cuda() { - if (_initThread.joinable()) { - _initThread.join(); + if (InitThread.joinable()) { + InitThread.join(); } } }; diff --git a/include/firestarter/Cuda/CudaHipCompat.hpp b/include/firestarter/Cuda/CudaHipCompat.hpp new file mode 100644 index 00000000..f0543f4d --- /dev/null +++ b/include/firestarter/Cuda/CudaHipCompat.hpp @@ -0,0 +1,774 @@ +/****************************************************************************** + * FIRESTARTER - A Processor Stress Test Utility + * Copyright (C) 2024 TU Dresden, Center for Information Services and High + * Performance Computing + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Contact: daniel.hackenberg@tu-dresden.de + *****************************************************************************/ + +// This file provides compatibility for the minor differences between the CUDA and HIP APIs. We do this by: +// 1. 
Include the required header files for CUDA or HIP +// 2. Define compatibility types between CUDA and HIP. This results in all enum names being the same in the source code. +// These types are mapped to the ones with the correct prefix. These are cu and hip, CU and HIP, cuda and hip or CUDA +// and HIP. +// 3. Define functions that convert the error code enums into strings. +// 4. Define compatibility functions for calls to CUDA, HIP or one of their libraries (blas, rand etc.) + +#pragma once + +#include "firestarter/Logging/Log.hpp" + +#include +#include +#include +#include +#include + +#ifdef FIRESTARTER_BUILD_CUDA + +#include +#include +#include +#include + +#elif defined(FIRESTARTER_BUILD_HIP) + +#include +#include +#include +#include + +#endif + +namespace firestarter::cuda::compat { + +/// Use this function as a wrapper to all calls of CUDA or HIP functions. If an error occurred we abort and print the +/// error code. +/// \tparam T The type of the error code returned from calls to CUDA or HIP. This may be one of BlasStatusT, ErrorT, +/// RandStatusT or CUresult. +/// \arg TVal The error code returned from calls to CUDA or HIP. +/// \arg File The file for the log message in which the error occurred. +/// \arg Line The line for the log message in which the error occurred. +/// \arg DeviceIndex if the CUDA or HIP call is associated to a specific device, the index of the device should be +/// provided here for the log message.
+template void accellSafeCall(T TVal, const char* File, int Line, std::optional DeviceIndex = {}); + +#ifdef FIRESTARTER_BUILD_CUDA +// Start of CUDA compatibility types + +// NOLINTNEXTLINE(performance-enum-size) +enum class BlasStatusT : std::underlying_type_t { + BLAS_STATUS_SUCCESS = CUBLAS_STATUS_SUCCESS, + BLAS_STATUS_NOT_INITIALIZED = CUBLAS_STATUS_NOT_INITIALIZED, + BLAS_STATUS_ALLOC_FAILED = CUBLAS_STATUS_ALLOC_FAILED, + BLAS_STATUS_INVALID_VALUE = CUBLAS_STATUS_INVALID_VALUE, + BLAS_STATUS_ARCH_MISMATCH = CUBLAS_STATUS_ARCH_MISMATCH, + BLAS_STATUS_MAPPING_ERROR = CUBLAS_STATUS_MAPPING_ERROR, + BLAS_STATUS_EXECUTION_FAILED = CUBLAS_STATUS_EXECUTION_FAILED, + BLAS_STATUS_INTERNAL_ERROR = CUBLAS_STATUS_INTERNAL_ERROR, + BLAS_STATUS_NOT_SUPPORTED = CUBLAS_STATUS_NOT_SUPPORTED, + BLAS_STATUS_LICENSE_ERROR = CUBLAS_STATUS_LICENSE_ERROR, +}; + +constexpr const char* AccelleratorString = "CUDA"; + +// NOLINTNEXTLINE(performance-enum-size) +enum class ErrorT : std::underlying_type_t { + Success = cudaSuccess, +}; + +// NOLINTNEXTLINE(performance-enum-size) +enum class RandStatusT : std::underlying_type_t { + RAND_STATUS_SUCCESS = CURAND_STATUS_SUCCESS, + RAND_STATUS_VERSION_MISMATCH = CURAND_STATUS_VERSION_MISMATCH, + RAND_STATUS_NOT_INITIALIZED = CURAND_STATUS_NOT_INITIALIZED, + RAND_STATUS_ALLOCATION_FAILED = CURAND_STATUS_ALLOCATION_FAILED, + RAND_STATUS_TYPE_ERROR = CURAND_STATUS_TYPE_ERROR, + RAND_STATUS_OUT_OF_RANGE = CURAND_STATUS_OUT_OF_RANGE, + RAND_STATUS_LENGTH_NOT_MULTIPLE = CURAND_STATUS_LENGTH_NOT_MULTIPLE, + RAND_STATUS_DOUBLE_PRECISION_REQUIRED = CURAND_STATUS_DOUBLE_PRECISION_REQUIRED, + RAND_STATUS_LAUNCH_FAILURE = CURAND_STATUS_LAUNCH_FAILURE, + RAND_STATUS_PREEXISTING_FAILURE = CURAND_STATUS_PREEXISTING_FAILURE, + RAND_STATUS_INITIALIZATION_FAILED = CURAND_STATUS_INITIALIZATION_FAILED, + RAND_STATUS_ARCH_MISMATCH = CURAND_STATUS_ARCH_MISMATCH, + RAND_STATUS_INTERNAL_ERROR = CURAND_STATUS_INTERNAL_ERROR, +}; + +using StreamOrContext = CUcontext; 
+ +using DeviceProperties = cudaDeviceProp; + +using RandGenerator = curandGenerator_t; + +using BlasHandle = cublasHandle_t; + +using BlasStatus = cublasStatus_t; + +// NOLINTNEXTLINE(performance-enum-size) +enum class BlasOperation : std::underlying_type_t { + BLAS_OP_N = CUBLAS_OP_N, + BLAS_OP_T = CUBLAS_OP_T, + BLAS_OP_C = CUBLAS_OP_C, +}; + +using BlasOperationT = cublasOperation_t; + +using CUResultOrHipErrorT = CUresult; + +#elif defined(FIRESTARTER_BUILD_HIP) +// Start of HIP compatibility types + +// NOLINTNEXTLINE(performance-enum-size) +enum class BlasStatusT : std::underlying_type_t { + BLAS_STATUS_SUCCESS = HIPBLAS_STATUS_SUCCESS, + BLAS_STATUS_NOT_INITIALIZED = HIPBLAS_STATUS_NOT_INITIALIZED, + BLAS_STATUS_ALLOC_FAILED = HIPBLAS_STATUS_ALLOC_FAILED, + BLAS_STATUS_INVALID_VALUE = HIPBLAS_STATUS_INVALID_VALUE, + BLAS_STATUS_ARCH_MISMATCH = HIPBLAS_STATUS_ARCH_MISMATCH, + BLAS_STATUS_MAPPING_ERROR = HIPBLAS_STATUS_MAPPING_ERROR, + BLAS_STATUS_EXECUTION_FAILED = HIPBLAS_STATUS_EXECUTION_FAILED, + BLAS_STATUS_INTERNAL_ERROR = HIPBLAS_STATUS_INTERNAL_ERROR, + BLAS_STATUS_NOT_SUPPORTED = HIPBLAS_STATUS_NOT_SUPPORTED, + BLAS_STATUS_UNKNOWN = HIPBLAS_STATUS_UNKNOWN, + BLAS_STATUS_HANDLE_IS_NULLPTR = HIPBLAS_STATUS_HANDLE_IS_NULLPTR, + BLAS_STATUS_INVALID_ENUM = HIPBLAS_STATUS_INVALID_ENUM, +}; + +constexpr const char* AccelleratorString = "HIP"; + +// NOLINTNEXTLINE(performance-enum-size) +enum class ErrorT : std::underlying_type_t { + Success = hipSuccess, +}; + +// NOLINTNEXTLINE(performance-enum-size) +enum class RandStatusT : std::underlying_type_t { + RAND_STATUS_SUCCESS = HIPRAND_STATUS_SUCCESS, + RAND_STATUS_VERSION_MISMATCH = HIPRAND_STATUS_VERSION_MISMATCH, + RAND_STATUS_NOT_INITIALIZED = HIPRAND_STATUS_NOT_INITIALIZED, + RAND_STATUS_ALLOCATION_FAILED = HIPRAND_STATUS_ALLOCATION_FAILED, + RAND_STATUS_TYPE_ERROR = HIPRAND_STATUS_TYPE_ERROR, + RAND_STATUS_OUT_OF_RANGE = HIPRAND_STATUS_OUT_OF_RANGE, + RAND_STATUS_LENGTH_NOT_MULTIPLE = 
HIPRAND_STATUS_LENGTH_NOT_MULTIPLE, + RAND_STATUS_DOUBLE_PRECISION_REQUIRED = HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED, + RAND_STATUS_LAUNCH_FAILURE = HIPRAND_STATUS_LAUNCH_FAILURE, + RAND_STATUS_PREEXISTING_FAILURE = HIPRAND_STATUS_PREEXISTING_FAILURE, + RAND_STATUS_INITIALIZATION_FAILED = HIPRAND_STATUS_INITIALIZATION_FAILED, + RAND_STATUS_ARCH_MISMATCH = HIPRAND_STATUS_ARCH_MISMATCH, + RAND_STATUS_INTERNAL_ERROR = HIPRAND_STATUS_INTERNAL_ERROR, + RAND_STATUS_NOT_IMPLEMENTED = HIPRAND_STATUS_NOT_IMPLEMENTED, +}; + +using StreamOrContext = hipStream_t; + +using DeviceProperties = hipDeviceProp_t; + +using RandGenerator = hiprandGenerator_t; + +using BlasHandle = hipblasHandle_t; + +using BlasStatus = hipblasStatus_t; + +// NOLINTNEXTLINE(performance-enum-size) +enum class BlasOperation : std::underlying_type_t { + BLAS_OP_N = HIPBLAS_OP_N, + BLAS_OP_T = HIPBLAS_OP_T, + BLAS_OP_C = HIPBLAS_OP_C, +}; + +using BlasOperationT = hipblasOperation_t; + +using CUResultOrHipErrorT = ErrorT; + +#else + +// Start of compatibility types for clangd + +// NOLINTNEXTLINE(performance-enum-size) +enum class BlasStatusT { + BLAS_STATUS_SUCCESS = 0, +}; + +constexpr const char* AccelleratorString = "unknown"; + +// NOLINTNEXTLINE(performance-enum-size) +enum class ErrorT { + Success = 0, +}; + +// NOLINTNEXTLINE(performance-enum-size) +enum class RandStatusT { + RAND_STATUS_SUCCESS = 0, +}; + +using StreamOrContext = void*; + +using DeviceProperties = void*; + +using RandGenerator = void*; + +using BlasHandle = void*; + +using BlasStatus = void*; + +// NOLINTNEXTLINE(performance-enum-size) +enum class BlasOperation { + BLAS_OP_N, + BLAS_OP_T, + BLAS_OP_C, +}; + +using BlasOperationT = std::size_t; + +using CUResultOrHipErrorT = void*; + +#endif + +// abstracted function for both CUDA and HIP + +/// Get the error string from a call to CUDA of HIP libraries. +/// \arg Status The status code that is returned by these calls. +/// \return The error as a string. 
+inline auto getErrorString(ErrorT Error) -> const char* { +#ifdef FIRESTARTER_BUILD_CUDA + return cudaGetErrorString(static_cast(Error)); +#elif defined(FIRESTARTER_BUILD_HIP) + return hipGetErrorString(static_cast(Error)); +#else + (void)Error; + return "unknown"; +#endif +} + +/// Get the error string from a call to CUDA of HIP blas library. +/// \arg Status The status code that is returned by these calls. +/// \return The error as a string. +constexpr auto getErrorString(BlasStatusT Status) -> const char* { + switch (Status) { + case BlasStatusT::BLAS_STATUS_SUCCESS: + return "blas status: success"; +#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_HIP) + case BlasStatusT::BLAS_STATUS_NOT_INITIALIZED: + return "blas status: not initialized"; + case BlasStatusT::BLAS_STATUS_ALLOC_FAILED: + return "blas status: alloc failed"; + case BlasStatusT::BLAS_STATUS_INVALID_VALUE: + return "blas status: invalid value"; + case BlasStatusT::BLAS_STATUS_ARCH_MISMATCH: + return "blas status: arch mismatch"; + case BlasStatusT::BLAS_STATUS_MAPPING_ERROR: + return "blas status: mapping error"; + case BlasStatusT::BLAS_STATUS_EXECUTION_FAILED: + return "blas status: execution failed"; + case BlasStatusT::BLAS_STATUS_INTERNAL_ERROR: + return "blas status: internal error"; + case BlasStatusT::BLAS_STATUS_NOT_SUPPORTED: + return "blas status: not supported"; +#endif +#ifdef FIRESTARTER_BUILD_CUDA + case BlasStatusT::BLAS_STATUS_LICENSE_ERROR: + return "blas status: license error"; +#endif +#ifdef FIRESTARTER_BUILD_HIP + case BlasStatusT::BLAS_STATUS_UNKNOWN: + return "blas status: unknown"; + case BlasStatusT::BLAS_STATUS_HANDLE_IS_NULLPTR: + return "blas status: handle is null pointer"; + case BlasStatusT::BLAS_STATUS_INVALID_ENUM: + return "blas status: invalid enum"; +#endif + default: + return "unknown"; + } +} + +/// Get the error string from a call to CUDA of HIP random library. +/// \arg Status The status code that is returned by these calls. 
+/// \return The error as a string. +constexpr auto getErrorString(RandStatusT Status) -> const char* { + switch (Status) { + case RandStatusT::RAND_STATUS_SUCCESS: + return "rand status: success"; +#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_HIP) + case RandStatusT::RAND_STATUS_VERSION_MISMATCH: + return "rand status: version mismatch"; + case RandStatusT::RAND_STATUS_NOT_INITIALIZED: + return "rand status: not initialized"; + case RandStatusT::RAND_STATUS_ALLOCATION_FAILED: + return "rand status: allocation failed"; + case RandStatusT::RAND_STATUS_TYPE_ERROR: + return "rand status: type error"; + case RandStatusT::RAND_STATUS_OUT_OF_RANGE: + return "rand status: out of range"; + case RandStatusT::RAND_STATUS_LENGTH_NOT_MULTIPLE: + return "rand status: length not multiple"; + case RandStatusT::RAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return "rand status: double precision required"; + case RandStatusT::RAND_STATUS_LAUNCH_FAILURE: + return "rand status: launch failure"; + case RandStatusT::RAND_STATUS_PREEXISTING_FAILURE: + return "rand status: preexisting failure"; + case RandStatusT::RAND_STATUS_INITIALIZATION_FAILED: + return "rand status: initialization failed"; + case RandStatusT::RAND_STATUS_ARCH_MISMATCH: + return "rand status: arch mismatch"; + case RandStatusT::RAND_STATUS_INTERNAL_ERROR: + return "rand status: internal error"; +#endif +#ifdef FIRESTARTER_BUILD_HIP + case RandStatusT::RAND_STATUS_NOT_IMPLEMENTED: + return "rand status: not implemented"; +#endif + default: + return "unknown"; + } +} + +#ifdef FIRESTARTER_BUILD_CUDA +/// Get the error string from a call to CUDA library. +/// \arg Result The status code that is returned by these calls. +/// \return The error as a string. 
+auto getErrorString(CUresult Result) -> const char* { + const char* ErrorString; + accellSafeCall(cuGetErrorName(Result, &ErrorString), __FILE__, __LINE__); + return ErrorString; +} +#else +// define types to not run into compile errors with if constexpr + +enum class CUresult {}; +// NOLINTBEGIN(readability-identifier-naming) +constexpr const int CUDA_SUCCESS = 0; +// NOLINTEND(readability-identifier-naming) +#endif + +template void accellSafeCall(T TVal, const char* File, const int Line, std::optional DeviceIndex) { + if constexpr (std::is_same_v) { + if (TVal == BlasStatusT::BLAS_STATUS_SUCCESS) { + return; + } + } else if constexpr (std::is_same_v) { + if (TVal == ErrorT::Success) { + return; + } + } else if constexpr (std::is_same_v) { + if (TVal == RandStatusT::RAND_STATUS_SUCCESS) { + return; + } + } else if constexpr (std::is_same_v) { +#ifndef FIRESTARTER_BUILD_CUDA + static_assert(false, "Tried to call accellSafeCall with CUresult, but not building for CUDA."); +#endif + if (TVal == CUDA_SUCCESS) { + return; + } + } else { + assert(false && "Tried to call accellSafeCall with an unknown type."); + } + + std::stringstream Ss; + Ss << AccelleratorString << " error at " << File << ":" << Line + << ": error code = " << static_cast>(TVal) << " (" << getErrorString(TVal) << ")"; + + if (DeviceIndex) { + Ss << ", device index: " << *DeviceIndex; + } + + firestarter::log::error() << Ss.str(); +} + +/// Wrapper to cuInit or hipInit. +/// \arg Flags The Flags forwarded to cuInit or hipInit. +/// \returns The Error code returned from these calls. +inline auto init(unsigned int Flags) -> CUResultOrHipErrorT { +#ifdef FIRESTARTER_BUILD_CUDA + return cuInit(Flags); +#elif defined(FIRESTARTER_BUILD_HIP) + return static_cast(hipInit(Flags)); +#else + (void)Flags; + static_assert(false, "Tried to call init, but neither building for CUDA nor HIP."); +#endif +} + +/// Get the number GPU devices. Wrapper to cuDeviceGetCount or hipGetDeviceCount. 
+/// \arg DevCount The reference to where the number of GPU devices will be written. +/// \returns The Error code returned from these calls. +inline auto getDeviceCount(int& DevCount) -> CUResultOrHipErrorT { +#ifdef FIRESTARTER_BUILD_CUDA + return cuDeviceGetCount(&DevCount); +#elif defined(FIRESTARTER_BUILD_HIP) + return static_cast(hipGetDeviceCount(&DevCount)); +#else + (void)DevCount; + static_assert(false, "Tried to call getDeviceCount, but neither building for CUDA nor HIP."); +#endif +} + +/// Create a context in case of CUDA or a stream in case of HIP on a specific device. It must be deleted with +/// destroyContextOrStream. +/// \arg DeviceIndex The device on which to create the context or stream. +/// \return The created context or stream. +inline auto createContextOrStream(int DeviceIndex) -> StreamOrContext { + StreamOrContext Soc{}; +#ifdef FIRESTARTER_BUILD_CUDA + firestarter::log::trace() << "Creating " << AccelleratorString << " context for computation on device nr. " + << DeviceIndex; + CUdevice Device; + accellSafeCall(cuDeviceGet(&Device, DeviceIndex), __FILE__, __LINE__, DeviceIndex); + accellSafeCall(cuCtxCreate(&Soc, 0, Device), __FILE__, __LINE__, DeviceIndex); + + firestarter::log::trace() << "Set created " << AccelleratorString << " context on device nr. " << DeviceIndex; + accellSafeCall(cuCtxSetCurrent(Soc), __FILE__, __LINE__, DeviceIndex); +#elif defined(FIRESTARTER_BUILD_HIP) + firestarter::log::trace() << "Creating " << AccelleratorString << " Stream for computation on device nr. " + << DeviceIndex; + accellSafeCall(static_cast(hipSetDevice(DeviceIndex)), __FILE__, __LINE__, DeviceIndex); + accellSafeCall(static_cast(hipStreamCreate(&Soc)), __FILE__, __LINE__, DeviceIndex); +#else + (void)DeviceIndex; + static_assert(false, "Tried to call createContextOrStream, but neither building for CUDA nor HIP."); +#endif + return Soc; +} + +/// Destroy the context (CUDA) or stream (HIP) with cuCtxDestroy and hipStreamDestroy respectively. 
+/// \arg Soc The reference to the context or stream. +/// \returns The Error code returned from these calls. +inline auto destroyContextOrStream(StreamOrContext& Soc) -> CUResultOrHipErrorT { +#ifdef FIRESTARTER_BUILD_CUDA + return static_cast(cuCtxDestroy(Soc)); +#elif defined(FIRESTARTER_BUILD_HIP) + return static_cast(hipStreamDestroy(Soc)); +#else + (void)Soc; + static_assert(false, "Tried to call destroyContextOrStream, but neither building for CUDA nor HIP."); +#endif +} + +/// Create a blas handle. Wrapper to cublasCreate or hipblasCreate. +/// \arg BlasHandle The reference to a BlasHandle object which will be initialized. +/// \returns The Error code returned from these calls. +inline auto blasCreate(BlasHandle& BlasHandle) -> BlasStatusT { +#ifdef FIRESTARTER_BUILD_CUDA + return static_cast(cublasCreate(&BlasHandle)); +#elif defined(FIRESTARTER_BUILD_HIP) + return static_cast(hipblasCreate(&BlasHandle)); +#else + (void)BlasHandle; + static_assert(false, "Tried to call blasCreate, but neither building for CUDA nor HIP."); +#endif +} + +/// Destory a blas handle. Wrapper to cublasDestroy or hipblasDestroy. +/// \arg BlasHandle The reference to a BlasHandle object which will be destroyed. +/// \returns The Error code returned from these calls. +inline auto blasDestroy(BlasHandle& BlasHandle) -> BlasStatusT { +#ifdef FIRESTARTER_BUILD_CUDA + return static_cast(cublasDestroy(BlasHandle)); +#elif defined(FIRESTARTER_BUILD_HIP) + return static_cast(hipblasDestroy(BlasHandle)); +#else + (void)BlasHandle; + static_assert(false, "Tried to call blasDestroy, but neither building for CUDA nor HIP."); +#endif +} + +/// Get the properties of a specific GPU device. Wrapper to cudaGetDeviceProperties or hipGetDeviceProperties. +/// \arg Property The reference to the properties that are retrived. +/// \arg DeviceIndex The index of the GPU device for which to retrive the device properties.s +/// \returns The Error code returned from these calls. 
+inline auto getDeviceProperties(DeviceProperties& Property, int DeviceIndex) -> ErrorT { +#ifdef FIRESTARTER_BUILD_CUDA + return static_cast(cudaGetDeviceProperties(&Property, DeviceIndex)); +#elif defined(FIRESTARTER_BUILD_HIP) + return static_cast(hipGetDeviceProperties(&Property, DeviceIndex)); +#else + (void)Property; + (void)DeviceIndex; + static_assert(false, "Tried to call getDeviceProperties, but neither building for CUDA nor HIP."); +#endif +} + +/// Get the number of memory in the current CUDA or HIP context. Wrapper to cuMemGetInfo or +/// hipMemGetInfo. +/// \arg MemoryAvail The reference to the available memory that is retrived. +/// \arg MemoryTotal The reference to the total memory that is retrived. +/// \returns The Error code returned from these calls. +inline auto memGetInfo(std::size_t& MemoryAvail, std::size_t& MemoryTotal) -> CUResultOrHipErrorT { +#ifdef FIRESTARTER_BUILD_CUDA + return static_cast(cuMemGetInfo(&MemoryAvail, &MemoryTotal)); +#elif defined(FIRESTARTER_BUILD_HIP) + return static_cast(hipMemGetInfo(&MemoryAvail, &MemoryTotal)); +#else + (void)MemoryAvail; + (void)MemoryTotal; + static_assert(false, "Tried to call memGetInfo, but neither building for CUDA nor HIP."); +#endif +} + +/// Malloc device memory in the current CUDA or HIP context. Wrapper to cuMemAlloc or +/// hipMalloc. +/// \tparam FloatingPointType The type of the floating point used. Either float or double. +/// \arg Ptr The reference to the device pointer which is retrieved by the malloc call. +/// \arg MemorySize The memory that is allocated on the device in bytes. +/// \returns The Error code returned from these calls. 
+template +auto malloc(FloatingPointType** Ptr, std::size_t MemorySize) -> CUResultOrHipErrorT { +#ifdef FIRESTARTER_BUILD_CUDA + return static_cast(cuMemAlloc(reinterpret_cast(Ptr), MemorySize)); +#elif defined(FIRESTARTER_BUILD_HIP) + return static_cast(hipMalloc(Ptr, MemorySize)); +#else + (void)Ptr; + (void)MemorySize; + static_assert(false, "Tried to call malloc, but neither building for CUDA nor HIP."); +#endif +} + +/// Free device memory in the current CUDA or HIP context. Wrapper to cuMemFree or +/// hipFree. +/// \tparam FloatingPointType The type of the floating point used. Either float or double. +/// \arg Ptr The reference to the device pointer which is used in the free call. +/// \returns The Error code returned from these calls. +template auto free(FloatingPointType* Ptr) -> CUResultOrHipErrorT { +#ifdef FIRESTARTER_BUILD_CUDA + return static_cast(cuMemFree(reinterpret_cast(Ptr))); +#elif defined(FIRESTARTER_BUILD_HIP) + return static_cast(hipFree(Ptr)); +#else + (void)Ptr; + static_assert(false, "Tried to call free, but neither building for CUDA nor HIP."); +#endif +} + +/// Create a random generator in the current CUDA or HIP context. Wrapper to curandCreateGenerator or +/// hiprandCreateGenerator. +/// \arg RandomGen The reference to the random generation which is retrived by the calls. +/// \returns The Error code returned from these calls. +inline auto randCreateGeneratorPseudoRandom(RandGenerator& RandomGen) -> RandStatusT { +#ifdef FIRESTARTER_BUILD_CUDA + return static_cast(curandCreateGenerator(&RandomGen, CURAND_RNG_PSEUDO_DEFAULT)); +#elif defined(FIRESTARTER_BUILD_HIP) + return static_cast(hiprandCreateGenerator(&RandomGen, HIPRAND_RNG_PSEUDO_DEFAULT)); +#else + (void)RandomGen; + static_assert(false, "Tried to call randCreateGeneratorPseudoRandom, but neither building for CUDA nor HIP."); +#endif +} + +/// Set the pseudo random generator seed in the current CUDA or HIP context. 
Wrapper to +/// curandSetPseudoRandomGeneratorSeed or hiprandSetPseudoRandomGeneratorSeed. +/// \arg RandomGen The reference to the random generator. +/// \arg Seed The seed used to initialize the pseudo random generator. +/// \returns The Error code returned from these calls. +inline auto randSetPseudoRandomGeneratorSeed(RandGenerator& RandomGen, int Seed) -> RandStatusT { +#ifdef FIRESTARTER_BUILD_CUDA + return static_cast(curandSetPseudoRandomGeneratorSeed(RandomGen, Seed)); +#elif defined(FIRESTARTER_BUILD_HIP) + return static_cast(hiprandSetPseudoRandomGeneratorSeed(RandomGen, Seed)); +#else + (void)RandomGen; + (void)Seed; + static_assert(false, "Tried to call randSetPseudoRandomGeneratorSeed, but neither building for CUDA nor HIP."); +#endif +} + +/// Initialize the provided memory with with a specific number of uniform random floats. Wrapper to +/// curandGenerateUniform or hiprandGenerateUniform. +/// \arg RandomGen The reference to the random generator. +/// \arg OutputPtr The device pointer on which is initialized with specific number of uniform random floats. +/// \arg Num The number of unifrom random floats. +/// \returns The Error code returned from these calls. +inline auto randGenerateUniform(RandGenerator& RandomGen, float* OutputPtr, std::size_t Num) -> RandStatusT { +#ifdef FIRESTARTER_BUILD_CUDA + return static_cast(curandGenerateUniform(RandomGen, OutputPtr, Num)); +#elif defined(FIRESTARTER_BUILD_HIP) + return static_cast(hiprandGenerateUniform(RandomGen, OutputPtr, Num)); +#else + (void)RandomGen; + (void)OutputPtr; + (void)Num; + static_assert(false, "Tried to call randGenerateUniform, but neither building for CUDA nor HIP."); +#endif +} + +/// Initialize the provided memory with with a specific number of uniform random doubles. Wrapper to +/// curandGenerateUniformDouble or hiprandGenerateUniformDouble. +/// \arg RandomGen The reference to the random generator. 
+/// \arg OutputPtr The device pointer on which is initialized with specific number of uniform random floats. +/// \arg Num The number of unifrom random doubles. +/// \returns The Error code returned from these calls. +inline auto randGenerateUniformDouble(RandGenerator& RandomGen, double* OutputPtr, std::size_t Num) -> RandStatusT { +#ifdef FIRESTARTER_BUILD_CUDA + return static_cast(curandGenerateUniformDouble(RandomGen, OutputPtr, Num)); +#elif defined(FIRESTARTER_BUILD_HIP) + return static_cast(hiprandGenerateUniformDouble(RandomGen, OutputPtr, Num)); +#else + (void)RandomGen; + (void)OutputPtr; + (void)Num; + static_assert(false, "Tried to call randGenerateUniformDouble, but neither building for CUDA nor HIP."); +#endif +} + +/// Initialize the provided memory with with a specific number of uniform random floating points. Wrapper to +/// randGenerateUniform or randGenerateUniformDouble. +/// \tparam FloatPointType The float point types is used. Either float or double. +/// \arg Generator The reference to the random generator. +/// \arg OutputPtr The device pointer on which is initialized with specific number of uniform random floats. +/// \arg Num The number of unifrom random doubles. +/// \returns The Error code returned from these calls. +template +auto generateUniform(RandGenerator& Generator, FloatPointType* OutputPtr, size_t Num) -> RandStatusT { + if constexpr (std::is_same_v) { + return randGenerateUniform(Generator, OutputPtr, Num); + } else if constexpr (std::is_same_v) { + return randGenerateUniformDouble(Generator, OutputPtr, Num); + } else { + assert(false && "generateUniform: Template argument must be either float or double"); + } +} + +/// Destory a random generator in the current CUDA or HIP context. Wrapper to curandDestroyGenerator or +/// hiprandDestroyGenerator. +/// \arg RandomGen The reference to the random generation which shoule be destroyed. +/// \returns The Error code returned from these calls. 
+inline auto randDestroyGenerator(RandGenerator& RandomGen) -> RandStatusT { +#ifdef FIRESTARTER_BUILD_CUDA + return static_cast(curandDestroyGenerator(RandomGen)); +#elif defined(FIRESTARTER_BUILD_HIP) + return static_cast(hiprandDestroyGenerator(RandomGen)); +#else + (void)RandomGen; + static_assert(false, "Tried to call randDestroyGenerator, but neither building for CUDA nor HIP."); +#endif +} + +/// Copy memory from a device pointer to another device pointer. Wrapper to cuMemcpyDtoD or hipMemcpyDtoD. +/// \arg DestinationPtr The destination address. +/// \arg SourcePtr The source address. +/// \arg Size The number of bytes to copy. +/// \returns The Error code returned from these calls. +template +auto memcpyDtoD(FloatPointType* DestinationPtr, FloatPointType* SourcePtr, std::size_t Size) -> CUResultOrHipErrorT { +#ifdef FIRESTARTER_BUILD_CUDA + return static_cast( + cuMemcpyDtoD(reinterpret_cast(DestinationPtr), reinterpret_cast(SourcePtr), Size)); +#elif defined(FIRESTARTER_BUILD_HIP) + return static_cast(hipMemcpyDtoD(DestinationPtr, SourcePtr, Size)); +#else + (void)DestinationPtr; + (void)SourcePtr; + (void)Size; + static_assert(false, "Tried to call memcpyDtoD, but neither building for CUDA nor HIP."); +#endif +} + +/// Block until the current device finished. Wrapper to cudaDeviceSynchronize or hipcudaDeviceSynchronize. +/// \returns The Error code returned from these calls. +inline auto deviceSynchronize() -> ErrorT { +#ifdef FIRESTARTER_BUILD_CUDA + return static_cast(cudaDeviceSynchronize()); +#elif defined(FIRESTARTER_BUILD_HIP) + return static_cast(hipDeviceSynchronize()); +#else + static_assert(false, "Tried to call deviceSynchronize, but neither building for CUDA nor HIP."); +#endif +} + +/// This function performs the matrix-matrix multiplication C = Alpha * op(A) * op(B) + Beta * C with op(A) and op(B) +/// described by the selected operation for Transa and Transb. 
BlasOperation::BLAS_OP_N maps to op(X) = X, +/// BlasOperation::BLAS_OP_T to op(X) = X transposed and BlasOperation::BLAS_OP_C to op(X) = conjugate transpose of X. +/// It wrapps (cu|hip)blas(S|D)gemm. +/// \tparam FloatPointType The float point types is used. Either float or double. +/// \arg Handle The blass handle +/// \arg Transa The operation selected for op(A) +/// \arg Transb The operation selected for op(B) +/// \arg M Number of rows of matrix op(A) and C. +/// \arg N Number of columns of matrix op(B) and C. +/// \arg K Number of columns of op(A) and rows of op(B). +/// \arg Alpha +/// \arg A +/// \arg Lda Leading dimension of two-dimensional array used to store the matrix A. +/// \arg B +/// \arg Ldb Leading dimension of two-dimensional array used to store matrix B. +/// \arg Beta +/// \arg C +/// \arg Ldc Leading dimension of a two-dimensional array used to store the matrix C. +/// \returns The Error code returned from these calls. +template +auto gemm(BlasHandle Handle, BlasOperation Transa, BlasOperation Transb, int M, int N, int K, + const FloatPointType& Alpha, const FloatPointType* A, int Lda, const FloatPointType* B, int Ldb, + const FloatPointType& Beta, FloatPointType* C, int Ldc) -> BlasStatusT { + if constexpr (std::is_same_v) { +#ifdef FIRESTARTER_BUILD_CUDA + return static_cast(cublasSgemm(Handle, static_cast(Transa), + static_cast(Transb), M, N, K, &Alpha, A, Lda, B, Ldb, + &Beta, C, Ldc)); +#elif defined(FIRESTARTER_BUILD_HIP) + return static_cast(hipblasSgemm(Handle, static_cast(Transa), + static_cast(Transb), M, N, K, &Alpha, A, Lda, B, Ldb, + &Beta, C, Ldc)); +#endif + } else if constexpr (std::is_same_v) { +#ifdef FIRESTARTER_BUILD_CUDA + return static_cast(cublasDgemm(Handle, static_cast(Transa), + static_cast(Transb), M, N, K, &Alpha, A, Lda, B, Ldb, + &Beta, C, Ldc)); +#elif defined(FIRESTARTER_BUILD_HIP) + return static_cast(hipblasDgemm(Handle, static_cast(Transa), + static_cast(Transb), M, N, K, &Alpha, A, Lda, B, Ldb, + &Beta, C, 
Ldc)); +#endif + } else { + (void)Handle; + (void)Transa; + (void)Transb; + (void)M; + (void)N; + (void)K; + (void)Alpha; + (void)A; + (void)Lda; + (void)B; + (void)Ldb; + (void)Beta; + (void)C; + (void)Ldc; + assert(false && "gemm: Template argument must be either float or double"); + } + +#if not(defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_HIP)) + (void)Handle; + (void)Transa; + (void)Transb; + (void)M; + (void)N; + (void)K; + (void)Alpha; + (void)A; + (void)Lda; + (void)B; + (void)Ldb; + (void)Beta; + (void)C; + (void)Ldc; + static_assert(false, "Tried to call gemm, but neither building for CUDA nor HIP."); +#endif +} + +} // namespace firestarter::cuda::compat \ No newline at end of file diff --git a/include/firestarter/DumpRegisterStruct.hpp b/include/firestarter/DumpRegisterStruct.hpp index 7e80c111..63d4695e 100644 --- a/include/firestarter/DumpRegisterStruct.hpp +++ b/include/firestarter/DumpRegisterStruct.hpp @@ -21,22 +21,41 @@ #pragma once +#include "firestarter/Constants.hpp" + +#include + namespace firestarter { /* DO NOT CHANGE! the asm load-loop tests if it should dump the current register * content */ -enum DumpVariable : unsigned long long { Start = 0, Wait = 1 }; +// NOLINTBEGIN(performance-enum-size) +/// This struct defines the variable the is used to control when the registers should be dumped. +enum class DumpVariable : EightBytesType { + /// Start saving register to memory + Start = 0, + /// When done when change it to the Wait state. There we do nothing. + Wait = 1 +}; +// NOLINTEND(performance-enum-size) -#define REGISTER_MAX_NUM 32 +// The maximal number of SIMD registers. This is currently 32 for zmm registers. +constexpr const auto RegisterMaxNum = 32; +/// The maximal number of doubles in SIMD registers. This is currently 8 for zmm registers. +constexpr const auto RegisterMaxSize = 8; +/// The maximum number of doubles in SIMD registers multiplied with the maximum number of vector registers. 
+constexpr const auto MaxNumberOfDoublesInVectorRegisters = RegisterMaxNum * RegisterMaxSize; +/// This struct is used to do the communication between the high-load loop and the part of the program that saves the +/// dumped registers to a file. struct DumpRegisterStruct { - // REGISTER_MAX_NUM cachelines - volatile double registerValues[REGISTER_MAX_NUM * 8]; - // pad to use a whole cacheline - volatile unsigned long long padding[7]; - volatile DumpVariable dumpVar; + /// This array will contain the dumped registers. It has the size of 32 Cachelines. (8B doubles * 8 double in a + /// register * 32 registers) + std::array RegisterValues; + /// Pad the DumpVar to use a whole cacheline + std::array Padding; + /// The variable that controls the execution of the dump register code in the high-load routine. + volatile DumpVariable DumpVar; }; -#undef REGISTER_MAX_NUM - } // namespace firestarter diff --git a/include/firestarter/DumpRegisterWorkerData.hpp b/include/firestarter/DumpRegisterWorkerData.hpp index f7b721d4..e0bf01d4 100644 --- a/include/firestarter/DumpRegisterWorkerData.hpp +++ b/include/firestarter/DumpRegisterWorkerData.hpp @@ -21,42 +21,53 @@ #pragma once -#include -#include +#include "firestarter/LoadWorkerData.hpp" +#include "firestarter/Logging/Log.hpp" +#include "firestarter/WindowsCompat.hpp" // IWYU pragma: keep #include - -#ifdef FIRESTARTER_DEBUG_FEATURES +#include namespace firestarter { +/// This class holds the data that is required for the worker thread that dumps the register contents to a file. class DumpRegisterWorkerData { public: - DumpRegisterWorkerData(std::shared_ptr loadWorkerData, - std::chrono::seconds dumpTimeDelta, - std::string dumpFilePath) - : loadWorkerData(loadWorkerData), dumpTimeDelta(dumpTimeDelta) { - - if (dumpFilePath.empty()) { - char cwd[PATH_MAX]; - if (getcwd(cwd, sizeof(cwd)) != NULL) { - this->dumpFilePath = cwd; + DumpRegisterWorkerData() = delete; + + /// Initialize the DumpRegisterWorkerData. 
+ /// \arg LoadWorkerDataPtr The shared pointer to the data of the thread were registers should be dummped. We need it + /// to access the memory to which the registers are dumped as well as getting the size and count of registers. + /// \arg DumpTimeDelta Every this number of seconds the register content will be dumped. + /// \arg DumpFilePath The folder that is used to dump registers to. If the string is empty the current directory will + /// be choosen. If it cannot be determined /tmp is used. In this directory a file called hamming_distance.csv will be + /// created. + DumpRegisterWorkerData(std::shared_ptr LoadWorkerDataPtr, std::chrono::seconds DumpTimeDelta, + const std::string& DumpFilePath) + : LoadWorkerDataPtr(std::move(LoadWorkerDataPtr)) + , DumpTimeDelta(DumpTimeDelta) { + if (DumpFilePath.empty()) { + char* Pwd = get_current_dir_name(); + if (Pwd) { + this->DumpFilePath = Pwd; } else { log::error() << "getcwd() failed. Set --dump-registers-outpath to /tmp"; - this->dumpFilePath = "/tmp"; + this->DumpFilePath = "/tmp"; } } else { - this->dumpFilePath = dumpFilePath; + this->DumpFilePath = DumpFilePath; } } - ~DumpRegisterWorkerData() {} + ~DumpRegisterWorkerData() = default; - std::shared_ptr loadWorkerData; - const std::chrono::seconds dumpTimeDelta; - std::string dumpFilePath; + /// The shared pointer to the data of the thread were registers should be dummped. We need it to access the memory to + /// which the registers are dumped as well as getting the size and count of registers. + std::shared_ptr LoadWorkerDataPtr; + /// Every this number of seconds the register content will be dumped. + const std::chrono::seconds DumpTimeDelta; + /// The folder in which the hamming_distance.csv file will be created. 
+ std::string DumpFilePath; }; -} // namespace firestarter - -#endif +} // namespace firestarter \ No newline at end of file diff --git a/include/firestarter/Environment/CPUTopology.hpp b/include/firestarter/Environment/CPUTopology.hpp index dcb61e96..bf9a8d19 100644 --- a/include/firestarter/Environment/CPUTopology.hpp +++ b/include/firestarter/Environment/CPUTopology.hpp @@ -21,7 +21,9 @@ #pragma once +#include #include +#include #include #include #include @@ -32,54 +34,93 @@ extern "C" { namespace firestarter::environment { +/// This class models the properties of a processor. class CPUTopology { public: - CPUTopology(std::string architecture); + explicit CPUTopology(std::string Architecture); virtual ~CPUTopology(); - unsigned numThreads() const { - return _numThreadsPerCore * _numCoresTotal; - } - unsigned maxNumThreads() const; - unsigned numThreadsPerCore() const { return _numThreadsPerCore; } - unsigned numCoresTotal() const { return _numCoresTotal; } - unsigned numPackages() const { return _numPackages; } + friend auto operator<<(std::ostream& Stream, CPUTopology const& CpuTopologyRef) -> std::ostream&; - std::string const &architecture() const { return _architecture; } - virtual std::string const &vendor() const { return _vendor; } - virtual std::string const &processorName() const { return _processorName; } - virtual std::string const &model() const = 0; + /// The total number of hardware threads. + [[nodiscard]] auto numThreads() const -> unsigned { return NumThreadsPerCore * NumCoresTotal; } + /// The maximum os_index of all PUs plus 1 if we cannot determine the number of cpu kinds. Otherwise the maximum + /// number of PUs. + [[nodiscard]] auto maxNumThreads() const -> unsigned; + /// Assuming we have a consistent number of threads per core. The number of thread per core. + [[nodiscard]] auto numThreadsPerCore() const -> unsigned { return NumThreadsPerCore; } + /// The total number of cores. 
+ [[nodiscard]] auto numCoresTotal() const -> unsigned { return NumCoresTotal; } + /// The total number of packages. + [[nodiscard]] auto numPackages() const -> unsigned { return NumPackages; } + /// The CPU architecture e.g., x86_64 + [[nodiscard]] auto architecture() const -> std::string const& { return Architecture; } + /// The CPU vendor i.e., Intel or AMD. + [[nodiscard]] virtual auto vendor() const -> std::string const& { return Vendor; } + /// The processor name, this includes the vendor specific name + [[nodiscard]] virtual auto processorName() const -> std::string const& { return ProcessorName; } + /// The model of the processor. With X86 this is the the string of Family, Model and Stepping. + [[nodiscard]] virtual auto model() const -> std::string const& = 0; - // get the size of the L1i-cache in bytes - unsigned instructionCacheSize() const { return _instructionCacheSize; } + /// Getter for the L1i-cache size in bytes + [[nodiscard]] auto instructionCacheSize() const -> const auto& { return InstructionCacheSize; } - // return the cpu clockrate in Hz - virtual unsigned long long clockrate() const { return _clockrate; } - // return the cpu features - virtual std::list const &features() const = 0; + /// Getter for the clockrate in Hz + [[nodiscard]] virtual auto clockrate() const -> uint64_t { return Clockrate; } - // get a timestamp - virtual unsigned long long timestamp() const = 0; + /// Getter for the list of CPU features + [[nodiscard]] virtual auto features() const -> std::list const& = 0; - int getPkgIdFromPU(unsigned pu) const; - int getCoreIdFromPU(unsigned pu) const; + /// Get the current hardware timestamp + [[nodiscard]] virtual auto timestamp() const -> uint64_t = 0; + + /// Get the logical index of the core that housed the PU which is described by the os index. + /// \arg Pu The os index of the thread. + /// \returns Optionally the logical index of the CPU that houses this hardware thread. 
+ [[nodiscard]] auto getCoreIdFromPU(unsigned Pu) const -> std::optional; + + /// Get the logical index of the package that housed the PU which is described by the os index. + /// \arg Pu The os index of the thread. + /// \returns Optionally the logical index of the package that houses this hardware thread. + [[nodiscard]] auto getPkgIdFromPU(unsigned Pu) const -> std::optional; protected: - std::string scalingGovernor() const; - std::ostream &print(std::ostream &stream) const; + /// Read the scaling_govenor file of cpu0 on linux and return the contents as a string. + [[nodiscard]] static auto scalingGovernor() -> std::string; + + /// Print the information about this process to a stream. + [[nodiscard]] auto print(std::ostream& Stream) const -> std::ostream&; private: - static std::stringstream getFileAsStream(std::string const &filePath); - - unsigned _numThreadsPerCore; - unsigned _numCoresTotal; - unsigned _numPackages; - std::string _architecture; - std::string _vendor = ""; - std::string _processorName = ""; - unsigned _instructionCacheSize = 0; - unsigned long long _clockrate = 0; - hwloc_topology_t topology; + /// The CPU vendor i.e., Intel or AMD. + std::string Vendor; + + /// Helper function to open a filepath and return a stringstream with its contents. + /// \arg FilePath The file to open + /// \returns A stringstream with the contents of the file. + [[nodiscard]] static auto getFileAsStream(std::string const& FilePath) -> std::stringstream; + + /// Assuming we have a consistent number of threads per core. The number of thread per core. + unsigned NumThreadsPerCore; + /// The total number of cores. + unsigned NumCoresTotal; + /// The total number of packages. + unsigned NumPackages; + + /// The CPU architecture e.g., x86_64 + std::string Architecture; + /// The processor name, this includes the vendor specific name + std::string ProcessorName; + /// The optional size of the instruction cache per core. 
+ std::optional InstructionCacheSize; + /// Clockrate of the CPU in Hz + uint64_t Clockrate = 0; + /// The hwloc topology that is used to query information about the processor. + hwloc_topology_t Topology{}; }; +inline auto operator<<(std::ostream& Stream, CPUTopology const& CpuTopologyRef) -> std::ostream& { + return CpuTopologyRef.print(Stream); +} + } // namespace firestarter::environment diff --git a/include/firestarter/Environment/Environment.hpp b/include/firestarter/Environment/Environment.hpp index c76dc073..41446bde 100644 --- a/include/firestarter/Environment/Environment.hpp +++ b/include/firestarter/Environment/Environment.hpp @@ -21,74 +21,129 @@ #pragma once -#include -#include -#include +#include "firestarter/Environment/CPUTopology.hpp" +#include "firestarter/Environment/Platform/PlatformConfig.hpp" #include +#include +#include #include namespace firestarter::environment { +/// This class handles parsing of user input to FIRESTARTER, namely the number of threads used, the thread affinity, the +/// selection of the correct high-load function, selection of the instruction groups and number of lines. It also +/// handles printing useful information, provides interfaces to the PlatformConfig and the number of threads. It +/// facilitates setting the cpu affinity in further parts of FIRESTARTER. class Environment { public: - Environment(CPUTopology *topology) : _topology(topology) {} - ~Environment() { - delete this->_topology; - if (_selectedConfig != nullptr) { - delete _selectedConfig; - } - } + Environment() = delete; + explicit Environment(std::unique_ptr&& Topology) + : Topology(std::move(Topology)) {} + virtual ~Environment() = default; + + /// Parse the user input for the cpu affinity and the number of requested threads. If a CpuBind is provided we + /// evaluate it and set the number of threads and their affinity accordingly. This is only supported on linux and with + /// the FIRESTARTER_THREAD_AFFINITY build flag. 
This function will save the correct number of threads based on the + /// user input in RequestedNumThreads. It must be called for FIRESTARTER to function properly. + /// \arg RequestedNumThreads The number of threads that are requested by a user. If this is zero the number will be + /// automatically determined. + /// \arg CpuBind A string following the CPULIST format: "x,y,z", "x-y", "x-y/step", and any combination of the + /// above. We select the number of requested CPUs and their cpubind from this string. + void evaluateCpuAffinity(unsigned RequestedNumThreads, const std::string& CpuBind); - int evaluateCpuAffinity(unsigned requestedNumThreads, std::string cpuBind); - int setCpuAffinity(unsigned thread); + /// The worker threads are enumerated from zero to RequestedNumThreads. Set the cpuaffinity of a calling thread based + /// on this index to the one that should be used according to the determined CpuBind list from the call to + /// evaluateCpuAffinity. This function will throw if it is called with an invalid index. + /// \arg Thread The index of the worker thread. + void setCpuAffinity(unsigned Thread) const; + + /// Print the summary of the used thread for the workers. If thread affinity is supported (linux and compiled with the + /// FIRESTARTER_THREAD_AFFINITY flag), print which thread is pinned to which CPU. void printThreadSummary(); - virtual void evaluateFunctions() = 0; - virtual int selectFunction(unsigned functionId, - bool allowUnavailablePayload) = 0; - virtual int selectInstructionGroups(std::string groups) = 0; + /// Select a PlatformConfig based on its generated id. This function will throw if a payload is not available or the + /// id is incorrect. If id is zero we automatically select a matching PlatformConfig. + /// \arg FunctionId The id of the PlatformConfig that should be selected. + /// \arg AllowUnavailablePayload If true we will not throw if the PlatformConfig is not available.
+ virtual void selectFunction(unsigned FunctionId, bool AllowUnavailablePayload) = 0; + + /// Parse the selected payload instruction groups and save them in the selected function. Throws if the input is + /// invalid. + /// \arg Groups The list of instruction groups that is in the format: multiple INSTRUCTION:VALUE pairs + /// comma-separated. + virtual void selectInstructionGroups(std::string Groups) = 0; + + /// Print the available instruction groups of the selected function. virtual void printAvailableInstructionGroups() = 0; - virtual void setLineCount(unsigned lineCount) = 0; + + /// Set the line count in the selected function. + /// \arg LineCount The maximum number of instructions that should be in the high-load loop. + virtual void setLineCount(unsigned LineCount) = 0; + + /// Print a summary of the settings of the selected config. virtual void printSelectedCodePathSummary() = 0; + + /// Print a list of high-load functions and whether they are available on the current system. virtual void printFunctionSummary() = 0; - platform::RuntimeConfig &selectedConfig() const { -#if defined(__clang__) -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wunused-value" -#endif -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-value" - assert(("No RuntimeConfig selected", _selectedConfig != nullptr)); -#pragma GCC diagnostic pop -#if defined(__clang__) -#pragma clang diagnostic pop -#endif - return *_selectedConfig; + /// Get the number of threads FIRESTARTER will run with. + [[nodiscard]] auto requestedNumThreads() const -> uint64_t { return RequestedNumThreads; } + + /// Getter (which allows modifying) for the current platform config containing the payload, settings and the + /// associated name.
+ [[nodiscard]] virtual auto config() -> platform::PlatformConfig& { + assert(Config && "No PlatformConfig selected"); + return *Config; } - unsigned long long requestedNumThreads() const { - return _requestedNumThreads; + /// Const getter for the current platform config containing the payload, settings and the associated name. + [[nodiscard]] virtual auto config() const -> const platform::PlatformConfig& { + assert(Config && "No PlatformConfig selected"); + return *Config; } - CPUTopology const &topology() const { - assert(_topology != nullptr); - return *_topology; + /// Const getter for the current CPU topology. + [[nodiscard]] virtual auto topology() const -> const CPUTopology& { + assert(Topology && "Topology is a nullptr"); + return *Topology; } protected: - platform::RuntimeConfig *_selectedConfig = nullptr; - CPUTopology *_topology = nullptr; + /// This function sets the selected config, taking ownership of the supplied pointer. + void setConfig(std::unique_ptr&& Config) { this->Config = std::move(Config); } private: - unsigned long long _requestedNumThreads; + /// The selected config that contains the payload, settings and the associated name. + std::unique_ptr Config; + /// The description of the current CPU. + std::unique_ptr Topology; + + /// The number of threads FIRESTARTER is requested to run with. This will initially be set to zero, which will be + /// replaced by the maximum number of threads after calling evaluateCpuAffinity. + uint64_t RequestedNumThreads = 0; - // TODO: replace these functions with the builtins one from hwloc - int cpuAllowed(unsigned id); - int cpuSet(unsigned id); + // TODO(Issue #74): Use hwloc for cpu thread affinity. +#if (defined(linux) || defined(__linux__)) && defined(FIRESTARTER_THREAD_AFFINITY) + /// Check if the Cpu is allowed to be used with the current program. + /// \arg Id The id of the CPU which is checked. + /// \returns true if the CPU with Id is allowed to be used by the program.
+ static auto cpuAllowed(unsigned Id) -> bool; - std::vector cpuBind; + /// Set the cpu affinity of the current thread to a specific CPU. + /// \arg Id The id of the CPU to which to pin the calling thread. + /// \returns 0 on success. See the man page for. sched_setaffinity. + static auto cpuSet(unsigned Id) -> int; + + /// Add a CPU to mask if this CPU is available on the current system or throw with an error. + /// \arg Cpu The id of the CPU to add to the mask. + /// \arg Mask The reference to the mask to add the cpu to. + void addCpuSet(unsigned Cpu, cpu_set_t& Mask) const; + + /// The list of physical CPU ids that are requested to be used. The length of this list should match the number of + /// requested threads if it is not zero. + std::vector CpuBind; +#endif }; } // namespace firestarter::environment diff --git a/include/firestarter/Environment/Payload/CompiledPayload.hpp b/include/firestarter/Environment/Payload/CompiledPayload.hpp new file mode 100644 index 00000000..488c6c8d --- /dev/null +++ b/include/firestarter/Environment/Payload/CompiledPayload.hpp @@ -0,0 +1,101 @@ +/****************************************************************************** + * FIRESTARTER - A Processor Stress Test Utility + * Copyright (C) 2024 TU Dresden, Center for Information Services and High + * Performance Computing + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ * + * Contact: daniel.hackenberg@tu-dresden.de + *****************************************************************************/ + +#pragma once + +#include "firestarter/Constants.hpp" +#include "firestarter/Environment/Payload/PayloadStats.hpp" + +#include +#include +#include + +namespace firestarter::environment::payload { + +class Payload; + +/// This class represents a payload that can be executed. It is created by calling compilePayload of the payload class +/// with specific settings. It contains a reference to the init and low load functions (which do not change with payload +/// settings) and the high load function which changes based on the settings. The stats of the high load function (nb. +/// of flops, bytes of memory accessed and instructions) can also be retrieved. +class CompiledPayload { +public: + CompiledPayload() = delete; + virtual ~CompiledPayload() = default; + + /// A unique ptr for the CompiledPayload with a custom deleter. + using UniquePtr = std::unique_ptr; + + using HighLoadFunctionPtr = uint64_t (*)(double*, volatile LoadThreadWorkType*, uint64_t); + + /// Getter for the stats of the high load function of the compiled payload + [[nodiscard]] auto stats() const -> const PayloadStats& { return Stats; }; + + /// Function to initialize the memory used by the high load function. + /// \arg MemoryAddr The pointer to the memory. + /// \arg BufferSize The number of doubles that is allocated in MemoryAddr. + void init(double* MemoryAddr, uint64_t BufferSize); + + /// Function to produce a low load on the cpu. + /// \arg LoadVar The variable that controls the load. If this variable changes from LoadThreadWorkType::LowLoad to + /// something else this function will return. + /// \arg Period The period of the low/high load switching. This function may sleep a fraction of this period. + void lowLoadFunction(volatile LoadThreadWorkType& LoadVar, std::chrono::microseconds Period); + + /// Function to produce high load on the cpu. 
+ /// \arg MemoryAddr The pointer to the memory. + /// \arg LoadVar The variable that controls the load. If this variable changes from LoadThreadWorkType::LoadHigh to + /// something else this function will return. + /// \arg Iterations The current iteration counter. This number will be incremented for every iteration of the high + /// load loop. + /// \returns The iteration counter passed into this function plus the number of iteration of the high load loop. + [[nodiscard]] auto highLoadFunction(double* MemoryAddr, volatile LoadThreadWorkType& LoadVar, uint64_t Iterations) + -> uint64_t { + return HighLoadFunction(MemoryAddr, &LoadVar, Iterations); + } + +protected: + /// Constructor for the CompiledPayload. + /// \arg Stats The stats of the high load function from the payload. + /// \arg PayloadPtr A unique pointer to the payload class to allow calling the init and low load functions which do + /// not change based on different payload settings. + /// \arg HighLoadFunction The pointer to the compiled high load function. + CompiledPayload(const PayloadStats& Stats, std::unique_ptr&& PayloadPtr, + HighLoadFunctionPtr HighLoadFunction) + : Stats(Stats) + , PayloadPtr(std::move(PayloadPtr)) + , HighLoadFunction(HighLoadFunction) {} + + /// Getter for the pointer to the high load function. We need to access this pointer directly to free the associated + /// memory from asmjit. + [[nodiscard]] auto highLoadFunctionPtr() -> HighLoadFunctionPtr { return HighLoadFunction; } + +private: + /// The stats of the compiled payload. + PayloadStats Stats; + /// The pointer to the payload class to allow calling the init and low load functions which do not change based on + /// different payload settings. + std::unique_ptr PayloadPtr; + /// The pointer to the compiled high load function. 
+ HighLoadFunctionPtr HighLoadFunction; +}; + +} // namespace firestarter::environment::payload \ No newline at end of file diff --git a/include/firestarter/Environment/Payload/Payload.hpp b/include/firestarter/Environment/Payload/Payload.hpp index 40246ac0..b5b17199 100644 --- a/include/firestarter/Environment/Payload/Payload.hpp +++ b/include/firestarter/Environment/Payload/Payload.hpp @@ -21,93 +21,86 @@ #pragma once -#include +#include "firestarter/Constants.hpp" +#include "firestarter/Environment/CPUTopology.hpp" +#include "firestarter/Environment/Payload/CompiledPayload.hpp" +#include "firestarter/Environment/Payload/PayloadSettings.hpp" + +#include #include #include -#include +#include namespace firestarter::environment::payload { class Payload { private: - std::string _name; - unsigned getSequenceStartCount(const std::vector &sequence, - const std::string start); + /// The name of this payload. It is usally named by the CPU extension this payload uses e.g., SSE2 or FMA. + std::string Name; + + /// The size of the SIMD registers in units of doubles (8B) + unsigned RegisterSize = 0; + + /// The number of SIMD registers used by the payload + unsigned RegisterCount = 0; protected: - unsigned _flops; - unsigned _bytes; - // number of instructions in load loop - unsigned _instructions; - // size of used simd registers in bytes - unsigned _registerSize; - // number of used simd registers - unsigned _registerCount; - - std::vector generateSequence( - const std::vector> &proportion); - unsigned getL2SequenceCount(const std::vector &sequence) { - return getSequenceStartCount(sequence, "L2"); - }; - unsigned getL3SequenceCount(const std::vector &sequence) { - return getSequenceStartCount(sequence, "L3"); - }; - unsigned getRAMSequenceCount(const std::vector &sequence) { - return getSequenceStartCount(sequence, "RAM"); - }; - - unsigned - getNumberOfSequenceRepetitions(const std::vector &sequence, - const unsigned numberOfLines) { - if (sequence.size() == 0) { - return 
0; - } - return numberOfLines / sequence.size(); - }; - - unsigned getL2LoopCount(const std::vector &sequence, - const unsigned numberOfLines, const unsigned size, - const unsigned threads); - unsigned getL3LoopCount(const std::vector &sequence, - const unsigned numberOfLines, const unsigned size, - const unsigned threads); - unsigned getRAMLoopCount(const std::vector &sequence, - const unsigned numberOfLines, const unsigned size, - const unsigned threads); + /// Function to initialize the memory used by the high load function. + /// \arg MemoryAddr The pointer to the memory. + /// \arg BufferSize The number of doubles that is allocated in MemoryAddr. + virtual void init(double* MemoryAddr, uint64_t BufferSize) const = 0; + + /// Function to produce a low load on the cpu. + /// \arg LoadVar The variable that controls the load. If this variable changes from LoadThreadWorkType::LowLoad to + /// something else this function will return. + /// \arg Period The period of the low/high load switching. This function may sleep a fraction of this period. 
+ virtual void lowLoadFunction(volatile LoadThreadWorkType& LoadVar, std::chrono::microseconds Period) const = 0; public: - Payload(std::string name, unsigned registerSize, unsigned registerCount) - : _name(name), _registerSize(registerSize), - _registerCount(registerCount) {} - virtual ~Payload() {} - - const std::string &name() const { return _name; } - unsigned flops() const { return _flops; } - unsigned bytes() const { return _bytes; } - unsigned instructions() const { return _instructions; } - unsigned registerSize() const { return _registerSize; } - unsigned registerCount() const { return _registerCount; } - - virtual bool isAvailable() const = 0; - - virtual void lowLoadFunction(volatile unsigned long long *addrHigh, - unsigned long long period) = 0; - - virtual int compilePayload( - std::vector> const &proportion, - unsigned instructionCacheSize, - std::list const &dataCacheBufferSize, unsigned ramBufferSize, - unsigned thread, unsigned numberOfLines, bool dumpRegisters, - bool errorDetection) = 0; - virtual std::list getAvailableInstructions() const = 0; - virtual void init(unsigned long long *memoryAddr, - unsigned long long bufferSize) = 0; - virtual unsigned long long - highLoadFunction(unsigned long long *addrMem, - volatile unsigned long long *addrHigh, - unsigned long long iterations) = 0; - - virtual Payload *clone() const = 0; + Payload() = delete; + + /// Abstract construction for the payload. + /// \arg Name The name of this payload. It is usally named by the CPU extension this payload uses e.g., SSE2 or FMA. + /// \arg RegisterSize The size of the SIMD registers in units of doubles (8B). + /// \arg RegisterCount The number of SIMD registers used by the payload. 
+ Payload(std::string Name, unsigned RegisterSize, unsigned RegisterCount) noexcept + : Name(std::move(Name)) + , RegisterSize(RegisterSize) + , RegisterCount(RegisterCount) {} + virtual ~Payload() = default; + + // Allow init and lowLoadFunction functions to be accessed by the CompiledPayload class. + friend void CompiledPayload::init(double* MemoryAddr, uint64_t BufferSize); + friend void CompiledPayload::lowLoadFunction(volatile LoadThreadWorkType& LoadVar, std::chrono::microseconds Period); + + /// Get the name of this payload. It is usually named by the CPU extension this payload uses e.g., SSE2 or FMA. + [[nodiscard]] auto name() const -> const std::string& { return Name; } + + /// The size of the SIMD registers in units of doubles (8B) + [[nodiscard]] auto registerSize() const -> unsigned { return RegisterSize; } + + /// The number of SIMD registers used by the payload + [[nodiscard]] auto registerCount() const -> unsigned { return RegisterCount; } + + /// Check if this payload is available on the current system. This usually means that the cpu extensions are + /// available. + /// \arg Topology The CPUTopology that is used to check against whether this payload is supported. + /// \returns true if the payload is supported on the given CPUTopology. + [[nodiscard]] virtual auto isAvailable(const CPUTopology& Topology) const -> bool = 0; + + /// Compile this payload with supplied settings and optional features. + /// \arg Settings The settings for this payload e.g., the number of lines or the size of the caches. + /// \arg DumpRegisters Should the code to support dumping registers be baked into the high load routine of the + /// compiled payload. + /// \arg ErrorDetection Should the code to support error detection between thread be baked into the high load routine + /// of the compiled payload. + /// \returns The compiled payload that provides access to the init and load functions.
+ [[nodiscard]] virtual auto compilePayload(const PayloadSettings& Settings, bool DumpRegisters, + bool ErrorDetection) const -> CompiledPayload::UniquePtr = 0; + + /// Get the available instruction items that are supported by this payload. + /// \returns The available instruction items that are supported by this payload. + [[nodiscard]] virtual auto getAvailableInstructions() const -> std::list = 0; }; } // namespace firestarter::environment::payload diff --git a/include/firestarter/Environment/Payload/PayloadSettings.hpp b/include/firestarter/Environment/Payload/PayloadSettings.hpp new file mode 100644 index 00000000..8438e9a6 --- /dev/null +++ b/include/firestarter/Environment/Payload/PayloadSettings.hpp @@ -0,0 +1,268 @@ +/****************************************************************************** + * FIRESTARTER - A Processor Stress Test Utility + * Copyright (C) 2024 TU Dresden, Center for Information Services and High + * Performance Computing + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Contact: daniel.hackenberg@tu-dresden.de + *****************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace firestarter::environment::payload { + +/// This class represents the settings that can be changed in the high load routine of a payload. 
+struct PayloadSettings { +public: + using InstructionWithProportion = std::pair; + +private: + /// The number of threads for which this payload is available. Multiple ones may exist. The PayloadSettings are + /// concrete once this is set to contain only one element. + std::list Threads; + + /// The size of the L1i cache per physical CPU core. This value may be empty. + std::optional InstructionCacheSize; + + /// The size of the L1d,L2,...,L3 caches per physical CPU core. + std::list DataCacheBufferSize; + + /// The selected size of the buffer that is in the RAM on the physical CPU core. + unsigned RamBufferSize; + + /// The maximum number of instructions that should appear inside the high load routine. + unsigned Lines; + + /// This represents the instructions in combination with the number of times they should appear in the generated + /// sequence. + std::vector InstructionGroups; + + /// Get the number of items in the sequence that start with a given string. + /// \arg Sequence The sequence that is analyzed. + /// \arg Start The string that contains the start of the item names that should be counted in the sequence. + /// \returns The number of items in the sequence that start with the supplied string. + [[nodiscard]] static auto getSequenceStartCount(const std::vector& Sequence, const std::string& Start) + -> unsigned; + +public: + PayloadSettings() = delete; + + PayloadSettings(std::initializer_list Threads, std::initializer_list DataCacheBufferSize, + unsigned RamBufferSize, unsigned Lines, std::vector&& InstructionGroups) + : Threads(Threads) + , DataCacheBufferSize(DataCacheBufferSize) + , RamBufferSize(RamBufferSize) + , Lines(Lines) + , InstructionGroups(std::move(InstructionGroups)) {} + + /// Generate a sequence of items interleaved with one another based on a supplied number of how many times each item + /// should appear in the resulting sequence.
+ /// \arg Proportion The mapping of items defined by a string and the number of times this item should appear in the + /// resulting sequence. + /// \returns The sequence that is generated from the supplied proportions + [[nodiscard]] static auto generateSequence(const std::vector& Proportion) + -> std::vector; + + /// Get the number of items in the sequence that start with "L2". + /// \arg Sequence The sequence that is analyzed. + /// \returns The number of items in the sequence that start with "L2". + [[nodiscard]] static auto getL2SequenceCount(const std::vector& Sequence) -> unsigned { + return getSequenceStartCount(Sequence, "L2"); + }; + + /// Get the number of items in the sequence that start with "L3". + /// \arg Sequence The sequence that is analyzed. + /// \returns The number of items in the sequence that start with "L3". + [[nodiscard]] static auto getL3SequenceCount(const std::vector& Sequence) -> unsigned { + return getSequenceStartCount(Sequence, "L3"); + }; + + /// Get the number of items in the sequence that start with "RAM". + /// \arg Sequence The sequence that is analyzed. + /// \returns The number of items in the sequence that start with "RAM". + [[nodiscard]] static auto getRAMSequenceCount(const std::vector& Sequence) -> unsigned { + return getSequenceStartCount(Sequence, "RAM"); + }; + + /// Get the maximum number of repetitions of the supplied sequence so that the size of the sequence times the + /// number of repetitions is at most the number of lines. The number of repetitions is an unsigned number. + /// \arg Sequence The reference to the sequence that should be repeated multiple times + /// \arg NumberOfLines The maximum number of entries in the repeated sequence + /// \returns The number of repetitions of the sequence (zero for an empty sequence).
+ [[nodiscard]] static auto getNumberOfSequenceRepetitions(const std::vector& Sequence, + const unsigned NumberOfLines) -> unsigned { + if (Sequence.empty()) { + return 0; + } + return NumberOfLines / Sequence.size(); + }; + + /// Get the number of accesses that can be made to 80% of the L2 cache size (each incrementing the pointer to the + /// cache) before the pointer needs to be reset to the original value. This assumes that each L2 item in the sequence + /// increments the pointer by one cache line (64B). It is also assumed that the number of accesses fits at least once + /// into this cache. This should always be the case on modern CPUs. + /// \arg Sequence The reference to the sequence. + /// \arg NumberOfLines The maximum number of entries in the repeated sequence. + /// \arg Size The size of the L2 Cache. + /// \returns The maximum number of iterations of the repeated sequence to fill up to 80% of the L2 cache. + [[nodiscard]] static auto getL2LoopCount(const std::vector& Sequence, unsigned NumberOfLines, + unsigned Size) -> unsigned; + + /// Get the number of accesses that can be made to 80% of the L3 cache size (each incrementing the pointer to the + /// cache) before the pointer needs to be reset to the original value. This assumes that each L3 item in the sequence + /// increments the pointer by one cache line (64B). See the note about assumptions on the size of the cache in the + /// documentation of getL2LoopCount. + /// \arg Sequence The reference to the sequence. + /// \arg NumberOfLines The maximum number of entries in the repeated sequence. + /// \arg Size The size of the L3 Cache. + /// \returns The maximum number of iterations of the repeated sequence to fill up to 80% of the L3 cache.
+ [[nodiscard]] static auto getL3LoopCount(const std::vector& Sequence, unsigned NumberOfLines, + unsigned Size) -> unsigned; + + /// Get the number of accesses that can be made to 100% of the RAM size (each incrementing the pointer to the ram) + /// before the pointer needs to be reset to the original value. This assumes that each RAM item in the sequence + /// increments the pointer by one cache line (64B). See the note about assumptions on the size of the cache in the + /// documentation of getL2LoopCount. + /// \arg Sequence The reference to the sequence. + /// \arg NumberOfLines The maximum number of entries in the repeated sequence. + /// \arg Size The size of the RAM. + /// \returns The maximum number of iterations of the repeated sequence to fill up to 100% of the RAM. + [[nodiscard]] static auto getRAMLoopCount(const std::vector& Sequence, unsigned NumberOfLines, + unsigned Size) -> unsigned; + + /// Are the payload settings concrete, i.e. can one specific payload be compiled with these settings. This is the + /// case if the option of threads is reduced to a single element. + [[nodiscard]] auto isConcreate() const -> bool { return Threads.size() == 1; } + + /// The number of threads which are available with the associated platform/payload. + [[nodiscard]] auto threads() const -> const auto& { return Threads; } + + /// The concrete number of threads which is selected. + [[nodiscard]] auto thread() const -> unsigned { + assert(isConcreate() && "Number of threads is not concreate."); + return Threads.front(); + } + + /// The available instruction cache size. This refers to the L1i-Cache on the physical CPU core. + [[nodiscard]] auto instructionCacheSize() const -> const auto& { return InstructionCacheSize; } + + /// The size of the L1d,L2,...,L3 caches per physical CPU core. + [[nodiscard]] auto dataCacheBufferSize() const -> const auto& { return DataCacheBufferSize; } + + /// The selected size of the buffer that is in the RAM on the physical CPU core.
+ [[nodiscard]] auto ramBufferSize() const -> auto{ return RamBufferSize; } + + /// Return the total buffer size for the data caches and the ram per physical CPU core. + [[nodiscard]] auto totalBufferSize() const -> std::size_t { + std::size_t Total = 0; + for (const auto& DataCacheSize : DataCacheBufferSize) { + Total += DataCacheSize; + } + Total += RamBufferSize; + return Total; + } + + /// The number of instruction groups which should be used in the payload per physical CPU core. + [[nodiscard]] auto lines() const -> auto{ return Lines; } + + /// The L1i-Cache size per thread on the physical CPU core. Empty if the cache size is unknown (or zero). + [[nodiscard]] auto instructionCacheSizePerThread() const -> std::optional { + auto InstructionCacheSize = this->InstructionCacheSize; + if (InstructionCacheSize && *InstructionCacheSize) { + return *InstructionCacheSize / thread(); + } + return {}; + } + + /// The size of the L1d,L2,...,L3 caches per thread on the physical CPU core. + [[nodiscard]] auto dataCacheBufferSizePerThread() const -> std::list { + auto DataCacheBufferSizePerThread = DataCacheBufferSize; + for (auto& Value : DataCacheBufferSizePerThread) { + Value /= thread(); + } + return DataCacheBufferSizePerThread; + } + + /// The selected size of the buffer that is in the RAM per thread on the physical CPU core. + [[nodiscard]] auto ramBufferSizePerThread() const -> auto{ return RamBufferSize / thread(); } + + /// Return the total buffer size for the data caches and the ram per thread on the physical CPU core. + [[nodiscard]] auto totalBufferSizePerThread() const -> std::size_t { return totalBufferSize() / thread(); } + + /// The number of instruction groups which should be used in the payload per thread on the physical CPU core. + [[nodiscard]] auto linesPerThread() const -> auto{ return Lines / thread(); } + + /// The vector of instruction groups with proportions.
+ [[nodiscard]] auto instructionGroups() const -> const auto& { return InstructionGroups; } + + /// Generate a sequence of items interleaved with one another based on the instruction groups. + /// \returns The sequence that is generated from the supplied proportions in the instruction groups. + [[nodiscard]] auto sequence() const -> std::vector { return generateSequence(instructionGroups()); } + + /// The vector of used instructions that are saved in the instruction groups + [[nodiscard]] auto instructionGroupItems() const -> std::vector { + std::vector Items; + Items.reserve(InstructionGroups.size()); + for (auto const& Pair : InstructionGroups) { + Items.push_back(Pair.first); + } + return Items; + } + + /// Get the string that represents the instructions in combination with the number of times they should appear in the + /// sequence. + [[nodiscard]] auto getInstructionGroupsString() const -> std::string { + std::stringstream Ss; + + for (auto const& [Name, Value] : InstructionGroups) { + Ss << Name << ":" << Value << ","; + } + + auto Str = Ss.str(); + if (!Str.empty()) { + Str.pop_back(); + } + + return Str; + } + + /// Make the settings concrete. + /// \arg InstructionCacheSize The detected size of the instruction cache. + /// \arg ThreadsPerCore The number of threads per physical CPU. + void concretize(std::optional InstructionCacheSize, unsigned ThreadsPerCore) { + this->InstructionCacheSize = InstructionCacheSize; + this->Threads = {ThreadsPerCore}; + } + + /// Save the supplied instruction groups with their proportion in the payload settings. + /// \arg InstructionGroups The vector with pairs of instructions and proportions + void selectInstructionGroups(std::vector const& InstructionGroups) { + this->InstructionGroups = InstructionGroups; + } + + /// Save the line count in the payload settings.
+ void setLineCount(unsigned LineCount) { this->Lines = LineCount; } +}; + +} // namespace firestarter::environment::payload diff --git a/include/firestarter/Measurement/Metric/RAPL.h b/include/firestarter/Environment/Payload/PayloadStats.hpp similarity index 63% rename from include/firestarter/Measurement/Metric/RAPL.h rename to include/firestarter/Environment/Payload/PayloadStats.hpp index d88e3d91..79b2b1e3 100644 --- a/include/firestarter/Measurement/Metric/RAPL.h +++ b/include/firestarter/Environment/Payload/PayloadStats.hpp @@ -1,6 +1,6 @@ /****************************************************************************** * FIRESTARTER - A Processor Stress Test Utility - * Copyright (C) 2020-2023 TU Dresden, Center for Information Services and High + * Copyright (C) 2024 TU Dresden, Center for Information Services and High * Performance Computing * * This program is free software: you can redistribute it and/or modify @@ -21,6 +21,18 @@ #pragma once -#include +namespace firestarter::environment::payload { -extern metric_interface_t rapl_metric; +/// This struct represents the stats a compiled payload has. +struct PayloadStats { + /// The number of flops computed per iteration of the high load routine. + unsigned Flops = 0; + + /// The number of bytes accessed to the main memory per iteration of the high load routine. 
+ unsigned Bytes = 0; + + /// The number of instructions in the load loop + unsigned Instructions = 0; +}; + +} // namespace firestarter::environment::payload diff --git a/include/firestarter/Environment/Platform/PlatformConfig.hpp b/include/firestarter/Environment/Platform/PlatformConfig.hpp index cbde3c68..40833b8c 100644 --- a/include/firestarter/Environment/Platform/PlatformConfig.hpp +++ b/include/firestarter/Environment/Platform/PlatformConfig.hpp @@ -21,83 +21,117 @@ #pragma once -#include -#include - -#include -#include -#include -#include -#include +#include "firestarter/Environment/CPUTopology.hpp" +#include "firestarter/Environment/Payload/Payload.hpp" +#include "firestarter/Logging/Log.hpp" namespace firestarter::environment::platform { +/// The payload in combination with settings and a short hand name for the specific microarchitecture this payload is +/// designed for. class PlatformConfig { private: - std::string _name; - std::list _threads; - payload::Payload *_payload; + /// The name of this platform. This is usually a short hand for the CPU microarchitecture e.g., HSW_COREI or + /// HSW_XEONEP. + std::string Name; -protected: - unsigned _instructionCacheSize; - std::list _dataCacheBufferSize; - unsigned _ramBufferSize; - unsigned _lines; + /// The settings for the associated payload. + payload::PayloadSettings Settings; + + /// The payload this platform should execute.
+ std::shared_ptr Payload; public: - PlatformConfig(std::string name, std::list threads, - unsigned instructionCacheSize, - std::initializer_list dataCacheBufferSize, - unsigned ramBufferSize, unsigned lines, - payload::Payload *payload) - : _name(name), _threads(threads), _payload(payload), - _instructionCacheSize(instructionCacheSize), - _dataCacheBufferSize(dataCacheBufferSize), - _ramBufferSize(ramBufferSize), _lines(lines) {} - virtual ~PlatformConfig() { delete _payload; } - - const std::string &name() const { return _name; } - unsigned instructionCacheSize() const { return _instructionCacheSize; } - const std::list &dataCacheBufferSize() const { - return _dataCacheBufferSize; - } - unsigned ramBufferSize() const { return _ramBufferSize; } - unsigned lines() const { return _lines; } - payload::Payload const &payload() const { return *_payload; } - - std::map getThreadMap() const { - std::map threadMap; - - for (auto const &thread : _threads) { - std::stringstream functionName; - functionName << "FUNC_" << name() << "_" << payload().name() << "_" - << thread << "T"; - threadMap[thread] = functionName.str(); - } + /// Getter for the name of the platform. + [[nodiscard]] auto name() const -> const auto& { return Name; } - return threadMap; - } + /// Getter for the settings of the platform. + [[nodiscard]] auto settings() const -> const auto& { return Settings; } - bool isAvailable() const { return payload().isAvailable(); } + /// Reference to the settings. This allows them to be overridden. + [[nodiscard]] auto settings() -> auto& { return Settings; } - virtual bool isDefault() const = 0; + /// Getter for the payload of the platform. + [[nodiscard]] auto payload() const -> const auto& { return Payload; } - virtual std::vector> - getDefaultPayloadSettings() const = 0; + /// Check if this platform is available on the current system. This translates to whether the cpu extensions are + /// available for the payload that is used.
+ /// \arg Topology The reference to the CPUTopology that is used to check against if this platform is supported. + /// \returns true if the platform is supported on the given CPUTopology. + [[nodiscard]] auto isAvailable(const CPUTopology& Topology) const -> bool { return isAvailable(&Topology); } - std::string getDefaultPayloadSettingsString() const { - std::stringstream ss; + /// Check if this platform is available and the default on the current system. + /// \arg Topology The reference to the CPUTopology that is used to check against if this payload is supported. + /// \returns true if the platform is the default one for a given CPUTopology. + [[nodiscard]] auto isDefault(const CPUTopology& Topology) const -> bool { return isDefault(&Topology); } - for (auto const &[name, value] : this->getDefaultPayloadSettings()) { - ss << name << ":" << value << ","; +protected: + /// Check if this platform is available on the current system. This translates to whether the cpu extensions are + /// available for the payload that is used. + /// \arg Topology The pointer to the CPUTopology that is used to check against if this platform is supported. + /// \returns true if the platform is supported on the given CPUTopology. + [[nodiscard]] virtual auto isAvailable(const CPUTopology* Topology) const -> bool { + return payload()->isAvailable(*Topology); + } + + /// Check if this platform is available and the default on the current system. + /// \arg Topology The pointer to the CPUTopology that is used to check against if this payload is supported. + /// \returns true if the platform is the default one for a given CPUTopology.
+ [[nodiscard]] virtual auto isDefault(const CPUTopology*) const -> bool = 0; + +public: + PlatformConfig() = delete; + + PlatformConfig(std::string Name, payload::PayloadSettings&& Settings, + std::shared_ptr&& Payload) noexcept + : Name(std::move(Name)) + , Settings(std::move(Settings)) + , Payload(std::move(Payload)) {} + + virtual ~PlatformConfig() = default; + + /// Clone the platform config. + [[nodiscard]] virtual auto clone() const -> std::unique_ptr = 0; + + /// Clone a concrete platform config. + /// \arg InstructionCacheSize The detected size of the instruction cache. + /// \arg ThreadsPerCore The number of threads per physical CPU core. + [[nodiscard]] virtual auto cloneConcreate(std::optional InstructionCacheSize, unsigned ThreadsPerCore) const + -> std::unique_ptr = 0; + + /// The function name for this platform config given a specific thread per core count. + /// \arg ThreadsPerCore The number of threads per core. + /// \returns The name of the function (a platform name, payload name and a specific thread per core count) + [[nodiscard]] auto functionName(unsigned ThreadsPerCore) const -> std::string { + return "FUNC_" + Name + "_" + Payload->name() + "_" + std::to_string(ThreadsPerCore) + "T"; + }; + + /// Get the concrete function name. + [[nodiscard]] auto functionName() const -> std::string { + assert(Settings.isConcreate() && "Settings must be concreate for a concreate function name"); + return functionName(Settings.thread()); + }; + + /// Print a summary for the selected platform/payload with given settings.
+ void printCodePathSummary() const { + assert(Settings.isConcreate() && "Setting must be concreate to print the code path summary."); + + log::info() << "\n" + << " Taking " << Payload->name() << " path optimized for " << Name << " - " << Settings.thread() + << " thread(s) per core\n" + << " Used buffersizes per thread:"; + + if (Settings.instructionCacheSizePerThread()) { + log::info() << " - L1i-Cache: " << *Settings.instructionCacheSizePerThread() << " Bytes"; } - auto str = ss.str(); - if (str.size() > 0) { - str.pop_back(); + unsigned I = 1; + for (auto const& Bytes : Settings.dataCacheBufferSizePerThread()) { + log::info() << " - L" << I << "d-Cache: " << Bytes << " Bytes"; + I++; } - return str; + log::info() << " - Memory: " << Settings.ramBufferSizePerThread() << " Bytes"; } }; diff --git a/include/firestarter/Environment/Platform/RuntimeConfig.hpp b/include/firestarter/Environment/Platform/RuntimeConfig.hpp deleted file mode 100644 index 2ed821ea..00000000 --- a/include/firestarter/Environment/Platform/RuntimeConfig.hpp +++ /dev/null @@ -1,131 +0,0 @@ -/****************************************************************************** - * FIRESTARTER - A Processor Stress Test Utility - * Copyright (C) 2020-2023 TU Dresden, Center for Information Services and High - * Performance Computing - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- * - * Contact: daniel.hackenberg@tu-dresden.de - *****************************************************************************/ - -#pragma once - -#include - -#include - -namespace firestarter::environment::platform { - -class RuntimeConfig { -private: - PlatformConfig const &_platformConfig; - std::unique_ptr _payload; - unsigned _thread; - std::vector> _payloadSettings; - unsigned _instructionCacheSize; - std::list _dataCacheBufferSize; - unsigned _ramBufferSize; - unsigned _lines; - -public: - RuntimeConfig(PlatformConfig const &platformConfig, unsigned thread, - unsigned detectedInstructionCacheSize) - : _platformConfig(platformConfig), _payload(nullptr), _thread(thread), - _payloadSettings(platformConfig.getDefaultPayloadSettings()), - _instructionCacheSize(platformConfig.instructionCacheSize()), - _dataCacheBufferSize(platformConfig.dataCacheBufferSize()), - _ramBufferSize(platformConfig.ramBufferSize()), - _lines(platformConfig.lines()) { - if (detectedInstructionCacheSize != 0) { - this->_instructionCacheSize = detectedInstructionCacheSize; - } - }; - - RuntimeConfig(const RuntimeConfig &c) - : _platformConfig(c.platformConfig()), - _payload(c.platformConfig().payload().clone()), _thread(c.thread()), - _payloadSettings(c.payloadSettings()), - _instructionCacheSize(c.instructionCacheSize()), - _dataCacheBufferSize(c.dataCacheBufferSize()), - _ramBufferSize(c.ramBufferSize()), _lines(c.lines()) {} - - ~RuntimeConfig() { _payload.reset(); } - - PlatformConfig const &platformConfig() const { return _platformConfig; } - payload::Payload &payload() const { -#if defined(__clang__) -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wunused-value" -#endif -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-value" - assert(("Payload pointer is null. 
Each thread has to use it's own " - "RuntimeConfig", - _payload != nullptr)); -#pragma GCC diagnostic pop -#if defined(__clang__) -#pragma clang diagnostic pop -#endif - return *_payload; - } - unsigned thread() const { return _thread; } - const std::vector> &payloadSettings() const { - return _payloadSettings; - } - std::vector payloadItems() const { - std::vector items; - for (auto const &pair : _payloadSettings) { - items.push_back(pair.first); - } - return items; - } - - unsigned instructionCacheSize() const { return _instructionCacheSize; } - const std::list &dataCacheBufferSize() const { - return _dataCacheBufferSize; - } - unsigned ramBufferSize() const { return _ramBufferSize; } - unsigned lines() const { return _lines; } - - void setPayloadSettings( - std::vector> const &payloadSettings) { - this->_payloadSettings = payloadSettings; - } - - void setLineCount(unsigned lineCount) { this->_lines = lineCount; } - - void printCodePathSummary() const { - log::info() << "\n" - << " Taking " << platformConfig().payload().name() - << " path optimized for " << platformConfig().name() << " - " - << thread() << " thread(s) per core\n" - << " Used buffersizes per thread:"; - - if (instructionCacheSize() != 0) { - log::info() << " - L1i-Cache: " << instructionCacheSize() / thread() - << " Bytes"; - } - - unsigned i = 1; - for (auto const &bytes : dataCacheBufferSize()) { - log::info() << " - L" << i << "d-Cache: " << bytes / thread() - << " Bytes"; - i++; - } - - log::info() << " - Memory: " << ramBufferSize() / thread() << " Bytes"; - } -}; - -} // namespace firestarter::environment::platform diff --git a/include/firestarter/Environment/X86/Payload/AVX512Payload.hpp b/include/firestarter/Environment/X86/Payload/AVX512Payload.hpp index b23f1b97..20bfc491 100644 --- a/include/firestarter/Environment/X86/Payload/AVX512Payload.hpp +++ b/include/firestarter/Environment/X86/Payload/AVX512Payload.hpp @@ -21,37 +21,50 @@ #pragma once -#include +#include 
"firestarter/Environment/X86/Payload/X86Payload.hpp" namespace firestarter::environment::x86::payload { + +/// This payload is designed for the AVX512 foundation CPU extension. class AVX512Payload final : public X86Payload { public: - AVX512Payload(asmjit::CpuFeatures const &supportedFeatures) - : X86Payload(supportedFeatures, {asmjit::CpuFeatures::X86::kAVX512_F}, - "AVX512", 8, 32) {} - - int compilePayload( - std::vector> const &proportion, - unsigned instructionCacheSize, - std::list const &dataCacheBufferSize, unsigned ramBufferSize, - unsigned thread, unsigned numberOfLines, bool dumpRegisters, - bool errorDetection) override; - std::list getAvailableInstructions() const override; - void init(unsigned long long *memoryAddr, - unsigned long long bufferSize) override; + AVX512Payload() noexcept + : X86Payload(/*FeatureRequests=*/{asmjit::CpuFeatures::X86::kAVX512_F}, /*Name=*/"AVX512", /*RegisterSize=*/8, + /*RegisterCount=*/32, + /*InstructionFlops=*/ + {{"REG", 32}, + {"L1_L", 32}, + {"L1_BROADCAST", 16}, + {"L1_S", 16}, + {"L1_LS", 16}, + {"L2_L", 32}, + {"L2_S", 16}, + {"L2_LS", 16}, + {"L3_L", 32}, + {"L3_S", 16}, + {"L3_LS", 16}, + {"L3_P", 16}, + {"RAM_L", 32}, + {"RAM_S", 16}, + {"RAM_LS", 16}, + {"RAM_P", 16}}, + /*InstructionMemory=*/{{"RAM_L", 64}, {"RAM_S", 128}, {"RAM_LS", 128}, {"RAM_P", 64}}) {} - firestarter::environment::payload::Payload *clone() const override { - return new AVX512Payload(this->supportedFeatures()); - }; + /// Compile this payload with supplied settings and optional features. + /// \arg Settings The settings for this payload e.g., the number of lines or the size of the caches. + /// \arg DumpRegisters Should the code to support dumping registers be baked into the high load routine of the + /// compiled payload. + /// \arg ErrorDetection Should the code to support error detection between thread be baked into the high load routine + /// of the compiled payload. 
+ /// \returns The compiled payload that provides access to the init and load functions. + [[nodiscard]] auto compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters, + bool ErrorDetection) const + -> environment::payload::CompiledPayload::UniquePtr override; private: - const std::map instructionFlops = { - {"REG", 32}, {"L1_L", 32}, {"L1_BROADCAST", 16}, {"L1_S", 16}, - {"L1_LS", 16}, {"L2_L", 32}, {"L2_S", 16}, {"L2_LS", 16}, - {"L3_L", 32}, {"L3_S", 16}, {"L3_LS", 16}, {"L3_P", 16}, - {"RAM_L", 32}, {"RAM_S", 16}, {"RAM_LS", 16}, {"RAM_P", 16}}; - - const std::map instructionMemory = { - {"RAM_L", 64}, {"RAM_S", 128}, {"RAM_LS", 128}, {"RAM_P", 64}}; + /// Function to initialize the memory used by the high load function. + /// \arg MemoryAddr The pointer to the memory. + /// \arg BufferSize The number of doubles that is allocated in MemoryAddr. + void init(double* MemoryAddr, uint64_t BufferSize) const override; }; } // namespace firestarter::environment::x86::payload diff --git a/include/firestarter/Environment/X86/Payload/AVXPayload.hpp b/include/firestarter/Environment/X86/Payload/AVXPayload.hpp index 0a6e8014..24ef7a15 100644 --- a/include/firestarter/Environment/X86/Payload/AVXPayload.hpp +++ b/include/firestarter/Environment/X86/Payload/AVXPayload.hpp @@ -21,36 +21,49 @@ #pragma once -#include +#include "firestarter/Environment/X86/Payload/X86Payload.hpp" namespace firestarter::environment::x86::payload { + +/// This payload is designed for the AVX CPU extension. 
class AVXPayload final : public X86Payload { public: - AVXPayload(asmjit::CpuFeatures const &supportedFeatures) - : X86Payload(supportedFeatures, {asmjit::CpuFeatures::X86::kAVX}, "AVX", - 4, 16) {} - - int compilePayload( - std::vector> const &proportion, - unsigned instructionCacheSize, - std::list const &dataCacheBufferSize, unsigned ramBufferSize, - unsigned thread, unsigned numberOfLines, bool dumpRegisters, - bool errorDetection) override; - std::list getAvailableInstructions() const override; - void init(unsigned long long *memoryAddr, - unsigned long long bufferSize) override; + AVXPayload() + : X86Payload(/*FeatureRequests=*/{asmjit::CpuFeatures::X86::kAVX}, /*Name=*/"AVX", /*RegisterSize=*/4, + /*RegisterCount=*/16, + /*InstructionFlops=*/ + {{"REG", 4}, + {"L1_L", 4}, + {"L1_S", 4}, + {"L1_LS", 4}, + {"L2_L", 4}, + {"L2_S", 4}, + {"L2_LS", 4}, + {"L3_L", 4}, + {"L3_S", 4}, + {"L3_LS", 4}, + {"L3_P", 4}, + {"RAM_L", 4}, + {"RAM_S", 4}, + {"RAM_LS", 4}, + {"RAM_P", 4}}, + /*InstructionMemory=*/{{"RAM_L", 64}, {"RAM_S", 128}, {"RAM_LS", 128}, {"RAM_P", 64}}) {} - firestarter::environment::payload::Payload *clone() const override { - return new AVXPayload(this->supportedFeatures()); - }; + /// Compile this payload with supplied settings and optional features. + /// \arg Settings The settings for this payload e.g., the number of lines or the size of the caches. + /// \arg DumpRegisters Should the code to support dumping registers be baked into the high load routine of the + /// compiled payload. + /// \arg ErrorDetection Should the code to support error detection between thread be baked into the high load routine + /// of the compiled payload. + /// \returns The compiled payload that provides access to the init and load functions. 
+ [[nodiscard]] auto compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters, + bool ErrorDetection) const + -> environment::payload::CompiledPayload::UniquePtr override; private: - const std::map instructionFlops = { - {"REG", 4}, {"L1_L", 4}, {"L1_S", 4}, {"L1_LS", 4}, {"L2_L", 4}, - {"L2_S", 4}, {"L2_LS", 4}, {"L3_L", 4}, {"L3_S", 4}, {"L3_LS", 4}, - {"L3_P", 4}, {"RAM_L", 4}, {"RAM_S", 4}, {"RAM_LS", 4}, {"RAM_P", 4}}; - - const std::map instructionMemory = { - {"RAM_L", 64}, {"RAM_S", 128}, {"RAM_LS", 128}, {"RAM_P", 64}}; + /// Function to initialize the memory used by the high load function. + /// \arg MemoryAddr The pointer to the memory. + /// \arg BufferSize The number of doubles that is allocated in MemoryAddr. + void init(double* MemoryAddr, uint64_t BufferSize) const override; }; } // namespace firestarter::environment::x86::payload diff --git a/include/firestarter/Environment/X86/Payload/CompiledX86Payload.hpp b/include/firestarter/Environment/X86/Payload/CompiledX86Payload.hpp new file mode 100644 index 00000000..776f83f4 --- /dev/null +++ b/include/firestarter/Environment/X86/Payload/CompiledX86Payload.hpp @@ -0,0 +1,81 @@ +/****************************************************************************** + * FIRESTARTER - A Processor Stress Test Utility + * Copyright (C) 2024 TU Dresden, Center for Information Services and High + * Performance Computing + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Contact: daniel.hackenberg@tu-dresden.de + *****************************************************************************/ + +#pragma once + +#include "firestarter/Environment/Payload/CompiledPayload.hpp" +#include "firestarter/Logging/Log.hpp" + +#include +#include + +namespace firestarter::environment::x86::payload { + +/// This class provides the functionality to compile a payload created with asmjit and create a unique pointer to the +/// CompiledPayload class which can be used to execute the functions of this payload. +class CompiledX86Payload final : public environment::payload::CompiledPayload { +private: + // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) + inline static asmjit::JitRuntime Runtime = asmjit::JitRuntime(); + + /// Custom deleter to release the memory of the high load function from the asmjit runtime. NOTE(review): only the JIT code is released here; the Payload object itself is not deleted in this function; confirm where it is freed. + /// \arg Payload The pointer to this class + static void deleter(CompiledX86Payload* Payload) { + if (Payload && Payload->highLoadFunctionPtr()) { + Runtime.release(Payload->highLoadFunctionPtr()); + } + } + /// Custom deleter taking the base class pointer; it dynamic_casts Payload and hands the result to deleter again (presumably the CompiledX86Payload overload). + /// \arg Payload The pointer to this class + static void deleter(CompiledPayload* Payload) { deleter(dynamic_cast(Payload)); } + + /// Wrap the CompiledPayload class and forward all arguments. + /// \arg Stats The stats of the high load function from the payload. + /// \arg PayloadPtr A unique pointer to the payload class to allow calling the init and low load functions which do + /// not change based on different payload settings. + /// \arg HighLoadFunction The pointer to the compiled high load function.
+ CompiledX86Payload(const environment::payload::PayloadStats& Stats, + std::unique_ptr&& PayloadPtr, HighLoadFunctionPtr HighLoadFunction) + : CompiledPayload(Stats, std::move(PayloadPtr), HighLoadFunction) {} + +public: + CompiledX86Payload() = delete; + ~CompiledX86Payload() override = default; + + /// Create a unique pointer to a compiled payload from payload stats and assembly in a code holder. + /// \tparam DerivedPayload The payload class from which the CodeHolder with the assembly was created. + /// \arg Stats The stats of the payload that is contained in the CodeHolder. + /// \arg Code The CodeHolder that contains the assembly instructions making up the payload. This will be added to the + /// JitRuntime and a pointer to the function will be provided to the CompiledPayload class. + /// \returns The unique pointer to the compiled payload. NOTE(review): if Runtime.add reports an error it is only logged and the payload is still constructed with the HighLoadFunction value left by that call; confirm callers handle this. + template + [[nodiscard]] static auto create(environment::payload::PayloadStats Stats, asmjit::CodeHolder& Code) -> UniquePtr { + HighLoadFunctionPtr HighLoadFunction{}; + const auto Err = Runtime.add(&HighLoadFunction, &Code); + if (Err) { + workerLog::error() << "Asmjit adding Assembler to JitRuntime failed"; + } + + return {new CompiledX86Payload(Stats, std::move(std::make_unique()), HighLoadFunction), deleter}; + } +}; + +} // namespace firestarter::environment::x86::payload diff --git a/include/firestarter/Environment/X86/Payload/FMA4Payload.hpp b/include/firestarter/Environment/X86/Payload/FMA4Payload.hpp index 47d8a778..f0e711f6 100644 --- a/include/firestarter/Environment/X86/Payload/FMA4Payload.hpp +++ b/include/firestarter/Environment/X86/Payload/FMA4Payload.hpp @@ -21,39 +21,49 @@ #pragma once -#include +#include "firestarter/Environment/X86/Payload/X86Payload.hpp" namespace firestarter::environment::x86::payload { +/// This payload is designed for the FMA4 CPU extension.
class FMA4Payload final : public X86Payload { public: - FMA4Payload(asmjit::CpuFeatures const &supportedFeatures) - : X86Payload( - supportedFeatures, - {asmjit::CpuFeatures::X86::kAVX, asmjit::CpuFeatures::X86::kFMA4}, - "FMA4", 4, 16) {} + FMA4Payload() noexcept + : X86Payload(/*FeatureRequests=*/{asmjit::CpuFeatures::X86::kAVX, asmjit::CpuFeatures::X86::kFMA4}, + /*Name=*/"FMA4", /*RegisterSize=*/4, /*RegisterCount=*/16, + /*InstructionFlops=*/ + {{"REG", 8}, + {"L1_L", 12}, + {"L1_S", 8}, + {"L1_LS", 8}, + {"L2_L", 8}, + {"L2_S", 4}, + {"L2_LS", 4}, + {"L3_L", 8}, + {"L3_S", 4}, + {"L3_LS", 4}, + {"L3_P", 4}, + {"RAM_L", 8}, + {"RAM_S", 4}, + {"RAM_LS", 4}, + {"RAM_P", 4}}, + /*InstructionMemory=*/{{"RAM_L", 64}, {"RAM_S", 128}, {"RAM_LS", 128}, {"RAM_P", 64}}) {} - int compilePayload( - std::vector> const &proportion, - unsigned instructionCacheSize, - std::list const &dataCacheBufferSize, unsigned ramBufferSize, - unsigned thread, unsigned numberOfLines, bool dumpRegisters, - bool errorDetection) override; - std::list getAvailableInstructions() const override; - void init(unsigned long long *memoryAddr, - unsigned long long bufferSize) override; - - firestarter::environment::payload::Payload *clone() const override { - return new FMA4Payload(this->supportedFeatures()); - }; + /// Compile this payload with supplied settings and optional features. + /// \arg Settings The settings for this payload e.g., the number of lines or the size of the caches. + /// \arg DumpRegisters Should the code to support dumping registers be baked into the high load routine of the + /// compiled payload. + /// \arg ErrorDetection Should the code to support error detection between thread be baked into the high load routine + /// of the compiled payload. + /// \returns The compiled payload that provides access to the init and load functions. 
+ [[nodiscard]] auto compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters, + bool ErrorDetection) const + -> environment::payload::CompiledPayload::UniquePtr override; private: - const std::map instructionFlops = { - {"REG", 8}, {"L1_L", 12}, {"L1_S", 8}, {"L1_LS", 8}, {"L2_L", 8}, - {"L2_S", 4}, {"L2_LS", 4}, {"L3_L", 8}, {"L3_S", 4}, {"L3_LS", 4}, - {"L3_P", 4}, {"RAM_L", 8}, {"RAM_S", 4}, {"RAM_LS", 4}, {"RAM_P", 4}}; - - const std::map instructionMemory = { - {"RAM_L", 64}, {"RAM_S", 128}, {"RAM_LS", 128}, {"RAM_P", 64}}; + /// Function to initialize the memory used by the high load function. + /// \arg MemoryAddr The pointer to the memory. + /// \arg BufferSize The number of doubles that is allocated in MemoryAddr. + void init(double* MemoryAddr, uint64_t BufferSize) const override; }; -} // namespace firestarter::environment::x86::payload +} // namespace firestarter::environment::x86::payload \ No newline at end of file diff --git a/include/firestarter/Environment/X86/Payload/FMAPayload.hpp b/include/firestarter/Environment/X86/Payload/FMAPayload.hpp index 57ab455d..8280a5b2 100644 --- a/include/firestarter/Environment/X86/Payload/FMAPayload.hpp +++ b/include/firestarter/Environment/X86/Payload/FMAPayload.hpp @@ -21,40 +21,39 @@ #pragma once -#include +#include "firestarter/Environment/X86/Payload/X86Payload.hpp" namespace firestarter::environment::x86::payload { + +/// This payload is designed for the FMA CPU extension. 
class FMAPayload final : public X86Payload { public: - FMAPayload(asmjit::CpuFeatures const &supportedFeatures) - : X86Payload(supportedFeatures, - {asmjit::CpuFeatures::X86::kAVX, asmjit::CpuFeatures::X86::kFMA}, - "FMA", 4, 16) {} - - int compilePayload( - std::vector> const &proportion, - unsigned instructionCacheSize, - std::list const &dataCacheBufferSize, unsigned ramBufferSize, - unsigned thread, unsigned numberOfLines, bool dumpRegisters, - bool errorDetection) override; - std::list getAvailableInstructions() const override; - void init(unsigned long long *memoryAddr, - unsigned long long bufferSize) override; + FMAPayload() noexcept + : X86Payload(/*FeatureRequests=*/{asmjit::CpuFeatures::X86::kAVX, asmjit::CpuFeatures::X86::kFMA}, /*Name=*/"FMA", + /*RegisterSize=*/4, /*RegisterCount=*/16, + /*InstructionFlops=*/{{"REG", 16}, {"L1_L", 16}, {"L1_2L", 16}, {"L1_S", 8}, + {"L1_LS", 8}, {"L1_LS_256", 8}, {"L1_2LS_256", 16}, {"L2_L", 16}, + {"L2_S", 8}, {"L2_LS", 8}, {"L2_LS_256", 8}, {"L2_2LS_256", 16}, + {"L3_L", 16}, {"L3_S", 8}, {"L3_LS", 8}, {"L3_LS_256", 8}, + {"L3_P", 8}, {"RAM_L", 16}, {"RAM_S", 8}, {"RAM_LS", 8}, + {"RAM_P", 8}}, + /*InstructionMemory=*/{{"RAM_L", 64}, {"RAM_S", 128}, {"RAM_LS", 128}, {"RAM_P", 64}}) {} - firestarter::environment::payload::Payload *clone() const override { - return new FMAPayload(this->supportedFeatures()); - }; + /// Compile this payload with supplied settings and optional features. + /// \arg Settings The settings for this payload e.g., the number of lines or the size of the caches. + /// \arg DumpRegisters Should the code to support dumping registers be baked into the high load routine of the + /// compiled payload. + /// \arg ErrorDetection Should the code to support error detection between thread be baked into the high load routine + /// of the compiled payload. + /// \returns The compiled payload that provides access to the init and load functions. 
+ [[nodiscard]] auto compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters, + bool ErrorDetection) const + -> environment::payload::CompiledPayload::UniquePtr override; private: - const std::map instructionFlops = { - {"REG", 16}, {"L1_L", 16}, {"L1_2L", 16}, {"L1_S", 8}, - {"L1_LS", 8}, {"L1_LS_256", 8}, {"L1_2LS_256", 16}, {"L2_L", 16}, - {"L2_S", 8}, {"L2_LS", 8}, {"L2_LS_256", 8}, {"L2_2LS_256", 16}, - {"L3_L", 16}, {"L3_S", 8}, {"L3_LS", 8}, {"L3_LS_256", 8}, - {"L3_P", 8}, {"RAM_L", 16}, {"RAM_S", 8}, {"RAM_LS", 8}, - {"RAM_P", 8}}; - - const std::map instructionMemory = { - {"RAM_L", 64}, {"RAM_S", 128}, {"RAM_LS", 128}, {"RAM_P", 64}}; + /// Function to initialize the memory used by the high load function. + /// \arg MemoryAddr The pointer to the memory. + /// \arg BufferSize The number of doubles that is allocated in MemoryAddr. + void init(double* MemoryAddr, uint64_t BufferSize) const override; }; } // namespace firestarter::environment::x86::payload diff --git a/include/firestarter/Environment/X86/Payload/SSE2Payload.hpp b/include/firestarter/Environment/X86/Payload/SSE2Payload.hpp index d02a28e9..557af0d4 100644 --- a/include/firestarter/Environment/X86/Payload/SSE2Payload.hpp +++ b/include/firestarter/Environment/X86/Payload/SSE2Payload.hpp @@ -21,36 +21,49 @@ #pragma once -#include +#include "firestarter/Environment/X86/Payload/X86Payload.hpp" namespace firestarter::environment::x86::payload { + +/// This payload is designed for the SSE2 CPU extension. 
class SSE2Payload final : public X86Payload { public: - SSE2Payload(asmjit::CpuFeatures const &supportedFeatures) - : X86Payload(supportedFeatures, {asmjit::CpuFeatures::X86::kSSE2}, - "SSE2", 2, 16) {} - - int compilePayload( - std::vector> const &proportion, - unsigned instructionCacheSize, - std::list const &dataCacheBufferSize, unsigned ramBufferSize, - unsigned thread, unsigned numberOfLines, bool dumpRegisters, - bool errorDetection) override; - std::list getAvailableInstructions() const override; - void init(unsigned long long *memoryAddr, - unsigned long long bufferSize) override; + SSE2Payload() noexcept + : X86Payload(/*FeatureRequests=*/{asmjit::CpuFeatures::X86::kSSE2}, /*Name=*/"SSE2", /*RegisterSize=*/2, + /*RegisterCount=*/16, + /*InstructionFlops=*/ + {{"REG", 2}, + {"L1_L", 2}, + {"L1_S", 2}, + {"L1_LS", 2}, + {"L2_L", 2}, + {"L2_S", 2}, + {"L2_LS", 2}, + {"L3_L", 2}, + {"L3_S", 2}, + {"L3_LS", 2}, + {"L3_P", 2}, + {"RAM_L", 2}, + {"RAM_S", 2}, + {"RAM_LS", 2}, + {"RAM_P", 2}}, + /*InstructionMemory=*/{{"RAM_L", 64}, {"RAM_S", 128}, {"RAM_LS", 128}, {"RAM_P", 64}}) {} - firestarter::environment::payload::Payload *clone() const override { - return new SSE2Payload(this->supportedFeatures()); - }; + /// Compile this payload with supplied settings and optional features. + /// \arg Settings The settings for this payload e.g., the number of lines or the size of the caches. + /// \arg DumpRegisters Should the code to support dumping registers be baked into the high load routine of the + /// compiled payload. + /// \arg ErrorDetection Should the code to support error detection between thread be baked into the high load routine + /// of the compiled payload. + /// \returns The compiled payload that provides access to the init and load functions. 
+ [[nodiscard]] auto compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters, + bool ErrorDetection) const + -> environment::payload::CompiledPayload::UniquePtr override; private: - const std::map instructionFlops = { - {"REG", 2}, {"L1_L", 2}, {"L1_S", 2}, {"L1_LS", 2}, {"L2_L", 2}, - {"L2_S", 2}, {"L2_LS", 2}, {"L3_L", 2}, {"L3_S", 2}, {"L3_LS", 2}, - {"L3_P", 2}, {"RAM_L", 2}, {"RAM_S", 2}, {"RAM_LS", 2}, {"RAM_P", 2}}; - - const std::map instructionMemory = { - {"RAM_L", 64}, {"RAM_S", 128}, {"RAM_LS", 128}, {"RAM_P", 64}}; + /// Function to initialize the memory used by the high load function. + /// \arg MemoryAddr The pointer to the memory. + /// \arg BufferSize The number of doubles that is allocated in MemoryAddr. + void init(double* MemoryAddr, uint64_t BufferSize) const override; }; } // namespace firestarter::environment::x86::payload diff --git a/include/firestarter/Environment/X86/Payload/X86Payload.hpp b/include/firestarter/Environment/X86/Payload/X86Payload.hpp index c0ebadc5..44d5bd4f 100644 --- a/include/firestarter/Environment/X86/Payload/X86Payload.hpp +++ b/include/firestarter/Environment/X86/Payload/X86Payload.hpp @@ -21,82 +21,544 @@ #pragma once -#include -#include - -#include -#include +#include "firestarter/Constants.hpp" // IWYU pragma: keep +#include "firestarter/DumpRegisterStruct.hpp" // IWYU pragma: keep +#include "firestarter/Environment/Payload/Payload.hpp" +#include "firestarter/Environment/X86/X86CPUTopology.hpp" +#include "firestarter/LoadWorkerMemory.hpp" +#include "firestarter/Logging/Log.hpp" // IWYU pragma: keep #include +#include +#include +#include // IWYU pragma: keep +#include +#include -#define INIT_BLOCKSIZE 1024 +constexpr const auto InitBlocksize = 1024; +/// This abstract class models a payload that can be compiled with settings and executed for X86 CPUs. 
namespace firestarter::environment::x86::payload { class X86Payload : public environment::payload::Payload { private: - // we can use this to check, if our platform support this payload - asmjit::CpuFeatures const &_supportedFeatures; - std::list featureRequests; + /// This list contains the features (cpu extensions) that are required to execute the payload. + std::list FeatureRequests; -protected: - // asmjit::CodeHolder code; - asmjit::JitRuntime rt; - // typedef int (*LoadFunction)(firestarter::ThreadData *); - typedef unsigned long long (*LoadFunction)(unsigned long long *, - volatile unsigned long long *, - unsigned long long); - LoadFunction loadFunction = nullptr; - - asmjit::CpuFeatures const &supportedFeatures() const { - return this->_supportedFeatures; - } + /// The mapping from instructions to the number of flops per instruction. This map is required to have an entry for + /// every instruction. + std::map InstructionFlops; - template - void emitErrorDetectionCode(asmjit::x86::Builder &cb, IterReg iter_reg, - asmjit::x86::Gpq addrHigh_reg, - asmjit::x86::Gpq pointer_reg, - asmjit::x86::Gpq temp_reg, - asmjit::x86::Gpq temp_reg2); + /// The mapping from instructions to the size of main memory accesses for this instruction. This map is not required to + /// contain all instructions. + std::map InstructionMemory; public: - X86Payload(asmjit::CpuFeatures const &supportedFeatures, - std::initializer_list featureRequests, - std::string name, unsigned registerSize, unsigned registerCount) - : Payload(name, registerSize, registerCount), - _supportedFeatures(supportedFeatures), - featureRequests(featureRequests) {} - - bool isAvailable() const override { - bool available = true; - - for (auto const &feature : featureRequests) { - available &= this->_supportedFeatures.has(feature); + /// Abstract constructor for a payload on X86 CPUs. + /// \arg FeatureRequests The list of features (cpu extensions) that are required to execute the payload.
+ /// \arg Name The name of this payload. It is usually named by the CPU extension this payload uses e.g., SSE2 or FMA. + /// \arg RegisterSize The size of the SIMD registers in units of doubles (8B). + /// \arg RegisterCount The number of SIMD registers used by the payload. + /// \arg InstructionFlops The mapping from instructions to the number of flops per instruction. This map is required + /// to have an entry for every instruction. + /// \arg InstructionMemory The mapping from instructions to the size of main memory accesses for this instruction. This + /// map is not required to contain all instructions. + X86Payload(std::initializer_list FeatureRequests, std::string Name, + unsigned RegisterSize, unsigned RegisterCount, std::map&& InstructionFlops, + std::map&& InstructionMemory) noexcept + : Payload(std::move(Name), RegisterSize, RegisterCount) + , FeatureRequests(FeatureRequests) + , InstructionFlops(std::move(InstructionFlops)) + , InstructionMemory(std::move(InstructionMemory)) {} + +private: + /// Check if this payload is available on the current system. This is equivalent to checking if the supplied Topology + /// contains all features that are in FeatureRequests. + /// \arg Topology The CPUTopology against which the availability of this payload is checked. + /// \returns true if the payload is supported on the given CPUTopology.
+ [[nodiscard]] auto isAvailable(const CPUTopology& Topology) const -> bool final { + const auto* FinalTopology = dynamic_cast(&Topology); + assert(FinalTopology && "isAvailable not called with const X86CPUTopology*"); + + bool Available = true; + + for (auto const& Feature : FeatureRequests) { + Available &= FinalTopology->featuresAsmjit().has(Feature); } - return available; + return Available; }; - // A generic implemenation for all x86 payloads -#if defined(__clang__) -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Woverloaded-virtual" -#endif -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Woverloaded-virtual" - void init(unsigned long long *memoryAddr, unsigned long long bufferSize, - double firstValue, double lastValue); -#pragma GCC diagnostic pop -#if defined(__clang__) -#pragma clang diagnostic pop -#endif - // use cpuid and usleep as low load - void lowLoadFunction(volatile unsigned long long *addrHigh, - unsigned long long period) override; - - unsigned long long highLoadFunction(unsigned long long *addrMem, - volatile unsigned long long *addrHigh, - unsigned long long iterations) override; +protected: + /// Emit the code to dump the xmm, ymm or zmm registers into memory for the dump registers feature. + /// \tparam Vec the type of the vector register used. + /// \arg Cb The asmjit code builder that is used to emit the assembler code. + /// \arg PointerReg the register containing the pointer into memory in LoadWorkerMemory that is used in the high-load + /// routine. 
+ /// \arg VecPtr The function that is used to create a ptr to the vector register + template + void emitDumpRegisterCode(asmjit::x86::Builder& Cb, const asmjit::x86::Gpq& PointerReg, + asmjit::x86::Mem (*VecPtr)(const asmjit::x86::Gp&, int32_t)) const { + constexpr const auto DumpRegisterStructRegisterValuesTopOffset = + -static_cast(LoadWorkerMemory::getMemoryOffset()) + + static_cast(offsetof(LoadWorkerMemory, ExtraVars.Drs.Padding)); + constexpr const auto DumpRegisterStructDumpVariableOffset = + -static_cast(LoadWorkerMemory::getMemoryOffset()) + + static_cast(offsetof(LoadWorkerMemory, ExtraVars.Drs.DumpVar)); + + auto SkipRegistersDump = Cb.newLabel(); + + Cb.test(ptr_64(PointerReg, DumpRegisterStructDumpVariableOffset), asmjit::Imm(firestarter::DumpVariable::Wait)); + Cb.jnz(SkipRegistersDump); + + // dump all the vector registers + for (unsigned I = 0; I < registerCount(); I++) { + Cb.vmovapd(VecPtr(PointerReg, + DumpRegisterStructRegisterValuesTopOffset - static_cast(registerSize() * 8 * (I + 1))), + Vec(I)); + } + + // set read flag + Cb.mov(ptr_64(PointerReg, DumpRegisterStructDumpVariableOffset), asmjit::Imm(firestarter::DumpVariable::Wait)); + + Cb.bind(SkipRegistersDump); + } + + /// Emit the code to detect errors between this and two other threads that execute the same payload concurrently. We + /// backup the registers in Mm2...Mm7. We will check every 0x4000 iterations (low 14 bits of the counter are zero). If the check did not succeed we write + /// the LoadThreadWorkType::LoadStop flag in the AddrHighReg and therefore abort as soon as we pass the check in the + /// high-load routine. + /// \tparam MaybeConstIterRegT The type of the iteration register. If this is Mm, we assume that Mm0 is used by the + /// payload and the other Mm1...Mm7 are free to use. If they are free we will use them to backup rax, rbx, rcx, rdx, + /// r8 and r9. Otherwise we push them on the stack. + /// \tparam MaybeConstVectorRegT This is the type of the vector register.
It can be either Xmm, Ymm or Zmm. In case of + /// Xmm we backup xmm0 on the stack, in case of Ymm we backup ymm0 in Mm4...Mm7 and in case of Zmm we use zmm31 for + /// the backup. This register may not be used in the payload. + /// \arg Cb The asmjit code builder that is used to emit the assembler code. + /// \arg IterReg The register that holds the iteration counter of the high-load loop. + /// \arg AddrHighReg The register contains the pointer to the memory address where the LoadThreadWorkType is saved. + /// \arg PointerReg The register contains the pointer into memory in LoadWorkerMemory that is used in the high-load + /// routine. + /// \arg TempReg The first register that can be used to store temporary values. + /// \arg TempReg2 The second register that can be used to store temporary values. + template + void emitErrorDetectionCode(asmjit::x86::Builder& Cb, MaybeConstIterRegT& IterReg, + const asmjit::x86::Gpq& AddrHighReg, const asmjit::x86::Gpq& PointerReg, + const asmjit::x86::Gpq& TempReg, const asmjit::x86::Gpq& TempReg2) const { + using IterRegT = std::remove_const_t; + using VectorRegT = std::remove_const_t; + + // we don't want anything to break...
so we use asserts for everything that + // could break it + static_assert(std::is_base_of_v, "VectorReg must be of asmjit::asmjit::x86::Vec"); + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v, + "VectorReg ist not of any supported type"); + static_assert(std::is_same_v || std::is_same_v, + "IterReg is not of any supported type"); + + if constexpr (std::is_same_v) { + assert(IterReg == asmjit::x86::mm0 && "iter_reg must be mm0"); + } + + assert(IterReg != TempReg && "iter_reg must be != temp_reg"); + assert(TempReg != TempReg2 && "temp_reg must be != temp_reg2"); + assert(TempReg != AddrHighReg && "temp_reg must be != addrHigh_reg"); + assert(TempReg != PointerReg && "temp_reg must be != pointer_reg"); + + assert(IterReg != asmjit::x86::r8 && "iter_reg must be != r8"); + assert(IterReg != asmjit::x86::r9 && "iter_reg must be != r9"); + assert(IterReg != asmjit::x86::rax && "iter_reg must be != rax"); + assert(IterReg != asmjit::x86::rbx && "iter_reg must be != rbx"); + assert(IterReg != asmjit::x86::rcx && "iter_reg must be != rcx"); + assert(IterReg != asmjit::x86::rdx && "iter_reg must be != rdx"); + + assert(TempReg != asmjit::x86::r8 && "temp_reg must be != r8"); + assert(TempReg != asmjit::x86::r9 && "temp_reg must be != r9"); + assert(TempReg != asmjit::x86::rax && "temp_reg must be != rax"); + assert(TempReg != asmjit::x86::rbx && "temp_reg must be != rbx"); + assert(TempReg != asmjit::x86::rcx && "temp_reg must be != rcx"); + assert(TempReg != asmjit::x86::rdx && "temp_reg must be != rdx"); + + assert(TempReg2 != asmjit::x86::r8 && "temp_reg2 must be != r8"); + assert(TempReg2 != asmjit::x86::r9 && "temp_reg2 must be != r9"); + assert(TempReg2 != asmjit::x86::rax && "temp_reg2 must be != rax"); + assert(TempReg2 != asmjit::x86::rbx && "temp_reg2 must be != rbx"); + assert(TempReg2 != asmjit::x86::rcx && "temp_reg2 must be != rcx"); + assert(TempReg2 != asmjit::x86::rdx && "temp_reg2 must be != rdx"); + + assert(AddrHighReg != 
asmjit::x86::r8 && "addrHigh_reg must be != r8"); + assert(AddrHighReg != asmjit::x86::r9 && "addrHigh_reg must be != r9"); + assert(AddrHighReg != asmjit::x86::rax && "addrHigh_reg must be != rax"); + assert(AddrHighReg != asmjit::x86::rbx && "addrHigh_reg must be != rbx"); + assert(AddrHighReg != asmjit::x86::rcx && "addrHigh_reg must be != rcx"); + assert(AddrHighReg != asmjit::x86::rdx && "addrHigh_reg must be != rdx"); + + auto SkipErrorDetection = Cb.newLabel(); + + if constexpr (std::is_same_v) { + Cb.movq(TempReg, IterReg); + } else { + Cb.mov(TempReg, IterReg); + } + // round about 50-100 Hz + // more or less, but this isn't really that relevant + Cb.and_(TempReg, asmjit::Imm(0x3fff)); + Cb.test(TempReg, TempReg); + Cb.jnz(SkipErrorDetection); + + Cb.mov(TempReg, asmjit::Imm(0xffffffff)); + + auto RegisterCount = registerCount(); + + // Create a backup of VectorReg(0) + if constexpr (std::is_same_v) { + Cb.movq(TempReg2, asmjit::x86::xmm0); + Cb.push(TempReg2); + Cb.crc32(TempReg, TempReg2); + Cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0); + Cb.movq(TempReg2, asmjit::x86::xmm0); + Cb.push(TempReg2); + Cb.crc32(TempReg, TempReg2); + + } else if constexpr (std::is_same_v && std::is_same_v) { + Cb.movq(TempReg2, asmjit::x86::xmm0); + Cb.movq(asmjit::x86::Mm(7), TempReg2); + Cb.crc32(TempReg, TempReg2); + Cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0); + Cb.movq(TempReg2, asmjit::x86::xmm0); + Cb.movq(asmjit::x86::Mm(6), TempReg2); + Cb.crc32(TempReg, TempReg2); + + Cb.vextractf128(asmjit::x86::xmm0, asmjit::x86::ymm0, asmjit::Imm(1)); + + Cb.movq(TempReg2, asmjit::x86::xmm0); + Cb.movq(asmjit::x86::Mm(5), TempReg2); + Cb.crc32(TempReg, TempReg2); + Cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0); + Cb.movq(TempReg2, asmjit::x86::xmm0); + Cb.movq(asmjit::x86::Mm(4), TempReg2); + Cb.crc32(TempReg, TempReg2); + } else if constexpr (std::is_same_v && std::is_same_v) { + // We use vector registers zmm31 for our backup + Cb.vmovapd(asmjit::x86::zmm31, 
asmjit::x86::zmm0); + RegisterCount--; + } + + // Calculate the hash of the remaining VectorReg + // use VectorReg(0) as a temporary place to unpack values + for (unsigned I = 1; I < RegisterCount; I++) { + if constexpr (std::is_same_v) { + Cb.vmovapd(asmjit::x86::xmm0, asmjit::x86::Xmm(I)); + + Cb.movq(TempReg2, asmjit::x86::xmm0); + Cb.crc32(TempReg, TempReg2); + Cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0); + Cb.movq(TempReg2, asmjit::x86::xmm0); + Cb.crc32(TempReg, TempReg2); + } else if constexpr (std::is_same_v) { + Cb.vmovapd(asmjit::x86::ymm0, asmjit::x86::Ymm(I)); + + Cb.movq(TempReg2, asmjit::x86::xmm0); + Cb.crc32(TempReg, TempReg2); + Cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0); + Cb.movq(TempReg2, asmjit::x86::xmm0); + Cb.crc32(TempReg, TempReg2); + + Cb.vextractf128(asmjit::x86::xmm0, asmjit::x86::ymm0, asmjit::Imm(1)); + + Cb.movq(TempReg2, asmjit::x86::xmm0); + Cb.crc32(TempReg, TempReg2); + Cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0); + Cb.movq(TempReg2, asmjit::x86::xmm0); + Cb.crc32(TempReg, TempReg2); + } else if constexpr (std::is_same_v) { + Cb.vmovapd(asmjit::x86::ymm0, asmjit::x86::Ymm(I)); + + Cb.movq(TempReg2, asmjit::x86::xmm0); + Cb.crc32(TempReg, TempReg2); + Cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0); + Cb.movq(TempReg2, asmjit::x86::xmm0); + Cb.crc32(TempReg, TempReg2); + + Cb.vextractf128(asmjit::x86::xmm0, asmjit::x86::ymm0, asmjit::Imm(1)); + + Cb.movq(TempReg2, asmjit::x86::xmm0); + Cb.crc32(TempReg, TempReg2); + Cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0); + Cb.movq(TempReg2, asmjit::x86::xmm0); + Cb.crc32(TempReg, TempReg2); + + Cb.vextractf32x4(asmjit::x86::xmm0, asmjit::x86::Zmm(I), asmjit::Imm(2)); + + Cb.movq(TempReg2, asmjit::x86::xmm0); + Cb.crc32(TempReg, TempReg2); + Cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0); + Cb.movq(TempReg2, asmjit::x86::xmm0); + Cb.crc32(TempReg, TempReg2); + + Cb.vextractf32x4(asmjit::x86::xmm0, asmjit::x86::Zmm(I), asmjit::Imm(3)); + + Cb.movq(TempReg2, 
asmjit::x86::xmm0); + Cb.crc32(TempReg, TempReg2); + Cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0); + Cb.movq(TempReg2, asmjit::x86::xmm0); + Cb.crc32(TempReg, TempReg2); + } + } + + // Restore VectorReg(0) from backup + if constexpr (std::is_same_v) { + Cb.pop(TempReg2); + Cb.movq(asmjit::x86::xmm0, TempReg2); + Cb.movlhps(asmjit::x86::xmm0, asmjit::x86::xmm0); + Cb.pop(TempReg2); + Cb.pinsrw(asmjit::x86::xmm0, TempReg2.r32(), asmjit::Imm(0)); + Cb.shr(TempReg2, asmjit::Imm(32)); + Cb.movd(TempReg2.r32(), asmjit::x86::Mm(7)); + Cb.pinsrw(asmjit::x86::xmm0, TempReg2.r32(), asmjit::Imm(1)); + } else if constexpr (std::is_same_v && std::is_same_v) { + Cb.movq(TempReg2, asmjit::x86::Mm(5)); + Cb.movq(asmjit::x86::xmm0, TempReg2); + Cb.movq(TempReg2, asmjit::x86::Mm(4)); + Cb.pinsrq(asmjit::x86::xmm0, TempReg2, asmjit::Imm(1)); + + Cb.vinsertf128(asmjit::x86::ymm0, asmjit::x86::ymm0, asmjit::x86::xmm0, asmjit::Imm(1)); + + Cb.movq(TempReg2, asmjit::x86::Mm(7)); + Cb.movq(asmjit::x86::xmm0, TempReg2); + Cb.movq(TempReg2, asmjit::x86::Mm(6)); + Cb.pinsrq(asmjit::x86::xmm0, TempReg2, asmjit::Imm(1)); + } else if constexpr (std::is_same_v && std::is_same_v) { + // We use vector registers zmm31 for our backup + Cb.vmovapd(asmjit::x86::zmm0, asmjit::x86::zmm31); + } + + // before starting the communication, backup r8, r9, rax, rbx, rcx and rdx + if constexpr (std::is_same_v) { + Cb.movq(asmjit::x86::Mm(7), asmjit::x86::rax); + Cb.movq(asmjit::x86::Mm(6), asmjit::x86::rbx); + Cb.movq(asmjit::x86::Mm(5), asmjit::x86::rcx); + Cb.movq(asmjit::x86::Mm(4), asmjit::x86::rdx); + Cb.movq(asmjit::x86::Mm(3), asmjit::x86::r8); + Cb.movq(asmjit::x86::Mm(2), asmjit::x86::r9); + } else { + Cb.push(asmjit::x86::rax); + Cb.push(asmjit::x86::rbx); + Cb.push(asmjit::x86::rcx); + Cb.push(asmjit::x86::rdx); + Cb.push(asmjit::x86::r8); + Cb.push(asmjit::x86::r9); + } + + // do the actual communication + // temp_reg contains our hash + + // save the pointer_reg. 
it might be any of r8, r9, rax, rbx, rcx or rdx + Cb.mov(TempReg2, PointerReg); + + // Don't touch me! + // This synchronization and communication works even if the threads run at + // different (changing) speed, with just one "lock cmpxchg16b". Brought to you + // by a few hours of headache for two people. + auto Communication = [&](const int32_t ErrorDetetectionStructOffset) { + const auto CommunicationOffset = + ErrorDetetectionStructOffset + static_cast(offsetof(ErrorDetectionStruct::OneSide, Communication)); + const auto Local0Offset = + ErrorDetetectionStructOffset + static_cast(offsetof(ErrorDetectionStruct::OneSide, Locals[0])); + const auto Local1Offset = + ErrorDetetectionStructOffset + static_cast(offsetof(ErrorDetectionStruct::OneSide, Locals[1])); + const auto Local2Offset = + ErrorDetetectionStructOffset + static_cast(offsetof(ErrorDetectionStruct::OneSide, Locals[2])); + const auto Local3Offset = + ErrorDetetectionStructOffset + static_cast(offsetof(ErrorDetectionStruct::OneSide, Locals[3])); + const auto ErrorOffset = + ErrorDetetectionStructOffset + static_cast(offsetof(ErrorDetectionStruct::OneSide, Error)); + + // communication + Cb.mov(asmjit::x86::r8, asmjit::x86::ptr_64(TempReg2, CommunicationOffset)); + + // temp data + Cb.mov(asmjit::x86::r9, TempReg2); + + Cb.mov(asmjit::x86::rdx, asmjit::x86::ptr_64(asmjit::x86::r9, Local0Offset)); + Cb.mov(asmjit::x86::rax, asmjit::x86::ptr_64(asmjit::x86::r9, Local1Offset)); + + auto L0 = Cb.newLabel(); + Cb.bind(L0); + + // Atomically compare the data in the communication with the local data. + Cb.lock(); + Cb.cmpxchg16b(asmjit::x86::ptr(asmjit::x86::r8)); + + auto L1 = Cb.newLabel(); + Cb.jnz(L1); + + // Communication had the same data as saved in locals 0 and 1. rcx, rbx saved in communication. + // Save written data rcx, rbx in locals 0 and 1.
+ Cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, Local0Offset), asmjit::x86::rcx); + Cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, Local1Offset), asmjit::x86::rbx); + Cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, Local2Offset), asmjit::Imm(0)); + Cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, Local3Offset), asmjit::Imm(0)); + + Cb.mov(asmjit::x86::rax, asmjit::Imm(2)); + + auto L6 = Cb.newLabel(); + Cb.jmp(L6); + + Cb.bind(L1); + + // Communication had different data than saved in locals 0 and 1. rdx, rax contain the data in communication. + // Compare the iteration counter of this and the other thread + Cb.cmp(asmjit::x86::rcx, asmjit::x86::rdx); + + auto L2 = Cb.newLabel(); + Cb.jle(L2); + + // The current iteration counter is bigger than the counter of the other thread. + // Save the current counter and hash into our local storage. + Cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, Local0Offset), asmjit::x86::rcx); + Cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, Local1Offset), asmjit::x86::rbx); + + // Repeat the lock cmpxchg16b routine until the other thread catches up. + Cb.jmp(L0); + + Cb.bind(L2); + + // The current iteration counter is less than or equal to the iteration counter of the other thread. + + auto L3 = Cb.newLabel(); + + // Check if the read value from the other thread is saved locally. + Cb.cmp(asmjit::x86::ptr_64(asmjit::x86::r9, Local2Offset), asmjit::Imm(0)); + Cb.jne(L3); + Cb.cmp(asmjit::x86::ptr_64(asmjit::x86::r9, Local3Offset), asmjit::Imm(0)); + Cb.jne(L3); + + // Save the last read value from the other thread into the local storage. + Cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, Local2Offset), asmjit::x86::rdx); + Cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, Local3Offset), asmjit::x86::rax); + + Cb.bind(L3); + + // Check if the iteration counters of the two threads are equal + Cb.cmp(asmjit::x86::rcx, asmjit::x86::ptr_64(asmjit::x86::r9, Local2Offset)); + Cb.mov(asmjit::x86::rax, asmjit::Imm(4)); + // If the iteration counter of this thread is smaller, skip this check.
The other thread will wait for this one. + Cb.jne(L6); + + // Compare the hashes and write the result + Cb.cmp(asmjit::x86::rbx, asmjit::x86::ptr_64(asmjit::x86::r9, Local3Offset)); + auto L4 = Cb.newLabel(); + Cb.jne(L4); + + // Hash check succeeded. + Cb.mov(asmjit::x86::rax, asmjit::Imm(0)); + + auto L5 = Cb.newLabel(); + Cb.jmp(L5); + + Cb.bind(L4); + + // Hash check failed + Cb.mov(asmjit::x86::rax, asmjit::Imm(1)); + + Cb.bind(L5); + + Cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, Local2Offset), asmjit::Imm(0)); + Cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, Local3Offset), asmjit::Imm(0)); + + Cb.bind(L6); + + // if check failed + Cb.cmp(asmjit::x86::rax, asmjit::Imm(1)); + auto L7 = Cb.newLabel(); + Cb.jne(L7); + + // write the error flag + Cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, ErrorOffset), asmjit::Imm(1)); + + // stop the execution after some time + Cb.mov(asmjit::x86::ptr_64(AddrHighReg), asmjit::Imm(LoadThreadWorkType::LoadStop)); + Cb.mfence(); + + Cb.bind(L7); + + auto L9 = Cb.newLabel(); + Cb.jmp(L9); + }; + + constexpr const auto ErrorDetectionStructCommunicationLeftOffset = + -static_cast(LoadWorkerMemory::getMemoryOffset()) + + static_cast(offsetof(LoadWorkerMemory, ExtraVars.Eds.Left.Communication)); + constexpr const auto ErrorDetectionStructCommunicationRightOffset = + -static_cast(LoadWorkerMemory::getMemoryOffset()) + + static_cast(offsetof(LoadWorkerMemory, ExtraVars.Eds.Right.Communication)); + + // left communication + // move hash + Cb.mov(asmjit::x86::rbx, TempReg); + // move iterations counter + if constexpr (std::is_same_v) { + Cb.movq(asmjit::x86::rcx, IterReg); + } else { + Cb.mov(asmjit::x86::rcx, IterReg); + } + + Communication(ErrorDetectionStructCommunicationLeftOffset); + + // right communication + // move hash + Cb.mov(asmjit::x86::rbx, TempReg); + // move iterations counter + if constexpr (std::is_same_v) { + Cb.movq(asmjit::x86::rcx, IterReg); + } else { + Cb.mov(asmjit::x86::rcx, IterReg); + } + +
Communication(ErrorDetectionStructCommunicationRightOffset); + + // restore r8, r9, rax, rbx, rcx and rdx + if constexpr (std::is_same_v) { + Cb.movq(asmjit::x86::rax, asmjit::x86::Mm(7)); + Cb.movq(asmjit::x86::rbx, asmjit::x86::Mm(6)); + Cb.movq(asmjit::x86::rcx, asmjit::x86::Mm(5)); + Cb.movq(asmjit::x86::rdx, asmjit::x86::Mm(4)); + Cb.movq(asmjit::x86::r8, asmjit::x86::Mm(3)); + Cb.movq(asmjit::x86::r9, asmjit::x86::Mm(2)); + } else { + Cb.pop(asmjit::x86::r9); + Cb.pop(asmjit::x86::r8); + Cb.pop(asmjit::x86::rdx); + Cb.pop(asmjit::x86::rcx); + Cb.pop(asmjit::x86::rbx); + Cb.pop(asmjit::x86::rax); + } + + Cb.bind(SkipErrorDetection); + } + + static void initMemory(double* MemoryAddr, uint64_t BufferSize, double FirstValue, double LastValue); + + /// Function to produce a low load on the cpu. + /// \arg LoadVar The variable that controls the load. If this variable changes from LoadThreadWorkType::LowLoad to + /// something else this function will return. + /// \arg Period The period of the low/high load switching. This function will sleep 1% of the Period and check if the + /// LoadVar changed. + void lowLoadFunction(volatile LoadThreadWorkType& LoadVar, std::chrono::microseconds Period) const final; + + /// Get the available instruction items that are supported by this payload. + /// \returns The available instruction items that are supported by this payload. + [[nodiscard]] auto getAvailableInstructions() const -> std::list final; + + /// Get the mapping from instructions to the number of flops per instruction. This map is required to have an entry + /// for every instruction. + [[nodiscard]] auto instructionFlops() const -> const auto& { return InstructionFlops; } + + /// Get the mapping from instructions to the size of main memory accesses for this instruction. This map is not + /// required to contain all instructions.
+ [[nodiscard]] auto instructionMemory() const -> const auto& { return InstructionMemory; } }; } // namespace firestarter::environment::x86::payload diff --git a/include/firestarter/Environment/X86/Payload/ZENFMAPayload.hpp b/include/firestarter/Environment/X86/Payload/ZENFMAPayload.hpp index a1776f37..5d624725 100644 --- a/include/firestarter/Environment/X86/Payload/ZENFMAPayload.hpp +++ b/include/firestarter/Environment/X86/Payload/ZENFMAPayload.hpp @@ -21,35 +21,34 @@ #pragma once -#include +#include "firestarter/Environment/X86/Payload/X86Payload.hpp" namespace firestarter::environment::x86::payload { + +/// This payload is designed for the FMA CPU extension in combination with the first generation Zen microarchitecture. class ZENFMAPayload final : public X86Payload { public: - ZENFMAPayload(asmjit::CpuFeatures const &supportedFeatures) - : X86Payload( - supportedFeatures, - {asmjit::CpuFeatures::X86::Id::kAVX, asmjit::CpuFeatures::X86::Id::kFMA}, - "ZENFMA", 4, 16) {} - - int compilePayload( - std::vector> const &proportion, - unsigned instructionCacheSize, - std::list const &dataCacheBufferSize, unsigned ramBufferSize, - unsigned thread, unsigned numberOfLines, bool dumpRegisters, - bool errorDetection) override; - std::list getAvailableInstructions() const override; - void init(unsigned long long *memoryAddr, - unsigned long long bufferSize) override; + ZENFMAPayload() noexcept + : X86Payload(/*FeatureRequests=*/{asmjit::CpuFeatures::X86::Id::kAVX, asmjit::CpuFeatures::X86::Id::kFMA}, + /*Name=*/"ZENFMA", /*RegisterSize=*/4, /*RegisterCount=*/16, + /*InstructionFlops=*/{{"REG", 8}, {"L1_LS", 8}, {"L2_L", 8}, {"L3_L", 8}, {"RAM_L", 8}}, + /*InstructionMemory=*/{{"RAM_L", 64}}) {} - firestarter::environment::payload::Payload *clone() const override { - return new ZENFMAPayload(this->supportedFeatures()); - }; + /// Compile this payload with supplied settings and optional features. 
+ /// \arg Settings The settings for this payload e.g., the number of lines or the size of the caches. + /// \arg DumpRegisters Should the code to support dumping registers be baked into the high load routine of the + /// compiled payload. + /// \arg ErrorDetection Should the code to support error detection between threads be baked into the high load routine + /// of the compiled payload. + /// \returns The compiled payload that provides access to the init and load functions. + [[nodiscard]] auto compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters, + bool ErrorDetection) const + -> environment::payload::CompiledPayload::UniquePtr override; private: - const std::map instructionFlops = { - {"REG", 8}, {"L1_LS", 8}, {"L2_L", 8}, {"L3_L", 8}, {"RAM_L", 8}}; - - const std::map instructionMemory = {{"RAM_L", 64}}; + /// Function to initialize the memory used by the high load function. + /// \arg MemoryAddr The pointer to the memory. + /// \arg BufferSize The number of doubles that are allocated in MemoryAddr.
+ void init(double* MemoryAddr, uint64_t BufferSize) const override; }; } // namespace firestarter::environment::x86::payload diff --git a/include/firestarter/Environment/X86/Platform/BulldozerConfig.hpp b/include/firestarter/Environment/X86/Platform/BulldozerConfig.hpp index 12a922b9..936b3601 100644 --- a/include/firestarter/Environment/X86/Platform/BulldozerConfig.hpp +++ b/include/firestarter/Environment/X86/Platform/BulldozerConfig.hpp @@ -21,24 +21,20 @@ #pragma once -#include -#include +#include "firestarter/Environment/X86/Payload/FMA4Payload.hpp" +#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp" namespace firestarter::environment::x86::platform { class BulldozerConfig final : public X86PlatformConfig { - public: - BulldozerConfig(asmjit::CpuFeatures const &supportedFeatures, - unsigned family, unsigned model, unsigned threads) - : X86PlatformConfig("BLD_OPTERON", 21, {1, 2, 3}, {1}, 0, - {16384, 1048576, 786432}, 104857600, 1536, family, - model, threads, - new payload::FMA4Payload(supportedFeatures)) {} - - std::vector> - getDefaultPayloadSettings() const override { - return std::vector>( - {{"RAM_L", 1}, {"L3_L", 1}, {"L2_LS", 5}, {"L1_L", 90}, {"REG", 45}}); - } + BulldozerConfig() noexcept + : X86PlatformConfig( + /*Name=*/"BLD_OPTERON", /*Family=*/21, /*Models=*/{1, 2, 3}, + /*Settings=*/ + environment::payload::PayloadSettings( + /*Threads=*/{1}, /*DataCacheBufferSize=*/{16384, 1048576, 786432}, /*RamBufferSize=*/104857600, + /*Lines=*/1536, + /*InstructionGroups=*/{{"RAM_L", 1}, {"L3_L", 1}, {"L2_LS", 5}, {"L1_L", 90}, {"REG", 45}}), + /*Payload=*/std::make_shared()) {} }; } // namespace firestarter::environment::x86::platform diff --git a/include/firestarter/Environment/X86/Platform/HaswellConfig.hpp b/include/firestarter/Environment/X86/Platform/HaswellConfig.hpp index f079ec18..768d3597 100644 --- a/include/firestarter/Environment/X86/Platform/HaswellConfig.hpp +++ 
b/include/firestarter/Environment/X86/Platform/HaswellConfig.hpp @@ -21,24 +21,20 @@ #pragma once -#include -#include +#include "firestarter/Environment/X86/Payload/FMAPayload.hpp" +#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp" namespace firestarter::environment::x86::platform { class HaswellConfig final : public X86PlatformConfig { - public: - HaswellConfig(asmjit::CpuFeatures const &supportedFeatures, unsigned family, - unsigned model, unsigned threads) - : X86PlatformConfig("HSW_COREI", 6, {60, 61, 69, 70, 71}, {1, 2}, 0, - {32768, 262144, 1572864}, 104857600, 1536, family, - model, threads, - new payload::FMAPayload(supportedFeatures)) {} - - std::vector> - getDefaultPayloadSettings() const override { - return std::vector>( - {{"RAM_L", 2}, {"L3_LS", 3}, {"L2_LS", 9}, {"L1_LS", 90}, {"REG", 40}}); - } + HaswellConfig() noexcept + : X86PlatformConfig( + /*Name=*/"HSW_COREI", /*Family=*/6, /*Models=*/{60, 61, 69, 70, 71}, + /*Settings=*/ + environment::payload::PayloadSettings( + /*Threads=*/{1, 2}, /*DataCacheBufferSize=*/{32768, 262144, 1572864}, /*RamBufferSize=*/104857600, + /*Lines=*/1536, + /*InstructionGroups=*/{{"RAM_L", 2}, {"L3_LS", 3}, {"L2_LS", 9}, {"L1_LS", 90}, {"REG", 40}}), + /*Payload=*/std::make_shared()) {} }; } // namespace firestarter::environment::x86::platform diff --git a/include/firestarter/Environment/X86/Platform/HaswellEPConfig.hpp b/include/firestarter/Environment/X86/Platform/HaswellEPConfig.hpp index df5a1927..23d2518f 100644 --- a/include/firestarter/Environment/X86/Platform/HaswellEPConfig.hpp +++ b/include/firestarter/Environment/X86/Platform/HaswellEPConfig.hpp @@ -21,27 +21,20 @@ #pragma once -#include -#include +#include "firestarter/Environment/X86/Payload/FMAPayload.hpp" +#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp" namespace firestarter::environment::x86::platform { class HaswellEPConfig final : public X86PlatformConfig { - public: - HaswellEPConfig(asmjit::CpuFeatures const 
&supportedFeatures, - unsigned family, unsigned model, unsigned threads) - : X86PlatformConfig("HSW_XEONEP", 6, {63, 79}, {1, 2}, 0, - {32768, 262144, 2621440}, 104857600, 1536, family, - model, threads, - new payload::FMAPayload(supportedFeatures)) {} - - std::vector> - getDefaultPayloadSettings() const override { - return std::vector>({{"RAM_L", 8}, - {"L3_LS", 1}, - {"L2_LS", 29}, - {"L1_LS", 100}, - {"REG", 100}}); - } + HaswellEPConfig() noexcept + : X86PlatformConfig( + /*Name=*/"HSW_XEONEP", /*Family=*/6, /*Models=*/{63, 79}, + /*Settings=*/ + environment::payload::PayloadSettings( + /*Threads=*/{1, 2}, /*DataCacheBufferSize=*/{32768, 262144, 2621440}, + /*RamBufferSize=*/104857600, /*Lines=*/1536, + /*InstructionGroups=*/{{"RAM_L", 8}, {"L3_LS", 1}, {"L2_LS", 29}, {"L1_LS", 100}, {"REG", 100}}), + /*Payload=*/std::make_shared()) {} }; } // namespace firestarter::environment::x86::platform diff --git a/include/firestarter/Environment/X86/Platform/KnightsLandingConfig.hpp b/include/firestarter/Environment/X86/Platform/KnightsLandingConfig.hpp index de520c56..f849c07b 100644 --- a/include/firestarter/Environment/X86/Platform/KnightsLandingConfig.hpp +++ b/include/firestarter/Environment/X86/Platform/KnightsLandingConfig.hpp @@ -21,24 +21,19 @@ #pragma once -#include -#include +#include "firestarter/Environment/X86/Payload/AVX512Payload.hpp" +#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp" namespace firestarter::environment::x86::platform { class KnightsLandingConfig final : public X86PlatformConfig { - public: - KnightsLandingConfig(asmjit::CpuFeatures const &supportedFeatures, - unsigned family, unsigned model, unsigned threads) - : X86PlatformConfig("KNL_XEONPHI", 6, {87}, {4}, 0, - {32768, 524288, 236279125}, 26214400, 1536, family, - model, threads, - new payload::AVX512Payload(supportedFeatures)) {} - - std::vector> - getDefaultPayloadSettings() const override { - return std::vector>( - {{"RAM_P", 3}, {"L2_S", 8}, {"L1_L", 40}, 
{"REG", 10}}); - } + KnightsLandingConfig() noexcept + : X86PlatformConfig(/*Name=*/"KNL_XEONPHI", /*Family=*/6, /*Models=*/{87}, + /*Settings=*/ + environment::payload::PayloadSettings( + /*Threads=*/{4}, /*DataCacheBufferSize=*/{32768, 524288, 236279125}, + /*RamBufferSize=*/26214400, /*Lines=*/1536, + /*InstructionGroups=*/{{"RAM_P", 3}, {"L2_S", 8}, {"L1_L", 40}, {"REG", 10}}), + /*Payload=*/std::make_shared()) {} }; } // namespace firestarter::environment::x86::platform diff --git a/include/firestarter/Environment/X86/Platform/NaplesConfig.hpp b/include/firestarter/Environment/X86/Platform/NaplesConfig.hpp index 0ad94682..abef11da 100644 --- a/include/firestarter/Environment/X86/Platform/NaplesConfig.hpp +++ b/include/firestarter/Environment/X86/Platform/NaplesConfig.hpp @@ -21,27 +21,20 @@ #pragma once -#include -#include +#include "firestarter/Environment/X86/Payload/ZENFMAPayload.hpp" +#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp" namespace firestarter::environment::x86::platform { class NaplesConfig final : public X86PlatformConfig { - public: - NaplesConfig(asmjit::CpuFeatures const &supportedFeatures, unsigned family, - unsigned model, unsigned threads) - : X86PlatformConfig("ZEN_EPYC", 23, {1, 8, 17, 24}, {1, 2}, 0, - {65536, 524288, 2097152}, 104857600, 1536, family, - model, threads, - new payload::ZENFMAPayload(supportedFeatures)) {} - - std::vector> - getDefaultPayloadSettings() const override { - return std::vector>({{"RAM_L", 3}, - {"L3_L", 14}, - {"L2_L", 75}, - {"L1_LS", 81}, - {"REG", 100}}); - } + NaplesConfig() noexcept + : X86PlatformConfig( + /*Name=*/"ZEN_EPYC", /*Family=*/23, /*Models=*/{1, 8, 17, 24}, + /*Settings=*/ + environment::payload::PayloadSettings( + /*Threads=*/{1, 2}, /*DataCacheBufferSize=*/{65536, 524288, 2097152}, /*RamBufferSize=*/104857600, + /*Lines=*/1536, + /*InstructionGroups=*/{{"RAM_L", 3}, {"L3_L", 14}, {"L2_L", 75}, {"L1_LS", 81}, {"REG", 100}}), + /*Payload=*/std::make_shared()) {} }; } // 
namespace firestarter::environment::x86::platform diff --git a/include/firestarter/Environment/X86/Platform/NehalemConfig.hpp b/include/firestarter/Environment/X86/Platform/NehalemConfig.hpp index da7764d4..31374061 100644 --- a/include/firestarter/Environment/X86/Platform/NehalemConfig.hpp +++ b/include/firestarter/Environment/X86/Platform/NehalemConfig.hpp @@ -21,24 +21,19 @@ #pragma once -#include -#include +#include "firestarter/Environment/X86/Payload/SSE2Payload.hpp" +#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp" namespace firestarter::environment::x86::platform { class NehalemConfig final : public X86PlatformConfig { - public: - NehalemConfig(asmjit::CpuFeatures const &supportedFeatures, unsigned family, - unsigned model, unsigned threads) - : X86PlatformConfig("NHM_COREI", 6, {30, 37, 23}, {1, 2}, 0, - {32768, 262144, 1572864}, 104857600, 1536, family, - model, threads, - new payload::SSE2Payload(supportedFeatures)) {} - - std::vector> - getDefaultPayloadSettings() const override { - return std::vector>( - {{"RAM_P", 1}, {"L1_LS", 70}, {"REG", 2}}); - } + NehalemConfig() noexcept + : X86PlatformConfig( + /*Name=*/"NHM_COREI", /*Family=*/6, /*Models=*/{30, 37, 23}, + /*Settings=*/ + environment::payload::PayloadSettings(/*Threads=*/{1, 2}, /*DataCacheBufferSize=*/{32768, 262144, 1572864}, + /*RamBufferSize=*/104857600, /*Lines=*/1536, + /*InstructionGroups=*/{{"RAM_P", 1}, {"L1_LS", 70}, {"REG", 2}}), + /*Payload=*/std::make_shared()) {} }; } // namespace firestarter::environment::x86::platform diff --git a/include/firestarter/Environment/X86/Platform/NehalemEPConfig.hpp b/include/firestarter/Environment/X86/Platform/NehalemEPConfig.hpp index 06ac2f64..9a6a08bb 100644 --- a/include/firestarter/Environment/X86/Platform/NehalemEPConfig.hpp +++ b/include/firestarter/Environment/X86/Platform/NehalemEPConfig.hpp @@ -21,24 +21,19 @@ #pragma once -#include -#include +#include "firestarter/Environment/X86/Payload/SSE2Payload.hpp" +#include 
"firestarter/Environment/X86/Platform/X86PlatformConfig.hpp" namespace firestarter::environment::x86::platform { class NehalemEPConfig final : public X86PlatformConfig { - public: - NehalemEPConfig(asmjit::CpuFeatures const &supportedFeatures, - unsigned family, unsigned model, unsigned threads) - : X86PlatformConfig("NHM_XEONEP", 6, {26, 44}, {1, 2}, 0, - {32768, 262144, 2097152}, 104857600, 1536, family, - model, threads, - new payload::SSE2Payload(supportedFeatures)) {} - - std::vector> - getDefaultPayloadSettings() const override { - return std::vector>( - {{"RAM_P", 1}, {"L1_LS", 60}, {"REG", 2}}); - } + NehalemEPConfig() noexcept + : X86PlatformConfig(/*Name=*/"NHM_XEONEP", /*Family=*/6, /*Models=*/{26, 44}, + /*Settings=*/ + environment::payload::PayloadSettings( + /*Threads=*/{1, 2}, /*DataCacheBufferSize=*/{32768, 262144, 2097152}, + /*RamBufferSize=*/104857600, /*Lines=*/1536, + /*InstructionGroups=*/{{"RAM_P", 1}, {"L1_LS", 60}, {"REG", 2}}), + /*Payload=*/std::make_shared()) {} }; } // namespace firestarter::environment::x86::platform diff --git a/include/firestarter/Environment/X86/Platform/RomeConfig.hpp b/include/firestarter/Environment/X86/Platform/RomeConfig.hpp index f7569bf4..e70161d7 100644 --- a/include/firestarter/Environment/X86/Platform/RomeConfig.hpp +++ b/include/firestarter/Environment/X86/Platform/RomeConfig.hpp @@ -21,28 +21,21 @@ #pragma once -#include -#include +#include "firestarter/Environment/X86/Payload/FMAPayload.hpp" +#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp" namespace firestarter::environment::x86::platform { class RomeConfig final : public X86PlatformConfig { - public: - RomeConfig(asmjit::CpuFeatures const &supportedFeatures, unsigned family, - unsigned model, unsigned threads) - : X86PlatformConfig("ZEN_2_EPYC", 23, {49}, {1, 2}, 0, - {32768, 524288, 2097152}, 104857600, 1536, family, - model, threads, - new payload::FMAPayload(supportedFeatures)) {} - - std::vector> - getDefaultPayloadSettings() 
const override { - return std::vector>({{"RAM_L", 10}, - {"L3_L", 25}, - {"L2_L", 91}, - {"L1_2LS_256", 72}, - {"L1_LS_256", 82}, - {"REG", 75}}); - } + RomeConfig() noexcept + : X86PlatformConfig( + /*Name=*/"ZEN_2_EPYC", /*Family=*/23, /*Models=*/{49}, + /*Settings=*/ + environment::payload::PayloadSettings( + /*Threads=*/{1, 2}, /*DataCacheBufferSize=*/{32768, 524288, 2097152}, /*RamBufferSize=*/104857600, + /*Lines=*/1536, + /*InstructionGroups=*/ + {{"RAM_L", 10}, {"L3_L", 25}, {"L2_L", 91}, {"L1_2LS_256", 72}, {"L1_LS_256", 82}, {"REG", 75}}), + /*Payload=*/std::make_shared()) {} }; } // namespace firestarter::environment::x86::platform diff --git a/include/firestarter/Environment/X86/Platform/SandyBridgeConfig.hpp b/include/firestarter/Environment/X86/Platform/SandyBridgeConfig.hpp index 7e928c1f..b5c5b1c4 100644 --- a/include/firestarter/Environment/X86/Platform/SandyBridgeConfig.hpp +++ b/include/firestarter/Environment/X86/Platform/SandyBridgeConfig.hpp @@ -21,27 +21,20 @@ #pragma once -#include -#include +#include "firestarter/Environment/X86/Payload/AVXPayload.hpp" +#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp" namespace firestarter::environment::x86::platform { class SandyBridgeConfig final : public X86PlatformConfig { - public: - SandyBridgeConfig(asmjit::CpuFeatures const &supportedFeatures, - unsigned family, unsigned model, unsigned threads) - : X86PlatformConfig("SNB_COREI", 6, {42, 58}, {1, 2}, 0, - {32768, 262144, 1572864}, 104857600, 1536, family, - model, threads, - new payload::AVXPayload(supportedFeatures)) {} - - std::vector> - getDefaultPayloadSettings() const override { - return std::vector>({{"RAM_L", 2}, - {"L3_LS", 4}, - {"L2_LS", 10}, - {"L1_LS", 90}, - {"REG", 45}}); - } + SandyBridgeConfig() noexcept + : X86PlatformConfig( + /*Name=*/"SNB_COREI", /*Family=*/6, /*Models=*/{42, 58}, + /*Settings=*/ + environment::payload::PayloadSettings( + /*Threads=*/{1, 2}, /*DataCacheBufferSize=*/{32768, 262144, 1572864}, 
/*RamBufferSize=*/104857600, + /*Lines=*/1536, + /*InstructionGroups=*/{{"RAM_L", 2}, {"L3_LS", 4}, {"L2_LS", 10}, {"L1_LS", 90}, {"REG", 45}}), + /*Payload=*/std::make_shared()) {} }; } // namespace firestarter::environment::x86::platform diff --git a/include/firestarter/Environment/X86/Platform/SandyBridgeEPConfig.hpp b/include/firestarter/Environment/X86/Platform/SandyBridgeEPConfig.hpp index cb7fcb43..67048ba5 100644 --- a/include/firestarter/Environment/X86/Platform/SandyBridgeEPConfig.hpp +++ b/include/firestarter/Environment/X86/Platform/SandyBridgeEPConfig.hpp @@ -19,32 +19,22 @@ * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ -#ifndef INCLUDE_FIRESTARTER_ENVIRONMENT_X86_PLATFORM_SANDYBRIDGEEPCONFIG_H -#define INCLUDE_FIRESTARTER_ENVIRONMENT_X86_PLATFORM_SANDYBRIDGEEPCONFIG_H +#pragma once -#include -#include +#include "firestarter/Environment/X86/Payload/AVXPayload.hpp" +#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp" namespace firestarter::environment::x86::platform { class SandyBridgeEPConfig final : public X86PlatformConfig { - public: - SandyBridgeEPConfig(asmjit::CpuFeatures const &supportedFeatures, - unsigned family, unsigned model, unsigned threads) - : X86PlatformConfig("SNB_XEONEP", 6, {45, 62}, {1, 2}, 0, - {32768, 262144, 2621440}, 104857600, 1536, family, - model, threads, - new payload::AVXPayload(supportedFeatures)) {} - - std::vector> - getDefaultPayloadSettings() const override { - return std::vector>({{"RAM_L", 3}, - {"L3_LS", 2}, - {"L2_LS", 10}, - {"L1_LS", 90}, - {"REG", 30}}); - } + SandyBridgeEPConfig() noexcept + : X86PlatformConfig( + /*Name=*/"SNB_XEONEP", /*Family=*/6, /*Models=*/{45, 62}, + /*Settings=*/ + environment::payload::PayloadSettings( + /*Threads=*/{1, 2}, /*DataCacheBufferSize=*/{32768, 262144, 2621440}, /*RamBufferSize=*/104857600, + /*Lines=*/1536, + /*InstructionGroups=*/{{"RAM_L", 3}, {"L3_LS", 2}, {"L2_LS", 10}, 
{"L1_LS", 90}, {"REG", 30}}), + /*Payload=*/std::make_shared()) {} }; -} // namespace firestarter::environment::x86::platform - -#endif +} // namespace firestarter::environment::x86::platform \ No newline at end of file diff --git a/include/firestarter/Environment/X86/Platform/SkylakeConfig.hpp b/include/firestarter/Environment/X86/Platform/SkylakeConfig.hpp index aec85be8..8a109d11 100644 --- a/include/firestarter/Environment/X86/Platform/SkylakeConfig.hpp +++ b/include/firestarter/Environment/X86/Platform/SkylakeConfig.hpp @@ -19,32 +19,22 @@ * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ -#ifndef INCLUDE_FIRESTARTER_ENVIRONMENT_X86_PLATFORM_SKYLAKECONFIG_H -#define INCLUDE_FIRESTARTER_ENVIRONMENT_X86_PLATFORM_SKYLAKECONFIG_H +#pragma once -#include -#include +#include "firestarter/Environment/X86/Payload/FMAPayload.hpp" +#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp" namespace firestarter::environment::x86::platform { class SkylakeConfig final : public X86PlatformConfig { - public: - SkylakeConfig(asmjit::CpuFeatures const &supportedFeatures, unsigned family, - unsigned model, unsigned threads) - : X86PlatformConfig("SKL_COREI", 6, {78, 94}, {1, 2}, 0, - {32768, 262144, 1572864}, 104857600, 1536, family, - model, threads, - new payload::FMAPayload(supportedFeatures)) {} - - std::vector> - getDefaultPayloadSettings() const override { - return std::vector>({{"RAM_L", 3}, - {"L3_LS_256", 5}, - {"L2_LS_256", 18}, - {"L1_2LS_256", 78}, - {"REG", 40}}); - } + SkylakeConfig() noexcept + : X86PlatformConfig(/*Name=*/"SKL_COREI", /*Family=*/6, /*Models=*/{78, 94}, + /*Settings=*/ + environment::payload::PayloadSettings( + /*Threads=*/{1, 2}, /*DataCacheBufferSize=*/{32768, 262144, 1572864}, + /*RamBufferSize=*/104857600, /*Lines=*/1536, + /*InstructionGroups=*/ + {{"RAM_L", 3}, {"L3_LS_256", 5}, {"L2_LS_256", 18}, {"L1_2LS_256", 78}, {"REG", 40}}), + 
/*Payload=*/std::make_shared()) {} }; -} // namespace firestarter::environment::x86::platform - -#endif +} // namespace firestarter::environment::x86::platform \ No newline at end of file diff --git a/include/firestarter/Environment/X86/Platform/SkylakeSPConfig.hpp b/include/firestarter/Environment/X86/Platform/SkylakeSPConfig.hpp index be767d0b..864ebec9 100644 --- a/include/firestarter/Environment/X86/Platform/SkylakeSPConfig.hpp +++ b/include/firestarter/Environment/X86/Platform/SkylakeSPConfig.hpp @@ -21,31 +21,28 @@ #pragma once -#include -#include +#include "firestarter/Environment/X86/Payload/AVX512Payload.hpp" +#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp" namespace firestarter::environment::x86::platform { class SkylakeSPConfig final : public X86PlatformConfig { - public: - SkylakeSPConfig(asmjit::CpuFeatures const &supportedFeatures, - unsigned family, unsigned model, unsigned threads) - : X86PlatformConfig("SKL_XEONEP", 6, {85}, {1, 2}, 0, - {32768, 1048576, 1441792}, 1048576000, 1536, family, - model, threads, - new payload::AVX512Payload(supportedFeatures)) {} - - std::vector> - getDefaultPayloadSettings() const override { - return std::vector>({{"RAM_S", 3}, - {"RAM_P", 1}, - {"L3_S", 1}, - {"L3_P", 1}, - {"L2_S", 4}, - {"L2_L", 70}, - {"L1_S", 0}, - {"L1_L", 40}, - {"REG", 140}}); - } + SkylakeSPConfig() noexcept + : X86PlatformConfig(/*Name=*/"SKL_XEONEP", /*Family=*/6, /*Models=*/{85}, + /*Settings=*/ + environment::payload::PayloadSettings(/*Threads=*/{1, 2}, + /*DataCacheBufferSize=*/{32768, 1048576, 1441792}, + /*RamBufferSize=*/1048576000, /*Lines=*/1536, + /*InstructionGroups=*/ + {{"RAM_S", 3}, + {"RAM_P", 1}, + {"L3_S", 1}, + {"L3_P", 1}, + {"L2_S", 4}, + {"L2_L", 70}, + {"L1_S", 0}, + {"L1_L", 40}, + {"REG", 140}}), + /*Payload=*/std::make_shared()) {} }; } // namespace firestarter::environment::x86::platform diff --git a/include/firestarter/Environment/X86/Platform/X86PlatformConfig.hpp 
b/include/firestarter/Environment/X86/Platform/X86PlatformConfig.hpp index 45956f38..15d54638 100644 --- a/include/firestarter/Environment/X86/Platform/X86PlatformConfig.hpp +++ b/include/firestarter/Environment/X86/Platform/X86PlatformConfig.hpp @@ -21,38 +21,79 @@ #pragma once -#include -#include +#include "firestarter/Environment/CPUTopology.hpp" +#include "firestarter/Environment/Platform/PlatformConfig.hpp" +#include "firestarter/Environment/X86/X86CPUTopology.hpp" namespace firestarter::environment::x86::platform { +/// Models a platform config that is the default based on x86 CPU family and model ids. class X86PlatformConfig : public environment::platform::PlatformConfig { private: - unsigned _family; - std::list _models; - unsigned _currentFamily; - unsigned _currentModel; - unsigned _currentThreads; + /// The family id of the processor for which this is the default platform config. + unsigned Family; + /// The list of model ids in combination with the family for which this is the default platform config.
+ std::list Models; public: - X86PlatformConfig(std::string name, unsigned family, - std::initializer_list models, - std::initializer_list threads, - unsigned instructionCacheSize, - std::initializer_list dataCacheBufferSize, - unsigned ramBuffersize, unsigned lines, - unsigned currentFamily, unsigned currentModel, - unsigned currentThreads, payload::X86Payload *payload) - : PlatformConfig(name, threads, instructionCacheSize, dataCacheBufferSize, - ramBuffersize, lines, payload), - _family(family), _models(models), _currentFamily(currentFamily), - _currentModel(currentModel), _currentThreads(currentThreads) {} - - bool isDefault() const override { - return _family == _currentFamily && - (std::find(_models.begin(), _models.end(), _currentModel) != - _models.end()) && - isAvailable(); + X86PlatformConfig(std::string Name, unsigned Family, std::list&& Models, + environment::payload::PayloadSettings&& Settings, + std::shared_ptr&& Payload) noexcept + : PlatformConfig(std::move(Name), std::move(Settings), std::move(Payload)) + , Family(Family) + , Models(std::move(Models)) {} + + /// Check if this platform is available on the current system. This translates to if the cpu extensions are + /// available for the payload that is used. + /// \arg Topology The reference to the X86CPUTopology that is used to check if this platform is supported. + /// \returns true if the platform is supported on the given X86CPUTopology. + [[nodiscard]] auto isAvailable(const X86CPUTopology& Topology) const -> bool { return isAvailable(&Topology); } + + /// Check if this platform is available and the default on the current system. + /// \arg Topology The reference to the X86CPUTopology that is used to check if this payload is supported. + /// \returns true if the platform is the default one for a given X86CPUTopology. + [[nodiscard]] auto isDefault(const X86CPUTopology& Topology) const -> bool { return isDefault(&Topology); } + + /// Clone the platform config.
+ [[nodiscard]] auto clone() const -> std::unique_ptr final { + auto Ptr = std::make_unique(name(), Family, std::list(Models), + environment::payload::PayloadSettings(settings()), + std::shared_ptr(payload())); + return Ptr; + } + + /// Clone a concrete platform config. + /// \arg InstructionCacheSize The detected size of the instruction cache. + /// \arg ThreadsPerCore The number of threads per physical CPU. + [[nodiscard]] auto cloneConcreate(std::optional InstructionCacheSize, unsigned ThreadsPerCore) const + -> std::unique_ptr final { + auto Ptr = clone(); + Ptr->settings().concretize(InstructionCacheSize, ThreadsPerCore); + return Ptr; + } + +private: + /// Check if this platform is available on the current system. This translates to if the cpu extensions are + /// available for the payload that is used. + /// \arg Topology The pointer to the CPUTopology that is used to check if this platform is supported. + /// \returns true if the platform is supported on the given CPUTopology. + [[nodiscard]] auto isAvailable(const CPUTopology* Topology) const -> bool final { + return environment::platform::PlatformConfig::isAvailable(Topology); + } + + /// Check if this platform is available and the default on the current system. This is done by checking if the family + /// id in the CPUTopology matches the one saved in Family and if the model id in the CPUTopology is contained in + /// Models. + /// \arg Topology The pointer to the CPUTopology that is used to check if this payload is supported. + /// \returns true if the platform is the default one for a given CPUTopology.
+ [[nodiscard]] auto isDefault(const CPUTopology* Topology) const -> bool final { + const auto* FinalTopology = dynamic_cast(Topology); + assert(FinalTopology && "isDefault not called with const X86CPUTopology*"); + + // Check if the family of the topology matches the family of the config, if the model of the topology is contained + // in the models list of the config and if the config is available on the current platform. + return Family == FinalTopology->familyId() && + (std::find(Models.begin(), Models.end(), FinalTopology->modelId()) != Models.end()) && isAvailable(Topology); } }; diff --git a/include/firestarter/Environment/X86/X86CPUTopology.hpp b/include/firestarter/Environment/X86/X86CPUTopology.hpp index 44a02dc2..0a85d040 100644 --- a/include/firestarter/Environment/X86/X86CPUTopology.hpp +++ b/include/firestarter/Environment/X86/X86CPUTopology.hpp @@ -21,55 +21,68 @@ #pragma once -#include +#include "firestarter/Environment/CPUTopology.hpp" #include namespace firestarter::environment::x86 { +/// This class models the properties of a x86_64 processor. 
class X86CPUTopology final : public CPUTopology { public: X86CPUTopology(); - friend std::ostream &operator<<(std::ostream &stream, - X86CPUTopology const &cpuTopology); + friend auto operator<<(std::ostream& Stream, X86CPUTopology const& CpuTopology) -> std::ostream&; - std::list const &features() const override { - return this->featureList; - } - const asmjit::CpuFeatures& featuresAsmjit() const{ - return this->cpuInfo.features(); - } + /// Getter for the list of CPU features + [[nodiscard]] auto features() const -> std::list const& override { return this->FeatureList; } + /// Getter for the CPU features class from asmjit + [[nodiscard]] auto featuresAsmjit() const -> const asmjit::CpuFeatures& { return this->CpuInfo.features(); } - std::string const &vendor() const override { return this->_vendor; } - std::string const &model() const override { return this->_model; } + /// Getter for the clockrate in Hz + [[nodiscard]] auto clockrate() const -> uint64_t override; - unsigned long long clockrate() const override; + /// Get the current hardware timestamp + [[nodiscard]] auto timestamp() const -> uint64_t override; - unsigned long long timestamp() const override; - - unsigned familyId() const { return this->cpuInfo.familyId(); } - unsigned modelId() const { return this->cpuInfo.modelId(); } - unsigned stepping() const { return this->cpuInfo.stepping(); } + /// The family id of the x86 processor + [[nodiscard]] auto familyId() const -> unsigned { return this->CpuInfo.familyId(); } + /// The model id of the x86 processor + [[nodiscard]] auto modelId() const -> unsigned { return this->CpuInfo.modelId(); } + /// The stepping id of the x86 processor + [[nodiscard]] auto stepping() const -> unsigned { return this->CpuInfo.stepping(); } + /// The CPU vendor i.e., Intel or AMD. + [[nodiscard]] auto vendor() const -> std::string const& final { return Vendor; } + /// Get the string containing family, model and stepping ids. 
+ [[nodiscard]] auto model() const -> std::string const& final { return Model; } private: - bool hasRdtsc() const { return this->_hasRdtsc; } - bool hasInvariantRdtsc() const { return this->_hasInvariantRdtsc; } - void cpuid(unsigned long long *a, unsigned long long *b, - unsigned long long *c, unsigned long long *d) const; - - asmjit::CpuInfo cpuInfo; - std::list featureList; - - bool _hasRdtsc; - bool _hasInvariantRdtsc; - std::string _vendor; - std::string _model; + /// Does this processor support timestamp counters + [[nodiscard]] auto hasRdtsc() const -> bool { return this->HasRdtsc; } + /// Does this processor have invariant timestamp counters + [[nodiscard]] auto hasInvariantRdtsc() const -> bool { return this->HasInvariantRdtsc; } + + /// A wrapper to the cpuid call to keep a consistent interface between Windows and other platforms. + static void cpuid(uint64_t* Rax, uint64_t* Rbx, uint64_t* Rcx, uint64_t* Rdx); + + /// The asmjit CpuInfo for the current processor + asmjit::CpuInfo CpuInfo; + /// The list of CPU features that are supported by the current processor + std::list FeatureList; + + /// Does this processor support timestamp counters + bool HasRdtsc; + /// Does this processor have invariant timestamp counters + bool HasInvariantRdtsc; + + /// The CPU vendor i.e., Intel or AMD. + std::string Vendor; + /// Model string containing family, model and stepping ids.
+ std::string Model; }; -inline std::ostream &operator<<(std::ostream &stream, - X86CPUTopology const &cpuTopology) { - return cpuTopology.print(stream); +inline auto operator<<(std::ostream& Stream, X86CPUTopology const& CpuTopology) -> std::ostream& { + return CpuTopology.print(Stream); } -} // namespace firestarter::environment::x86 +} // namespace firestarter::environment::x86 \ No newline at end of file diff --git a/include/firestarter/Environment/X86/X86Environment.hpp b/include/firestarter/Environment/X86/X86Environment.hpp index 11ad940e..f4760f7e 100644 --- a/include/firestarter/Environment/X86/X86Environment.hpp +++ b/include/firestarter/Environment/X86/X86Environment.hpp @@ -21,91 +21,102 @@ #pragma once -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -#define REGISTER(NAME) \ - [](asmjit::CpuFeatures const &supportedFeatures, unsigned family, \ - unsigned model, unsigned threads) -> platform::X86PlatformConfig * { \ - return new platform::NAME(supportedFeatures, family, model, threads); \ - } +#include "firestarter/Environment/Environment.hpp" +#include "firestarter/Environment/X86/Platform/BulldozerConfig.hpp" +#include "firestarter/Environment/X86/Platform/HaswellConfig.hpp" +#include "firestarter/Environment/X86/Platform/HaswellEPConfig.hpp" +#include "firestarter/Environment/X86/Platform/KnightsLandingConfig.hpp" +#include "firestarter/Environment/X86/Platform/NaplesConfig.hpp" +#include "firestarter/Environment/X86/Platform/NehalemConfig.hpp" +#include "firestarter/Environment/X86/Platform/NehalemEPConfig.hpp" +#include "firestarter/Environment/X86/Platform/RomeConfig.hpp" +#include "firestarter/Environment/X86/Platform/SandyBridgeConfig.hpp" +#include "firestarter/Environment/X86/Platform/SandyBridgeEPConfig.hpp" +#include "firestarter/Environment/X86/Platform/SkylakeConfig.hpp" +#include 
"firestarter/Environment/X86/Platform/SkylakeSPConfig.hpp" +#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp" namespace firestarter::environment::x86 { class X86Environment final : public Environment { public: - X86Environment() : Environment(new X86CPUTopology()) {} - - ~X86Environment() { - for (auto const &config : platformConfigs) { - delete config; - } - for (auto const &config : fallbackPlatformConfigs) { - delete config; - } + X86Environment() + : Environment(std::make_unique()) {} + + /// Getter (which allows modifying) for the current platform config containing the payload, settings, the + /// associated name and the default X86 family and models. + [[nodiscard]] auto config() -> platform::X86PlatformConfig& final { + auto* X86PlatformConfig = dynamic_cast(&Environment::config()); + assert(X86PlatformConfig && "X86PlatformConfig is a nullptr"); + return *X86PlatformConfig; + } + + /// Const getter for the current platform config containing the payload, settings, the associated name and the default + /// X86 family and models. + [[nodiscard]] auto config() const -> const platform::X86PlatformConfig& final { + const auto* X86PlatformConfig = dynamic_cast(&Environment::config()); + assert(X86PlatformConfig && "X86PlatformConfig is a nullptr"); + return *X86PlatformConfig; } - X86CPUTopology const &topology() { - return *reinterpret_cast(this->_topology); + /// Const getter for the current CPU topology with X86 specific modifications. + [[nodiscard]] auto topology() const -> const X86CPUTopology& final { + const auto* X86Topology = dynamic_cast(&Environment::topology()); + assert(X86Topology && "X86Topology is a nullptr"); + return *X86Topology; } - void evaluateFunctions() override; - int selectFunction(unsigned functionId, - bool allowUnavailablePayload) override; - int selectInstructionGroups(std::string groups) override; + /// Select a PlatformConfig based on its generated id. 
This function will throw if a payload is not available or the + /// id is incorrect. If id is zero we automatically select a matching PlatformConfig. + /// \arg FunctionId The id of the PlatformConfig that should be selected. + /// \arg AllowUnavailablePayload If true we will not throw if the PlatformConfig is not available. + void selectFunction(unsigned FunctionId, bool AllowUnavailablePayload) override; + + /// Parse the selected payload instruction groups and save them in the selected function. Throws if the input is + /// invalid. + /// \arg Groups The list of instruction groups that is in the format: multiple INSTRUCTION:VALUE pairs + /// comma-separated. + void selectInstructionGroups(std::string Groups) override; + + /// Print the available instruction groups of the selected function. void printAvailableInstructionGroups() override; - void setLineCount(unsigned lineCount) override; + + /// Set the line count in the selected function. + /// \arg LineCount The maximum number of instructions that should be in the high-load loop. + void setLineCount(unsigned LineCount) override; + + /// Print a summary of the settings of the selected config. void printSelectedCodePathSummary() override; + + /// Print a list of available high-load functions and if they are available on the current system. This includes all + /// PlatformConfigs in combination with all thread per core counts. void printFunctionSummary() override; private: - // The available function IDs are generated by iterating through this list of - // PlatformConfig. Add new PlatformConfig at the bottom to maintain stable - // IDs.
- const std::list> - platformConfigsCtor = { - REGISTER(KnightsLandingConfig), REGISTER(SkylakeConfig), - REGISTER(SkylakeSPConfig), REGISTER(HaswellConfig), - REGISTER(HaswellEPConfig), REGISTER(SandyBridgeConfig), - REGISTER(SandyBridgeEPConfig), REGISTER(NehalemConfig), - REGISTER(NehalemEPConfig), REGISTER(BulldozerConfig), - REGISTER(NaplesConfig), REGISTER(RomeConfig)}; - - std::list platformConfigs; - - // List of fallback PlatformConfig. Add one for each x86 extension. - const std::list> - fallbackPlatformConfigsCtor = { - REGISTER(SkylakeSPConfig), // AVX512 - REGISTER(BulldozerConfig), // FMA4 - REGISTER(HaswellConfig), // FMA - REGISTER(SandyBridgeConfig), // AVX - REGISTER(NehalemConfig) // SSE2 - }; - - std::list fallbackPlatformConfigs; - -#undef REGISTER + /// The list of available platform configs that is printed when supplying the --avail command line argument. The IDs + /// for these configs are generated by iterating through this list starting with 1. To maintain stable IDs in + /// FIRESTARTER new configs should be added to the bottom of the list. + const std::list> PlatformConfigs = { + std::make_shared(), std::make_shared(), + std::make_shared(), std::make_shared(), + std::make_shared(), std::make_shared(), + std::make_shared(), std::make_shared(), + std::make_shared(), std::make_shared(), + std::make_shared(), std::make_shared()}; + + /// The list of configs that are fallbacks. If none of the PlatformConfigs is the default one on the current CPU, we + /// select the first one from this list that is available on the current system. If multiple configs can be available + /// on one system the one with higher priority should be at the top of this list. Modern X86 CPUs will support SSE2 + /// therefore it is the last on the list. CPUs that support AVX512 will most certainly also support FMA and AVX, + /// AVX512 takes precedence. This list should contain one entry for each of the supported CPU extensions by the + /// FIRESTARTER payloads.
+ const std::list> FallbackPlatformConfigs = { + std::make_shared(), // AVX512 + std::make_shared(), // FMA4 + std::make_shared(), // FMA + std::make_shared(), // AVX + std::make_shared() // SSE2 + }; }; } // namespace firestarter::environment::x86 diff --git a/include/firestarter/ErrorDetectionStruct.hpp b/include/firestarter/ErrorDetectionStruct.hpp index 38bcbc6a..1fc3ad24 100644 --- a/include/firestarter/ErrorDetectionStruct.hpp +++ b/include/firestarter/ErrorDetectionStruct.hpp @@ -21,26 +21,31 @@ #pragma once +#include + namespace firestarter { +/// This struct is used for the error detection feature. The error detection works between two threads. The current one +/// and one on the left. Analogous for the thread on the right. We hash the contents of the vector registers and compare +/// them with the current iteration counter against the other threads. struct ErrorDetectionStruct { - // we have two cache lines (64B) containing each two 16B local variable and - // one ptr (8B) - - // the pointer to 16B of communication - volatile unsigned long long *communicationLeft; - volatile unsigned long long localsLeft[4]; - // if this variable is not 0, an error occured in the comparison with the left - // thread. - volatile unsigned long long errorLeft; - volatile unsigned long long paddingLeft[2]; + struct OneSide { + /// The pointer to 16B of communication between the two threads which is used with lock cmpxchg16b + uint64_t* Communication; + /// The local variables that are used for the error detection algorithm + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) + uint64_t Locals[4]; + /// If this variable is not 0, an error occurred in the comparison with the other thread. + uint64_t Error; + /// Padding to fill up a cache line.
+ // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) + uint64_t Padding[2]; + }; - volatile unsigned long long *communicationRight; - volatile unsigned long long localsRight[4]; - // if this variable is not 0, an error occured in the comparison with the - // right thread. - volatile unsigned long long errorRight; - volatile unsigned long long paddingRight[2]; + /// The data that is used for the error detection algorithm between the current and the thread left to it. + OneSide Left; + /// The data that is used for the error detection algorithm between the current and the thread right to it. + OneSide Right; }; -} // namespace firestarter +} // namespace firestarter \ No newline at end of file diff --git a/include/firestarter/Firestarter.hpp b/include/firestarter/Firestarter.hpp index 31347dd2..a51feebb 100644 --- a/include/firestarter/Firestarter.hpp +++ b/include/firestarter/Firestarter.hpp @@ -21,36 +21,19 @@ #pragma once -#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_HIP) -#include -#endif - -#ifdef FIRESTARTER_BUILD_ONEAPI -#include -#endif - - - -#include - -#if defined(linux) || defined(__linux__) -#include -#include -#include -#include -#endif - -#include -#include - -#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \ - defined(_M_X64) -#include -#endif +#include "firestarter/Config.hpp" +#include "firestarter/Constants.hpp" +#include "firestarter/Cuda/Cuda.hpp" +#include "firestarter/DumpRegisterWorkerData.hpp" +#include "firestarter/LoadWorkerData.hpp" +#include "firestarter/Measurement/MeasurementWorker.hpp" +#include "firestarter/OneAPI/OneAPI.hpp" +#include "firestarter/Optimizer/Algorithm.hpp" +#include "firestarter/Optimizer/OptimizerWorker.hpp" +#include "firestarter/Optimizer/Population.hpp" #include #include -#include #include #include #include @@ -64,141 +47,140 @@ extern "C" { namespace firestarter { +/// This is the main class of firestarter and handles the execution of the program.
class Firestarter { public: - Firestarter(const int argc, const char **argv, - std::chrono::seconds const &timeout, unsigned loadPercent, - std::chrono::microseconds const &period, - unsigned requestedNumThreads, std::string const &cpuBind, - bool printFunctionSummary, unsigned functionId, - bool listInstructionGroups, std::string const &instructionGroups, - unsigned lineCount, bool allowUnavailablePayload, - bool dumpRegisters, - std::chrono::seconds const &dumpRegistersTimeDelta, - std::string const &dumpRegistersOutpath, bool errorDetection, - int gpus, unsigned gpuMatrixSize, bool gpuUseFloat, - bool gpuUseDouble, bool listMetrics, bool measurement, - std::chrono::milliseconds const &startDelta, - std::chrono::milliseconds const &stopDelta, - std::chrono::milliseconds const &measurementInterval, - std::vector const &metricPaths, - std::vector const &stdinMetrics, bool optimize, - std::chrono::seconds const &preheat, - std::string const &optimizationAlgorithm, - std::vector const &optimizationMetrics, - std::chrono::seconds const &evaluationDuration, - unsigned individuals, std::string const &optimizeOutfile, - unsigned generations, double nsga2_cr, double nsga2_m); - - ~Firestarter(); + Firestarter() = delete; + /// Read the config, validate and throw on problems with config. Setup everything that is required for the execution + /// of firestarter. + /// \arg ProvidedConfig The config for the execution of Firestarter + explicit Firestarter(Config&& ProvidedConfig); + + ~Firestarter() = default; + + /// This function takes care of the execution of firestarter. It will start the load on CPUs and GPUs. 
void mainThread(); private: - const int _argc; - const char **_argv; - const std::chrono::seconds _timeout; - const unsigned _loadPercent; - std::chrono::microseconds _load; - std::chrono::microseconds _period; - const bool _dumpRegisters; - const std::chrono::seconds _dumpRegistersTimeDelta; - const std::string _dumpRegistersOutpath; - const bool _errorDetection; - const int _gpus; - const unsigned _gpuMatrixSize; - const bool _gpuUseFloat; - const bool _gpuUseDouble; - const std::chrono::milliseconds _startDelta; - const std::chrono::milliseconds _stopDelta; - const bool _measurement; - const bool _optimize; - const std::chrono::seconds _preheat; - const std::string _optimizationAlgorithm; - const std::vector _optimizationMetrics; - const std::chrono::seconds _evaluationDuration; - const unsigned _individuals; - const std::string _optimizeOutfile; - const unsigned _generations; - const double _nsga2_cr; - const double _nsga2_m; - -#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \ - defined(_M_X64) - environment::x86::X86Environment *_environment = nullptr; - - environment::x86::X86Environment &environment() const { - return *_environment; - } -#else -#error "FIRESTARTER is not implemented for this ISA" -#endif + const Config Cfg; + + /// The class that handles setting up the payload for firestarter + std::unique_ptr Environment; + /// The class for execution of the gemm routine on Cuda or HIP GPUs. + std::unique_ptr Cuda; + /// The class for execution of the gemm routine on OneAPI GPUs. + std::unique_ptr Oneapi; + /// The pointer to the optimization algorithm that is used by the optimization functionality. + std::unique_ptr Algorithm; + /// The thread that is used to dump register contents to a file. + std::thread DumpRegisterWorkerThread; + /// The shared pointer to the datastructure that handles the management of metrics, acquisition of metric data and + /// provides summaries of a time range of metric values.
+ std::shared_ptr MeasurementWorker; + + /// The vector of thread handles for the load workers and shared pointer to the their respective data. + std::vector>> LoadThreads; + /// The vector of communication data, where each element is shared between two neighbouring threads for the error + /// detection feature. + std::vector> ErrorCommunication; + + /// The population holding the problem that is used for the optimization feature. + std::unique_ptr Population; + + // NOLINTBEGIN(cppcoreguidelines-avoid-non-const-global-variables) + // TODO(Issue #85): Currently we support one instance of the Firestarter class. Variables that need to be accessed + // from outside the class, e.g. in the sigterm handler are inline static. + + /// The instance of the optimization worker that handles the execution of the optimization. + inline static std::unique_ptr Optimizer; + + /// Variable to control the termination of the watchdog + inline static bool WatchdogTerminate = false; + /// Condition variable for the WatchdogTerminate to allow notifying when sleeping for a specific time. + inline static std::condition_variable WatchdogTerminateAlert; + /// Mutex to guard access to WatchdogTerminate. + inline static std::mutex WatchdogTerminateMutex; + + /// Variable to control the load of the threads + inline static volatile LoadThreadWorkType LoadVar = LoadThreadWorkType::LoadLow; + + // NOLINTEND(cppcoreguidelines-avoid-non-const-global-variables) + + /// Spawn the load workers and initialize them. + void initLoadWorkers(); + + /// Wait for the load worker to join + void joinLoadWorkers(); -#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_HIP) - std::unique_ptr _cuda; -#endif + /// Print the error report for the error detection feature. + void printThreadErrorReport(); -#ifdef FIRESTARTER_BUILD_ONEAPI - std::unique_ptr _oneapi; -#endif + /// Print the performance report. It contains the estimation of the FLOPS and main memory bandwidth. 
+ void printPerformanceReport(); -#if defined(linux) || defined(__linux__) - inline static std::unique_ptr _optimizer; - std::shared_ptr _measurementWorker; - std::unique_ptr _algorithm; - firestarter::optimizer::Population _population; -#endif + /// Set the load workers to the ThreadInit state. + void signalInit() { signalLoadWorkers(LoadThreadState::ThreadInit); } - // LoadThreadWorker.cpp - int initLoadWorkers(bool lowLoad, unsigned long long period); - void joinLoadWorkers(); - void printThreadErrorReport(); - void printPerformanceReport(); + /// Set the load workers to the ThreadWork state. + void signalWork() { signalLoadWorkers(LoadThreadState::ThreadWork); }; - void signalWork() { signalLoadWorkers(THREAD_WORK); }; + /// Apply the new setting to all load workers and set them to the ThreadSwitch state. + /// \arg Setting The new setting to switch to. + void signalSwitch(std::vector> const& Setting) { + struct SwitchLoad { + static void func() { LoadVar = LoadThreadWorkType::LoadSwitch; }; + }; - // WatchdogWorker.cpp - int watchdogWorker(std::chrono::microseconds period, - std::chrono::microseconds load, - std::chrono::seconds timeout); + for (auto& Thread : LoadThreads) { + auto Td = Thread.second; -#ifdef FIRESTARTER_DEBUG_FEATURES - // DumpRegisterWorker.cpp - int initDumpRegisterWorker(std::chrono::seconds dumpTimeDelta, - std::string dumpFilePath); - void joinDumpRegisterWorker(); -#endif + Td->config().settings().selectInstructionGroups(Setting); + } - // LoadThreadWorker.cpp - void signalLoadWorkers(int comm); - static void loadThreadWorker(std::shared_ptr td); + signalLoadWorkers(LoadThreadState::ThreadSwitch, SwitchLoad::func); + }; -#ifdef FIRESTARTER_DEBUG_FEATURES - // DumpRegisterWorker.cpp - static void dumpRegisterWorker(std::unique_ptr data); -#endif + /// Execute a state change in the load worker threads. This should happen at the same time in all threads.
First the + /// mutex in all threads is locked and then the state is updated and we wait until we get an acknowledgement from the + /// threads. + /// \arg State The new state of the threads. + /// \arg Function An optional function that will be executed after the state in all threads has been updated and + /// before we wait for the acknowledgement of the thread. + void signalLoadWorkers(LoadThreadState State, void (*Function)() = nullptr); - static void setLoad(unsigned long long value); + /// The function that is executed for each load thread. + /// \arg Td The shared pointer to the data that is required in this thread. + static void loadThreadWorker(const std::shared_ptr& Td); - static void sigalrmHandler(int signum); - static void sigtermHandler(int signum); + /// This function handles switching the load from high to low in a loop and stopping the execution if a timeout was + /// set. + /// \arg Period The period of the high/low switch. Set to zero to disable switching between a high and low load. + /// \arg Load The time of the period where high load is applied. + /// \arg Timeout The timeout after which firestarter stops. Set to zero to disable. + static void watchdogWorker(std::chrono::microseconds Period, std::chrono::microseconds Load, + std::chrono::seconds Timeout); - // variables to control the termination of the watchdog - inline static bool _watchdog_terminate = false; - inline static std::condition_variable _watchdogTerminateAlert; - inline static std::mutex _watchdogTerminateMutex; + /// Start the thread to dump the registers of the first load thread to a file. + void initDumpRegisterWorker(); - // variable to control the load of the threads - inline static volatile unsigned long long loadVar = LOAD_LOW; + /// Wait for the dump register thread to terminate. + void joinDumpRegisterWorker(); - std::vector>> - loadThreads; + /// The thread that dumps the registers of the first thread to a file.
+ /// \arg Data The data that is required for the worker thread to dump the register contents to a file. + static void dumpRegisterWorker(std::unique_ptr Data); - std::vector> errorCommunication; + /// Set the load var to a specific value and update it with a memory fence across threads. + /// \arg Value The new load value. + static void setLoad(LoadThreadWorkType Value); -#ifdef FIRESTARTER_DEBUG_FEATURES - std::thread dumpRegisterWorkerThread; -#endif + /// Sigalarm handler does nothing. + static void sigalrmHandler(int Signum); + + /// Sigterm handler stops the execution of firestarter + /// \arg Signum The signal number is ignored. + static void sigtermHandler(int Signum); }; } // namespace firestarter diff --git a/include/firestarter/Json/Summary.hpp b/include/firestarter/Json/Summary.hpp index 540c4aed..e6f33e5d 100644 --- a/include/firestarter/Json/Summary.hpp +++ b/include/firestarter/Json/Summary.hpp @@ -21,24 +21,27 @@ #pragma once -#include +#include "firestarter/Measurement/Summary.hpp" +/// Json serializer and deserializer for the firestarter::measurement::Summary struct namespace nlohmann { template <> struct adl_serializer { - static firestarter::measurement::Summary from_json(const json &j) { - return {j["num_timepoints"].get(), - std::chrono::milliseconds( - j["duration"].get()), - j["average"].get(), j["stddev"].get()}; + // functions for nlohmann json do not follow LLVM code style + // NOLINTBEGIN(readability-identifier-naming) + static auto from_json(const json& J) -> firestarter::measurement::Summary { + return {J["num_timepoints"].get(), + std::chrono::milliseconds(J["duration"].get()), J["average"].get(), + J["stddev"].get()}; } - static void to_json(json &j, firestarter::measurement::Summary s) { - j = json::object(); + static void to_json(json& J, firestarter::measurement::Summary S) { + J = json::object(); - j["num_timepoints"] = s.num_timepoints; - j["duration"] = s.duration.count(); - j["average"] = s.average; - j["stddev"] = s.stddev; + 
J["num_timepoints"] = S.NumTimepoints; + J["duration"] = S.Duration.count(); + J["average"] = S.Average; + J["stddev"] = S.Stddev; } + // NOLINTEND(readability-identifier-naming) }; } // namespace nlohmann diff --git a/include/firestarter/LoadWorkerData.hpp b/include/firestarter/LoadWorkerData.hpp index ec70476f..1cf3dac3 100644 --- a/include/firestarter/LoadWorkerData.hpp +++ b/include/firestarter/LoadWorkerData.hpp @@ -21,108 +21,146 @@ #pragma once -#include -#include -#include -#include +#include "firestarter/Constants.hpp" +#include "firestarter/Environment/Environment.hpp" +#include "firestarter/Environment/Platform/PlatformConfig.hpp" +#include "firestarter/LoadWorkerMemory.hpp" #include +#include #include #include - -#define PAD_SIZE(size, align) \ - align *(int)std::ceil((double)size / (double)align) - -#if defined(__APPLE__) -#define ALIGNED_MALLOC(size, align) aligned_alloc(align, PAD_SIZE(size, align)) -#define ALIGNED_FREE free -#elif defined(__MINGW64__) -#define ALIGNED_MALLOC(size, align) _mm_malloc(PAD_SIZE(size, align), align) -#define ALIGNED_FREE _mm_free -#elif defined(_MSC_VER) -#define ALIGNED_MALLOC(size, align) \ - _aligned_malloc(PAD_SIZE(size, align), align) -#define ALIGNED_FREE _aligned_free -#else -#define ALIGNED_MALLOC(size, align) \ - std::aligned_alloc(align, PAD_SIZE(size, align)) -#define ALIGNED_FREE std::free -#endif +#include namespace firestarter { +/// This class contains the information that is required to execute the load routines and change the payload during +/// executions. 
class LoadWorkerData { public: - LoadWorkerData(int id, environment::Environment &environment, - volatile unsigned long long *loadVar, - unsigned long long period, bool dumpRegisters, - bool errorDetection) - : addrHigh(loadVar), period(period), dumpRegisters(dumpRegisters), - errorDetection(errorDetection), _id(id), _environment(environment), - _config(new environment::platform::RuntimeConfig( - environment.selectedConfig())) { - // use REGISTER_MAX_NUM cache lines for the dumped registers - // and another cache line for the control variable. - // as we are doing aligned moves we only have the option to waste a whole - // cacheline - addrOffset = dumpRegisters - ? sizeof(DumpRegisterStruct) / sizeof(unsigned long long) - : 0; - - addrOffset += errorDetection ? sizeof(ErrorDetectionStruct) / - sizeof(unsigned long long) - : 0; - } + /// This struct models parameters acquired during the execution of the high-load routine. + struct Metrics { + /// The number of iteration the high-load loop was executed. + std::atomic Iterations{}; + /// The start of the execution of the high-load loop. + std::atomic StartTsc{}; + /// The stop of the execution of the high-load loop. + std::atomic StopTsc{}; + + auto operator=(const Metrics& Other) -> Metrics& { + if (this == &Other) { + return *this; + } - ~LoadWorkerData() { - delete _config; - if (addrMem - addrOffset != nullptr) { - ALIGNED_FREE(addrMem - addrOffset); + Iterations.store(Other.Iterations.load()); + StartTsc.store(Other.StartTsc.load()); + StopTsc.store(Other.StopTsc.load()); + return *this; } - } + }; + + /// Create the datastructure that is shared between a load worker thread and firestarter. + /// \arg Id The id of the load worker thread. They are counted from 0 to the maximum number of threads - 1. + /// \arg Environment The reference to the environment which allows setting the thread affinity and getting the current + /// timestamp. + /// \arg LoadVar The variable that controls the execution of the load worker. 
+ /// \arg Period Is used in combination with the LoadVar for the low load routine. + /// \arg DumpRegisters Should the code to support dumping registers be baked into the high load routine of the + /// compiled payload. + /// \arg ErrorDetection Should the code to support error detection between threads be baked into the high load routine + /// of the compiled payload. + LoadWorkerData(uint64_t Id, const environment::Environment& Environment, volatile LoadThreadWorkType& LoadVar, + std::chrono::microseconds Period, bool DumpRegisters, bool ErrorDetection) + : LoadVar(LoadVar) + , Period(Period) + , DumpRegisters(DumpRegisters) + , ErrorDetection(ErrorDetection) + , Id(Id) + , Environment(Environment) + , Config(Environment.config().clone()) {} - void setErrorCommunication( - std::shared_ptr communicationLeft, - std::shared_ptr communicationRight) { - this->communicationLeft = communicationLeft; - this->communicationRight = communicationRight; + ~LoadWorkerData() = default; + + /// Set the shared pointer to the memory shared between two threads for the communication required for the error + /// detection feature. + /// \arg CommunicationLeft The memory shared with the left thread. + /// \arg CommunicationRight The memory shared with the right thread. + void setErrorCommunication(std::shared_ptr CommunicationLeft, + std::shared_ptr CommunicationRight) { + this->CommunicationLeft = std::move(CommunicationLeft); + this->CommunicationRight = std::move(CommunicationRight); } - int id() const { return _id; } - environment::Environment &environment() const { return _environment; } - environment::platform::RuntimeConfig &config() const { return *_config; } + /// Getter for the id of the thread. + [[nodiscard]] auto id() const -> uint64_t { return Id; } + /// Const getter for the environment. + [[nodiscard]] auto environment() const -> const environment::Environment& { return Environment; } + /// Getter for the current platform config.
+ [[nodiscard]] auto config() const -> environment::platform::PlatformConfig& { return *Config; } + + /// Access the DumpRegisterStruct. Asserts when dumping registers is not enabled. + /// \returns a reference to the DumpRegisterStruct + [[nodiscard]] auto dumpRegisterStruct() const -> DumpRegisterStruct& { + assert(DumpRegisters && "Tried to access DumpRegisterStruct, but dumping registers is not enabled."); + return Memory->ExtraVars.Drs; + } - const ErrorDetectionStruct *errorDetectionStruct() const { - return reinterpret_cast(addrMem - addrOffset); + /// Access the ErrorDetectionStruct. Asserts when error detection is not enabled. + /// \returns a reference to the ErrorDetectionStruct + [[nodiscard]] auto errorDetectionStruct() const -> ErrorDetectionStruct& { + assert(ErrorDetection && "Tried to access ErrorDetectionStruct, but error detection is not enabled."); + return Memory->ExtraVars.Eds; } - int comm = THREAD_WAIT; - bool ack = false; - std::mutex mutex; - unsigned long long *addrMem = nullptr; - unsigned long long addrOffset; - volatile unsigned long long *addrHigh; - unsigned long long buffersizeMem; - unsigned long long iterations = 0; - // save the last iteration count when switching payloads - std::atomic lastIterations; - unsigned long long flops; - unsigned long long startTsc; - unsigned long long stopTsc; - std::atomic lastStartTsc; - std::atomic lastStopTsc; + /// The members in this struct are used for the communication between the main thread and the load thread. + struct Communication { + /// The state of the load worker. + LoadThreadState State = LoadThreadState::ThreadWait; + /// This variable will be set to true when the state change was acknowledged by the load thread. + bool Ack = false; + /// The mutex that is used to lock access to the Ack and State variables. + std::mutex Mutex; + } Communication; + + /// The memory which is used by the load worker.
+ LoadWorkerMemory::UniquePtr Memory = {nullptr, nullptr}; + + /// The compiled payload which contains the pointers to the specific functions which are executed and some stats. + environment::payload::CompiledPayload::UniquePtr CompiledPayloadPtr = {nullptr, nullptr}; + + /// The variable that controls the execution of the load worker. + volatile LoadThreadWorkType& LoadVar; + + /// The size of the buffer that is allocated in the load worker. + uint64_t BuffersizeMem{}; + + /// The collected metrics from the current execution of the LoadThreadState::ThreadWork state. Do not read from it. + Metrics CurrentRun; + + /// The collected metrics from the last execution of the LoadThreadState::ThreadWork state. + Metrics LastRun; + // period in usecs // used in low load routine to sleep 1/100th of this time - unsigned long long period; - bool dumpRegisters; - bool errorDetection; - std::shared_ptr communicationLeft; - std::shared_ptr communicationRight; - -private: - int _id; - environment::Environment &_environment; - environment::platform::RuntimeConfig *_config; + std::chrono::microseconds Period; + + /// Should the code to support dumping registers be baked into the high load routine of the compiled payload. + bool DumpRegisters; + + /// Should the code to support error detection between thread be baked into the high load routine of the compiled + /// payload. + bool ErrorDetection; + /// The pointer to the variable that is used for communication to the left thread for the error detection feature. + std::shared_ptr CommunicationLeft; + /// The pointer to the variable that is used for communication to the right thread for the error detection feature. + std::shared_ptr CommunicationRight; + + /// The id of this load thread. + const uint64_t Id; + /// The reference to the environment which allows setting the thread affinity and getting the current timestamp. 
+ const environment::Environment& Environment; + /// The config that is cloned from the environment for this specific load worker. + std::unique_ptr Config; }; } // namespace firestarter diff --git a/include/firestarter/LoadWorkerMemory.hpp b/include/firestarter/LoadWorkerMemory.hpp new file mode 100644 index 00000000..11493665 --- /dev/null +++ b/include/firestarter/LoadWorkerMemory.hpp @@ -0,0 +1,91 @@ +/****************************************************************************** + * FIRESTARTER - A Processor Stress Test Utility + * Copyright (C) 2020-2023 TU Dresden, Center for Information Services and High + * Performance Computing + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Contact: daniel.hackenberg@tu-dresden.de + *****************************************************************************/ + +#pragma once + +#include "firestarter/AlignedAlloc.hpp" +#include "firestarter/DumpRegisterStruct.hpp" +#include "firestarter/ErrorDetectionStruct.hpp" + +#include + +namespace firestarter { + +/// This struct is used to allocate the memory for the high-load routine. +struct LoadWorkerMemory { +private: + LoadWorkerMemory() = default; + ~LoadWorkerMemory() = default; + + /// Function to deallocate the memory for this struct to be used with unique_ptr.
+ /// \arg Ptr The pointer to the memory + static void deallocate(void* Ptr) { + static_cast(Ptr)->~LoadWorkerMemory(); + AlignedAlloc::free(Ptr); + } + +public: + using UniquePtr = std::unique_ptr; + + /// The extra variables that are before the memory used for the calculation in the high-load routine. They are used + /// for optional FIRESTARTER features where further communication between the high-load routine is needed e.g., for + /// error detection or dumping registers. + struct ExtraLoadWorkerVariables { + /// The data for the dump registers functionality. + DumpRegisterStruct Drs; + /// The data for the error detection functionality. + ErrorDetectionStruct Eds; + } ExtraVars; + + /// A placeholder to extract the address of the memory region with dynamic size which is used for the calculation in + /// the high-load routine. Do not write or read to this type directly. + EightBytesType DoNotUseAddrMem; + + /// This padding makes sure that we are aligned to a cache line. The allocated memory will most probably reach beyond + /// this array. + std::array DoNotUsePadding; + + /// Get the pointer to the start of the memory used for computations. + /// \returns the pointer to the memory. + [[nodiscard]] auto getMemoryAddress() -> auto{ + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) + return reinterpret_cast(&DoNotUseAddrMem); + } + + /// Get the offset to the memory which is used by the high-load functions + /// \returns the offset to the memory + [[nodiscard]] constexpr static auto getMemoryOffset() -> auto{ return offsetof(LoadWorkerMemory, DoNotUseAddrMem); } + + /// Allocate the memory for the high-load thread on 64B cache line boundaries and return a unique_ptr. + /// \arg Bytes The number of bytes allocated for the array whose start address is returned by the getMemoryAddress + /// function. + /// \returns A unique_ptr to the memory for the high-load thread.
+ [[nodiscard]] static auto allocate(const std::size_t Bytes) -> UniquePtr { + // Allocate the memory for the ExtraLoadWorkerVariables (which are 64B aligned) and the data for the high-load + // routine which may not be 64B aligned. + static_assert(sizeof(ExtraLoadWorkerVariables) % 64 == 0, + "ExtraLoadWorkerVariables is not a multiple of 64B i.e., multiple cachelines."); + auto* Ptr = AlignedAlloc::malloc(Bytes + sizeof(ExtraLoadWorkerVariables)); + return {static_cast(Ptr), deallocate}; + } +}; + +} // namespace firestarter diff --git a/include/firestarter/Logging/FirstWorkerThreadFilter.hpp b/include/firestarter/Logging/FirstWorkerThreadFilter.hpp index af8b7ff1..4e501b2e 100644 --- a/include/firestarter/Logging/FirstWorkerThreadFilter.hpp +++ b/include/firestarter/Logging/FirstWorkerThreadFilter.hpp @@ -21,31 +21,29 @@ #pragma once -#include #include - #include -namespace firestarter { - -namespace logging { +namespace firestarter::logging { +/// Logging filter for nitro to discard values that do not match a specific thread id. template class FirstWorkerThreadFilter { public: - typedef Record record_type; + using record_type = Record; - static void setFirstThread(std::thread::id newFirstThread) { - firstThread = newFirstThread; - } + /// Set the thread id from which records should not be discarded. + /// \arg NewFirstThread The specified thread. + static void setFirstThread(std::thread::id NewFirstThread) { FirstThread = NewFirstThread; } - bool filter(Record &r) const { - return r.std_thread_id() == firstThread || - r.severity() >= nitro::log::severity_level::error; + /// Filter records. We keep record if they are from the specified thread or if the severity is at least error. + /// \arg R The record to filter. + /// \returns true if the record should be kept. 
+ auto filter(Record& R) const -> bool { + return R.std_thread_id() == FirstThread || R.severity() >= nitro::log::severity_level::error; } private: - inline static std::thread::id firstThread{}; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) + inline static std::thread::id FirstThread{}; }; -} // namespace logging - -} // namespace firestarter +} // namespace firestarter::logging diff --git a/include/firestarter/Logging/Log.hpp b/include/firestarter/Logging/Log.hpp index f5b613c0..10090668 100644 --- a/include/firestarter/Logging/Log.hpp +++ b/include/firestarter/Logging/Log.hpp @@ -21,22 +21,19 @@ #pragma once -#include - -#include -#include +#include "firestarter/Logging/FirstWorkerThreadFilter.hpp" +#include "firestarter/SafeExit.hpp" +#include +#include #include #include #include #include - #include #include - -#include -#include -#include +#include +#include #include #include @@ -44,70 +41,81 @@ namespace firestarter { namespace logging { +/// Formatter to log Records with severity warn, error and fatal to stderr and all other Records to stdout. If a record +/// has severity error or fatal we abort the program. 
class StdOut { public: - void sink(nitro::log::severity_level severity, - const std::string &formatted_record) { - switch (severity) { + static void sink(nitro::log::severity_level Severity, const std::string& FormattedRecord) { + switch (Severity) { case nitro::log::severity_level::warn: case nitro::log::severity_level::error: case nitro::log::severity_level::fatal: - std::cerr << formatted_record << std::endl << std::flush; + std::cerr << FormattedRecord << '\n' << std::flush; break; default: - std::cout << formatted_record << std::endl << std::flush; + std::cout << FormattedRecord << '\n' << std::flush; break; } + + // Exit on error or fatal + if (Severity == nitro::log::severity_level::error || Severity == nitro::log::severity_level::fatal) { + safeExit(EXIT_FAILURE); + } } }; -using record = nitro::log::record< - nitro::log::severity_attribute, nitro::log::message_attribute, - nitro::log::timestamp_attribute, nitro::log::std_thread_id_attribute>; +// NOLINTBEGIN(readability-identifier-naming) +// The class may not be named Record since this is used as a template argument name in nitro which will cause errors +// when compiling with MSC. +using record = nitro::log::record; +// NOLINTEND(readability-identifier-naming) -template class formater { +template +// NOLINTBEGIN(readability-identifier-naming) +// The class may not be named Formater since this is used as a template argument name in nitro which will cause errors +// when compiling with MSC. We will also write it with lower case and the correct spelling in case it gets renamed +// correctly there. +/// Format Record and add a string representing the severity in front. 
+class formatter { + // NOLINTEND(readability-identifier-naming) public: - std::string format(Record &r) { - std::stringstream s; + auto format(Record& R) -> std::string { + std::stringstream S; - switch (r.severity()) { + switch (R.severity()) { case nitro::log::severity_level::warn: - s << "Warning: "; + S << "Warning: "; break; case nitro::log::severity_level::error: - s << "Error: "; + S << "Error: "; break; case nitro::log::severity_level::fatal: - s << "Fatal: "; + S << "Fatal: "; break; case nitro::log::severity_level::trace: - s << "Debug: "; + S << "Debug: "; break; default: break; } - s << r.message(); + S << R.message(); - return s.str(); + return S.str(); } }; -template -using filter = nitro::log::filter::severity_filter; +template using Filter = nitro::log::filter::severity_filter; template -using workerFilter = - nitro::log::filter::and_filter, - FirstWorkerThreadFilter>; +using WorkerFilter = nitro::log::filter::and_filter, FirstWorkerThreadFilter>; } // namespace logging -using log = nitro::log::logger; +using log = nitro::log::logger; using workerLog = - nitro::log::logger; + nitro::log::logger; } // namespace firestarter diff --git a/include/firestarter/Measurement/MeasurementWorker.hpp b/include/firestarter/Measurement/MeasurementWorker.hpp index 4fc8a6a1..a25c8da3 100644 --- a/include/firestarter/Measurement/MeasurementWorker.hpp +++ b/include/firestarter/Measurement/MeasurementWorker.hpp @@ -21,95 +21,118 @@ #pragma once -#include -#include -#include +#include "firestarter/Measurement/Metric/IPCEstimate.hpp" +#include "firestarter/Measurement/Metric/Perf.hpp" +#include "firestarter/Measurement/Metric/RAPL.hpp" +#include "firestarter/Measurement/MetricInterface.h" +#include "firestarter/Measurement/Summary.hpp" +#include "firestarter/Measurement/TimeValue.hpp" +#include "firestarter/WindowsCompat.hpp" // IWYU pragma: keep #include #include #include -extern "C" { -#include -#include -#include -#include - -#include -} - -void insertCallback(void 
*cls, const char *metricName, int64_t timeSinceEpoch, - double value); +void insertCallback(void* Cls, const char* MetricName, int64_t TimeSinceEpoch, double Value); namespace firestarter::measurement { +/// This class handles the management of metrics, acquisition of metric data and provides summaries of a time range of +/// metric values. class MeasurementWorker { private: - pthread_t workerThread; - pthread_t stdinThread; + /// The thread that handles the values that are read from metrics + pthread_t WorkerThread{}; + /// The thread that handles the metric values that are read from stdin + pthread_t StdinThread{}; - std::vector metrics = { - &rapl_metric, &perf_ipc_metric, &perf_freq_metric, &ipc_estimate_metric}; + /// The vector of metrics that are available. Currently the following metrics are builtin: sysfs-powercap-rapl, + /// perf-ipc, perf-freq and ipc-estimate. Metrics provided through shared libraries are added to this list. + std::vector Metrics = {&RaplMetric, &PerfIpcMetric, &PerfFreqMetric, &IpcEstimateMetric}; - std::mutex values_mutex; - std::map> values = {}; + /// Mutex to access the Values map. + std::mutex ValuesMutex; + /// Map from metric name to the vector of timevalues of this metric. + std::map> Values; - static int *dataAcquisitionWorker(void *measurementWorker); + /// The thread function handles the timed polling of the metric values and saves them to the Value datastructure. + static auto dataAcquisitionWorker(void* MeasurementWorker) -> void*; - static int *stdinDataAcquisitionWorker(void *measurementWorker); + /// The thread function that handles the acquisition of the metric values from stdin and saves them to the Value + /// datastructure. + static auto stdinDataAcquisitionWorker(void* MeasurementWorker) -> void*; - const metric_interface_t *findMetricByName(std::string metricName); + /// Return the pointer to a metric from the Metrics vector that matches the supplied name. 
+ /// \arg MetricName The name of the metric + /// \returns the pointer to the metric with the specified name or a nullptr + auto findMetricByName(std::string MetricName) -> const MetricInterface*; - std::chrono::milliseconds updateInterval; + /// We poll the values of all the metrics after this number of milliseconds. + std::chrono::milliseconds UpdateInterval; - std::chrono::high_resolution_clock::time_point startTime; + /// The start time of the measurement that should be summarized with the getValues function. + std::chrono::high_resolution_clock::time_point StartTime; - // some metric values have to be devided by this - const unsigned long long numThreads; + /// The number of threads FIRESTARTER runs with. This is required by some metrics + const uint64_t NumThreads; - std::string availableMetricsString; + std::string AvailableMetricsString; #ifndef FIRESTARTER_LINK_STATIC - std::vector _metricDylibs = {}; + /// The pointer to the metrics that are used for dynamic libraries. We need to save them separately here to call + /// dlclose later. + std::vector MetricDylibs; #endif - std::vector _stdinMetrics = {}; + /// The name of the metrics that are supplied from stdin. + std::vector StdinMetrics; public: - // creates the worker thread - MeasurementWorker(std::chrono::milliseconds updateInterval, - unsigned long long numThreads, - std::vector const &metricDylibs, - std::vector const &stdinMetrics); - - // stops the worker threads + /// Initialize the measurement worker. It will spawn the threads for the polling of metric values. + /// \arg UpdateInterval The polling time for metric updates. + /// \arg NumThreads The number of threads FIRESTARTER is running with. + /// \arg MetricDylibsNames The vector of files which are passed to dlopen for using additional metrics from shared + /// libraries. 
+ /// \arg StdinMetricsNames The vector of metric names that should be read in from stdin + MeasurementWorker(std::chrono::milliseconds UpdateInterval, uint64_t NumThreads, + std::vector const& MetricDylibsNames, + std::vector const& StdinMetricsNames); + + /// Stops the worker threads ~MeasurementWorker(); - std::string const &availableMetrics() const { - return this->availableMetricsString; - } + /// Get the formatting table of all metrics and if they are available + [[nodiscard]] auto availableMetrics() const -> std::string const& { return this->AvailableMetricsString; } - std::vector const &stdinMetrics() { return _stdinMetrics; } + /// The vector of all metrics that are read from stdin + auto stdinMetrics() -> std::vector const& { return StdinMetrics; } - // returns a list of metrics - std::vector metricNames(); + /// Get the name of the metrics. This includes all metrics, builtins, from dynamic libraries and metrics from stdin. + auto metricNames() -> std::vector; - // setup the selected metrics - // returns a vector with the names of inialized metrics - std::vector - initMetrics(std::vector const &metricNames); + /// Initialize the metrics with the provided names. + /// \arg MetricNames The metrics to initialize + /// \returns The vector of metrics that were successfully initialized. + auto initMetrics(std::vector const& MetricNames) -> std::vector; - // callback function for metrics - void insertCallback(const char *metricName, int64_t timeSinceEpoch, - double value); + /// This function inserts a time value pair for a specific metric. This function will be provided to metrics to allow + /// them to push time value pairs. 
+ /// \arg MetricName The name of the metric for which values are inserted + /// \arg TimeSinceEpoch The time since epoch of the time value pair + /// \arg Value The value of the time value pair + void insertCallback(const char* MetricName, int64_t TimeSinceEpoch, double Value); - // start the measurement + /// Set the StartTime to the current timestep void startMeasurement(); - // get the measurement values begining from measurement start until now. - std::map getValues( - std::chrono::milliseconds startDelta = std::chrono::milliseconds::zero(), - std::chrono::milliseconds stopDelta = std::chrono::milliseconds::zero()); + /// Get the measurement values beginning from measurement start (set with startMeasurement) until the measurement stop + /// (now). + /// \arg StartDelta The time to skip from the measurement start + /// \arg StopDelta The time to skip from the measurement stop + /// \returns The map from all metrics to their respective summaries. + auto getValues(std::chrono::milliseconds StartDelta = std::chrono::milliseconds::zero(), + std::chrono::milliseconds StopDelta = std::chrono::milliseconds::zero()) + -> std::map; }; } // namespace firestarter::measurement diff --git a/include/firestarter/Measurement/Metric/IPCEstimate.hpp b/include/firestarter/Measurement/Metric/IPCEstimate.hpp new file mode 100644 index 00000000..52bc9cdb --- /dev/null +++ b/include/firestarter/Measurement/Metric/IPCEstimate.hpp @@ -0,0 +1,91 @@ +/****************************************************************************** + * FIRESTARTER - A Processor Stress Test Utility + * Copyright (C) 2021 TU Dresden, Center for Information Services and High + * Performance Computing + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Contact: daniel.hackenberg@tu-dresden.de + *****************************************************************************/ + +#pragma once + +#include "firestarter/Measurement/MetricInterface.h" + +#include + +/// The wrapper for the C interface to the IpcEstimateMetric metric. +struct IpcEstimateMetricData { +private: + IpcEstimateMetricData() = default; + + /// The error string of this metric + std::string ErrorString; + + /// The saved callback to push the metric value + void (*Callback)(void*, const char*, int64_t, double){}; + /// The saved first argument for the callback + void* CallbackArg{}; + +public: + IpcEstimateMetricData(IpcEstimateMetricData const&) = delete; + void operator=(IpcEstimateMetricData const&) = delete; + + /// Get the instance of this metric + static auto instance() -> IpcEstimateMetricData& { + static IpcEstimateMetricData Instance; + return Instance; + } + + /// Deinit the metric. + /// \returns EXIT_SUCCESS on success. + static auto fini() -> int32_t; + + /// Init the metric. + /// \returns EXIT_SUCCESS on success. + static auto init() -> int32_t; + + /// Get error in case return code not being EXIT_SUCCESS. + /// \returns The error string. + static auto getError() -> const char*; + + /// The first argument is the function pointer to the callback. The first argument to this function pointer needs to + /// be filled with the second argument to this function. + /// The supplied function pointer needs to be called with the metric name for the second, an unix timestamp (time + /// since epoch) for the third and a metric value for the fourth argument. 
This allows the metric to provide values in + /// a pushing way in contrast to the pulling way of the GetReading function. + static auto registerInsertCallback(void (*C)(void*, const char*, int64_t, double), void* Arg) -> int32_t; + + /// Push a value with the current timestamp. + /// \arg Value The metric value to push. + static void insertValue(double Value); +}; + +/// This metric provides the ipc estimated based on the estimated number of instructions and the runtime of the high +/// load loop. The metric value is dependent on the frequency of the processor. It serves as an estimation of the IPC +/// times the processor frequency. +static constexpr const MetricInterface IpcEstimateMetric{ + /*Name=*/"ipc-estimate", + /*Type=*/ + {/*Absolute=*/1, /*Accumalative=*/0, /*DivideByThreadCount=*/0, /*InsertCallback=*/1, /*IgnoreStartStopDelta=*/1, + /*Reserved=*/0}, + /*Unit=*/"IPC", + /*CallbackTime=*/0, + /*Callback=*/nullptr, + /*Init=*/IpcEstimateMetricData::init, + /*Fini=*/IpcEstimateMetricData::fini, + /*GetReading=*/nullptr, + /*GetError=*/IpcEstimateMetricData::getError, + /*RegisterInsertCallback=*/IpcEstimateMetricData::registerInsertCallback, +}; \ No newline at end of file diff --git a/include/firestarter/Measurement/Metric/Perf.hpp b/include/firestarter/Measurement/Metric/Perf.hpp new file mode 100644 index 00000000..8e0e14c7 --- /dev/null +++ b/include/firestarter/Measurement/Metric/Perf.hpp @@ -0,0 +1,142 @@ +/****************************************************************************** + * FIRESTARTER - A Processor Stress Test Utility + * Copyright (C) 2020-2023 TU Dresden, Center for Information Services and High + * Performance Computing + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Contact: daniel.hackenberg@tu-dresden.de + *****************************************************************************/ + +#pragma once + +#include "firestarter/Measurement/MetricInterface.h" + +#include +#include + +/// The wrapper for the C interface to the PerfIpcMetric and PerfFreqMetric metric. +class PerfMetricData { +private: + PerfMetricData() = default; + + static const constexpr char* PerfEventParanoidFile = "/proc/sys/kernel/perf_event_paranoid"; + + /// The datastructure that is read from the file descriptor provided by the perf_event_open syscall. + struct ReadFormat { + struct ValueAndId { + uint64_t Value; + uint64_t Id; + }; + + uint64_t Nr; + std::array Values; + }; + + /// The error string of this metric + std::string ErrorString; + + /// The file descriptor of the perf_event_open syscall for the PERF_COUNT_HW_CPU_CYCLES event. This file descriptor + /// handles as a group for the other file descriptor. + int CpuCyclesFd = -1; + /// The file descriptor of the perf_event_open syscall for the PERF_COUNT_HW_INSTRUCTIONS event. + int InstructionsFd = -1; + /// The PERF_EVENT_IOC_ID for the cpu cycles file descriptor. + uint64_t CpuCyclesId{}; + /// The PERF_EVENT_IOC_ID for the cpu instruction file descriptor. + uint64_t InstructionsId{}; + + /// The flag that stop init from being executed multiple times. + bool InitDone = false; + /// The value that is returned if the init function called multiple times. + int32_t InitValue{}; + + /// Save the last read metric for the perf-ipc metric. This value will be updated when the perf-ipc metric is read. 
+ struct ReadFormat Last {}; + + /// Get a reading of the perf-freq and perf-ipc metric. Pointers can be nullptr. + /// \arg IpcValue The pointer to which the value for ipc metric value will be saved. + /// \arg FreqValue The pointer to which the value for freq metric value will be saved. + /// \returns EXIT_SUCCESS if we got a new value. + static auto getReading(double* IpcValue, double* FreqValue) -> int32_t; + +public: + PerfMetricData(PerfMetricData const&) = delete; + void operator=(PerfMetricData const&) = delete; + + /// Get the instance of this metric + static auto instance() -> PerfMetricData& { + static PerfMetricData Instance; + return Instance; + } + + /// Deinit the metric. + /// \returns EXIT_SUCCESS on success. + static auto fini() -> int32_t; + + /// Init the metric. + /// \returns EXIT_SUCCESS on success. + static auto init() -> int32_t; + + /// Read the value for a specific PERF_EVENT_IOC_ID out of the ReadFormat datastructure. + /// \arg Reader The ReadFormat datastructure from which the value will be extracted + /// \arg Id The PERF_EVENT_IOC_ID of the metric that should be read. + static auto valueFromId(struct ReadFormat* Reader, uint64_t Id) -> uint64_t; + + /// Get a reading of the perf-ipc metric. + /// \arg Value The pointer to which the value will be saved. + /// \returns EXIT_SUCCESS if we got a new value. + static auto getReadingIpc(double* Value) -> int32_t; + + /// Get a reading of the perf-freq metric. + /// \arg Value The pointer to which the value will be saved. + /// \returns EXIT_SUCCESS if we got a new value. + static auto getReadingFreq(double* Value) -> int32_t; + + /// Get error in case return code not being EXIT_SUCCESS. + /// \returns The error string. + static auto getError() -> const char*; +}; + +/// This metric provides IPC measurement of the program and all associated threads. 
+static constexpr const MetricInterface PerfIpcMetric{ + /*Name=*/"perf-ipc", + /*Type=*/ + {/*Absolute=*/1, /*Accumalative=*/0, /*DivideByThreadCount=*/0, /*InsertCallback=*/0, /*IgnoreStartStopDelta=*/0, + /*Reserved=*/0}, + /*Unit=*/"IPC", + /*CallbackTime=*/0, + /*Callback=*/nullptr, + /*Init=*/PerfMetricData::init, + /*Fini=*/PerfMetricData::fini, + /*GetReading=*/PerfMetricData::getReadingIpc, + /*GetError=*/PerfMetricData::getError, + /*RegisterInsertCallback=*/nullptr, +}; + +/// This metric provides frequency measurement on the CPUs used to execute the program on. +static constexpr const MetricInterface PerfFreqMetric{ + /*Name=*/"perf-freq", + /*Type=*/ + {/*Absolute=*/0, /*Accumalative=*/1, /*DivideByThreadCount=*/1, /*InsertCallback=*/0, /*IgnoreStartStopDelta=*/0, + /*Reserved=*/0}, + /*Unit=*/"GHz", + /*CallbackTime=*/0, + /*Callback=*/nullptr, + /*Init=*/PerfMetricData::init, + /*Fini=*/PerfMetricData::fini, + /*GetReading=*/PerfMetricData::getReadingFreq, + /*GetError=*/PerfMetricData::getError, + /*RegisterInsertCallback=*/nullptr, +}; \ No newline at end of file diff --git a/include/firestarter/Measurement/Metric/RAPL.hpp b/include/firestarter/Measurement/Metric/RAPL.hpp new file mode 100644 index 00000000..59d4a822 --- /dev/null +++ b/include/firestarter/Measurement/Metric/RAPL.hpp @@ -0,0 +1,108 @@ +/****************************************************************************** + * FIRESTARTER - A Processor Stress Test Utility + * Copyright (C) 2020-2023 TU Dresden, Center for Information Services and High + * Performance Computing + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Contact: daniel.hackenberg@tu-dresden.de + *****************************************************************************/ + +#pragma once + +#include "firestarter/Measurement/MetricInterface.h" + +#include +#include +#include + +/// The wrapper for the C interface to the RaplMetric metric. +class RaplMetricData { +private: + /// Datastructure to hold the path of the sysfs rapl entry, the last reading (important to detect overflows), the + /// counter of the number of overflows and the maximum value that the reading will have. + struct ReaderDef { + ReaderDef() = delete; + + ReaderDef(std::string Path, int64_t LastReading, int64_t Overflow, int64_t Max) + : Path(std::move(Path)) + , LastReading(LastReading) + , Overflow(Overflow) + , Max(Max){}; + + std::string Path; + int64_t LastReading; + int64_t Overflow; + int64_t Max; + }; + + /// The path to the sysfs rapl entries + static constexpr const char* RaplPath = "/sys/class/powercap"; + + /// The error string of this metric + std::string ErrorString; + + /// The vector of readers that hold the path and read values from the sysfs rapl + std::vector> Readers; + + RaplMetricData() = default; + +public: + RaplMetricData(RaplMetricData const&) = delete; + void operator=(RaplMetricData const&) = delete; + + /// Get the instance of this metric + static auto instance() -> RaplMetricData& { + static RaplMetricData Instance; + return Instance; + } + + /// Deinit the metric. + /// \returns EXIT_SUCCESS on success. + static auto fini() -> int32_t; + + /// Init the metric. + /// \returns EXIT_SUCCESS on success. 
+ static auto init() -> int32_t; + + /// Get a reading of the sysfs-powercap-rapl metric. + /// \arg Value The pointer to which the value will be saved. + /// \returns EXIT_SUCCESS if we got a new value. + static auto getReading(double* Value) -> int32_t; + + /// Get error in case return code not being EXIT_SUCCESS. + /// \returns The error string. + static auto getError() -> const char*; + + /// This function should be called every 30s. It will make sure that we do not miss an overflow of a counter and + /// therefore get a wrong reading. + static void callback(); +}; + +/// This metric provides power measurements through the RAPL interface. Either psys measurement is chosen or if this is +/// not available the sum of packages and drams. +static constexpr const MetricInterface RaplMetric{ + /*Name=*/"sysfs-powercap-rapl", + /*Type=*/ + {/*Absolute=*/0, /*Accumalative=*/1, /*DivideByThreadCount=*/0, /*InsertCallback=*/0, /*IgnoreStartStopDelta=*/0, + /*Reserved=*/0}, + /*Unit=*/"J", + /*CallbackTime=*/30000000, + /*Callback=*/RaplMetricData::callback, + /*Init=*/RaplMetricData::init, + /*Fini=*/RaplMetricData::fini, + /*GetReading=*/RaplMetricData::getReading, + /*GetError=*/RaplMetricData::getError, + /*RegisterInsertCallback=*/nullptr, +}; \ No newline at end of file diff --git a/include/firestarter/Measurement/MetricInterface.h b/include/firestarter/Measurement/MetricInterface.h index dbea19e8..03f4872c 100644 --- a/include/firestarter/Measurement/MetricInterface.h +++ b/include/firestarter/Measurement/MetricInterface.h @@ -21,65 +21,84 @@ #pragma once +/// This file provides a C style interface to write metrics for FIRESTARTER and provide them as shared libraries. + +#ifdef __cplusplus +extern "C" { +#endif + +// NOLINTNEXTLINE(modernize-deprecated-headers) #include -// clang-format off +// NOLINTBEGIN(modernize-use-using) + +/// Describe the type of the metric and how values need to be accumulated. 
Per default metrics are of pulling type where +/// FIRESTARTER will pull the values through the GetReading function. typedef struct { - // Either set absolute or accumalative to specify the type of values from the - // metric. - uint32_t absolute : 1, - accumalative : 1, - // Set to divide metric values by thread count. - divide_by_thread_count : 1, - // Set to insert time-value pairs via callback function passed by - // register_insert_callback. - insert_callback : 1, - // ignore the start and stop delta set by the user - ignore_start_stop_delta : 1, - __reserved : 27; -} metric_type_t; -// clang-format on - -// Define `metric_interface_t metric` inside your shared library to be able to -// load it during runtime. + uint32_t + /// Set this to 1 if the metric values provided are absolute. + Absolute : 1, + /// Set this to 1 if the metric values provided are accumulative. + Accumalative : 1, + /// Set this to 1 if the metric value needs to be divided by the number of threads. + DivideByThreadCount : 1, + /// Set this to 1 if the metric will provide time-value data in a pushing way through the RegisterInsertCallback + /// function. + InsertCallback : 1, + /// Set this to 1 if the accumulation of the metric should ignore the start/stop delta which are specified by the + /// user of FIRESTARTER. + IgnoreStartStopDelta : 1, + /// Reserved space to fill 32 bits + Reserved : 27; +} MetricType; + +/// Define `MetricInterface Metric` inside your shared library to be able to load it during runtime. typedef struct { - // the name of the metric - const char *name; + /// The name of the metric + const char* Name; + + /// Describes what the value of the metrics represents and how it needs to be accumulated. + MetricType Type; - // metric type with bitfield from metric_type_t - metric_type_t type; + /// The unit of the metric + const char* Unit; - // the unit of the metric - const char *unit; + /// The time in usecs after which the callback should be called again. 
Set to 0 to disable. + uint64_t CallbackTime; - uint64_t callback_time; + /// This function will be called every `CallbackTime` usecs. Disable by setting `CallbackTime` to 0. + void (*Callback)(); - // This function will be called every `callback_time` usecs. Disable by - // setting `callback_time` to 0. - void (*callback)(void); + /// init the metric. + /// \returns EXIT_SUCCESS on success. + int32_t (*Init)(); - // init the metric. - // returns EXIT_SUCCESS on success. - int32_t (*init)(void); + /// deinit the metric. + /// \returns EXIT_SUCCESS on success. + int32_t (*Fini)(); - // deinit the metric. - // returns EXIT_SUCCESS on success. - int32_t (*fini)(void); + /// Get a reading of the metric. Set this function pointer to null if MetricType::InsertCallback is specified in the + /// Type. + /// \arg Value The pointer to which the value will be saved. + /// \returns EXIT_SUCCESS if we got a new value. + int32_t (*GetReading)(double* Value); - // Get a reading of the metric - // Return EXIT_SUCCESS if we got a new value. - // Set this function pointer to NULL if METRIC_INSERT_CALLBACK is specified. - int32_t (*get_reading)(double *value); + /// Get error in case return code not being EXIT_SUCCESS. + /// \returns The error string. + const char* (*GetError)(); - // Get error in case return code not being EXIT_SUCCESS - const char *(*get_error)(void); + /// If MetricType::InsertCallback is specified in the Type this function will be used to pass the metric a callback + /// and the first argument to this callback. + /// The first argument is the function pointer to the callback. The first argument to this function pointer needs to + /// be filled with the second argument to this function. + /// The supplied function pointer needs to be called with the metric name for the second, an unix timestamp (time + /// since epoch) for the third and a metric value for the fourth argument. 
This allows the metric to provide values in + /// a pushing way in contrast to the pulling way of the GetReading function. + int32_t (*RegisterInsertCallback)(void (*)(void*, const char*, int64_t, double), void*); - // If METRIC_INSERT_CALLBACK is set in the type, this function will be passed - // a callback and the first argument for the callback. - // Further arguments of callback are the metric name, an unix timestamp (time - // since epoch) and a metric value. - int32_t (*register_insert_callback)(void (*)(void *, const char *, int64_t, - double), - void *); +} MetricInterface; +// NOLINTEND(modernize-use-using) -} metric_interface_t; +#ifdef __cplusplus +}; +#endif \ No newline at end of file diff --git a/include/firestarter/Measurement/Summary.hpp b/include/firestarter/Measurement/Summary.hpp index 23f819f0..05c5a925 100644 --- a/include/firestarter/Measurement/Summary.hpp +++ b/include/firestarter/Measurement/Summary.hpp @@ -21,30 +21,32 @@ #pragma once -#include +#include "firestarter/Measurement/MetricInterface.h" +#include "firestarter/Measurement/TimeValue.hpp" #include #include #include -extern "C" { -#include -} - namespace firestarter::measurement { +/// This struct summarizes multiple timevalues. The duration, the number of time points an average and stddev is saved. struct Summary { - - size_t num_timepoints; - std::chrono::milliseconds duration; - - double average; - double stddev; - - static Summary calculate(std::vector::iterator begin, - std::vector::iterator end, - metric_type_t metricType, - unsigned long long numThreads); + size_t NumTimepoints; + std::chrono::milliseconds Duration; + + double Average; + double Stddev; + + /// Calculate the summary over a range of timevalues for a given metric and number of threads. + /// \arg Begin The start of the iterator + /// \arg End The end of the iterator + /// \arg MetricType This describes what each timevalue represents and how the metric needs to be calculated into a + /// summary. 
+ /// \arg NumThreads The number of threads this metric was accumulated across. + /// \returns The summary over the range of timevalues from a specific metric. + static auto calculate(std::vector::iterator Begin, std::vector::iterator End, + MetricType MetricType, uint64_t NumThreads) -> Summary; }; } // namespace firestarter::measurement diff --git a/include/firestarter/Measurement/TimeValue.hpp b/include/firestarter/Measurement/TimeValue.hpp index eae3de23..8088385e 100644 --- a/include/firestarter/Measurement/TimeValue.hpp +++ b/include/firestarter/Measurement/TimeValue.hpp @@ -25,16 +25,16 @@ namespace firestarter::measurement { +/// This struct models a value that was captured at a specific timepoint. struct TimeValue { - TimeValue() = default; - constexpr TimeValue(std::chrono::high_resolution_clock::time_point t, - double v) - : time(t), value(v){}; + constexpr TimeValue(std::chrono::high_resolution_clock::time_point Time, double Value) + : Time(Time) + , Value(Value){}; - std::chrono::high_resolution_clock::time_point time; - double value; + std::chrono::high_resolution_clock::time_point Time; + double Value{}; }; } // namespace firestarter::measurement diff --git a/include/firestarter/OneAPI/OneAPI.hpp b/include/firestarter/OneAPI/OneAPI.hpp index cf939388..4022b8c4 100644 --- a/include/firestarter/OneAPI/OneAPI.hpp +++ b/include/firestarter/OneAPI/OneAPI.hpp @@ -21,32 +21,63 @@ #pragma once +#include "firestarter/Constants.hpp" + #include -#include #include -#include namespace firestarter::oneapi { +/// This class handles the workload on OneAPI compatible GPUs. A gemm routine is used to stress them with a +/// constant high load. This header does not include any OneAPI specific headers to allow us to not guard the +/// include of this header in other parts of the program. class OneAPI { private: - std::thread _initThread; - std::condition_variable _waitForInitCv; - std::mutex _waitForInitCvMutex; + /// The thread that is used to initialize the gpus. 
This thread will wait until each thread that runs the gemm routine + /// joins. + std::thread InitThread; - static void initGpus(std::condition_variable &cv, - volatile unsigned long long *loadVar, bool useFloat, - bool useDouble, unsigned matrixSize, int gpus); + /// Spawns a thread for each of the selected gpus, initilizes them and starts the execution of the gemm in parallel. + /// \arg WaitForInitCv The condition variables used to signal that all gpus are initialized. + /// \arg LoadVar A reference to the variable that controlls the current load of Firestarter. + /// \arg UseFloat Set to true if we want to stress using single precision floating points. + /// \arg UseDouble Set to true if we want to stress using double precision floating points. If neither UseFloat or + /// UseDouble is set the precision will be choosen automatically. + /// \arg MatrixSize Set to a specific matrix size which will be choosen for the gemm operation or set to 0 for + /// automatic selection. + /// \arg Gpus Select the number of gpus to stress or -1 for all. + static void initGpus(std::condition_variable& WaitForInitCv, const volatile firestarter::LoadThreadWorkType& LoadVar, + bool UseFloat, bool UseDouble, unsigned MatrixSize, int Gpus); public: - OneAPI(volatile unsigned long long *loadVar, bool useFloat, bool useDouble, - unsigned matrixSize, int gpus); + /// Initilize the OneAPI class. This will start a thread running the OneAPI::initGpus function and wait until all gpus + /// are inititialized. + /// \arg LoadVar A reference to the variable that controlls the current load of Firestarter. + /// \arg UseFloat Set to true if we want to stress using single precision floating points. + /// \arg UseDouble Set to true if we want to stress using double precision floating points. If neither UseFloat or + /// UseDouble is set the precision will be choosen automatically. 
+ /// \arg MatrixSize Set to a specific matrix size which will be chosen for the gemm operation or set to 0 for + /// automatic selection. + /// \arg Gpus Select the number of gpus to stress or -1 for all. + OneAPI(const volatile firestarter::LoadThreadWorkType& LoadVar, bool UseFloat, bool UseDouble, unsigned MatrixSize, + int Gpus) +#if defined(FIRESTARTER_BUILD_ONEAPI) + ; +#else + { + (void)&LoadVar; + (void)UseFloat; + (void)UseDouble; + (void)MatrixSize; + (void)Gpus; + } +#endif ~OneAPI() { - if (_initThread.joinable()) { - _initThread.join(); + if (InitThread.joinable()) { + InitThread.join(); } } }; -} // namespace firestarter::oneapi +} // namespace firestarter::oneapi \ No newline at end of file diff --git a/include/firestarter/Optimizer/Algorithm.hpp b/include/firestarter/Optimizer/Algorithm.hpp index 14009183..be5d5961 100644 --- a/include/firestarter/Optimizer/Algorithm.hpp +++ b/include/firestarter/Optimizer/Algorithm.hpp @@ -21,19 +21,26 @@ #pragma once -#include +#include "firestarter/Optimizer/Population.hpp" namespace firestarter::optimizer { +/// Abstract class to provide an interface for evolutionary optimization algorithms. class Algorithm { public: - Algorithm() {} - virtual ~Algorithm() {} + Algorithm() = default; + virtual ~Algorithm() = default; - virtual void checkPopulation(Population const &pop, - std::size_t populationSize) = 0; + /// Check if the population size and the problem match the requirements of the algorithm. Asserts if these checks + /// fail. + /// \arg Prob The problem that should be optimized with this algorithm + /// \arg PopulationSize The initial size of the population that is used + virtual void check(Problem const& Prob, std::size_t PopulationSize) = 0; - virtual Population evolve(Population &pop) = 0; + /// Evolve the population across multiple iterations.
+ /// \arg Pop The initial population + /// \returns The final population after the optimization has run + virtual auto evolve(Population& Pop) -> Population = 0; }; } // namespace firestarter::optimizer diff --git a/include/firestarter/Optimizer/Algorithm/NSGA2.hpp b/include/firestarter/Optimizer/Algorithm/NSGA2.hpp index c1825f73..6b395823 100644 --- a/include/firestarter/Optimizer/Algorithm/NSGA2.hpp +++ b/include/firestarter/Optimizer/Algorithm/NSGA2.hpp @@ -21,25 +21,46 @@ #pragma once -#include +#include "firestarter/Optimizer/Algorithm.hpp" namespace firestarter::optimizer::algorithm { +/// This class implements the NSGA2 evolutionary optimization algorithm. +/// The NSGA2 algorithm, as described in "A fast and elitist multiobjective genetic algorithm: NSGA-II" +/// (https://dl.acm.org/doi/10.1109/4235.996017), is a multiobjective algorithm allowing FIRESTARTER to optimize with +/// two (or more) metrics. This is relevant because adding the IPC (instruction per cycle) metric supports the +/// optimization algorithm to converge towards higher power consumption. class NSGA2 : public Algorithm { public: - NSGA2(unsigned gen, double cr, double m); - ~NSGA2() {} + /// Initialize the NSGA2 algorithm. + /// \arg Gen The number of generation that the algorithm uses to evolve its population. + /// \arg Cr The Crossover probability. Must be in range [0,1[ + /// \arg M Mutation probability. Must be in range [0,1] + NSGA2(unsigned Gen, double Cr, double M); + ~NSGA2() override = default; - void checkPopulation(firestarter::optimizer::Population const &pop, - std::size_t populationSize) override; + /// Check if the problem and population size matches the requirements of NSGA2. We must have a multi-objective problem + /// and at least 5 and a multiple of 4 individuals in our population. 
+ /// \arg Prob The problem that should be optimized with this algorithm + /// \arg PopulationSize The initial size of the population that is used + void check(firestarter::optimizer::Problem const& Prob, std::size_t PopulationSize) override; - firestarter::optimizer::Population - evolve(firestarter::optimizer::Population &pop) override; + /// Evolve the population across multiple iterations. + /// \arg Pop The initial population + /// \returns The final population after the optimization has run + auto evolve(firestarter::optimizer::Population& Pop) -> firestarter::optimizer::Population override; private: - unsigned _gen; - double _cr; - double _m; + // NOLINTBEGIN(cppcoreguidelines-avoid-const-or-ref-data-members) + + /// The number of generations of the NSGA2 algorithm. + const unsigned Gen; + /// The crossover probability in the range [0,1[. + const double Cr; + /// The mutation probability in the range [0,1]. + const double M; + + // NOLINTEND(cppcoreguidelines-avoid-const-or-ref-data-members) }; } // namespace firestarter::optimizer::algorithm diff --git a/include/firestarter/Optimizer/History.hpp b/include/firestarter/Optimizer/History.hpp index 9dec066d..10d635c1 100644 --- a/include/firestarter/Optimizer/History.hpp +++ b/include/firestarter/Optimizer/History.hpp @@ -21,10 +21,11 @@ #pragma once -#include -#include -#include -#include +#include "firestarter/Json/Summary.hpp" // IWYU pragma: keep +#include "firestarter/Logging/Log.hpp" +#include "firestarter/Measurement/Summary.hpp" +#include "firestarter/Optimizer/Individual.hpp" +#include "firestarter/WindowsCompat.hpp" // IWYU pragma: keep #include #include @@ -32,291 +33,313 @@ #include #include #include -#include +#include #include #include -#include #include -extern "C" { -#include -} - namespace firestarter::optimizer { +/// Singleton that handles keeping track of the history of evaluated individuals and their associated metric summaries.
struct History { private: - // https://stackoverflow.com/questions/17074324/how-can-i-sort-two-vectors-in-the-same-way-with-criteria-that-uses-only-one-of/17074810#17074810 - template - inline static std::vector - sortPermutation(const std::vector &vec, Compare &compare) { - std::vector p(vec.size()); - std::iota(p.begin(), p.end(), 0); - std::sort(p.begin(), p.end(), [&](std::size_t i, std::size_t j) { - return compare(vec[i], vec[j]); - }); - return p; + /// Find the permuation of a vector when sorting it with a supplied comparison function. + /// \tparam T The type of the vector elements + /// \tparam CompareT The type of the comparison function. + /// \arg Vec The const reference to vector that will be sorted. + /// \arg Compare The comparision function which will be used to sort the vector. + /// \returns The indices of how the vector would be sorted according to the comparison function. + template + static auto sortPermutation(const std::vector& Vec, CompareT& Compare) -> std::vector { + // https://stackoverflow.com/questions/17074324/how-can-i-sort-two-vectors-in-the-same-way-with-criteria-that-uses-only-one-of/17074810#17074810 + std::vector P(Vec.size()); + std::iota(P.begin(), P.end(), 0); + std::sort(P.begin(), P.end(), [&](std::size_t I, std::size_t J) { return Compare(Vec[I], Vec[J]); }); + return P; } - inline static void padding(std::stringstream &ss, std::size_t width, - std::size_t taken, char c) { - for (std::size_t i = 0; i < (std::max)(width, taken) - taken; ++i) { - ss << c; + /// Add padding to a stingstream to fill it up to a maximum width. + /// \arg Ss The stringstream to add padding to. + /// \arg Width The maximum width until which should be padded. + /// \arg Taken The number of characters that are already filled up. + /// \arg C The character that should be used for padding. 
+ static void padding(std::stringstream& Ss, std::size_t Width, std::size_t Taken, char C) { + for (std::size_t I = 0; I < (std::max)(Width, Taken) - Taken; ++I) { + Ss << C; } } - inline static int MAX_ELEMENT_PRINT_COUNT = 20; - inline static std::size_t MIN_COLUMN_WIDTH = 10; + /// The maximum number of elements that will be printed. + static constexpr const int MaxElementPrintCount = 20; + /// The minimum width of columns that are printed. + static constexpr const std::size_t MinColumnWidth = 10; - inline static std::vector _x = {}; - inline static std::vector< - std::map> - _f = {}; + // NOLINTBEGIN(cppcoreguidelines-avoid-non-const-global-variables) + /// The vector of individuals that have been evaluated. This vector has the same size as F. + inline static std::vector X = {}; + /// The vector of metric summaries associated to the evaluated individuals. This vector has the same size as X. + inline static std::vector> F = {}; + // NOLINTEND(cppcoreguidelines-avoid-non-const-global-variables) public: - inline static void append( - std::vector const &ind, - std::map const &metric) { - _x.push_back(ind); - _f.push_back(metric); + /// Append an evaluated individual to the history. + /// \arg Ind The individual to add. + /// \arg Metric The metric summaries for this individual. + static void append(std::vector const& Ind, + std::map const& Metric) { + X.push_back(Ind); + F.push_back(Metric); } - inline static std::optional< - std::map> - find(std::vector const &individual) { - auto findEqual = [individual](auto const &ind) { - return ind == individual; - }; - auto ind = std::find_if(_x.begin(), _x.end(), findEqual); - if (ind == _x.end()) { + /// Look up an individual in the history and return the metric summaries if it is in the history. + /// \arg Individual The individual which may already be evaluated. + /// \returns The metric summaries if the individual is in the history or std::nullopt otherwise.
+ static auto find(std::vector const& Individual) + -> std::optional> { + auto FindEqual = [&Individual](auto const& Ind) { return Ind == Individual; }; + auto Ind = std::find_if(X.begin(), X.end(), FindEqual); + if (Ind == X.end()) { return {}; } - auto dist = std::distance(_x.begin(), ind); - return _f[dist]; + auto Dist = std::distance(X.begin(), Ind); + return F[Dist]; } - inline static void - printBest(std::vector const &optimizationMetrics, - std::vector const &payloadItems) { - // TODO: print paretto front + /// Print the best individuals per metric. This will print a table with the average metric value and indiviudals per + /// metric. + /// \arg OptimizationMetrics The metrics for which the best individual should be printed. + /// \arg PayloadItems The instruction of the associated instruction groups used in the optimization. + static void printBest(std::vector const& OptimizationMetrics, + std::vector const& PayloadItems) { + // TODO(Issue #76): print paretto front // print the best 20 individuals for each metric in a format // where the user can give it to --run-instruction-groups directly - std::map columnWidth; + std::map ColumnWidth; - for (auto const &metric : optimizationMetrics) { - columnWidth[metric] = (std::max)(metric.size(), MIN_COLUMN_WIDTH); - firestarter::log::trace() << metric << ": " << columnWidth[metric]; + for (auto const& Metric : OptimizationMetrics) { + ColumnWidth[Metric] = (std::max)(Metric.size(), MinColumnWidth); + firestarter::log::trace() << Metric << ": " << ColumnWidth[Metric]; } - for (auto const &metric : optimizationMetrics) { - using SummaryMap = - std::map; - auto compareIndividual = [&metric](SummaryMap const &mapA, - SummaryMap const &mapB) { - auto summaryA = mapA.find(metric); - auto summaryB = mapB.find(metric); - - if (summaryA == mapA.end() || summaryB == mapB.end()) { - summaryA = mapA.find(metric.substr(1)); - summaryB = mapB.find(metric.substr(1)); - assert(summaryA != mapA.end()); - assert(summaryB != 
mapB.end()); - return summaryA->second.average < summaryB->second.average; + for (auto const& Metric : OptimizationMetrics) { + using SummaryMap = std::map; + auto CompareIndividual = [&Metric](SummaryMap const& MapA, SummaryMap const& MapB) { + auto SummaryA = MapA.find(Metric); + auto SummaryB = MapB.find(Metric); + + if (SummaryA == MapA.end() || SummaryB == MapB.end()) { + SummaryA = MapA.find(Metric.substr(1)); + SummaryB = MapB.find(Metric.substr(1)); + assert(SummaryA != MapA.end()); + assert(SummaryB != MapB.end()); + return SummaryA->second.Average < SummaryB->second.Average; } - assert(summaryA != mapA.end()); - assert(summaryB != mapB.end()); - return summaryA->second.average > summaryB->second.average; + assert(SummaryA != MapA.end()); + assert(SummaryB != MapB.end()); + return SummaryA->second.Average > SummaryB->second.Average; }; - auto perm = sortPermutation(_f, compareIndividual); + auto Perm = sortPermutation(F, CompareIndividual); - auto formatIndividual = - [&payloadItems](std::vector const &individual) { - std::string result = ""; - assert(payloadItems.size() == individual.size()); + auto FormatIndividual = [&PayloadItems](std::vector const& Individual) { + std::string Result; + assert(PayloadItems.size() == Individual.size()); - for (std::size_t i = 0; i < individual.size(); ++i) { - // skip zero values - if (individual[i] == 0) { - continue; - } + for (std::size_t I = 0; I < Individual.size(); ++I) { + // skip zero values + if (Individual[I] == 0) { + continue; + } - if (result.size() != 0) { - result += ","; - } - result += payloadItems[i] + ":" + std::to_string(individual[i]); - } + if (!Result.empty()) { + Result += ","; + } + Result += PayloadItems[I] + ":" + std::to_string(Individual[I]); + } - return result; - }; + return Result; + }; - auto begin = perm.begin(); - auto end = perm.end(); + auto Begin = Perm.begin(); + auto End = Perm.end(); - // stop printing at a max of MAX_ELEMENT_PRINT_COUNT - if (std::distance(begin, end) > 
MAX_ELEMENT_PRINT_COUNT) { - end = perm.begin(); - std::advance(end, MAX_ELEMENT_PRINT_COUNT); + // stop printing at a max of MaxElementPrintCount + if (std::distance(Begin, End) > MaxElementPrintCount) { + End = Perm.begin(); + std::advance(End, MaxElementPrintCount); } // print each of the best elements - std::size_t max = 0; - for (auto it = begin; it != end; ++it) { - max = (std::max)(max, formatIndividual(_x[*it]).size()); + std::size_t Max = 0; + for (auto It = Begin; It != End; ++It) { + Max = (std::max)(Max, FormatIndividual(X[*It]).size()); } - std::stringstream firstLine; - std::stringstream secondLine; - std::string ind = "INDIVIDUAL"; + std::stringstream FirstLine; + std::stringstream SecondLine; + std::string const Ind = "INDIVIDUAL"; - firstLine << " " << ind; - padding(firstLine, max, ind.size(), ' '); + FirstLine << " " << Ind; + padding(FirstLine, Max, Ind.size(), ' '); - secondLine << " "; - padding(secondLine, (std::max)(max, ind.size()), 0, '-'); + SecondLine << " "; + padding(SecondLine, (std::max)(Max, Ind.size()), 0, '-'); - for (auto const &metric : optimizationMetrics) { - auto width = columnWidth[metric]; + for (auto const& Metric : OptimizationMetrics) { + auto Width = ColumnWidth[Metric]; - firstLine << " | "; - secondLine << "---"; + FirstLine << " | "; + SecondLine << "---"; - firstLine << metric; - padding(firstLine, width, metric.size(), ' '); - padding(secondLine, width, 0, '-'); + FirstLine << Metric; + padding(FirstLine, Width, Metric.size(), ' '); + padding(SecondLine, Width, 0, '-'); } - std::stringstream ss; + std::stringstream Ss; - ss << "\n Best individuals sorted by metric " << metric << " " - << ((metric[0] == '-') ? "ascending" : "descending") << ":\n" - << firstLine.str() << "\n" - << secondLine.str() << "\n"; + Ss << "\n Best individuals sorted by metric " << Metric << " " + << ((Metric[0] == '-') ? 
"ascending" : "descending") << ":\n" + << FirstLine.str() << "\n" + << SecondLine.str() << "\n"; // print INDIVIDUAL | metric 1 | metric 2 | ... | metric N - for (auto it = begin; it != end; ++it) { - auto const fitness = _f[*it]; - auto const ind = formatIndividual(_x[*it]); + for (auto It = Begin; It != End; ++It) { + auto const& Fitness = F[*It]; + auto const Ind = FormatIndividual(X[*It]); - ss << " " << ind; - padding(ss, max, ind.size(), ' '); + Ss << " " << Ind; + padding(Ss, Max, Ind.size(), ' '); - for (auto const &metric : optimizationMetrics) { - auto width = columnWidth[metric]; - std::string value; + for (auto const& Metric : OptimizationMetrics) { + auto Width = ColumnWidth[Metric]; + std::string Value; - auto fitnessOfMetric = fitness.find(metric); - auto invertedMetric = metric.substr(1); - auto fitnessOfInvertedMetric = fitness.find(invertedMetric); + auto FitnessOfMetric = Fitness.find(Metric); + auto InvertedMetric = Metric.substr(1); + auto FitnessOfInvertedMetric = Fitness.find(InvertedMetric); - if (fitnessOfMetric != fitness.end()) { - value = std::to_string(fitnessOfMetric->second.average); - } else if (fitnessOfInvertedMetric != fitness.end()) { - value = std::to_string(fitnessOfInvertedMetric->second.average); + if (FitnessOfMetric != Fitness.end()) { + Value = std::to_string(FitnessOfMetric->second.Average); + } else if (FitnessOfInvertedMetric != Fitness.end()) { + Value = std::to_string(FitnessOfInvertedMetric->second.Average); } else { assert(false); } - ss << " | " << value; - padding(ss, width, value.size(), ' '); + Ss << " | " << Value; + padding(Ss, Width, Value.size(), ' '); } - ss << "\n"; + Ss << "\n"; } - ss << "\n"; + Ss << "\n"; - firestarter::log::info() << ss.str(); + firestarter::log::info() << Ss.str(); } - firestarter::log::info() - << "To run FIRESTARTER with the best individual of a given metric " - "use the command line argument " - "`--run-instruction-groups=INDIVIDUAL`"; + firestarter::log::info() << "To run 
FIRESTARTER with the best individual of a given metric " + "use the command line argument " + "`--run-instruction-groups=INDIVIDUAL`"; } - inline static void save(std::string const &path, std::string const &startTime, - std::vector const &payloadItems, - const int argc, const char **argv) { + /// Save the history to a file. This function is not threadsafe as is calls History::getTime. + /// \arg Path The folder in which the outfile shall be created. If it is empty the current directory name or /tmp will + /// be choosen. + /// \arg StartTime The start time as a string which is saved in the json datastructure. + /// \arg PayloadItems The Vector of meta instructions which map to the vector of individuals. + /// \arg Argc The Argc of the executed programm. + /// \arg Argv The Argv of the executed programm. + static void save(std::string const& Path, std::string const& StartTime, std::vector const& PayloadItems, + const int Argc, const char** Argv) { using json = nlohmann::json; - json j = json::object(); + json J = json::object(); - j["individuals"] = json::array(); - for (auto const &ind : _x) { - j["individuals"].push_back(ind); + J["individuals"] = json::array(); + for (auto const& Ind : X) { + J["individuals"].push_back(Ind); } - j["metrics"] = json::array(); - for (auto const &eval : _f) { - j["metrics"].push_back(eval); + J["metrics"] = json::array(); + for (auto const& Eval : F) { + J["metrics"].push_back(Eval); } + // Initialize a string with length of 256 filled with null characters + auto Hostname = std::string(256, 0); // get the hostname - char cHostname[256]; - std::string hostname; - if (0 != gethostname(cHostname, sizeof(cHostname))) { - hostname = "unknown"; - } else { - hostname = cHostname; + if (0 != gethostname(Hostname.data(), Hostname.size())) { + Hostname = "unknown"; + } + + // Strip away any remaining null terminators + if (const auto Pos = Hostname.find('\0'); Pos != std::string::npos) { + Hostname.erase(Pos); } - j["hostname"] = hostname; + 
J["hostname"] = Hostname; - j["startTime"] = startTime; - j["endTime"] = getTime(); + J["startTime"] = StartTime; + J["endTime"] = getTime(); // save the payload items - j["payloadItems"] = json::array(); - for (auto const &item : payloadItems) { - j["payloadItems"].push_back(item); + J["payloadItems"] = json::array(); + for (auto const& Item : PayloadItems) { + J["payloadItems"].push_back(Item); } // save the arguments - j["args"] = json::array(); - for (int i = 0; i < argc; ++i) { - j["args"].push_back(argv[i]); + J["args"] = json::array(); + for (int I = 0; I < Argc; ++I) { + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + J["args"].push_back(Argv[I]); } // dump the output - std::string s = j.dump(); + const auto S = J.dump(); - firestarter::log::trace() << s; + firestarter::log::trace() << S; - std::string outpath = path; - if (outpath.empty()) { - char *pwd = get_current_dir_name(); - if (pwd) { - outpath = pwd; - free(pwd); + std::string Outpath = Path; + if (Outpath.empty()) { + // Wrap get_current_dir_name in a unique ptr, as it needs to get deleted by free when it is not used anymore. 
+ const std::unique_ptr WrappedPwd = {get_current_dir_name(), free}; + if (WrappedPwd) { + // Get the pointer captured in the WrappedPwd (not only the first char as would be with *WrappedPwd) + Outpath = WrappedPwd.get(); } else { firestarter::log::warn() << "Could not find $PWD."; - outpath = "/tmp"; + Outpath = "/tmp"; } - outpath += "/" + hostname + "_" + startTime + ".json"; + Outpath += "/" + Hostname + "_" + StartTime + ".json"; } - firestarter::log::info() << "\nDumping output json in " << outpath; + firestarter::log::info() << "\nDumping output json in " << Outpath; - std::ofstream fp(outpath); + std::ofstream Fp(Outpath); - if (fp.bad()) { - firestarter::log::error() << "Could not open " << outpath; + if (Fp.bad()) { + firestarter::log::error() << "Could not open " << Outpath; return; } - fp << s; + Fp << S; - fp.close(); + Fp.close(); } - inline static std::string getTime() { - auto t = std::time(nullptr); - auto tm = *std::localtime(&t); - std::stringstream ss; - ss << std::put_time(&tm, "%F_%T%z"); - return ss.str(); + /// Get the current time in the local timezone as a string formatted by "%F_%T%z". This function is NOT threadsafe. + /// \returns The current time in local timezone as a formatted string. 
+ static auto getTime() -> std::string { + const auto T = std::time(nullptr); + // NOLINTNEXTLINE(concurrency-mt-unsafe) + const auto* Tm = std::localtime(&T); + std::stringstream Ss; + Ss << std::put_time(Tm, "%F_%T%z"); + return Ss.str(); } }; } // namespace firestarter::optimizer diff --git a/include/firestarter/Optimizer/OptimizerWorker.hpp b/include/firestarter/Optimizer/OptimizerWorker.hpp index 90eb80a5..17293ad3 100644 --- a/include/firestarter/Optimizer/OptimizerWorker.hpp +++ b/include/firestarter/Optimizer/OptimizerWorker.hpp @@ -19,42 +19,52 @@ * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ -#include -#include +#include "firestarter/Optimizer/Algorithm.hpp" +#include "firestarter/Optimizer/Population.hpp" +#include "firestarter/WindowsCompat.hpp" // IWYU pragma: keep #include #include -extern "C" { -#include -} - namespace firestarter::optimizer { +/// Class to run the optimization in another thread. class OptimizerWorker { public: - OptimizerWorker( - std::unique_ptr &&algorithm, - firestarter::optimizer::Population &population, - std::string const &optimizationAlgorithm, unsigned individuals, - std::chrono::seconds const &preheat); + /// Start the optimization in another thread. + /// \arg Algorithm The algorithm that is used to optimize FIRESTARTER. + /// \arg Population The population containing the problem that will be used to optimize FIRESTARTER. + /// \arg Individuals The number of individuals for the intial population. + /// \arg Preheat The time we preheat before starting the optimization. + OptimizerWorker(std::unique_ptr&& Algorithm, + std::unique_ptr&& Population, unsigned Individuals, + std::chrono::seconds const& Preheat); - ~OptimizerWorker() {} + ~OptimizerWorker() = default; - void join(); + /// Join the optimization thread. + void join() const; - void kill(); + /// Kill the optimization thread. 
+ void kill() const; private: - static void *optimizerThread(void *optimizerWorker); + /// The thread worker that does the optimization. + /// \arg OptimizerWorker The pointer to the OptimizerWorker (this) datastructure. + /// \returns a nullptr + static auto optimizerThread(void* OptimizerWorker) -> void*; - std::unique_ptr _algorithm; - firestarter::optimizer::Population _population; - std::string _optimizationAlgorithm; - unsigned _individuals; - std::chrono::seconds _preheat; + /// The algorithm that is used to optimize FIRESTARTER. + std::unique_ptr Algorithm; + /// The population containing the problem that will be used to optimize FIRESTARTER. + std::unique_ptr Population; + /// The number of individuals for the intial population. + unsigned Individuals; + /// The time we preheat before starting the optimization. + std::chrono::seconds Preheat; - pthread_t workerThread; + /// The pthread that is used for the optimization. + pthread_t WorkerThread{}; }; } // namespace firestarter::optimizer diff --git a/include/firestarter/Optimizer/Population.hpp b/include/firestarter/Optimizer/Population.hpp index b02f451d..ac857e30 100644 --- a/include/firestarter/Optimizer/Population.hpp +++ b/include/firestarter/Optimizer/Population.hpp @@ -19,80 +19,74 @@ * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ -#ifndef FIRESTARTER_OPTIMIZER_POPULATION_HPP -#define FIRESTARTER_OPTIMIZER_POPULATION_HPP +#pragma once -#include -#include -#include +#include "firestarter/Optimizer/Individual.hpp" +#include "firestarter/Optimizer/Problem.hpp" #include #include -#include -#include -#include #include namespace firestarter::optimizer { +/// This class models the notion of a population used by the NSGA2 algorithm that contains a number of individuals with +/// their associated fitness. class Population { public: - // Construct a population from a problem. 
- Population() = default; + Population() = delete; - Population(std::shared_ptr &&problem) - : _problem(std::move(problem)), gen(rd()) {} + /// Construct a population from a problem. + explicit Population(std::shared_ptr&& ProblemPtr) + : ProblemPtr(std::move(ProblemPtr)) {} - Population(Population &pop) - : _problem(pop._problem), _x(pop._x), _f(pop._f), gen(rd()) {} + ~Population() = default; - Population &operator=(Population const &pop) { - _problem = std::move(pop._problem); - _x = pop._x; - _f = pop._f; - gen = pop.gen; + /// Generate a supplied number of individuals and save them with their fitness in this datastructure. If the number is + /// less then the number of dimensions we fill them with random individuals. If it is at least the number of + /// dimension, we first create individuals with one dimension equal to one and the rest equal to zero. + /// \arg PopulationSize The number of individuals to generate. + void generateInitialPopulation(std::size_t PopulationSize); - return *this; - } + /// The number of individuals in this population. + [[nodiscard]] auto size() const -> std::size_t; - ~Population() {} + /// Append one individual to the population. If a lookup of the fitness in the history is no successful, the + /// individual will be evaluated and the fitness saved. + /// \arg Ind The individual to be added to the population. + void append(Individual const& Ind); - void generateInitialPopulation(std::size_t populationSize = 0); + /// Insert an indiviudal and an associated fitness at a specific index in the population. + /// \arg Idx On which index to insert in the population. + /// \arg Ind The individual to insert. + /// \arg Fit The fitness to insert. + void insert(std::size_t Idx, Individual const& Ind, std::vector const& Fit); - std::size_t size() const; + /// Generate a random individual inside the bounds of the problem based on a non-determenistic generator. + /// \returns The random individual inside the bounds of the problem. 
+ [[nodiscard]] auto getRandomIndividual() const -> Individual; - // add one individual to the population. fitness will be evaluated. - void append(Individual const &ind); + /// Const reference to the optimization problem. + [[nodiscard]] auto problem() const -> Problem const& { return *ProblemPtr; } - void insert(std::size_t idx, Individual const &ind, - std::vector const &fit); - - // get a random individual inside bounds of problem - Individual getRandomIndividual(); - - // returns the best individual in case of single-objective. - // return nothing in case of mutli-objective. - std::optional bestIndividual() const; - - Problem const &problem() const { return *_problem; } - - std::vector const &x() const { return _x; } - std::vector> const &f() const { return _f; } + /// Const reference to the vector of individuals. + [[nodiscard]] auto x() const -> std::vector const& { return X; } + /// Const reference to the vector of fitnesses. + [[nodiscard]] auto f() const -> std::vector> const& { return F; } private: - // add one individual to the population with a fitness. - void append(Individual const &ind, std::vector const &fit); - - // our problem. - std::shared_ptr _problem; - - std::vector _x; - std::vector> _f; - - std::random_device rd; - std::mt19937 gen; + /// Append one individual with a given fitness to the population. + /// \arg Ind The individual to be appended to the population. + /// \arg Fit The fitness of the individual. 
+ void append(Individual const& Ind, std::vector const& Fit); + + /// The optimization problem + std::shared_ptr ProblemPtr; + + /// The vector of individuals + std::vector X; + /// The vector of fitnesses associated to each individual + std::vector> F; }; -} // namespace firestarter::optimizer - -#endif +} // namespace firestarter::optimizer \ No newline at end of file diff --git a/include/firestarter/Optimizer/Problem.hpp b/include/firestarter/Optimizer/Problem.hpp index f88b0bc3..bee3fdbb 100644 --- a/include/firestarter/Optimizer/Problem.hpp +++ b/include/firestarter/Optimizer/Problem.hpp @@ -21,8 +21,8 @@ #pragma once -#include -#include +#include "firestarter/Measurement/Summary.hpp" +#include "firestarter/Optimizer/Individual.hpp" #include #include @@ -31,37 +31,50 @@ namespace firestarter::optimizer { +/// This class models the abstract problem which should be optimized. It provides the methods to evaluate an individual +/// and calculate its fitness. class Problem { + /// The number of metric evaluations + uint64_t Fevals = 0; + public: - Problem() : _fevals(0) {} - virtual ~Problem() {} + Problem() = default; + virtual ~Problem() = default; - // return the fitness for an individual - virtual std::map - metrics(Individual const &individual) = 0; + /// Perform an evaluation of the supplied individual. This returns a map from the metric name to their respective + /// summary. This function will increment the fevals. + /// \arg Individual The individual that should be evaluated. + /// \returns A map from metric name to the summary of this metric for the specific individual + virtual auto metrics(Individual const& Individual) -> std::map = 0; - virtual std::vector - fitness(std::map const - &summaries) = 0; + /// Convert the result of one evaluation into a fitness (vector of doubles) for the supplied summaries + /// \arg Summaries The summaries of one evaluation. + /// \returns The fitness vector derived from the summaries. 
The size of this vector is equal to the number of + /// objectives. + [[nodiscard]] virtual auto fitness(std::map const& Summaries) const + -> std::vector = 0; - // get the bounds of the problem - virtual std::vector> getBounds() const = 0; + /// Get the bounds of the problem. For each dimension a min and max value is supplied. + /// \return The min and max bound per dimension. + [[nodiscard]] virtual auto getBounds() const -> std::vector> = 0; - // get the number of dimensions of the problem - std::size_t getDims() const { return this->getBounds().size(); }; + /// Get the number of dimensions of the problem. + /// \returns The number of dimensions. + [[nodiscard]] auto getDims() const -> std::size_t { return this->getBounds().size(); }; - // get the number of objectives. - virtual std::size_t getNobjs() const = 0; + /// Get the number of optimization objectives for this problem. + /// \returns The number of objectives. + [[nodiscard]] virtual auto getNobjs() const -> std::size_t = 0; - // is the problem multiobjective - bool isMO() const { return this->getNobjs() > 1; }; + /// Check if the problem is a multi-objective one. + [[nodiscard]] auto isMO() const -> bool { return this->getNobjs() > 1; }; - // get the number of fitness evaluations - unsigned long long getFevals() const { return _fevals; }; + /// Get the number of evaluations. + [[nodiscard]] auto getFevals() const -> uint64_t { return Fevals; }; protected: - // number of fitness evaluations - unsigned long long _fevals; + /// Increment the number of evaluations.
+ void incrementFevals() { Fevals++; }; }; } // namespace firestarter::optimizer diff --git a/include/firestarter/Optimizer/Problem/CLIArgumentProblem.hpp b/include/firestarter/Optimizer/Problem/CLIArgumentProblem.hpp index 1ca0de58..4335a4f9 100644 --- a/include/firestarter/Optimizer/Problem/CLIArgumentProblem.hpp +++ b/include/firestarter/Optimizer/Problem/CLIArgumentProblem.hpp @@ -21,125 +21,148 @@ #pragma once -#include +#include "firestarter/Measurement/MeasurementWorker.hpp" +#include "firestarter/Optimizer/Problem.hpp" #include -#include -#include #include -#include #include #include #include namespace firestarter::optimizer::problem { +/// This class models the problem of optimizing firestarter on the fly. The evaluation of metrics is done by switching +/// the settings of the high load routine and measuring the metric in the specified runtime. class CLIArgumentProblem final : public firestarter::optimizer::Problem { +private: + /// The function which takes instruction groups and switches the payload in the high load function to the supplied + /// ones. + std::function> const&)> ChangePayloadFunction; + /// The shared pointer to the measurement infrastructure which will be used to get metric values. + std::shared_ptr MeasurementWorker; + /// The metrics that are used in the optimization. They may have a dash at the start to allow them to be changed from + /// maximization to minimization. + std::vector Metrics; + /// The duration of the measurement. + std::chrono::seconds Timeout; + /// The time to skip from the measurement start + std::chrono::milliseconds StartDelta; + /// The time to skip from the measurement stop + std::chrono::milliseconds StopDelta; + /// The vector of instruction that is used in the optimization for the payload. 
+ std::vector InstructionGroups; public: - CLIArgumentProblem( - std::function> const &)> - &&changePayloadFunction, - std::shared_ptr const - &measurementWorker, - std::vector const &metrics, std::chrono::seconds timeout, - std::chrono::milliseconds startDelta, std::chrono::milliseconds stopDelta, - std::vector const &instructionGroups) - : _changePayloadFunction(changePayloadFunction), - _measurementWorker(measurementWorker), _metrics(metrics), - _timeout(timeout), _startDelta(startDelta), _stopDelta(stopDelta), - _instructionGroups(instructionGroups) { - assert(_metrics.size() != 0); + /// Constructor for the problem of optimizing firestarter on the fly. + /// \arg ChangePayloadFunction The function which takes instruction groups and switches the payload in the high load + /// function to the supplied ones. + /// \arg MeasurementWorker The shared pointer to the measurement infrastructure which will be used to get metric + /// values + /// \arg Metrics The metrics that are used in the optimization. They may have a dash at the start to allow them to be + /// changed from maximization to minimization. + /// \arg Timeout The duration of the measurement. + /// \arg StartDelta The time to skip from the measurement start + /// \arg StopDelta The time to skip from the measurement stop + /// \arg InstructionGroups The vector of instruction that is used in the optimization for the payload. 
+ CLIArgumentProblem(std::function> const&)>&& ChangePayloadFunction, + std::shared_ptr MeasurementWorker, + std::vector const& Metrics, std::chrono::seconds Timeout, + std::chrono::milliseconds StartDelta, std::chrono::milliseconds StopDelta, + std::vector InstructionGroups) + : ChangePayloadFunction(std::move(ChangePayloadFunction)) + , MeasurementWorker(std::move(MeasurementWorker)) + , Metrics(Metrics) + , Timeout(Timeout) + , StartDelta(StartDelta) + , StopDelta(StopDelta) + , InstructionGroups(std::move(InstructionGroups)) { + assert(!Metrics.empty()); } - ~CLIArgumentProblem() {} + ~CLIArgumentProblem() override = default; - // return all available metrics for the individual - std::map - metrics(std::vector const &individual) override { + /// Evaluate the given individual by switching the current payload, doing the measurement and returning the results. + /// \arg Individual The individual that should be measured. + /// \returns The map from all metrics to their respective summaries for the measured individual.
+ auto metrics(std::vector const& Individual) + -> std::map override { // increment evaluation idx - _fevals++; + incrementFevals(); // change the payload - assert(_instructionGroups.size() == individual.size()); - std::vector> payload = {}; - auto it1 = _instructionGroups.begin(); - auto it2 = individual.begin(); - for (; it1 != _instructionGroups.end(); ++it1, ++it2) { - payload.push_back(std::make_pair(*it1, *it2)); + assert(InstructionGroups.size() == Individual.size()); + std::vector> Payload = {}; + auto It1 = InstructionGroups.begin(); + auto It2 = Individual.begin(); + for (; It1 != InstructionGroups.end(); ++It1, ++It2) { + Payload.emplace_back(*It1, *It2); } - _changePayloadFunction(payload); + ChangePayloadFunction(Payload); // start the measurement - // NOTE: starting the measurement must happen after switching to not mess up - // ipc-estimate metric - _measurementWorker->startMeasurement(); + // NOTE: starting the measurement must happen after switching to not + // mess up ipc-estimate metric + MeasurementWorker->startMeasurement(); // wait for the measurement to finish - std::this_thread::sleep_for(_timeout); + std::this_thread::sleep_for(Timeout); - // FIXME: this is an ugly workaround for the ipc-estimate metric - // changeing the payload triggers a write of the iteration counter of the - // last payload, which we use to estimate the ipc. - _changePayloadFunction(payload); + // TODO(Issue #82): This is an ugly workaround for the ipc-estimate metric. + // Changing the payload triggers a write of the iteration counter of + // the last payload, which we use to estimate the ipc. 
+ ChangePayloadFunction(Payload); // return the results - return _measurementWorker->getValues(_startDelta, _stopDelta); + return MeasurementWorker->getValues(StartDelta, StopDelta); } - std::vector fitness( - std::map const &summaries) - override { - std::vector values = {}; - - for (auto const &metricName : _metrics) { - auto findName = [metricName](auto const &summary) { - auto invertedName = "-" + summary.first; - return metricName.compare(summary.first) == 0 || - metricName.compare(invertedName) == 0; + /// Calculate the fitness based on the metric summaries of an individual. This will select the metrics that are + /// required for the optimization, round them and potentially invert the results if the optimization metric name + /// starts with a dash ('-'). + /// \arg Summaries The metric values for all metrics for an individual + /// \return The vector containing the fitness for that metrics that are used in the optimization. + [[nodiscard]] auto fitness(std::map const& Summaries) const + -> std::vector override { + std::vector Values = {}; + + for (auto const& MetricName : Metrics) { + auto FindName = [MetricName](auto const& Summary) { + auto InvertedName = "-" + Summary.first; + return MetricName == Summary.first || MetricName == InvertedName; }; - auto it = std::find_if(summaries.begin(), summaries.end(), findName); + auto It = std::find_if(Summaries.begin(), Summaries.end(), FindName); - if (it == summaries.end()) { + if (It == Summaries.end()) { continue; } // round to two decimal places after the comma - auto value = std::round(it->second.average * 100.0) / 100.0; + auto Value = std::round(It->second.Average * 100.0) / 100.0; // invert metric - if (metricName[0] == '-') { - value *= -1.0; + if (MetricName[0] == '-') { + Value *= -1.0; } - values.push_back(value); + Values.push_back(Value); } - return values; + return Values; } - // get the bounds of the problem - std::vector> getBounds() const override { - std::vector> vec( - _instructionGroups.size(), 
std::make_tuple(0, 100)); + /// Get the bounds of the problem. We currently set these bounds fix to a range from 0 to 100 for every instruction. + /// \returns A vector the size of the number of instruction groups containing a tuple(0, 100). + [[nodiscard]] auto getBounds() const -> std::vector> override { + std::vector> Vec(InstructionGroups.size(), + std::make_tuple(0, 100)); - return vec; + return Vec; } - // get the number of objectives. - std::size_t getNobjs() const override { return _metrics.size(); } - -private: - std::function> const &)> - _changePayloadFunction; - std::shared_ptr - _measurementWorker; - std::vector _metrics; - std::chrono::seconds _timeout; - std::chrono::milliseconds _startDelta; - std::chrono::milliseconds _stopDelta; - std::vector _instructionGroups; + /// Get the number of optimization objectives. + [[nodiscard]] auto getNobjs() const -> std::size_t override { return Metrics.size(); } }; } // namespace firestarter::optimizer::problem diff --git a/include/firestarter/Optimizer/Util/MultiObjective.hpp b/include/firestarter/Optimizer/Util/MultiObjective.hpp index 00701bfd..049d7be3 100644 --- a/include/firestarter/Optimizer/Util/MultiObjective.hpp +++ b/include/firestarter/Optimizer/Util/MultiObjective.hpp @@ -21,7 +21,7 @@ #pragma once -#include +#include "firestarter/Optimizer/Individual.hpp" #include #include @@ -29,41 +29,31 @@ namespace firestarter::optimizer::util { -bool less_than_f(double a, double b); +auto lessThanF(double A, double B) -> bool; -bool greater_than_f(double a, double b); +auto greaterThanF(double A, double B) -> bool; -bool pareto_dominance(const std::vector &obj1, - const std::vector &obj2); +auto paretoDominance(const std::vector& Obj1, const std::vector& Obj2) -> bool; -std::tuple>, - std::vector>, std::vector, - std::vector> -fast_non_dominated_sorting(const std::vector> &points); +auto fastNonDominatedSorting(const std::vector>& Points) + -> std::tuple>, std::vector>, + std::vector, std::vector>; 
-std::vector -crowding_distance(const std::vector> &non_dom_front); +auto crowdingDistance(const std::vector>& NonDomFront) -> std::vector; -std::vector::size_type mo_tournament_selection( - std::vector::size_type idx1, std::vector::size_type idx2, - const std::vector::size_type> &non_domination_rank, - const std::vector &crowding_d, std::mt19937 &mt); +auto moTournamentSelection(std::vector::size_type Idx1, std::vector::size_type Idx2, + const std::vector::size_type>& NonDominationRank, + const std::vector& CrowdingD, std::mt19937& Mt) -> std::vector::size_type; -std::pair -sbx_crossover(const firestarter::optimizer::Individual &parent1, - const firestarter::optimizer::Individual &parent2, - const double p_cr, std::mt19937 &mt); +auto sbxCrossover(const firestarter::optimizer::Individual& Parent1, const firestarter::optimizer::Individual& Parent2, + double PCr, std::mt19937& Mt) + -> std::pair; -void polynomial_mutation( - firestarter::optimizer::Individual &child, - const std::vector> &bounds, const double p_m, - std::mt19937 &mt); +void polynomialMutation(firestarter::optimizer::Individual& Child, + const std::vector>& Bounds, double PM, std::mt19937& Mt); -std::vector -select_best_N_mo(const std::vector> &input_f, - std::size_t N); +auto selectBestNMo(const std::vector>& InputF, std::size_t N) -> std::vector; -std::vector ideal(const std::vector> &points); +auto ideal(const std::vector>& Points) -> std::vector; } // namespace firestarter::optimizer::util diff --git a/include/firestarter/Measurement/Metric/IPCEstimate.h b/include/firestarter/SafeExit.hpp similarity index 79% rename from include/firestarter/Measurement/Metric/IPCEstimate.h rename to include/firestarter/SafeExit.hpp index 2c14bb0d..68823831 100644 --- a/include/firestarter/Measurement/Metric/IPCEstimate.h +++ b/include/firestarter/SafeExit.hpp @@ -1,6 +1,6 @@ /****************************************************************************** * FIRESTARTER - A Processor Stress Test Utility - * 
Copyright (C) 2021 TU Dresden, Center for Information Services and High + * Copyright (C) 2024 TU Dresden, Center for Information Services and High * Performance Computing * * This program is free software: you can redistribute it and/or modify @@ -21,8 +21,10 @@ #pragma once -#include +namespace firestarter { -extern metric_interface_t ipc_estimate_metric; +/// A thread safe wrapper to std::exit +/// \arg Status The status passed to std::exit +[[noreturn]] void safeExit(int Status); -extern void ipc_estimate_metric_insert(double value); +} // namespace firestarter diff --git a/include/firestarter/WindowsCompat.hpp b/include/firestarter/WindowsCompat.hpp new file mode 100644 index 00000000..11ef1329 --- /dev/null +++ b/include/firestarter/WindowsCompat.hpp @@ -0,0 +1,94 @@ +/****************************************************************************** + * FIRESTARTER - A Processor Stress Test Utility + * Copyright (C) 2024 TU Dresden, Center for Information Services and High + * Performance Computing + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Contact: daniel.hackenberg@tu-dresden.de + *****************************************************************************/ + +#pragma once + +#ifdef _MSC_VER +#include +#else + +/// Define the _mm_mfence and __cpuid function when we are not using MSC to enable the use of if constexpr instead of +/// ifdefs. 
+// NOLINTBEGIN(readability-identifier-naming,cert-dcl37-c,cert-dcl37-cpp,cert-dcl51-cpp,bugprone-reserved-identifier) +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-function" +#elif defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-function" +#endif +#if defined(__clang__) +#include +#elif not(defined(__MINGW32__) || defined(__MINGW64__)) +void _mm_mfence() noexcept; +#endif +#if not(defined(__INTEL_LLVM_COMPILER)) +void __cpuid(int* /*unused*/, int /*unused*/) noexcept; +#endif +#if defined(__clang__) +#pragma clang diagnostic pop +#elif defined(__GNUC__) +#pragma GCC diagnostic pop +#endif +// NOLINTEND(readability-identifier-naming,cert-dcl37-c,cert-dcl37-cpp,cert-dcl51-cpp,bugprone-reserved-identifier) + +#endif + +#ifdef _WIN32 +// SIGALRM is not available on Windows +#define SIGALRM 0 + +#include +static inline auto get_current_dir_name() -> char* { return _getcwd(nullptr, 0); } +#elif defined(__APPLE__) +#include +static inline auto get_current_dir_name() -> char* { return getcwd(nullptr, 0); } +#else +#include +#endif + +// correct include for gethostname +#ifdef _MSC_VER +#include +#else +// NOLINTBEGIN(readability-duplicate-include) +#include +// NOLINTEND(readability-duplicate-include) +#endif + +// Make references in header files to pthread_t compatible to MSC. This will not make them functionally work. +// We will be able to remove this hack once we transition from using pthread to std::thread +#ifdef _MSC_VER +struct Placeholder {}; +using pthread_t = Placeholder; +#else +extern "C" { +#include +} +#endif + +// Disable __asm__ __volatile__ in MSC +// Static assert wont work, since if constexpr doesn't seem to work correctly +#ifdef _MSC_VER +#define __volatile__(X, ...) 
\ + assert(false && "Attempted to use code path that uses the incorrect inline assembly macros for MSC.") +#define __asm__ +#endif \ No newline at end of file diff --git a/lib/.clang-tidy b/lib/.clang-tidy new file mode 100644 index 00000000..cf4dd00b --- /dev/null +++ b/lib/.clang-tidy @@ -0,0 +1,4 @@ +--- +# Disable all clangd checks for the lib folder + +Checks: '-*' \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6136bb35..c0355fa0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,13 +1,18 @@ SET(FIRESTARTER_FILES + firestarter/Config.cpp firestarter/Main.cpp firestarter/Firestarter.cpp firestarter/LoadWorker.cpp + firestarter/SafeExit.cpp firestarter/WatchdogWorker.cpp firestarter/DumpRegisterWorker.cpp + + firestarter/Environment/X86/Platform/X86PlatformConfig.cpp firestarter/Environment/Environment.cpp firestarter/Environment/CPUTopology.cpp - firestarter/Environment/Payload/Payload.cpp + firestarter/Environment/Payload/CompiledPayload.cpp + firestarter/Environment/Payload/PayloadSettings.cpp # here starts the x86 specific code firestarter/Environment/X86/X86Environment.cpp diff --git a/src/firestarter/Config.cpp b/src/firestarter/Config.cpp new file mode 100644 index 00000000..356580d5 --- /dev/null +++ b/src/firestarter/Config.cpp @@ -0,0 +1,392 @@ +/****************************************************************************** + * FIRESTARTER - A Processor Stress Test Utility + * Copyright (C) 2024 TU Dresden, Center for Information Services and High + * Performance Computing + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Contact: daniel.hackenberg@tu-dresden.de + *****************************************************************************/ + +#include "firestarter/Config.hpp" +#include "firestarter/Constants.hpp" +#include "firestarter/Logging/Log.hpp" + +#include + +namespace { + +void printCopyright() { + firestarter::log::info() << "This program is free software: you can redistribute it and/or " + "modify\n" + << "it under the terms of the GNU General Public License as published " + "by\n" + << "the Free Software Foundation, either version 3 of the License, or\n" + << "(at your option) any later version.\n" + << "\n" + << "You should have received a copy of the GNU General Public License\n" + << "along with this program. If not, see " + ".\n"; +} + +void printWarranty() { + firestarter::log::info() << "This program is distributed in the hope that it will be useful,\n" + << "but WITHOUT ANY WARRANTY; without even the implied warranty of\n" + << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n" + << "GNU General Public License for more details.\n" + << "\n" + << "You should have received a copy of the GNU General Public License\n" + << "along with this program. 
If not, see " + ".\n"; +} + +void printHelp(cxxopts::Options const& Parser, std::string const& Section = "") { + std::vector> Options = { + {"information", "Information Options:\n"}, + {"general", "General Options:\n"}, + {"specialized-workloads", "Specialized workloads:\n"}, +#ifdef FIRESTARTER_DEBUG_FEATURES + {"debug", "Debugging:\n"}, +#endif +#if defined(linux) || defined(__linux__) + {"measurement", "Measurement:\n"}, + {"optimization", "Optimization:\n"} +#endif + }; + + // Select the specific option if sections is no empty + if (!Section.empty()) { + // section not found + auto FindSection = [&Section](std::pair const& Pair) { return Pair.first == Section; }; + auto SectionsIt = std::find_if(Options.begin(), Options.end(), FindSection); + if (SectionsIt == Options.end()) { + throw std::invalid_argument("Section \"" + Section + "\" not found in help."); + } + Options = {*SectionsIt}; + } + + // clang-format off + firestarter::log::info() + << Parser.help(Options) + << "Examples:\n" + << " ./FIRESTARTER starts FIRESTARTER without timeout\n" + << " ./FIRESTARTER -t 300 starts a 5 minute run of FIRESTARTER\n" + << " ./FIRESTARTER -l 50 -t 600 starts a 10 minute run of FIRESTARTER with\n" + << " 50\% high load and 50\% idle time\n" + << (firestarter::OptionalFeatures.gpuEnabled() ? + " on CPUs and full load on GPUs\n" + : "") + << " ./FIRESTARTER -l 75 -p 20000000\n" + << " starts FIRESTARTER with an interval length\n" + << " of 2 sec, 1.5s high load" + << (firestarter::OptionalFeatures.gpuEnabled() ? + " on CPUs and full load on GPUs\n" + : "\n") + << (firestarter::OptionalFeatures.OptimizationEnabled ? 
+ " ./FIRESTARTER --measurement --start-delta=300000 -t 900\n" + " starts FIRESTARTER measuring all available\n" + " metrics for 15 minutes disregarding the first\n" + " 5 minutes and last two seconds (default to `--stop-delta`)\n" + " ./FIRESTARTER -t 20 --optimize=NSGA2 --optimization-metric sysfs-powercap-rapl,perf-ipc\n" + " starts FIRESTARTER optimizing with the sysfs-powercap-rapl\n" + " and perf-ipc metric. The duration is 20s long. The default\n" + " instruction groups for the current platform will be used.\n" + : "") + ; + // clang-format on +} + +} // namespace + +namespace firestarter { + +Config::Config(int Argc, const char** Argv) + : Argv(Argv) + , Argc(Argc) { + const auto* ExecutableName = *Argv; + + cxxopts::Options Parser(ExecutableName); + + const auto HelpDescription = + std::string("Display usage information. SECTION can be any of: information | general | specialized-workloads") + + (firestarter::OptionalFeatures.DebugFeatureEnabled ? " | debug" : "") + + (firestarter::OptionalFeatures.OptimizationEnabled ? "\n| measurement | optimization" : ""); + + const auto LoadDescription = + std::string("Set the percentage of high CPU load to LOAD\n(%) default: 100, valid values: 0 <= LOAD <=\n100, " + "threads will be idle in the remaining time,\nfrequency of load changes is determined by -p.") + + (firestarter::OptionalFeatures.gpuEnabled() ? " This option does NOT influence the GPU\nworkload!" 
: ""); + + // clang-format off + Parser.add_options("information") + ("h,help", HelpDescription, + cxxopts::value()->implicit_value(""), "SECTION") + ("v,version", "Display version information") + ("c,copyright", "Display copyright information") + ("w,warranty", "Display warranty information") + ("q,quiet", "Set log level to Warning") + ("r,report", "Display additional information (overridden by -q)") + ("debug", "Print debug output") + ("a,avail", "List available functions"); + + Parser.add_options("general") + ("i,function", "Specify integer ID of the load-function to be\nused (as listed by --avail)", + cxxopts::value()->default_value("0"), "ID"); + + if (firestarter::OptionalFeatures.gpuEnabled()) { + Parser.add_options("general") + ("f,usegpufloat", "Use single precision matrix multiplications\ninstead of default") + ("d,usegpudouble", "Use double precision matrix multiplications\ninstead of default") + ("g,gpus", "Number of gpus to use, default: -1 (all)", + cxxopts::value()->default_value("-1")) + ("m,matrixsize", "Size of the matrix to calculate, default: 0 (maximum)", + cxxopts::value()->default_value("0")); + } + + Parser.add_options("general") + ("t,timeout", "Set the timeout (seconds) after which FIRESTARTER\nterminates itself, default: 0 (no timeout)", + cxxopts::value()->default_value("0"), "TIMEOUT") + ("l,load", LoadDescription, + cxxopts::value()->default_value("100"), "LOAD") + ("p,period", "Set the interval length for CPUs to PERIOD\n(usec), default: 100000, each interval contains\na high load and an idle phase, the percentage\nof high load is defined by -l.", + cxxopts::value()->default_value("100000"), "PERIOD") + ("n,threads", "Specify the number of threads. Cannot be\ncombined with -b | --bind, which impicitly\nspecifies the number of threads.", + cxxopts::value()->default_value("0"), "COUNT") +#if (defined(linux) || defined(__linux__)) && defined(FIRESTARTER_THREAD_AFFINITY) + ("b,bind", "Select certain CPUs. 
CPULIST format: \"x,y,z\",\n\"x-y\", \"x-y/step\", and any combination of the\nabove. Cannot be combined with -n | --threads.", + cxxopts::value()->default_value(""), "CPULIST") +#endif + ("error-detection", "Enable error detection. This aborts execution when the calculated data is corruped by errors. FIRESTARTER must run with 2 or more threads for this feature. Cannot be used with -l | --load and --optimize."); + + Parser.add_options("specialized-workloads") + ("list-instruction-groups", "List the available instruction groups for the\npayload of the current platform.") + ("run-instruction-groups", "Run the payload with the specified\ninstruction groups. GROUPS format: multiple INST:VAL\npairs comma-seperated.", + cxxopts::value()->default_value(""), "GROUPS") + ("set-line-count", "Set the number of lines for a payload.", + cxxopts::value()); + + if (firestarter::OptionalFeatures.DebugFeatureEnabled) { + Parser.add_options("debug") + ("allow-unavailable-payload", "") + ("dump-registers", "Dump the working registers on the first\nthread. Depending on the payload these are mm, xmm,\nymm or zmm. Only use it without a timeout and\n100 percent load. DELAY between dumps in secs. Cannot be used with --error-detection.", + cxxopts::value()->implicit_value("10"), "DELAY") + ("dump-registers-outpath", "Path for the dump of the output files. If\nPATH is not given, current working directory will\nbe used.", + cxxopts::value()->default_value(""), "PATH"); + } + + if (firestarter::OptionalFeatures.OptimizationEnabled) { + Parser.add_options("measurement") + ("list-metrics", "List the available metrics.") +#ifndef FIRESTARTER_LINK_STATIC + ("metric-path", "Add a path to a shared library representing an interface for a metric. 
This option can be specified multiple times.", + cxxopts::value>()->default_value("")) +#endif + ("metric-from-stdin", "Add a metric NAME with values from stdin.\nFormat of input: \"NAME TIME_SINCE_EPOCH VALUE\\n\".\nTIME_SINCE_EPOCH is a int64 in nanoseconds. VALUE is a double. (Do not forget to flush\nlines!)", + cxxopts::value>(), "NAME") + ("measurement", "Start a measurement for the time specified by\n-t | --timeout. (The timeout must be greater\nthan the start and stop deltas.) Cannot be\ncombined with --optimize.") + ("measurement-interval", "Interval of measurements in milliseconds, default: 100", + cxxopts::value()->default_value("100")) + ("start-delta", "Cut of first N milliseconds of measurement, default: 5000", + cxxopts::value()->default_value("5000"), "N") + ("stop-delta", "Cut of last N milliseconds of measurement, default: 2000", + cxxopts::value()->default_value("2000"), "N") + ("preheat", "Preheat for N seconds, default: 240", + cxxopts::value()->default_value("240"), "N"); + + Parser.add_options("optimization") + ("optimize", "Run the optimization with one of these algorithms: NSGA2.\nCannot be combined with --measurement.", + cxxopts::value()) + ("optimize-outfile", "Dump the output of the optimization into this\nfile, default: $PWD/$HOSTNAME_$DATE.json", + cxxopts::value()) + ("optimization-metric", "Use a metric for optimization. Metrics listed\nwith cli argument --list-metrics or specified\nwith --metric-from-stdin are valid.", + cxxopts::value>()) + ("individuals", "Number of individuals for the population. For\nNSGA2 specify at least 5 and a multiple of 4,\ndefault: 20", + cxxopts::value()->default_value("20")) + ("generations", "Number of generations, default: 20", + cxxopts::value()->default_value("20")) + ("nsga2-cr", "Crossover probability. Must be in range [0,1[\ndefault: 0.6", + cxxopts::value()->default_value("0.6")) + ("nsga2-m", "Mutation probability. 
Must be in range [0,1]\ndefault: 0.4", + cxxopts::value()->default_value("0.4")); + } + // clang-format on + + try { + auto Options = Parser.parse(Argc, Argv); + + if (static_cast(Options.count("quiet"))) { + firestarter::logging::Filter::set_severity(nitro::log::severity_level::warn); + } else if (static_cast(Options.count("report"))) { + firestarter::logging::Filter::set_severity(nitro::log::severity_level::debug); + } else if (static_cast(Options.count("debug"))) { + firestarter::logging::Filter::set_severity(nitro::log::severity_level::trace); + } else { + firestarter::logging::Filter::set_severity(nitro::log::severity_level::info); + } + + if (static_cast(Options.count("version"))) { + safeExit(EXIT_SUCCESS); + } + + if (static_cast(Options.count("copyright"))) { + printCopyright(); + safeExit(EXIT_SUCCESS); + } + + if (static_cast(Options.count("warranty"))) { + printWarranty(); + safeExit(EXIT_SUCCESS); + } + + firestarter::log::info() << "This program comes with ABSOLUTELY NO WARRANTY; for details run `" << ExecutableName + << " -w`.\n" + << "This is free software, and you are welcome to redistribute it\n" + << "under certain conditions; run `" << ExecutableName << " -c` for details.\n"; + + if (static_cast(Options.count("help"))) { + auto Section = Options["help"].as(); + + printHelp(Parser, Section); + safeExit(EXIT_SUCCESS); + } + + Timeout = std::chrono::seconds(Options["timeout"].as()); + const auto LoadPercent = Options["load"].as(); + Period = std::chrono::microseconds(Options["period"].as()); + + if (LoadPercent > 100) { + throw std::invalid_argument("Option -l/--load may not be above 100."); + } + + Load = (Period * LoadPercent) / 100; + if (LoadPercent == 100 || Load == std::chrono::microseconds::zero()) { + Period = std::chrono::microseconds::zero(); + } + + ErrorDetection = static_cast(Options.count("error-detection")); + if (ErrorDetection && LoadPercent != 100) { + throw std::invalid_argument("Option --error-detection may only be used " +
"with -l/--load equal 100."); + } + + if (firestarter::OptionalFeatures.DebugFeatureEnabled) { + AllowUnavailablePayload = static_cast(Options.count("allow-unavailable-payload")); + DumpRegisters = static_cast(Options.count("dump-registers")); + if (DumpRegisters) { + DumpRegistersTimeDelta = std::chrono::seconds(Options["dump-registers"].as()); + if (Timeout != std::chrono::microseconds::zero() || LoadPercent != 100) { + throw std::invalid_argument("Option --dump-registers may only be used " + "without a timeout and full load."); + } + if (ErrorDetection) { + throw std::invalid_argument("Options --dump-registers and --error-detection cannot be used " + "together."); + } + } + } + + RequestedNumThreads = Options["threads"].as(); + +#if (defined(linux) || defined(__linux__)) && defined(FIRESTARTER_THREAD_AFFINITY) + CpuBind = Options["bind"].as(); + if (!CpuBind.empty()) { + if (RequestedNumThreads != 0) { + throw std::invalid_argument("Options -b/--bind and -n/--threads cannot be used together."); + } + } +#endif + + if (firestarter::OptionalFeatures.gpuEnabled()) { + GpuUseFloat = static_cast(Options.count("usegpufloat")); + GpuUseDouble = static_cast(Options.count("usegpudouble")); + + if (GpuUseFloat && GpuUseDouble) { + throw std::invalid_argument("Options -f/--usegpufloat and " + "-d/--usegpudouble cannot be used together."); + } + + GpuMatrixSize = Options["matrixsize"].as(); + if (GpuMatrixSize > 0 && GpuMatrixSize < 64) { + throw std::invalid_argument("Option -m/--matrixsize may not be below 64."); + } + + Gpus = Options["gpus"].as(); + } + + PrintFunctionSummary = static_cast(Options.count("avail")); + + FunctionId = Options["function"].as(); + + ListInstructionGroups = static_cast(Options.count("list-instruction-groups")); + InstructionGroups = Options["run-instruction-groups"].as(); + if (static_cast(Options.count("set-line-count"))) { + LineCount = Options["set-line-count"].as(); + } + + if (firestarter::OptionalFeatures.OptimizationEnabled) { +
StartDelta = std::chrono::milliseconds(Options["start-delta"].as()); + StopDelta = std::chrono::milliseconds(Options["stop-delta"].as()); + MeasurementInterval = std::chrono::milliseconds(Options["measurement-interval"].as()); +#ifndef FIRESTARTER_LINK_STATIC + MetricPaths = Options["metric-path"].as>(); +#endif + if (static_cast(Options.count("metric-from-stdin"))) { + StdinMetrics = Options["metric-from-stdin"].as>(); + } + Measurement = static_cast(Options.count("measurement")); + ListMetrics = static_cast(Options.count("list-metrics")); + Optimize = static_cast(Options.count("optimize")); + + if (Optimize) { + if (ErrorDetection) { + throw std::invalid_argument("Options --error-detection and --optimize " + "cannot be used together."); + } + if (Measurement) { + throw std::invalid_argument("Options --measurement and --optimize cannot be used together."); + } + Preheat = std::chrono::seconds(Options["preheat"].as()); + OptimizationAlgorithm = Options["optimize"].as(); + if (static_cast(Options.count("optimization-metric"))) { + OptimizationMetrics = Options["optimization-metric"].as>(); + } + if (LoadPercent != 100) { + throw std::invalid_argument("Options -p | --period and -l | --load are " + "not compatible with --optimize."); + } + if (Timeout == std::chrono::seconds::zero()) { + throw std::invalid_argument("Option -t | --timeout must be specified for optimization."); + } + EvaluationDuration = Timeout; + // this will deactivate the watchdog worker + Timeout = std::chrono::seconds::zero(); + Individuals = Options["individuals"].as(); + if (static_cast(Options.count("optimize-outfile"))) { + OptimizeOutfile = Options["optimize-outfile"].as(); + } + Generations = Options["generations"].as(); + Nsga2Cr = Options["nsga2-cr"].as(); + Nsga2M = Options["nsga2-m"].as(); + + if (OptimizationAlgorithm != "NSGA2") { + throw std::invalid_argument("Option --optimize must be any of: NSGA2"); + } + } + } + } catch (std::exception& E) { + printHelp(Parser); + 
firestarter::log::error() << E.what() << "\n"; + } +} +} // namespace firestarter \ No newline at end of file diff --git a/src/firestarter/Cuda/Cuda.cpp b/src/firestarter/Cuda/Cuda.cpp index e5abece9..9469073a 100644 --- a/src/firestarter/Cuda/Cuda.cpp +++ b/src/firestarter/Cuda/Cuda.cpp @@ -1,6 +1,6 @@ /****************************************************************************** * FIRESTARTER - A Processor Stress Test Utility - * Copyright (C) 2020-2023 TU Dresden, Center for Information Services and High + * Copyright (C) 2020-2024 TU Dresden, Center for Information Services and High * Performance Computing * * This program is free software: you can redistribute it and/or modify @@ -19,614 +19,284 @@ * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ -/* CUDA error checking based on CudaWrapper.h - * https://github.com/ashwin/gDel3D/blob/master/GDelFlipping/src/gDel3D/GPU/CudaWrapper.h - * +/****************************************************************************** * inspired by gpu_burn * http://wili.cc/blog/gpu-burn.html *****************************************************************************/ -#include -#include -#include +#include "firestarter/Cuda/Cuda.hpp" +#include "firestarter/Cuda/CudaHipCompat.hpp" +#include "firestarter/Logging/Log.hpp" -#ifdef FIRESTARTER_BUILD_CUDA - #include - #include - #include - #include - #define FS_ACCEL_PREFIX_LC_LONG cuda - #define FS_ACCEL_PREFIX_LC cu - #define FS_ACCEL_PREFIX_UC CU - #define FS_ACCEL_PREFIX_UC_LONG CUDA - #define FS_ACCEL_STRING "CUDA" -#else - #ifdef FIRESTARTER_BUILD_HIP - #include - #include - #include - #include - #define FS_ACCEL_PREFIX_LC_LONG hip - #define FS_ACCEL_PREFIX_LC hip - #define FS_ACCEL_PREFIX_UC HIP - #define FS_ACCEL_PREFIX_UC_LONG HIP - #define FS_ACCEL_STRING "HIP" - #else - #error "Attempting to compile file but neither CUDA nor HIP is used" - #endif -#endif -#define CONCAT_(prefix, suffix) 
prefix##suffix -/// Concatenate `prefix, suffix` into `prefixsuffix` -#define CONCAT(prefix, suffix) CONCAT_(prefix, suffix) -//#define FS_ACCEL_ERROR_TYPE CONCAT(FS_ACCEL_PREFIX_LC_LONG,Error_t) -//#define FS_ACCEL_BLAS_STATUS_TYPE cublasStatus_t -//#define FS_ACCEL_RAND_STATUS_TYPE curandStatus_t - -#include #include +#include +#include #include -#define ACCELL_SAFE_CALL(cuerr, dev_index) \ - accell_safe_call(cuerr, dev_index, __FILE__, __LINE__) -#define SEED 123 - -using namespace firestarter::cuda; - -// CUDA error checking -static inline void accell_safe_call(CONCAT(FS_ACCEL_PREFIX_LC_LONG,Error_t) cuerr, int dev_index, - const char *file, const int line) { - if (cuerr != CONCAT(FS_ACCEL_PREFIX_LC_LONG,Success) && cuerr != 1) { - firestarter::log::error() - << FS_ACCEL_STRING" error at " << file << ":" << line << ": error code = " << cuerr - << " (" << CONCAT(FS_ACCEL_PREFIX_LC_LONG,GetErrorString)(cuerr) - << "), device index: " << dev_index; - exit(cuerr); - } - - return; -} - -static const char *_accellGetErrorEnum(CONCAT(FS_ACCEL_PREFIX_LC,blasStatus_t) error) { - switch (error) { - case CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_SUCCESS): - return FS_ACCEL_STRING"blas status: success"; - case CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_NOT_INITIALIZED): - return FS_ACCEL_STRING"blas status: not initialized"; - case CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_ALLOC_FAILED): - return FS_ACCEL_STRING"blas status: alloc failed"; - case CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_INVALID_VALUE): - return FS_ACCEL_STRING"blas status: invalid value"; - case CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_ARCH_MISMATCH): - return FS_ACCEL_STRING"blas status: arch mismatch"; - case CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_MAPPING_ERROR): - return FS_ACCEL_STRING"blas status: mapping error"; - case CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_EXECUTION_FAILED): - return FS_ACCEL_STRING"blas status: execution failed"; - case CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_INTERNAL_ERROR): - return 
FS_ACCEL_STRING"blas status: internal error"; - case CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_NOT_SUPPORTED): - return FS_ACCEL_STRING"blas status: not supported"; -#ifdef FIRESTARTER_BUILD_CUDA - case CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_LICENSE_ERROR): - return FS_ACCEL_STRING"blas status: license error"; -#endif -#ifdef FIRESTARTER_BUILD_HIP - case CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_UNKNOWN): - return FS_ACCEL_STRING"blas status: unknown"; - case CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_HANDLE_IS_NULLPTR): - return FS_ACCEL_STRING"blas status: handle is null pointer"; - case CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_INVALID_ENUM): - return FS_ACCEL_STRING"blas status: invalid enum"; -#endif - } - - - return ""; -} - -static inline void accell_safe_call(CONCAT(FS_ACCEL_PREFIX_LC,blasStatus_t) cuerr, int dev_index, - const char *file, const int line) { - if (cuerr != CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_SUCCESS)) { - firestarter::log::error() - << FS_ACCEL_STRING"BLAS error at " << file << ":" << line - << ": error code = " << cuerr << " (" << _accellGetErrorEnum(cuerr) - << "), device index: " << dev_index; - exit(cuerr); - } - - return; -} - -#ifdef FIRESTARTER_BUILD_CUDA -static inline void accell_safe_call(CONCAT(FS_ACCEL_PREFIX_UC,result) cuerr, int dev_index, - const char *file, const int line) { - if (cuerr != CONCAT(FS_ACCEL_PREFIX_UC_LONG,_SUCCESS)) { - const char *errorString; - - ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC,GetErrorName)(cuerr, &errorString), dev_index); - - firestarter::log::error() - << FS_ACCEL_STRING" error at " << file << ":" << line << ": error code = " << cuerr - << " (" << errorString << "), device index: " << dev_index; - exit(cuerr); - } - - return; -} -#endif - -static const char *_accellrandGetErrorEnum(CONCAT(FS_ACCEL_PREFIX_LC,randStatus_t) cuerr) { - switch (cuerr) { - case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_SUCCESS): - return FS_ACCEL_STRING"rand status: success"; - case 
CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_VERSION_MISMATCH): - return FS_ACCEL_STRING"rand status: version mismatch"; - case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_NOT_INITIALIZED): - return FS_ACCEL_STRING"rand status: not initialized"; - case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_ALLOCATION_FAILED): - return FS_ACCEL_STRING"rand status: allocation failed"; - case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_TYPE_ERROR): - return FS_ACCEL_STRING"rand status: type error"; - case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_OUT_OF_RANGE): - return FS_ACCEL_STRING"rand status: out of range"; - case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_LENGTH_NOT_MULTIPLE): - return FS_ACCEL_STRING"rand status: length not multiple"; - case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_DOUBLE_PRECISION_REQUIRED): - return FS_ACCEL_STRING"rand status: double precision required"; - case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_LAUNCH_FAILURE): - return FS_ACCEL_STRING"rand status: launch failure"; - case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_PREEXISTING_FAILURE): - return FS_ACCEL_STRING"rand status: preexisting failure"; - case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_INITIALIZATION_FAILED): - return FS_ACCEL_STRING"rand status: initialization failed"; - case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_ARCH_MISMATCH): - return FS_ACCEL_STRING"rand status: arch mismatch"; - case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_INTERNAL_ERROR): - return FS_ACCEL_STRING"rand status: internal error"; -#ifdef FIRESTARTER_BUILD_HIP - case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_NOT_IMPLEMENTED): - return FS_ACCEL_STRING"rand status: not implemented"; -#endif - } - - return ""; -} +namespace firestarter::cuda { -static inline void accell_safe_call(CONCAT(FS_ACCEL_PREFIX_LC,randStatus_t) cuerr, int dev_index, - const char *file, const int line) { - if (cuerr != CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_SUCCESS)) { - firestarter::log::error() - << FS_ACCEL_STRING"RAND error at " << file << ":" << line - << ": error code = " << cuerr << " (" << 
_accellrandGetErrorEnum(cuerr) - << "), device index: " << dev_index; - exit(cuerr); - } +constexpr const int Seed = 123; - return; -} +namespace { -static int round_up(int num_to_round, int multiple) { - if (multiple == 0) { - return num_to_round; - } +template auto roundUp(int NumToRound) -> int { + static_assert(Multiple != 0, "Multiple may not be zero."); - int remainder = num_to_round % multiple; - if (remainder == 0) { - return num_to_round; + const int Remainder = NumToRound % Multiple; + if (Remainder == 0) { + return NumToRound; } - return num_to_round + multiple - remainder; + return NumToRound + Multiple - Remainder; } -#ifdef FIRESTARTER_BUILD_CUDA -static int get_precision(int useDouble, struct cudaDeviceProp properties) { -#else -#ifdef FIRESTARTER_BUILD_HIP -static int get_precision(int useDouble, struct hipDeviceProp_t properties) { -#endif -#endif +/// Convert the UseDouble input (0 -> single precision, 1 -> double precision, 2 -> automatic) to either 0 or 1 for +/// float or double respectively. For CUDART_VERSION at least equal 8000 and automatic selection we check if the card a +/// singleToDoublePrecisionPerfRatio bigger than 3 and select float in this case otherwise double. In all other cases +/// automatic results in double. +/// \arg UseDouble The input that specifies either single precision, double precision or automatic selection. +/// \arg Properties The device properties. +/// \return The selected precision, either 0 or 1 for float or double respectively. 
+auto getPrecision(int UseDouble, const compat::DeviceProperties& Properties) -> int { #if (CUDART_VERSION >= 8000) -// read precision ratio (dp/sp) of GPU to choose the right variant for maximum -// workload - if (useDouble == 2 && properties.singleToDoublePrecisionPerfRatio > 3) { + // read precision ratio (dp/sp) of GPU to choose the right variant for maximum + // workload + if (UseDouble == 2 && Properties.singleToDoublePrecisionPerfRatio > 3) { return 0; - } else if (useDouble) { + } + if (UseDouble) { return 1; - } else { - return 0; } -} + return 0; #else -// as precision ratio is not supported return default/user input value - (void)properties; + // as precision ratio is not supported return default/user input value + (void)Properties; - if (useDouble) { + if (UseDouble) { return 1; - } else { - return 0; } -} -#endif + return 0; -static int get_precision(int device_index, int useDouble) { - size_t memory_avail, memory_total; -#ifdef FIRESTARTER_BUILD_CUDA - CUcontext context; - CUdevice device; - struct cudaDeviceProp properties; - ACCELL_SAFE_CALL(cuDeviceGet(&device, device_index), device_index); - ACCELL_SAFE_CALL(cuCtxCreate(&context, 0, device), device_index); - ACCELL_SAFE_CALL(cuCtxSetCurrent(context), device_index); -#else -#ifdef FIRESTARTER_BUILD_HIP - struct hipDeviceProp_t properties; - ACCELL_SAFE_CALL(hipSetDevice(device_index), device_index); -#endif #endif - ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC,MemGetInfo)(&memory_avail, &memory_total), device_index); - ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC_LONG,GetDeviceProperties)(&properties, device_index), - device_index); +} - useDouble = get_precision(useDouble, properties); +auto getPrecision(int DeviceIndex, int UseDouble) -> int { + std::size_t MemoryAvail{}; + std::size_t MemoryTotal{}; + compat::DeviceProperties Properties; - // we check for double precision support on the GPU and print errormsg, when - // the user wants to compute DP on a SP-only-Card. 
- if (useDouble && properties.major <= 1 && properties.minor <= 2) { - std::stringstream ss; - ss << FS_ACCEL_STRING" GPU " << device_index << ": " << properties.name << " "; - - firestarter::log::error() - << ss.str() << "Doesn't support double precision.\n" - << ss.str() << "Compute Capability: " << properties.major << "." - << properties.minor << ". Requiered for double precision: >=1.3\n" - << ss.str() - << "Stressing with single precision instead. Maybe use -f parameter."; - - useDouble = 0; - } + // NOLINTNEXTLINE(readability-qualified-auto) + auto StreamOrContext = compat::createContextOrStream(DeviceIndex); -#ifdef FIRESTARTER_BUILD_CUDA - ACCELL_SAFE_CALL(cuCtxDestroy(context), device_index); -#endif + compat::accellSafeCall(compat::memGetInfo(MemoryAvail, MemoryTotal), __FILE__, __LINE__, DeviceIndex); + compat::accellSafeCall(compat::getDeviceProperties(Properties, DeviceIndex), __FILE__, __LINE__, DeviceIndex); - return useDouble; -} + UseDouble = getPrecision(UseDouble, Properties); + const bool DoubleNotSupported = #ifdef FIRESTARTER_BUILD_CUDA -static int get_msize(int device_index, int useDouble) { - CUcontext context; - CUdevice device; - size_t memory_avail, memory_total; - - ACCELL_SAFE_CALL(cuDeviceGet(&device, device_index), device_index); - ACCELL_SAFE_CALL(cuCtxCreate(&context, 0, device), device_index); - ACCELL_SAFE_CALL(cuCtxSetCurrent(context), device_index); - ACCELL_SAFE_CALL(cuMemGetInfo(&memory_avail, &memory_total), device_index); - - ACCELL_SAFE_CALL(cuCtxDestroy(context), device_index); - - return round_up( - (int)(0.8 * sqrt(((memory_avail) / - ((useDouble ? 
sizeof(double) : sizeof(float)) * 3)))), - 1024); // a multiple of 1024 works always well -} + Properties.major <= 1 && Properties.minor <= 2; +#else + false; #endif -static CONCAT(FS_ACCEL_PREFIX_LC,blasStatus_t) gemm( - CONCAT(FS_ACCEL_PREFIX_LC,blasHandle_t) handle, - CONCAT(FS_ACCEL_PREFIX_LC,blasOperation_t) transa, - CONCAT(FS_ACCEL_PREFIX_LC,blasOperation_t) transb, - int &m, int &n, int &k, - const float *alpha, const float *A, int &lda, - const float *B, int &ldb, const float *beta, - float *C, int &ldc) { - return CONCAT(FS_ACCEL_PREFIX_LC,blasSgemm)(handle, transa, transb, m, n, k, - alpha, A, lda, B, ldb, - beta, C, ldc); -} + // we check for double precision support on the GPU and print errormsg, when + // the user wants to compute DP on a SP-only-Card. + if (UseDouble && DoubleNotSupported) { + std::stringstream Ss; + Ss << compat::AccelleratorString << " GPU " << DeviceIndex << ": " << Properties.name << " "; -static CONCAT(FS_ACCEL_PREFIX_LC,blasStatus_t) gemm( - CONCAT(FS_ACCEL_PREFIX_LC,blasHandle_t) handle, - CONCAT(FS_ACCEL_PREFIX_LC,blasOperation_t) transa, - CONCAT(FS_ACCEL_PREFIX_LC,blasOperation_t) transb, - int &m, int &n, int &k, - const double *alpha, const double *A, int &lda, - const double *B, int &ldb, const double *beta, - double *C, int &ldc) { - return CONCAT(FS_ACCEL_PREFIX_LC,blasDgemm)(handle, transa, transb, m, n, k, - alpha, A, lda, B, ldb, - beta, C, ldc); -} + firestarter::log::error() << Ss.str() << "Doesn't support double precision.\n" + << Ss.str() << "Compute Capability: " << Properties.major << "." << Properties.minor + << ". Requiered for double precision: >=1.3\n" + << Ss.str() << "Stressing with single precision instead. 
Maybe use -f parameter."; -static CONCAT(FS_ACCEL_PREFIX_LC,randStatus_t) generateUniform( - CONCAT(FS_ACCEL_PREFIX_LC,randGenerator_t) generator, - float *outputPtr, size_t num) { - return CONCAT(FS_ACCEL_PREFIX_LC,randGenerateUniform)(generator, outputPtr, num); -} + UseDouble = 0; + } -static CONCAT(FS_ACCEL_PREFIX_LC,randStatus_t) generateUniform( - CONCAT(FS_ACCEL_PREFIX_LC,randGenerator_t) generator, - double *outputPtr, size_t num) { - return CONCAT(FS_ACCEL_PREFIX_LC,randGenerateUniformDouble)(generator, outputPtr, num); + compat::accellSafeCall(compat::destroyContextOrStream(StreamOrContext), __FILE__, __LINE__, DeviceIndex); + + return UseDouble; } // GPU index. Used to pin this thread to the GPU. -template -static void create_load(std::condition_variable &waitForInitCv, - std::mutex &waitForInitCvMutex, int device_index, - std::atomic &initCount, - volatile unsigned long long *loadVar, int matrixSize) { - static_assert( - std::is_same::value || std::is_same::value, - "create_load: Template argument T must be either float or double"); - - int iterations, i; - - firestarter::log::trace() << "Starting CUDA/HIP with given matrix size " - << matrixSize; - - size_t size_use = 0; - if (matrixSize > 0) { - size_use = matrixSize; - } - - size_t use_bytes, memory_size; -#ifdef FIRESTARTER_BUILD_CUDA - CUcontext context; - struct cudaDeviceProp properties; - CUdevice device; - cublasHandle_t cublas; -#else -#ifdef FIRESTARTER_BUILD_HIP - hipStream_t stream; - struct hipDeviceProp_t properties; - hipDevice_t device; - hipblasHandle_t cublas; -#endif -#endif +// Size use is one square matrix dim size +template +void createLoad(std::condition_variable& WaitForInitCv, std::mutex& WaitForInitCvMutex, int DeviceIndex, + std::atomic& InitCount, const volatile firestarter::LoadThreadWorkType& LoadVar, + unsigned MatrixSize) { + static_assert(std::is_same_v || std::is_same_v, + "create_load: Template argument must be either float or double"); + + firestarter::log::trace() 
<< "Starting " << compat::AccelleratorString << " with given matrix size " << MatrixSize; + + compat::DeviceProperties Properties; + compat::BlasHandle Blas{}; // reserving the GPU and initializing cublas - firestarter::log::trace() << "Getting " FS_ACCEL_STRING " device nr. " << device_index; - ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC,DeviceGet)(&device, device_index), device_index); - -#ifdef FIRESTARTER_BUILD_CUDA - firestarter::log::trace() << "Creating " FS_ACCEL_STRING " context for computation on device nr. " - << device_index; - ACCELL_SAFE_CALL(cuCtxCreate(&context, 0, device), device_index); - - firestarter::log::trace() << "Set created " FS_ACCEL_STRING " context on device nr. " - << device_index; - ACCELL_SAFE_CALL(cuCtxSetCurrent(context), device_index); -#else -#ifdef FIRESTARTER_BUILD_HIP - firestarter::log::trace() << "Creating " FS_ACCEL_STRING " Stream for computation on device nr. " - << device_index; - ACCELL_SAFE_CALL(hipSetDevice(device_index), device_index); - ACCELL_SAFE_CALL(hipStreamCreate(&stream), device_index); -#endif -#endif + // NOLINTNEXTLINE(readability-qualified-auto) + auto StreamOrContext = compat::createContextOrStream(DeviceIndex); - firestarter::log::trace() << "Create " FS_ACCEL_STRING " Blas on device nr. " - << device_index; - ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC,blasCreate)(&cublas), device_index); + firestarter::log::trace() << "Create " << compat::AccelleratorString << " Blas on device nr. " << DeviceIndex; + compat::accellSafeCall(compat::blasCreate(Blas), __FILE__, __LINE__, DeviceIndex); - firestarter::log::trace() << "Get " FS_ACCEL_STRING " device properties (e.g., support for double)" - << " on device nr. " - << device_index; - ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC_LONG,GetDeviceProperties)(&properties, device_index), - device_index); + firestarter::log::trace() << "Get " << compat::AccelleratorString << " device properties (e.g., support for double)" + << " on device nr. 
" << DeviceIndex; + compat::accellSafeCall(compat::getDeviceProperties(Properties, DeviceIndex), __FILE__, __LINE__, DeviceIndex); // getting information about the GPU memory - size_t memory_avail, memory_total; - ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC,MemGetInfo)(&memory_avail, &memory_total), device_index); - - firestarter::log::trace() << "Get " FS_ACCEL_STRING " Memory info on device nr. " - << device_index - <<": " << memory_avail << " B avail. from " - << memory_total << " B total"; - - // defining memory pointers -#ifdef FIRESTARTER_BUILD_CUDA - CUdeviceptr a_data_ptr; - CUdeviceptr b_data_ptr; - CUdeviceptr c_data_ptr; -#else -#ifdef FIRESTARTER_BUILD_HIP - T* a_data_ptr; - T* b_data_ptr; - T* c_data_ptr; -#endif -#endif - - // check if the user has not set a matrix OR has set a too big matrixsite and - // if this is true: set a good matrixsize - if (!size_use || ((size_use * size_use * sizeof(T) * 3 > memory_avail))) { - size_use = round_up((int)(0.8 * sqrt(((memory_avail) / (sizeof(T) * 3)))), - 1024); // a multiple of 1024 works always well + std::size_t MemoryAvail{}; + std::size_t MemoryTotal{}; + compat::accellSafeCall(compat::memGetInfo(MemoryAvail, MemoryTotal), __FILE__, __LINE__, DeviceIndex); + firestarter::log::trace() << "Get " << compat::AccelleratorString << " emory info on device nr. " << DeviceIndex + << ": " << MemoryAvail << " B avail. from " << MemoryTotal << " B total"; + + // Defining memory pointers. ADataPtr and BDataPtr will point to a square matrix. CDataPtr may be one or multiple + // square matrices. + FloatingPointType* ADataPtr{}; + FloatingPointType* BDataPtr{}; + FloatingPointType* CDataPtr{}; + + // If the matrix size is not set or three square matricies with dim size of SizeUse do not fit into the available + // memory, select the size so that 3 square matricies will fit into the available device memory where the dim size + // is a multiple of 1024. 
There may be edge cases with small device memory that results in matricies that are not + // multiples of 1024. + std::size_t MemorySize = sizeof(FloatingPointType) * MatrixSize * MatrixSize; + if (!MatrixSize || (MemorySize * 3 > MemoryAvail)) { + // a multiple of 1024 works always well + MatrixSize = roundUp<1024>(0.8 * std::sqrt(MemoryAvail / sizeof(FloatingPointType) / 3)); + MemorySize = sizeof(FloatingPointType) * MatrixSize * MatrixSize; } - firestarter::log::trace() << "Set " FS_ACCEL_STRING " matrix size: " << matrixSize; - use_bytes = (size_t)((T)memory_avail); - memory_size = sizeof(T) * size_use * size_use; - iterations = (use_bytes - 2 * memory_size) / memory_size; // = 1; - firestarter::log::trace() - << "Allocating " FS_ACCEL_STRING " memory on device nr. " - << device_index; + firestarter::log::trace() << "Set " << compat::AccelleratorString << " matrix size: " << MatrixSize; + // Calculate the numnber of C matricies based on the available memory and the matrix size in B. + const auto Iterations = (MemoryAvail - 2 * MemorySize) / MemorySize; + // The numner of used memory are two time the matrix size in B (Matrix A and B) plus the number of matricies in C. + const auto UseBytes = (2 + Iterations) * MemorySize; - // allocating memory on the GPU -#ifdef FIRESTARTER_BUILD_CUDA - ACCELL_SAFE_CALL(cuMemAlloc(&a_data_ptr, memory_size), device_index); - ACCELL_SAFE_CALL(cuMemAlloc(&b_data_ptr, memory_size), device_index); - ACCELL_SAFE_CALL(cuMemAlloc(&c_data_ptr, iterations * memory_size), - device_index); -#else -#ifdef FIRESTARTER_BUILD_HIP - ACCELL_SAFE_CALL(hipMalloc(&a_data_ptr, memory_size), device_index); - ACCELL_SAFE_CALL(hipMalloc(&b_data_ptr, memory_size), device_index); - ACCELL_SAFE_CALL(hipMalloc(&c_data_ptr, iterations * memory_size), - device_index); -#endif -#endif + firestarter::log::trace() << "Allocating " << compat::AccelleratorString << " memory on device nr. 
" << DeviceIndex; - firestarter::log::trace() << "Allocated " FS_ACCEL_STRING " memory on device nr. " - << device_index - <<". A: " << a_data_ptr << "(Size: " - << memory_size << "B)" - << "\n"; - - firestarter::log::trace() << "Allocated " FS_ACCEL_STRING " memory on device nr. " - << device_index - <<". B: " << b_data_ptr << "(Size: " - << memory_size << "B)" - << "\n"; - firestarter::log::trace() << "Allocated " FS_ACCEL_STRING " memory on device nr. " - << device_index - <<". C: " << c_data_ptr << "(Size: " - << iterations * memory_size << "B)" - << "\n"; - - firestarter::log::trace() << "Initializing " FS_ACCEL_STRING " matrices a, b on device nr. " - << device_index - << ". Using " - << size_use * size_use - << " elements of size " - << sizeof(T) << " Byte"; + // allocating memory on the GPU + compat::accellSafeCall(compat::malloc(&ADataPtr, MemorySize), __FILE__, __LINE__, DeviceIndex); + compat::accellSafeCall(compat::malloc(&BDataPtr, MemorySize), __FILE__, __LINE__, DeviceIndex); + compat::accellSafeCall(compat::malloc(&CDataPtr, Iterations * MemorySize), __FILE__, __LINE__, + DeviceIndex); + + firestarter::log::trace() << "Allocated " << compat::AccelleratorString << " memory on device nr. " << DeviceIndex + << ". A: " << ADataPtr << " (Size: " << MemorySize << "B)" + << "\n"; + firestarter::log::trace() << "Allocated " << compat::AccelleratorString << " memory on device nr. " << DeviceIndex + << ". B: " << BDataPtr << " (Size: " << MemorySize << "B)" + << "\n"; + firestarter::log::trace() << "Allocated " << compat::AccelleratorString << " memory on device nr. " << DeviceIndex + << ". C: " << CDataPtr << " (Size: " << Iterations * MemorySize << "B)" + << "\n"; + + firestarter::log::trace() << "Initializing " << compat::AccelleratorString << " matrices a, b on device nr. " + << DeviceIndex << ". 
Using " << MatrixSize * MatrixSize << " elements of size " + << sizeof(FloatingPointType) << " Byte"; // initialize matrix A and B on the GPU with random values - CONCAT(FS_ACCEL_PREFIX_LC,randGenerator_t) random_gen; - ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC,randCreateGenerator)( - &random_gen, - CONCAT(FS_ACCEL_PREFIX_UC,RAND_RNG_PSEUDO_DEFAULT)), - device_index); - ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC,randSetPseudoRandomGeneratorSeed)( - random_gen, SEED), - device_index); - ACCELL_SAFE_CALL( - generateUniform(random_gen, (T *)a_data_ptr, size_use * size_use), - device_index); - ACCELL_SAFE_CALL( - generateUniform(random_gen, (T *)b_data_ptr, size_use * size_use), - device_index); - ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC,randDestroyGenerator)(random_gen), - device_index); + { + compat::RandGenerator RandomGen{}; + compat::accellSafeCall(compat::randCreateGeneratorPseudoRandom(RandomGen), __FILE__, __LINE__, DeviceIndex); + compat::accellSafeCall(compat::randSetPseudoRandomGeneratorSeed(RandomGen, Seed), __FILE__, __LINE__, DeviceIndex); + compat::accellSafeCall(compat::generateUniform(RandomGen, ADataPtr, MatrixSize * MatrixSize), + __FILE__, __LINE__, DeviceIndex); + compat::accellSafeCall(compat::generateUniform(RandomGen, BDataPtr, MatrixSize * MatrixSize), + __FILE__, __LINE__, DeviceIndex); + compat::accellSafeCall(compat::randDestroyGenerator(RandomGen), __FILE__, __LINE__, DeviceIndex); + } // initialize c_data_ptr with copies of A - for (i = 0; i < iterations; i++) { - firestarter::log::trace() << "Initializing " FS_ACCEL_STRING " matrix c-" - << i - << " by copying " - << memory_size - << " byte from " - << a_data_ptr - << " to " - << c_data_ptr + (size_t)(i * size_use * size_use * (float)sizeof(T)/(float)sizeof(c_data_ptr)) - << "\n"; - ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC,MemcpyDtoD)( - c_data_ptr + (size_t)(i * size_use * size_use * (float)sizeof(T)/(float)sizeof(c_data_ptr)), - a_data_ptr, memory_size), - device_index); + for 
(std::size_t I = 0; I < Iterations; I++) { + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + auto DestinationPtr = CDataPtr + (I * MatrixSize * MatrixSize); + firestarter::log::trace() << "Initializing " << compat::AccelleratorString << " matrix c-" << I << " by copying " + << MemorySize << " byte from " << ADataPtr << " to " << DestinationPtr << "\n"; + compat::accellSafeCall(compat::memcpyDtoD(DestinationPtr, ADataPtr, MemorySize), __FILE__, + __LINE__, DeviceIndex); } // save gpuvar->init_count and sys.out { - std::lock_guard lk(waitForInitCvMutex); - -#define TO_MB(x) (unsigned long)(x / 1024 / 1024) - firestarter::log::info() - << " GPU " << device_index << "\n" - << " name: " << properties.name << "\n" - << " memory: " << TO_MB(memory_avail) << "/" - << TO_MB(memory_total) << " MiB available (using " << TO_MB(use_bytes) - << " MiB)\n" - << " matrix size: " << size_use << "\n" - << " used precision: " - << ((sizeof(T) == sizeof(double)) ? "double" : "single"); -#undef TO_MB - - initCount++; + const std::lock_guard Lk(WaitForInitCvMutex); + + auto ToMiB = [](const size_t Val) { return Val / 1024 / 1024; }; + firestarter::log::info() << " GPU " << DeviceIndex << "\n" + << " name: " << Properties.name << "\n" + << " memory: " << ToMiB(MemoryAvail) << "/" << ToMiB(MemoryTotal) + << " MiB available (using " << ToMiB(UseBytes) << " MiB)\n" + << " matrix size: " << MatrixSize << "\n" + << " used precision: " + << ((sizeof(FloatingPointType) == sizeof(double)) ? 
"double" : "single"); + + InitCount++; } - waitForInitCv.notify_all(); + WaitForInitCv.notify_all(); - const T alpha = 1.0; - const T beta = 0.0; + const FloatingPointType Alpha = 1.0; + const FloatingPointType Beta = 0.0; - int size_use_i = size_use; // actual stress begins here - while (*loadVar != LOAD_STOP) { - for (i = 0; i < iterations; i++) { - ACCELL_SAFE_CALL(gemm( - cublas, - CONCAT(FS_ACCEL_PREFIX_UC,BLAS_OP_N), - CONCAT(FS_ACCEL_PREFIX_UC,BLAS_OP_N), - size_use_i, size_use_i, - size_use_i, &alpha, (const T *)a_data_ptr, size_use_i, - (const T *)b_data_ptr, size_use_i, &beta, - (T *)c_data_ptr + i * size_use * size_use, size_use_i), - device_index); - ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC_LONG,DeviceSynchronize)(), - device_index); + while (LoadVar != firestarter::LoadThreadWorkType::LoadStop) { + for (std::size_t I = 0; I < Iterations; I++) { + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + auto CSectionPtr = CDataPtr + (I * MatrixSize * MatrixSize); + compat::accellSafeCall(compat::gemm(Blas, compat::BlasOperation::BLAS_OP_N, + compat::BlasOperation::BLAS_OP_N, MatrixSize, MatrixSize, + MatrixSize, Alpha, ADataPtr, MatrixSize, BDataPtr, + MatrixSize, Beta, CSectionPtr, MatrixSize), + __FILE__, __LINE__, DeviceIndex); + compat::accellSafeCall(compat::deviceSynchronize(), __FILE__, __LINE__, DeviceIndex); } } -#ifdef FIRESTARTER_BUILD_CUDA - ACCELL_SAFE_CALL(cuMemFree(a_data_ptr), device_index); - ACCELL_SAFE_CALL(cuMemFree(b_data_ptr), device_index); - ACCELL_SAFE_CALL(cuMemFree(c_data_ptr), device_index); -#else -#ifdef FIRESTARTER_BUILD_HIP - ACCELL_SAFE_CALL(hipFree(a_data_ptr), device_index); - ACCELL_SAFE_CALL(hipFree(b_data_ptr), device_index); - ACCELL_SAFE_CALL(hipFree(c_data_ptr), device_index); -#endif -#endif - ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC,blasDestroy)(cublas), device_index); -#ifdef FIRESTARTER_BUILD_CUDA - ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC,CtxDestroy)(context), device_index); -#else -#ifdef 
FIRESTARTER_BUILD_HIP - ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC,StreamDestroy)(stream), device_index); -#endif -#endif + compat::accellSafeCall(compat::free(ADataPtr), __FILE__, __LINE__, DeviceIndex); + compat::accellSafeCall(compat::free(BDataPtr), __FILE__, __LINE__, DeviceIndex); + compat::accellSafeCall(compat::free(CDataPtr), __FILE__, __LINE__, DeviceIndex); + + compat::accellSafeCall(compat::blasDestroy(Blas), __FILE__, __LINE__, DeviceIndex); + + compat::accellSafeCall(compat::destroyContextOrStream(StreamOrContext), __FILE__, __LINE__, DeviceIndex); } -Cuda::Cuda(volatile unsigned long long *loadVar, bool useFloat, bool useDouble, - unsigned matrixSize, int gpus) { - std::thread t(Cuda::initGpus, std::ref(_waitForInitCv), loadVar, useFloat, - useDouble, matrixSize, gpus); - _initThread = std::move(t); +}; // namespace + +Cuda::Cuda(const volatile firestarter::LoadThreadWorkType& LoadVar, bool UseFloat, bool UseDouble, unsigned MatrixSize, + int Gpus) { + std::condition_variable WaitForInitCv; + std::mutex WaitForInitCvMutex; - std::unique_lock lk(_waitForInitCvMutex); + std::thread T(Cuda::initGpus, std::ref(WaitForInitCv), std::cref(LoadVar), UseFloat, UseDouble, MatrixSize, Gpus); + InitThread = std::move(T); + + std::unique_lock Lk(WaitForInitCvMutex); // wait for gpus to initialize - _waitForInitCv.wait(lk); + WaitForInitCv.wait(Lk); } -void Cuda::initGpus(std::condition_variable &cv, - volatile unsigned long long *loadVar, bool useFloat, - bool useDouble, unsigned matrixSize, int gpus) { - std::condition_variable waitForInitCv; - std::mutex waitForInitCvMutex; +void Cuda::initGpus(std::condition_variable& WaitForInitCv, const volatile firestarter::LoadThreadWorkType& LoadVar, + bool UseFloat, bool UseDouble, unsigned MatrixSize, int Gpus) { + std::condition_variable GpuThreadsWaitForInitCv; + std::mutex GpuThreadsWaitForInitCvMutex; + std::vector GpuThreads; - if (gpus) { - ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC,Init)(0), -1); - int devCount; 
-#ifdef FIRESTARTER_BUILD_CUDA - ACCELL_SAFE_CALL(cuDeviceGetCount(&devCount), -1); -#else -#ifdef FIRESTARTER_BUILD_HIP - ACCELL_SAFE_CALL(hipGetDeviceCount(&devCount), -1); -#endif -#endif + if (Gpus != 0) { + compat::accellSafeCall(compat::init(0), __FILE__, __LINE__); + + int DevCount{}; + compat::accellSafeCall(compat::getDeviceCount(DevCount), __FILE__, __LINE__); - if (devCount) { - std::vector gpuThreads; - std::atomic initCount = 0; - int use_double; + if (DevCount) { + std::atomic InitCount = 0; + int UseDoubleConverted{}; - if (useFloat) { - use_double = 0; - } else if (useDouble) { - use_double = 1; + if (UseFloat) { + UseDoubleConverted = 0; + } else if (UseDouble) { + UseDoubleConverted = 1; } else { - use_double = 2; + UseDoubleConverted = 2; } firestarter::log::info() @@ -636,65 +306,61 @@ void Cuda::initGpus(std::condition_variable &cv, << "\n graphics processor characteristics:"; // use all GPUs if the user gave no information about use_device - if (gpus < 0) { - gpus = devCount; + if (Gpus < 0) { + Gpus = DevCount; } - if (gpus > devCount) { - firestarter::log::warn() - << "You requested more " FS_ACCEL_STRING " devices than available. " - "Maybe you set " FS_ACCEL_STRING "_VISIBLE_DEVICES?"; - firestarter::log::warn() - << "FIRESTARTER will use " << devCount << " of the requested " - << gpus << " " FS_ACCEL_STRING " device(s)"; - gpus = devCount; + if (Gpus > DevCount) { + firestarter::log::warn() << "You requested more " << compat::AccelleratorString + << " devices than available. 
" + "Maybe you set " + << compat::AccelleratorString << "_VISIBLE_DEVICES?"; + firestarter::log::warn() << "FIRESTARTER will use " << DevCount << " of the requested " << Gpus << " " + << compat::AccelleratorString << " device(s)"; + Gpus = DevCount; } { - std::lock_guard lk(waitForInitCvMutex); + const std::lock_guard Lk(GpuThreadsWaitForInitCvMutex); - for (int i = 0; i < gpus; ++i) { + for (int I = 0; I < Gpus; ++I) { // if there's a GPU in the system without Double Precision support, we // have to correct this. - int precision = get_precision(i, use_double); - - if (precision) { - std::thread t(create_load, std::ref(waitForInitCv), - std::ref(waitForInitCvMutex), i, std::ref(initCount), - loadVar, (int)matrixSize); - gpuThreads.push_back(std::move(t)); - } else { - std::thread t(create_load, std::ref(waitForInitCv), - std::ref(waitForInitCvMutex), i, std::ref(initCount), - loadVar, (int)matrixSize); - gpuThreads.push_back(std::move(t)); - } + const auto Precision = getPrecision(I, UseDoubleConverted); + void (*LoadFunc)(std::condition_variable&, std::mutex&, int, std::atomic&, + const volatile firestarter::LoadThreadWorkType&, unsigned) = + Precision ? createLoad : createLoad; + + std::thread T(LoadFunc, std::ref(GpuThreadsWaitForInitCv), std::ref(GpuThreadsWaitForInitCvMutex), I, + std::ref(InitCount), std::cref(LoadVar), MatrixSize); + GpuThreads.emplace_back(std::move(T)); } } { - std::unique_lock lk(waitForInitCvMutex); + std::unique_lock Lk(GpuThreadsWaitForInitCvMutex); // wait for all threads to initialize - waitForInitCv.wait(lk, [&] { return initCount == gpus; }); - } - - // notify that init is done - cv.notify_all(); - - /* join computation threads */ - for (auto &t : gpuThreads) { - t.join(); + GpuThreadsWaitForInitCv.wait(Lk, [&] { return InitCount == Gpus; }); } } else { - firestarter::log::info() - << " - No " FS_ACCEL_STRING " devices. Just stressing CPU(s). 
Maybe use " - "FIRESTARTER instead of FIRESTARTER_" FS_ACCEL_STRING "?"; - cv.notify_all(); + firestarter::log::info() << " - No " << compat::AccelleratorString + << " devices. Just stressing CPU(s). Maybe use " + "FIRESTARTER instead of FIRESTARTER_" + << compat::AccelleratorString << "?"; } } else { - firestarter::log::info() - << " --gpus 0 is set. Just stressing CPU(s). Maybe use " - "FIRESTARTER instead of FIRESTARTER_" FS_ACCEL_STRING "?"; - cv.notify_all(); + firestarter::log::info() << " --gpus 0 is set. Just stressing CPU(s). Maybe use " + "FIRESTARTER instead of FIRESTARTER_" + << compat::AccelleratorString << "?"; + } + + // notify that init is done + WaitForInitCv.notify_all(); + + /* join computation threads */ + for (auto& Thread : GpuThreads) { + Thread.join(); } } + +} // namespace firestarter::cuda \ No newline at end of file diff --git a/src/firestarter/DumpRegisterWorker.cpp b/src/firestarter/DumpRegisterWorker.cpp index 3f7ab6a9..127d0f1d 100644 --- a/src/firestarter/DumpRegisterWorker.cpp +++ b/src/firestarter/DumpRegisterWorker.cpp @@ -19,30 +19,25 @@ * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ -#ifdef FIRESTARTER_DEBUG_FEATURES - -#include -#include +#include "firestarter/Firestarter.hpp" #include #include #include -using namespace firestarter; - namespace { -static unsigned hammingDistance(unsigned long long x, unsigned long long y) { - unsigned dist = 0; +auto hammingDistance(uint64_t X, uint64_t Y) -> unsigned { + unsigned Dist = 0; - for (unsigned long long val = x ^ y; val > 0; val >>= 1) { - dist += val & 1; + for (uint64_t Val = X ^ Y; Val > 0; Val >>= 1) { + Dist += Val & 1; } - return dist; + return Dist; } -static std::string registerNameBySize(unsigned registerSize) { - switch (registerSize) { +auto registerNameBySize(unsigned RegisterSize) -> std::string { + switch (RegisterSize) { case 2: return "xmm"; case 4: @@ -55,141 +50,120 @@ static std::string 
registerNameBySize(unsigned registerSize) { } } // namespace -int Firestarter::initDumpRegisterWorker(std::chrono::seconds dumpTimeDelta, - std::string dumpFilePath) { +namespace firestarter { - auto data = std::make_unique( - this->loadThreads.begin()->second, dumpTimeDelta, dumpFilePath); +void Firestarter::initDumpRegisterWorker() { + // Create the data for the worker thread. The thread will dump the register contents periodically and calculate the + // hamming distance between dumps. + auto Data = std::make_unique(this->LoadThreads.begin()->second, Cfg.DumpRegistersTimeDelta, + Cfg.DumpRegistersOutpath); - this->dumpRegisterWorkerThread = - std::thread(Firestarter::dumpRegisterWorker, std::move(data)); - - return EXIT_SUCCESS; + // Spawn the thread. + DumpRegisterWorkerThread = std::thread(Firestarter::dumpRegisterWorker, std::move(Data)); } -void Firestarter::joinDumpRegisterWorker() { - this->dumpRegisterWorkerThread.join(); -} - -void Firestarter::dumpRegisterWorker( - std::unique_ptr data) { +void Firestarter::joinDumpRegisterWorker() { this->DumpRegisterWorkerThread.join(); } +void Firestarter::dumpRegisterWorker(std::unique_ptr Data) { +#if defined(linux) || defined(__linux__) pthread_setname_np(pthread_self(), "DumpRegWorker"); +#endif - int registerCount = data->loadWorkerData->config().payload().registerCount(); - int registerSize = data->loadWorkerData->config().payload().registerSize(); - std::string registerPrefix = registerNameBySize(registerSize); - auto offset = sizeof(DumpRegisterStruct) / sizeof(unsigned long long); - - auto dumpRegisterStruct = reinterpret_cast( - data->loadWorkerData->addrMem - offset); + const auto RegisterCount = Data->LoadWorkerDataPtr->config().payload()->registerCount(); + const auto RegisterSize = Data->LoadWorkerDataPtr->config().payload()->registerSize(); + const auto Offset = RegisterCount * RegisterSize; + const std::string RegisterPrefix = registerNameBySize(RegisterSize); - auto dumpVar = reinterpret_cast( - 
&dumpRegisterStruct->dumpVar); + auto& DumpRegisterStructRef = Data->LoadWorkerDataPtr->Memory->ExtraVars.Drs; + auto& DumpVar = DumpRegisterStructRef.DumpVar; // memory of simd variables is before the padding - volatile unsigned long long *dumpMemAddr = - dumpRegisterStruct->padding - registerCount * registerSize; - - // TODO: maybe use aligned_malloc to make memcpy more efficient and don't - // interrupt the workload as much? - unsigned long long *last = reinterpret_cast( - malloc(sizeof(unsigned long long) * offset)); - unsigned long long *current = reinterpret_cast( - malloc(sizeof(unsigned long long) * offset)); - - if (last == nullptr || current == nullptr) { - log::error() << "Malloc failed in Firestarter::dumpRegisterWorker"; - exit(ENOMEM); - } + const auto* DumpMemAddr = DumpRegisterStructRef.Padding.data() - Offset; + + // allocate continous memory that fits the register contents + auto Last = std::vector(Offset); - std::stringstream dumpFilePath; - dumpFilePath << data->dumpFilePath; + std::stringstream DumpFilePath; + DumpFilePath << Data->DumpFilePath; #if defined(__MINGW32__) || defined(__MINGW64__) - dumpFilePath << "\\"; + DumpFilePath << "\\"; #else - dumpFilePath << "/"; + DumpFilePath << "/"; #endif - dumpFilePath << "hamming_distance.csv"; - auto dumpFile = std::ofstream(dumpFilePath.str()); + DumpFilePath << "hamming_distance.csv"; + auto DumpFile = std::ofstream(DumpFilePath.str()); // dump the header to the csv file - dumpFile << "total_hamming_distance,"; - for (int i = 0; i < registerCount; i++) { - for (int j = 0; j < registerSize; j++) { - dumpFile << registerPrefix << i << "[" << j << "]"; + DumpFile << "total_hamming_distance,"; + for (auto I = 0U; I < RegisterCount; I++) { + for (auto J = 0U; J < RegisterSize; J++) { + DumpFile << RegisterPrefix << I << "[" << J << "]"; - if (j != registerSize - 1) { - dumpFile << ","; + if (J != RegisterSize - 1) { + DumpFile << ","; } } - if (i != registerCount - 1) { - dumpFile << ","; + if (I != 
RegisterCount - 1) { + DumpFile << ","; } } - dumpFile << std::endl << std::flush; + DumpFile << '\n' << std::flush; // do not output the hamming distance for the first run - bool skipFirst = true; + bool SkipFirst = true; // continue until stop and dump the registers every data->dumpTimeDelta // seconds - for (; *data->loadWorkerData->addrHigh != LOAD_STOP;) { + for (; Data->LoadWorkerDataPtr->LoadVar != LoadThreadWorkType::LoadStop;) { // signal the thread to dump its largest SIMD registers - *dumpVar = DumpVariable::Start; + DumpVar = DumpVariable::Start; __asm__ __volatile__("mfence;"); - while (*dumpVar == DumpVariable::Start) { + while (DumpVar == DumpVariable::Start) { std::this_thread::sleep_for(std::chrono::milliseconds(10)); } + auto Current = std::vector(Offset); // copy the register content to minimize the interruption of the load worker - std::memcpy(current, (void *)dumpMemAddr, - sizeof(unsigned long long) * offset); + std::memcpy(Current.data(), DumpMemAddr, Current.size() * sizeof(decltype(Current)::value_type)); // skip the first output, as we first have to get some valid values for last - if (!skipFirst) { + if (!SkipFirst) { // calculate the total hamming distance - int totalHammingDistance = 0; - for (int i = 0; i < registerCount * registerSize; i++) { - totalHammingDistance += hammingDistance(current[i], last[i]); + auto TotalHammingDistance = 0U; + for (auto I = 0U; I < RegisterCount * RegisterSize; I++) { + TotalHammingDistance += hammingDistance(Current[I], Last[I]); } - dumpFile << totalHammingDistance << ","; + DumpFile << TotalHammingDistance << ","; // dump the hamming distance of each double (last, current) pair - for (int i = registerCount - 1; i >= 0; i--) { - // auto registerNum = registerCount - 1 - i; - - for (auto j = 0; j < registerSize; j++) { - auto index = registerSize * i + j; - auto hd = static_cast( - hammingDistance(current[index], last[index])); - - dumpFile << hd; - if (j != registerSize - 1) { - dumpFile << ","; + for 
(int I = static_cast(RegisterCount) - 1; I >= 0; I--) { + for (auto J = 0U; J < RegisterSize; J++) { + auto Index = (RegisterSize * I) + J; + auto Hd = static_cast(hammingDistance(Current[Index], Last[Index])); + + DumpFile << Hd; + if (J != RegisterSize - 1) { + DumpFile << ","; } } - if (i != 0) { - dumpFile << ","; + if (I != 0) { + DumpFile << ","; } } - dumpFile << std::endl << std::flush; + DumpFile << '\n' << std::flush; } else { - skipFirst = false; + SkipFirst = false; } - std::memcpy(last, current, sizeof(unsigned long long) * offset); + Last = std::move(Current); - std::this_thread::sleep_for(std::chrono::seconds(data->dumpTimeDelta)); + std::this_thread::sleep_for(std::chrono::seconds(Data->DumpTimeDelta)); } - dumpFile.close(); - - free(last); - free(current); + DumpFile.close(); } -#endif +} // namespace firestarter \ No newline at end of file diff --git a/src/firestarter/Environment/CPUTopology.cpp b/src/firestarter/Environment/CPUTopology.cpp index d7fb4bf0..a7acf3f2 100644 --- a/src/firestarter/Environment/CPUTopology.cpp +++ b/src/firestarter/Environment/CPUTopology.cpp @@ -19,127 +19,117 @@ * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ -#include -#include +#include "firestarter/Environment/CPUTopology.hpp" +#include "firestarter/Logging/Log.hpp" #include #include #include +#include -extern "C" { -#include -} - -using namespace firestarter::environment; +namespace firestarter::environment { -std::ostream &CPUTopology::print(std::ostream &stream) const { - stream << " system summary:\n" - << " number of processors: " << this->numPackages() << "\n" - << " number of cores (total)): " << this->numCoresTotal() << "\n" - << " (this includes only cores in the cgroup)" << "\n" - << " number of threads per core: " << this->numThreadsPerCore() +auto CPUTopology::print(std::ostream& Stream) const -> std::ostream& { + Stream << " system summary:\n" + << " number of processors: " << 
numPackages() << "\n" + << " number of cores (total)): " << numCoresTotal() << "\n" + << " (this includes only cores in the cgroup)" << "\n" - << " total number of threads: " << this->numThreads() << "\n\n"; + << " number of threads per core: " << numThreadsPerCore() << "\n" + << " total number of threads: " << numThreads() << "\n\n"; - std::stringstream ss; + std::stringstream Ss; - for (auto const &ent : this->features()) { - ss << ent << " "; + for (auto const& Entry : features()) { + Ss << Entry << " "; } - stream << " processor characteristics:\n" - << " architecture: " << this->architecture() << "\n" - << " vendor: " << this->vendor() << "\n" - << " processor-name: " << this->processorName() << "\n" - << " model: " << this->model() << "\n" - << " frequency: " << this->clockrate() / 1000000 - << " MHz\n" - << " supported features: " << ss.str() << "\n" + Stream << " processor characteristics:\n" + << " architecture: " << architecture() << "\n" + << " vendor: " << vendor() << "\n" + << " processor-name: " << processorName() << "\n" + << " model: " << model() << "\n" + << " frequency: " << clockrate() / 1000000 << " MHz\n" + << " supported features: " << Ss.str() << "\n" << " Caches:"; - std::vector caches = { - HWLOC_OBJ_L1CACHE, HWLOC_OBJ_L1ICACHE, HWLOC_OBJ_L2CACHE, - HWLOC_OBJ_L2ICACHE, HWLOC_OBJ_L3CACHE, HWLOC_OBJ_L3ICACHE, - HWLOC_OBJ_L4CACHE, HWLOC_OBJ_L5CACHE, + const std::vector Caches = { + HWLOC_OBJ_L1CACHE, HWLOC_OBJ_L1ICACHE, HWLOC_OBJ_L2CACHE, HWLOC_OBJ_L2ICACHE, + HWLOC_OBJ_L3CACHE, HWLOC_OBJ_L3ICACHE, HWLOC_OBJ_L4CACHE, HWLOC_OBJ_L5CACHE, }; - std::vector cacheStrings = {}; + for (hwloc_obj_type_t const& Cache : Caches) { + std::stringstream Ss; - for (hwloc_obj_type_t const &cache : caches) { - int width; - char string[128]; - int shared; - hwloc_obj_t cacheObj; - std::stringstream ss; + auto Width = hwloc_get_nbobjs_by_type(Topology, Cache); - width = hwloc_get_nbobjs_by_type(this->topology, cache); + if (Width >= 1) { + Ss << "\n - "; - if 
(width >= 1) { - ss << "\n - "; + auto* CacheObj = hwloc_get_obj_by_type(Topology, Cache, 0); + std::array String{}; + auto* StringPtr = String.data(); + hwloc_obj_type_snprintf(StringPtr, sizeof(String), CacheObj, 0); - cacheObj = hwloc_get_obj_by_type(this->topology, cache, 0); - hwloc_obj_type_snprintf(string, sizeof(string), cacheObj, 0); - - switch (cacheObj->attr->cache.type) { + switch (CacheObj->attr->cache.type) { case HWLOC_OBJ_CACHE_DATA: - ss << "Level " << cacheObj->attr->cache.depth << " Data"; + Ss << "Level " << CacheObj->attr->cache.depth << " Data"; break; case HWLOC_OBJ_CACHE_INSTRUCTION: - ss << "Level " << cacheObj->attr->cache.depth << " Instruction"; + Ss << "Level " << CacheObj->attr->cache.depth << " Instruction"; break; case HWLOC_OBJ_CACHE_UNIFIED: default: - ss << "Unified Level " << cacheObj->attr->cache.depth; + Ss << "Unified Level " << CacheObj->attr->cache.depth; break; } - ss << " Cache, " << cacheObj->attr->cache.size / 1024 << " KiB, " - << cacheObj->attr->cache.linesize << " B Cacheline, "; + Ss << " Cache, " << CacheObj->attr->cache.size / 1024 << " KiB, " << CacheObj->attr->cache.linesize + << " B Cacheline, "; - switch (cacheObj->attr->cache.associativity) { + switch (CacheObj->attr->cache.associativity) { case -1: - ss << "full"; + Ss << "full"; break; case 0: - ss << "unknown"; + Ss << "unknown"; break; default: - ss << cacheObj->attr->cache.associativity << "-way set"; + Ss << CacheObj->attr->cache.associativity << "-way set"; break; } - ss << " associative, "; + Ss << " associative, "; - shared = this->numThreads() / width; + auto Shared = numThreads() / Width; - if (shared > 1) { - ss << "shared among " << shared << " threads."; + if (Shared > 1) { + Ss << "shared among " << Shared << " threads."; } else { - ss << "per thread."; + Ss << "per thread."; } - stream << ss.str(); + Stream << Ss.str(); } } - return stream; + return Stream; } -CPUTopology::CPUTopology(std::string architecture) - : _architecture(architecture) { 
+CPUTopology::CPUTopology(std::string Architecture) + : Architecture(std::move(Architecture)) { - hwloc_topology_init(&this->topology); + hwloc_topology_init(&Topology); // do not filter icaches - hwloc_topology_set_cache_types_filter(this->topology, - HWLOC_TYPE_FILTER_KEEP_ALL); + hwloc_topology_set_cache_types_filter(Topology, HWLOC_TYPE_FILTER_KEEP_ALL); - hwloc_topology_load(this->topology); + hwloc_topology_load(Topology); // check for hybrid processor - int nr_cpukinds = hwloc_cpukinds_get_nr(this->topology, 0); + const auto NrCpukinds = hwloc_cpukinds_get_nr(Topology, 0); - switch (nr_cpukinds) { + switch (NrCpukinds) { case -1: log::warn() << "Hybrid core check failed"; break; @@ -147,292 +137,273 @@ CPUTopology::CPUTopology(std::string architecture) log::warn() << "Hybrid core check read no information"; break; default: - log::trace() << "Number of CPU kinds:" << nr_cpukinds; + log::trace() << "Number of CPU kinds:" << NrCpukinds; } - if (nr_cpukinds > 1) { + if (NrCpukinds > 1) { log::warn() << "FIRESTARTER detected a hybrid CPU set-up"; } // get number of packages - int depth = hwloc_get_type_depth(this->topology, HWLOC_OBJ_PACKAGE); + int Depth = hwloc_get_type_depth(Topology, HWLOC_OBJ_PACKAGE); - if (depth == HWLOC_TYPE_DEPTH_UNKNOWN) { - this->_numPackages = 1; + if (Depth == HWLOC_TYPE_DEPTH_UNKNOWN) { + NumPackages = 1; log::warn() << "Could not get number of packages"; } else { - this->_numPackages = hwloc_get_nbobjs_by_depth(this->topology, depth); + NumPackages = hwloc_get_nbobjs_by_depth(Topology, Depth); } - log::trace() << "Number of Packages:" << this->_numPackages; + log::trace() << "Number of Packages:" << NumPackages; // get number of cores per package - depth = hwloc_get_type_depth(this->topology, HWLOC_OBJ_CORE); + Depth = hwloc_get_type_depth(Topology, HWLOC_OBJ_CORE); - if (depth == HWLOC_TYPE_DEPTH_UNKNOWN) { - this->_numCoresTotal = 1; + if (Depth == HWLOC_TYPE_DEPTH_UNKNOWN) { + NumCoresTotal = 1; log::warn() << "Could not get 
number of cores"; } else { - this->_numCoresTotal = - hwloc_get_nbobjs_by_depth(this->topology, depth); - if ( this->_numCoresTotal == 0 ) { + NumCoresTotal = hwloc_get_nbobjs_by_depth(Topology, Depth); + if (NumCoresTotal == 0) { log::warn() << "Could not get number of cores"; - this->_numCoresTotal = 1; + NumCoresTotal = 1; } } - log::trace() << "Number of Cores:" << this->_numCoresTotal; + log::trace() << "Number of Cores:" << NumCoresTotal; // get number of threads per core - depth = hwloc_get_type_depth(this->topology, HWLOC_OBJ_PU); + Depth = hwloc_get_type_depth(Topology, HWLOC_OBJ_PU); - if (depth == HWLOC_TYPE_DEPTH_UNKNOWN) { - this->_numThreadsPerCore = 1; + if (Depth == HWLOC_TYPE_DEPTH_UNKNOWN) { + NumThreadsPerCore = 1; log::warn() << "Could not get number of threads"; } else { - this->_numThreadsPerCore = - hwloc_get_nbobjs_by_depth(this->topology, depth) / - this->_numCoresTotal ; - if ( this->_numThreadsPerCore == 0 ) { + NumThreadsPerCore = hwloc_get_nbobjs_by_depth(Topology, Depth) / NumCoresTotal; + if (NumThreadsPerCore == 0) { log::warn() << "Could not get number of threads per core"; - this->_numThreadsPerCore = 1; + NumThreadsPerCore = 1; } } // get vendor, processor name and clockrate for linux #if defined(linux) || defined(__linux__) - auto procCpuinfo = this->getFileAsStream("/proc/cpuinfo"); - std::string line; - std::string clockrate = "0"; - - while (std::getline(procCpuinfo, line, '\n')) { - const std::regex vendorIdRe("^vendor_id.*:\\s*(.*)\\s*$"); - const std::regex modelNameRe("^model name.*:\\s*(.*)\\s*$"); - const std::regex cpuMHzRe("^cpu MHz.*:\\s*(.*)\\s*$"); - std::smatch vendorIdM; - std::smatch modelNameM; - std::smatch cpuMHzM; - - if (std::regex_match(line, vendorIdM, vendorIdRe)) { - this->_vendor = vendorIdM[1].str(); + { + auto ProcCpuinfo = getFileAsStream("/proc/cpuinfo"); + std::string Line; + std::string ClockrateStr = "0"; + + while (std::getline(ProcCpuinfo, Line, '\n')) { + const std::regex 
VendorIdRe("^vendor_id.*:\\s*(.*)\\s*$"); + const std::regex ModelNameRe("^model name.*:\\s*(.*)\\s*$"); + const std::regex CpuMHzRe("^cpu MHz.*:\\s*(.*)\\s*$"); + std::smatch VendorIdMatch; + std::smatch ModelNameMatch; + std::smatch CpuMHzMatch; + + if (std::regex_match(Line, VendorIdMatch, VendorIdRe)) { + Vendor = VendorIdMatch[1].str(); + } + + if (std::regex_match(Line, ModelNameMatch, ModelNameRe)) { + ProcessorName = ModelNameMatch[1].str(); + } + + if (std::regex_match(Line, CpuMHzMatch, CpuMHzRe)) { + ClockrateStr = CpuMHzMatch[1].str(); + } } - if (std::regex_match(line, modelNameM, modelNameRe)) { - this->_processorName = modelNameM[1].str(); + if (Vendor.empty()) { + log::warn() << "Could determine vendor from /proc/cpuinfo"; } - if (std::regex_match(line, cpuMHzM, cpuMHzRe)) { - clockrate = cpuMHzM[1].str(); + if (ProcessorName.empty()) { + log::warn() << "Could determine processor-name from /proc/cpuinfo"; } - } - if (this->_vendor == "") { - log::warn() << "Could determine vendor from /proc/cpuinfo"; - } + if (ClockrateStr == "0") { + firestarter::log::warn() << "Can't determine clockrate from /proc/cpuinfo"; + } else { + firestarter::log::trace() << "Clockrate from /proc/cpuinfo is " << ClockrateStr; + Clockrate = static_cast(1000000U) * std::stoi(ClockrateStr); + } - if (this->_processorName == "") { - log::warn() << "Could determine processor-name from /proc/cpuinfo"; - } + auto Governor = scalingGovernor(); + if (!Governor.empty()) { - if (clockrate == "0") { - firestarter::log::warn() << "Can't determine clockrate from /proc/cpuinfo"; - } else { - firestarter::log::trace() - << "Clockrate from /proc/cpuinfo is " << clockrate; - this->_clockrate = 1e6 * std::stoi(clockrate); - } + auto ScalingCurFreq = getFileAsStream("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq").str(); + auto CpuinfoCurFreq = getFileAsStream("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq").str(); + auto ScalingMaxFreq = 
getFileAsStream("/sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq").str(); + auto CpuinfoMaxFreq = getFileAsStream("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq").str(); - auto governor = this->scalingGovernor(); - if (!governor.empty()) { - - auto scalingCurFreq = - this->getFileAsStream( - "/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq") - .str(); - auto cpuinfoCurFreq = - this->getFileAsStream( - "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq") - .str(); - auto scalingMaxFreq = - this->getFileAsStream( - "/sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq") - .str(); - auto cpuinfoMaxFreq = - this->getFileAsStream( - "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq") - .str(); - - if (governor.compare("performance") || governor.compare("powersave")) { - if (scalingCurFreq.empty()) { - if (!cpuinfoCurFreq.empty()) { - clockrate = cpuinfoCurFreq; + if (Governor == "performance" || Governor == "powersave") { + if (ScalingCurFreq.empty()) { + if (!CpuinfoCurFreq.empty()) { + ClockrateStr = CpuinfoCurFreq; + } + } else { + ClockrateStr = ScalingCurFreq; } } else { - clockrate = scalingCurFreq; - } - } else { - if (scalingMaxFreq.empty()) { - if (!cpuinfoMaxFreq.empty()) { - clockrate = cpuinfoMaxFreq; + if (ScalingMaxFreq.empty()) { + if (!CpuinfoMaxFreq.empty()) { + ClockrateStr = CpuinfoMaxFreq; + } + } else { + ClockrateStr = ScalingMaxFreq; } - } else { - clockrate = scalingMaxFreq; } - } - this->_clockrate = 1e3 * std::stoi(clockrate); + Clockrate = static_cast(1000U) * std::stoi(ClockrateStr); + } } #endif // try to detect processor name for macos #ifdef __APPLE__ - // use sysctl to detect the name - std::array buffer; - auto cmd = "sysctl -n machdep.cpu.brand_string"; - std::unique_ptr pipe(popen(cmd, "r"), pclose); - if (!pipe) { - log::warn() << "Could not determine processor-name"; - } - if (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) { - auto str = std::string(buffer.data()); - 
str.erase(std::remove(str.begin(), str.end(), '\n'), str.end()); - this->_processorName = str; + { + // use sysctl to detect the name + std::array Buffer{}; + const auto* Cmd = "sysctl -n machdep.cpu.brand_string"; + std::unique_ptr Pipe(popen(Cmd, "r"), pclose); + if (!Pipe) { + log::warn() << "Could not determine processor-name"; + } + if (fgets(Buffer.data(), Buffer.size(), Pipe.get()) != nullptr) { + auto Str = std::string(Buffer.data()); + Str.erase(std::remove(Str.begin(), Str.end(), '\n'), Str.end()); + ProcessorName = Str; + } } #endif // try to detect processor name for windows #ifdef _WIN32 - // use wmic - std::array buffer; - auto cmd = "wmic cpu get name"; - std::unique_ptr pipe(_popen(cmd, "r"), _pclose); - if (!pipe) { - log::warn() << "Could not determine processor-name"; - } - auto line = 0; - while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) { - if (line != 1) { - line++; - continue; + { + // use wmic + std::array Buffer{}; + const auto* Cmd = "wmic cpu get name"; + std::unique_ptr Pipe(_popen(Cmd, "r"), _pclose); + if (!Pipe) { + log::warn() << "Could not determine processor-name"; } + auto Line = 0; + while (fgets(Buffer.data(), Buffer.size(), Pipe.get()) != nullptr) { + if (Line != 1) { + Line++; + continue; + } - auto str = std::string(buffer.data()); - str.erase(std::remove(str.begin(), str.end(), '\n'), str.end()); - this->_processorName = str; + auto Str = std::string(Buffer.data()); + Str.erase(std::remove(Str.begin(), Str.end(), '\n'), Str.end()); + ProcessorName = Str; + } } #endif // get L1i-Cache size - int width = hwloc_get_nbobjs_by_type(this->topology, HWLOC_OBJ_L1ICACHE); + const auto Width = hwloc_get_nbobjs_by_type(Topology, HWLOC_OBJ_L1ICACHE); - if (width >= 1) { - hwloc_obj_t cacheObj = - hwloc_get_obj_by_type(this->topology, HWLOC_OBJ_L1ICACHE, 0); - this->_instructionCacheSize = cacheObj->attr->cache.size; + if (Width >= 1) { + hwloc_obj_t CacheObj = hwloc_get_obj_by_type(Topology, HWLOC_OBJ_L1ICACHE, 0); + 
InstructionCacheSize = CacheObj->attr->cache.size; } } -CPUTopology::~CPUTopology() { hwloc_topology_destroy(this->topology); } +CPUTopology::~CPUTopology() { hwloc_topology_destroy(Topology); } -std::stringstream CPUTopology::getFileAsStream(std::string const &filePath) { - std::ifstream file(filePath); - std::stringstream ss; +auto CPUTopology::getFileAsStream(std::string const& FilePath) -> std::stringstream { + std::ifstream File(FilePath); + std::stringstream Ss; - if (!file.is_open()) { - log::trace() << "Could not open " << filePath; + if (!File.is_open()) { + log::trace() << "Could not open " << FilePath; } else { - ss << file.rdbuf(); - file.close(); + Ss << File.rdbuf(); + File.close(); } - return ss; + return Ss; } -std::string CPUTopology::scalingGovernor() const { - return this - ->getFileAsStream("/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor") - .str(); +auto CPUTopology::scalingGovernor() -> std::string { + return getFileAsStream("/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor").str(); } -int CPUTopology::getCoreIdFromPU(unsigned pu) const { - int width; - hwloc_obj_t obj; +auto CPUTopology::getCoreIdFromPU(unsigned Pu) const -> std::optional { + auto Width = hwloc_get_nbobjs_by_type(Topology, HWLOC_OBJ_PU); - width = hwloc_get_nbobjs_by_type(this->topology, HWLOC_OBJ_PU); - - if (width >= 1) { - for (int i = 0; i < width; i++) { - obj = hwloc_get_obj_by_type(this->topology, HWLOC_OBJ_PU, i); - if (obj->os_index == pu) { - for (; obj; obj = obj->parent) { - if (obj->type == HWLOC_OBJ_CORE) { - return obj->logical_index; + if (Width >= 1) { + for (int I = 0; I < Width; I++) { + auto* Obj = hwloc_get_obj_by_type(Topology, HWLOC_OBJ_PU, I); + if (Obj->os_index == Pu) { + for (; Obj; Obj = Obj->parent) { + if (Obj->type == HWLOC_OBJ_CORE) { + return Obj->logical_index; } } } } } - return -1; + return {}; } -int CPUTopology::getPkgIdFromPU(unsigned pu) const { - int width; - hwloc_obj_t obj; - - width = 
hwloc_get_nbobjs_by_type(this->topology, HWLOC_OBJ_PU); +auto CPUTopology::getPkgIdFromPU(unsigned Pu) const -> std::optional { + auto Width = hwloc_get_nbobjs_by_type(Topology, HWLOC_OBJ_PU); - if (width >= 1) { - for (int i = 0; i < width; i++) { - obj = hwloc_get_obj_by_type(this->topology, HWLOC_OBJ_PU, i); - if (obj->os_index == pu) { - for (; obj; obj = obj->parent) { - if (obj->type == HWLOC_OBJ_PACKAGE) { - return obj->logical_index; + if (Width >= 1) { + for (int I = 0; I < Width; I++) { + auto* Obj = hwloc_get_obj_by_type(Topology, HWLOC_OBJ_PU, I); + if (Obj->os_index == Pu) { + for (; Obj; Obj = Obj->parent) { + if (Obj->type == HWLOC_OBJ_PACKAGE) { + return Obj->logical_index; } } } } } - return -1; + return {}; } -unsigned CPUTopology::maxNumThreads() const { - unsigned max = 0; +auto CPUTopology::maxNumThreads() const -> unsigned { + unsigned Max = 0; // There might be more then one kind of cores - int nr_cpukinds = hwloc_cpukinds_get_nr(this->topology, 0); + const auto NrCpukinds = hwloc_cpukinds_get_nr(Topology, 0); // fallback in case this did not work ... can happen on some platforms // already printed a warning earlier - if (nr_cpukinds < 1) { - hwloc_obj_t obj; - int width = hwloc_get_nbobjs_by_type(this->topology, HWLOC_OBJ_PU); - unsigned max = 0; - - for (int i = 0; i < width; i++) { - obj = hwloc_get_obj_by_type(this->topology, HWLOC_OBJ_PU, i); - max = max < obj->os_index ? 
obj->os_index : max; + if (NrCpukinds < 1) { + auto Width = hwloc_get_nbobjs_by_type(Topology, HWLOC_OBJ_PU); + unsigned Max = 0; + + for (int I = 0; I < Width; I++) { + auto* Obj = hwloc_get_obj_by_type(Topology, HWLOC_OBJ_PU, I); + Max = (std::max)(Max, Obj->os_index); } - return max + 1; + return Max + 1; } // Allocate bitmap to get CPUs later - hwloc_bitmap_t bitmap = hwloc_bitmap_alloc(); - if (bitmap == NULL) { + hwloc_bitmap_t Bitmap = hwloc_bitmap_alloc(); + if (Bitmap == nullptr) { log::error() << "Could not allocate memory for CPU bitmap"; return 1; } // Find CPUs per kind - for (int kind_index = 0; kind_index < nr_cpukinds; kind_index++) { - int result = hwloc_cpukinds_get_info(this->topology, kind_index, bitmap, - NULL, NULL, NULL, 0); - if (result) { - log::warn() << "Could not get information for CPU kind " << kind_index; + for (int KindIndex = 0; KindIndex < NrCpukinds; KindIndex++) { + const auto Result = hwloc_cpukinds_get_info(Topology, KindIndex, Bitmap, nullptr, nullptr, nullptr, 0); + if (Result) { + log::warn() << "Could not get information for CPU kind " << KindIndex; } - max += hwloc_bitmap_weight(bitmap); + Max += hwloc_bitmap_weight(Bitmap); } - hwloc_bitmap_free(bitmap); + hwloc_bitmap_free(Bitmap); - return max; + return Max; } + +}; // namespace firestarter::environment \ No newline at end of file diff --git a/src/firestarter/Environment/Environment.cpp b/src/firestarter/Environment/Environment.cpp index d827ee83..9d3f81c7 100644 --- a/src/firestarter/Environment/Environment.cpp +++ b/src/firestarter/Environment/Environment.cpp @@ -19,232 +19,204 @@ * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ -#include -#include +#include "firestarter/Environment/Environment.hpp" +#include "firestarter/Logging/Log.hpp" -#include #include +#include #include -using namespace firestarter::environment; +namespace firestarter::environment { -#if (defined(linux) || 
defined(__linux__)) && \ - defined(FIRESTARTER_THREAD_AFFINITY) +#if (defined(linux) || defined(__linux__)) && defined(FIRESTARTER_THREAD_AFFINITY) extern "C" { #include } -// this code is from the C version of FIRESTARTER -// TODO: replace this with cpu affinity of hwloc -#define ADD_CPU_SET(cpu, cpuset) \ - do { \ - if (this->cpuAllowed(cpu)) { \ - CPU_SET(cpu, &cpuset); \ - } else { \ - if (cpu >= this->topology().numThreads()) { \ - log::error() << "The given bind argument (-b/--bind) includes CPU " \ - << cpu << " that is not available on this system."; \ - } else { \ - log::error() << "The given bind argument (-b/--bind) cannot " \ - "be implemented with the cpuset given from the OS\n" \ - << "This can be caused by the taskset tool, cgroups, " \ - "the batch system, or similar mechanisms.\n" \ - << "Please fix the argument to match the restrictions."; \ - } \ - return EACCES; \ - } \ - } while (0) - -int Environment::cpuSet(unsigned id) { - cpu_set_t mask; - - CPU_ZERO(&mask); - CPU_SET(id, &mask); - - return sched_setaffinity(0, sizeof(cpu_set_t), &mask); +auto Environment::cpuSet(unsigned Id) -> int { + cpu_set_t Mask; + + CPU_ZERO(&Mask); + CPU_SET(Id, &Mask); + + return sched_setaffinity(0, sizeof(cpu_set_t), &Mask); } -int Environment::cpuAllowed(unsigned id) { - cpu_set_t mask; +auto Environment::cpuAllowed(unsigned Id) -> bool { + cpu_set_t Mask; - CPU_ZERO(&mask); + CPU_ZERO(&Mask); - if (!sched_getaffinity(0, sizeof(cpu_set_t), &mask)) { - return CPU_ISSET(id, &mask); + if (!sched_getaffinity(0, sizeof(cpu_set_t), &Mask)) { + return CPU_ISSET(Id, &Mask); } - return 0; + return false; } -#endif -int Environment::evaluateCpuAffinity(unsigned requestedNumThreads, - std::string cpuBind) { -#if not((defined(linux) || defined(__linux__)) && \ - defined(FIRESTARTER_THREAD_AFFINITY)) - (void)cpuBind; +void Environment::addCpuSet(unsigned Cpu, cpu_set_t& Mask) const { + if (cpuAllowed(Cpu)) { + CPU_SET(Cpu, &Mask); + } else { + if (Cpu >= 
topology().numThreads()) { + throw std::invalid_argument("The given bind argument (-b/--bind) includes CPU " + std::to_string(Cpu) + + " that is not available on this system."); + } + throw std::invalid_argument("The given bind argument (-b/--bind) cannot " + "be implemented with the cpuset given from the OS\n" + "This can be caused by the taskset tool, cgroups, " + "the batch system, or similar mechanisms.\n" + "Please fix the argument to match the restrictions."); + } +} #endif - if (requestedNumThreads > 0 && - requestedNumThreads > this->topology().numThreads()) { +void Environment::evaluateCpuAffinity(unsigned RequestedNumThreads, const std::string& CpuBind) { + if (RequestedNumThreads > 0 && RequestedNumThreads > topology().numThreads()) { log::warn() << "Not enough CPUs for requested number of threads"; } -#if (defined(linux) || defined(__linux__)) && \ - defined(FIRESTARTER_THREAD_AFFINITY) - cpu_set_t cpuset; +#if (defined(linux) || defined(__linux__)) && defined(FIRESTARTER_THREAD_AFFINITY) + cpu_set_t Cpuset; - CPU_ZERO(&cpuset); + CPU_ZERO(&Cpuset); - if (cpuBind.empty()) { + if (CpuBind.empty()) { // no cpu binding defined // use all CPUs if not defined otherwise - if (requestedNumThreads == 0) { - for (unsigned i = 0; i < this->topology().maxNumThreads(); i++) { - if (this->cpuAllowed(i)) { - CPU_SET(i, &cpuset); - requestedNumThreads++; + if (RequestedNumThreads == 0) { + for (unsigned I = 0; I < topology().maxNumThreads(); I++) { + if (cpuAllowed(I)) { + CPU_SET(I, &Cpuset); + RequestedNumThreads++; } } } else { // if -n / --threads is set - unsigned cpu_count = 0; - for (unsigned i = 0; i < this->topology().maxNumThreads(); i++) { + unsigned CpuCount = 0; + for (unsigned I = 0; I < topology().maxNumThreads(); I++) { // skip if cpu is not available - if (!this->cpuAllowed(i)) { + if (!cpuAllowed(I)) { continue; } - ADD_CPU_SET(i, cpuset); - cpu_count++; + addCpuSet(I, Cpuset); + CpuCount++; // we reached the desired amounts of threads - if 
(cpu_count >= requestedNumThreads) { + if (CpuCount >= RequestedNumThreads) { break; } } // requested to many threads - if (cpu_count < requestedNumThreads) { - log::error() << "You are requesting more threads than " - "there are CPUs available in the given cpuset.\n" - << "This can be caused by the taskset tool, cgrous, " - "the batch system, or similar mechanisms.\n" - << "Please fix the -n/--threads argument to match the " - "restrictions."; - return EACCES; + if (CpuCount < RequestedNumThreads) { + throw std::invalid_argument("You are requesting more threads than " + "there are CPUs available in the given cpuset.\n" + "This can be caused by the taskset tool, cgrous, " + "the batch system, or similar mechanisms.\n" + "Please fix the -n/--threads argument to match the " + "restrictions."); } } } else { + RequestedNumThreads = 0; + // parse CPULIST for binding - const std::string delimiter = ","; - const std::regex re("^(?:(\\d+)(?:-([1-9]\\d*)(?:\\/([1-9]\\d*))?)?)$"); + const auto Delimiter = ','; + const std::regex Re(R"(^(?:(\d+)(?:-([1-9]\d*)(?:\/([1-9]\d*))?)?)$)"); - std::stringstream ss(cpuBind); + std::stringstream Ss(CpuBind); - while (ss.good()) { - std::string token; - std::smatch m; - std::getline(ss, token, ','); - ; + while (Ss.good()) { + std::string Token; + std::smatch M; + std::getline(Ss, Token, Delimiter); - if (std::regex_match(token, m, re)) { - unsigned long x, y, s; + if (std::regex_match(Token, M, Re)) { + uint64_t Y = 0; + uint64_t S = 0; - x = std::stoul(m[1].str()); - if (m[2].matched) { - y = std::stoul(m[2].str()); + auto X = std::stoul(M[1].str()); + if (M[2].matched) { + Y = std::stoul(M[2].str()); } else { - y = x; + Y = X; } - if (m[3].matched) { - s = std::stoul(m[3].str()); + if (M[3].matched) { + S = std::stoul(M[3].str()); } else { - s = 1; + S = 1; } - if (y < x) { - log::error() << "y has to be >= x in x-y expressions of CPU list: " - << token; - return EXIT_FAILURE; + if (Y < X) { + throw std::invalid_argument("y has to be 
>= x in x-y expressions of CPU list: " + Token); } - for (unsigned long i = x; i <= y; i += s) { - ADD_CPU_SET(i, cpuset); - requestedNumThreads++; + for (auto I = X; I <= Y; I += S) { + addCpuSet(I, Cpuset); + RequestedNumThreads++; } } else { - log::error() << "Invalid symbols in CPU list: " << token; - return EXIT_FAILURE; + throw std::invalid_argument("Invalid symbols in CPU list: " + Token); } } } -#else - if (requestedNumThreads == 0) { - requestedNumThreads = this->topology().maxNumThreads(); - } -#endif - if (requestedNumThreads == 0) { - log::error() << "Found no usable CPUs!"; - return 127; + if (RequestedNumThreads == 0) { + throw std::invalid_argument("Found no usable CPUs!"); } -#if (defined(linux) || defined(__linux__)) && \ - defined(FIRESTARTER_THREAD_AFFINITY) - else { - for (unsigned i = 0; i < this->topology().maxNumThreads(); i++) { - if (CPU_ISSET(i, &cpuset)) { - this->cpuBind.push_back(i); - } + + // Save the ids of the threads. + for (unsigned I = 0; I < topology().maxNumThreads(); I++) { + if (CPU_ISSET(I, &Cpuset)) { + this->CpuBind.push_back(I); } } -#endif +#else + (void)CpuBind; - if (requestedNumThreads > this->topology().maxNumThreads()) { - requestedNumThreads = this->topology().maxNumThreads(); + if (RequestedNumThreads == 0) { + RequestedNumThreads = topology().maxNumThreads(); } +#endif - this->_requestedNumThreads = requestedNumThreads; - - return EXIT_SUCCESS; + // Limit the number of thread to the maximum on the CPU. 
+ this->RequestedNumThreads = (std::min)(RequestedNumThreads, topology().maxNumThreads()); } void Environment::printThreadSummary() { - log::info() << "\n using " << this->requestedNumThreads() << " threads"; - -#if (defined(linux) || defined(__linux__)) && \ - defined(FIRESTARTER_THREAD_AFFINITY) - bool printCoreIdInfo = false; - size_t i = 0; - - std::vector cpuBind(this->cpuBind); - cpuBind.resize(this->requestedNumThreads()); - for (auto const &bind : cpuBind) { - int coreId = this->topology().getCoreIdFromPU(bind); - int pkgId = this->topology().getPkgIdFromPU(bind); - - if (coreId != -1 && pkgId != -1) { - log::info() << " - Thread " << i << " run on CPU " << bind << ", core " - << coreId << " in package: " << pkgId; - printCoreIdInfo = true; + log::info() << "\n using " << requestedNumThreads() << " threads"; + +#if (defined(linux) || defined(__linux__)) && defined(FIRESTARTER_THREAD_AFFINITY) + bool PrintCoreIdInfo = false; + size_t I = 0; + + std::vector CpuBind(this->CpuBind); + CpuBind.resize(requestedNumThreads()); + for (auto const& Bind : CpuBind) { + const auto CoreId = topology().getCoreIdFromPU(Bind); + const auto PkgId = topology().getPkgIdFromPU(Bind); + + if (CoreId && PkgId) { + log::info() << " - Thread " << I << " run on CPU " << Bind << ", core " << *CoreId + << " in package: " << *PkgId; + PrintCoreIdInfo = true; } - i++; + I++; } - if (printCoreIdInfo) { - log::info() - << " The cores are numbered using the logical_index from hwloc."; + if (PrintCoreIdInfo) { + log::info() << " The cores are numbered using the logical_index from hwloc."; } #endif } -int Environment::setCpuAffinity(unsigned thread) { - if (thread >= this->requestedNumThreads()) { - log::error() << "Trying to set more CPUs than available."; - return EXIT_FAILURE; +void Environment::setCpuAffinity(unsigned Thread) const { + if (Thread >= requestedNumThreads()) { + throw std::invalid_argument("Trying to set more CPUs than available."); } -#if (defined(linux) || 
defined(__linux__)) && \ - defined(FIRESTARTER_THREAD_AFFINITY) - this->cpuSet(this->cpuBind.at(thread)); +#if (defined(linux) || defined(__linux__)) && defined(FIRESTARTER_THREAD_AFFINITY) + cpuSet(CpuBind.at(Thread)); #endif - - return EXIT_SUCCESS; } +}; // namespace firestarter::environment \ No newline at end of file diff --git a/src/firestarter/Environment/Payload/CompiledPayload.cpp b/src/firestarter/Environment/Payload/CompiledPayload.cpp new file mode 100644 index 00000000..33183d7a --- /dev/null +++ b/src/firestarter/Environment/Payload/CompiledPayload.cpp @@ -0,0 +1,33 @@ +/****************************************************************************** + * FIRESTARTER - A Processor Stress Test Utility + * Copyright (C) 2020 TU Dresden, Center for Information Services and High + * Performance Computing + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ * + * Contact: daniel.hackenberg@tu-dresden.de + *****************************************************************************/ + +#include "firestarter/Environment/Payload/CompiledPayload.hpp" +#include "firestarter/Environment/Payload/Payload.hpp" + +namespace firestarter::environment::payload { + +void CompiledPayload::init(double* MemoryAddr, uint64_t BufferSize) { PayloadPtr->init(MemoryAddr, BufferSize); } + +void CompiledPayload::lowLoadFunction(volatile LoadThreadWorkType& LoadVar, std::chrono::microseconds Period) { + PayloadPtr->lowLoadFunction(LoadVar, Period); +}; + +}; // namespace firestarter::environment::payload \ No newline at end of file diff --git a/src/firestarter/Environment/Payload/Payload.cpp b/src/firestarter/Environment/Payload/Payload.cpp deleted file mode 100644 index 68cfc547..00000000 --- a/src/firestarter/Environment/Payload/Payload.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/****************************************************************************** - * FIRESTARTER - A Processor Stress Test Utility - * Copyright (C) 2020 TU Dresden, Center for Information Services and High - * Performance Computing - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- * - * Contact: daniel.hackenberg@tu-dresden.de - *****************************************************************************/ - -#include -#include - -#include - -using namespace firestarter::environment::payload; - -unsigned -Payload::getSequenceStartCount(const std::vector &sequence, - const std::string start) { - unsigned i = 0; - - for (const auto &item : sequence) { - if (0 == item.rfind(start, 0)) { - i++; - } - } - - return i; -} - -std::vector Payload::generateSequence( - std::vector> const &proportions) { - std::vector> prop = proportions; - - prop.erase(std::remove_if(prop.begin(), prop.end(), - [](auto const &pair) { return pair.second == 0; }), - prop.end()); - - std::vector sequence = {}; - - if (prop.size() == 0) { - return sequence; - } - - auto it = prop.begin(); - auto insertIt = sequence.begin(); - - sequence.insert(insertIt, it->second, it->first); - - for (++it; it != prop.end(); ++it) { - for (unsigned i = 0; i < it->second; i++) { - insertIt = sequence.begin(); - std::advance(insertIt, 1 + floor(i * (sequence.size() + it->second - i) / - (float)it->second)); - sequence.insert(insertIt, it->first); - } - } - - return sequence; -} - -unsigned Payload::getL2LoopCount(const std::vector &sequence, - const unsigned numberOfLines, - const unsigned size, const unsigned threads) { - if (this->getL2SequenceCount(sequence) == 0) { - return 0; - } - return (0.8 * size / 64 / threads / - (this->getL2SequenceCount(sequence) * - this->getNumberOfSequenceRepetitions(sequence, - numberOfLines / threads))); -} - -unsigned Payload::getL3LoopCount(const std::vector &sequence, - const unsigned numberOfLines, - const unsigned size, const unsigned threads) { - if (this->getL3SequenceCount(sequence) == 0) { - return 0; - } - return (0.8 * size / 64 / threads / - (this->getL3SequenceCount(sequence) * - this->getNumberOfSequenceRepetitions(sequence, - numberOfLines / threads))); -} - -unsigned Payload::getRAMLoopCount(const std::vector &sequence, - const unsigned 
numberOfLines, - const unsigned size, const unsigned threads) { - if (this->getRAMSequenceCount(sequence) == 0) { - return 0; - } - return (1.0 * size / 64 / threads / - (this->getRAMSequenceCount(sequence) * - this->getNumberOfSequenceRepetitions(sequence, - numberOfLines / threads))); -} diff --git a/src/firestarter/Environment/Payload/PayloadSettings.cpp b/src/firestarter/Environment/Payload/PayloadSettings.cpp new file mode 100644 index 00000000..25ca4ea4 --- /dev/null +++ b/src/firestarter/Environment/Payload/PayloadSettings.cpp @@ -0,0 +1,98 @@ +/****************************************************************************** + * FIRESTARTER - A Processor Stress Test Utility + * Copyright (C) 2020 TU Dresden, Center for Information Services and High + * Performance Computing + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ * + * Contact: daniel.hackenberg@tu-dresden.de + *****************************************************************************/ + +#include "firestarter/Environment/Payload/PayloadSettings.hpp" + +#include +#include + +namespace firestarter::environment::payload { + +auto PayloadSettings::getSequenceStartCount(const std::vector& Sequence, const std::string& Start) + -> unsigned { + unsigned I = 0; + + for (const auto& Item : Sequence) { + if (0 == Item.rfind(Start, 0)) { + I++; + } + } + + return I; +} + +auto PayloadSettings::generateSequence(std::vector const& Proportions) + -> std::vector { + std::vector> Prop = Proportions; + + Prop.erase(std::remove_if(Prop.begin(), Prop.end(), [](auto const& Pair) { return Pair.second == 0; }), Prop.end()); + + std::vector Sequence = {}; + + if (Prop.empty()) { + return Sequence; + } + + auto It = Prop.begin(); + auto InsertIt = Sequence.begin(); + + Sequence.insert(InsertIt, It->second, It->first); + + for (++It; It != Prop.end(); ++It) { + for (unsigned I = 0; I < It->second; I++) { + InsertIt = Sequence.begin(); + std::advance(InsertIt, 1 + std::floor(static_cast(I * (Sequence.size() + It->second - I)) / + static_cast(It->second))); + Sequence.insert(InsertIt, It->first); + } + } + + return Sequence; +} + +auto PayloadSettings::getL2LoopCount(const std::vector& Sequence, const unsigned NumberOfLines, + const unsigned Size) -> unsigned { + if (getL2SequenceCount(Sequence) == 0) { + return 0; + } + return static_cast( + (0.8 * Size / 64 / (getL2SequenceCount(Sequence) * getNumberOfSequenceRepetitions(Sequence, NumberOfLines)))); +} + +auto PayloadSettings::getL3LoopCount(const std::vector& Sequence, const unsigned NumberOfLines, + const unsigned Size) -> unsigned { + if (getL3SequenceCount(Sequence) == 0) { + return 0; + } + return static_cast( + (0.8 * Size / 64 / (getL3SequenceCount(Sequence) * getNumberOfSequenceRepetitions(Sequence, NumberOfLines)))); +} + +auto PayloadSettings::getRAMLoopCount(const std::vector& 
Sequence, const unsigned NumberOfLines, + const unsigned Size) -> unsigned { + if (getRAMSequenceCount(Sequence) == 0) { + return 0; + } + return static_cast( + (1.0 * Size / 64 / (getRAMSequenceCount(Sequence) * getNumberOfSequenceRepetitions(Sequence, NumberOfLines)))); +} + +}; // namespace firestarter::environment::payload \ No newline at end of file diff --git a/src/firestarter/Environment/X86/Payload/AVX512Payload.cpp b/src/firestarter/Environment/X86/Payload/AVX512Payload.cpp index 9316ed39..f52a5410 100644 --- a/src/firestarter/Environment/X86/Payload/AVX512Payload.cpp +++ b/src/firestarter/Environment/X86/Payload/AVX512Payload.cpp @@ -19,432 +19,373 @@ * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ -#include - -using namespace firestarter::environment::x86::payload; -using namespace asmjit; -using namespace asmjit::x86; - -int AVX512Payload::compilePayload( - std::vector> const &proportion, - unsigned instructionCacheSize, - std::list const &dataCacheBufferSize, unsigned ramBufferSize, - unsigned thread, unsigned numberOfLines, bool dumpRegisters, - bool errorDetection) { +#include "firestarter/Environment/X86/Payload/AVX512Payload.hpp" +#include "firestarter/Environment/X86/Payload/CompiledX86Payload.hpp" + +namespace firestarter::environment::x86::payload { + +auto AVX512Payload::compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters, + bool ErrorDetection) const -> environment::payload::CompiledPayload::UniquePtr { + using Imm = asmjit::Imm; + using Zmm = asmjit::x86::Zmm; + // NOLINTBEGIN(readability-identifier-naming) + constexpr asmjit::x86::Mem (*zmmword_ptr)(const asmjit::x86::Gp&, int32_t) = asmjit::x86::zmmword_ptr; + constexpr auto zmm0 = asmjit::x86::zmm0; + constexpr auto zmm1 = asmjit::x86::zmm1; + constexpr auto zmm2 = asmjit::x86::zmm2; + // NOLINTEND(readability-identifier-naming) // Compute the sequence of instruction groups and 
the number of its repetions // to reach the desired size - auto sequence = this->generateSequence(proportion); - auto repetitions = - this->getNumberOfSequenceRepetitions(sequence, numberOfLines / thread); + auto Sequence = Settings.sequence(); + auto Repetitions = + environment::payload::PayloadSettings::getNumberOfSequenceRepetitions(Sequence, Settings.linesPerThread()); // compute count of flops and memory access for performance report - unsigned flops = 0; - unsigned bytes = 0; + environment::payload::PayloadStats Stats; - for (const auto &item : sequence) { - auto it = this->instructionFlops.find(item); + for (const auto& Item : Sequence) { + auto It = instructionFlops().find(Item); - if (it == this->instructionFlops.end()) { - workerLog::error() << "Instruction group " << item << " undefined in " - << name() << "."; - return EXIT_FAILURE; + if (It == instructionFlops().end()) { + workerLog::error() << "Instruction group " << Item << " undefined in " << name() << "."; } - flops += it->second; + Stats.Flops += It->second; - it = this->instructionMemory.find(item); + It = instructionMemory().find(Item); - if (it != this->instructionMemory.end()) { - bytes += it->second; + if (It != instructionMemory().end()) { + Stats.Bytes += It->second; } } - this->_flops = repetitions * flops; - this->_bytes = repetitions * bytes; - this->_instructions = repetitions * sequence.size() * 4 + 6; + Stats.Flops *= Repetitions; + Stats.Bytes *= Repetitions; + Stats.Instructions = Repetitions * Sequence.size() * 4 + 6; // calculate the buffer sizes - auto l1i_cache_size = instructionCacheSize / thread; - auto dataCacheBufferSizeIterator = dataCacheBufferSize.begin(); - auto l1_size = *dataCacheBufferSizeIterator / thread; - std::advance(dataCacheBufferSizeIterator, 1); - auto l2_size = *dataCacheBufferSizeIterator / thread; - std::advance(dataCacheBufferSizeIterator, 1); - auto l3_size = *dataCacheBufferSizeIterator / thread; - auto ram_size = ramBufferSize / thread; + const auto 
L1iCacheSize = Settings.instructionCacheSizePerThread(); + const auto DataCacheBufferSizes = Settings.dataCacheBufferSizePerThread(); + auto DataCacheBufferSizeIterator = DataCacheBufferSizes.begin(); + const auto L1Size = *DataCacheBufferSizeIterator; + std::advance(DataCacheBufferSizeIterator, 1); + const auto L2Size = *DataCacheBufferSizeIterator; + std::advance(DataCacheBufferSizeIterator, 1); + const auto L3Size = *DataCacheBufferSizeIterator; + const auto RamSize = Settings.ramBufferSizePerThread(); // calculate the reset counters for the buffers - auto l2_loop_count = - getL2LoopCount(sequence, numberOfLines, l2_size * thread, thread); - auto l3_loop_count = - getL3LoopCount(sequence, numberOfLines, l3_size * thread, thread); - auto ram_loop_count = - getRAMLoopCount(sequence, numberOfLines, ram_size * thread, thread); - - CodeHolder code; - code.init(this->rt.environment()); - - if (nullptr != this->loadFunction) { - this->rt.release(&this->loadFunction); - } - - Builder cb(&code); - cb.addDiagnosticOptions( - asmjit::DiagnosticOptions::kValidateAssembler | - asmjit::DiagnosticOptions::kValidateIntermediate ); - - auto pointer_reg = rax; - auto l1_addr = rbx; - auto l2_addr = rcx; - auto l3_addr = r8; - auto ram_addr = r9; - auto l2_count_reg = r10; - auto l3_count_reg = r11; - auto ram_count_reg = r12; - auto temp_reg = r13; - auto temp_reg2 = rbp; - auto offset_reg = r14; - auto addrHigh_reg = r15; - auto iter_reg = mm0; - auto shift_reg = std::vector({rdi, rsi, rdx}); - auto shift_reg32 = std::vector({edi, esi, edx}); - auto nr_shift_regs = 3; - auto mul_regs = 3; - auto add_regs = 24; - auto alt_dst_regs = 5; - auto ram_reg = zmm30; - - FuncDetail func; - func.init(FuncSignatureT( - CallConvId::kCDecl), - this->rt.environment()); - - FuncFrame frame; - frame.init(func); + const auto L2LoopCount = + environment::payload::PayloadSettings::getL2LoopCount(Sequence, Settings.linesPerThread(), L2Size); + const auto L3LoopCount = + 
environment::payload::PayloadSettings::getL3LoopCount(Sequence, Settings.linesPerThread(), L3Size); + const auto RamLoopCount = + environment::payload::PayloadSettings::getRAMLoopCount(Sequence, Settings.linesPerThread(), RamSize); + + asmjit::CodeHolder Code; + Code.init(asmjit::Environment::host()); + + asmjit::x86::Builder Cb(&Code); + Cb.addDiagnosticOptions(asmjit::DiagnosticOptions::kValidateAssembler | + asmjit::DiagnosticOptions::kValidateIntermediate); + + const auto PointerReg = asmjit::x86::rax; + const auto L1Addr = asmjit::x86::rbx; + const auto L2Addr = asmjit::x86::rcx; + const auto L3Addr = asmjit::x86::r8; + const auto RamAddr = asmjit::x86::r9; + const auto L2CountReg = asmjit::x86::r10; + const auto L3CountReg = asmjit::x86::r11; + const auto RamCountReg = asmjit::x86::r12; + const auto TempReg = asmjit::x86::r13; + const auto TempReg2 = asmjit::x86::rbp; + const auto OffsetReg = asmjit::x86::r14; + const auto AddrHighReg = asmjit::x86::r15; + const auto IterReg = asmjit::x86::mm0; + const auto ShiftReg = std::vector({asmjit::x86::rdi, asmjit::x86::rsi, asmjit::x86::rdx}); + const auto ShiftReg32 = std::vector({asmjit::x86::edi, asmjit::x86::esi, asmjit::x86::edx}); + const auto NrShiftRegs = 3; + const auto MulRegs = 3; + const auto AddRegs = 22; + const auto AltDstRegs = 5; + const auto RamReg = asmjit::x86::zmm30; + + asmjit::FuncDetail Func; + Func.init(asmjit::FuncSignature::build( + asmjit::CallConvId::kCDecl), + Code.environment()); + + asmjit::FuncFrame Frame; + Frame.init(Func); // make zmm registers dirty - for (int i = 0; i < 32; i++) { - frame.addDirtyRegs(Zmm(i)); + for (auto I = 0U; I < 32U; I++) { + Frame.addDirtyRegs(Zmm(I)); } - for (int i = 0; i < 8; i++) { - frame.addDirtyRegs(Mm(i)); + for (auto I = 0U; I < 8U; I++) { + Frame.addDirtyRegs(asmjit::x86::Mm(I)); } // make all other used registers dirty except RAX - frame.addDirtyRegs(l1_addr, l2_addr, l3_addr, ram_addr, l2_count_reg, - l3_count_reg, ram_count_reg, temp_reg, 
offset_reg, - addrHigh_reg, iter_reg, ram_addr); - for (const auto ® : shift_reg) { - frame.addDirtyRegs(reg); + Frame.addDirtyRegs(L1Addr, L2Addr, L3Addr, RamAddr, L2CountReg, L3CountReg, RamCountReg, TempReg, TempReg2, OffsetReg, + AddrHighReg, IterReg, RamAddr); + for (const auto& Reg : ShiftReg) { + Frame.addDirtyRegs(Reg); } - FuncArgsAssignment args(&func); + asmjit::FuncArgsAssignment Args(&Func); // FIXME: asmjit assigment to mm0 does not seem to be supported - args.assignAll(pointer_reg, addrHigh_reg, temp_reg); - args.updateFuncFrame(frame); - frame.finalize(); + Args.assignAll(PointerReg, AddrHighReg, TempReg); + Args.updateFuncFrame(Frame); + Frame.finalize(); - cb.emitProlog(frame); - cb.emitArgsAssignment(frame, args); + Cb.emitProlog(Frame); + Cb.emitArgsAssignment(Frame, Args); // FIXME: movq from temp_reg to iter_reg - cb.movq(iter_reg, temp_reg); + Cb.movq(IterReg, TempReg); // stop right away if low load is selected - auto FunctionExit = cb.newLabel(); + auto FunctionExit = Cb.newLabel(); - cb.mov(temp_reg, ptr_64(addrHigh_reg)); - cb.test(temp_reg, temp_reg); - cb.jz(FunctionExit); + Cb.mov(TempReg, ptr_64(AddrHighReg)); + Cb.test(TempReg, TempReg); + Cb.jz(FunctionExit); - cb.mov(offset_reg, + Cb.mov(OffsetReg, Imm(64)); // increment after each cache/memory access // Initialize registers for shift operations - for (auto const ® : shift_reg32) { - cb.mov(reg, Imm(0xAAAAAAAA)); + for (auto const& Reg : ShiftReg32) { + Cb.mov(Reg, Imm(0xAAAAAAAA)); } // Initialize AVX512-Registers for FMA Operations - cb.vmovapd(zmm0, zmmword_ptr(pointer_reg)); - cb.vmovapd(zmm1, zmmword_ptr(pointer_reg, 64)); - cb.vmovapd(zmm2, zmmword_ptr(pointer_reg, 128)); - auto add_start = mul_regs; - auto add_end = mul_regs + add_regs - 1; - auto trans_start = add_regs + mul_regs; - auto trans_end = add_regs + mul_regs + alt_dst_regs - 1; - for (int i = add_start; i <= trans_end; i++) { - cb.vmovapd(Zmm(i), zmmword_ptr(pointer_reg, 256 + i * 64)); - } - cb.mov(l1_addr, 
pointer_reg); // address for L1-buffer - cb.mov(l2_addr, pointer_reg); - cb.add(l2_addr, Imm(l1_size)); // address for L2-buffer - cb.mov(l3_addr, pointer_reg); - cb.add(l3_addr, Imm(l2_size)); // address for L3-buffer - cb.mov(ram_addr, pointer_reg); - cb.add(ram_addr, Imm(l3_size)); // address for RAM-buffer - cb.mov(l2_count_reg, Imm(l2_loop_count)); - workerLog::trace() << "reset counter for L2-buffer with " - << l2_loop_count - << " cache line accesses per loop (" - << l2_size/1024 - << ") KiB"; - cb.mov(l3_count_reg, Imm(l3_loop_count)); - workerLog::trace() << "reset counter for L3-buffer with " - << l3_loop_count - << " cache line accesses per loop (" - << l3_size/1024 - << ") KiB"; - cb.mov(ram_count_reg, Imm(ram_loop_count)); - workerLog::trace() << "reset counter for RAM-buffer with " - << ram_loop_count - << " cache line accesses per loop (" - << ram_size/1024 - << ") KiB"; - - cb.align(AlignMode::kCode, 64); - - auto Loop = cb.newLabel(); - cb.bind(Loop); - - auto shift_pos = 0; - bool left = false; - auto add_dest = add_start + 1; - auto mov_dst = trans_start; - auto mov_src = mov_dst + 1; - unsigned l1_offset = 0; - -#define L1_INCREMENT() \ - l1_offset += 64; \ - if (l1_offset < l1_size * 0.5) { \ - cb.add(l1_addr, offset_reg); \ - } else { \ - l1_offset = 0; \ - cb.mov(l1_addr, pointer_reg); \ + Cb.vmovapd(zmm0, zmmword_ptr(PointerReg, 0)); + Cb.vmovapd(zmm1, zmmword_ptr(PointerReg, 64)); + Cb.vmovapd(zmm2, zmmword_ptr(PointerReg, 128)); + auto AddStart = MulRegs; + auto AddEnd = MulRegs + AddRegs - 1; + auto TransStart = AddRegs + MulRegs; + auto TransEnd = AddRegs + MulRegs + AltDstRegs - 1; + for (auto I = AddStart; I <= TransEnd; I++) { + Cb.vmovapd(Zmm(I), zmmword_ptr(PointerReg, 256 + (I * 64))); } - -#define L2_INCREMENT() cb.add(l2_addr, offset_reg) - -#define L3_INCREMENT() cb.add(l3_addr, offset_reg) - -#define RAM_INCREMENT() cb.add(ram_addr, offset_reg) - - for (unsigned count = 0; count < repetitions; count++) { - for (const auto &item 
: sequence) { - if (item == "REG") { - cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2); - cb.vfmadd231pd(Zmm(mov_dst), zmm2, zmm1); - cb.xor_(shift_reg[(shift_pos + nr_shift_regs - 1) % nr_shift_regs], - temp_reg); - mov_dst++; - } else if (item == "L1_L") { - cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2); - cb.vfmadd231pd(Zmm(add_dest), zmm1, zmmword_ptr(l1_addr, 64)); - L1_INCREMENT(); - } else if (item == "L1_BROADCAST") { - cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2); - cb.vbroadcastsd(Zmm(add_dest), ptr_64(l1_addr, 64)); - L1_INCREMENT(); - } else if (item == "L1_S") { - cb.vmovapd(zmmword_ptr(l1_addr, 64), Zmm(add_dest)); - cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2); - L1_INCREMENT(); - } else if (item == "L1_LS") { - cb.vmovapd(zmmword_ptr(l1_addr, 64), Zmm(add_dest)); - cb.vfmadd231pd(Zmm(add_dest), zmm0, zmmword_ptr(l1_addr, 128)); - L1_INCREMENT(); - } else if (item == "L2_L") { - cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2); - cb.vfmadd231pd(Zmm(add_dest), zmm1, zmmword_ptr(l2_addr, 64)); - L2_INCREMENT(); - } else if (item == "L2_S") { - cb.vmovapd(zmmword_ptr(l2_addr, 64), Zmm(add_dest)); - cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2); - L2_INCREMENT(); - } else if (item == "L2_LS") { - cb.vmovapd(zmmword_ptr(l2_addr, 64), Zmm(add_dest)); - cb.vfmadd231pd(Zmm(add_dest), zmm0, zmmword_ptr(l2_addr, 128)); - L2_INCREMENT(); - } else if (item == "L3_L") { - cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2); - cb.vfmadd231pd(Zmm(add_dest), zmm1, zmmword_ptr(l3_addr, 64)); - L3_INCREMENT(); - } else if (item == "L3_S") { - cb.vmovapd(zmmword_ptr(l3_addr, 64), Zmm(add_dest)); - cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2); - L3_INCREMENT(); - } else if (item == "L3_LS") { - cb.vmovapd(zmmword_ptr(l3_addr, 64), Zmm(add_dest)); - cb.vfmadd231pd(Zmm(add_dest), zmm0, zmmword_ptr(l3_addr, 128)); - L3_INCREMENT(); - } else if (item == "L3_P") { - cb.vfmadd231pd(Zmm(add_dest), zmm0, zmmword_ptr(l1_addr, 64)); - cb.prefetcht2(ptr(l3_addr)); - L3_INCREMENT(); - } else if (item == "RAM_L") { - 
cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2); - cb.vfmadd231pd(ram_reg, zmm1, zmmword_ptr(ram_addr, 64)); - RAM_INCREMENT(); - } else if (item == "RAM_S") { - cb.vmovapd(zmmword_ptr(ram_addr, 64), Zmm(add_dest)); - cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2); - RAM_INCREMENT(); - } else if (item == "RAM_LS") { - cb.vmovapd(zmmword_ptr(ram_addr, 64), Zmm(add_dest)); - cb.vfmadd231pd(Zmm(add_dest), zmm0, zmmword_ptr(ram_addr, 128)); - RAM_INCREMENT(); - } else if (item == "RAM_P") { - cb.vfmadd231pd(Zmm(add_dest), zmm0, zmmword_ptr(l1_addr, 64)); - cb.prefetcht2(ptr(ram_addr)); - RAM_INCREMENT(); + Cb.mov(L1Addr, PointerReg); // address for L1-buffer + Cb.mov(L2Addr, PointerReg); + Cb.add(L2Addr, Imm(L1Size)); // address for L2-buffer + Cb.mov(L3Addr, PointerReg); + Cb.add(L3Addr, Imm(L2Size)); // address for L3-buffer + Cb.mov(RamAddr, PointerReg); + Cb.add(RamAddr, Imm(L3Size)); // address for RAM-buffer + Cb.mov(L2CountReg, Imm(L2LoopCount)); + workerLog::trace() << "reset counter for L2-buffer with " << L2LoopCount << " cache line accesses per loop (" + << L2Size / 1024 << ") KiB"; + Cb.mov(L3CountReg, Imm(L3LoopCount)); + workerLog::trace() << "reset counter for L3-buffer with " << L3LoopCount << " cache line accesses per loop (" + << L3Size / 1024 << ") KiB"; + Cb.mov(RamCountReg, Imm(RamLoopCount)); + workerLog::trace() << "reset counter for RAM-buffer with " << RamLoopCount << " cache line accesses per loop (" + << RamSize / 1024 << ") KiB"; + + Cb.align(asmjit::AlignMode::kCode, 64); + + auto Loop = Cb.newLabel(); + Cb.bind(Loop); + + auto ShiftPos = 0; + bool Left = false; + auto AddDest = AddStart + 1; + auto MovDst = TransStart; + unsigned L1Offset = 0; + + const auto L1Increment = [&Cb, &L1Offset, &L1Size, &L1Addr, &OffsetReg, &PointerReg]() { + L1Offset += 64; + if (L1Offset < L1Size * 0.5) { + Cb.add(L1Addr, OffsetReg); + } else { + L1Offset = 0; + Cb.mov(L1Addr, PointerReg); + } + }; + const auto L2Increment = [&Cb, &L2Addr, &OffsetReg]() { Cb.add(L2Addr, 
OffsetReg); }; + const auto L3Increment = [&Cb, &L3Addr, &OffsetReg]() { Cb.add(L3Addr, OffsetReg); }; + const auto RamIncrement = [&Cb, &RamAddr, &OffsetReg]() { Cb.add(RamAddr, OffsetReg); }; + + for (auto Count = 0U; Count < Repetitions; Count++) { + for (const auto& Item : Sequence) { + if (Item == "REG") { + Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmm2); + Cb.vfmadd231pd(Zmm(MovDst), zmm2, zmm1); + Cb.xor_(ShiftReg[(ShiftPos + NrShiftRegs - 1) % NrShiftRegs], TempReg); + MovDst++; + } else if (Item == "L1_L") { + Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmm2); + Cb.vfmadd231pd(Zmm(AddDest), zmm1, zmmword_ptr(L1Addr, 64)); + L1Increment(); + } else if (Item == "L1_BROADCAST") { + Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmm2); + Cb.vbroadcastsd(Zmm(AddDest), ptr_64(L1Addr, 64)); + L1Increment(); + } else if (Item == "L1_S") { + Cb.vmovapd(zmmword_ptr(L1Addr, 64), Zmm(AddDest)); + Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmm2); + L1Increment(); + } else if (Item == "L1_LS") { + Cb.vmovapd(zmmword_ptr(L1Addr, 64), Zmm(AddDest)); + Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmmword_ptr(L1Addr, 128)); + L1Increment(); + } else if (Item == "L2_L") { + Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmm2); + Cb.vfmadd231pd(Zmm(AddDest), zmm1, zmmword_ptr(L2Addr, 64)); + L2Increment(); + } else if (Item == "L2_S") { + Cb.vmovapd(zmmword_ptr(L2Addr, 64), Zmm(AddDest)); + Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmm2); + L2Increment(); + } else if (Item == "L2_LS") { + Cb.vmovapd(zmmword_ptr(L2Addr, 64), Zmm(AddDest)); + Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmmword_ptr(L2Addr, 128)); + L2Increment(); + } else if (Item == "L3_L") { + Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmm2); + Cb.vfmadd231pd(Zmm(AddDest), zmm1, zmmword_ptr(L3Addr, 64)); + L3Increment(); + } else if (Item == "L3_S") { + Cb.vmovapd(zmmword_ptr(L3Addr, 64), Zmm(AddDest)); + Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmm2); + L3Increment(); + } else if (Item == "L3_LS") { + Cb.vmovapd(zmmword_ptr(L3Addr, 64), Zmm(AddDest)); + Cb.vfmadd231pd(Zmm(AddDest), zmm0, 
zmmword_ptr(L3Addr, 128)); + L3Increment(); + } else if (Item == "L3_P") { + Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmmword_ptr(L1Addr, 64)); + Cb.prefetcht2(ptr(L3Addr)); + L3Increment(); + } else if (Item == "RAM_L") { + Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmm2); + Cb.vfmadd231pd(RamReg, zmm1, zmmword_ptr(RamAddr, 64)); + RamIncrement(); + } else if (Item == "RAM_S") { + Cb.vmovapd(zmmword_ptr(RamAddr, 64), Zmm(AddDest)); + Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmm2); + RamIncrement(); + } else if (Item == "RAM_LS") { + Cb.vmovapd(zmmword_ptr(RamAddr, 64), Zmm(AddDest)); + Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmmword_ptr(RamAddr, 128)); + RamIncrement(); + } else if (Item == "RAM_P") { + Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmmword_ptr(L1Addr, 64)); + Cb.prefetcht2(ptr(RamAddr)); + RamIncrement(); } else { - workerLog::error() << "Instruction group " << item << " not found in " - << this->name() << "."; - return EXIT_FAILURE; + workerLog::error() << "Instruction group " << Item << " not found in " << name() << "."; } - if (left) { - cb.shr(shift_reg32[shift_pos], Imm(1)); + if (Left) { + Cb.shr(ShiftReg32[ShiftPos], Imm(1)); } else { - cb.shl(shift_reg32[shift_pos], Imm(1)); + Cb.shl(ShiftReg32[ShiftPos], Imm(1)); } - add_dest++; - if (add_dest > add_end) { - add_dest = add_start; + AddDest++; + if (AddDest > AddEnd) { + AddDest = AddStart; } - if (mov_dst > trans_end) { - mov_dst = trans_start; + if (MovDst > TransEnd) { + MovDst = TransStart; } - mov_src++; - if (mov_src > trans_end) { - mov_src = trans_start; - } - shift_pos++; - if (shift_pos == nr_shift_regs) { - shift_pos = 0; - left = !left; + ShiftPos++; + if (ShiftPos == NrShiftRegs) { + ShiftPos = 0; + Left = !Left; } } } - cb.movq(temp_reg, iter_reg); // restore iteration counter - if (this->getRAMSequenceCount(sequence) > 0) { + Cb.movq(TempReg, IterReg); // restore iteration counter + if (environment::payload::PayloadSettings::getRAMSequenceCount(Sequence) > 0) { // reset RAM counter - auto NoRamReset = 
cb.newLabel(); - - cb.sub(ram_count_reg, Imm(1)); - cb.jnz(NoRamReset); - cb.mov(ram_count_reg, Imm(ram_loop_count)); - cb.mov(ram_addr, pointer_reg); - cb.add(ram_addr, Imm(l3_size)); - cb.bind(NoRamReset); + auto NoRamReset = Cb.newLabel(); + + Cb.sub(RamCountReg, Imm(1)); + Cb.jnz(NoRamReset); + Cb.mov(RamCountReg, Imm(RamLoopCount)); + Cb.mov(RamAddr, PointerReg); + Cb.add(RamAddr, Imm(L3Size)); + Cb.bind(NoRamReset); // adds always two instruction - this->_instructions += 2; + Stats.Instructions += 2; } - cb.inc(temp_reg); // increment iteration counter - if (this->getL2SequenceCount(sequence) > 0) { + Cb.inc(TempReg); // increment iteration counter + if (environment::payload::PayloadSettings::getL2SequenceCount(Sequence) > 0) { // reset L2-Cache counter - auto NoL2Reset = cb.newLabel(); - - cb.sub(l2_count_reg, Imm(1)); - cb.jnz(NoL2Reset); - cb.mov(l2_count_reg, Imm(l2_loop_count)); - cb.mov(l2_addr, pointer_reg); - cb.add(l2_addr, Imm(l1_size)); - cb.bind(NoL2Reset); + auto NoL2Reset = Cb.newLabel(); + + Cb.sub(L2CountReg, Imm(1)); + Cb.jnz(NoL2Reset); + Cb.mov(L2CountReg, Imm(L2LoopCount)); + Cb.mov(L2Addr, PointerReg); + Cb.add(L2Addr, Imm(L1Size)); + Cb.bind(NoL2Reset); // adds always two instruction - this->_instructions += 2; + Stats.Instructions += 2; } - cb.movq(iter_reg, temp_reg); // store iteration counter - if (this->getL3SequenceCount(sequence) > 0) { + Cb.movq(IterReg, TempReg); // store iteration counter + if (environment::payload::PayloadSettings::getL3SequenceCount(Sequence) > 0) { // reset L3-Cache counter - auto NoL3Reset = cb.newLabel(); - - cb.sub(l3_count_reg, Imm(1)); - cb.jnz(NoL3Reset); - cb.mov(l3_count_reg, Imm(l3_loop_count)); - cb.mov(l3_addr, pointer_reg); - cb.add(l3_addr, Imm(l2_size)); - cb.bind(NoL3Reset); + auto NoL3Reset = Cb.newLabel(); + + Cb.sub(L3CountReg, Imm(1)); + Cb.jnz(NoL3Reset); + Cb.mov(L3CountReg, Imm(L3LoopCount)); + Cb.mov(L3Addr, PointerReg); + Cb.add(L3Addr, Imm(L2Size)); + Cb.bind(NoL3Reset); // adds 
always two instruction - this->_instructions += 2; + Stats.Instructions += 2; } - cb.mov(l1_addr, pointer_reg); - - if (dumpRegisters) { - auto SkipRegistersDump = cb.newLabel(); + Cb.mov(L1Addr, PointerReg); - cb.test(ptr_64(pointer_reg, -8), Imm(firestarter::DumpVariable::Wait)); - cb.jnz(SkipRegistersDump); - - // dump all the ymm register - for (int i = 0; i < (int)this->registerCount(); i++) { - cb.vmovapd( - zmmword_ptr(pointer_reg, -64 - this->registerSize() * 8 * (i + 1)), - Zmm(i)); - } - - // set read flag - cb.mov(ptr_64(pointer_reg, -8), Imm(firestarter::DumpVariable::Wait)); - - cb.bind(SkipRegistersDump); + if (DumpRegisters) { + emitDumpRegisterCode(Cb, PointerReg, zmmword_ptr); } - if (errorDetection) { - this->emitErrorDetectionCode( - cb, iter_reg, addrHigh_reg, pointer_reg, temp_reg, temp_reg2); + if (ErrorDetection) { + emitErrorDetectionCode(Cb, IterReg, AddrHighReg, PointerReg, TempReg, TempReg2); } - cb.test(ptr_64(addrHigh_reg), Imm(LOAD_HIGH)); - cb.jnz(Loop); + Cb.test(ptr_64(AddrHighReg), Imm(LoadThreadWorkType::LoadHigh)); + Cb.jnz(Loop); - cb.bind(FunctionExit); + Cb.bind(FunctionExit); - cb.movq(rax, iter_reg); + Cb.movq(asmjit::x86::rax, IterReg); - cb.emitEpilog(frame); + Cb.emitEpilog(Frame); - cb.finalize(); + Cb.finalize(); - // String sb; - // cb.dump(sb); - - Error err = this->rt.add(&this->loadFunction, &code); - if (err) { - workerLog::error() << "Asmjit adding Assembler to JitRuntime failed in " - << __FILE__ << " at " << __LINE__; - return EXIT_FAILURE; - } + auto CompiledPayloadPtr = CompiledX86Payload::create(Stats, Code); // skip if we could not determine cache size - if (l1i_cache_size != 0) { - auto loopSize = code.labelOffset(FunctionExit) - code.labelOffset(Loop); - auto instructionCachePercentage = 100 * loopSize / l1i_cache_size; + if (L1iCacheSize) { + auto LoopSize = Code.labelOffset(FunctionExit) - Code.labelOffset(Loop); + auto InstructionCachePercentage = 100 * LoopSize / *L1iCacheSize; - if (loopSize > 
l1i_cache_size) { + if (LoopSize > *L1iCacheSize) { workerLog::warn() << "Work-loop is bigger than the L1i-Cache."; } - workerLog::trace() << "Using " << loopSize << " of " << l1i_cache_size - << " Bytes (" << instructionCachePercentage + workerLog::trace() << "Using " << LoopSize << " of " << *L1iCacheSize << " Bytes (" << InstructionCachePercentage << "%) from the L1i-Cache for the work-loop."; - workerLog::trace() << "Sequence size: " << sequence.size(); - workerLog::trace() << "Repetition count: " << repetitions; + workerLog::trace() << "Sequence size: " << Sequence.size(); + workerLog::trace() << "Repetition count: " << Repetitions; } - return EXIT_SUCCESS; + return CompiledPayloadPtr; } -std::list AVX512Payload::getAvailableInstructions() const { - std::list instructions; - - transform(this->instructionFlops.begin(), this->instructionFlops.end(), - back_inserter(instructions), - [](const auto &item) { return item.first; }); - - return instructions; +void AVX512Payload::init(double* MemoryAddr, uint64_t BufferSize) const { + X86Payload::initMemory(MemoryAddr, BufferSize, 0.27948995982e-4, 0.27948995982e-4); } -void AVX512Payload::init(unsigned long long *memoryAddr, - unsigned long long bufferSize) { - X86Payload::init(memoryAddr, bufferSize, 0.27948995982e-4, 0.27948995982e-4); -} +} // namespace firestarter::environment::x86::payload \ No newline at end of file diff --git a/src/firestarter/Environment/X86/Payload/AVXPayload.cpp b/src/firestarter/Environment/X86/Payload/AVXPayload.cpp index b6899025..b20a85f7 100644 --- a/src/firestarter/Environment/X86/Payload/AVXPayload.cpp +++ b/src/firestarter/Environment/X86/Payload/AVXPayload.cpp @@ -19,475 +19,403 @@ * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ -#include -#include - -#include -#include - -using namespace firestarter::environment::x86::payload; -using namespace asmjit; -using namespace asmjit::x86; - -int 
AVXPayload::compilePayload( - std::vector> const &proportion, - unsigned instructionCacheSize, - std::list const &dataCacheBufferSize, unsigned ramBufferSize, - unsigned thread, unsigned numberOfLines, bool dumpRegisters, - bool errorDetection) { +#include "firestarter/Environment/X86/Payload/AVXPayload.hpp" +#include "firestarter/Environment/X86/Payload/CompiledX86Payload.hpp" + +namespace firestarter::environment::x86::payload { + +auto AVXPayload::compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters, + bool ErrorDetection) const -> environment::payload::CompiledPayload::UniquePtr { + using Imm = asmjit::Imm; + using Mm = asmjit::x86::Mm; + using Xmm = asmjit::x86::Xmm; + using Ymm = asmjit::x86::Ymm; + // NOLINTNEXTLINE(readability-identifier-naming) + constexpr asmjit::x86::Mem (*xmmword_ptr)(const asmjit::x86::Gp&, int32_t) = asmjit::x86::xmmword_ptr; + // Compute the sequence of instruction groups and the number of its repetions // to reach the desired size - auto sequence = this->generateSequence(proportion); - auto repetitions = - this->getNumberOfSequenceRepetitions(sequence, numberOfLines / thread); + auto Sequence = Settings.sequence(); + auto Repetitions = + environment::payload::PayloadSettings::getNumberOfSequenceRepetitions(Sequence, Settings.linesPerThread()); // compute count of flops and memory access for performance report - unsigned flops = 0; - unsigned bytes = 0; + environment::payload::PayloadStats Stats; - for (const auto &item : sequence) { - auto it = this->instructionFlops.find(item); + for (const auto& Item : Sequence) { + auto It = instructionFlops().find(Item); - if (it == this->instructionFlops.end()) { - workerLog::error() << "Instruction group " << item << " undefined in " - << name() << "."; - return EXIT_FAILURE; + if (It == instructionFlops().end()) { + workerLog::error() << "Instruction group " << Item << " undefined in " << name() << "."; } - flops += it->second; + Stats.Flops += It->second; 
- it = this->instructionMemory.find(item); + It = instructionMemory().find(Item); - if (it != this->instructionMemory.end()) { - bytes += it->second; + if (It != instructionMemory().end()) { + Stats.Bytes += It->second; } } - this->_flops = repetitions * flops; - this->_bytes = repetitions * bytes; - this->_instructions = repetitions * sequence.size() * 2 + 4; + Stats.Flops *= Repetitions; + Stats.Bytes *= Repetitions; + Stats.Instructions = Repetitions * Sequence.size() * 2 + 4; // calculate the buffer sizes - auto l1i_cache_size = instructionCacheSize / thread; - auto dataCacheBufferSizeIterator = dataCacheBufferSize.begin(); - auto l1_size = *dataCacheBufferSizeIterator / thread; - std::advance(dataCacheBufferSizeIterator, 1); - auto l2_size = *dataCacheBufferSizeIterator / thread; - std::advance(dataCacheBufferSizeIterator, 1); - auto l3_size = *dataCacheBufferSizeIterator / thread; - auto ram_size = ramBufferSize / thread; + const auto L1iCacheSize = Settings.instructionCacheSizePerThread(); + const auto DataCacheBufferSizes = Settings.dataCacheBufferSizePerThread(); + auto DataCacheBufferSizeIterator = DataCacheBufferSizes.begin(); + const auto L1Size = *DataCacheBufferSizeIterator; + std::advance(DataCacheBufferSizeIterator, 1); + const auto L2Size = *DataCacheBufferSizeIterator; + std::advance(DataCacheBufferSizeIterator, 1); + const auto L3Size = *DataCacheBufferSizeIterator; + const auto RamSize = Settings.ramBufferSizePerThread(); // calculate the reset counters for the buffers - auto l2_loop_count = - getL2LoopCount(sequence, numberOfLines, l2_size * thread, thread); - auto l3_loop_count = - getL3LoopCount(sequence, numberOfLines, l3_size * thread, thread); - auto ram_loop_count = - getRAMLoopCount(sequence, numberOfLines, ram_size * thread, thread); - - CodeHolder code; - code.init(this->rt.environment()); - - if (nullptr != this->loadFunction) { - this->rt.release(&this->loadFunction); - } - - Builder cb(&code); - cb.addDiagnosticOptions( - 
asmjit::DiagnosticOptions::kValidateAssembler | - asmjit::DiagnosticOptions::kValidateIntermediate ); - - auto pointer_reg = rax; - auto l1_addr = rbx; - auto l2_addr = rcx; - auto l3_addr = rdx; - auto ram_addr = rdi; - auto l2_count_reg = r8; - auto l3_count_reg = r9; - auto ram_count_reg = r10; - auto temp_reg = r11; - auto temp_reg2 = rbp; - auto offset_reg = r12; - auto addrHigh_reg = r13; - auto iter_reg = r14; - auto shift_regs = 6; - auto add_regs = 10; - auto trans_regs = 6; - - FuncDetail func; - func.init(FuncSignatureT( - CallConvId::kCDecl), - this->rt.environment()); - - FuncFrame frame; - frame.init(func); + const auto L2LoopCount = + environment::payload::PayloadSettings::getL2LoopCount(Sequence, Settings.linesPerThread(), L2Size); + const auto L3LoopCount = + environment::payload::PayloadSettings::getL3LoopCount(Sequence, Settings.linesPerThread(), L3Size); + const auto RamLoopCount = + environment::payload::PayloadSettings::getRAMLoopCount(Sequence, Settings.linesPerThread(), RamSize); + + asmjit::CodeHolder Code; + Code.init(asmjit::Environment::host()); + + asmjit::x86::Builder Cb(&Code); + Cb.addDiagnosticOptions(asmjit::DiagnosticOptions::kValidateAssembler | + asmjit::DiagnosticOptions::kValidateIntermediate); + + const auto PointerReg = asmjit::x86::rax; + const auto L1Addr = asmjit::x86::rbx; + const auto L2Addr = asmjit::x86::rcx; + const auto L3Addr = asmjit::x86::rdx; + const auto RamAddr = asmjit::x86::rdi; + const auto L2CountReg = asmjit::x86::r8; + const auto L3CountReg = asmjit::x86::r9; + const auto RamCountReg = asmjit::x86::r10; + const auto TempReg = asmjit::x86::r11; + const auto TempReg2 = asmjit::x86::rbp; + const auto OffsetReg = asmjit::x86::r12; + const auto AddrHighReg = asmjit::x86::r13; + const auto IterReg = asmjit::x86::r14; + const auto ShiftRegs = 6; + const auto AddRegs = 10; + const auto TransRegs = 6; + + asmjit::FuncDetail Func; + Func.init(asmjit::FuncSignature::build( + asmjit::CallConvId::kCDecl), + 
Code.environment()); + + asmjit::FuncFrame Frame; + Frame.init(Func); // make xmm registers dirty - for (int i = 0; i < 16; i++) { - frame.addDirtyRegs(Ymm(i)); + for (auto I = 0U; I < 16U; I++) { + Frame.addDirtyRegs(Ymm(I)); } // make mmx registers dirty - for (int i = 0; i < 8; i++) { - frame.addDirtyRegs(Mm(i)); + for (auto I = 0U; I < 8U; I++) { + Frame.addDirtyRegs(Mm(I)); } // make all other used registers dirty except RAX - frame.addDirtyRegs(l1_addr, l2_addr, l3_addr, ram_addr, l2_count_reg, - l3_count_reg, ram_count_reg, temp_reg, temp_reg2, - offset_reg, addrHigh_reg, iter_reg); + Frame.addDirtyRegs(L1Addr, L2Addr, L3Addr, RamAddr, L2CountReg, L3CountReg, RamCountReg, TempReg, TempReg2, OffsetReg, + AddrHighReg, IterReg); - FuncArgsAssignment args(&func); - args.assignAll(pointer_reg, addrHigh_reg, iter_reg); - args.updateFuncFrame(frame); - frame.finalize(); + asmjit::FuncArgsAssignment Args(&Func); + Args.assignAll(PointerReg, AddrHighReg, IterReg); + Args.updateFuncFrame(Frame); + Frame.finalize(); - cb.emitProlog(frame); - cb.emitArgsAssignment(frame, args); + Cb.emitProlog(Frame); + Cb.emitArgsAssignment(Frame, Args); // stop right away if low load is selected - auto FunctionExit = cb.newLabel(); + auto FunctionExit = Cb.newLabel(); - cb.mov(temp_reg, ptr_64(addrHigh_reg)); - cb.test(temp_reg, temp_reg); - cb.jz(FunctionExit); + Cb.mov(TempReg, ptr_64(AddrHighReg)); + Cb.test(TempReg, TempReg); + Cb.jz(FunctionExit); - cb.mov(offset_reg, + Cb.mov(OffsetReg, Imm(64)); // increment after each cache/memory access // Initialize AVX-Registers for Addition - auto add_start = 0; - auto add_end = add_regs - 1; - auto trans_start = add_regs; - auto trans_end = add_regs + trans_regs - 1; - if (add_regs > 0) { - for (int i = add_start; i <= add_end; i++) { - cb.vmovapd(Ymm(i), ymmword_ptr(pointer_reg, 32 * i)); + auto AddStart = 0; + auto AddEnd = AddRegs - 1; + auto TransStart = AddRegs; + auto TransEnd = AddRegs + TransRegs - 1; + if (AddRegs > 0) { + for 
(auto I = AddStart; I <= AddEnd; I++) { + Cb.vmovapd(Ymm(I), ymmword_ptr(PointerReg, 32 * I)); } } // Initialize MMX-Registers for shift operations - auto shift_start = 0; - auto shift_end = shift_regs - 1; - if (shift_regs > 1) { - cb.mov(temp_reg, Imm(0x5555555555555555)); - cb.movq(Mm(shift_start), temp_reg); - for (int i = shift_start + 1; i <= shift_end; i++) { - cb.movq(Mm(i), Mm(shift_start)); + auto ShiftStart = 0; + auto ShiftEnd = ShiftRegs - 1; + if (ShiftRegs > 1) { + Cb.mov(TempReg, Imm(0x5555555555555555)); + Cb.movq(Mm(ShiftStart), TempReg); + for (auto I = ShiftStart + 1; I <= ShiftEnd; I++) { + Cb.movq(Mm(I), Mm(ShiftStart)); } } // Initialize AVX-Registers for Transfer-Operations - if (trans_regs > 0) { - if (trans_start % 2 == 0) { - cb.mov(temp_reg, Imm(0x0F0F0F0F0F0F0F0F)); + if (TransRegs > 0) { + if (TransStart % 2 == 0) { + Cb.mov(TempReg, Imm(0x0F0F0F0F0F0F0F0F)); } else { - cb.mov(temp_reg, Imm(0xF0F0F0F0F0F0F0F0)); + Cb.mov(TempReg, Imm(0xF0F0F0F0F0F0F0F0)); } - cb.pinsrq(Xmm(trans_start), temp_reg, Imm(0)); - cb.pinsrq(Xmm(trans_start), temp_reg, Imm(1)); - cb.vinsertf128(Ymm(trans_start), Ymm(trans_start), Xmm(trans_start), - Imm(1)); - for (int i = trans_start + 1; i <= trans_end; i++) { - if (i % 2 == 0) { - cb.shr(temp_reg, Imm(4)); + Cb.pinsrq(Xmm(TransStart), TempReg, Imm(0)); + Cb.pinsrq(Xmm(TransStart), TempReg, Imm(1)); + Cb.vinsertf128(Ymm(TransStart), Ymm(TransStart), Xmm(TransStart), Imm(1)); + for (auto I = TransStart + 1; I <= TransEnd; I++) { + if (I % 2 == 0) { + Cb.shr(TempReg, Imm(4)); } else { - cb.shl(temp_reg, Imm(4)); + Cb.shl(TempReg, Imm(4)); } - cb.pinsrq(Xmm(i), temp_reg, Imm(0)); - cb.pinsrq(Xmm(i), temp_reg, Imm(1)); - cb.vinsertf128(Ymm(i), Ymm(i), Xmm(i), Imm(1)); + Cb.pinsrq(Xmm(I), TempReg, Imm(0)); + Cb.pinsrq(Xmm(I), TempReg, Imm(1)); + Cb.vinsertf128(Ymm(I), Ymm(I), Xmm(I), Imm(1)); } } - cb.mov(l1_addr, pointer_reg); // address for L1-buffer - cb.mov(l2_addr, pointer_reg); - cb.add(l2_addr, 
Imm(l1_size)); // address for L2-buffer - cb.mov(l3_addr, pointer_reg); - cb.add(l3_addr, Imm(l2_size)); // address for L3-buffer - cb.mov(ram_addr, pointer_reg); - cb.add(ram_addr, Imm(l3_size)); // address for RAM-buffer - cb.mov(l2_count_reg, Imm(l2_loop_count)); - workerLog::trace() << "reset counter for L2-buffer with " - << l2_loop_count - << " cache line accesses per loop (" - << l2_size/1024 - << ") KiB"; - cb.mov(l3_count_reg, Imm(l3_loop_count)); - workerLog::trace() << "reset counter for L3-buffer with " - << l3_loop_count - << " cache line accesses per loop (" - << l3_size/1024 - << ") KiB"; - cb.mov(ram_count_reg, Imm(ram_loop_count)); - workerLog::trace() << "reset counter for RAM-buffer with " - << ram_loop_count - << " cache line accesses per loop (" - << ram_size/1024 - << ") KiB"; - - cb.align(AlignMode::kCode, 64); - - auto Loop = cb.newLabel(); - cb.bind(Loop); - - auto left = false; - auto shift_dst = shift_start; - auto add_dest = add_start + 1; - auto mov_dst = trans_start; - auto mov_src = mov_dst + 1; - unsigned l1_offset = 0; - -#define L1_INCREMENT() \ - l1_offset += 64; \ - if (l1_offset < l1_size * 0.5) { \ - cb.add(l1_addr, offset_reg); \ - } else { \ - l1_offset = 0; \ - cb.mov(l1_addr, pointer_reg); \ - } - -#define L2_INCREMENT() cb.add(l2_addr, offset_reg); - -#define L3_INCREMENT() cb.add(l3_addr, offset_reg) - -#define RAM_INCREMENT() cb.add(ram_addr, offset_reg) - - for (unsigned count = 0; count < repetitions; count++) { - for (const auto &item : sequence) { - if (item == "REG") { - cb.vaddpd( - Ymm(add_dest), Ymm(add_dest), - Ymm(add_start + (add_dest - add_start + add_regs + 1) % add_regs)); - cb.vmovdqa(Ymm(mov_dst), Ymm(mov_src)); - } else if (item == "L1_L") { - cb.vaddpd(Ymm(add_dest), Ymm(add_dest), ymmword_ptr(l1_addr, 32)); - L1_INCREMENT(); - } else if (item == "L1_S") { - cb.vaddpd( - Ymm(add_dest), Ymm(add_dest), - Ymm(add_start + (add_dest - add_start + add_regs - 1) % add_regs)); - cb.vmovapd(xmmword_ptr(l1_addr, 
32), Xmm(add_dest)); - L1_INCREMENT(); - this->_instructions++; - } else if (item == "L1_LS") { - cb.vaddpd(Ymm(add_dest), Ymm(add_dest), ymmword_ptr(l1_addr, 32)); - cb.vmovapd(xmmword_ptr(l1_addr, 64), Xmm(add_dest)); - L1_INCREMENT(); - this->_instructions++; - } else if (item == "L2_L") { - cb.vaddpd(Ymm(add_dest), Ymm(add_dest), ymmword_ptr(l2_addr, 64)); - L2_INCREMENT(); - } else if (item == "L2_S") { - cb.vaddpd( - Ymm(add_dest), Ymm(add_dest), - Ymm(add_start + (add_dest - add_start + add_regs - 1) % add_regs)); - cb.vmovapd(xmmword_ptr(l2_addr, 64), Xmm(add_dest)); - L2_INCREMENT(); - this->_instructions++; - } else if (item == "L2_LS") { - cb.vaddpd(Ymm(add_dest), Ymm(add_dest), ymmword_ptr(l2_addr, 64)); - cb.vmovapd(xmmword_ptr(l2_addr, 96), Xmm(add_dest)); - L2_INCREMENT(); - this->_instructions++; - } else if (item == "L3_L") { - cb.vaddpd(Ymm(add_dest), Ymm(add_dest), ymmword_ptr(l3_addr, 64)); - L3_INCREMENT(); - } else if (item == "L3_S") { - cb.vaddpd( - Ymm(add_dest), Ymm(add_dest), - Ymm(add_start + (add_dest - add_start + add_regs - 1) % add_regs)); - cb.vmovapd(xmmword_ptr(l3_addr, 96), Xmm(add_dest)); - L3_INCREMENT(); - this->_instructions++; - } else if (item == "L3_LS") { - cb.vaddpd(Ymm(add_dest), Ymm(add_dest), ymmword_ptr(l3_addr, 64)); - cb.vmovapd(xmmword_ptr(l3_addr, 96), Xmm(add_dest)); - L3_INCREMENT(); - this->_instructions++; - } else if (item == "L3_P") { - cb.vaddpd(Ymm(add_dest), Ymm(add_dest), ymmword_ptr(l1_addr, 32)); - cb.prefetcht0(ptr(l3_addr)); - L3_INCREMENT(); - this->_instructions++; - } else if (item == "RAM_L") { - cb.vaddpd(Ymm(add_dest), Ymm(add_dest), ymmword_ptr(ram_addr, 64)); - RAM_INCREMENT(); - } else if (item == "RAM_S") { - cb.vaddpd( - Ymm(add_dest), Ymm(add_dest), - Ymm(add_start + (add_dest - add_start + add_regs - 1) % add_regs)); - cb.vmovapd(xmmword_ptr(ram_addr, 64), Xmm(add_dest)); - RAM_INCREMENT(); - this->_instructions++; - } else if (item == "RAM_LS") { - cb.vaddpd(Ymm(add_dest), 
Ymm(add_dest), ymmword_ptr(l3_addr, 64)); - cb.vmovapd(xmmword_ptr(ram_addr, 64), Xmm(add_dest)); - RAM_INCREMENT(); - this->_instructions++; - } else if (item == "RAM_P") { - cb.vaddpd(Ymm(add_dest), Ymm(add_dest), ymmword_ptr(l1_addr, 32)); - cb.prefetcht2(ptr(ram_addr)); - RAM_INCREMENT(); - this->_instructions++; + Cb.mov(L1Addr, PointerReg); // address for L1-buffer + Cb.mov(L2Addr, PointerReg); + Cb.add(L2Addr, Imm(L1Size)); // address for L2-buffer + Cb.mov(L3Addr, PointerReg); + Cb.add(L3Addr, Imm(L2Size)); // address for L3-buffer + Cb.mov(RamAddr, PointerReg); + Cb.add(RamAddr, Imm(L3Size)); // address for RAM-buffer + Cb.mov(L2CountReg, Imm(L2LoopCount)); + workerLog::trace() << "reset counter for L2-buffer with " << L2LoopCount << " cache line accesses per loop (" + << L2Size / 1024 << ") KiB"; + Cb.mov(L3CountReg, Imm(L3LoopCount)); + workerLog::trace() << "reset counter for L3-buffer with " << L3LoopCount << " cache line accesses per loop (" + << L3Size / 1024 << ") KiB"; + Cb.mov(RamCountReg, Imm(RamLoopCount)); + workerLog::trace() << "reset counter for RAM-buffer with " << RamLoopCount << " cache line accesses per loop (" + << RamSize / 1024 << ") KiB"; + + Cb.align(asmjit::AlignMode::kCode, 64); + + auto Loop = Cb.newLabel(); + Cb.bind(Loop); + + auto Left = false; + auto ShiftDest = ShiftStart; + auto AddDest = AddStart + 1; + auto MovDest = TransStart; + auto MovSrc = MovDest + 1; + unsigned L1Offset = 0; + + const auto L1Increment = [&Cb, &L1Offset, &L1Size, &L1Addr, &OffsetReg, &PointerReg]() { + L1Offset += 64; + if (L1Offset < L1Size * 0.5) { + Cb.add(L1Addr, OffsetReg); + } else { + L1Offset = 0; + Cb.mov(L1Addr, PointerReg); + } + }; + const auto L2Increment = [&Cb, &L2Addr, &OffsetReg]() { Cb.add(L2Addr, OffsetReg); }; + const auto L3Increment = [&Cb, &L3Addr, &OffsetReg]() { Cb.add(L3Addr, OffsetReg); }; + const auto RamIncrement = [&Cb, &RamAddr, &OffsetReg]() { Cb.add(RamAddr, OffsetReg); }; + + for (auto Count = 0U; Count < 
Repetitions; Count++) { + for (const auto& Item : Sequence) { + if (Item == "REG") { + Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), Ymm(AddStart + ((AddDest - AddStart + AddRegs + 1) % AddRegs))); + Cb.vmovdqa(Ymm(MovDest), Ymm(MovSrc)); + } else if (Item == "L1_L") { + Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), ymmword_ptr(L1Addr, 32)); + L1Increment(); + } else if (Item == "L1_S") { + Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), Ymm(AddStart + ((AddDest - AddStart + AddRegs - 1) % AddRegs))); + Cb.vmovapd(xmmword_ptr(L1Addr, 32), Xmm(AddDest)); + L1Increment(); + Stats.Instructions++; + } else if (Item == "L1_LS") { + Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), ymmword_ptr(L1Addr, 32)); + Cb.vmovapd(xmmword_ptr(L1Addr, 64), Xmm(AddDest)); + L1Increment(); + Stats.Instructions++; + } else if (Item == "L2_L") { + Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), ymmword_ptr(L2Addr, 64)); + L2Increment(); + } else if (Item == "L2_S") { + Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), Ymm(AddStart + ((AddDest - AddStart + AddRegs - 1) % AddRegs))); + Cb.vmovapd(xmmword_ptr(L2Addr, 64), Xmm(AddDest)); + L2Increment(); + Stats.Instructions++; + } else if (Item == "L2_LS") { + Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), ymmword_ptr(L2Addr, 64)); + Cb.vmovapd(xmmword_ptr(L2Addr, 96), Xmm(AddDest)); + L2Increment(); + Stats.Instructions++; + } else if (Item == "L3_L") { + Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), ymmword_ptr(L3Addr, 64)); + L3Increment(); + } else if (Item == "L3_S") { + Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), Ymm(AddStart + ((AddDest - AddStart + AddRegs - 1) % AddRegs))); + Cb.vmovapd(xmmword_ptr(L3Addr, 96), Xmm(AddDest)); + L3Increment(); + Stats.Instructions++; + } else if (Item == "L3_LS") { + Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), ymmword_ptr(L3Addr, 64)); + Cb.vmovapd(xmmword_ptr(L3Addr, 96), Xmm(AddDest)); + L3Increment(); + Stats.Instructions++; + } else if (Item == "L3_P") { + Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), ymmword_ptr(L1Addr, 32)); + Cb.prefetcht0(ptr(L3Addr)); + L3Increment(); + 
Stats.Instructions++; + } else if (Item == "RAM_L") { + Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), ymmword_ptr(RamAddr, 64)); + RamIncrement(); + } else if (Item == "RAM_S") { + Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), Ymm(AddStart + ((AddDest - AddStart + AddRegs - 1) % AddRegs))); + Cb.vmovapd(xmmword_ptr(RamAddr, 64), Xmm(AddDest)); + RamIncrement(); + Stats.Instructions++; + } else if (Item == "RAM_LS") { + Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), ymmword_ptr(L3Addr, 64)); + Cb.vmovapd(xmmword_ptr(RamAddr, 64), Xmm(AddDest)); + RamIncrement(); + Stats.Instructions++; + } else if (Item == "RAM_P") { + Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), ymmword_ptr(L1Addr, 32)); + Cb.prefetcht2(ptr(RamAddr)); + RamIncrement(); + Stats.Instructions++; } else { - workerLog::error() << "Instruction group " << item << " not found in " - << this->name() << "."; - return EXIT_FAILURE; + workerLog::error() << "Instruction group " << Item << " not found in " << name() << "."; } - if (shift_regs > 1) { - this->_instructions++; - if (left) { - cb.psrlw(Mm(shift_start + (shift_dst - shift_start + 3) % shift_regs), - Mm(shift_dst)); + if (ShiftRegs > 1) { + Stats.Instructions++; + if (Left) { + Cb.psrlw(Mm(ShiftStart + ((ShiftDest - ShiftStart + 3) % ShiftRegs)), Mm(ShiftDest)); } else { - cb.psllw(Mm(shift_start + (shift_dst - shift_start + 3) % shift_regs), - Mm(shift_dst)); + Cb.psllw(Mm(ShiftStart + ((ShiftDest - ShiftStart + 3) % ShiftRegs)), Mm(ShiftDest)); } } - add_dest++; - if (add_dest > add_end) { + AddDest++; + if (AddDest > AddEnd) { // DO NOT REMOVE the + 1. It serves for the good of ymm0. If it was to // be overriden, the values in the other registers would rise up to inf. 
- add_dest = add_start + 1; + AddDest = AddStart + 1; } - mov_dst++; - if (mov_dst > trans_end) { - mov_dst = trans_start; + MovDest++; + if (MovDest > TransEnd) { + MovDest = TransStart; } - mov_src++; - if (mov_src > trans_end) { - mov_src = trans_start; + MovSrc++; + if (MovSrc > TransEnd) { + MovSrc = TransStart; } - if (shift_regs > 1) { - shift_dst++; - if (shift_dst > shift_end) { - shift_dst = shift_start; - left = !left; + if (ShiftRegs > 1) { + ShiftDest++; + if (ShiftDest > ShiftEnd) { + ShiftDest = ShiftStart; + Left = !Left; } } } } - if (this->getRAMSequenceCount(sequence) > 0) { + if (environment::payload::PayloadSettings::getRAMSequenceCount(Sequence) > 0) { // reset RAM counter - auto NoRamReset = cb.newLabel(); - - cb.sub(ram_count_reg, Imm(1)); - cb.jnz(NoRamReset); - cb.mov(ram_count_reg, Imm(ram_loop_count)); - cb.mov(ram_addr, pointer_reg); - cb.add(ram_addr, Imm(l3_size)); - cb.bind(NoRamReset); + auto NoRamReset = Cb.newLabel(); + + Cb.sub(RamCountReg, Imm(1)); + Cb.jnz(NoRamReset); + Cb.mov(RamCountReg, Imm(RamLoopCount)); + Cb.mov(RamAddr, PointerReg); + Cb.add(RamAddr, Imm(L3Size)); + Cb.bind(NoRamReset); // adds always two instruction - this->_instructions += 2; + Stats.Instructions += 2; } - if (this->getL2SequenceCount(sequence) > 0) { + if (environment::payload::PayloadSettings::getL2SequenceCount(Sequence) > 0) { // reset L2-Cache counter - auto NoL2Reset = cb.newLabel(); - - cb.sub(l2_count_reg, Imm(1)); - cb.jnz(NoL2Reset); - cb.mov(l2_count_reg, Imm(l2_loop_count)); - cb.mov(l2_addr, pointer_reg); - cb.add(l2_addr, Imm(l1_size)); - cb.bind(NoL2Reset); + auto NoL2Reset = Cb.newLabel(); + + Cb.sub(L2CountReg, Imm(1)); + Cb.jnz(NoL2Reset); + Cb.mov(L2CountReg, Imm(L2LoopCount)); + Cb.mov(L2Addr, PointerReg); + Cb.add(L2Addr, Imm(L1Size)); + Cb.bind(NoL2Reset); // adds always two instruction - this->_instructions += 2; + Stats.Instructions += 2; } - if (this->getL3SequenceCount(sequence) > 0) { + if 
(environment::payload::PayloadSettings::getL3SequenceCount(Sequence) > 0) { // reset L3-Cache counter - auto NoL3Reset = cb.newLabel(); - - cb.sub(l3_count_reg, Imm(1)); - cb.jnz(NoL3Reset); - cb.mov(l3_count_reg, Imm(l3_loop_count)); - cb.mov(l3_addr, pointer_reg); - cb.add(l3_addr, Imm(l2_size)); - cb.bind(NoL3Reset); + auto NoL3Reset = Cb.newLabel(); + + Cb.sub(L3CountReg, Imm(1)); + Cb.jnz(NoL3Reset); + Cb.mov(L3CountReg, Imm(L3LoopCount)); + Cb.mov(L3Addr, PointerReg); + Cb.add(L3Addr, Imm(L2Size)); + Cb.bind(NoL3Reset); // adds always two instruction - this->_instructions += 2; + Stats.Instructions += 2; } - cb.inc(iter_reg); // increment iteration counter - cb.mov(l1_addr, pointer_reg); - - if (dumpRegisters) { - auto SkipRegistersDump = cb.newLabel(); + Cb.inc(IterReg); // increment iteration counter + Cb.mov(L1Addr, PointerReg); - cb.test(ptr_64(pointer_reg, -8), Imm(firestarter::DumpVariable::Wait)); - cb.jnz(SkipRegistersDump); - - // dump all the ymm register - for (int i = 0; i < (int)this->registerCount(); i++) { - cb.vmovapd( - ymmword_ptr(pointer_reg, -64 - this->registerSize() * 8 * (i + 1)), - Ymm(i)); - } - - // set read flag - cb.mov(ptr_64(pointer_reg, -8), Imm(firestarter::DumpVariable::Wait)); - - cb.bind(SkipRegistersDump); + if (DumpRegisters) { + emitDumpRegisterCode(Cb, PointerReg, asmjit::x86::ymmword_ptr); } - if (errorDetection) { - this->emitErrorDetectionCode( - cb, iter_reg, addrHigh_reg, pointer_reg, temp_reg, temp_reg2); + if (ErrorDetection) { + emitErrorDetectionCode(Cb, IterReg, AddrHighReg, PointerReg, TempReg, TempReg2); } - cb.test(ptr_64(addrHigh_reg), Imm(LOAD_HIGH)); - cb.jnz(Loop); + Cb.test(ptr_64(AddrHighReg), Imm(LoadThreadWorkType::LoadHigh)); + Cb.jnz(Loop); - cb.bind(FunctionExit); + Cb.bind(FunctionExit); - cb.mov(rax, iter_reg); // restore iteration counter + Cb.mov(asmjit::x86::rax, IterReg); // restore iteration counter - cb.emitEpilog(frame); + Cb.emitEpilog(Frame); - cb.finalize(); + Cb.finalize(); - // 
String sb; - // cb.dump(sb); - - Error err = this->rt.add(&this->loadFunction, &code); - if (err) { - workerLog::error() << "Asmjit adding Assembler to JitRuntime failed in " - << __FILE__ << " at " << __LINE__; - return EXIT_FAILURE; - } + auto CompiledPayloadPtr = CompiledX86Payload::create(Stats, Code); // skip if we could not determine cache size - if (l1i_cache_size != 0) { - auto loopSize = code.labelOffset(FunctionExit) - code.labelOffset(Loop); - auto instructionCachePercentage = 100 * loopSize / l1i_cache_size; + if (L1iCacheSize) { + auto LoopSize = Code.labelOffset(FunctionExit) - Code.labelOffset(Loop); + auto InstructionCachePercentage = 100 * LoopSize / *L1iCacheSize; - if (loopSize > l1i_cache_size) { + if (LoopSize > *L1iCacheSize) { workerLog::warn() << "Work-loop is bigger than the L1i-Cache."; } - workerLog::trace() << "Using " << loopSize << " of " << l1i_cache_size - << " Bytes (" << instructionCachePercentage + workerLog::trace() << "Using " << LoopSize << " of " << *L1iCacheSize << " Bytes (" << InstructionCachePercentage << "%) from the L1i-Cache for the work-loop."; - workerLog::trace() << "Sequence size: " << sequence.size(); - workerLog::trace() << "Repetition count: " << repetitions; + workerLog::trace() << "Sequence size: " << Sequence.size(); + workerLog::trace() << "Repetition count: " << Repetitions; } - return EXIT_SUCCESS; + return CompiledPayloadPtr; } -std::list AVXPayload::getAvailableInstructions() const { - std::list instructions; - - transform(this->instructionFlops.begin(), this->instructionFlops.end(), - back_inserter(instructions), - [](const auto &item) { return item.first; }); - - return instructions; +void AVXPayload::init(double* MemoryAddr, uint64_t BufferSize) const { + X86Payload::initMemory(MemoryAddr, BufferSize, 1.654738925401e-10, 1.654738925401e-15); } -void AVXPayload::init(unsigned long long *memoryAddr, - unsigned long long bufferSize) { - X86Payload::init(memoryAddr, bufferSize, 1.654738925401e-10, - 
1.654738925401e-15); -} +} // namespace firestarter::environment::x86::payload \ No newline at end of file diff --git a/src/firestarter/Environment/X86/Payload/FMA4Payload.cpp b/src/firestarter/Environment/X86/Payload/FMA4Payload.cpp index 32e81752..202d34c7 100644 --- a/src/firestarter/Environment/X86/Payload/FMA4Payload.cpp +++ b/src/firestarter/Environment/X86/Payload/FMA4Payload.cpp @@ -19,459 +19,376 @@ * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ -#include -#include - -#include -#include - -using namespace firestarter::environment::x86::payload; -using namespace asmjit; -using namespace asmjit::x86; - -int FMA4Payload::compilePayload( - std::vector> const &proportion, - unsigned instructionCacheSize, - std::list const &dataCacheBufferSize, unsigned ramBufferSize, - unsigned thread, unsigned numberOfLines, bool dumpRegisters, - bool errorDetection) { +#include "firestarter/Environment/X86/Payload/FMA4Payload.hpp" +#include "firestarter/Environment/X86/Payload/CompiledX86Payload.hpp" + +namespace firestarter::environment::x86::payload { + +auto FMA4Payload::compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters, + bool ErrorDetection) const -> environment::payload::CompiledPayload::UniquePtr { + using Imm = asmjit::Imm; + using Xmm = asmjit::x86::Xmm; + // NOLINTBEGIN(readability-identifier-naming) + constexpr asmjit::x86::Mem (*xmmword_ptr)(const asmjit::x86::Gp&, int32_t) = asmjit::x86::xmmword_ptr; + constexpr auto xmm0 = asmjit::x86::xmm0; + constexpr auto xmm1 = asmjit::x86::xmm1; + // NOLINTEND(readability-identifier-naming) + // Compute the sequence of instruction groups and the number of its repetions // to reach the desired size - auto sequence = this->generateSequence(proportion); - auto repetitions = - this->getNumberOfSequenceRepetitions(sequence, numberOfLines / thread); + auto Sequence = Settings.sequence(); + auto Repetitions = + 
environment::payload::PayloadSettings::getNumberOfSequenceRepetitions(Sequence, Settings.linesPerThread()); // compute count of flops and memory access for performance report - unsigned flops = 0; - unsigned bytes = 0; + environment::payload::PayloadStats Stats; - for (const auto &item : sequence) { - auto it = this->instructionFlops.find(item); + for (const auto& Item : Sequence) { + auto It = instructionFlops().find(Item); - if (it == this->instructionFlops.end()) { - workerLog::error() << "Instruction group " << item << " undefined in " - << name() << "."; - return EXIT_FAILURE; + if (It == instructionFlops().end()) { + workerLog::error() << "Instruction group " << Item << " undefined in " << name() << "."; } - flops += it->second; + Stats.Flops += It->second; - it = this->instructionMemory.find(item); + It = instructionMemory().find(Item); - if (it != this->instructionMemory.end()) { - bytes += it->second; + if (It != instructionMemory().end()) { + Stats.Bytes += It->second; } } - this->_flops = repetitions * flops; - this->_bytes = repetitions * bytes; - this->_instructions = repetitions * sequence.size() * 4 + 6; + Stats.Flops *= Repetitions; + Stats.Bytes *= Repetitions; + Stats.Instructions = Repetitions * Sequence.size() * 4 + 6; // calculate the buffer sizes - auto l1i_cache_size = instructionCacheSize / thread; - auto dataCacheBufferSizeIterator = dataCacheBufferSize.begin(); - auto l1_size = *dataCacheBufferSizeIterator / thread; - std::advance(dataCacheBufferSizeIterator, 1); - auto l2_size = *dataCacheBufferSizeIterator / thread; - std::advance(dataCacheBufferSizeIterator, 1); - auto l3_size = *dataCacheBufferSizeIterator / thread; - auto ram_size = ramBufferSize / thread; + const auto L1iCacheSize = Settings.instructionCacheSizePerThread(); + const auto DataCacheBufferSizes = Settings.dataCacheBufferSizePerThread(); + auto DataCacheBufferSizeIterator = DataCacheBufferSizes.begin(); + const auto L1Size = *DataCacheBufferSizeIterator; + 
std::advance(DataCacheBufferSizeIterator, 1); + const auto L2Size = *DataCacheBufferSizeIterator; + std::advance(DataCacheBufferSizeIterator, 1); + const auto L3Size = *DataCacheBufferSizeIterator; + const auto RamSize = Settings.ramBufferSizePerThread(); // calculate the reset counters for the buffers - auto l2_loop_count = - getL2LoopCount(sequence, numberOfLines, l2_size * thread, thread); - auto l3_loop_count = - getL3LoopCount(sequence, numberOfLines, l3_size * thread, thread); - auto ram_loop_count = - getRAMLoopCount(sequence, numberOfLines, ram_size * thread, thread); - - CodeHolder code; - code.init(this->rt.environment()); - - if (nullptr != this->loadFunction) { - this->rt.release(&this->loadFunction); - } - - Builder cb(&code); - cb.addDiagnosticOptions( - asmjit::DiagnosticOptions::kValidateAssembler | - asmjit::DiagnosticOptions::kValidateIntermediate ); - - auto pointer_reg = rax; - auto l1_addr = rbx; - auto l2_addr = rcx; - auto l3_addr = r8; - auto ram_addr = r9; - auto l2_count_reg = r10; - auto l3_count_reg = r11; - auto ram_count_reg = r12; - auto temp_reg = r13; - auto temp_reg2 = rbp; - auto offset_reg = r14; - auto addrHigh_reg = r15; - auto iter_reg = mm0; - auto shift_reg = std::vector({rdi, rsi, rdx}); - auto shift_reg32 = std::vector({edi, esi, edx}); - auto nr_shift_regs = 3; - auto mul_regs = 2; - auto add_regs = 9; - auto alt_dst_regs = 3; - auto ram_reg = xmm15; - - FuncDetail func; - func.init(FuncSignatureT( - CallConvId::kCDecl), - this->rt.environment()); - - FuncFrame frame; - frame.init(func); + const auto L2LoopCount = + environment::payload::PayloadSettings::getL2LoopCount(Sequence, Settings.linesPerThread(), L2Size); + const auto L3LoopCount = + environment::payload::PayloadSettings::getL3LoopCount(Sequence, Settings.linesPerThread(), L3Size); + const auto RamLoopCount = + environment::payload::PayloadSettings::getRAMLoopCount(Sequence, Settings.linesPerThread(), RamSize); + + asmjit::CodeHolder Code; + 
Code.init(asmjit::Environment::host()); + + asmjit::x86::Builder Cb(&Code); + Cb.addDiagnosticOptions(asmjit::DiagnosticOptions::kValidateAssembler | + asmjit::DiagnosticOptions::kValidateIntermediate); + + const auto PointerReg = asmjit::x86::rax; + const auto L1Addr = asmjit::x86::rbx; + const auto L2Addr = asmjit::x86::rcx; + const auto L3Addr = asmjit::x86::r8; + const auto RamAddr = asmjit::x86::r9; + const auto L2CountReg = asmjit::x86::r10; + const auto L3CountReg = asmjit::x86::r11; + const auto RamCountReg = asmjit::x86::r12; + const auto TempReg = asmjit::x86::r13; + const auto TempReg2 = asmjit::x86::rbp; + const auto OffsetReg = asmjit::x86::r14; + const auto AddrHighReg = asmjit::x86::r15; + const auto IterReg = asmjit::x86::mm0; + const auto ShiftReg = std::vector({asmjit::x86::rdi, asmjit::x86::rsi, asmjit::x86::rdx}); + const auto ShiftReg32 = std::vector({asmjit::x86::edi, asmjit::x86::esi, asmjit::x86::edx}); + const auto NbShiftRegs = 3; + const auto MulRegs = 2; + const auto AddRegs = 9; + const auto AltDestRegs = 3; + const auto RamReg = asmjit::x86::xmm15; + + asmjit::FuncDetail Func; + Func.init(asmjit::FuncSignature::build( + asmjit::CallConvId::kCDecl), + Code.environment()); + + asmjit::FuncFrame Frame; + Frame.init(Func); // make (x|y)mm registers dirty - for (int i = 0; i < 16; i++) { - frame.addDirtyRegs(Ymm(i)); + for (auto I = 0; I < 16; I++) { + Frame.addDirtyRegs(asmjit::x86::Ymm(I)); } - for (int i = 0; i < 8; i++) { - frame.addDirtyRegs(Mm(i)); + for (auto I = 0; I < 8; I++) { + Frame.addDirtyRegs(asmjit::x86::Mm(I)); } // make all other used registers dirty except RAX - frame.addDirtyRegs(l1_addr, l2_addr, l3_addr, ram_addr, l2_count_reg, - l3_count_reg, ram_count_reg, temp_reg, temp_reg2, - offset_reg, addrHigh_reg, iter_reg, ram_addr); - for (const auto ® : shift_reg) { - frame.addDirtyRegs(reg); + Frame.addDirtyRegs(L1Addr, L2Addr, L3Addr, RamAddr, L2CountReg, L3CountReg, RamCountReg, TempReg, TempReg2, OffsetReg, + 
AddrHighReg, IterReg, RamAddr); + for (const auto& Reg : ShiftReg) { + Frame.addDirtyRegs(Reg); } - FuncArgsAssignment args(&func); + asmjit::FuncArgsAssignment Args(&Func); // FIXME: asmjit assigment to mm0 does not seem to be supported - args.assignAll(pointer_reg, addrHigh_reg, temp_reg); - args.updateFuncFrame(frame); - frame.finalize(); + Args.assignAll(PointerReg, AddrHighReg, TempReg); + Args.updateFuncFrame(Frame); + Frame.finalize(); - cb.emitProlog(frame); - cb.emitArgsAssignment(frame, args); + Cb.emitProlog(Frame); + Cb.emitArgsAssignment(Frame, Args); // FIXME: movq from temp_reg to iter_reg - cb.movq(iter_reg, temp_reg); + Cb.movq(IterReg, TempReg); // stop right away if low load is selected - auto FunctionExit = cb.newLabel(); + auto FunctionExit = Cb.newLabel(); - cb.mov(temp_reg, ptr_64(addrHigh_reg)); - cb.test(temp_reg, temp_reg); - cb.jz(FunctionExit); + Cb.mov(TempReg, ptr_64(AddrHighReg)); + Cb.test(TempReg, TempReg); + Cb.jz(FunctionExit); - cb.mov(offset_reg, + Cb.mov(OffsetReg, Imm(64)); // increment after each cache/memory access // Initialize registers for shift operations - for (auto const ® : shift_reg32) { - cb.mov(reg, Imm(0xAAAAAAAA)); + for (auto const& Reg : ShiftReg32) { + Cb.mov(Reg, Imm(0xAAAAAAAA)); } // Initialize AVX-Registers for FMA4 Operations - cb.vmovapd(ymm0, ymmword_ptr(pointer_reg)); - cb.vmovapd(ymm1, ymmword_ptr(pointer_reg)); - auto add_start = mul_regs; - auto add_end = mul_regs + add_regs - 1; - auto trans_start = add_regs + mul_regs; - auto trans_end = add_regs + mul_regs + alt_dst_regs - 1; - for (int i = add_start; i <= trans_end; i++) { - cb.vmovapd(Ymm(i), ymmword_ptr(pointer_reg, 256 + i * 32)); + Cb.vmovapd(asmjit::x86::ymm0, asmjit::x86::ymmword_ptr(PointerReg)); + Cb.vmovapd(asmjit::x86::ymm1, asmjit::x86::ymmword_ptr(PointerReg)); + auto AddStart = MulRegs; + auto AddEnd = MulRegs + AddRegs - 1; + auto TransStart = AddRegs + MulRegs; + auto TransEnd = AddRegs + MulRegs + AltDestRegs - 1; + for (auto I = 
AddStart; I <= TransEnd; I++) { + Cb.vmovapd(asmjit::x86::Ymm(I), asmjit::x86::ymmword_ptr(PointerReg, 256 + (I * 32))); } - cb.mov(l1_addr, pointer_reg); // address for L1-buffer - cb.mov(l2_addr, pointer_reg); - cb.add(l2_addr, Imm(l1_size)); // address for L2-buffer - cb.mov(l3_addr, pointer_reg); - cb.add(l3_addr, Imm(l2_size)); // address for L3-buffer - cb.mov(ram_addr, pointer_reg); - cb.add(ram_addr, Imm(l3_size)); // address for RAM-buffer - cb.mov(l2_count_reg, Imm(l2_loop_count)); - workerLog::trace() << "reset counter for L2-buffer with " - << l2_loop_count - << " cache line accesses per loop (" - << l2_size/1024 - << ") KiB"; - cb.mov(l3_count_reg, Imm(l3_loop_count)); - workerLog::trace() << "reset counter for L3-buffer with " - << l3_loop_count - << " cache line accesses per loop (" - << l3_size/1024 - << ") KiB"; - cb.mov(ram_count_reg, Imm(ram_loop_count)); - workerLog::trace() << "reset counter for RAM-buffer with " - << ram_loop_count - << " cache line accesses per loop (" - << ram_size/1024 - << ") KiB"; - - cb.align(AlignMode::kCode, 64); - - auto Loop = cb.newLabel(); - cb.bind(Loop); - - auto shift_pos = 0; - bool left = false; - auto add_dest = add_start + 1; - auto mov_dst = trans_start; - auto mov_src = mov_dst + 1; - unsigned l1_offset = 0; - -#define L1_INCREMENT() \ - l1_offset += 64; \ - if (l1_offset < l1_size * 0.5) { \ - cb.add(l1_addr, offset_reg); \ - } else { \ - l1_offset = 0; \ - cb.mov(l1_addr, pointer_reg); \ - } - -#define L2_INCREMENT() cb.add(l2_addr, offset_reg); - -#define L3_INCREMENT() cb.add(l3_addr, offset_reg) - -#define RAM_INCREMENT() cb.add(ram_addr, offset_reg) - - for (unsigned count = 0; count < repetitions; count++) { - for (const auto &item : sequence) { - if (item == "REG") { - cb.vfmaddpd( - Xmm(add_dest), Xmm(add_dest), xmm0, - Xmm(add_start + (add_dest - add_start + add_regs + 1) % add_regs)); - cb.vfmaddpd( - Xmm(mov_dst), Xmm(mov_dst), xmm1, - Xmm(add_start + (add_dest - add_start + add_regs + 2) % 
add_regs)); - cb.xor_(shift_reg[(shift_pos + nr_shift_regs - 1) % nr_shift_regs], - temp_reg); - mov_dst++; - } else if (item == "L1_L") { - cb.vfmaddpd( - Xmm(add_dest), Xmm(add_dest), xmm0, - Xmm(add_start + (add_dest - add_start + add_regs + 1) % add_regs)); - cb.vfmaddpd(Ymm(add_dest), Ymm(add_dest), ymm1, - ymmword_ptr(l1_addr, 32)); - L1_INCREMENT(); - } else if (item == "L1_S") { - cb.vmovapd(xmmword_ptr(l1_addr, 32), Xmm(add_dest)); - cb.vfmaddpd( - Ymm(add_dest), Ymm(add_dest), ymm0, - Ymm(add_start + (add_dest - add_start + add_regs + 1) % add_regs)); - L1_INCREMENT(); - } else if (item == "L1_LS") { - cb.vmovapd(xmmword_ptr(l1_addr, 64), Xmm(add_dest)); - cb.vfmaddpd(Ymm(add_dest), Ymm(add_dest), ymm0, - ymmword_ptr(l1_addr, 32)); - L1_INCREMENT(); - } else if (item == "L2_L") { - cb.vfmaddpd( - Xmm(add_dest), Xmm(add_dest), xmm0, - Xmm(add_start + (add_dest - add_start + add_regs + 1) % add_regs)); - cb.vfmaddpd(Xmm(add_dest), Xmm(add_dest), xmm1, - xmmword_ptr(l2_addr, 64)); - L2_INCREMENT(); - } else if (item == "L2_S") { - cb.vmovapd(xmmword_ptr(l2_addr, 64), Xmm(add_dest)); - cb.vfmaddpd( - Xmm(add_dest), Xmm(add_dest), xmm0, - Xmm(add_start + (add_dest - add_start + add_regs + 1) % add_regs)); - L2_INCREMENT(); - } else if (item == "L2_LS") { - cb.vmovapd(xmmword_ptr(l2_addr, 96), Xmm(add_dest)); - cb.vfmaddpd(Xmm(add_dest), Xmm(add_dest), xmm0, - xmmword_ptr(l2_addr, 64)); - L2_INCREMENT(); - } else if (item == "L3_L") { - cb.vfmaddpd( - Xmm(add_dest), Xmm(add_dest), xmm0, - Xmm(add_start + (add_dest - add_start + add_regs + 1) % add_regs)); - cb.vfmaddpd(Xmm(add_dest), Xmm(add_dest), xmm1, - xmmword_ptr(l3_addr, 64)); - L3_INCREMENT(); - } else if (item == "L3_S") { - cb.vmovapd(xmmword_ptr(l3_addr, 96), Xmm(add_dest)); - cb.vfmaddpd( - Xmm(add_dest), Xmm(add_dest), xmm0, - Xmm(add_start + (add_dest - add_start + add_regs + 1) % add_regs)); - L3_INCREMENT(); - } else if (item == "L3_LS") { - cb.vmovapd(xmmword_ptr(l3_addr, 96), Xmm(add_dest)); - 
cb.vfmaddpd(Xmm(add_dest), Xmm(add_dest), xmm0, - xmmword_ptr(l3_addr, 64)); - L3_INCREMENT(); - } else if (item == "L3_P") { - cb.vfmaddpd(Xmm(add_dest), Xmm(add_dest), xmm0, - xmmword_ptr(l1_addr, 32)); - cb.prefetcht2(ptr(l3_addr)); - L3_INCREMENT(); - } else if (item == "RAM_L") { - cb.vfmaddpd( - Xmm(add_dest), Xmm(add_dest), xmm0, - Xmm(add_start + (add_dest - add_start + add_regs + 1) % add_regs)); - cb.vfmaddpd(ram_reg, ram_reg, xmm1, xmmword_ptr(ram_addr, 64)); - RAM_INCREMENT(); - } else if (item == "RAM_S") { - cb.vmovapd(xmmword_ptr(ram_addr, 64), Xmm(add_dest)); - cb.vfmaddpd( - Xmm(add_dest), Xmm(add_dest), xmm0, - Xmm(add_start + (add_dest - add_start + add_regs + 1) % add_regs)); - RAM_INCREMENT(); - } else if (item == "RAM_LS") { - cb.vmovapd(xmmword_ptr(ram_addr, 64), Xmm(add_dest)); - cb.vfmaddpd(Xmm(add_dest), Xmm(add_dest), xmm0, - xmmword_ptr(ram_addr, 32)); - RAM_INCREMENT(); - } else if (item == "RAM_P") { - cb.vfmaddpd(Xmm(add_dest), Xmm(add_dest), xmm0, - xmmword_ptr(l1_addr, 32)); - cb.prefetcht2(ptr(ram_addr)); - RAM_INCREMENT(); + Cb.mov(L1Addr, PointerReg); // address for L1-buffer + Cb.mov(L2Addr, PointerReg); + Cb.add(L2Addr, Imm(L1Size)); // address for L2-buffer + Cb.mov(L3Addr, PointerReg); + Cb.add(L3Addr, Imm(L2Size)); // address for L3-buffer + Cb.mov(RamAddr, PointerReg); + Cb.add(RamAddr, Imm(L3Size)); // address for RAM-buffer + Cb.mov(L2CountReg, Imm(L2LoopCount)); + workerLog::trace() << "reset counter for L2-buffer with " << L2LoopCount << " cache line accesses per loop (" + << L2Size / 1024 << ") KiB"; + Cb.mov(L3CountReg, Imm(L3LoopCount)); + workerLog::trace() << "reset counter for L3-buffer with " << L3LoopCount << " cache line accesses per loop (" + << L3Size / 1024 << ") KiB"; + Cb.mov(RamCountReg, Imm(RamLoopCount)); + workerLog::trace() << "reset counter for RAM-buffer with " << RamLoopCount << " cache line accesses per loop (" + << RamSize / 1024 << ") KiB"; + + Cb.align(asmjit::AlignMode::kCode, 64); + + auto 
Loop = Cb.newLabel(); + Cb.bind(Loop); + + auto ShiftPos = 0; + bool Left = false; + auto AddDest = AddStart + 1; + auto MovDest = TransStart; + auto MovSrc = MovDest + 1; + unsigned L1Offset = 0; + + const auto L1Increment = [&Cb, &L1Offset, &L1Size, &L1Addr, &OffsetReg, &PointerReg]() { + L1Offset += 64; + if (L1Offset < L1Size * 0.5) { + Cb.add(L1Addr, OffsetReg); + } else { + L1Offset = 0; + Cb.mov(L1Addr, PointerReg); + } + }; + const auto L2Increment = [&Cb, &L2Addr, &OffsetReg]() { Cb.add(L2Addr, OffsetReg); }; + const auto L3Increment = [&Cb, &L3Addr, &OffsetReg]() { Cb.add(L3Addr, OffsetReg); }; + const auto RamIncrement = [&Cb, &RamAddr, &OffsetReg]() { Cb.add(RamAddr, OffsetReg); }; + + for (auto Count = 0U; Count < Repetitions; Count++) { + for (const auto& Item : Sequence) { + if (Item == "REG") { + Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm0, Xmm(AddStart + ((AddDest - AddStart + AddRegs + 1) % AddRegs))); + Cb.vfmaddpd(Xmm(MovDest), Xmm(MovDest), xmm1, Xmm(AddStart + ((AddDest - AddStart + AddRegs + 2) % AddRegs))); + Cb.xor_(ShiftReg[(ShiftPos + NbShiftRegs - 1) % NbShiftRegs], TempReg); + MovDest++; + } else if (Item == "L1_L") { + Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm0, Xmm(AddStart + ((AddDest - AddStart + AddRegs + 1) % AddRegs))); + Cb.vfmaddpd(asmjit::x86::Ymm(AddDest), asmjit::x86::Ymm(AddDest), asmjit::x86::ymm1, + asmjit::x86::ymmword_ptr(L1Addr, 32)); + L1Increment(); + } else if (Item == "L1_S") { + Cb.vmovapd(xmmword_ptr(L1Addr, 32), Xmm(AddDest)); + Cb.vfmaddpd(asmjit::x86::Ymm(AddDest), asmjit::x86::Ymm(AddDest), asmjit::x86::ymm0, + asmjit::x86::Ymm(AddStart + ((AddDest - AddStart + AddRegs + 1) % AddRegs))); + L1Increment(); + } else if (Item == "L1_LS") { + Cb.vmovapd(xmmword_ptr(L1Addr, 64), Xmm(AddDest)); + Cb.vfmaddpd(asmjit::x86::Ymm(AddDest), asmjit::x86::Ymm(AddDest), asmjit::x86::ymm0, + asmjit::x86::ymmword_ptr(L1Addr, 32)); + L1Increment(); + } else if (Item == "L2_L") { + Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), 
xmm0, Xmm(AddStart + ((AddDest - AddStart + AddRegs + 1) % AddRegs))); + Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm1, xmmword_ptr(L2Addr, 64)); + L2Increment(); + } else if (Item == "L2_S") { + Cb.vmovapd(xmmword_ptr(L2Addr, 64), Xmm(AddDest)); + Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm0, Xmm(AddStart + ((AddDest - AddStart + AddRegs + 1) % AddRegs))); + L2Increment(); + } else if (Item == "L2_LS") { + Cb.vmovapd(xmmword_ptr(L2Addr, 96), Xmm(AddDest)); + Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm0, xmmword_ptr(L2Addr, 64)); + L2Increment(); + } else if (Item == "L3_L") { + Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm0, Xmm(AddStart + ((AddDest - AddStart + AddRegs + 1) % AddRegs))); + Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm1, xmmword_ptr(L3Addr, 64)); + L3Increment(); + } else if (Item == "L3_S") { + Cb.vmovapd(xmmword_ptr(L3Addr, 96), Xmm(AddDest)); + Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm0, Xmm(AddStart + ((AddDest - AddStart + AddRegs + 1) % AddRegs))); + L3Increment(); + } else if (Item == "L3_LS") { + Cb.vmovapd(xmmword_ptr(L3Addr, 96), Xmm(AddDest)); + Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm0, xmmword_ptr(L3Addr, 64)); + L3Increment(); + } else if (Item == "L3_P") { + Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm0, xmmword_ptr(L1Addr, 32)); + Cb.prefetcht2(asmjit::x86::ptr(L3Addr)); + L3Increment(); + } else if (Item == "RAM_L") { + Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm0, Xmm(AddStart + ((AddDest - AddStart + AddRegs + 1) % AddRegs))); + Cb.vfmaddpd(RamReg, RamReg, xmm1, xmmword_ptr(RamAddr, 64)); + RamIncrement(); + } else if (Item == "RAM_S") { + Cb.vmovapd(xmmword_ptr(RamAddr, 64), Xmm(AddDest)); + Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm0, Xmm(AddStart + ((AddDest - AddStart + AddRegs + 1) % AddRegs))); + RamIncrement(); + } else if (Item == "RAM_LS") { + Cb.vmovapd(xmmword_ptr(RamAddr, 64), Xmm(AddDest)); + Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm0, xmmword_ptr(RamAddr, 32)); + RamIncrement(); + } else if (Item == "RAM_P") { + 
Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm0, xmmword_ptr(L1Addr, 32)); + Cb.prefetcht2(asmjit::x86::ptr(RamAddr)); + RamIncrement(); } else { - workerLog::error() << "Instruction group " << item << " not found in " - << this->name() << "."; - return EXIT_FAILURE; + workerLog::error() << "Instruction group " << Item << " not found in " << name() << "."; } - if (left) { - cb.shr(shift_reg32[shift_pos], Imm(1)); + if (Left) { + Cb.shr(ShiftReg32[ShiftPos], Imm(1)); } else { - cb.shl(shift_reg32[shift_pos], Imm(1)); + Cb.shl(ShiftReg32[ShiftPos], Imm(1)); } - add_dest++; - if (add_dest > add_end) { - add_dest = add_start; + AddDest++; + if (AddDest > AddEnd) { + AddDest = AddStart; } - if (mov_dst > trans_end) { - mov_dst = trans_start; + if (MovDest > TransEnd) { + MovDest = TransStart; } - mov_src++; - if (mov_src > trans_end) { - mov_src = trans_start; + MovSrc++; + if (MovSrc > TransEnd) { + MovSrc = TransStart; } - shift_pos++; - if (shift_pos == nr_shift_regs) { - shift_pos = 0; - left = !left; + ShiftPos++; + if (ShiftPos == NbShiftRegs) { + ShiftPos = 0; + Left = !Left; } } } - cb.movq(temp_reg, iter_reg); // restore iteration counter - if (this->getRAMSequenceCount(sequence) > 0) { + Cb.movq(TempReg, IterReg); // restore iteration counter + if (environment::payload::PayloadSettings::getRAMSequenceCount(Sequence) > 0) { // reset RAM counter - auto NoRamReset = cb.newLabel(); - - cb.sub(ram_count_reg, Imm(1)); - cb.jnz(NoRamReset); - cb.mov(ram_count_reg, Imm(ram_loop_count)); - cb.mov(ram_addr, pointer_reg); - cb.add(ram_addr, Imm(l3_size)); - cb.bind(NoRamReset); + auto NoRamReset = Cb.newLabel(); + + Cb.sub(RamCountReg, Imm(1)); + Cb.jnz(NoRamReset); + Cb.mov(RamCountReg, Imm(RamLoopCount)); + Cb.mov(RamAddr, PointerReg); + Cb.add(RamAddr, Imm(L3Size)); + Cb.bind(NoRamReset); // adds always two instruction - this->_instructions += 2; + Stats.Instructions += 2; } - cb.inc(temp_reg); // increment iteration counter - if (this->getL2SequenceCount(sequence) > 0) 
{ + Cb.inc(TempReg); // increment iteration counter + if (environment::payload::PayloadSettings::getL2SequenceCount(Sequence) > 0) { // reset L2-Cache counter - auto NoL2Reset = cb.newLabel(); - - cb.sub(l2_count_reg, Imm(1)); - cb.jnz(NoL2Reset); - cb.mov(l2_count_reg, Imm(l2_loop_count)); - cb.mov(l2_addr, pointer_reg); - cb.add(l2_addr, Imm(l1_size)); - cb.bind(NoL2Reset); + auto NoL2Reset = Cb.newLabel(); + + Cb.sub(L2CountReg, Imm(1)); + Cb.jnz(NoL2Reset); + Cb.mov(L2CountReg, Imm(L2LoopCount)); + Cb.mov(L2Addr, PointerReg); + Cb.add(L2Addr, Imm(L1Size)); + Cb.bind(NoL2Reset); // adds always two instruction - this->_instructions += 2; + Stats.Instructions += 2; } - cb.movq(iter_reg, temp_reg); // store iteration counter - if (this->getL3SequenceCount(sequence) > 0) { + Cb.movq(IterReg, TempReg); // store iteration counter + if (environment::payload::PayloadSettings::getL3SequenceCount(Sequence) > 0) { // reset L3-Cache counter - auto NoL3Reset = cb.newLabel(); - - cb.sub(l3_count_reg, Imm(1)); - cb.jnz(NoL3Reset); - cb.mov(l3_count_reg, Imm(l3_loop_count)); - cb.mov(l3_addr, pointer_reg); - cb.add(l3_addr, Imm(l2_size)); - cb.bind(NoL3Reset); + auto NoL3Reset = Cb.newLabel(); + + Cb.sub(L3CountReg, Imm(1)); + Cb.jnz(NoL3Reset); + Cb.mov(L3CountReg, Imm(L3LoopCount)); + Cb.mov(L3Addr, PointerReg); + Cb.add(L3Addr, Imm(L2Size)); + Cb.bind(NoL3Reset); // adds always two instruction - this->_instructions += 2; + Stats.Instructions += 2; } - cb.mov(l1_addr, pointer_reg); - - if (dumpRegisters) { - auto SkipRegistersDump = cb.newLabel(); + Cb.mov(L1Addr, PointerReg); - cb.test(ptr_64(pointer_reg, -8), Imm(firestarter::DumpVariable::Wait)); - cb.jnz(SkipRegistersDump); - - // dump all the ymm register - for (int i = 0; i < (int)this->registerCount(); i++) { - cb.vmovapd( - ymmword_ptr(pointer_reg, -64 - this->registerSize() * 8 * (i + 1)), - Ymm(i)); - } - - // set read flag - cb.mov(ptr_64(pointer_reg, -8), Imm(firestarter::DumpVariable::Wait)); - - 
cb.bind(SkipRegistersDump); + if (DumpRegisters) { + emitDumpRegisterCode(Cb, PointerReg, asmjit::x86::ymmword_ptr); } - if (errorDetection) { - this->emitErrorDetectionCode( - cb, iter_reg, addrHigh_reg, pointer_reg, temp_reg, temp_reg2); + if (ErrorDetection) { + emitErrorDetectionCode(Cb, IterReg, AddrHighReg, PointerReg, TempReg, + TempReg2); } - cb.test(ptr_64(addrHigh_reg), Imm(LOAD_HIGH)); - cb.jnz(Loop); + Cb.test(asmjit::x86::ptr_64(AddrHighReg), Imm(LoadThreadWorkType::LoadHigh)); + Cb.jnz(Loop); - cb.bind(FunctionExit); + Cb.bind(FunctionExit); - cb.movq(rax, iter_reg); + Cb.movq(asmjit::x86::rax, IterReg); - cb.emitEpilog(frame); + Cb.emitEpilog(Frame); - cb.finalize(); + Cb.finalize(); - // String sb; - // cb.dump(sb); - - Error err = this->rt.add(&this->loadFunction, &code); - if (err) { - workerLog::error() << "Asmjit adding Assembler to JitRuntime failed in " - << __FILE__ << " at " << __LINE__; - return EXIT_FAILURE; - } + auto CompiledPayloadPtr = CompiledX86Payload::create(Stats, Code); // skip if we could not determine cache size - if (l1i_cache_size != 0) { - auto loopSize = code.labelOffset(FunctionExit) - code.labelOffset(Loop); - auto instructionCachePercentage = 100 * loopSize / l1i_cache_size; + if (L1iCacheSize) { + auto LoopSize = Code.labelOffset(FunctionExit) - Code.labelOffset(Loop); + auto InstructionCachePercentage = 100 * LoopSize / *L1iCacheSize; - if (loopSize > l1i_cache_size) { + if (LoopSize > *L1iCacheSize) { workerLog::warn() << "Work-loop is bigger than the L1i-Cache."; } - workerLog::trace() << "Using " << loopSize << " of " << l1i_cache_size - << " Bytes (" << instructionCachePercentage + workerLog::trace() << "Using " << LoopSize << " of " << *L1iCacheSize << " Bytes (" << InstructionCachePercentage << "%) from the L1i-Cache for the work-loop."; - workerLog::trace() << "Sequence size: " << sequence.size(); - workerLog::trace() << "Repetition count: " << repetitions; + workerLog::trace() << "Sequence size: " << 
Sequence.size(); + workerLog::trace() << "Repetition count: " << Repetitions; } - return EXIT_SUCCESS; + return CompiledPayloadPtr; } -std::list FMA4Payload::getAvailableInstructions() const { - std::list instructions; - - transform(this->instructionFlops.begin(), this->instructionFlops.end(), - back_inserter(instructions), - [](const auto &item) { return item.first; }); - - return instructions; +void FMA4Payload::init(double* MemoryAddr, uint64_t BufferSize) const { + X86Payload::initMemory(MemoryAddr, BufferSize, 0.27948995982e-4, 0.27948995982e-4); } -void FMA4Payload::init(unsigned long long *memoryAddr, - unsigned long long bufferSize) { - X86Payload::init(memoryAddr, bufferSize, 0.27948995982e-4, 0.27948995982e-4); -} +} // namespace firestarter::environment::x86::payload \ No newline at end of file diff --git a/src/firestarter/Environment/X86/Payload/FMAPayload.cpp b/src/firestarter/Environment/X86/Payload/FMAPayload.cpp index e3087c01..cec0021a 100644 --- a/src/firestarter/Environment/X86/Payload/FMAPayload.cpp +++ b/src/firestarter/Environment/X86/Payload/FMAPayload.cpp @@ -19,468 +19,411 @@ * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ -#include -#include - -#include -#include - -using namespace firestarter::environment::x86::payload; -using namespace asmjit; -using namespace asmjit::x86; - -int FMAPayload::compilePayload( - std::vector> const &proportion, - unsigned instructionCacheSize, - std::list const &dataCacheBufferSize, unsigned ramBufferSize, - unsigned thread, unsigned numberOfLines, bool dumpRegisters, - bool errorDetection) { +#include "firestarter/Environment/X86/Payload/FMAPayload.hpp" +#include "firestarter/Environment/X86/Payload/CompiledX86Payload.hpp" + +namespace firestarter::environment::x86::payload { + +auto FMAPayload::compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters, + bool ErrorDetection) const -> 
environment::payload::CompiledPayload::UniquePtr { + using Imm = asmjit::Imm; + using Xmm = asmjit::x86::Xmm; + using Ymm = asmjit::x86::Ymm; + // NOLINTBEGIN(readability-identifier-naming) + constexpr asmjit::x86::Mem (*ymmword_ptr)(const asmjit::x86::Gp&, int32_t) = asmjit::x86::ymmword_ptr; + constexpr auto ymm0 = asmjit::x86::ymm0; + constexpr auto ymm1 = asmjit::x86::ymm1; + constexpr auto ymm2 = asmjit::x86::ymm2; + // NOLINTEND(readability-identifier-naming) + // Compute the sequence of instruction groups and the number of its repetions // to reach the desired size - auto sequence = this->generateSequence(proportion); - auto repetitions = - this->getNumberOfSequenceRepetitions(sequence, numberOfLines / thread); + auto Sequence = Settings.sequence(); + auto Repetitions = + environment::payload::PayloadSettings::getNumberOfSequenceRepetitions(Sequence, Settings.linesPerThread()); // compute count of flops and memory access for performance report - unsigned flops = 0; - unsigned bytes = 0; + environment::payload::PayloadStats Stats; - for (const auto &item : sequence) { - auto it = this->instructionFlops.find(item); + for (const auto& Item : Sequence) { + auto It = instructionFlops().find(Item); - if (it == this->instructionFlops.end()) { - workerLog::error() << "Instruction group " << item << " undefined in " - << name() << "."; - return EXIT_FAILURE; + if (It == instructionFlops().end()) { + workerLog::error() << "Instruction group " << Item << " undefined in " << name() << "."; } - flops += it->second; + Stats.Flops += It->second; - it = this->instructionMemory.find(item); + It = instructionMemory().find(Item); - if (it != this->instructionMemory.end()) { - bytes += it->second; + if (It != instructionMemory().end()) { + Stats.Bytes += It->second; } } - this->_flops = repetitions * flops; - this->_bytes = repetitions * bytes; - this->_instructions = repetitions * sequence.size() * 4 + 6; + Stats.Flops *= Repetitions; + Stats.Bytes *= Repetitions; + 
Stats.Instructions = Repetitions * Sequence.size() * 4 + 6; // calculate the buffer sizes - auto l1i_cache_size = instructionCacheSize / thread; - auto dataCacheBufferSizeIterator = dataCacheBufferSize.begin(); - auto l1_size = *dataCacheBufferSizeIterator / thread; - std::advance(dataCacheBufferSizeIterator, 1); - auto l2_size = *dataCacheBufferSizeIterator / thread; - std::advance(dataCacheBufferSizeIterator, 1); - auto l3_size = *dataCacheBufferSizeIterator / thread; - auto ram_size = ramBufferSize / thread; + const auto L1iCacheSize = Settings.instructionCacheSizePerThread(); + const auto DataCacheBufferSizes = Settings.dataCacheBufferSizePerThread(); + auto DataCacheBufferSizeIterator = DataCacheBufferSizes.begin(); + const auto L1Size = *DataCacheBufferSizeIterator; + std::advance(DataCacheBufferSizeIterator, 1); + const auto L2Size = *DataCacheBufferSizeIterator; + std::advance(DataCacheBufferSizeIterator, 1); + const auto L3Size = *DataCacheBufferSizeIterator; + const auto RamSize = Settings.ramBufferSizePerThread(); // calculate the reset counters for the buffers - auto l2_loop_count = - getL2LoopCount(sequence, numberOfLines, l2_size * thread, thread); - auto l3_loop_count = - getL3LoopCount(sequence, numberOfLines, l3_size * thread, thread); - auto ram_loop_count = - getRAMLoopCount(sequence, numberOfLines, ram_size * thread, thread); - - CodeHolder code; - code.init(this->rt.environment()); - - if (nullptr != this->loadFunction) { - this->rt.release(&this->loadFunction); - } - - Builder cb(&code); - cb.addDiagnosticOptions( - asmjit::DiagnosticOptions::kValidateAssembler | - asmjit::DiagnosticOptions::kValidateIntermediate ); - - auto pointer_reg = rax; - auto l1_addr = rbx; - auto l2_addr = rcx; - auto l3_addr = r8; - auto ram_addr = r9; - auto l2_count_reg = r10; - auto l3_count_reg = r11; - auto ram_count_reg = r12; - auto temp_reg = r13; - auto temp_reg2 = rbp; - auto offset_reg = r14; - auto addrHigh_reg = r15; - auto iter_reg = mm0; - auto 
shift_reg = std::vector({rdi, rsi, rdx}); - auto shift_reg32 = std::vector({edi, esi, edx}); - auto nr_shift_regs = 3; - auto mul_regs = 3; - auto add_regs = 9; - auto alt_dst_regs = 3; - auto ram_reg = ymm15; - - FuncDetail func; - func.init(FuncSignatureT( - CallConvId::kCDecl), - this->rt.environment()); - - FuncFrame frame; - frame.init(func); + const auto L2LoopCount = + environment::payload::PayloadSettings::getL2LoopCount(Sequence, Settings.linesPerThread(), L2Size); + const auto L3LoopCount = + environment::payload::PayloadSettings::getL3LoopCount(Sequence, Settings.linesPerThread(), L3Size); + const auto RamLoopCount = + environment::payload::PayloadSettings::getRAMLoopCount(Sequence, Settings.linesPerThread(), RamSize); + + asmjit::CodeHolder Code; + Code.init(asmjit::Environment::host()); + + asmjit::x86::Builder Cb(&Code); + Cb.addDiagnosticOptions(asmjit::DiagnosticOptions::kValidateAssembler | + asmjit::DiagnosticOptions::kValidateIntermediate); + + const auto PointerReg = asmjit::x86::rax; + const auto L1Addr = asmjit::x86::rbx; + const auto L2Addr = asmjit::x86::rcx; + const auto L3Addr = asmjit::x86::r8; + const auto RamAddr = asmjit::x86::r9; + const auto L2CountReg = asmjit::x86::r10; + const auto L3CountReg = asmjit::x86::r11; + const auto RamCountReg = asmjit::x86::r12; + const auto TempReg = asmjit::x86::r13; + const auto TempReg2 = asmjit::x86::rbp; + const auto OffsetReg = asmjit::x86::r14; + const auto AddrHighReg = asmjit::x86::r15; + const auto IterReg = asmjit::x86::mm0; + const auto ShiftRegs = std::vector({asmjit::x86::rdi, asmjit::x86::rsi, asmjit::x86::rdx}); + const auto ShiftRegs32 = std::vector({asmjit::x86::edi, asmjit::x86::esi, asmjit::x86::edx}); + const auto NbShiftRegs = 3; + const auto MulRegs = 3; + const auto AddRegs = 9; + const auto AltDestRegs = 3; + const auto RamReg = asmjit::x86::ymm15; + + asmjit::FuncDetail Func; + Func.init(asmjit::FuncSignature::build( + asmjit::CallConvId::kCDecl), + Code.environment()); + + 
asmjit::FuncFrame Frame; + Frame.init(Func); // make (x|y)mm registers dirty - for (int i = 0; i < 16; i++) { - frame.addDirtyRegs(Ymm(i)); + for (auto I = 0U; I < 16U; I++) { + Frame.addDirtyRegs(Ymm(I)); } - for (int i = 0; i < 8; i++) { - frame.addDirtyRegs(Mm(i)); + for (auto I = 0U; I < 8U; I++) { + Frame.addDirtyRegs(asmjit::x86::Mm(I)); } // make all other used registers dirty except RAX - frame.addDirtyRegs(l1_addr, l2_addr, l3_addr, ram_addr, l2_count_reg, - l3_count_reg, ram_count_reg, temp_reg, temp_reg2, - offset_reg, addrHigh_reg, iter_reg, ram_addr); - for (const auto ® : shift_reg) { - frame.addDirtyRegs(reg); + Frame.addDirtyRegs(L1Addr, L2Addr, L3Addr, RamAddr, L2CountReg, L3CountReg, RamCountReg, TempReg, TempReg2, OffsetReg, + AddrHighReg, IterReg, RamAddr); + for (const auto& Reg : ShiftRegs) { + Frame.addDirtyRegs(Reg); } - FuncArgsAssignment args(&func); + asmjit::FuncArgsAssignment Args(&Func); // FIXME: asmjit assigment to mm0 does not seem to be supported - args.assignAll(pointer_reg, addrHigh_reg, temp_reg); - args.updateFuncFrame(frame); - frame.finalize(); + Args.assignAll(PointerReg, AddrHighReg, TempReg); + Args.updateFuncFrame(Frame); + Frame.finalize(); - cb.emitProlog(frame); - cb.emitArgsAssignment(frame, args); + Cb.emitProlog(Frame); + Cb.emitArgsAssignment(Frame, Args); // FIXME: movq from temp_reg to iter_reg - cb.movq(iter_reg, temp_reg); + Cb.movq(IterReg, TempReg); // stop right away if low load is selected - auto FunctionExit = cb.newLabel(); + auto FunctionExit = Cb.newLabel(); - cb.mov(temp_reg, ptr_64(addrHigh_reg)); - cb.test(temp_reg, temp_reg); - cb.jz(FunctionExit); + Cb.mov(TempReg, ptr_64(AddrHighReg)); + Cb.test(TempReg, TempReg); + Cb.jz(FunctionExit); - cb.mov(offset_reg, + Cb.mov(OffsetReg, Imm(64)); // increment after each cache/memory access // Initialize registers for shift operations - for (auto const ® : shift_reg32) { - cb.mov(reg, Imm(0xAAAAAAAA)); + for (auto const& Reg : ShiftRegs32) { + Cb.mov(Reg, 
Imm(0xAAAAAAAA)); } // Initialize AVX-Registers for FMA Operations - cb.vmovapd(ymm0, ymmword_ptr(pointer_reg)); - cb.vmovapd(ymm1, ymmword_ptr(pointer_reg, 32)); - cb.vmovapd(ymm2, ymmword_ptr(pointer_reg, 64)); - auto add_start = mul_regs; - auto add_end = mul_regs + add_regs - 1; - auto trans_start = add_regs + mul_regs; - auto trans_end = add_regs + mul_regs + alt_dst_regs - 1; - for (int i = add_start; i <= trans_end; i++) { - cb.vmovapd(Ymm(i), ymmword_ptr(pointer_reg, 256 + i * 32)); - } - cb.mov(l1_addr, pointer_reg); // address for L1-buffer - cb.mov(l2_addr, pointer_reg); - cb.add(l2_addr, Imm(l1_size)); // address for L2-buffer - cb.mov(l3_addr, pointer_reg); - cb.add(l3_addr, Imm(l2_size)); // address for L3-buffer - cb.mov(ram_addr, pointer_reg); - cb.add(ram_addr, Imm(l3_size)); // address for RAM-buffer - cb.mov(l2_count_reg, Imm(l2_loop_count)); - workerLog::trace() << "reset counter for L2-buffer with " - << l2_loop_count - << " cache line accesses per loop (" - << l2_size/1024 - << ") KiB"; - cb.mov(l3_count_reg, Imm(l3_loop_count)); - workerLog::trace() << "reset counter for L3-buffer with " - << l3_loop_count - << " cache line accesses per loop (" - << l3_size/1024 - << ") KiB"; - cb.mov(ram_count_reg, Imm(ram_loop_count)); - workerLog::trace() << "reset counter for RAM-buffer with " - << ram_loop_count - << " cache line accesses per loop (" - << ram_size/1024 - << ") KiB"; - - cb.align(AlignMode::kCode, 64); - - auto Loop = cb.newLabel(); - cb.bind(Loop); - - auto shift_pos = 0; - bool left = false; - auto add_dest = add_start + 1; - auto mov_dst = trans_start; - auto mov_src = mov_dst + 1; - unsigned l1_offset = 0; - -#define L1_INCREMENT_TIMES(n) \ - l1_offset += n * 64; \ - if (l1_offset < l1_size * 0.5) { \ - cb.add(l1_addr, offset_reg); \ - } else { \ - l1_offset = 0; \ - cb.mov(l1_addr, pointer_reg); \ - } - -#define L1_INCREMENT() L1_INCREMENT_TIMES(1) - -#define L2_INCREMENT_TIMES(n) \ - if (n == 1) { \ - cb.add(l2_addr, offset_reg); \ 
- } else { \ - cb.add(l2_addr, n * 64); \ + Cb.vmovapd(ymm0, ymmword_ptr(PointerReg, 0)); + Cb.vmovapd(ymm1, ymmword_ptr(PointerReg, 32)); + Cb.vmovapd(ymm2, ymmword_ptr(PointerReg, 64)); + auto AddStart = MulRegs; + auto AddEnd = MulRegs + AddRegs - 1; + auto TransStart = AddRegs + MulRegs; + auto TransEnd = AddRegs + MulRegs + AltDestRegs - 1; + for (auto I = AddStart; I <= TransEnd; I++) { + Cb.vmovapd(Ymm(I), ymmword_ptr(PointerReg, 256 + (I * 32))); } - -#define L2_INCREMENT() L2_INCREMENT_TIMES(1) - -#define L3_INCREMENT() cb.add(l3_addr, offset_reg) - -#define RAM_INCREMENT() cb.add(ram_addr, offset_reg) - - for (unsigned count = 0; count < repetitions; count++) { - for (const auto &item : sequence) { - if (item == "REG") { - cb.vfmadd231pd(Ymm(add_dest), ymm0, ymm2); - cb.vfmadd231pd(Ymm(mov_dst), ymm2, ymm1); - cb.xor_(shift_reg[(shift_pos + nr_shift_regs - 1) % nr_shift_regs], - temp_reg); - mov_dst++; - } else if (item == "L1_L") { - cb.vfmadd231pd(Ymm(add_dest), ymm0, ymm2); - cb.vfmadd231pd(Ymm(add_dest), ymm1, ymmword_ptr(l1_addr, 32)); - L1_INCREMENT(); - } else if (item == "L1_2L") { - cb.vfmadd231pd(Ymm(add_dest), ymm0, ymmword_ptr(l1_addr, 32)); - cb.vfmadd231pd(Ymm(mov_dst), ymm1, ymmword_ptr(l1_addr, 64)); - L1_INCREMENT(); - } else if (item == "L1_S") { - cb.vmovapd(xmmword_ptr(l1_addr, 32), Xmm(add_dest)); - cb.vfmadd231pd(Ymm(add_dest), ymm0, ymm2); - L1_INCREMENT(); - } else if (item == "L1_LS") { - cb.vmovapd(xmmword_ptr(l1_addr, 64), Xmm(add_dest)); - cb.vfmadd231pd(Ymm(add_dest), ymm0, ymmword_ptr(l1_addr, 32)); - L1_INCREMENT(); - } else if (item == "L1_LS_256") { - cb.vfmadd231pd(Ymm(add_dest), ymm0, ymmword_ptr(l1_addr, 64)); - cb.vmovapd(ymmword_ptr(l1_addr, 32), Ymm(add_dest)); - L1_INCREMENT(); - } else if (item == "L1_2LS_256") { - cb.vfmadd231pd(Ymm(add_dest), ymm0, ymmword_ptr(l1_addr, 64)); - cb.vfmadd231pd(Ymm(mov_dst), ymm1, ymmword_ptr(l1_addr, 96)); - cb.vmovapd(ymmword_ptr(l1_addr, 32), Ymm(add_dest)); - 
L1_INCREMENT_TIMES(2); - } else if (item == "L2_L") { - cb.vfmadd231pd(Ymm(add_dest), ymm0, ymm2); - cb.vfmadd231pd(Ymm(add_dest), ymm1, ymmword_ptr(l2_addr, 64)); - L2_INCREMENT(); - } else if (item == "L2_S") { - cb.vmovapd(xmmword_ptr(l2_addr, 64), Xmm(add_dest)); - cb.vfmadd231pd(Ymm(add_dest), ymm0, ymm2); - L2_INCREMENT(); - } else if (item == "L2_LS") { - cb.vmovapd(xmmword_ptr(l2_addr, 96), Xmm(add_dest)); - cb.vfmadd231pd(Ymm(add_dest), ymm0, ymmword_ptr(l2_addr, 64)); - L2_INCREMENT(); - } else if (item == "L2_LS_256") { - cb.vmovapd(ymmword_ptr(l2_addr, 96), Ymm(add_dest)); - cb.vfmadd231pd(Ymm(add_dest), ymm0, ptr(l2_addr, 64)); - L2_INCREMENT(); - } else if (item == "L2_2LS_256") { - cb.vfmadd231pd(Ymm(add_dest), ymm0, ptr(l2_addr, 64)); - cb.vfmadd231pd(Ymm(mov_dst), ymm1, ptr(l2_addr, 96)); - cb.vmovapd(ymmword_ptr(l2_addr, 32), Ymm(add_dest)); - L2_INCREMENT_TIMES(2); - } else if (item == "L3_L") { - cb.vfmadd231pd(Ymm(add_dest), ymm0, ymm2); - cb.vfmadd231pd(Ymm(add_dest), ymm1, ymmword_ptr(l3_addr, 64)); - L3_INCREMENT(); - } else if (item == "L3_S") { - cb.vmovapd(xmmword_ptr(l3_addr, 96), Xmm(add_dest)); - cb.vfmadd231pd(Ymm(add_dest), ymm0, ymm2); - L3_INCREMENT(); - } else if (item == "L3_LS") { - cb.vmovapd(xmmword_ptr(l3_addr, 96), Xmm(add_dest)); - cb.vfmadd231pd(Ymm(add_dest), ymm0, ymmword_ptr(l3_addr, 64)); - L3_INCREMENT(); - } else if (item == "L3_LS_256") { - cb.vmovapd(ymmword_ptr(l3_addr, 96), Ymm(add_dest)); - cb.vfmadd231pd(Ymm(add_dest), ymm0, ymmword_ptr(l3_addr, 64)); - L3_INCREMENT(); - } else if (item == "L3_P") { - cb.vfmadd231pd(Ymm(add_dest), ymm0, ymmword_ptr(l1_addr, 32)); - cb.prefetcht2(ptr(l3_addr)); - L3_INCREMENT(); - } else if (item == "RAM_L") { - cb.vfmadd231pd(Ymm(add_dest), ymm0, ymm2); - cb.vfmadd231pd(ram_reg, ymm1, ymmword_ptr(ram_addr, 64)); - RAM_INCREMENT(); - } else if (item == "RAM_S") { - cb.vmovapd(xmmword_ptr(ram_addr, 64), Xmm(add_dest)); - cb.vfmadd231pd(Ymm(add_dest), ymm0, ymm2); - 
RAM_INCREMENT(); - } else if (item == "RAM_LS") { - cb.vmovapd(xmmword_ptr(ram_addr, 64), Xmm(add_dest)); - cb.vfmadd231pd(Ymm(add_dest), ymm0, ymmword_ptr(ram_addr, 32)); - RAM_INCREMENT(); - } else if (item == "RAM_P") { - cb.vfmadd231pd(Ymm(add_dest), ymm0, ymmword_ptr(l1_addr, 32)); - cb.prefetcht2(ptr(ram_addr)); - RAM_INCREMENT(); + Cb.mov(L1Addr, PointerReg); // address for L1-buffer + Cb.mov(L2Addr, PointerReg); + Cb.add(L2Addr, Imm(L1Size)); // address for L2-buffer + Cb.mov(L3Addr, PointerReg); + Cb.add(L3Addr, Imm(L2Size)); // address for L3-buffer + Cb.mov(RamAddr, PointerReg); + Cb.add(RamAddr, Imm(L3Size)); // address for RAM-buffer + Cb.mov(L2CountReg, Imm(L2LoopCount)); + workerLog::trace() << "reset counter for L2-buffer with " << L2LoopCount << " cache line accesses per loop (" + << L2Size / 1024 << ") KiB"; + Cb.mov(L3CountReg, Imm(L3LoopCount)); + workerLog::trace() << "reset counter for L3-buffer with " << L3LoopCount << " cache line accesses per loop (" + << L3Size / 1024 << ") KiB"; + Cb.mov(RamCountReg, Imm(RamLoopCount)); + workerLog::trace() << "reset counter for RAM-buffer with " << RamLoopCount << " cache line accesses per loop (" + << RamSize / 1024 << ") KiB"; + + Cb.align(asmjit::AlignMode::kCode, 64); + + auto Loop = Cb.newLabel(); + Cb.bind(Loop); + + auto ShiftPos = 0; + bool Left = false; + auto AddDest = AddStart + 1; + auto MovDest = TransStart; + auto MovSrc = MovDest + 1; + unsigned L1Offset = 0; + + const auto L1IncrementTimes = [&Cb, &L1Offset, &L1Size, &L1Addr, &OffsetReg, &PointerReg](unsigned Times) { + L1Offset += Times * 64; + if (L1Offset < L1Size * 0.5) { + Cb.add(L1Addr, OffsetReg); + } else { + L1Offset = 0; + Cb.mov(L1Addr, PointerReg); + } + }; + const auto L1Increment = [&L1IncrementTimes] { L1IncrementTimes(1); }; + const auto L2IncrementTimes = [&Cb, &L2Addr, &OffsetReg](unsigned Times) { + if (Times == 1) { + Cb.add(L2Addr, OffsetReg); + } else { + Cb.add(L2Addr, Times * 64); + } + }; + const auto L2Increment 
= [&L2IncrementTimes] { L2IncrementTimes(1); }; + const auto L3Increment = [&Cb, &L3Addr, &OffsetReg]() { Cb.add(L3Addr, OffsetReg); }; + const auto RamIncrement = [&Cb, &RamAddr, &OffsetReg]() { Cb.add(RamAddr, OffsetReg); }; + + for (auto Count = 0U; Count < Repetitions; Count++) { + for (const auto& Item : Sequence) { + if (Item == "REG") { + Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymm2); + Cb.vfmadd231pd(Ymm(MovDest), ymm2, ymm1); + Cb.xor_(ShiftRegs[(ShiftPos + NbShiftRegs - 1) % NbShiftRegs], TempReg); + MovDest++; + } else if (Item == "L1_L") { + Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymm2); + Cb.vfmadd231pd(Ymm(AddDest), ymm1, ymmword_ptr(L1Addr, 32)); + L1Increment(); + } else if (Item == "L1_2L") { + Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymmword_ptr(L1Addr, 32)); + Cb.vfmadd231pd(Ymm(MovDest), ymm1, ymmword_ptr(L1Addr, 64)); + L1Increment(); + } else if (Item == "L1_S") { + Cb.vmovapd(xmmword_ptr(L1Addr, 32), Xmm(AddDest)); + Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymm2); + L1Increment(); + } else if (Item == "L1_LS") { + Cb.vmovapd(xmmword_ptr(L1Addr, 64), Xmm(AddDest)); + Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymmword_ptr(L1Addr, 32)); + L1Increment(); + } else if (Item == "L1_LS_256") { + Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymmword_ptr(L1Addr, 64)); + Cb.vmovapd(ymmword_ptr(L1Addr, 32), Ymm(AddDest)); + L1Increment(); + } else if (Item == "L1_2LS_256") { + Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymmword_ptr(L1Addr, 64)); + Cb.vfmadd231pd(Ymm(MovDest), ymm1, ymmword_ptr(L1Addr, 96)); + Cb.vmovapd(ymmword_ptr(L1Addr, 32), Ymm(AddDest)); + L1IncrementTimes(2); + } else if (Item == "L2_L") { + Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymm2); + Cb.vfmadd231pd(Ymm(AddDest), ymm1, ymmword_ptr(L2Addr, 64)); + L2Increment(); + } else if (Item == "L2_S") { + Cb.vmovapd(xmmword_ptr(L2Addr, 64), Xmm(AddDest)); + Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymm2); + L2Increment(); + } else if (Item == "L2_LS") { + Cb.vmovapd(xmmword_ptr(L2Addr, 96), Xmm(AddDest)); + Cb.vfmadd231pd(Ymm(AddDest), ymm0, 
ymmword_ptr(L2Addr, 64)); + L2Increment(); + } else if (Item == "L2_LS_256") { + Cb.vmovapd(ymmword_ptr(L2Addr, 96), Ymm(AddDest)); + Cb.vfmadd231pd(Ymm(AddDest), ymm0, ptr(L2Addr, 64)); + L2Increment(); + } else if (Item == "L2_2LS_256") { + Cb.vfmadd231pd(Ymm(AddDest), ymm0, ptr(L2Addr, 64)); + Cb.vfmadd231pd(Ymm(MovDest), ymm1, ptr(L2Addr, 96)); + Cb.vmovapd(ymmword_ptr(L2Addr, 32), Ymm(AddDest)); + L2IncrementTimes(2); + } else if (Item == "L3_L") { + Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymm2); + Cb.vfmadd231pd(Ymm(AddDest), ymm1, ymmword_ptr(L3Addr, 64)); + L3Increment(); + } else if (Item == "L3_S") { + Cb.vmovapd(xmmword_ptr(L3Addr, 96), Xmm(AddDest)); + Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymm2); + L3Increment(); + } else if (Item == "L3_LS") { + Cb.vmovapd(xmmword_ptr(L3Addr, 96), Xmm(AddDest)); + Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymmword_ptr(L3Addr, 64)); + L3Increment(); + } else if (Item == "L3_LS_256") { + Cb.vmovapd(ymmword_ptr(L3Addr, 96), Ymm(AddDest)); + Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymmword_ptr(L3Addr, 64)); + L3Increment(); + } else if (Item == "L3_P") { + Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymmword_ptr(L1Addr, 32)); + Cb.prefetcht2(ptr(L3Addr)); + L3Increment(); + } else if (Item == "RAM_L") { + Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymm2); + Cb.vfmadd231pd(RamReg, ymm1, ymmword_ptr(RamAddr, 64)); + RamIncrement(); + } else if (Item == "RAM_S") { + Cb.vmovapd(xmmword_ptr(RamAddr, 64), Xmm(AddDest)); + Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymm2); + RamIncrement(); + } else if (Item == "RAM_LS") { + Cb.vmovapd(xmmword_ptr(RamAddr, 64), Xmm(AddDest)); + Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymmword_ptr(RamAddr, 32)); + RamIncrement(); + } else if (Item == "RAM_P") { + Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymmword_ptr(L1Addr, 32)); + Cb.prefetcht2(ptr(RamAddr)); + RamIncrement(); } else { - workerLog::error() << "Instruction group " << item << " not found in " - << this->name() << "."; - return EXIT_FAILURE; + workerLog::error() << "Instruction group " << Item 
<< " not found in " << name() << "."; } - if (item != "L1_2LS_256" && item != "L2_2LS_256") { - if (left) { - cb.shr(shift_reg32[shift_pos], Imm(1)); + if (Item != "L1_2LS_256" && Item != "L2_2LS_256") { + if (Left) { + Cb.shr(ShiftRegs32[ShiftPos], Imm(1)); } else { - cb.shl(shift_reg32[shift_pos], Imm(1)); + Cb.shl(ShiftRegs32[ShiftPos], Imm(1)); } } - add_dest++; - if (add_dest > add_end) { - add_dest = add_start; + AddDest++; + if (AddDest > AddEnd) { + AddDest = AddStart; } - if (mov_dst > trans_end) { - mov_dst = trans_start; + if (MovDest > TransEnd) { + MovDest = TransStart; } - mov_src++; - if (mov_src > trans_end) { - mov_src = trans_start; + MovSrc++; + if (MovSrc > TransEnd) { + MovSrc = TransStart; } - shift_pos++; - if (shift_pos == nr_shift_regs) { - shift_pos = 0; - left = !left; + ShiftPos++; + if (ShiftPos == NbShiftRegs) { + ShiftPos = 0; + Left = !Left; } } } - cb.movq(temp_reg, iter_reg); // restore iteration counter - if (this->getRAMSequenceCount(sequence) > 0) { + Cb.movq(TempReg, IterReg); // restore iteration counter + if (environment::payload::PayloadSettings::getRAMSequenceCount(Sequence) > 0) { // reset RAM counter - auto NoRamReset = cb.newLabel(); - - cb.sub(ram_count_reg, Imm(1)); - cb.jnz(NoRamReset); - cb.mov(ram_count_reg, Imm(ram_loop_count)); - cb.mov(ram_addr, pointer_reg); - cb.add(ram_addr, Imm(l3_size)); - cb.bind(NoRamReset); + auto NoRamReset = Cb.newLabel(); + + Cb.sub(RamCountReg, Imm(1)); + Cb.jnz(NoRamReset); + Cb.mov(RamCountReg, Imm(RamLoopCount)); + Cb.mov(RamAddr, PointerReg); + Cb.add(RamAddr, Imm(L3Size)); + Cb.bind(NoRamReset); // adds always two instruction - this->_instructions += 2; + Stats.Instructions += 2; } - cb.inc(temp_reg); // increment iteration counter - if (this->getL2SequenceCount(sequence) > 0) { + Cb.inc(TempReg); // increment iteration counter + if (environment::payload::PayloadSettings::getL2SequenceCount(Sequence) > 0) { // reset L2-Cache counter - auto NoL2Reset = cb.newLabel(); - - 
cb.sub(l2_count_reg, Imm(1)); - cb.jnz(NoL2Reset); - cb.mov(l2_count_reg, Imm(l2_loop_count)); - cb.mov(l2_addr, pointer_reg); - cb.add(l2_addr, Imm(l1_size)); - cb.bind(NoL2Reset); + auto NoL2Reset = Cb.newLabel(); + + Cb.sub(L2CountReg, Imm(1)); + Cb.jnz(NoL2Reset); + Cb.mov(L2CountReg, Imm(L2LoopCount)); + Cb.mov(L2Addr, PointerReg); + Cb.add(L2Addr, Imm(L1Size)); + Cb.bind(NoL2Reset); // adds always two instruction - this->_instructions += 2; + Stats.Instructions += 2; } - cb.movq(iter_reg, temp_reg); // store iteration counter - if (this->getL3SequenceCount(sequence) > 0) { + Cb.movq(IterReg, TempReg); // store iteration counter + if (environment::payload::PayloadSettings::getL3SequenceCount(Sequence) > 0) { // reset L3-Cache counter - auto NoL3Reset = cb.newLabel(); - - cb.sub(l3_count_reg, Imm(1)); - cb.jnz(NoL3Reset); - cb.mov(l3_count_reg, Imm(l3_loop_count)); - cb.mov(l3_addr, pointer_reg); - cb.add(l3_addr, Imm(l2_size)); - cb.bind(NoL3Reset); + auto NoL3Reset = Cb.newLabel(); + + Cb.sub(L3CountReg, Imm(1)); + Cb.jnz(NoL3Reset); + Cb.mov(L3CountReg, Imm(L3LoopCount)); + Cb.mov(L3Addr, PointerReg); + Cb.add(L3Addr, Imm(L2Size)); + Cb.bind(NoL3Reset); // adds always two instruction - this->_instructions += 2; + Stats.Instructions += 2; } - cb.mov(l1_addr, pointer_reg); + Cb.mov(L1Addr, PointerReg); - if (dumpRegisters) { - auto SkipRegistersDump = cb.newLabel(); - - cb.test(ptr_64(pointer_reg, -8), Imm(firestarter::DumpVariable::Wait)); - cb.jnz(SkipRegistersDump); - - // dump all the ymm register - for (int i = 0; i < (int)this->registerCount(); i++) { - cb.vmovapd( - ymmword_ptr(pointer_reg, -64 - this->registerSize() * 8 * (i + 1)), - Ymm(i)); - } - - // set read flag - cb.mov(ptr_64(pointer_reg, -8), Imm(firestarter::DumpVariable::Wait)); - - cb.bind(SkipRegistersDump); + if (DumpRegisters) { + emitDumpRegisterCode(Cb, PointerReg, ymmword_ptr); } - if (errorDetection) { - this->emitErrorDetectionCode( - cb, iter_reg, addrHigh_reg, pointer_reg, 
temp_reg, temp_reg2); + if (ErrorDetection) { + emitErrorDetectionCode(Cb, IterReg, AddrHighReg, PointerReg, TempReg, TempReg2); } - cb.test(ptr_64(addrHigh_reg), Imm(LOAD_HIGH)); - cb.jnz(Loop); + Cb.test(ptr_64(AddrHighReg), Imm(LoadThreadWorkType::LoadHigh)); + Cb.jnz(Loop); - cb.bind(FunctionExit); + Cb.bind(FunctionExit); - cb.movq(rax, iter_reg); + Cb.movq(asmjit::x86::rax, IterReg); - cb.emitEpilog(frame); + Cb.emitEpilog(Frame); - cb.finalize(); + Cb.finalize(); - // String sb; - // cb.dump(sb); - - Error err = this->rt.add(&this->loadFunction, &code); - if (err) { - workerLog::error() << "Asmjit adding Assembler to JitRuntime failed in " - << __FILE__ << " at " << __LINE__; - return EXIT_FAILURE; - } + auto CompiledPayloadPtr = CompiledX86Payload::create(Stats, Code); // skip if we could not determine cache size - if (l1i_cache_size != 0) { - auto loopSize = code.labelOffset(FunctionExit) - code.labelOffset(Loop); - auto instructionCachePercentage = 100 * loopSize / l1i_cache_size; + if (L1iCacheSize) { + auto LoopSize = Code.labelOffset(FunctionExit) - Code.labelOffset(Loop); + auto InstructionCachePercentage = 100 * LoopSize / *L1iCacheSize; - if (loopSize > l1i_cache_size) { + if (LoopSize > *L1iCacheSize) { workerLog::warn() << "Work-loop is bigger than the L1i-Cache."; } - workerLog::trace() << "Using " << loopSize << " of " << l1i_cache_size - << " Bytes (" << instructionCachePercentage + workerLog::trace() << "Using " << LoopSize << " of " << *L1iCacheSize << " Bytes (" << InstructionCachePercentage << "%) from the L1i-Cache for the work-loop."; - workerLog::trace() << "Sequence size: " << sequence.size(); - workerLog::trace() << "Repetition count: " << repetitions; + workerLog::trace() << "Sequence size: " << Sequence.size(); + workerLog::trace() << "Repetition count: " << Repetitions; } - return EXIT_SUCCESS; + return CompiledPayloadPtr; } -std::list FMAPayload::getAvailableInstructions() const { - std::list instructions; - - 
transform(this->instructionFlops.begin(), this->instructionFlops.end(), - back_inserter(instructions), - [](const auto &item) { return item.first; }); - - return instructions; +void FMAPayload::init(double* MemoryAddr, uint64_t BufferSize) const { + X86Payload::initMemory(MemoryAddr, BufferSize, 0.27948995982e-4, 0.27948995982e-4); } -void FMAPayload::init(unsigned long long *memoryAddr, - unsigned long long bufferSize) { - X86Payload::init(memoryAddr, bufferSize, 0.27948995982e-4, 0.27948995982e-4); -} +} // namespace firestarter::environment::x86::payload \ No newline at end of file diff --git a/src/firestarter/Environment/X86/Payload/SSE2Payload.cpp b/src/firestarter/Environment/X86/Payload/SSE2Payload.cpp index d22880d1..fc77c8e1 100644 --- a/src/firestarter/Environment/X86/Payload/SSE2Payload.cpp +++ b/src/firestarter/Environment/X86/Payload/SSE2Payload.cpp @@ -19,466 +19,394 @@ * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ -#include -#include - -#include -#include - -using namespace firestarter::environment::x86::payload; -using namespace asmjit; -using namespace asmjit::x86; - -int SSE2Payload::compilePayload( - std::vector> const &proportion, - unsigned instructionCacheSize, - std::list const &dataCacheBufferSize, unsigned ramBufferSize, - unsigned thread, unsigned numberOfLines, bool dumpRegisters, - bool errorDetection) { +#include "firestarter/Environment/X86/Payload/SSE2Payload.hpp" +#include "firestarter/Environment/X86/Payload/CompiledX86Payload.hpp" + +namespace firestarter::environment::x86::payload { + +auto SSE2Payload::compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters, + bool ErrorDetection) const -> environment::payload::CompiledPayload::UniquePtr { + using Imm = asmjit::Imm; + using Mm = asmjit::x86::Mm; + using Xmm = asmjit::x86::Xmm; + // NOLINTNEXTLINE(readability-identifier-naming) + constexpr asmjit::x86::Mem 
(*xmmword_ptr)(const asmjit::x86::Gp&, int32_t) = asmjit::x86::xmmword_ptr; + // Compute the sequence of instruction groups and the number of its repetions // to reach the desired size - auto sequence = this->generateSequence(proportion); - auto repetitions = - this->getNumberOfSequenceRepetitions(sequence, numberOfLines / thread); + auto Sequence = Settings.sequence(); + auto Repetitions = + environment::payload::PayloadSettings::getNumberOfSequenceRepetitions(Sequence, Settings.linesPerThread()); // compute count of flops and memory access for performance report - unsigned flops = 0; - unsigned bytes = 0; + environment::payload::PayloadStats Stats; - for (const auto &item : sequence) { - auto it = this->instructionFlops.find(item); + for (const auto& Item : Sequence) { + auto It = instructionFlops().find(Item); - if (it == this->instructionFlops.end()) { - workerLog::error() << "Instruction group " << item << " undefined in " - << name() << "."; - return EXIT_FAILURE; + if (It == instructionFlops().end()) { + workerLog::error() << "Instruction group " << Item << " undefined in " << name() << "."; } - flops += it->second; + Stats.Flops += It->second; - it = this->instructionMemory.find(item); + It = instructionMemory().find(Item); - if (it != this->instructionMemory.end()) { - bytes += it->second; + if (It != instructionMemory().end()) { + Stats.Bytes += It->second; } } - this->_flops = repetitions * flops; - this->_bytes = repetitions * bytes; - this->_instructions = repetitions * sequence.size() * 2 + 4; + Stats.Flops *= Repetitions; + Stats.Bytes *= Repetitions; + Stats.Instructions = Repetitions * Sequence.size() * 2 + 4; // calculate the buffer sizes - auto l1i_cache_size = instructionCacheSize / thread; - auto dataCacheBufferSizeIterator = dataCacheBufferSize.begin(); - auto l1_size = *dataCacheBufferSizeIterator / thread; - std::advance(dataCacheBufferSizeIterator, 1); - auto l2_size = *dataCacheBufferSizeIterator / thread; - 
std::advance(dataCacheBufferSizeIterator, 1); - auto l3_size = *dataCacheBufferSizeIterator / thread; - auto ram_size = ramBufferSize / thread; + const auto L1iCacheSize = Settings.instructionCacheSizePerThread(); + const auto DataCacheBufferSizes = Settings.dataCacheBufferSizePerThread(); + auto DataCacheBufferSizeIterator = DataCacheBufferSizes.begin(); + const auto L1Size = *DataCacheBufferSizeIterator; + std::advance(DataCacheBufferSizeIterator, 1); + const auto L2Size = *DataCacheBufferSizeIterator; + std::advance(DataCacheBufferSizeIterator, 1); + const auto L3Size = *DataCacheBufferSizeIterator; + const auto RamSize = Settings.ramBufferSizePerThread(); // calculate the reset counters for the buffers - auto l2_loop_count = - getL2LoopCount(sequence, numberOfLines, l2_size * thread, thread); - auto l3_loop_count = - getL3LoopCount(sequence, numberOfLines, l3_size * thread, thread); - auto ram_loop_count = - getRAMLoopCount(sequence, numberOfLines, ram_size * thread, thread); - - CodeHolder code; - code.init(this->rt.environment()); - - if (nullptr != this->loadFunction) { - this->rt.release(&this->loadFunction); - } - - Builder cb(&code); - cb.addDiagnosticOptions( - asmjit::DiagnosticOptions::kValidateAssembler | - asmjit::DiagnosticOptions::kValidateIntermediate ); - - auto pointer_reg = rax; - auto l1_addr = rbx; - auto l2_addr = rcx; - auto l3_addr = rdx; - auto ram_addr = rdi; - auto l2_count_reg = r8; - auto l3_count_reg = r9; - auto ram_count_reg = r10; - auto temp_reg = r11; - auto temp_reg2 = rbp; - auto offset_reg = r12; - auto addrHigh_reg = r13; - auto iter_reg = r14; - auto mov_regs = 0; - auto add_regs = 14; - auto trans_regs = 2; - - FuncDetail func; - func.init(FuncSignatureT( - CallConvId::kCDecl), - this->rt.environment()); - - FuncFrame frame; - frame.init(func); + const auto L2LoopCount = + environment::payload::PayloadSettings::getL2LoopCount(Sequence, Settings.linesPerThread(), L2Size); + const auto L3LoopCount = + 
environment::payload::PayloadSettings::getL3LoopCount(Sequence, Settings.linesPerThread(), L3Size); + const auto RamLoopCount = + environment::payload::PayloadSettings::getRAMLoopCount(Sequence, Settings.linesPerThread(), RamSize); + + asmjit::CodeHolder Code; + Code.init(asmjit::Environment::host()); + + asmjit::x86::Builder Cb(&Code); + Cb.addDiagnosticOptions(asmjit::DiagnosticOptions::kValidateAssembler | + asmjit::DiagnosticOptions::kValidateIntermediate); + + const auto PointerReg = asmjit::x86::rax; + const auto L1Addr = asmjit::x86::rbx; + const auto L2Addr = asmjit::x86::rcx; + const auto L3Addr = asmjit::x86::rdx; + const auto RamAddr = asmjit::x86::rdi; + const auto L2CountReg = asmjit::x86::r8; + const auto L3CountReg = asmjit::x86::r9; + const auto RamCountReg = asmjit::x86::r10; + const auto TempReg = asmjit::x86::r11; + const auto TempReg2 = asmjit::x86::rbp; + const auto OffsetReg = asmjit::x86::r12; + const auto AddrHighReg = asmjit::x86::r13; + const auto IterReg = asmjit::x86::r14; + constexpr const auto MovRegs = 0; + const auto AddRegs = 14; + const auto TransRegs = 2; + + asmjit::FuncDetail Func; + Func.init(asmjit::FuncSignature::build( + asmjit::CallConvId::kCDecl), + Code.environment()); + + asmjit::FuncFrame Frame; + Frame.init(Func); // make xmm registers dirty - for (int i = 0; i < 16; i++) { - frame.addDirtyRegs(Xmm(i)); + for (auto I = 0U; I < 16U; I++) { + Frame.addDirtyRegs(Xmm(I)); } // make mmx registers dirty - for (int i = 0; i < 8; i++) { - frame.addDirtyRegs(Mm(i)); + for (auto I = 0U; I < 8U; I++) { + Frame.addDirtyRegs(Mm(I)); } // make all other used registers dirty except RAX - frame.addDirtyRegs(l1_addr, l2_addr, l3_addr, ram_addr, l2_count_reg, - l3_count_reg, ram_count_reg, temp_reg, temp_reg2, - offset_reg, addrHigh_reg, iter_reg); + Frame.addDirtyRegs(L1Addr, L2Addr, L3Addr, RamAddr, L2CountReg, L3CountReg, RamCountReg, TempReg, TempReg2, OffsetReg, + AddrHighReg, IterReg); - FuncArgsAssignment args(&func); - 
args.assignAll(pointer_reg, addrHigh_reg, iter_reg); - args.updateFuncFrame(frame); - frame.finalize(); + asmjit::FuncArgsAssignment Args(&Func); + Args.assignAll(PointerReg, AddrHighReg, IterReg); + Args.updateFuncFrame(Frame); + Frame.finalize(); - cb.emitProlog(frame); - cb.emitArgsAssignment(frame, args); + Cb.emitProlog(Frame); + Cb.emitArgsAssignment(Frame, Args); // stop right away if low load is selected - auto FunctionExit = cb.newLabel(); + auto FunctionExit = Cb.newLabel(); - cb.mov(temp_reg, ptr_64(addrHigh_reg)); - cb.test(temp_reg, temp_reg); - cb.jz(FunctionExit); + Cb.mov(TempReg, ptr_64(AddrHighReg)); + Cb.test(TempReg, TempReg); + Cb.jz(FunctionExit); - cb.mov(offset_reg, + Cb.mov(OffsetReg, Imm(64)); // increment after each cache/memory access // Initialize SSE-Registers for Addition - auto add_start = 0; - auto add_end = add_regs - 1; - auto trans_start = add_regs; - auto trans_end = add_regs + trans_regs - 1; - if (add_regs > 0) { - for (int i = add_start; i <= add_end; i++) { - cb.movapd(Xmm(i), xmmword_ptr(pointer_reg, 32 * i)); + const auto AddStart = 0; + const auto AddEnd = AddRegs - 1; + const auto TransStart = AddRegs; + const auto TransEnd = AddRegs + TransRegs - 1; + if (AddRegs > 0) { + for (auto I = AddStart; I <= AddEnd; I++) { + Cb.movapd(Xmm(I), xmmword_ptr(PointerReg, 32 * I)); } } // Initialize MMX-Registers for shift operations - auto mov_start = 0; - auto mov_end = mov_regs - 1; - if (mov_regs > 0) { - cb.mov(temp_reg, Imm(0x5555555555555555)); - cb.movq(Mm(mov_start), temp_reg); - for (int i = mov_start + 1; i <= mov_end; i++) { - cb.movq(Mm(i), Mm(mov_start)); + const auto MovStart = 0; + const auto MovEnd = MovRegs - 1; + if (MovRegs > 0) { + Cb.mov(TempReg, Imm(0x5555555555555555)); + Cb.movq(Mm(MovStart), TempReg); + for (auto I = MovStart + 1; I <= MovEnd; I++) { + Cb.movq(Mm(I), Mm(MovStart)); } } // Initialize SSE-Registers for Transfer-Operations - if (trans_regs > 0) { - if (trans_start % 2 == 0) { - cb.mov(temp_reg, 
Imm(0x0F0F0F0F0F0F0F0F)); + if (TransRegs > 0) { + if (TransStart % 2 == 0) { + Cb.mov(TempReg, Imm(0x0F0F0F0F0F0F0F0F)); } else { - cb.mov(temp_reg, Imm(0xF0F0F0F0F0F0F0F0)); + Cb.mov(TempReg, Imm(0xF0F0F0F0F0F0F0F0)); } - cb.pinsrq(Xmm(trans_start), temp_reg, Imm(0)); - cb.pinsrq(Xmm(trans_start), temp_reg, Imm(1)); - for (int i = trans_start + 1; i <= trans_end; i++) { - if (i % 2 == 0) { - cb.shr(temp_reg, Imm(4)); + Cb.pinsrq(Xmm(TransStart), TempReg, Imm(0)); + Cb.pinsrq(Xmm(TransStart), TempReg, Imm(1)); + for (auto I = TransStart + 1; I <= TransEnd; I++) { + if (I % 2 == 0) { + Cb.shr(TempReg, Imm(4)); } else { - cb.shl(temp_reg, Imm(4)); + Cb.shl(TempReg, Imm(4)); } - cb.pinsrq(Xmm(i), temp_reg, Imm(0)); - cb.pinsrq(Xmm(i), temp_reg, Imm(1)); + Cb.pinsrq(Xmm(I), TempReg, Imm(0)); + Cb.pinsrq(Xmm(I), TempReg, Imm(1)); } } - cb.mov(l1_addr, pointer_reg); // address for L1-buffer - cb.mov(l2_addr, pointer_reg); - cb.add(l2_addr, Imm(l1_size)); // address for L2-buffer - cb.mov(l3_addr, pointer_reg); - cb.add(l3_addr, Imm(l2_size)); // address for L3-buffer - cb.mov(ram_addr, pointer_reg); - cb.add(ram_addr, Imm(l3_size)); // address for RAM-buffer - cb.mov(l2_count_reg, Imm(l2_loop_count)); - workerLog::trace() << "reset counter for L2-buffer with " - << l2_loop_count - << " cache line accesses per loop (" - << l2_size/1024 - << ") KiB"; - cb.mov(l3_count_reg, Imm(l3_loop_count)); - workerLog::trace() << "reset counter for L3-buffer with " - << l3_loop_count - << " cache line accesses per loop (" - << l3_size/1024 - << ") KiB"; - cb.mov(ram_count_reg, Imm(ram_loop_count)); - workerLog::trace() << "reset counter for RAM-buffer with " - << ram_loop_count - << " cache line accesses per loop (" - << ram_size/1024 - << ") KiB"; - - cb.align(AlignMode::kCode, 64); - - auto Loop = cb.newLabel(); - cb.bind(Loop); - - auto movq_dst = mov_start; - auto add_dest = add_start + 1; - auto mov_dst = trans_start; - auto mov_src = mov_dst + 1; - unsigned l1_offset = 0; - 
-#define L1_INCREMENT() \ - l1_offset += 64; \ - if (l1_offset < l1_size * 0.5) { \ - cb.add(l1_addr, offset_reg); \ - } else { \ - l1_offset = 0; \ - cb.mov(l1_addr, pointer_reg); \ - } - -#define L2_INCREMENT() cb.add(l2_addr, offset_reg); - -#define L3_INCREMENT() cb.add(l3_addr, offset_reg) - -#define RAM_INCREMENT() cb.add(ram_addr, offset_reg) - - for (unsigned count = 0; count < repetitions; count++) { - for (const auto &item : sequence) { - if (item == "REG") { - cb.addpd( - Xmm(add_dest), - Xmm(add_start + (add_dest - add_start + add_regs + 1) % add_regs)); - cb.movdqa(Xmm(mov_dst), Xmm(mov_src)); - } else if (item == "L1_L") { - cb.addpd(Xmm(add_dest), xmmword_ptr(l1_addr, 32)); - L1_INCREMENT(); - } else if (item == "L1_S") { - cb.addpd( - Xmm(add_dest), - Xmm(add_start + (add_dest - add_start + add_regs - 1) % add_regs)); - cb.movapd(xmmword_ptr(l1_addr, 32), Xmm(add_dest)); - L1_INCREMENT(); - this->_instructions++; - } else if (item == "L1_LS") { - cb.addpd(Xmm(add_dest), xmmword_ptr(l1_addr, 32)); - cb.movapd(xmmword_ptr(l1_addr, 64), Xmm(add_dest)); - L1_INCREMENT(); - this->_instructions++; - } else if (item == "L2_L") { - cb.addpd(Xmm(add_dest), xmmword_ptr(l2_addr, 64)); - L2_INCREMENT(); - } else if (item == "L2_S") { - cb.addpd( - Xmm(add_dest), - Xmm(add_start + (add_dest - add_start + add_regs - 1) % add_regs)); - cb.movapd(xmmword_ptr(l2_addr, 64), Xmm(add_dest)); - L2_INCREMENT(); - this->_instructions++; - } else if (item == "L2_LS") { - cb.addpd(Xmm(add_dest), xmmword_ptr(l2_addr, 64)); - cb.movapd(xmmword_ptr(l2_addr, 96), Xmm(add_dest)); - L2_INCREMENT(); - this->_instructions++; - } else if (item == "L3_L") { - cb.addpd(Xmm(add_dest), xmmword_ptr(l3_addr, 64)); - L3_INCREMENT(); - } else if (item == "L3_S") { - cb.addpd( - Xmm(add_dest), - Xmm(add_start + (add_dest - add_start + add_regs - 1) % add_regs)); - cb.movapd(xmmword_ptr(l3_addr, 96), Xmm(add_dest)); - L3_INCREMENT(); - this->_instructions++; - } else if (item == "L3_LS") { - 
cb.addpd(Xmm(add_dest), xmmword_ptr(l3_addr, 64)); - cb.movapd(xmmword_ptr(l3_addr, 96), Xmm(add_dest)); - L3_INCREMENT(); - this->_instructions++; - } else if (item == "L3_P") { - cb.addpd(Xmm(add_dest), xmmword_ptr(l1_addr, 32)); - cb.prefetcht0(ptr(l3_addr)); - L3_INCREMENT(); - this->_instructions++; - } else if (item == "RAM_L") { - cb.addpd(Xmm(add_dest), xmmword_ptr(ram_addr, 64)); - RAM_INCREMENT(); - } else if (item == "RAM_S") { - cb.addpd( - Xmm(add_dest), - Xmm(add_start + (add_dest - add_start + add_regs - 1) % add_regs)); - cb.movapd(xmmword_ptr(ram_addr, 64), Xmm(add_dest)); - RAM_INCREMENT(); - this->_instructions++; - } else if (item == "RAM_LS") { - cb.addpd(Xmm(add_dest), xmmword_ptr(l3_addr, 64)); - cb.movapd(xmmword_ptr(ram_addr, 64), Xmm(add_dest)); - RAM_INCREMENT(); - this->_instructions++; - } else if (item == "RAM_P") { - cb.addpd(Xmm(add_dest), xmmword_ptr(l1_addr, 32)); - cb.prefetcht2(ptr(ram_addr)); - RAM_INCREMENT(); - this->_instructions++; + Cb.mov(L1Addr, PointerReg); // address for L1-buffer + Cb.mov(L2Addr, PointerReg); + Cb.add(L2Addr, Imm(L1Size)); // address for L2-buffer + Cb.mov(L3Addr, PointerReg); + Cb.add(L3Addr, Imm(L2Size)); // address for L3-buffer + Cb.mov(RamAddr, PointerReg); + Cb.add(RamAddr, Imm(L3Size)); // address for RAM-buffer + Cb.mov(L2CountReg, Imm(L2LoopCount)); + workerLog::trace() << "reset counter for L2-buffer with " << L2LoopCount << " cache line accesses per loop (" + << L2Size / 1024 << ") KiB"; + Cb.mov(L3CountReg, Imm(L3LoopCount)); + workerLog::trace() << "reset counter for L3-buffer with " << L3LoopCount << " cache line accesses per loop (" + << L3Size / 1024 << ") KiB"; + Cb.mov(RamCountReg, Imm(RamLoopCount)); + workerLog::trace() << "reset counter for RAM-buffer with " << RamLoopCount << " cache line accesses per loop (" + << RamSize / 1024 << ") KiB"; + + Cb.align(asmjit::AlignMode::kCode, 64); + + auto Loop = Cb.newLabel(); + Cb.bind(Loop); + + auto MovqDest = MovStart; + auto AddDest = 
AddStart + 1; + auto MovDest = TransStart; + auto MovSrc = MovDest + 1; + unsigned L1Offset = 0; + + const auto L1Increment = [&Cb, &L1Offset, &L1Size, &L1Addr, &OffsetReg, &PointerReg]() { + L1Offset += 64; + if (L1Offset < L1Size * 0.5) { + Cb.add(L1Addr, OffsetReg); + } else { + L1Offset = 0; + Cb.mov(L1Addr, PointerReg); + } + }; + const auto L2Increment = [&Cb, &L2Addr, &OffsetReg]() { Cb.add(L2Addr, OffsetReg); }; + const auto L3Increment = [&Cb, &L3Addr, &OffsetReg]() { Cb.add(L3Addr, OffsetReg); }; + const auto RamIncrement = [&Cb, &RamAddr, &OffsetReg]() { Cb.add(RamAddr, OffsetReg); }; + + for (auto Count = 0U; Count < Repetitions; Count++) { + for (const auto& Item : Sequence) { + if (Item == "REG") { + Cb.addpd(Xmm(AddDest), Xmm(AddStart + ((AddDest - AddStart + AddRegs + 1) % AddRegs))); + Cb.movdqa(Xmm(MovDest), Xmm(MovSrc)); + } else if (Item == "L1_L") { + Cb.addpd(Xmm(AddDest), xmmword_ptr(L1Addr, 32)); + L1Increment(); + } else if (Item == "L1_S") { + Cb.addpd(Xmm(AddDest), Xmm(AddStart + ((AddDest - AddStart + AddRegs - 1) % AddRegs))); + Cb.movapd(xmmword_ptr(L1Addr, 32), Xmm(AddDest)); + L1Increment(); + Stats.Instructions++; + } else if (Item == "L1_LS") { + Cb.addpd(Xmm(AddDest), xmmword_ptr(L1Addr, 32)); + Cb.movapd(xmmword_ptr(L1Addr, 64), Xmm(AddDest)); + L1Increment(); + Stats.Instructions++; + } else if (Item == "L2_L") { + Cb.addpd(Xmm(AddDest), xmmword_ptr(L2Addr, 64)); + L2Increment(); + } else if (Item == "L2_S") { + Cb.addpd(Xmm(AddDest), Xmm(AddStart + ((AddDest - AddStart + AddRegs - 1) % AddRegs))); + Cb.movapd(xmmword_ptr(L2Addr, 64), Xmm(AddDest)); + L2Increment(); + Stats.Instructions++; + } else if (Item == "L2_LS") { + Cb.addpd(Xmm(AddDest), xmmword_ptr(L2Addr, 64)); + Cb.movapd(xmmword_ptr(L2Addr, 96), Xmm(AddDest)); + L2Increment(); + Stats.Instructions++; + } else if (Item == "L3_L") { + Cb.addpd(Xmm(AddDest), xmmword_ptr(L3Addr, 64)); + L3Increment(); + } else if (Item == "L3_S") { + Cb.addpd(Xmm(AddDest), Xmm(AddStart + 
((AddDest - AddStart + AddRegs - 1) % AddRegs))); + Cb.movapd(xmmword_ptr(L3Addr, 96), Xmm(AddDest)); + L3Increment(); + Stats.Instructions++; + } else if (Item == "L3_LS") { + Cb.addpd(Xmm(AddDest), xmmword_ptr(L3Addr, 64)); + Cb.movapd(xmmword_ptr(L3Addr, 96), Xmm(AddDest)); + L3Increment(); + Stats.Instructions++; + } else if (Item == "L3_P") { + Cb.addpd(Xmm(AddDest), xmmword_ptr(L1Addr, 32)); + Cb.prefetcht0(ptr(L3Addr)); + L3Increment(); + Stats.Instructions++; + } else if (Item == "RAM_L") { + Cb.addpd(Xmm(AddDest), xmmword_ptr(RamAddr, 64)); + RamIncrement(); + } else if (Item == "RAM_S") { + Cb.addpd(Xmm(AddDest), Xmm(AddStart + ((AddDest - AddStart + AddRegs - 1) % AddRegs))); + Cb.movapd(xmmword_ptr(RamAddr, 64), Xmm(AddDest)); + RamIncrement(); + Stats.Instructions++; + } else if (Item == "RAM_LS") { + Cb.addpd(Xmm(AddDest), xmmword_ptr(L3Addr, 64)); + Cb.movapd(xmmword_ptr(RamAddr, 64), Xmm(AddDest)); + RamIncrement(); + Stats.Instructions++; + } else if (Item == "RAM_P") { + Cb.addpd(Xmm(AddDest), xmmword_ptr(L1Addr, 32)); + Cb.prefetcht2(ptr(RamAddr)); + RamIncrement(); + Stats.Instructions++; } else { - workerLog::error() << "Instruction group " << item << " not found in " - << this->name() << "."; - return EXIT_FAILURE; + workerLog::error() << "Instruction group " << Item << " not found in " << name() << "."; } - if (mov_regs > 0) { - this->_instructions++; - cb.movq( - Mm(mov_start + (movq_dst - mov_start + mov_regs - 1) % mov_regs), - Mm(movq_dst)); + if constexpr (MovRegs > 0) { + Stats.Instructions++; + Cb.movq(Mm(MovStart + ((MovqDest - MovStart + MovRegs - 1) % MovRegs)), Mm(MovqDest)); } - add_dest++; - if (add_dest > add_end) { + AddDest++; + if (AddDest > AddEnd) { // DO NOT REMOVE the + 1. It serves for the good of ymm0. If it was to // be overriden, the values in the other registers would rise up to inf. 
- add_dest = add_start + 1; + AddDest = AddStart + 1; } - mov_dst++; - if (mov_dst > trans_end) { - mov_dst = trans_start; + MovDest++; + if (MovDest > TransEnd) { + MovDest = TransStart; } - mov_src++; - if (mov_src > trans_end) { - mov_src = trans_start; + MovSrc++; + if (MovSrc > TransEnd) { + MovSrc = TransStart; } - if (mov_regs > 0) { - movq_dst++; - if (movq_dst > mov_end) { - movq_dst = mov_start; + if (MovRegs > 0) { + MovqDest++; + if (MovqDest > MovEnd) { + MovqDest = MovStart; } } } } - if (this->getRAMSequenceCount(sequence) > 0) { + if (environment::payload::PayloadSettings::getRAMSequenceCount(Sequence) > 0) { // reset RAM counter - auto NoRamReset = cb.newLabel(); - - cb.sub(ram_count_reg, Imm(1)); - cb.jnz(NoRamReset); - cb.mov(ram_count_reg, Imm(ram_loop_count)); - cb.mov(ram_addr, pointer_reg); - cb.add(ram_addr, Imm(l3_size)); - cb.bind(NoRamReset); + auto NoRamReset = Cb.newLabel(); + + Cb.sub(RamCountReg, Imm(1)); + Cb.jnz(NoRamReset); + Cb.mov(RamCountReg, Imm(RamLoopCount)); + Cb.mov(RamAddr, PointerReg); + Cb.add(RamAddr, Imm(L3Size)); + Cb.bind(NoRamReset); // adds always two instruction - this->_instructions += 2; + Stats.Instructions += 2; } - if (this->getL2SequenceCount(sequence) > 0) { + if (environment::payload::PayloadSettings::getL2SequenceCount(Sequence) > 0) { // reset L2-Cache counter - auto NoL2Reset = cb.newLabel(); - - cb.sub(l2_count_reg, Imm(1)); - cb.jnz(NoL2Reset); - cb.mov(l2_count_reg, Imm(l2_loop_count)); - cb.mov(l2_addr, pointer_reg); - cb.add(l2_addr, Imm(l1_size)); - cb.bind(NoL2Reset); + auto NoL2Reset = Cb.newLabel(); + + Cb.sub(L2CountReg, Imm(1)); + Cb.jnz(NoL2Reset); + Cb.mov(L2CountReg, Imm(L2LoopCount)); + Cb.mov(L2Addr, PointerReg); + Cb.add(L2Addr, Imm(L1Size)); + Cb.bind(NoL2Reset); // adds always two instruction - this->_instructions += 2; + Stats.Instructions += 2; } - if (this->getL3SequenceCount(sequence) > 0) { + if (environment::payload::PayloadSettings::getL3SequenceCount(Sequence) > 0) { // reset 
L3-Cache counter - auto NoL3Reset = cb.newLabel(); - - cb.sub(l3_count_reg, Imm(1)); - cb.jnz(NoL3Reset); - cb.mov(l3_count_reg, Imm(l3_loop_count)); - cb.mov(l3_addr, pointer_reg); - cb.add(l3_addr, Imm(l2_size)); - cb.bind(NoL3Reset); + auto NoL3Reset = Cb.newLabel(); + + Cb.sub(L3CountReg, Imm(1)); + Cb.jnz(NoL3Reset); + Cb.mov(L3CountReg, Imm(L3LoopCount)); + Cb.mov(L3Addr, PointerReg); + Cb.add(L3Addr, Imm(L2Size)); + Cb.bind(NoL3Reset); // adds always two instruction - this->_instructions += 2; + Stats.Instructions += 2; } - cb.inc(iter_reg); // increment iteration counter - cb.mov(l1_addr, pointer_reg); - - if (dumpRegisters) { - auto SkipRegistersDump = cb.newLabel(); + Cb.inc(IterReg); // increment iteration counter + Cb.mov(L1Addr, PointerReg); - cb.test(ptr_64(pointer_reg, -8), Imm(firestarter::DumpVariable::Wait)); - cb.jnz(SkipRegistersDump); - - // dump all the xmm register - for (int i = 0; i < (int)this->registerCount(); i++) { - cb.movapd( - xmmword_ptr(pointer_reg, -64 - this->registerSize() * 8 * (i + 1)), - Xmm(i)); - } - - // set read flag - cb.mov(ptr_64(pointer_reg, -8), Imm(firestarter::DumpVariable::Wait)); - - cb.bind(SkipRegistersDump); + if (DumpRegisters) { + emitDumpRegisterCode(Cb, PointerReg, xmmword_ptr); } - if (errorDetection) { - this->emitErrorDetectionCode( - cb, iter_reg, addrHigh_reg, pointer_reg, temp_reg, temp_reg2); + if (ErrorDetection) { + emitErrorDetectionCode(Cb, IterReg, AddrHighReg, PointerReg, TempReg, TempReg2); } - cb.test(ptr_64(addrHigh_reg), Imm(LOAD_HIGH)); - cb.jnz(Loop); + Cb.test(ptr_64(AddrHighReg), Imm(LoadThreadWorkType::LoadHigh)); + Cb.jnz(Loop); - cb.bind(FunctionExit); + Cb.bind(FunctionExit); - cb.mov(rax, iter_reg); // restore iteration counter + Cb.mov(asmjit::x86::rax, IterReg); // restore iteration counter - cb.emitEpilog(frame); + Cb.emitEpilog(Frame); - cb.finalize(); + Cb.finalize(); - // String sb; - // cb.dump(sb); - - Error err = this->rt.add(&this->loadFunction, &code); - if (err) { - 
workerLog::error() << "Asmjit adding Assembler to JitRuntime failed in " - << __FILE__ << " at " << __LINE__; - return EXIT_FAILURE; - } + auto CompiledPayloadPtr = CompiledX86Payload::create(Stats, Code); // skip if we could not determine cache size - if (l1i_cache_size != 0) { - auto loopSize = code.labelOffset(FunctionExit) - code.labelOffset(Loop); - auto instructionCachePercentage = 100 * loopSize / l1i_cache_size; + if (L1iCacheSize) { + auto LoopSize = Code.labelOffset(FunctionExit) - Code.labelOffset(Loop); + auto InstructionCachePercentage = 100 * LoopSize / *L1iCacheSize; - if (loopSize > l1i_cache_size) { + if (LoopSize > *L1iCacheSize) { workerLog::warn() << "Work-loop is bigger than the L1i-Cache."; } - workerLog::trace() << "Using " << loopSize << " of " << l1i_cache_size - << " Bytes (" << instructionCachePercentage + workerLog::trace() << "Using " << LoopSize << " of " << *L1iCacheSize << " Bytes (" << InstructionCachePercentage << "%) from the L1i-Cache for the work-loop."; - workerLog::trace() << "Sequence size: " << sequence.size(); - workerLog::trace() << "Repetition count: " << repetitions; + workerLog::trace() << "Sequence size: " << Sequence.size(); + workerLog::trace() << "Repetition count: " << Repetitions; } - return EXIT_SUCCESS; + return CompiledPayloadPtr; } -std::list SSE2Payload::getAvailableInstructions() const { - std::list instructions; - - transform(this->instructionFlops.begin(), this->instructionFlops.end(), - back_inserter(instructions), - [](const auto &item) { return item.first; }); - - return instructions; +void SSE2Payload::init(double* MemoryAddr, uint64_t BufferSize) const { + X86Payload::initMemory(MemoryAddr, BufferSize, 1.654738925401e-10, 1.654738925401e-15); } -void SSE2Payload::init(unsigned long long *memoryAddr, - unsigned long long bufferSize) { - X86Payload::init(memoryAddr, bufferSize, 1.654738925401e-10, - 1.654738925401e-15); -} +} // namespace firestarter::environment::x86::payload \ No newline at end of 
file diff --git a/src/firestarter/Environment/X86/Payload/X86Payload.cpp b/src/firestarter/Environment/X86/Payload/X86Payload.cpp index 42a2fa5b..296d1052 100644 --- a/src/firestarter/Environment/X86/Payload/X86Payload.cpp +++ b/src/firestarter/Environment/X86/Payload/X86Payload.cpp @@ -19,468 +19,76 @@ * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ +#include "firestarter/Environment/X86/Payload/X86Payload.hpp" +#include "firestarter/Constants.hpp" +#include "firestarter/WindowsCompat.hpp" + +#include #include #include -#include - -#ifdef _MSC_VER -#include -#include -#endif - -#include -using namespace firestarter::environment::x86::payload; +namespace firestarter::environment::x86::payload { -void X86Payload::lowLoadFunction(volatile unsigned long long *addrHigh, - unsigned long long period) { - int nap; -#ifdef _MSC_VER - std::array cpuid; -#endif +void X86Payload::lowLoadFunction(volatile LoadThreadWorkType& LoadVar, std::chrono::microseconds Period) const { + auto Nap = Period / 100; - nap = period / 100; -#ifndef _MSC_VER - __asm__ __volatile__("mfence;" - "cpuid;" :: - : "eax", "ebx", "ecx", "edx"); -#else - _mm_mfence(); - __cpuid(cpuid.data(), 0); -#endif - // while signal low load - while (*addrHigh == LOAD_LOW) { -#ifndef _MSC_VER - __asm__ __volatile__("mfence;" - "cpuid;" :: - : "eax", "ebx", "ecx", "edx"); -#else + if constexpr (firestarter::OptionalFeatures.IsMsc) { + std::array Cpuid{}; _mm_mfence(); - __cpuid(cpuid.data(), 0); -#endif - std::this_thread::sleep_for(std::chrono::microseconds(nap)); -#ifndef _MSC_VER + __cpuid(Cpuid.data(), 0); + } else { __asm__ __volatile__("mfence;" "cpuid;" :: : "eax", "ebx", "ecx", "edx"); -#else - _mm_mfence(); - __cpuid(cpuid.data(), 0); -#endif - } -} - -void X86Payload::init(unsigned long long *memoryAddr, - unsigned long long bufferSize, double firstValue, - double lastValue) { - unsigned long long i = 0; - - for (; i < 
INIT_BLOCKSIZE; i++) - *((double *)(memoryAddr + i)) = 0.25 + (double)i * 8.0 * firstValue; - for (; i <= bufferSize - INIT_BLOCKSIZE; i += INIT_BLOCKSIZE) - std::memcpy(memoryAddr + i, memoryAddr + i - INIT_BLOCKSIZE, - sizeof(unsigned long long) * INIT_BLOCKSIZE); - for (; i < bufferSize; i++) - *((double *)(memoryAddr + i)) = 0.25 + (double)i * 8.0 * lastValue; -} - -unsigned long long -X86Payload::highLoadFunction(unsigned long long *addrMem, - volatile unsigned long long *addrHigh, - unsigned long long iterations) { - return this->loadFunction(addrMem, addrHigh, iterations); -} - -// add MM regs to dirty regs -// zmm31 is used for backup if VectorReg is of type asmjit::x86::Zmm -template -void X86Payload::emitErrorDetectionCode(asmjit::x86::Builder &cb, - IterReg iter_reg, - asmjit::x86::Gpq addrHigh_reg, - asmjit::x86::Gpq pointer_reg, - asmjit::x86::Gpq temp_reg, - asmjit::x86::Gpq temp_reg2) { - // we don't want anything to break... so we use asserts for everything that - // could break it - static_assert(std::is_base_of::value, - "VectorReg must be of asmjit::asmjit::x86::Vec"); - static_assert(std::is_same::value || - std::is_same::value || - std::is_same::value, - "VectorReg ist not of any supported type"); - static_assert(std::is_same::value || - std::is_same::value, - "IterReg is not of any supported type"); - - if constexpr (std::is_same::value) { - assert((iter_reg == asmjit::x86::mm0, "iter_reg must be mm0")); - } - - assert((iter_reg != temp_reg, "iter_reg must be != temp_reg")); - assert((temp_reg != temp_reg2, "temp_reg must be != temp_reg2")); - assert((temp_reg != addrHigh_reg, "temp_reg must be != addrHigh_reg")); - assert((temp_reg != pointer_reg, "temp_reg must be != pointer_reg")); - - assert((iter_reg != asmjit::x86::r8, "iter_reg must be != r8")); - assert((iter_reg != asmjit::x86::r9, "iter_reg must be != r9")); - assert((iter_reg != asmjit::x86::rax, "iter_reg must be != rax")); - assert((iter_reg != asmjit::x86::rbx, "iter_reg must be 
!= rbx")); - assert((iter_reg != asmjit::x86::rcx, "iter_reg must be != rcx")); - assert((iter_reg != asmjit::x86::rdx, "iter_reg must be != rdx")); - - assert((temp_reg != asmjit::x86::r8, "temp_reg must be != r8")); - assert((temp_reg != asmjit::x86::r9, "temp_reg must be != r9")); - assert((temp_reg != asmjit::x86::rax, "temp_reg must be != rax")); - assert((temp_reg != asmjit::x86::rbx, "temp_reg must be != rbx")); - assert((temp_reg != asmjit::x86::rcx, "temp_reg must be != rcx")); - assert((temp_reg != asmjit::x86::rdx, "temp_reg must be != rdx")); - - assert((temp_reg2 != asmjit::x86::r8, "temp_reg2 must be != r8")); - assert((temp_reg2 != asmjit::x86::r9, "temp_reg2 must be != r9")); - assert((temp_reg2 != asmjit::x86::rax, "temp_reg2 must be != rax")); - assert((temp_reg2 != asmjit::x86::rbx, "temp_reg2 must be != rbx")); - assert((temp_reg2 != asmjit::x86::rcx, "temp_reg2 must be != rcx")); - assert((temp_reg2 != asmjit::x86::rdx, "temp_reg2 must be != rdx")); - - assert((addrHigh_reg != asmjit::x86::r8, "addrHigh_reg must be != r8")); - assert((addrHigh_reg != asmjit::x86::r9, "addrHigh_reg must be != r9")); - assert((addrHigh_reg != asmjit::x86::rax, "addrHigh_reg must be != rax")); - assert((addrHigh_reg != asmjit::x86::rbx, "addrHigh_reg must be != rbx")); - assert((addrHigh_reg != asmjit::x86::rcx, "addrHigh_reg must be != rcx")); - assert((addrHigh_reg != asmjit::x86::rdx, "addrHigh_reg must be != rdx")); - - auto SkipErrorDetection = cb.newLabel(); - - if constexpr (std::is_same::value) { - cb.movq(temp_reg, iter_reg); - } else { - cb.mov(temp_reg, iter_reg); - } - // round about 50-100 Hz - // more or less, but this isn't really that relevant - cb.and_(temp_reg, asmjit::Imm(0x3fff)); - cb.test(temp_reg, temp_reg); - cb.jnz(SkipErrorDetection); - - cb.mov(temp_reg, asmjit::Imm(0xffffffff)); - - int registerCount = (int)this->registerCount(); - - // Create a backup of VectorReg(0) - if constexpr (std::is_same::value) { - cb.movq(temp_reg2, 
asmjit::x86::xmm0); - cb.push(temp_reg2); - cb.crc32(temp_reg, temp_reg2); - cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0); - cb.movq(temp_reg2, asmjit::x86::xmm0); - cb.push(temp_reg2); - cb.crc32(temp_reg, temp_reg2); - - } else if constexpr (std::is_same::value && - std::is_same::value) { - cb.movq(temp_reg2, asmjit::x86::xmm0); - cb.movq(asmjit::x86::Mm(7), temp_reg2); - cb.crc32(temp_reg, temp_reg2); - cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0); - cb.movq(temp_reg2, asmjit::x86::xmm0); - cb.movq(asmjit::x86::Mm(6), temp_reg2); - cb.crc32(temp_reg, temp_reg2); - - cb.vextractf128(asmjit::x86::xmm0, asmjit::x86::ymm0, asmjit::Imm(1)); - - cb.movq(temp_reg2, asmjit::x86::xmm0); - cb.movq(asmjit::x86::Mm(5), temp_reg2); - cb.crc32(temp_reg, temp_reg2); - cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0); - cb.movq(temp_reg2, asmjit::x86::xmm0); - cb.movq(asmjit::x86::Mm(4), temp_reg2); - cb.crc32(temp_reg, temp_reg2); - } else if constexpr (std::is_same::value && - std::is_same::value) { - // We use vector registers zmm31 for our backup - cb.vmovapd(asmjit::x86::zmm31, asmjit::x86::zmm0); - registerCount--; } - // Calculate the hash of the remaining VectorReg - // use VectorReg(0) as a temporary place to unpack values - for (int i = 1; i < registerCount; i++) { - if constexpr (std::is_same::value) { - cb.vmovapd(asmjit::x86::xmm0, asmjit::x86::Xmm(i)); - - cb.movq(temp_reg2, asmjit::x86::xmm0); - cb.crc32(temp_reg, temp_reg2); - cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0); - cb.movq(temp_reg2, asmjit::x86::xmm0); - cb.crc32(temp_reg, temp_reg2); - } else if constexpr (std::is_same::value) { - cb.vmovapd(asmjit::x86::ymm0, asmjit::x86::Ymm(i)); - - cb.movq(temp_reg2, asmjit::x86::xmm0); - cb.crc32(temp_reg, temp_reg2); - cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0); - cb.movq(temp_reg2, asmjit::x86::xmm0); - cb.crc32(temp_reg, temp_reg2); - - cb.vextractf128(asmjit::x86::xmm0, asmjit::x86::ymm0, asmjit::Imm(1)); - - cb.movq(temp_reg2, 
asmjit::x86::xmm0); - cb.crc32(temp_reg, temp_reg2); - cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0); - cb.movq(temp_reg2, asmjit::x86::xmm0); - cb.crc32(temp_reg, temp_reg2); - } else if constexpr (std::is_same::value) { - cb.vmovapd(asmjit::x86::ymm0, asmjit::x86::Ymm(i)); - - cb.movq(temp_reg2, asmjit::x86::xmm0); - cb.crc32(temp_reg, temp_reg2); - cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0); - cb.movq(temp_reg2, asmjit::x86::xmm0); - cb.crc32(temp_reg, temp_reg2); - - cb.vextractf128(asmjit::x86::xmm0, asmjit::x86::ymm0, asmjit::Imm(1)); - - cb.movq(temp_reg2, asmjit::x86::xmm0); - cb.crc32(temp_reg, temp_reg2); - cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0); - cb.movq(temp_reg2, asmjit::x86::xmm0); - cb.crc32(temp_reg, temp_reg2); - - cb.vextractf32x4(asmjit::x86::xmm0, asmjit::x86::Zmm(i), asmjit::Imm(2)); - - cb.movq(temp_reg2, asmjit::x86::xmm0); - cb.crc32(temp_reg, temp_reg2); - cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0); - cb.movq(temp_reg2, asmjit::x86::xmm0); - cb.crc32(temp_reg, temp_reg2); - - cb.vextractf32x4(asmjit::x86::xmm0, asmjit::x86::Zmm(i), asmjit::Imm(3)); - - cb.movq(temp_reg2, asmjit::x86::xmm0); - cb.crc32(temp_reg, temp_reg2); - cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0); - cb.movq(temp_reg2, asmjit::x86::xmm0); - cb.crc32(temp_reg, temp_reg2); + // while signal low load + while (LoadVar == LoadThreadWorkType::LoadLow) { + if constexpr (firestarter::OptionalFeatures.IsMsc) { + std::array Cpuid{}; + _mm_mfence(); + __cpuid(Cpuid.data(), 0); + } else { + __asm__ __volatile__("mfence;" + "cpuid;" :: + : "eax", "ebx", "ecx", "edx"); + } + std::this_thread::sleep_for(Nap); + if constexpr (firestarter::OptionalFeatures.IsMsc) { + std::array Cpuid{}; + _mm_mfence(); + __cpuid(Cpuid.data(), 0); + } else { + __asm__ __volatile__("mfence;" + "cpuid;" :: + : "eax", "ebx", "ecx", "edx"); } } +} - // Restore VectorReg(0) from backup - if constexpr (std::is_same::value) { - cb.pop(temp_reg2); - cb.movq(asmjit::x86::xmm0, 
temp_reg2); - cb.movlhps(asmjit::x86::xmm0, asmjit::x86::xmm0); - cb.pop(temp_reg2); - cb.pinsrw(asmjit::x86::xmm0, temp_reg2.r32(), asmjit::Imm(0)); - cb.shr(temp_reg2, asmjit::Imm(32)); - cb.movd(temp_reg2.r32(), asmjit::x86::Mm(7)); - cb.pinsrw(asmjit::x86::xmm0, temp_reg2.r32(), asmjit::Imm(1)); - } else if constexpr (std::is_same::value && - std::is_same::value) { - cb.movq(temp_reg2, asmjit::x86::Mm(5)); - cb.movq(asmjit::x86::xmm0, temp_reg2); - cb.movq(temp_reg2, asmjit::x86::Mm(4)); - cb.pinsrq(asmjit::x86::xmm0, temp_reg2, asmjit::Imm(1)); - - cb.vinsertf128(asmjit::x86::ymm0, asmjit::x86::ymm0, asmjit::x86::xmm0, - asmjit::Imm(1)); - - cb.movq(temp_reg2, asmjit::x86::Mm(7)); - cb.movq(asmjit::x86::xmm0, temp_reg2); - cb.movq(temp_reg2, asmjit::x86::Mm(6)); - cb.pinsrq(asmjit::x86::xmm0, temp_reg2, asmjit::Imm(1)); - } else if constexpr (std::is_same::value && - std::is_same::value) { - // We use vector registers zmm31 for our backup - cb.vmovapd(asmjit::x86::zmm0, asmjit::x86::zmm31); - } +void X86Payload::initMemory(double* MemoryAddr, uint64_t BufferSize, double FirstValue, double LastValue) { + uint64_t I = 0; - // before starting the communication, backup r8, r9, rax, rbx, rcx and rdx - if constexpr (std::is_same::value) { - cb.movq(asmjit::x86::Mm(7), asmjit::x86::rax); - cb.movq(asmjit::x86::Mm(6), asmjit::x86::rbx); - cb.movq(asmjit::x86::Mm(5), asmjit::x86::rcx); - cb.movq(asmjit::x86::Mm(4), asmjit::x86::rdx); - cb.movq(asmjit::x86::Mm(3), asmjit::x86::r8); - cb.movq(asmjit::x86::Mm(2), asmjit::x86::r9); - } else { - cb.push(asmjit::x86::rax); - cb.push(asmjit::x86::rbx); - cb.push(asmjit::x86::rcx); - cb.push(asmjit::x86::rdx); - cb.push(asmjit::x86::r8); - cb.push(asmjit::x86::r9); + // NOLINTBEGIN(cppcoreguidelines-pro-bounds-pointer-arithmetic) + for (; I < InitBlocksize; I++) { + MemoryAddr[I] = 0.25 + static_cast(I) * 8.0 * FirstValue; } - - // do the actual communication - // temp_reg contains our hash - - // save the pointer_reg. 
it might be any of r8, r9, rax, rbx, rcx or rdx - cb.mov(temp_reg2, pointer_reg); - - // Don't touch me! - // This sychronization and communication works even if the threads run at - // different (changing) speed, with just one "lock cmpxchg16b" Brought to you - // by a few hours of headache for two people. - auto communication = [&](auto offset) { - // communication - cb.mov(asmjit::x86::r8, asmjit::x86::ptr_64(temp_reg2, offset)); - - // temp data - cb.mov(asmjit::x86::r9, temp_reg2); - cb.add(asmjit::x86::r9, asmjit::Imm(offset + 8)); - - cb.mov(asmjit::x86::rdx, asmjit::x86::ptr_64(asmjit::x86::r9, 0)); - cb.mov(asmjit::x86::rax, asmjit::x86::ptr_64(asmjit::x86::r9, 8)); - - auto L0 = cb.newLabel(); - cb.bind(L0); - - cb.lock(); - cb.cmpxchg16b(asmjit::x86::ptr(asmjit::x86::r8)); - - auto L1 = cb.newLabel(); - cb.jnz(L1); - - cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, 0), asmjit::x86::rcx); - cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, 8), asmjit::x86::rbx); - cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, 16), asmjit::Imm(0)); - cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, 24), asmjit::Imm(0)); - - cb.mov(asmjit::x86::rax, asmjit::Imm(2)); - - auto L6 = cb.newLabel(); - cb.jmp(L6); - - cb.bind(L1); - - cb.cmp(asmjit::x86::rcx, asmjit::x86::rdx); - - auto L2 = cb.newLabel(); - cb.jle(L2); - - cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, 0), asmjit::x86::rcx); - cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, 8), asmjit::x86::rbx); - - cb.jmp(L0); - - cb.bind(L2); - - auto L3 = cb.newLabel(); - - cb.cmp(asmjit::x86::ptr_64(asmjit::x86::r9, 16), asmjit::Imm(0)); - cb.jne(L3); - cb.cmp(asmjit::x86::ptr_64(asmjit::x86::r9, 24), asmjit::Imm(0)); - cb.jne(L3); - - cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, 16), asmjit::x86::rdx); - cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, 24), asmjit::x86::rax); - - cb.bind(L3); - - cb.cmp(asmjit::x86::rcx, asmjit::x86::ptr_64(asmjit::x86::r9, 16)); - cb.mov(asmjit::x86::rax, asmjit::Imm(4)); - cb.jne(L6); - - cb.cmp(asmjit::x86::rbx, 
asmjit::x86::ptr_64(asmjit::x86::r9, 24)); - auto L4 = cb.newLabel(); - cb.jne(L4); - - cb.mov(asmjit::x86::rax, asmjit::Imm(0)); - - auto L5 = cb.newLabel(); - cb.jmp(L5); - - cb.bind(L4); - - cb.mov(asmjit::x86::rax, asmjit::Imm(1)); - - cb.bind(L5); - - cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, 16), asmjit::Imm(0)); - cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, 24), asmjit::Imm(0)); - - cb.bind(L6); - - // if check failed - cb.cmp(asmjit::x86::rax, asmjit::Imm(1)); - auto L7 = cb.newLabel(); - cb.jne(L7); - - // write the error flag - cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, 32), asmjit::Imm(1)); - - // stop the execution after some time - cb.mov(asmjit::x86::ptr_64(addrHigh_reg), asmjit::Imm(LOAD_STOP)); - cb.mfence(); - - cb.bind(L7); - - auto L9 = cb.newLabel(); - cb.jmp(L9); - }; - - // left communication - // move hash - cb.mov(asmjit::x86::rbx, temp_reg); - // move iterations counter - if constexpr (std::is_same::value) { - cb.movq(asmjit::x86::rcx, iter_reg); - } else { - cb.mov(asmjit::x86::rcx, iter_reg); + for (; I <= BufferSize - InitBlocksize; I += InitBlocksize) { + std::memcpy(MemoryAddr + I, MemoryAddr + I - InitBlocksize, sizeof(uint64_t) * InitBlocksize); } - - communication(-128); - - // right communication - // move hash - cb.mov(asmjit::x86::rbx, temp_reg); - // move iterations counter - if constexpr (std::is_same::value) { - cb.movq(asmjit::x86::rcx, iter_reg); - } else { - cb.mov(asmjit::x86::rcx, iter_reg); + for (; I < BufferSize; I++) { + MemoryAddr[I] = 0.25 + static_cast(I) * 8.0 * LastValue; } + // NOLINTEND(cppcoreguidelines-pro-bounds-pointer-arithmetic) +} - communication(-64); +auto X86Payload::getAvailableInstructions() const -> std::list { + std::list Instructions; - // restore r8, r9, rax, rbx, rcx and rdx - if constexpr (std::is_same::value) { - cb.movq(asmjit::x86::rax, asmjit::x86::Mm(7)); - cb.movq(asmjit::x86::rbx, asmjit::x86::Mm(6)); - cb.movq(asmjit::x86::rcx, asmjit::x86::Mm(5)); - cb.movq(asmjit::x86::rdx, 
asmjit::x86::Mm(4)); - cb.movq(asmjit::x86::r8, asmjit::x86::Mm(3)); - cb.movq(asmjit::x86::r9, asmjit::x86::Mm(2)); - } else { - cb.pop(asmjit::x86::r9); - cb.pop(asmjit::x86::r8); - cb.pop(asmjit::x86::rdx); - cb.pop(asmjit::x86::rcx); - cb.pop(asmjit::x86::rbx); - cb.pop(asmjit::x86::rax); - } + transform(InstructionFlops.begin(), InstructionFlops.end(), back_inserter(Instructions), + [](const auto& Item) { return Item.first; }); - cb.bind(SkipErrorDetection); + return Instructions; } -template void -X86Payload::emitErrorDetectionCode( - asmjit::x86::Builder &cb, asmjit::x86::Gpq iter_reg, - asmjit::x86::Gpq addrHigh_reg, asmjit::x86::Gpq pointer_reg, - asmjit::x86::Gpq temp_reg, asmjit::x86::Gpq temp_reg2); -template void -X86Payload::emitErrorDetectionCode( - asmjit::x86::Builder &cb, asmjit::x86::Gpq iter_reg, - asmjit::x86::Gpq addrHigh_reg, asmjit::x86::Gpq pointer_reg, - asmjit::x86::Gpq temp_reg, asmjit::x86::Gpq temp_reg2); - -template void -X86Payload::emitErrorDetectionCode( - asmjit::x86::Builder &cb, asmjit::x86::Mm iter_reg, - asmjit::x86::Gpq addrHigh_reg, asmjit::x86::Gpq pointer_reg, - asmjit::x86::Gpq temp_reg, asmjit::x86::Gpq temp_reg2); -template void -X86Payload::emitErrorDetectionCode( - asmjit::x86::Builder &cb, asmjit::x86::Mm iter_reg, - asmjit::x86::Gpq addrHigh_reg, asmjit::x86::Gpq pointer_reg, - asmjit::x86::Gpq temp_reg, asmjit::x86::Gpq temp_reg2); +}; // namespace firestarter::environment::x86::payload \ No newline at end of file diff --git a/src/firestarter/Environment/X86/Payload/ZENFMAPayload.cpp b/src/firestarter/Environment/X86/Payload/ZENFMAPayload.cpp index 9e99ca2d..4857f82d 100644 --- a/src/firestarter/Environment/X86/Payload/ZENFMAPayload.cpp +++ b/src/firestarter/Environment/X86/Payload/ZENFMAPayload.cpp @@ -19,423 +19,361 @@ * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ -#include -#include - -#include -#include - -using namespace 
firestarter::environment::x86::payload; -using namespace asmjit; -using namespace asmjit::x86; - -int ZENFMAPayload::compilePayload( - std::vector> const &proportion, - unsigned instructionCacheSize, - std::list const &dataCacheBufferSize, unsigned ramBufferSize, - unsigned thread, unsigned numberOfLines, bool dumpRegisters, - bool errorDetection) { +#include "firestarter/Environment/X86/Payload/ZENFMAPayload.hpp" +#include "firestarter/Environment/X86/Payload/CompiledX86Payload.hpp" + +namespace firestarter::environment::x86::payload { + +auto ZENFMAPayload::compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters, + bool ErrorDetection) const -> environment::payload::CompiledPayload::UniquePtr { + using Imm = asmjit::Imm; + using Xmm = asmjit::x86::Xmm; + using Ymm = asmjit::x86::Ymm; + // NOLINTNEXTLINE(readability-identifier-naming) + constexpr asmjit::x86::Mem (*xmmword_ptr)(const asmjit::x86::Gp&, int32_t) = asmjit::x86::xmmword_ptr; + // Compute the sequence of instruction groups and the number of its repetions // to reach the desired size - auto sequence = this->generateSequence(proportion); - auto repetitions = - this->getNumberOfSequenceRepetitions(sequence, numberOfLines / thread); + auto Sequence = Settings.sequence(); + auto Repetitions = + environment::payload::PayloadSettings::getNumberOfSequenceRepetitions(Sequence, Settings.linesPerThread()); // compute count of flops and memory access for performance report - unsigned flops = 0; - unsigned bytes = 0; + environment::payload::PayloadStats Stats; - for (const auto &item : sequence) { - auto it = this->instructionFlops.find(item); + for (const auto& Item : Sequence) { + auto It = instructionFlops().find(Item); - if (it == this->instructionFlops.end()) { - workerLog::error() << "Instruction group " << item << " undefined in " - << name() << "."; - return EXIT_FAILURE; + if (It == instructionFlops().end()) { + workerLog::error() << "Instruction group " << Item << " 
undefined in " << name() << "."; } - flops += it->second; + Stats.Flops += It->second; - it = this->instructionMemory.find(item); + It = instructionMemory().find(Item); - if (it != this->instructionMemory.end()) { - bytes += it->second; + if (It != instructionMemory().end()) { + Stats.Bytes += It->second; } } - this->_flops = repetitions * flops; - this->_bytes = repetitions * bytes; - this->_instructions = repetitions * sequence.size() * 4 + 6; + Stats.Flops *= Repetitions; + Stats.Bytes *= Repetitions; + Stats.Instructions = Repetitions * Sequence.size() * 4 + 6; // calculate the buffer sizes - auto l1i_cache_size = instructionCacheSize / thread; - auto dataCacheBufferSizeIterator = dataCacheBufferSize.begin(); - auto l1_size = *dataCacheBufferSizeIterator / thread; - std::advance(dataCacheBufferSizeIterator, 1); - auto l2_size = *dataCacheBufferSizeIterator / thread; - std::advance(dataCacheBufferSizeIterator, 1); - auto l3_size = *dataCacheBufferSizeIterator / thread; - auto ram_size = ramBufferSize / thread; + const auto L1iCacheSize = Settings.instructionCacheSizePerThread(); + const auto DataCacheBufferSizes = Settings.dataCacheBufferSizePerThread(); + auto DataCacheBufferSizeIterator = DataCacheBufferSizes.begin(); + const auto L1Size = *DataCacheBufferSizeIterator; + std::advance(DataCacheBufferSizeIterator, 1); + const auto L2Size = *DataCacheBufferSizeIterator; + std::advance(DataCacheBufferSizeIterator, 1); + const auto L3Size = *DataCacheBufferSizeIterator; + const auto RamSize = Settings.ramBufferSizePerThread(); // calculate the reset counters for the buffers - auto l2_loop_count = - getL2LoopCount(sequence, numberOfLines, l2_size * thread, thread); - auto l3_loop_count = - getL3LoopCount(sequence, numberOfLines, l3_size * thread, thread); - auto ram_loop_count = - getRAMLoopCount(sequence, numberOfLines, ram_size * thread, thread); - - CodeHolder code; - code.init(this->rt.environment()); - - if (nullptr != this->loadFunction) { - 
this->rt.release(&this->loadFunction); - } - - Builder cb(&code); - cb.addDiagnosticOptions( - asmjit::DiagnosticOptions::kValidateAssembler | - asmjit::DiagnosticOptions::kValidateIntermediate ); - - auto pointer_reg = rax; - auto l1_addr = rbx; - auto l2_addr = rcx; - auto l3_addr = r8; - auto ram_addr = r9; - auto l2_count_reg = r10; - auto l3_count_reg = r11; - auto ram_count_reg = r12; - auto temp_reg = r13; - auto temp_reg2 = rbp; - auto offset_reg = r14; - auto addrHigh_reg = r15; - auto iter_reg = mm0; - auto shift_reg = std::vector({rdi, rsi, rdx}); - auto nr_shift_regs = 3; - auto nr_add_regs = 11; - auto ram_reg = ymm15; - - FuncDetail func; - func.init(FuncSignatureT( - CallConvId::kCDecl), - this->rt.environment()); - - FuncFrame frame; - frame.init(func); + const auto L2LoopCount = + environment::payload::PayloadSettings::getL2LoopCount(Sequence, Settings.linesPerThread(), L2Size); + const auto L3LoopCount = + environment::payload::PayloadSettings::getL3LoopCount(Sequence, Settings.linesPerThread(), L3Size); + const auto RamLoopCount = + environment::payload::PayloadSettings::getRAMLoopCount(Sequence, Settings.linesPerThread(), RamSize); + + asmjit::CodeHolder Code; + Code.init(asmjit::Environment::host()); + + asmjit::x86::Builder Cb(&Code); + Cb.addDiagnosticOptions(asmjit::DiagnosticOptions::kValidateAssembler | + asmjit::DiagnosticOptions::kValidateIntermediate); + + auto PointerReg = asmjit::x86::rax; + auto L1Addr = asmjit::x86::rbx; + auto L2Addr = asmjit::x86::rcx; + auto L3Addr = asmjit::x86::r8; + auto RamAddr = asmjit::x86::r9; + auto L2CountReg = asmjit::x86::r10; + auto L3CountReg = asmjit::x86::r11; + auto RamCountReg = asmjit::x86::r12; + auto TempReg = asmjit::x86::r13; + auto TempReg2 = asmjit::x86::rbp; + auto OffsetReg = asmjit::x86::r14; + auto AddrHighReg = asmjit::x86::r15; + auto IterReg = asmjit::x86::mm0; + auto ShiftRegs = std::vector({asmjit::x86::rdi, asmjit::x86::rsi, asmjit::x86::rdx}); + auto NbShiftRegs = 3; + auto 
NbAddRegs = 11; + auto RamReg = asmjit::x86::ymm15; + + asmjit::FuncDetail Func; + Func.init(asmjit::FuncSignature::build( + asmjit::CallConvId::kCDecl), + Code.environment()); + + asmjit::FuncFrame Frame; + Frame.init(Func); // make (x|y)mm registers dirty - for (int i = 0; i < 16; i++) { - frame.addDirtyRegs(Ymm(i)); + for (auto I = 0U; I < 16U; I++) { + Frame.addDirtyRegs(asmjit::x86::Ymm(I)); } - for (int i = 0; i < 8; i++) { - frame.addDirtyRegs(Mm(i)); + for (auto I = 0U; I < 8U; I++) { + Frame.addDirtyRegs(asmjit::x86::Mm(I)); } // make all other used registers dirty except RAX - frame.addDirtyRegs(l1_addr, l2_addr, l3_addr, ram_addr, l2_count_reg, - l3_count_reg, ram_count_reg, temp_reg, temp_reg2, - offset_reg, addrHigh_reg, iter_reg, ram_addr); - for (const auto ® : shift_reg) { - frame.addDirtyRegs(reg); + Frame.addDirtyRegs(L1Addr, L2Addr, L3Addr, RamAddr, L2CountReg, L3CountReg, RamCountReg, TempReg, TempReg2, OffsetReg, + AddrHighReg, IterReg, RamAddr); + for (const auto& Reg : ShiftRegs) { + Frame.addDirtyRegs(Reg); } - FuncArgsAssignment args(&func); + asmjit::FuncArgsAssignment Args(&Func); // FIXME: asmjit assigment to mm0 does not seem to be supported - args.assignAll(pointer_reg, addrHigh_reg, temp_reg); - args.updateFuncFrame(frame); - frame.finalize(); + Args.assignAll(PointerReg, AddrHighReg, TempReg); + Args.updateFuncFrame(Frame); + Frame.finalize(); - cb.emitProlog(frame); - cb.emitArgsAssignment(frame, args); + Cb.emitProlog(Frame); + Cb.emitArgsAssignment(Frame, Args); // FIXME: movq from temp_reg to iter_reg - cb.movq(iter_reg, temp_reg); + Cb.movq(IterReg, TempReg); // stop right away if low load is selected - auto FunctionExit = cb.newLabel(); + auto FunctionExit = Cb.newLabel(); - cb.mov(temp_reg, ptr_64(addrHigh_reg)); - cb.test(temp_reg, temp_reg); - cb.jz(FunctionExit); + Cb.mov(TempReg, ptr_64(AddrHighReg)); + Cb.test(TempReg, TempReg); + Cb.jz(FunctionExit); - cb.mov(offset_reg, + Cb.mov(OffsetReg, Imm(64)); // increment after 
each cache/memory access // Initialize registers for shift operations - for (auto const ® : shift_reg) { - cb.mov(reg, Imm(0xAAAAAAAAAAAAAAAA)); + for (auto const& Reg : ShiftRegs) { + Cb.mov(Reg, Imm(0xAAAAAAAAAAAAAAAA)); } // Initialize AVX-Registers for FMA Operations - cb.vmovapd(ymm0, ymmword_ptr(pointer_reg)); - cb.vmovapd(ymm1, ymmword_ptr(pointer_reg, 32)); + Cb.vmovapd(asmjit::x86::ymm0, ymmword_ptr(PointerReg)); + Cb.vmovapd(asmjit::x86::ymm1, ymmword_ptr(PointerReg, 32)); - auto add_regs_start = 2; - auto add_regs_end = add_regs_start + nr_add_regs - 1; - for (int i = add_regs_start; i <= add_regs_end; i++) { - cb.vmovapd(Ymm(i), ymmword_ptr(pointer_reg, 256 + i * 32)); + auto AddRegsStart = 2; + auto AddRegsEnd = AddRegsStart + NbAddRegs - 1; + for (auto I = AddRegsStart; I <= AddRegsEnd; I++) { + Cb.vmovapd(Ymm(I), ymmword_ptr(PointerReg, 256 + (I * 32))); } // Initialize xmm14 for shift operation // cb.mov(temp_reg, Imm(1)); // cb.movd(temp_reg, Xmm(14)); - cb.movd(shift_reg[0], Xmm(13)); - cb.vbroadcastss(Xmm(13), Xmm(13)); - cb.vmovapd(Xmm(14), Xmm(13)); - cb.vpsrlq(Xmm(14), Xmm(14), Imm(1)); - - cb.mov(l1_addr, pointer_reg); // address for L1-buffer - cb.mov(l2_addr, pointer_reg); - cb.add(l2_addr, Imm(l1_size)); // address for L2-buffer - cb.mov(l3_addr, pointer_reg); - cb.add(l3_addr, Imm(l2_size)); // address for L3-buffer - cb.mov(ram_addr, pointer_reg); - cb.add(ram_addr, Imm(l3_size)); // address for RAM-buffer - cb.mov(l2_count_reg, Imm(l2_loop_count)); - workerLog::trace() << "reset counter for L2-buffer with " - << l2_loop_count - << " cache line accesses per loop (" - << l2_size/1024 - << ") KiB"; - cb.mov(l3_count_reg, Imm(l3_loop_count)); - workerLog::trace() << "reset counter for L3-buffer with " - << l3_loop_count - << " cache line accesses per loop (" - << l3_size/1024 - << ") KiB"; - cb.mov(ram_count_reg, Imm(ram_loop_count)); - workerLog::trace() << "reset counter for RAM-buffer with " - << ram_loop_count - << " cache line accesses 
per loop (" - << ram_size/1024 - << ") KiB"; - - cb.align(AlignMode::kCode, 64); - - auto Loop = cb.newLabel(); - cb.bind(Loop); - - auto shift_pos = 0; - bool left = false; - auto itemCount = 0; - auto add_dest = add_regs_start; - unsigned l1_offset = 0; - -#define L1_INCREMENT() \ - l1_offset += 64; \ - if (l1_offset < l1_size * 0.5) { \ - cb.add(l1_addr, offset_reg); \ - } else { \ - l1_offset = 0; \ - cb.mov(l1_addr, pointer_reg); \ - } - -#define L2_INCREMENT() cb.add(l2_addr, offset_reg); - -#define L3_INCREMENT() cb.add(l3_addr, offset_reg) - -#define RAM_INCREMENT() cb.add(ram_addr, offset_reg) + Cb.movd(ShiftRegs[0], Xmm(13)); + Cb.vbroadcastss(Xmm(13), Xmm(13)); + Cb.vmovapd(Xmm(14), Xmm(13)); + Cb.vpsrlq(Xmm(14), Xmm(14), Imm(1)); + + Cb.mov(L1Addr, PointerReg); // address for L1-buffer + Cb.mov(L2Addr, PointerReg); + Cb.add(L2Addr, Imm(L1Size)); // address for L2-buffer + Cb.mov(L3Addr, PointerReg); + Cb.add(L3Addr, Imm(L2Size)); // address for L3-buffer + Cb.mov(RamAddr, PointerReg); + Cb.add(RamAddr, Imm(L3Size)); // address for RAM-buffer + Cb.mov(L2CountReg, Imm(L2LoopCount)); + workerLog::trace() << "reset counter for L2-buffer with " << L2LoopCount << " cache line accesses per loop (" + << L2Size / 1024 << ") KiB"; + Cb.mov(L3CountReg, Imm(L3LoopCount)); + workerLog::trace() << "reset counter for L3-buffer with " << L3LoopCount << " cache line accesses per loop (" + << L3Size / 1024 << ") KiB"; + Cb.mov(RamCountReg, Imm(RamLoopCount)); + workerLog::trace() << "reset counter for RAM-buffer with " << RamLoopCount << " cache line accesses per loop (" + << RamSize / 1024 << ") KiB"; + + Cb.align(asmjit::AlignMode::kCode, 64); + + auto Loop = Cb.newLabel(); + Cb.bind(Loop); + + auto ShiftPos = 0; + bool Left = false; + unsigned ItemCount = 0; + auto AddDest = AddRegsStart; + unsigned L1Offset = 0; + + const auto L1Increment = [&Cb, &L1Offset, &L1Size, &L1Addr, &OffsetReg, &PointerReg]() { + L1Offset += 64; + if (L1Offset < L1Size * 0.5) { + 
Cb.add(L1Addr, OffsetReg); + } else { + L1Offset = 0; + Cb.mov(L1Addr, PointerReg); + } + }; + const auto L2Increment = [&Cb, &L2Addr, &OffsetReg]() { Cb.add(L2Addr, OffsetReg); }; + const auto L3Increment = [&Cb, &L3Addr, &OffsetReg]() { Cb.add(L3Addr, OffsetReg); }; + const auto RamIncrement = [&Cb, &RamAddr, &OffsetReg]() { Cb.add(RamAddr, OffsetReg); }; - for (unsigned count = 0; count < repetitions; count++) { - for (const auto &item : sequence) { + for (auto Count = 0U; Count < Repetitions; Count++) { + for (const auto& Item : Sequence) { // swap second and third param of fma instruction to force bitchanges on // the pipes to its execution units - Ymm secondParam; - Ymm thirdParam; - if (0 == itemCount % 2) { - secondParam = ymm0; - thirdParam = ymm1; + Ymm SecondParam; + Ymm ThirdParam; + if (0 == ItemCount % 2) { + SecondParam = asmjit::x86::ymm0; + ThirdParam = asmjit::x86::ymm1; } else { - secondParam = ymm1; - thirdParam = ymm0; + SecondParam = asmjit::x86::ymm1; + ThirdParam = asmjit::x86::ymm0; } - if (item == "REG") { - cb.vfmadd231pd(Ymm(add_dest), secondParam, thirdParam); - cb.xor_(temp_reg, - shift_reg[(shift_pos + nr_shift_regs - 1) % nr_shift_regs]); - if (left) { - cb.shr(shift_reg[shift_pos], Imm(1)); + if (Item == "REG") { + Cb.vfmadd231pd(Ymm(AddDest), SecondParam, ThirdParam); + Cb.xor_(TempReg, ShiftRegs[(ShiftPos + NbShiftRegs - 1) % NbShiftRegs]); + if (Left) { + Cb.shr(ShiftRegs[ShiftPos], Imm(1)); } else { - cb.shl(shift_reg[shift_pos], Imm(1)); + Cb.shl(ShiftRegs[ShiftPos], Imm(1)); } - } else if (item == "L1_LS") { - cb.vfmadd231pd(Ymm(add_dest), secondParam, ymmword_ptr(l1_addr, 32)); - cb.vmovapd(xmmword_ptr(l1_addr, 64), Xmm(add_dest)); - L1_INCREMENT(); - } else if (item == "L2_L") { - cb.vfmadd231pd(Ymm(add_dest), secondParam, ymmword_ptr(l2_addr, 64)); - cb.xor_(temp_reg, - shift_reg[(shift_pos + nr_shift_regs - 1) % nr_shift_regs]); - L2_INCREMENT(); - } else if (item == "L3_L") { - cb.vfmadd231pd(Ymm(add_dest), secondParam, 
ymmword_ptr(l3_addr, 64)); - cb.xor_(temp_reg, - shift_reg[(shift_pos + nr_shift_regs - 1) % nr_shift_regs]); - L3_INCREMENT(); - } else if (item == "RAM_L") { - cb.vfmadd231pd(Ymm(ram_reg), secondParam, ymmword_ptr(ram_addr, 32)); - cb.xor_(temp_reg, - shift_reg[(shift_pos + nr_shift_regs - 1) % nr_shift_regs]); - RAM_INCREMENT(); + } else if (Item == "L1_LS") { + Cb.vfmadd231pd(Ymm(AddDest), SecondParam, ymmword_ptr(L1Addr, 32)); + Cb.vmovapd(xmmword_ptr(L1Addr, 64), Xmm(AddDest)); + L1Increment(); + } else if (Item == "L2_L") { + Cb.vfmadd231pd(Ymm(AddDest), SecondParam, ymmword_ptr(L2Addr, 64)); + Cb.xor_(TempReg, ShiftRegs[(ShiftPos + NbShiftRegs - 1) % NbShiftRegs]); + L2Increment(); + } else if (Item == "L3_L") { + Cb.vfmadd231pd(Ymm(AddDest), SecondParam, ymmword_ptr(L3Addr, 64)); + Cb.xor_(TempReg, ShiftRegs[(ShiftPos + NbShiftRegs - 1) % NbShiftRegs]); + L3Increment(); + } else if (Item == "RAM_L") { + Cb.vfmadd231pd(Ymm(RamReg), SecondParam, ymmword_ptr(RamAddr, 32)); + Cb.xor_(TempReg, ShiftRegs[(ShiftPos + NbShiftRegs - 1) % NbShiftRegs]); + RamIncrement(); } else { - workerLog::error() << "Instruction group " << item << " not found in " - << this->name() << "."; - return EXIT_FAILURE; + workerLog::error() << "Instruction group " << Item << " not found in " << name() << "."; } // make sure the shifts do could end up shifting out the data one end. 
- if (itemCount < (int)(sequence.size() * repetitions - - (sequence.size() * repetitions) % 4)) { - switch (itemCount % 4) { + if (ItemCount < (Sequence.size() * Repetitions) - ((Sequence.size() * Repetitions) % 4)) { + // all cases are covered + // NOLINTNEXTLINE(bugprone-switch-missing-default-case) + switch (ItemCount % 4) { case 0: - cb.vpsrlq(Xmm(13), Xmm(13), Imm(1)); + Cb.vpsrlq(Xmm(13), Xmm(13), Imm(1)); break; case 1: - cb.vpsllq(Xmm(14), Xmm(14), Imm(1)); + Cb.vpsllq(Xmm(14), Xmm(14), Imm(1)); break; case 2: - cb.vpsllq(Xmm(13), Xmm(13), Imm(1)); + Cb.vpsllq(Xmm(13), Xmm(13), Imm(1)); break; case 3: - cb.vpsrlq(Xmm(14), Xmm(14), Imm(1)); + Cb.vpsrlq(Xmm(14), Xmm(14), Imm(1)); break; } } - itemCount++; + ItemCount++; - add_dest++; - if (add_dest > add_regs_end) { - add_dest = add_regs_start; + AddDest++; + if (AddDest > AddRegsEnd) { + AddDest = AddRegsStart; } - shift_pos++; - if (shift_pos == nr_shift_regs) { - shift_pos = 0; - left = !left; + ShiftPos++; + if (ShiftPos == NbShiftRegs) { + ShiftPos = 0; + Left = !Left; } } } - cb.movq(temp_reg, iter_reg); // restore iteration counter - if (this->getRAMSequenceCount(sequence) > 0) { + Cb.movq(TempReg, IterReg); // restore iteration counter + if (environment::payload::PayloadSettings::getRAMSequenceCount(Sequence) > 0) { // reset RAM counter - auto NoRamReset = cb.newLabel(); - - cb.sub(ram_count_reg, Imm(1)); - cb.jnz(NoRamReset); - cb.mov(ram_count_reg, Imm(ram_loop_count)); - cb.mov(ram_addr, pointer_reg); - cb.add(ram_addr, Imm(l3_size)); - cb.bind(NoRamReset); + auto NoRamReset = Cb.newLabel(); + + Cb.sub(RamCountReg, Imm(1)); + Cb.jnz(NoRamReset); + Cb.mov(RamCountReg, Imm(RamLoopCount)); + Cb.mov(RamAddr, PointerReg); + Cb.add(RamAddr, Imm(L3Size)); + Cb.bind(NoRamReset); // adds always two instruction - this->_instructions += 2; + Stats.Instructions += 2; } - cb.inc(temp_reg); // increment iteration counter - if (this->getL2SequenceCount(sequence) > 0) { + Cb.inc(TempReg); // increment iteration 
counter + if (environment::payload::PayloadSettings::getL2SequenceCount(Sequence) > 0) { // reset L2-Cache counter - auto NoL2Reset = cb.newLabel(); - - cb.sub(l2_count_reg, Imm(1)); - cb.jnz(NoL2Reset); - cb.mov(l2_count_reg, Imm(l2_loop_count)); - cb.mov(l2_addr, pointer_reg); - cb.add(l2_addr, Imm(l1_size)); - cb.bind(NoL2Reset); + auto NoL2Reset = Cb.newLabel(); + + Cb.sub(L2CountReg, Imm(1)); + Cb.jnz(NoL2Reset); + Cb.mov(L2CountReg, Imm(L2LoopCount)); + Cb.mov(L2Addr, PointerReg); + Cb.add(L2Addr, Imm(L1Size)); + Cb.bind(NoL2Reset); // adds always two instruction - this->_instructions += 2; + Stats.Instructions += 2; } - cb.movq(iter_reg, temp_reg); // store iteration counter - if (this->getL3SequenceCount(sequence) > 0) { + Cb.movq(IterReg, TempReg); // store iteration counter + if (environment::payload::PayloadSettings::getL3SequenceCount(Sequence) > 0) { // reset L3-Cache counter - auto NoL3Reset = cb.newLabel(); - - cb.sub(l3_count_reg, Imm(1)); - cb.jnz(NoL3Reset); - cb.mov(l3_count_reg, Imm(l3_loop_count)); - cb.mov(l3_addr, pointer_reg); - cb.add(l3_addr, Imm(l2_size)); - cb.bind(NoL3Reset); + auto NoL3Reset = Cb.newLabel(); + + Cb.sub(L3CountReg, Imm(1)); + Cb.jnz(NoL3Reset); + Cb.mov(L3CountReg, Imm(L3LoopCount)); + Cb.mov(L3Addr, PointerReg); + Cb.add(L3Addr, Imm(L2Size)); + Cb.bind(NoL3Reset); // adds always two instruction - this->_instructions += 2; + Stats.Instructions += 2; } - cb.mov(l1_addr, pointer_reg); + Cb.mov(L1Addr, PointerReg); - if (dumpRegisters) { - auto SkipRegistersDump = cb.newLabel(); - - cb.test(ptr_64(pointer_reg, -8), Imm(firestarter::DumpVariable::Wait)); - cb.jnz(SkipRegistersDump); - - // dump all the ymm register - for (int i = 0; i < (int)this->registerCount(); i++) { - cb.vmovapd( - ymmword_ptr(pointer_reg, -64 - this->registerSize() * 8 * (i + 1)), - Ymm(i)); - } - - // set read flag - cb.mov(ptr_64(pointer_reg, -8), Imm(firestarter::DumpVariable::Wait)); - - cb.bind(SkipRegistersDump); + if (DumpRegisters) { + 
emitDumpRegisterCode(Cb, PointerReg, asmjit::x86::ymmword_ptr); } - if (errorDetection) { - this->emitErrorDetectionCode( - cb, iter_reg, addrHigh_reg, pointer_reg, temp_reg, temp_reg2); + if (ErrorDetection) { + emitErrorDetectionCode(Cb, IterReg, AddrHighReg, PointerReg, TempReg, TempReg2); } - cb.test(ptr_64(addrHigh_reg), Imm(LOAD_HIGH)); - cb.jnz(Loop); + Cb.test(ptr_64(AddrHighReg), Imm(LoadThreadWorkType::LoadHigh)); + Cb.jnz(Loop); - cb.bind(FunctionExit); + Cb.bind(FunctionExit); - cb.movq(rax, iter_reg); + Cb.movq(asmjit::x86::rax, IterReg); - cb.emitEpilog(frame); + Cb.emitEpilog(Frame); - cb.finalize(); + Cb.finalize(); - // String sb; - // cb.dump(sb); - - Error err = this->rt.add(&this->loadFunction, &code); - if (err) { - workerLog::error() << "Asmjit adding Assembler to JitRuntime failed in " - << __FILE__ << " at " << __LINE__; - return EXIT_FAILURE; - } + auto CompiledPayloadPtr = CompiledX86Payload::create(Stats, Code); // skip if we could not determine cache size - if (l1i_cache_size != 0) { - auto loopSize = code.labelOffset(FunctionExit) - code.labelOffset(Loop); - auto instructionCachePercentage = 100 * loopSize / l1i_cache_size; + if (L1iCacheSize) { + auto LoopSize = Code.labelOffset(FunctionExit) - Code.labelOffset(Loop); + auto InstructionCachePercentage = 100 * LoopSize / *L1iCacheSize; - if (loopSize > l1i_cache_size) { + if (LoopSize > *L1iCacheSize) { workerLog::warn() << "Work-loop is bigger than the L1i-Cache."; } - workerLog::trace() << "Using " << loopSize << " of " << l1i_cache_size - << " Bytes (" << instructionCachePercentage + workerLog::trace() << "Using " << LoopSize << " of " << *L1iCacheSize << " Bytes (" << InstructionCachePercentage << "%) from the L1i-Cache for the work-loop."; - workerLog::trace() << "Sequence size: " << sequence.size(); - workerLog::trace() << "Repetition count: " << repetitions; + workerLog::trace() << "Sequence size: " << Sequence.size(); + workerLog::trace() << "Repetition count: " << Repetitions; 
} - return EXIT_SUCCESS; + return CompiledPayloadPtr; } -std::list ZENFMAPayload::getAvailableInstructions() const { - std::list instructions; - - transform(this->instructionFlops.begin(), this->instructionFlops.end(), - back_inserter(instructions), - [](const auto &item) { return item.first; }); - - return instructions; +void ZENFMAPayload::init(double* MemoryAddr, uint64_t BufferSize) const { + X86Payload::initMemory(MemoryAddr, BufferSize, 0.27948995982e-4, 0.27948995982e-4); } -void ZENFMAPayload::init(unsigned long long *memoryAddr, - unsigned long long bufferSize) { - X86Payload::init(memoryAddr, bufferSize, 0.27948995982e-4, 0.27948995982e-4); -} +} // namespace firestarter::environment::x86::payload \ No newline at end of file diff --git a/src/firestarter/Environment/X86/Platform/X86PlatformConfig.cpp b/src/firestarter/Environment/X86/Platform/X86PlatformConfig.cpp new file mode 100644 index 00000000..fa4d4399 --- /dev/null +++ b/src/firestarter/Environment/X86/Platform/X86PlatformConfig.cpp @@ -0,0 +1,27 @@ +/****************************************************************************** + * FIRESTARTER - A Processor Stress Test Utility + * Copyright (C) 2024 TU Dresden, Center for Information Services and High + * Performance Computing + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ * + * Contact: daniel.hackenberg@tu-dresden.de + *****************************************************************************/ + +// This file exists to get an entry in the compile commands database. Clangd will interpolate the include directories +// for header files based on the source file with the best matching score. This file should be the best score for the +// included header. Therefore we should not see any errors in this file for missing includes. For more infomation +// look in the LLVM code base: clang/lib/Tooling/InterpolatingCompilationDatabase.cpp + +#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp" \ No newline at end of file diff --git a/src/firestarter/Environment/X86/X86CPUTopology.cpp b/src/firestarter/Environment/X86/X86CPUTopology.cpp index 8b8abe2b..64d64cfb 100644 --- a/src/firestarter/Environment/X86/X86CPUTopology.cpp +++ b/src/firestarter/Environment/X86/X86CPUTopology.cpp @@ -19,8 +19,8 @@ * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ -#include -#include +#include "firestarter/Environment/X86/X86CPUTopology.hpp" +#include "firestarter/Logging/Log.hpp" #include @@ -31,99 +31,100 @@ #pragma intrinsic(__rdtsc) #endif -using namespace firestarter::environment::x86; +namespace firestarter::environment::x86 { X86CPUTopology::X86CPUTopology() - : CPUTopology("x86_64"), cpuInfo(asmjit::CpuInfo::host()), - _vendor(this->cpuInfo.vendor()) { - - std::stringstream ss; - ss << "Family " << this->familyId() << ", Model " << this->modelId() - << ", Stepping " << this->stepping(); - this->_model = ss.str(); + : CPUTopology("x86_64") + , CpuInfo(asmjit::CpuInfo::host()) + , Vendor(CpuInfo.vendor()) { + + { + std::stringstream Ss; + Ss << "Family " << familyId() << ", Model " << modelId() << ", Stepping " << stepping(); + Model = Ss.str(); + } - for (int i = 0; i <= (int)asmjit::CpuFeatures::X86::Id::kMaxValue; i++) { - if (!this->cpuInfo.hasFeature(i)) { + 
for (auto FeatureId = 0; FeatureId <= asmjit::CpuFeatures::X86::Id::kMaxValue; FeatureId++) { + if (!CpuInfo.hasFeature(FeatureId)) { continue; } - asmjit::String sb; + asmjit::String Sb; - auto error = asmjit::Formatter::formatFeature(sb, this->cpuInfo.arch(), i); - if (error != asmjit::ErrorCode::kErrorOk) { - log::warn() << "Formatting cpu features got asmjit error: " << error; + auto Error = asmjit::Formatter::formatFeature(Sb, CpuInfo.arch(), FeatureId); + if (Error != asmjit::ErrorCode::kErrorOk) { + log::warn() << "Formatting cpu features got asmjit error: " << Error; } - this->featureList.push_back(std::string(sb.data())); + FeatureList.emplace_back(Sb.data()); } - unsigned long long a = 0, b = 0, c = 0, d = 0; + uint64_t Rax = 0; + uint64_t Rbx = 0; + uint64_t Rcx = 0; + uint64_t Rdx = 0; // check if we have rdtsc - this->cpuid(&a, &b, &c, &d); - if (a >= 1) { - a = 1; - this->cpuid(&a, &b, &c, &d); - if ((int)d & (1 << 4)) { - this->_hasRdtsc = true; - } else { - this->_hasRdtsc = false; - } + cpuid(&Rax, &Rbx, &Rcx, &Rdx); + if (Rax >= 1) { + Rax = 1; + cpuid(&Rax, &Rbx, &Rcx, &Rdx); + HasRdtsc = (Rdx & (1 << 4)) != 0; } // check if we have invarant rdtsc - if (this->hasRdtsc()) { - a = 0, b = 0, c = 0, d = 0; + if (hasRdtsc()) { + Rax = 0, Rbx = 0, Rcx = 0, Rdx = 0; - this->_hasInvariantRdtsc = true; + HasInvariantRdtsc = true; /* TSCs are usable if CPU supports only one frequency in C0 (no speedstep/Cool'n'Quite) or if multiple frequencies are available and the constant/invariant TSC feature flag is set */ - if (0 == this->vendor().compare("INTEL")) { + if ("INTEL" == vendor()) { /*check if Powermanagement and invariant TSC are supported*/ - a = 1; - this->cpuid(&a, &b, &c, &d); + Rax = 1; + cpuid(&Rax, &Rbx, &Rcx, &Rdx); /* no Frequency control */ - if ((!(d & (1 << 22))) && (!(c & (1 << 7)))) { - this->_hasInvariantRdtsc = true; + if ((!(Rdx & (1 << 22))) && (!(Rcx & (1 << 7)))) { + HasInvariantRdtsc = true; } else { - a = 0x80000000; - 
this->cpuid(&a, &b, &c, &d); - if (a >= 0x80000007) { - a = 0x80000007; - this->cpuid(&a, &b, &c, &d); + Rax = 0x80000000; + cpuid(&Rax, &Rbx, &Rcx, &Rdx); + if (Rax >= 0x80000007) { + Rax = 0x80000007; + cpuid(&Rax, &Rbx, &Rcx, &Rdx); /* invariant TSC */ - if (d & (1 << 8)) { - this->_hasInvariantRdtsc = true; + if (Rdx & (1 << 8)) { + HasInvariantRdtsc = true; } } } } - if (0 == this->vendor().compare("AMD")) { + if ("AMD" == vendor()) { /*check if Powermanagement and invariant TSC are supported*/ - a = 0x80000000; - this->cpuid(&a, &b, &c, &d); - if (a >= 0x80000007) { - a = 0x80000007; - this->cpuid(&a, &b, &c, &d); + Rax = 0x80000000; + cpuid(&Rax, &Rbx, &Rcx, &Rdx); + if (Rax >= 0x80000007) { + Rax = 0x80000007; + cpuid(&Rax, &Rbx, &Rcx, &Rdx); /* no Frequency control */ - if ((!(d & (1 << 7))) && (!(d & (1 << 1)))) { - this->_hasInvariantRdtsc = true; + if ((!(Rdx & (1 << 7))) && (!(Rdx & (1 << 1)))) { + HasInvariantRdtsc = true; } /* invariant TSC */ - if (d & (1 << 8)) { - this->_hasInvariantRdtsc = true; + if (Rdx & (1 << 8)) { + HasInvariantRdtsc = true; } } /* assuming no frequency control if cpuid does not provide the extended function to test for it */ else { - this->_hasInvariantRdtsc = true; + HasInvariantRdtsc = true; } } } @@ -133,123 +134,124 @@ X86CPUTopology::X86CPUTopology() // only constant TSCs will be used (i.e. 
power management indepent TSCs) // save frequency in highest P-State or use generic fallback if no invarient TSC // is available -unsigned long long X86CPUTopology::clockrate() const { - typedef std::chrono::high_resolution_clock Clock; - typedef std::chrono::microseconds ticks; +auto X86CPUTopology::clockrate() const -> uint64_t { + using ClockT = std::chrono::high_resolution_clock; + using TicksT = std::chrono::microseconds; - unsigned long long start1_tsc, start2_tsc, end1_tsc, end2_tsc; - unsigned long long time_diff; - unsigned long long clock_lower_bound, clock_upper_bound, clock; - unsigned long long clockrate = 0; - int i, num_measurements = 0, min_measurements; + uint64_t Clockrate = 0; + uint64_t MinMeasurements = 0; - Clock::time_point start_time, end_time; + ClockT::time_point StartTime; + ClockT::time_point EndTime; #if not(defined(__APPLE__) || defined(_WIN32)) - auto governor = this->scalingGovernor(); - if (governor.empty()) { + auto Governor = scalingGovernor(); + if (Governor.empty()) { return CPUTopology::clockrate(); } /* non invariant TSCs can be used if CPUs run at fixed frequency */ - if (!this->hasInvariantRdtsc() && governor.compare("performance") && - governor.compare("powersave")) { + if (!hasInvariantRdtsc() && Governor != "performance" && Governor != "powersave") { return CPUTopology::clockrate(); } - min_measurements = 5; + MinMeasurements = 5; #else - min_measurements = 20; + MinMeasurements = 20; #endif - i = 3; + for (uint64_t NumMeasurements = 0, TimeDiff = 0, Duration = 3; TimeDiff < 10000 || NumMeasurements < MinMeasurements; + Duration += 2) { + uint64_t End1Tsc = 0; + uint64_t End2Tsc = 0; - do { // start timestamp - start1_tsc = this->timestamp(); - start_time = Clock::now(); - start2_tsc = this->timestamp(); + const uint64_t Start1Tsc = timestamp(); + StartTime = ClockT::now(); + const uint64_t Start2Tsc = timestamp(); - // waiting - do { - end1_tsc = this->timestamp(); - } while (end1_tsc < start2_tsc + 1000000 * i); /* busy 
waiting */ + // busy wait waiting for duration to pass + for (; End1Tsc < Start2Tsc + 1000000 * Duration;) { + End1Tsc = timestamp(); + } // end timestamp - do { - end1_tsc = this->timestamp(); - end_time = Clock::now(); - end2_tsc = this->timestamp(); + End1Tsc = timestamp(); + EndTime = ClockT::now(); + End2Tsc = timestamp(); - time_diff = - std::chrono::duration_cast(end_time - start_time).count(); - } while (0 == time_diff); + TimeDiff = std::chrono::duration_cast(EndTime - StartTime).count(); - clock_lower_bound = (((end1_tsc - start2_tsc) * 1000000) / (time_diff)); - clock_upper_bound = (((end2_tsc - start1_tsc) * 1000000) / (time_diff)); + // measurement not long enough + if (TimeDiff <= 2000) { + continue; + } // if both values differ significantly, the measurement could have been // interrupted between 2 rdtsc's - if (((double)clock_lower_bound > (((double)clock_upper_bound) * 0.999)) && - ((time_diff) > 2000)) { - num_measurements++; - clock = (clock_lower_bound + clock_upper_bound) / 2; - if (clockrate == 0) - clockrate = clock; + const uint64_t ClockLowerBound = (((End1Tsc - Start2Tsc) * 1000000) / (TimeDiff)); + const uint64_t ClockUpperBound = (((End2Tsc - Start1Tsc) * 1000000) / (TimeDiff)); + + if (static_cast(ClockLowerBound) > ((static_cast(ClockUpperBound)) * 0.999)) { + NumMeasurements++; + const uint64_t Clock = (ClockLowerBound + ClockUpperBound) / 2; + const bool ClockrateUpdateCondition = Clockrate == 0 || #ifndef _WIN32 - else if (clock < clockrate) - clockrate = clock; + Clock < Clockrate; #else - else if (clock > clockrate) - clockrate = clock; + Clock > Clockrate; #endif + if (ClockrateUpdateCondition) { + Clockrate = Clock; + } } - i += 2; - } while (((time_diff) < 10000) || (num_measurements < min_measurements)); + } - return clockrate; + return Clockrate; } -unsigned long long X86CPUTopology::timestamp() const { -#ifndef _MSC_VER - unsigned long long reg_a, reg_d; -#else - unsigned long long i; -#endif - - if (!this->hasRdtsc()) { 
+auto X86CPUTopology::timestamp() const -> uint64_t { + if (!hasRdtsc()) { return 0; } #ifndef _MSC_VER - __asm__ __volatile__("rdtsc;" : "=a"(reg_a), "=d"(reg_d)); - return (reg_d << 32) | (reg_a & 0xffffffffULL); + // NOLINTBEGIN(misc-const-correctness) + uint64_t Rax = 0; + uint64_t Rdx = 0; + // NOLINTEND(misc-const-correctness) + __asm__ __volatile__("rdtsc;" : "=a"(Rax), "=d"(Rdx)); + return (Rdx << 32) | (Rax & 0xffffffffULL); #else - i = __rdtsc(); - return i; + return __rdtsc(); #endif } -void X86CPUTopology::cpuid(unsigned long long *a, unsigned long long *b, - unsigned long long *c, unsigned long long *d) const { +void X86CPUTopology::cpuid(uint64_t* Rax, uint64_t* Rbx, uint64_t* Rcx, uint64_t* Rdx) { #ifndef _MSC_VER - unsigned long long reg_a, reg_b, reg_c, reg_d; - + // NOLINTBEGIN(misc-const-correctness) + uint64_t RaxOut = 0; + uint64_t RbxOut = 0; + uint64_t RcxOut = 0; + uint64_t RdxOut = 0; + // NOLINTEND(misc-const-correctness) __asm__ __volatile__("cpuid;" - : "=a"(reg_a), "=b"(reg_b), "=c"(reg_c), "=d"(reg_d) - : "a"(*a), "b"(*b), "c"(*c), "d"(*d)); - *a = reg_a; - *b = reg_b; - *c = reg_c; - *d = reg_d; + : "=a"(RaxOut), "=b"(RbxOut), "=c"(RcxOut), "=d"(RdxOut) + : "a"(*Rax), "b"(*Rbx), "c"(*Rcx), "d"(*Rdx)); + *Rax = RaxOut; + *Rbx = RbxOut; + *Rcx = RcxOut; + *Rdx = RdxOut; #else std::array cpuid; - __cpuidex(cpuid.data(), *a, *c); + __cpuidex(cpuid.data(), *Rax, *Rcx); - *a = cpuid[0]; - *b = cpuid[1]; - *c = cpuid[2]; - *d = cpuid[3]; + *Rax = cpuid[0]; + *Rbx = cpuid[1]; + *Rcx = cpuid[2]; + *Rdx = cpuid[3]; #endif } + +} // namespace firestarter::environment::x86 \ No newline at end of file diff --git a/src/firestarter/Environment/X86/X86Environment.cpp b/src/firestarter/Environment/X86/X86Environment.cpp index d981358d..3ecd89c1 100644 --- a/src/firestarter/Environment/X86/X86Environment.cpp +++ b/src/firestarter/Environment/X86/X86Environment.cpp @@ -19,201 +19,155 @@ * Contact: daniel.hackenberg@tu-dresden.de 
*****************************************************************************/ -#include -#include +#include "firestarter/Environment/X86/X86Environment.hpp" +#include "firestarter/Logging/Log.hpp" #include #include +#include #include -using namespace firestarter::environment::x86; +namespace firestarter::environment::x86 { -void X86Environment::evaluateFunctions() { - for (auto ctor : this->platformConfigsCtor) { - // add asmjit for model and family detection - this->platformConfigs.push_back( - ctor(this->topology().featuresAsmjit(), this->topology().familyId(), - this->topology().modelId(), this->topology().numThreadsPerCore())); - } - - for (auto ctor : this->fallbackPlatformConfigsCtor) { - this->fallbackPlatformConfigs.push_back( - ctor(this->topology().featuresAsmjit(), this->topology().familyId(), - this->topology().modelId(), this->topology().numThreadsPerCore())); - } -} - -int X86Environment::selectFunction(unsigned functionId, - bool allowUnavailablePayload) { - unsigned id = 1; - std::string defaultPayloadName(""); +void X86Environment::selectFunction(unsigned FunctionId, bool AllowUnavailablePayload) { + unsigned Id = 1; + std::optional DefaultPayloadName; // if functionId is 0 get the default or fallback - for (auto config : this->platformConfigs) { - for (auto const &[thread, functionName] : config->getThreadMap()) { + for (const auto& PlatformConfigPtr : PlatformConfigs) { + for (auto const& ThreadsPerCore : PlatformConfigPtr->settings().threads()) { // the selected function - if (id == functionId) { - if (!config->isAvailable()) { - log::error() << "Function " << functionId << " (\"" << functionName - << "\") requires " << config->payload().name() - << ", which is not supported by the processor."; - if (!allowUnavailablePayload) { - return EXIT_FAILURE; + if (Id == FunctionId) { + if (!PlatformConfigPtr->isAvailable(topology())) { + const auto ErrorString = "Function " + std::to_string(FunctionId) + " (\"" + + 
PlatformConfigPtr->functionName(ThreadsPerCore) + "\") requires " + + PlatformConfigPtr->payload()->name() + ", which is not supported by the processor."; + if (AllowUnavailablePayload) { + log::warn() << ErrorString; + } else { + throw std::invalid_argument(ErrorString); } } // found function - this->_selectedConfig = - new ::firestarter::environment::platform::RuntimeConfig( - *config, thread, this->topology().instructionCacheSize()); - return EXIT_SUCCESS; + setConfig(PlatformConfigPtr->cloneConcreate(topology().instructionCacheSize(), ThreadsPerCore)); + return; } // default function - if (0 == functionId && config->isDefault()) { - if (thread == this->topology().numThreadsPerCore()) { - this->_selectedConfig = - new ::firestarter::environment::platform::RuntimeConfig( - *config, thread, this->topology().instructionCacheSize()); - return EXIT_SUCCESS; - } else { - defaultPayloadName = config->payload().name(); + if (0 == FunctionId && PlatformConfigPtr->isDefault(topology())) { + if (ThreadsPerCore == topology().numThreadsPerCore()) { + setConfig(PlatformConfigPtr->cloneConcreate(topology().instructionCacheSize(), ThreadsPerCore)); + return; } + DefaultPayloadName = PlatformConfigPtr->payload()->name(); } - id++; + Id++; } } // no default found // use fallback - if (0 == functionId) { - if (!defaultPayloadName.empty()) { + if (0 == FunctionId) { + if (DefaultPayloadName) { // default payload available, but number of threads per core is not // supported - log::warn() << "No " << defaultPayloadName << " code path for " - << this->topology().numThreadsPerCore() + log::warn() << "No " << *DefaultPayloadName << " code path for " << topology().numThreadsPerCore() << " threads per core!"; } - log::warn() << this->topology().vendor() << " " << this->topology().model() + log::warn() << topology().vendor() << " " << topology().model() << " is not supported by this version of FIRESTARTER!\n" << "Check project website for updates."; // loop over available implementation 
and check if they are marked as // fallback - for (auto config : this->fallbackPlatformConfigs) { - if (config->isAvailable()) { - auto selectedThread = 0; - auto selectedFunctionName = std::string(""); - for (auto const &[thread, functionName] : config->getThreadMap()) { - if (thread == this->topology().numThreadsPerCore()) { - selectedThread = thread; - selectedFunctionName = functionName; + for (const auto& FallbackPlatformConfigPtr : FallbackPlatformConfigs) { + if (FallbackPlatformConfigPtr->isAvailable(topology())) { + std::optional SelectedThreadsPerCore; + // find the fallback implementation with the correct thread per core count + for (auto const& ThreadsPerCore : FallbackPlatformConfigPtr->settings().threads()) { + if (ThreadsPerCore == topology().numThreadsPerCore()) { + SelectedThreadsPerCore = ThreadsPerCore; } } - if (selectedThread == 0) { - selectedThread = config->getThreadMap().begin()->first; - selectedFunctionName = config->getThreadMap().begin()->second; + // Otherwise select the first available thread per core count + if (!SelectedThreadsPerCore) { + SelectedThreadsPerCore = FallbackPlatformConfigPtr->settings().threads().front(); } - this->_selectedConfig = - new ::firestarter::environment::platform::RuntimeConfig( - *config, selectedThread, - this->topology().instructionCacheSize()); - log::warn() << "Using function " << selectedFunctionName + setConfig( + FallbackPlatformConfigPtr->cloneConcreate(topology().instructionCacheSize(), *SelectedThreadsPerCore)); + log::warn() << "Using function " << FallbackPlatformConfigPtr->functionName(*SelectedThreadsPerCore) << " as fallback.\n" << "You can use the parameter --function to try other " "functions."; - return EXIT_SUCCESS; + return; } } // no fallback found - log::error() << "No fallback implementation found for available ISA " - "extensions."; - return EXIT_FAILURE; + throw std::invalid_argument("No fallback implementation found for available ISA " + "extensions."); } - log::error() << 
"unknown function id: " << functionId - << ", see --avail for available ids"; - return EXIT_FAILURE; + throw std::invalid_argument("unknown function id: " + std::to_string(FunctionId) + ", see --avail for available ids"); } -int X86Environment::selectInstructionGroups(std::string groups) { - const std::string delimiter = ","; - const std::regex re("^(\\w+):(\\d+)$"); - const auto availableInstructionGroups = this->selectedConfig() - .platformConfig() - .payload() - .getAvailableInstructions(); - - std::stringstream ss(groups); - std::vector> payloadSettings = {}; - - while (ss.good()) { - std::string token; - std::smatch m; - std::getline(ss, token, ','); - - if (std::regex_match(token, m, re)) { - if (std::find(availableInstructionGroups.begin(), - availableInstructionGroups.end(), - m[1].str()) == availableInstructionGroups.end()) { - log::error() - << "Invalid instruction-group: " << m[1].str() - << "\n --run-instruction-groups format: multiple INST:VAL " - "pairs comma-seperated"; - return EXIT_FAILURE; +void X86Environment::selectInstructionGroups(std::string Groups) { + const auto Delimiter = ','; + const std::regex Re("^(\\w+):(\\d+)$"); + const auto AvailableInstructionGroups = config().payload()->getAvailableInstructions(); + + std::stringstream Ss(Groups); + std::vector> PayloadSettings = {}; + + while (Ss.good()) { + std::string Token; + std::smatch M; + std::getline(Ss, Token, Delimiter); + + if (std::regex_match(Token, M, Re)) { + if (std::find(AvailableInstructionGroups.begin(), AvailableInstructionGroups.end(), M[1].str()) == + AvailableInstructionGroups.end()) { + throw std::invalid_argument("Invalid instruction-group: " + M[1].str() + + "\n --run-instruction-groups format: multiple INST:VAL " + "pairs comma-seperated"); } - int num = std::stoul(m[2].str()); - if (num == 0) { - log::error() - << "instruction-group VAL may not contain number 0" - << "\n --run-instruction-groups format: multiple INST:VAL " - "pairs comma-seperated"; - return 
EXIT_FAILURE; + auto Num = std::stoul(M[2].str()); + if (Num == 0) { + throw std::invalid_argument("instruction-group VAL may not contain number 0" + "\n --run-instruction-groups format: multiple INST:VAL " + "pairs comma-seperated"); } - payloadSettings.push_back(std::make_pair(m[1].str(), num)); + PayloadSettings.emplace_back(M[1].str(), Num); } else { - log::error() - << "Invalid symbols in instruction-group: " << token - << "\n --run-instruction-groups format: multiple INST:VAL " - "pairs comma-seperated"; - return EXIT_FAILURE; + throw std::invalid_argument("Invalid symbols in instruction-group: " + Token + + "\n --run-instruction-groups format: multiple INST:VAL " + "pairs comma-seperated"); } } - this->selectedConfig().setPayloadSettings(payloadSettings); - - log::info() << " Running custom instruction group: " << groups; + config().settings().selectInstructionGroups(PayloadSettings); - return EXIT_SUCCESS; + log::info() << " Running custom instruction group: " << Groups; } void X86Environment::printAvailableInstructionGroups() { - std::stringstream ss; + std::stringstream Ss; - for (auto const &item : this->selectedConfig() - .platformConfig() - .payload() - .getAvailableInstructions()) { - ss << item << ","; + for (auto const& Item : config().payload()->getAvailableInstructions()) { + Ss << Item << ","; } - auto s = ss.str(); - if (s.size() > 0) { - s.pop_back(); + auto S = Ss.str(); + if (!S.empty()) { + S.pop_back(); } - log::info() << " available instruction-groups for payload " - << this->selectedConfig().platformConfig().payload().name() - << ":\n" - << " " << s; + log::info() << " available instruction-groups for payload " << config().payload()->name() << ":\n" + << " " << S; } -void X86Environment::setLineCount(unsigned lineCount) { - this->selectedConfig().setLineCount(lineCount); -} +void X86Environment::setLineCount(unsigned LineCount) { config().settings().setLineCount(LineCount); } -void X86Environment::printSelectedCodePathSummary() { - 
this->selectedConfig().printCodePathSummary(); -} +void X86Environment::printSelectedCodePathSummary() { config().printCodePathSummary(); } void X86Environment::printFunctionSummary() { log::info() << " available load-functions:\n" @@ -224,21 +178,19 @@ void X86Environment::printFunctionSummary() { "-------------------------------------------------------------" "-----------------------------"; - unsigned id = 1; - - for (auto const &config : this->platformConfigs) { - for (auto const &[thread, functionName] : config->getThreadMap()) { - const char *available = config->isAvailable() ? "yes" : "no"; - const char *fmt = " %4u | %-30s | %-24s | %s"; - int sz = - std::snprintf(nullptr, 0, fmt, id, functionName.c_str(), available, - config->getDefaultPayloadSettingsString().c_str()); - std::vector buf(sz + 1); - std::snprintf(&buf[0], buf.size(), fmt, id, functionName.c_str(), - available, - config->getDefaultPayloadSettingsString().c_str()); - log::info() << std::string(&buf[0]); - id++; + auto Id = 1U; + + for (auto const& Config : PlatformConfigs) { + for (auto const& ThreadsPerCore : Config->settings().threads()) { + const char* Available = Config->isAvailable(topology()) ? 
"yes" : "no"; + const auto& FunctionName = Config->functionName(ThreadsPerCore); + const auto& InstructionGroupsString = Config->settings().getInstructionGroupsString(); + + log::info() << " " << std::right << std::setw(4) << Id << " | " << std::left << std::setw(30) << FunctionName + << " | " << std::left << std::setw(24) << Available << " | " << InstructionGroupsString; + Id++; } } } + +} // namespace firestarter::environment::x86 \ No newline at end of file diff --git a/src/firestarter/Firestarter.cpp b/src/firestarter/Firestarter.cpp index 5fb58ad4..379e2039 100644 --- a/src/firestarter/Firestarter.cpp +++ b/src/firestarter/Firestarter.cpp @@ -19,438 +19,279 @@ * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ -#include -#include -#if defined(linux) || defined(__linux__) -#include -#include -#include -extern "C" { -#include -} -#endif +#include "firestarter/Firestarter.hpp" +#include "firestarter/Environment/X86/X86Environment.hpp" +#include "firestarter/Logging/Log.hpp" +#include "firestarter/Measurement/Metric/IPCEstimate.hpp" +#include "firestarter/Optimizer/Algorithm/NSGA2.hpp" +#include "firestarter/Optimizer/History.hpp" +#include "firestarter/Optimizer/Problem/CLIArgumentProblem.hpp" #include -#include -#include - -#ifdef _MSC_VER -#include -#endif - -using namespace firestarter; - -Firestarter::Firestarter( - const int argc, const char **argv, std::chrono::seconds const &timeout, - unsigned loadPercent, std::chrono::microseconds const &period, - unsigned requestedNumThreads, std::string const &cpuBind, - bool printFunctionSummary, unsigned functionId, bool listInstructionGroups, - std::string const &instructionGroups, unsigned lineCount, - bool allowUnavailablePayload, bool dumpRegisters, - std::chrono::seconds const &dumpRegistersTimeDelta, - std::string const &dumpRegistersOutpath, bool errorDetection, int gpus, - unsigned gpuMatrixSize, bool gpuUseFloat, bool gpuUseDouble, - bool 
listMetrics, bool measurement, - std::chrono::milliseconds const &startDelta, - std::chrono::milliseconds const &stopDelta, - std::chrono::milliseconds const &measurementInterval, - std::vector const &metricPaths, - std::vector const &stdinMetrics, bool optimize, - std::chrono::seconds const &preheat, - std::string const &optimizationAlgorithm, - std::vector const &optimizationMetrics, - std::chrono::seconds const &evaluationDuration, unsigned individuals, - std::string const &optimizeOutfile, unsigned generations, double nsga2_cr, - double nsga2_m) - : _argc(argc), _argv(argv), _timeout(timeout), _loadPercent(loadPercent), - _period(period), _dumpRegisters(dumpRegisters), - _dumpRegistersTimeDelta(dumpRegistersTimeDelta), - _dumpRegistersOutpath(dumpRegistersOutpath), - _errorDetection(errorDetection), _gpus(gpus), - _gpuMatrixSize(gpuMatrixSize), _gpuUseFloat(gpuUseFloat), - _gpuUseDouble(gpuUseDouble), _startDelta(startDelta), - _stopDelta(stopDelta), _measurement(measurement), _optimize(optimize), - _preheat(preheat), _optimizationAlgorithm(optimizationAlgorithm), - _optimizationMetrics(optimizationMetrics), - _evaluationDuration(evaluationDuration), _individuals(individuals), - _optimizeOutfile(optimizeOutfile), _generations(generations), - _nsga2_cr(nsga2_cr), _nsga2_m(nsga2_m) { - int returnCode; - - _load = (_period * _loadPercent) / 100; - if (_loadPercent == 100 || _load == std::chrono::microseconds::zero()) { - _period = std::chrono::microseconds::zero(); - } +#include +#include + +namespace firestarter { -#if defined(linux) || defined(__linux__) -#else - (void)listMetrics; - (void)measurementInterval; - (void)metricPaths; - (void)stdinMetrics; -#endif - -#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \ - defined(_M_X64) - this->_environment = new environment::x86::X86Environment(); -#endif - - if (EXIT_SUCCESS != (returnCode = this->environment().evaluateCpuAffinity( - requestedNumThreads, cpuBind))) { - std::exit(returnCode); 
+Firestarter::Firestarter(Config&& ProvidedConfig) + : Cfg(std::move(ProvidedConfig)) { + if constexpr (firestarter::OptionalFeatures.IsX86) { + Environment = std::make_unique(); } -#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \ - defined(_M_X64) - // Error detection uses crc32 instruction added by the SSE4.2 extension to x86 - if (_errorDetection) { - if (!_environment->topology().featuresAsmjit().has(asmjit::CpuFeatures::X86::kSSE4_2)) { - throw std::invalid_argument("Option --error-detection requires the crc32 " - "instruction added with SSE_4_2.\n"); + Environment->evaluateCpuAffinity(Cfg.RequestedNumThreads, Cfg.CpuBind); + + if constexpr (firestarter::OptionalFeatures.IsX86) { + // Error detection uses crc32 instruction added by the SSE4.2 extension to x86 + if (Cfg.ErrorDetection) { + const auto& X86Env = *dynamic_cast(Environment.get()); + if (!X86Env.topology().featuresAsmjit().has(asmjit::CpuFeatures::X86::kSSE4_2)) { + throw std::invalid_argument("Option --error-detection requires the crc32 " + "instruction added with SSE_4_2.\n"); + } } } -#endif - if (_errorDetection && this->environment().requestedNumThreads() < 2) { - throw std::invalid_argument( - "Option --error-detection must run with 2 or more threads. Number of " - "threads is " + - std::to_string(this->environment().requestedNumThreads()) + "\n"); + if (Cfg.ErrorDetection && Environment->requestedNumThreads() < 2) { + throw std::invalid_argument("Option --error-detection must run with 2 or more threads. 
Number of " + "threads is " + + std::to_string(Environment->requestedNumThreads()) + "\n"); } - this->environment().evaluateFunctions(); - - if (printFunctionSummary) { - this->environment().printFunctionSummary(); - std::exit(EXIT_SUCCESS); + if (Cfg.PrintFunctionSummary) { + Environment->printFunctionSummary(); + safeExit(EXIT_SUCCESS); } - if (EXIT_SUCCESS != (returnCode = this->environment().selectFunction( - functionId, allowUnavailablePayload))) { - std::exit(returnCode); - } + Environment->selectFunction(Cfg.FunctionId, Cfg.AllowUnavailablePayload); - if (listInstructionGroups) { - this->environment().printAvailableInstructionGroups(); - std::exit(EXIT_SUCCESS); + if (Cfg.ListInstructionGroups) { + Environment->printAvailableInstructionGroups(); + safeExit(EXIT_SUCCESS); } - if (!instructionGroups.empty()) { - if (EXIT_SUCCESS != - (returnCode = - this->environment().selectInstructionGroups(instructionGroups))) { - std::exit(returnCode); - } + if (!Cfg.InstructionGroups.empty()) { + Environment->selectInstructionGroups(Cfg.InstructionGroups); } - if (lineCount != 0) { - this->environment().setLineCount(lineCount); + if (Cfg.LineCount != 0) { + Environment->setLineCount(Cfg.LineCount); } -#if defined(linux) || defined(__linux__) - if (_measurement || listMetrics || _optimize) { - _measurementWorker = std::make_shared( - measurementInterval, this->environment().requestedNumThreads(), - metricPaths, stdinMetrics); + if constexpr (firestarter::OptionalFeatures.OptimizationEnabled) { + if (Cfg.Measurement || Cfg.ListMetrics || Cfg.Optimize) { + MeasurementWorker = std::make_shared( + Cfg.MeasurementInterval, Environment->requestedNumThreads(), Cfg.MetricPaths, Cfg.StdinMetrics); - if (listMetrics) { - log::info() << _measurementWorker->availableMetrics(); - std::exit(EXIT_SUCCESS); - } + if (Cfg.ListMetrics) { + log::info() << MeasurementWorker->availableMetrics(); + safeExit(EXIT_SUCCESS); + } + + // init all metrics + auto All = 
MeasurementWorker->metricNames(); + auto Initialized = MeasurementWorker->initMetrics(All); - // init all metrics - auto all = _measurementWorker->metricNames(); - auto initialized = _measurementWorker->initMetrics(all); + if (Initialized.empty()) { + throw std::invalid_argument("No metrics initialized"); + } - if (initialized.size() == 0) { - log::error() << "No metrics initialized"; - std::exit(EXIT_FAILURE); + // check if selected metrics are initialized + for (auto const& OptimizationMetric : Cfg.OptimizationMetrics) { + auto NameEqual = [OptimizationMetric](auto const& Name) { + auto InvertedName = "-" + Name; + return Name == OptimizationMetric || InvertedName == OptimizationMetric; + }; + // metric name is not found: abort with an exception instead of continuing + if (std::find_if(All.begin(), All.end(), NameEqual) == All.end()) { + throw std::invalid_argument("Metric \"" + OptimizationMetric + "\" does not exist."); + } + // metric has not initialized properly: abort with an exception instead of continuing + if (std::find_if(Initialized.begin(), Initialized.end(), NameEqual) == Initialized.end()) { + throw std::invalid_argument("Metric \"" + OptimizationMetric + "\" failed to initialize."); + } + } } - // check if selected metrics are initialized - for (auto const &optimizationMetric : optimizationMetrics) { - auto nameEqual = [optimizationMetric](auto const &name) { - auto invertedName = "-" + name; - return name.compare(optimizationMetric) == 0 || - invertedName.compare(optimizationMetric) == 0; + if (Cfg.Optimize) { + auto ApplySettings = [this](std::vector> const& Setting) { + using Clock = std::chrono::high_resolution_clock; + auto Start = Clock::now(); + + signalSwitch(Setting); + + LoadVar = LoadThreadWorkType::LoadHigh; + + signalWork(); + + uint64_t StartTimestamp = (std::numeric_limits::max)(); + uint64_t StopTimestamp = 0; + + for (auto const& Thread : LoadThreads) { + auto Td = Thread.second; + + StartTimestamp = std::min(StartTimestamp, Td->LastRun.StartTsc); + StopTimestamp = std::max(StopTimestamp, Td->LastRun.StopTsc); + } + + for (auto const& Thread 
: LoadThreads) { + auto Td = Thread.second; + IpcEstimateMetricData::insertValue( + static_cast(Td->LastRun.Iterations) * + static_cast(LoadThreads.front().second->CompiledPayloadPtr->stats().Instructions) / + static_cast(StopTimestamp - StartTimestamp)); + } + + auto End = Clock::now(); + + log::trace() << "Switching payload took " + << std::chrono::duration_cast(End - Start).count() << "ms"; }; - // metric name is not found - if (std::find_if(all.begin(), all.end(), nameEqual) == all.end()) { - log::error() << "Metric \"" << optimizationMetric - << "\" does not exist."; - std::exit(EXIT_FAILURE); - } - // metric has not initialized properly - if (std::find_if(initialized.begin(), initialized.end(), nameEqual) == - initialized.end()) { - log::error() << "Metric \"" << optimizationMetric - << "\" failed to initialize."; - std::exit(EXIT_FAILURE); + + auto Prob = std::make_shared( + std::move(ApplySettings), MeasurementWorker, Cfg.OptimizationMetrics, Cfg.EvaluationDuration, Cfg.StartDelta, + Cfg.StopDelta, Environment->config().settings().instructionGroupItems()); + + Population = std::make_unique(std::move(Prob)); + + if (Cfg.OptimizationAlgorithm == "NSGA2") { + Algorithm = + std::make_unique(Cfg.Generations, Cfg.Nsga2Cr, Cfg.Nsga2M); + } else { + throw std::invalid_argument("Algorithm " + Cfg.OptimizationAlgorithm + " unknown."); } - } - } - if (_optimize) { - auto applySettings = std::bind( - [this](std::vector> const &setting) { - using Clock = std::chrono::high_resolution_clock; - auto start = Clock::now(); - - for (auto &thread : this->loadThreads) { - auto td = thread.second; - - td->config().setPayloadSettings(setting); - } - - for (auto const &thread : this->loadThreads) { - auto td = thread.second; - - td->mutex.lock(); - } - - for (auto const &thread : this->loadThreads) { - auto td = thread.second; - - td->comm = THREAD_SWITCH; - td->mutex.unlock(); - } - - this->loadVar = LOAD_SWITCH; - - for (auto const &thread : this->loadThreads) { - auto td = 
thread.second; - bool ack; - - do { - td->mutex.lock(); - ack = td->ack; - td->mutex.unlock(); - } while (!ack); - - td->mutex.lock(); - td->ack = false; - td->mutex.unlock(); - } - - this->loadVar = LOAD_HIGH; - - this->signalWork(); - - unsigned long long startTimestamp = 0xffffffffffffffff; - unsigned long long stopTimestamp = 0; - - for (auto const &thread : this->loadThreads) { - auto td = thread.second; - - if (startTimestamp > td->lastStartTsc) { - startTimestamp = td->lastStartTsc; - } - if (stopTimestamp < td->lastStopTsc) { - stopTimestamp = td->lastStopTsc; - } - } - - for (auto const &thread : this->loadThreads) { - auto td = thread.second; - ipc_estimate_metric_insert( - (double)td->lastIterations * - (double)this->loadThreads.front() - .second->config() - .payload() - .instructions() / - (double)(stopTimestamp - startTimestamp)); - } - - auto end = Clock::now(); - - log::trace() << "Switching payload took " - << std::chrono::duration_cast( - end - start) - .count() - << "ms"; - }, - std::placeholders::_1); - - auto prob = - std::make_shared( - std::move(applySettings), _measurementWorker, _optimizationMetrics, - _evaluationDuration, _startDelta, _stopDelta, - this->environment().selectedConfig().payloadItems()); - - _population = firestarter::optimizer::Population(std::move(prob)); - - if (_optimizationAlgorithm == "NSGA2") { - _algorithm = std::make_unique( - _generations, _nsga2_cr, _nsga2_m); - } else { - throw std::invalid_argument("Algorithm " + _optimizationAlgorithm + - " unknown."); + Algorithm->check(Population->problem(), Cfg.Individuals); } - - _algorithm->checkPopulation( - static_cast(_population), - _individuals); } -#endif - this->environment().printSelectedCodePathSummary(); + Environment->printSelectedCodePathSummary(); - log::info() << this->environment().topology(); + log::info() << Environment->topology(); // setup thread with either high or low load configured at the start // low loads has to know the length of the period - if 
(EXIT_SUCCESS != (returnCode = this->initLoadWorkers((_loadPercent == 0), - _period.count()))) { - std::exit(returnCode); - } + initLoadWorkers(); // add some signal handler for aborting FIRESTARTER -#ifndef _WIN32 - std::signal(SIGALRM, Firestarter::sigalrmHandler); -#endif - - std::signal(SIGTERM, Firestarter::sigtermHandler); - std::signal(SIGINT, Firestarter::sigtermHandler); -} - -Firestarter::~Firestarter() { -#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_HIP) - _cuda.reset(); -#endif -#ifdef FIRESTARTER_BUILD_ONEAPI - _oneapi.reset(); -#endif + if constexpr (!firestarter::OptionalFeatures.IsWin32) { + (void)std::signal(SIGALRM, Firestarter::sigalrmHandler); + } - delete _environment; + (void)std::signal(SIGTERM, Firestarter::sigtermHandler); + (void)std::signal(SIGINT, Firestarter::sigtermHandler); } void Firestarter::mainThread() { - this->environment().printThreadSummary(); + Environment->printThreadSummary(); -#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_HIP) - _cuda = std::make_unique(&this->loadVar, _gpuUseFloat, - _gpuUseDouble, _gpuMatrixSize, _gpus); -#endif + Cuda = std::make_unique(LoadVar, Cfg.GpuUseFloat, Cfg.GpuUseDouble, Cfg.GpuMatrixSize, Cfg.Gpus); + Oneapi = std::make_unique(LoadVar, Cfg.GpuUseFloat, Cfg.GpuUseDouble, Cfg.GpuMatrixSize, Cfg.Gpus); -#ifdef FIRESTARTER_BUILD_ONEAPI - _oneapi = std::make_unique(&this->loadVar, _gpuUseFloat, - _gpuUseDouble, _gpuMatrixSize, _gpus); -#endif - - -#if defined(linux) || defined(__linux__) - // if measurement is enabled, start it here - if (_measurement) { - _measurementWorker->startMeasurement(); + if constexpr (firestarter::OptionalFeatures.OptimizationEnabled) { + // if measurement is enabled, start it here + if (Cfg.Measurement) { + MeasurementWorker->startMeasurement(); + } } -#endif - this->signalWork(); + signalWork(); -#ifdef FIRESTARTER_DEBUG_FEATURES - if (_dumpRegisters) { - int returnCode; - if (EXIT_SUCCESS != (returnCode = 
this->initDumpRegisterWorker( - _dumpRegistersTimeDelta, _dumpRegistersOutpath))) { - std::exit(returnCode); + if constexpr (firestarter::OptionalFeatures.DumpRegisterEnabled) { + if (Cfg.DumpRegisters) { + initDumpRegisterWorker(); } } -#endif // worker thread for load control - this->watchdogWorker(_period, _load, _timeout); + watchdogWorker(Cfg.Period, Cfg.Load, Cfg.Timeout); -#if defined(linux) || defined(__linux__) - // check if optimization is selected - if (_optimize) { - auto startTime = optimizer::History::getTime(); + if constexpr (firestarter::OptionalFeatures.OptimizationEnabled) { + // check if optimization is selected + if (Cfg.Optimize) { + auto StartTime = optimizer::History::getTime(); - Firestarter::_optimizer = std::make_unique( - std::move(_algorithm), _population, _optimizationAlgorithm, - _individuals, _preheat); + Firestarter::Optimizer = std::make_unique(std::move(Algorithm), std::move(Population), + Cfg.Individuals, Cfg.Preheat); - // wait here until optimizer thread terminates - Firestarter::_optimizer->join(); + // wait here until optimizer thread terminates + Firestarter::Optimizer->join(); + Firestarter::Optimizer.reset(); - auto payloadItems = this->environment().selectedConfig().payloadItems(); + auto PayloadItems = Environment->config().settings().instructionGroupItems(); - firestarter::optimizer::History::save(_optimizeOutfile, startTime, - payloadItems, _argc, _argv); + firestarter::optimizer::History::save(Cfg.OptimizeOutfile, StartTime, PayloadItems, Cfg.Argc, Cfg.Argv); - // print the best 20 according to each metric - firestarter::optimizer::History::printBest(_optimizationMetrics, - payloadItems); + // print the best 20 according to each metric + firestarter::optimizer::History::printBest(Cfg.OptimizationMetrics, PayloadItems); - // stop all the load threads - std::raise(SIGTERM); + // stop all the load threads + (void)std::raise(SIGTERM); + } } -#endif // wait for watchdog to timeout or until user terminates - 
this->joinLoadWorkers(); -#ifdef FIRESTARTER_DEBUG_FEATURES - if (_dumpRegisters) { - this->joinDumpRegisterWorker(); + joinLoadWorkers(); + if constexpr (firestarter::OptionalFeatures.DumpRegisterEnabled) { + if (Cfg.DumpRegisters) { + joinDumpRegisterWorker(); + } } -#endif - if (!_optimize) { - this->printPerformanceReport(); + if (!Cfg.Optimize) { + printPerformanceReport(); } -#if defined(linux) || defined(__linux__) - // if measurment is enabled, stop it here - if (_measurement) { - // TODO: clear this up - log::info() << "metric,num_timepoints,duration_ms,average,stddev"; - for (auto const &[name, sum] : - _measurementWorker->getValues(_startDelta, _stopDelta)) { - log::info() << std::quoted(name) << "," << sum.num_timepoints << "," - << sum.duration.count() << "," << sum.average << "," - << sum.stddev; + if constexpr (firestarter::OptionalFeatures.OptimizationEnabled) { + // if measurment is enabled, stop it here + if (Cfg.Measurement) { + // TODO(Issue #77): clear this up + log::info() << "metric,num_timepoints,duration_ms,average,stddev"; + for (auto const& [name, sum] : MeasurementWorker->getValues(Cfg.StartDelta, Cfg.StopDelta)) { + log::info() << std::quoted(name) << "," << sum.NumTimepoints << "," << sum.Duration.count() << "," + << sum.Average << "," << sum.Stddev; + } } } -#endif - if (_errorDetection) { - this->printThreadErrorReport(); + if (Cfg.ErrorDetection) { + printThreadErrorReport(); } } -void Firestarter::setLoad(unsigned long long value) { +void Firestarter::setLoad(LoadThreadWorkType Value) { // signal load change to workers - Firestarter::loadVar = value; -#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \ - defined(_M_X64) -#ifndef _MSC_VER - __asm__ __volatile__("mfence;"); -#else - _mm_mfence(); -#endif -#else -#error "FIRESTARTER is not implemented for this ISA" -#endif + Firestarter::LoadVar = Value; + if constexpr (firestarter::OptionalFeatures.IsX86) { + if constexpr (firestarter::OptionalFeatures.IsMsc) { + 
_mm_mfence(); + } else { + __asm__ __volatile__("mfence;"); + } + } } -void Firestarter::sigalrmHandler(int signum) { (void)signum; } +void Firestarter::sigalrmHandler(int Signum) { (void)Signum; } -void Firestarter::sigtermHandler(int signum) { - (void)signum; +void Firestarter::sigtermHandler(int Signum) { + (void)Signum; - Firestarter::setLoad(LOAD_STOP); + Firestarter::setLoad(LoadThreadWorkType::LoadStop); // exit loop // used in case of 0 < load < 100 // or interrupt sleep for timeout { - std::lock_guard lk(Firestarter::_watchdogTerminateMutex); - Firestarter::_watchdog_terminate = true; + const std::lock_guard Lk(Firestarter::WatchdogTerminateMutex); + Firestarter::WatchdogTerminate = true; } - Firestarter::_watchdogTerminateAlert.notify_all(); + Firestarter::WatchdogTerminateAlert.notify_all(); -#if defined(linux) || defined(__linux__) - // if we have optimization running stop it - if (Firestarter::_optimizer) { - Firestarter::_optimizer->kill(); + if constexpr (firestarter::OptionalFeatures.OptimizationEnabled) { + // if we have optimization running stop it + if (Firestarter::Optimizer) { + Firestarter::Optimizer->kill(); + } } -#endif } + +} // namespace firestarter \ No newline at end of file diff --git a/src/firestarter/LoadWorker.cpp b/src/firestarter/LoadWorker.cpp index 3c922cf6..4d473832 100644 --- a/src/firestarter/LoadWorker.cpp +++ b/src/firestarter/LoadWorker.cpp @@ -19,14 +19,15 @@ * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ -#include -#include -#include +#include "firestarter/AlignedAlloc.hpp" +#include "firestarter/Constants.hpp" +#include "firestarter/ErrorDetectionStruct.hpp" +#include "firestarter/Firestarter.hpp" +#include "firestarter/LoadWorkerData.hpp" +#include "firestarter/Logging/Log.hpp" #if defined(linux) || defined(__linux__) -extern "C" { -#include -} +#include "firestarter/Measurement/Metric/IPCEstimate.hpp" #endif #ifdef ENABLE_VTRACING @@ -37,146 
+38,135 @@ extern "C" { #endif #include +#include #include -#include +#include +#include +#include #include -using namespace firestarter; - -auto aligned_free_deleter = [](void *p) { ALIGNED_FREE(p); }; +namespace firestarter { -int Firestarter::initLoadWorkers(bool lowLoad, unsigned long long period) { - int returnCode; - - if (EXIT_SUCCESS != (returnCode = this->environment().setCpuAffinity(0))) { - return EXIT_FAILURE; - } +void Firestarter::initLoadWorkers() { + Environment->setCpuAffinity(0); // setup load variable to execute low or high load once the threads switch to // work. - this->loadVar = lowLoad ? LOAD_LOW : LOAD_HIGH; + LoadVar = Cfg.Load == std::chrono::microseconds::zero() ? LoadThreadWorkType::LoadLow : LoadThreadWorkType::LoadHigh; - auto numThreads = this->environment().requestedNumThreads(); + auto NumThreads = Environment->requestedNumThreads(); // create a std::vector> of requestenNumThreads() // communication pointers and add these to the threaddata - if (_errorDetection) { - for (unsigned long long i = 0; i < numThreads; i++) { - auto commPtr = reinterpret_cast( - ALIGNED_MALLOC(2 * sizeof(unsigned long long), 64)); - assert(commPtr); - this->errorCommunication.push_back( - std::shared_ptr(commPtr, aligned_free_deleter)); - log::debug() << "Threads " << (i + numThreads - 1) % numThreads << " and " - << i << " commPtr = 0x" << std::setfill('0') - << std::setw(sizeof(unsigned long long) * 2) << std::hex - << (unsigned long long)commPtr; + if (Cfg.ErrorDetection) { + for (uint64_t I = 0; I < NumThreads; I++) { + auto* CommPtr = static_cast(AlignedAlloc::malloc(2 * sizeof(uint64_t))); + assert(CommPtr); + ErrorCommunication.emplace_back(std::shared_ptr(CommPtr, AlignedAlloc::free)); + log::debug() << "Threads " << (I + NumThreads - 1) % NumThreads << " and " << I << " commPtr = 0x" + << std::setfill('0') << std::setw(sizeof(uint64_t) * 2) + << std::hex + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) + << 
reinterpret_cast(CommPtr); } } - for (unsigned long long i = 0; i < numThreads; i++) { - auto td = std::make_shared(i, this->environment(), - &this->loadVar, period, - _dumpRegisters, _errorDetection); + for (uint64_t I = 0; I < NumThreads; I++) { + auto Td = std::make_shared(I, std::cref(*Environment), std::ref(LoadVar), Cfg.Period, + Cfg.DumpRegisters, Cfg.ErrorDetection); - if (_errorDetection) { + if (Cfg.ErrorDetection) { // distribute pointers for error deteciton. (set threads in a ring) // give this thread the left pointer i and right pointer (i+1) % // requestedNumThreads(). - td->setErrorCommunication(this->errorCommunication[i], - this->errorCommunication[(i + 1) % numThreads]); + Td->setErrorCommunication(ErrorCommunication[I], ErrorCommunication[(I + 1) % NumThreads]); } - auto dataCacheSizeIt = - td->config().platformConfig().dataCacheBufferSize().begin(); - auto ramBufferSize = td->config().platformConfig().ramBufferSize(); - - td->buffersizeMem = (*dataCacheSizeIt + *std::next(dataCacheSizeIt, 1) + - *std::next(dataCacheSizeIt, 2) + ramBufferSize) / - td->config().thread() / sizeof(unsigned long long); + Td->BuffersizeMem = Td->config().settings().totalBufferSizePerThread() / sizeof(uint64_t); // create the thread - std::thread t(Firestarter::loadThreadWorker, td); + std::thread T(Firestarter::loadThreadWorker, Td); - log::trace() << "Created thread #" << i << " with ID: " << t.get_id(); + log::trace() << "Created thread #" << I << " with ID: " << T.get_id(); - if (i == 0) { + if (I == 0) { // only show error for all worker threads except first. 
- firestarter::logging::FirstWorkerThreadFilter< - firestarter::logging::record>::setFirstThread(t.get_id()); + firestarter::logging::FirstWorkerThreadFilter::setFirstThread(T.get_id()); } - this->loadThreads.push_back(std::make_pair(std::move(t), td)); + LoadThreads.emplace_back(std::move(T), Td); } - this->signalLoadWorkers(THREAD_INIT); - - return EXIT_SUCCESS; + signalLoadWorkers(LoadThreadState::ThreadInit); } -void Firestarter::signalLoadWorkers(int comm) { - bool ack; - - // start the work - for (auto const &thread : this->loadThreads) { - auto td = thread.second; +void Firestarter::signalLoadWorkers(const LoadThreadState State, void (*Function)()) { + // aquire the lock on all threads + for (auto const& Thread : LoadThreads) { + auto Td = Thread.second; - td->mutex.lock(); + Td->Communication.Mutex.lock(); } - for (auto const &thread : this->loadThreads) { - auto td = thread.second; + // switch the state on all threads + for (auto const& Thread : LoadThreads) { + auto Td = Thread.second; - td->comm = comm; - td->mutex.unlock(); + Td->Communication.State = State; + Td->Communication.Mutex.unlock(); } - for (auto const &thread : this->loadThreads) { - auto td = thread.second; + // Execute a function after the state in the threads has been updated. This may be required to terminate an inner + // loop. 
+ if (Function) { + Function(); + } + + // wait for all threads to finish + for (auto const& Thread : LoadThreads) { + auto Td = Thread.second; - do { - td->mutex.lock(); - ack = td->ack; - td->mutex.unlock(); - } while (!ack); + // Wait until we receive the acknowledge + for (bool Ack = false; !Ack;) { + Td->Communication.Mutex.lock(); + Ack = Td->Communication.Ack; + Td->Communication.Mutex.unlock(); + } - td->mutex.lock(); - td->ack = false; - td->mutex.unlock(); + Td->Communication.Mutex.lock(); + Td->Communication.Ack = false; + Td->Communication.Mutex.unlock(); } } void Firestarter::joinLoadWorkers() { // wait for threads after watchdog has requested termination - for (auto &thread : this->loadThreads) { - thread.first.join(); + for (auto& Thread : LoadThreads) { + Thread.first.join(); } } void Firestarter::printThreadErrorReport() { - if (_errorDetection) { - auto maxSize = this->loadThreads.size(); + if (Cfg.ErrorDetection) { + auto MaxSize = LoadThreads.size(); - std::vector errors(maxSize, false); + std::vector Errors(MaxSize, false); - for (decltype(maxSize) i = 0; i < maxSize; i++) { - auto errorDetectionStruct = - this->loadThreads[i].second->errorDetectionStruct(); + for (decltype(MaxSize) I = 0; I < MaxSize; I++) { + const auto& ErrorDetectionStructPtr = LoadThreads[I].second->errorDetectionStruct(); - if (errorDetectionStruct->errorLeft) { - errors[(i + maxSize - 1) % maxSize] = true; + if (ErrorDetectionStructPtr.Left.Error) { + Errors[(I + MaxSize - 1) % MaxSize] = true; } - if (errorDetectionStruct->errorRight) { - errors[i] = true; + if (ErrorDetectionStructPtr.Right.Error) { + Errors[I] = true; } } - for (decltype(maxSize) i = 0; i < maxSize; i++) { - if (errors[i]) { - log::fatal() - << "Data mismatch between Threads " << i << " and " - << (i + 1) % maxSize - << ".\n This may be caused by bit-flips in the hardware."; + for (decltype(MaxSize) I = 0; I < MaxSize; I++) { + if (Errors[I]) { + log::fatal() << "Data mismatch between Threads " << I 
<< " and " << (I + 1) % MaxSize + << ".\n This may be caused by bit-flips in the hardware."; } } } @@ -184,168 +174,138 @@ void Firestarter::printThreadErrorReport() { void Firestarter::printPerformanceReport() { // performance report - unsigned long long startTimestamp = 0xffffffffffffffff; - unsigned long long stopTimestamp = 0; + uint64_t StartTimestamp = (std::numeric_limits::max)(); + uint64_t StopTimestamp = 0; - unsigned long long iterations = 0; + uint64_t Iterations = 0; log::debug() << "\nperformance report:\n"; - for (auto const &thread : this->loadThreads) { - auto td = thread.second; + for (auto const& Thread : LoadThreads) { + auto Td = Thread.second; - log::debug() << "Thread " << td->id() << ": " << td->iterations - << " iterations, tsc_delta: " << td->stopTsc - td->startTsc; + log::debug() << "Thread " << Td->id() << ": " << Td->LastRun.Iterations + << " iterations, tsc_delta: " << Td->LastRun.StopTsc - Td->LastRun.StartTsc; - if (startTimestamp > td->startTsc) { - startTimestamp = td->startTsc; - } - if (stopTimestamp < td->stopTsc) { - stopTimestamp = td->stopTsc; - } + StartTimestamp = (std::min)(StartTimestamp, Td->LastRun.StartTsc.load()); + StopTimestamp = (std::max)(StopTimestamp, Td->LastRun.StopTsc.load()); - iterations += td->iterations; + Iterations += Td->LastRun.Iterations.load(); } - double runtime = (double)(stopTimestamp - startTimestamp) / - (double)this->environment().topology().clockrate(); - double gFlops = - (double)this->loadThreads.front().second->config().payload().flops() * - 0.000000001 * (double)iterations / runtime; - double bandwidth = - (double)this->loadThreads.front().second->config().payload().bytes() * - 0.000000001 * (double)iterations / runtime; + double const Runtime = + static_cast(StopTimestamp - StartTimestamp) / static_cast(Environment->topology().clockrate()); + double const GFlops = static_cast(LoadThreads.front().second->CompiledPayloadPtr->stats().Flops) * + 0.000000001 * static_cast(Iterations) / 
Runtime; + double const Bandwidth = static_cast(LoadThreads.front().second->CompiledPayloadPtr->stats().Bytes) * + 0.000000001 * static_cast(Iterations) / Runtime; // insert values for ipc-estimate metric // if we are on linux #if defined(linux) || defined(__linux__) - if (_measurement) { - for (auto const &thread : this->loadThreads) { - auto td = thread.second; - ipc_estimate_metric_insert((double)td->iterations * - (double)this->loadThreads.front() - .second->config() - .payload() - .instructions() / - (double)(stopTimestamp - startTimestamp)); + if (Cfg.Measurement) { + for (auto const& Thread : LoadThreads) { + auto Td = Thread.second; + IpcEstimateMetricData::insertValue( + static_cast(Td->LastRun.Iterations) * + static_cast(LoadThreads.front().second->CompiledPayloadPtr->stats().Instructions) / + static_cast(StopTimestamp - StartTimestamp)); } } #endif - // format runtime, gflops and bandwidth %.2f - const char *fmt = "%.2f"; - int size; - -#define FORMAT(input) \ - size = std::snprintf(nullptr, 0, fmt, input); \ - std::vector input##Vector(size + 1); \ - std::snprintf(&input##Vector[0], input##Vector.size(), fmt, input); \ - auto input##String = std::string(&input##Vector[0]) - - FORMAT(runtime); - FORMAT(gFlops); - FORMAT(bandwidth); - -#undef FORMAT - - log::debug() - << "\n" - << "total iterations: " << iterations << "\n" - << "runtime: " << runtimeString << " seconds (" - << stopTimestamp - startTimestamp << " cycles)\n" - << "\n" - << "estimated floating point performance: " << gFlopsString << " GFLOPS\n" - << "estimated memory bandwidth*: " << bandwidthString << " GB/s\n" - << "\n" - << "* this estimate is highly unreliable if --function is used in order " - "to " - "select\n" - << " a function that is not optimized for your architecture, or if " - "FIRESTARTER is\n" - << " executed on an unsupported architecture!"; + // format runtime, gflops and bandwidth with two decimal places + const auto FormatString = [](double Value) -> std::string { + 
std::stringstream Ss; + Ss << std::fixed << std::setprecision(2) << Value; + return Ss.str(); + }; + + log::debug() << "\n" + << "total iterations: " << Iterations << "\n" + << "runtime: " << FormatString(Runtime) << " seconds (" << StopTimestamp - StartTimestamp << " cycles)\n" + << "\n" + << "estimated floating point performance: " << FormatString(GFlops) << " GFLOPS\n" + << "estimated memory bandwidth*: " << FormatString(Bandwidth) << " GB/s\n" + << "\n" + << "* this estimate is highly unreliable if --function is used in order " + "to " + "select\n" + << " a function that is not optimized for your architecture, or if " + "FIRESTARTER is\n" + << " executed on an unsupported architecture!"; } -void Firestarter::loadThreadWorker(std::shared_ptr td) { +void Firestarter::loadThreadWorker(const std::shared_ptr& Td) { - int old = THREAD_WAIT; + auto OldState = LoadThreadState::ThreadWait; #if defined(linux) || defined(__linux__) pthread_setname_np(pthread_self(), "LoadWorker"); #endif for (;;) { - td->mutex.lock(); - int comm = td->comm; - td->mutex.unlock(); + Td->Communication.Mutex.lock(); + auto CurState = Td->Communication.State; + Td->Communication.Mutex.unlock(); - if (comm != old) { - old = comm; + if (CurState != OldState) { + OldState = CurState; - td->mutex.lock(); - td->ack = true; - td->mutex.unlock(); + Td->Communication.Mutex.lock(); + Td->Communication.Ack = true; + Td->Communication.Mutex.unlock(); } else { std::this_thread::sleep_for(std::chrono::microseconds(1)); continue; } - switch (comm) { + switch (CurState) { // allocate and initialize memory - case THREAD_INIT: + case LoadThreadState::ThreadInit: // set affinity - td->environment().setCpuAffinity(td->id()); + Td->environment().setCpuAffinity(Td->id()); // compile payload - td->config().payload().compilePayload( - td->config().payloadSettings(), td->config().instructionCacheSize(), - td->config().dataCacheBufferSize(), td->config().ramBufferSize(), - td->config().thread(), td->config().lines(), 
td->dumpRegisters, - td->errorDetection); + Td->CompiledPayloadPtr = + Td->config().payload()->compilePayload(Td->config().settings(), Td->DumpRegisters, Td->ErrorDetection); // allocate memory // if we should dump some registers, we use the first part of the memory // for them. - td->addrMem = - reinterpret_cast(ALIGNED_MALLOC( - (td->buffersizeMem + td->addrOffset) * sizeof(unsigned long long), - 64)) + - td->addrOffset; + Td->Memory = LoadWorkerMemory::allocate(Td->BuffersizeMem * sizeof(uint64_t)); // exit application on error - if (td->addrMem - td->addrOffset == nullptr) { - workerLog::error() << "Could not allocate memory for CPU load thread " - << td->id() << "\n"; - exit(ENOMEM); + if (Td->Memory == nullptr) { + workerLog::error() << "Could not allocate memory for CPU load thread " << Td->id() << "\n"; } - if (td->dumpRegisters) { - reinterpret_cast(td->addrMem - td->addrOffset) - ->dumpVar = DumpVariable::Wait; + if (Td->DumpRegisters) { + Td->dumpRegisterStruct().DumpVar = DumpVariable::Wait; } - if (td->errorDetection) { - auto errorDetectionStruct = reinterpret_cast( - td->addrMem - td->addrOffset); + if (Td->ErrorDetection) { + auto& ErrorDetectionStructRef = Td->errorDetectionStruct(); - std::memset(errorDetectionStruct, 0, sizeof(ErrorDetectionStruct)); + std::memset(&ErrorDetectionStructRef, 0, sizeof(ErrorDetectionStruct)); // distribute left and right communication pointers - errorDetectionStruct->communicationLeft = td->communicationLeft.get(); - errorDetectionStruct->communicationRight = td->communicationRight.get(); + ErrorDetectionStructRef.Left.Communication = Td->CommunicationLeft.get(); + ErrorDetectionStructRef.Right.Communication = Td->CommunicationRight.get(); // do first touch memset 0 for the communication pointers - std::memset((void *)errorDetectionStruct->communicationLeft, 0, - sizeof(unsigned long long) * 2); + std::memset(static_cast(ErrorDetectionStructRef.Left.Communication), 0, sizeof(uint64_t) * 2); } // call init function - 
td->config().payload().init(td->addrMem, td->buffersizeMem); + Td->CompiledPayloadPtr->init(Td->Memory->getMemoryAddress(), Td->BuffersizeMem); break; // perform stress test - case THREAD_WORK: + case LoadThreadState::ThreadWork: + Td->CurrentRun.Iterations = 0; // record threads start timestamp - td->startTsc = td->environment().topology().timestamp(); + Td->CurrentRun.StartTsc = Td->environment().topology().timestamp(); // will be terminated by watchdog for (;;) { @@ -354,11 +314,10 @@ void Firestarter::loadThreadWorker(std::shared_ptr td) { VT_USER_START("HIGH_LOAD_FUNC"); #endif #ifdef ENABLE_SCOREP - SCOREP_USER_REGION_BY_NAME_BEGIN("HIGH", - SCOREP_USER_REGION_TYPE_COMMON); + SCOREP_USER_REGION_BY_NAME_BEGIN("HIGH", SCOREP_USER_REGION_TYPE_COMMON); #endif - td->iterations = td->config().payload().highLoadFunction( - td->addrMem, td->addrHigh, td->iterations); + Td->CurrentRun.Iterations = Td->CompiledPayloadPtr->highLoadFunction(Td->Memory->getMemoryAddress(), + Td->LoadVar, Td->CurrentRun.Iterations); // call low load function #ifdef ENABLE_VTRACING @@ -369,7 +328,7 @@ void Firestarter::loadThreadWorker(std::shared_ptr td) { SCOREP_USER_REGION_BY_NAME_END("HIGH"); SCOREP_USER_REGION_BY_NAME_BEGIN("LOW", SCOREP_USER_REGION_TYPE_COMMON); #endif - td->config().payload().lowLoadFunction(td->addrHigh, td->period); + Td->CompiledPayloadPtr->lowLoadFunction(Td->LoadVar, Td->Period); #ifdef ENABLE_VTRACING VT_USER_END("LOW_LOAD_FUNC"); #endif @@ -378,41 +337,33 @@ void Firestarter::loadThreadWorker(std::shared_ptr td) { #endif // terminate if master signals end of run and record stop timestamp - if (*td->addrHigh == LOAD_STOP) { - td->stopTsc = td->environment().topology().timestamp(); + if (Td->LoadVar == LoadThreadWorkType::LoadStop) { + Td->CurrentRun.StopTsc = Td->environment().topology().timestamp(); + Td->LastRun = Td->CurrentRun; return; } - if (*td->addrHigh == LOAD_SWITCH) { - td->stopTsc = td->environment().topology().timestamp(); + if (Td->LoadVar == 
LoadThreadWorkType::LoadSwitch) { + Td->CurrentRun.StopTsc = Td->environment().topology().timestamp(); + Td->LastRun = Td->CurrentRun; break; } } break; - case THREAD_SWITCH: + case LoadThreadState::ThreadSwitch: // compile payload - td->config().payload().compilePayload( - td->config().payloadSettings(), td->config().instructionCacheSize(), - td->config().dataCacheBufferSize(), td->config().ramBufferSize(), - td->config().thread(), td->config().lines(), td->dumpRegisters, - td->errorDetection); + Td->CompiledPayloadPtr = + Td->config().payload()->compilePayload(Td->config().settings(), Td->DumpRegisters, Td->ErrorDetection); // call init function - td->config().payload().init(td->addrMem, td->buffersizeMem); - - // save old iteration count - td->lastIterations = td->iterations; - td->lastStartTsc = td->startTsc; - td->lastStopTsc = td->stopTsc; - td->iterations = 0; + Td->CompiledPayloadPtr->init(Td->Memory->getMemoryAddress(), Td->BuffersizeMem); break; - case THREAD_WAIT: + case LoadThreadState::ThreadWait: break; - case THREAD_STOP: - default: - return; } } } + +} // namespace firestarter \ No newline at end of file diff --git a/src/firestarter/Main.cpp b/src/firestarter/Main.cpp index 844052d5..24269db3 100644 --- a/src/firestarter/Main.cpp +++ b/src/firestarter/Main.cpp @@ -19,484 +19,35 @@ * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ -#include -#include - -#include - -#include - -struct Config { - inline static const std::vector> - optionsMap = {{"information", "Information Options:\n"}, - {"general", "General Options:\n"}, - {"specialized-workloads", "Specialized workloads:\n"}, -#ifdef FIRESTARTER_DEBUG_FEATURES - {"debug", "Debugging:\n"}, -#endif -#if defined(linux) || defined(__linux__) - {"measurement", "Measurement:\n"}, - {"optimization", "Optimization:\n"} -#endif - }; - - // default parameters - std::chrono::seconds timeout; - unsigned loadPercent; - 
std::chrono::microseconds period; - unsigned requestedNumThreads; - std::string cpuBind = ""; - bool printFunctionSummary; - unsigned functionId; - bool listInstructionGroups; - std::string instructionGroups; - unsigned lineCount = 0; - // debug features - bool allowUnavailablePayload = false; - bool dumpRegisters = false; - std::chrono::seconds dumpRegistersTimeDelta = std::chrono::seconds(0); - std::string dumpRegistersOutpath = ""; - bool errorDetection = false; - // CUDA parameters - int gpus = 0; - unsigned gpuMatrixSize = 0; - bool gpuUseFloat = false; - bool gpuUseDouble = false; - // linux features - bool listMetrics = false; - bool measurement = false; - std::chrono::milliseconds startDelta = std::chrono::milliseconds(0); - std::chrono::milliseconds stopDelta = std::chrono::milliseconds(0); - std::chrono::milliseconds measurementInterval = std::chrono::milliseconds(0); - std::vector stdinMetrics; - // linux and dynamic linked binary - std::vector metricPaths; - - // optimization - bool optimize = false; - std::chrono::seconds preheat; - std::string optimizationAlgorithm; - std::vector optimizationMetrics; - std::chrono::seconds evaluationDuration; - unsigned individuals; - std::string optimizeOutfile = ""; - unsigned generations; - double nsga2_cr; - double nsga2_m; - - Config(int argc, const char **argv); -}; - -void print_copyright() { - firestarter::log::info() - << "This program is free software: you can redistribute it and/or " - "modify\n" - << "it under the terms of the GNU General Public License as published " - "by\n" - << "the Free Software Foundation, either version 3 of the License, or\n" - << "(at your option) any later version.\n" - << "\n" - << "You should have received a copy of the GNU General Public License\n" - << "along with this program. 
If not, see " - ".\n"; -} - -void print_warranty() { - firestarter::log::info() - << "This program is distributed in the hope that it will be useful,\n" - << "but WITHOUT ANY WARRANTY; without even the implied warranty of\n" - << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n" - << "GNU General Public License for more details.\n" - << "\n" - << "You should have received a copy of the GNU General Public License\n" - << "along with this program. If not, see " - ".\n"; -} - -void print_help(cxxopts::Options const &parser, std::string const §ion) { - std::vector> options( - Config::optionsMap.size()); - - if (section.size() == 0) { - std::copy(Config::optionsMap.begin(), Config::optionsMap.end(), - options.begin()); - } else { - auto findSection = [&](std::pair const &pair) { - return pair.first == section; - }; - auto it = std::copy_if(Config::optionsMap.begin(), Config::optionsMap.end(), - options.begin(), findSection); - options.resize(std::distance(options.begin(), it)); - } - - // clang-format off - firestarter::log::info() - << parser.help(options) - << "Examples:\n" - << " ./FIRESTARTER starts FIRESTARTER without timeout\n" - << " ./FIRESTARTER -t 300 starts a 5 minute run of FIRESTARTER\n" - << " ./FIRESTARTER -l 50 -t 600 starts a 10 minute run of FIRESTARTER with\n" - << " 50\% high load and 50\% idle time\n" -#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_ONEAPI) || defined(FIRESTARTER_BUILD_HIP) - << " on CPUs and full load on GPUs\n" -#endif - << " ./FIRESTARTER -l 75 -p 20000000\n" - << " starts FIRESTARTER with an interval length\n" - << " of 2 sec, 1.5s high load" -#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_ONEAPI) || defined(FIRESTARTER_BUILD_HIP) - << " on CPUs and full load on GPUs\n" -#else - << "\n" -#endif -#if defined(linux) || defined(__linux__) - << " ./FIRESTARTER --measurement --start-delta=300000 -t 900\n" - << " starts FIRESTARTER measuring all available\n" - << " metrics for 15 
minutes disregarding the first\n" - << " 5 minutes and last two seconds (default to `--stop-delta`)\n" - << " ./FIRESTARTER -t 20 --optimize=NSGA2 --optimization-metric sysfs-powercap-rapl,perf-ipc\n" - << " starts FIRESTARTER optimizing with the sysfs-powercap-rapl\n" - << " and perf-ipc metric. The duration is 20s long. The default\n" - << " instruction groups for the current platform will be used.\n" -#endif - ; - // clang-format on -} - -Config::Config(int argc, const char **argv) { - - cxxopts::Options parser(argv[0]); - - // clang-format off - parser.add_options("information") - ("h,help", "Display usage information. SECTION can be any of: information | general | specialized-workloads" -#ifdef FIRESTARTER_DEBUG_FEATURES - " | debug" -#endif -#if defined(linux) || defined(__linux__) - "\n| measurement | optimization" -#endif - , - cxxopts::value()->implicit_value(""), "SECTION") - ("v,version", "Display version information") - ("c,copyright", "Display copyright information") - ("w,warranty", "Display warranty information") - ("q,quiet", "Set log level to Warning") - ("r,report", "Display additional information (overridden by -q)") - ("debug", "Print debug output") - ("a,avail", "List available functions"); - - parser.add_options("general") - ("i,function", "Specify integer ID of the load-function to be\nused (as listed by --avail)", - cxxopts::value()->default_value("0"), "ID") -#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_ONEAPI) || defined(FIRESTARTER_BUILD_HIP) - ("f,usegpufloat", "Use single precision matrix multiplications\ninstead of default") - ("d,usegpudouble", "Use double precision matrix multiplications\ninstead of default") - ("g,gpus", "Number of gpus to use, default: -1 (all)", - cxxopts::value()->default_value("-1")) - ("m,matrixsize", "Size of the matrix to calculate, default: 0 (maximum)", - cxxopts::value()->default_value("0")) -#endif - ("t,timeout", "Set the timeout (seconds) after which FIRESTARTER\nterminates itself, 
default: 0 (no timeout)", - cxxopts::value()->default_value("0"), "TIMEOUT") - ("l,load", "Set the percentage of high CPU load to LOAD\n(%) default: 100, valid values: 0 <= LOAD <=\n100, threads will be idle in the remaining time,\nfrequency of load changes is determined by -p." -#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_ONEAPI) || defined(FIRESTARTER_BUILD_HIP) - " This option does NOT influence the GPU\nworkload!" -#endif - , cxxopts::value()->default_value("100"), "LOAD") - ("p,period", "Set the interval length for CPUs to PERIOD\n(usec), default: 100000, each interval contains\na high load and an idle phase, the percentage\nof high load is defined by -l.", - cxxopts::value()->default_value("100000"), "PERIOD") - ("n,threads", "Specify the number of threads. Cannot be\ncombined with -b | --bind, which impicitly\nspecifies the number of threads.", - cxxopts::value()->default_value("0"), "COUNT") -#if (defined(linux) || defined(__linux__)) && defined(FIRESTARTER_THREAD_AFFINITY) - ("b,bind", "Select certain CPUs. CPULIST format: \"x,y,z\",\n\"x-y\", \"x-y/step\", and any combination of the\nabove. Cannot be combined with -n | --threads.", - cxxopts::value()->default_value(""), "CPULIST") -#endif - ("error-detection", "Enable error detection. This aborts execution when the calculated data is corruped by errors. FIRESTARTER must run with 2 or more threads for this feature. Cannot be used with -l | --load and --optimize."); - - parser.add_options("specialized-workloads") - ("list-instruction-groups", "List the available instruction groups for the\npayload of the current platform.") - ("run-instruction-groups", "Run the payload with the specified\ninstruction groups. 
GROUPS format: multiple INST:VAL\npairs comma-seperated.", - cxxopts::value()->default_value(""), "GROUPS") - ("set-line-count", "Set the number of lines for a payload.", - cxxopts::value()); - -#ifdef FIRESTARTER_DEBUG_FEATURES - parser.add_options("debug") - ("allow-unavailable-payload", "") - ("dump-registers", "Dump the working registers on the first\nthread. Depending on the payload these are mm, xmm,\nymm or zmm. Only use it without a timeout and\n100 percent load. DELAY between dumps in secs. Cannot be used with --error-detection.", - cxxopts::value()->implicit_value("10"), "DELAY") - ("dump-registers-outpath", "Path for the dump of the output files. If\nPATH is not given, current working directory will\nbe used.", - cxxopts::value()->default_value(""), "PATH"); -#endif - -#if defined(linux) || defined(__linux__) - parser.add_options("measurement") - ("list-metrics", "List the available metrics.") -#ifndef FIRESTARTER_LINK_STATIC - ("metric-path", "Add a path to a shared library representing an interface for a metric. This option can be specified multiple times.", - cxxopts::value>()->default_value("")) -#endif - ("metric-from-stdin", "Add a metric NAME with values from stdin.\nFormat of input: \"NAME TIME_SINCE_EPOCH VALUE\\n\".\nTIME_SINCE_EPOCH is a int64 in nanoseconds. VALUE is a double. (Do not forget to flush\nlines!)", - cxxopts::value>(), "NAME") - ("measurement", "Start a measurement for the time specified by\n-t | --timeout. (The timeout must be greater\nthan the start and stop deltas.) 
Cannot be\ncombined with --optimize.") - ("measurement-interval", "Interval of measurements in milliseconds, default: 100", - cxxopts::value()->default_value("100")) - ("start-delta", "Cut of first N milliseconds of measurement, default: 5000", - cxxopts::value()->default_value("5000"), "N") - ("stop-delta", "Cut of last N milliseconds of measurement, default: 2000", - cxxopts::value()->default_value("2000"), "N") - ("preheat", "Preheat for N seconds, default: 240", - cxxopts::value()->default_value("240"), "N"); - - parser.add_options("optimization") - ("optimize", "Run the optimization with one of these algorithms: NSGA2.\nCannot be combined with --measurement.", - cxxopts::value()) - ("optimize-outfile", "Dump the output of the optimization into this\nfile, default: $PWD/$HOSTNAME_$DATE.json", - cxxopts::value()) - ("optimization-metric", "Use a metric for optimization. Metrics listed\nwith cli argument --list-metrics or specified\nwith --metric-from-stdin are valid.", - cxxopts::value>()) - ("individuals", "Number of individuals for the population. For\nNSGA2 specify at least 5 and a multiple of 4,\ndefault: 20", - cxxopts::value()->default_value("20")) - ("generations", "Number of generations, default: 20", - cxxopts::value()->default_value("20")) - ("nsga2-cr", "Crossover probability. Must be in range [0,1[\ndefault: 0.6", - cxxopts::value()->default_value("0.6")) - ("nsga2-m", "Mutation probability. 
Must be in range [0,1]\ndefault: 0.4", - cxxopts::value()->default_value("0.4")); -#endif - // clang-format on - - try { - auto options = parser.parse(argc, argv); - - if (options.count("quiet")) { - firestarter::logging::filter::set_severity( - nitro::log::severity_level::warn); - } else if (options.count("report")) { - firestarter::logging::filter::set_severity( - nitro::log::severity_level::debug); - } else if (options.count("debug")) { - firestarter::logging::filter::set_severity( - nitro::log::severity_level::trace); - } else { - firestarter::logging::filter::set_severity( - nitro::log::severity_level::info); - } - - if (options.count("version")) { - std::exit(EXIT_SUCCESS); - } - - if (options.count("copyright")) { - print_copyright(); - std::exit(EXIT_SUCCESS); - } - - if (options.count("warranty")) { - print_warranty(); - std::exit(EXIT_SUCCESS); - } - - firestarter::log::info() - << "This program comes with ABSOLUTELY NO WARRANTY; for details run `" - << argv[0] << " -w`.\n" - << "This is free software, and you are welcome to redistribute it\n" - << "under certain conditions; run `" << argv[0] - << " -c` for details.\n"; - - if (options.count("help")) { - auto section = options["help"].as(); - - // section not found - auto findSection = [&](std::pair const &pair) { - return pair.first == section; - }; - if (std::find_if(optionsMap.begin(), optionsMap.end(), findSection) == - optionsMap.end() && - section.size() != 0) { - throw std::invalid_argument("Section \"" + section + - "\" not found in help."); - } - - print_help(parser, section); - std::exit(EXIT_SUCCESS); - } - - timeout = std::chrono::seconds(options["timeout"].as()); - loadPercent = options["load"].as(); - period = std::chrono::microseconds(options["period"].as()); - - if (loadPercent > 100) { - throw std::invalid_argument("Option -l/--load may not be above 100."); - } - - errorDetection = options.count("error-detection"); - if (errorDetection && loadPercent != 100) { - throw 
std::invalid_argument("Option --error-detection may only be used " - "with -l/--load equal 100."); - } - -#ifdef FIRESTARTER_DEBUG_FEATURES - allowUnavailablePayload = options.count("allow-unavailable-payload"); - dumpRegisters = options.count("dump-registers"); - if (dumpRegisters) { - dumpRegistersTimeDelta = - std::chrono::seconds(options["dump-registers"].as()); - if (timeout != std::chrono::microseconds::zero() && loadPercent != 100) { - throw std::invalid_argument("Option --dump-registers may only be used " - "without a timeout and full load."); - } - if (errorDetection) { - throw std::invalid_argument( - "Options --dump-registers and --error-detection cannot be used " - "together."); - } - } -#endif - - requestedNumThreads = options["threads"].as(); - -#if (defined(linux) || defined(__linux__)) && \ - defined(FIRESTARTER_THREAD_AFFINITY) - cpuBind = options["bind"].as(); - if (!cpuBind.empty()) { - if (requestedNumThreads != 0) { - throw std::invalid_argument( - "Options -b/--bind and -n/--threads cannot be used together."); - } - } -#endif - -#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_ONEAPI) || defined(FIRESTARTER_BUILD_HIP) - gpuUseFloat = options.count("usegpufloat"); - gpuUseDouble = options.count("usegpudouble"); - - if (gpuUseFloat && gpuUseDouble) { - throw std::invalid_argument("Options -f/--usegpufloat and " - "-d/--usegpudouble cannot be used together."); - } - - gpuMatrixSize = options["matrixsize"].as(); - if (gpuMatrixSize > 0 && gpuMatrixSize < 64) { - throw std::invalid_argument( - "Option -m/--matrixsize may not be below 64."); - } - - gpus = options["gpus"].as(); -#endif - - printFunctionSummary = options.count("avail"); - - functionId = options["function"].as(); - - listInstructionGroups = options.count("list-instruction-groups"); - instructionGroups = options["run-instruction-groups"].as(); - if (options.count("set-line-count")) { - lineCount = options["set-line-count"].as(); - } - -#if defined(linux) || 
defined(__linux__) - startDelta = - std::chrono::milliseconds(options["start-delta"].as()); - stopDelta = std::chrono::milliseconds(options["stop-delta"].as()); - measurementInterval = std::chrono::milliseconds( - options["measurement-interval"].as()); -#ifndef FIRESTARTER_LINK_STATIC - metricPaths = options["metric-path"].as>(); -#endif - if (options.count("metric-from-stdin")) { - stdinMetrics = - options["metric-from-stdin"].as>(); - } - measurement = options.count("measurement"); - listMetrics = options.count("list-metrics"); - - if ((optimize = options.count("optimize"))) { - if (errorDetection) { - throw std::invalid_argument("Options --error-detection and --optimize " - "cannot be used together."); - } - if (measurement) { - throw std::invalid_argument( - "Options --measurement and --optimize cannot be used together."); - } - preheat = std::chrono::seconds(options["preheat"].as()); - optimizationAlgorithm = options["optimize"].as(); - if (options.count("optimization-metric")) { - optimizationMetrics = - options["optimization-metric"].as>(); - } - if (loadPercent != 100) { - throw std::invalid_argument("Options -p | --period and -l | --load are " - "not compatible with --optimize."); - } - if (timeout == std::chrono::seconds::zero()) { - throw std::invalid_argument( - "Option -t | --timeout must be specified for optimization."); - } - evaluationDuration = timeout; - // this will deactivate the watchdog worker - timeout = std::chrono::seconds::zero(); - individuals = options["individuals"].as(); - if (options.count("optimize-outfile")) { - optimizeOutfile = options["optimize-outfile"].as(); - } - generations = options["generations"].as(); - nsga2_cr = options["nsga2-cr"].as(); - nsga2_m = options["nsga2-m"].as(); - - if (optimizationAlgorithm != "NSGA2") { - throw std::invalid_argument("Option --optimize must be any of: NSGA2"); - } - } -#endif - - } catch (std::exception &e) { - firestarter::log::error() << e.what() << "\n"; - print_help(parser, ""); - 
std::exit(EXIT_FAILURE); - } -} - -int main(int argc, const char **argv) { - - firestarter::log::info() - << "FIRESTARTER - A Processor Stress Test Utility, Version " - << _FIRESTARTER_VERSION_STRING << "\n" - << "Copyright (C) " << _FIRESTARTER_BUILD_YEAR - << " TU Dresden, Center for Information Services and High Performance " - "Computing" - << "\n"; +#include "firestarter/Config.hpp" +#include "firestarter/Firestarter.hpp" +#include "firestarter/Logging/Log.hpp" + +auto main(int argc, const char** argv) -> int { + firestarter::log::info() << "FIRESTARTER - A Processor Stress Test Utility, Version " << _FIRESTARTER_VERSION_STRING + << "\n" + << "Copyright (C) " << _FIRESTARTER_BUILD_YEAR + << " TU Dresden, Center for Information Services and High Performance " + "Computing" + << "\n"; #ifdef _FIRESTARTER_VERSION_TEMPERED - firestarter::log::info() - << "*The version and/or year was explicitely set during build and does not " - << "necessarily represent the actual version.\n" - << "This helps maintainers to keep track of versions, e.g., on a cluster." - << "\n"; + firestarter::log::info() << "*The version and/or year was explicitely set during build and does not " + << "necessarily represent the actual version.\n" + << "This helps maintainers to keep track of versions, e.g., on a cluster." 
+ << "\n"; #endif - Config cfg{argc, argv}; - try { - firestarter::Firestarter firestarter( - argc, argv, cfg.timeout, cfg.loadPercent, cfg.period, - cfg.requestedNumThreads, cfg.cpuBind, cfg.printFunctionSummary, - cfg.functionId, cfg.listInstructionGroups, cfg.instructionGroups, - cfg.lineCount, cfg.allowUnavailablePayload, cfg.dumpRegisters, - cfg.dumpRegistersTimeDelta, cfg.dumpRegistersOutpath, - cfg.errorDetection, cfg.gpus, cfg.gpuMatrixSize, cfg.gpuUseFloat, - cfg.gpuUseDouble, cfg.listMetrics, cfg.measurement, cfg.startDelta, - cfg.stopDelta, cfg.measurementInterval, cfg.metricPaths, - cfg.stdinMetrics, cfg.optimize, cfg.preheat, cfg.optimizationAlgorithm, - cfg.optimizationMetrics, cfg.evaluationDuration, cfg.individuals, - cfg.optimizeOutfile, cfg.generations, cfg.nsga2_cr, cfg.nsga2_m); + firestarter::Config Cfg{argc, argv}; + + firestarter::Firestarter Firestarter(std::move(Cfg)); - firestarter.mainThread(); + Firestarter.mainThread(); - } catch (std::exception const &e) { - firestarter::log::error() << e.what(); + } catch (std::exception const& E) { + firestarter::log::error() << E.what(); return EXIT_FAILURE; } return EXIT_SUCCESS; -} +} \ No newline at end of file diff --git a/src/firestarter/Measurement/MeasurementWorker.cpp b/src/firestarter/Measurement/MeasurementWorker.cpp index 498330ab..25294e04 100644 --- a/src/firestarter/Measurement/MeasurementWorker.cpp +++ b/src/firestarter/Measurement/MeasurementWorker.cpp @@ -19,10 +19,11 @@ * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ -#include +#include "firestarter/Measurement/MeasurementWorker.hpp" +#include "firestarter/Logging/Log.hpp" +#include #include -#include #ifndef FIRESTARTER_LINK_STATIC extern "C" { @@ -30,414 +31,389 @@ extern "C" { } #endif -void insertCallback(void *cls, const char *metricName, int64_t timeSinceEpoch, - double value) { - static_cast(cls) - ->insertCallback(metricName, timeSinceEpoch, value); 
+void insertCallback(void* Cls, const char* MetricName, int64_t TimeSinceEpoch, double Value) { + static_cast(Cls)->insertCallback(MetricName, TimeSinceEpoch, Value); } -using namespace firestarter::measurement; +namespace { -MeasurementWorker::MeasurementWorker( - std::chrono::milliseconds updateInterval, unsigned long long numThreads, - std::vector const &metricDylibs, - std::vector const &stdinMetrics) - : updateInterval(updateInterval), numThreads(numThreads) { +// NOLINTBEGIN(cert-dcl50-cpp,cppcoreguidelines-pro-type-vararg,cppcoreguidelines-pro-bounds-array-to-pointer-decay,clang-analyzer-valist.Uninitialized) +auto scanStdin(const char* Fmt, int Count, ...) -> bool { + va_list Args; + va_start(Args, Count); + auto ReturnCode = std::vscanf(Fmt, Args); + va_end(Args); + return ReturnCode == Count; +} +// NOLINTEND(cert-dcl50-cpp,cppcoreguidelines-pro-type-vararg,cppcoreguidelines-pro-bounds-array-to-pointer-decay,clang-analyzer-valist.Uninitialized) + +} // namespace + +namespace firestarter::measurement { + +MeasurementWorker::MeasurementWorker(std::chrono::milliseconds UpdateInterval, uint64_t NumThreads, + std::vector const& MetricDylibsNames, + std::vector const& StdinMetricsNames) + : UpdateInterval(UpdateInterval) + , NumThreads(NumThreads) { #ifndef FIRESTARTER_LINK_STATIC // open dylibs and find metric symbol. - // create an entry in _metricDylibs with handle from dlopen and - // metric_interface_t structure. add this structe as a pointer to metrics. - for (auto const &dylib : metricDylibs) { - void *handle; - const char *filename = dylib.c_str(); + // create an entry in MetricDylibs with handle from dlopen and + // MetricInterface structure. add this structe as a pointer to metrics. 
+ for (auto const& Dylib : MetricDylibsNames) { + void* Handle = nullptr; + const char* Filename = Dylib.c_str(); - handle = dlopen(dylib.c_str(), RTLD_NOW | RTLD_LOCAL); + Handle = dlopen(Dylib.c_str(), RTLD_NOW | RTLD_LOCAL); - if (!handle) { - firestarter::log::error() << filename << ": " << dlerror(); + if (!Handle) { + firestarter::log::error() << Filename << ": " << dlerror(); continue; } // clear existing error dlerror(); - metric_interface_t *metric = nullptr; + MetricInterface* Metric = nullptr; - metric = (metric_interface_t *)dlsym(handle, "metric"); + Metric = static_cast(dlsym(Handle, "metric")); - char *error; - if ((error = dlerror()) != NULL) { - firestarter::log::error() << filename << ": " << error; - dlclose(handle); + char* Error = nullptr; + if ((Error = dlerror()) != nullptr) { + firestarter::log::error() << Filename << ": " << Error; + dlclose(Handle); continue; } - if (this->findMetricByName(metric->name) != nullptr) { - firestarter::log::error() - << "A metric named \"" << metric->name << "\" is already loaded."; - dlclose(handle); + if (findMetricByName(Metric->Name) != nullptr) { + firestarter::log::error() << "A metric named \"" << Metric->Name << "\" is already loaded."; + dlclose(Handle); continue; } // lets push our metric object and the handle - this->_metricDylibs.push_back(handle); - this->metrics.push_back(metric); + MetricDylibs.push_back(Handle); + Metrics.push_back(Metric); } #else - (void)metricDylibs; + (void)MetricDylibsNames; #endif // setup metric objects for metric names passed from stdin. 
- for (auto const &name : stdinMetrics) { - if (this->findMetricByName(name) != nullptr) { - firestarter::log::error() - << "A metric named \"" << name << "\" is already loaded."; + for (auto const& Name : StdinMetricsNames) { + if (findMetricByName(Name) != nullptr) { + firestarter::log::error() << "A metric named \"" << Name << "\" is already loaded."; continue; } - this->_stdinMetrics.push_back(name); + StdinMetrics.push_back(Name); } - std::stringstream ss; - unsigned maxLength = 0; - std::map available; + std::stringstream Ss; + unsigned MaxLength = 0; + std::map Available; - for (auto const &metric : this->metrics) { - std::string name(metric->name); - maxLength = maxLength < name.size() ? name.size() : maxLength; - int returnCode = metric->init(); - metric->fini(); - available[name] = returnCode == EXIT_SUCCESS ? true : false; + for (auto const& Metric : Metrics) { + const std::string Name(Metric->Name); + MaxLength = MaxLength < Name.size() ? Name.size() : MaxLength; + auto ReturnCode = Metric->Init(); + Metric->Fini(); + Available[Name] = ReturnCode == EXIT_SUCCESS; } - unsigned padding = maxLength > 6 ? maxLength - 6 : 0; - ss << " METRIC" << std::string(padding + 1, ' ') << "| available\n"; - ss << " " << std::string(padding + 7, '-') << "-----------\n"; - for (auto const &[key, value] : available) { - ss << " " << key << std::string(padding + 7 - key.size(), ' ') << "| "; - ss << (value ? "yes" : "no") << "\n"; + const auto Padding = MaxLength > 6 ? MaxLength - 6 : 0; + Ss << " METRIC" << std::string(Padding + 1, ' ') << "| available\n"; + Ss << " " << std::string(Padding + 7, '-') << "-----------\n"; + for (auto const& [key, value] : Available) { + Ss << " " << key << std::string(Padding + 7 - key.size(), ' ') << "| "; + Ss << (value ? 
"yes" : "no") << "\n"; } - this->availableMetricsString = ss.str(); + AvailableMetricsString = Ss.str(); - pthread_create(&this->workerThread, NULL, - reinterpret_cast( - MeasurementWorker::dataAcquisitionWorker), - this); + pthread_create(&WorkerThread, nullptr, MeasurementWorker::dataAcquisitionWorker, this); // create a worker for getting metric values from stdin - if (this->_stdinMetrics.size() > 0) { - pthread_create(&this->stdinThread, NULL, - reinterpret_cast( - MeasurementWorker::stdinDataAcquisitionWorker), - this); + if (!StdinMetrics.empty()) { + pthread_create(&StdinThread, nullptr, MeasurementWorker::stdinDataAcquisitionWorker, this); } } MeasurementWorker::~MeasurementWorker() { - pthread_cancel(this->workerThread); + pthread_cancel(WorkerThread); - pthread_join(this->workerThread, NULL); + pthread_join(WorkerThread, nullptr); - if (this->_stdinMetrics.size() > 0) { - pthread_cancel(this->stdinThread); + if (!StdinMetrics.empty()) { + pthread_cancel(StdinThread); - pthread_join(this->stdinThread, NULL); + pthread_join(StdinThread, nullptr); } - for (auto const &[key, value] : this->values) { - auto metric = this->findMetricByName(key); - if (metric == nullptr) { + for (auto const& [key, value] : Values) { + const auto* Metric = findMetricByName(key); + if (Metric == nullptr) { continue; } - metric->fini(); + Metric->Fini(); } #ifndef FIRESTARTER_LINK_STATIC - for (auto handle : this->_metricDylibs) { - dlclose(handle); + for (auto* Handle : MetricDylibs) { + dlclose(Handle); } #endif } -std::vector MeasurementWorker::metricNames() { - std::vector metrics; - std::transform( - this->metrics.begin(), this->metrics.end(), std::back_inserter(metrics), - [](auto &metric) -> std::string { return std::string(metric->name); }); - for (auto const &name : this->_stdinMetrics) { - metrics.push_back(name); +auto MeasurementWorker::metricNames() -> std::vector { + std::vector MetricNames; + std::transform(Metrics.begin(), Metrics.end(), 
std::back_inserter(MetricNames), + [](auto& Metric) -> std::string { return std::string(Metric->Name); }); + for (auto const& Name : StdinMetrics) { + MetricNames.push_back(Name); } - return metrics; + return MetricNames; } -const metric_interface_t * -MeasurementWorker::findMetricByName(std::string metricName) { - auto name_equal = [metricName](auto &metricInterface) { - return metricName.compare(metricInterface->name) == 0; - }; - auto metric = - std::find_if(this->metrics.begin(), this->metrics.end(), name_equal); +auto MeasurementWorker::findMetricByName(std::string MetricName) -> const MetricInterface* { + auto NameEqual = [&MetricName](auto& MetricInterface) { return MetricName == MetricInterface->Name; }; + auto Metric = std::find_if(Metrics.begin(), Metrics.end(), NameEqual); // metric not found - if (metric == this->metrics.end()) { + if (Metric == Metrics.end()) { return nullptr; } // metric found - return const_cast(*metric); + return *Metric; } // this must be called by the main thread. 
// if not done so things like perf_event_attr.inherit might not work as expected -std::vector -MeasurementWorker::initMetrics(std::vector const &metricNames) { - this->values_mutex.lock(); +auto MeasurementWorker::initMetrics(std::vector const& MetricNames) -> std::vector { + ValuesMutex.lock(); - std::vector initialized = {}; + std::vector Initialized = {}; // try to find each metric and initialize it - for (auto const &metricName : metricNames) { + for (auto const& MetricName : MetricNames) { // init values map with empty vector - auto name_equal = [metricName](auto const &pair) { - return metricName.compare(pair.first) == 0; - }; - auto pair = - std::find_if(this->values.begin(), this->values.end(), name_equal); - if (pair != this->values.end()) { - pair->second.clear(); + auto NameEqual = [&MetricName](auto const& Pair) { return MetricName == Pair.first; }; + auto Pair = std::find_if(Values.begin(), Values.end(), NameEqual); + if (Pair != Values.end()) { + Pair->second.clear(); } else { - auto metric = this->findMetricByName(metricName); - if (metric != nullptr) { - int returnValue = metric->init(); - if (returnValue != EXIT_SUCCESS) { - log::error() << "Metric " << metric->name << ": " - << metric->get_error(); + const auto* Metric = findMetricByName(MetricName); + if (Metric != nullptr) { + const auto ReturnValue = Metric->Init(); + if (ReturnValue != EXIT_SUCCESS) { + log::warn() << "Metric " << Metric->Name << ": " << Metric->GetError(); continue; } } - this->values[metricName] = std::vector(); - if (metric != nullptr) { - if (metric->type.insert_callback) { - metric->register_insert_callback(::insertCallback, this); + Values[MetricName] = std::vector(); + if (Metric != nullptr) { + if (Metric->Type.InsertCallback) { + Metric->RegisterInsertCallback(::insertCallback, this); } } - initialized.push_back(metricName); + Initialized.push_back(MetricName); } } - this->values_mutex.unlock(); + ValuesMutex.unlock(); - return initialized; + return Initialized; } 
-void MeasurementWorker::insertCallback(const char *metricName, - int64_t timeSinceEpoch, double value) { - this->values_mutex.lock(); +void MeasurementWorker::insertCallback(const char* MetricName, int64_t TimeSinceEpoch, double Value) { + ValuesMutex.lock(); using Duration = std::chrono::duration; - auto time = - std::chrono::time_point( - Duration(timeSinceEpoch)); - auto name_equal = [metricName](auto const &pair) { - return std::string(metricName).compare(pair.first) == 0; - }; - auto pair = - std::find_if(this->values.begin(), this->values.end(), name_equal); + auto Time = std::chrono::time_point(Duration(TimeSinceEpoch)); + auto NameEqual = [&MetricName](auto const& Pair) { return std::string(MetricName) == Pair.first; }; + auto Pair = std::find_if(Values.begin(), Values.end(), NameEqual); - if (pair != this->values.end()) { - pair->second.push_back(TimeValue(time, value)); + if (Pair != Values.end()) { + Pair->second.emplace_back(Time, Value); } - this->values_mutex.unlock(); + ValuesMutex.unlock(); } -void MeasurementWorker::startMeasurement() { - this->startTime = std::chrono::high_resolution_clock::now(); -} +void MeasurementWorker::startMeasurement() { StartTime = std::chrono::high_resolution_clock::now(); } -std::map -MeasurementWorker::getValues(std::chrono::milliseconds startDelta, - std::chrono::milliseconds stopDelta) { - std::map measurment = {}; +auto MeasurementWorker::getValues(std::chrono::milliseconds StartDelta, std::chrono::milliseconds StopDelta) + -> std::map { + std::map Measurment = {}; - this->values_mutex.lock(); + ValuesMutex.lock(); - for (auto &[key, values] : this->values) { - auto startTime = this->startTime; - auto endTime = std::chrono::high_resolution_clock::now(); - auto metric = this->findMetricByName(key); + for (auto& [key, values] : Values) { + auto StartTime = this->StartTime; + auto EndTime = std::chrono::high_resolution_clock::now(); + const auto* Metric = findMetricByName(key); - metric_type_t type; - 
std::memset(&type, 0, sizeof(type)); - if (metric == nullptr) { - type.absolute = 1; + MetricType Type; + std::memset(&Type, 0, sizeof(Type)); + if (Metric == nullptr) { + Type.Absolute = 1; - startTime += startDelta; - endTime -= stopDelta; + StartTime += StartDelta; + EndTime -= StopDelta; } else { - std::memcpy(&type, &metric->type, sizeof(type)); + std::memcpy(&Type, &Metric->Type, sizeof(Type)); - if (metric->type.ignore_start_stop_delta == 0) { - startTime += startDelta; - endTime -= stopDelta; + if (Metric->Type.IgnoreStartStopDelta == 0) { + StartTime += StartDelta; + EndTime -= StopDelta; } } - decltype(values) croppedValues(values.size()); + decltype(values) CroppedValues(values.size()); - auto findAll = [startTime, endTime](auto const &tv) { - return startTime <= tv.time && tv.time <= endTime; - }; - auto it = std::copy_if(values.begin(), values.end(), croppedValues.begin(), - findAll); - croppedValues.resize(std::distance(croppedValues.begin(), it)); + auto FindAll = [&StartTime, &EndTime](auto const& Tv) { return StartTime <= Tv.Time && Tv.Time <= EndTime; }; + auto It = std::copy_if(values.begin(), values.end(), CroppedValues.begin(), FindAll); + CroppedValues.resize(std::distance(CroppedValues.begin(), It)); - Summary sum = Summary::calculate(croppedValues.begin(), croppedValues.end(), - type, this->numThreads); + const auto Sum = Summary::calculate(CroppedValues.begin(), CroppedValues.end(), Type, NumThreads); - measurment[key] = sum; + Measurment[key] = Sum; } - this->values_mutex.unlock(); + ValuesMutex.unlock(); - return measurment; + return Measurment; } -int *MeasurementWorker::dataAcquisitionWorker(void *measurementWorker) { +auto MeasurementWorker::dataAcquisitionWorker(void* MeasurementWorker) -> void* { + // NOLINTNEXTLINE(cert-pos47-c,concurrency-thread-canceltype-asynchronous) + pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, nullptr); - pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); - - auto _this = 
reinterpret_cast(measurementWorker); + auto* This = static_cast(MeasurementWorker); #ifndef __APPLE__ pthread_setname_np(pthread_self(), "DataAcquisition"); #endif - using clock = std::chrono::high_resolution_clock; + using Clock = std::chrono::high_resolution_clock; - using callbackTuple = - std::tuple; - auto callbackTupleComparator = [](callbackTuple left, callbackTuple right) { - return std::get<2>(left) > std::get<2>(right); + using CallbackTuple = + std::tuple; + auto CallbackTupleComparator = [](CallbackTuple Left, CallbackTuple Right) { + return std::get<2>(Left) > std::get<2>(Right); }; // this datastructure holds a tuple of our callback, the callback frequency // and the next timepoint. it will be sorted, so the pop function will give // back the next callback - std::priority_queue, - decltype(callbackTupleComparator)> - callbackQueue(callbackTupleComparator); + std::priority_queue, decltype(CallbackTupleComparator)> CallbackQueue( + CallbackTupleComparator); - _this->values_mutex.lock(); + This->ValuesMutex.lock(); - for (auto const &[key, value] : _this->values) { - auto metric_interface = _this->findMetricByName(key); + for (auto const& [key, value] : This->Values) { + const auto* MetricInterface = This->findMetricByName(key); - if (metric_interface == nullptr) { + if (MetricInterface == nullptr) { continue; } - auto callbackTime = - std::chrono::microseconds(metric_interface->callback_time); - if (callbackTime.count() == 0) { + auto CallbackTime = std::chrono::microseconds(MetricInterface->CallbackTime); + if (CallbackTime.count() == 0) { continue; } - auto currentTime = clock::now(); + auto CurrentTime = Clock::now(); - callbackQueue.push( - std::make_tuple(metric_interface->callback, callbackTime, currentTime)); + CallbackQueue.emplace(MetricInterface->Callback, CallbackTime, CurrentTime); } - _this->values_mutex.unlock(); + This->ValuesMutex.unlock(); - auto nextFetch = clock::now() + _this->updateInterval; + auto NextFetch = Clock::now() + 
This->UpdateInterval; for (;;) { - auto now = clock::now(); + auto Now = Clock::now(); - if (nextFetch <= now) { - _this->values_mutex.lock(); + if (NextFetch <= Now) { + This->ValuesMutex.lock(); - for (auto &[metricName, values] : _this->values) { - auto metric_interface = _this->findMetricByName(metricName); + for (auto& [metricName, values] : This->Values) { + const auto* MetricInterface = This->findMetricByName(metricName); - if (metric_interface == nullptr) { + if (MetricInterface == nullptr) { continue; } - double value; + double Value = NAN; - if (!metric_interface->type.insert_callback && - metric_interface->get_reading != nullptr) { - if (EXIT_SUCCESS == metric_interface->get_reading(&value)) { - auto tv = - TimeValue(std::chrono::high_resolution_clock::now(), value); - values.push_back(tv); + if (!MetricInterface->Type.InsertCallback && MetricInterface->GetReading != nullptr) { + if (EXIT_SUCCESS == MetricInterface->GetReading(&Value)) { + auto Tv = TimeValue(std::chrono::high_resolution_clock::now(), Value); + values.push_back(Tv); } } } - _this->values_mutex.unlock(); + This->ValuesMutex.unlock(); - nextFetch = now + _this->updateInterval; + NextFetch = Now + This->UpdateInterval; } - auto nextWake = nextFetch; + auto NextWake = NextFetch; - if (!callbackQueue.empty()) { - auto [callbackFunction, callbackTime, nextCallback] = callbackQueue.top(); + if (!CallbackQueue.empty()) { + auto [callbackFunction, callbackTime, nextCallback] = CallbackQueue.top(); - if (nextCallback <= now) { + if (nextCallback <= Now) { // remove the elment from the queue - callbackQueue.pop(); + CallbackQueue.pop(); // call our callback callbackFunction(); // add it with the updated callback time to the queue again - nextCallback = now + callbackTime; - callbackQueue.push( - std::make_tuple(callbackFunction, callbackTime, nextCallback)); + nextCallback = Now + callbackTime; + CallbackQueue.emplace(callbackFunction, callbackTime, nextCallback); } - nextWake = nextCallback < 
nextWake ? nextCallback : nextWake; + NextWake = nextCallback < NextWake ? nextCallback : NextWake; } - std::this_thread::sleep_for(nextWake - clock::now()); + std::this_thread::sleep_for(NextWake - Clock::now()); } } -int *MeasurementWorker::stdinDataAcquisitionWorker(void *measurementWorker) { - - pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); +auto MeasurementWorker::stdinDataAcquisitionWorker(void* MeasurementWorker) -> void* { + // NOLINTNEXTLINE(cert-pos47-c,concurrency-thread-canceltype-asynchronous) + pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, nullptr); - auto _this = reinterpret_cast(measurementWorker); + auto* This = static_cast(MeasurementWorker); #ifndef __APPLE__ pthread_setname_np(pthread_self(), "StdinDataAcquis"); #endif - for (std::string line; std::getline(std::cin, line);) { - int64_t time; - double value; - char name[128]; - if (std::sscanf(line.c_str(), "%127s %ld %lf", name, &time, &value) == 3) { - auto name_equal = [name](auto const &allowedName) { - return allowedName.compare(std::string(name)) == 0; - }; - auto item = std::find_if(_this->stdinMetrics().begin(), - _this->stdinMetrics().end(), name_equal); - // metric name is allowed - if (item != _this->stdinMetrics().end()) { - _this->insertCallback(name, time, value); - } + for (;;) { + int64_t Time = 0; + double Value = NAN; + std::array Name = {0}; + + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) + if (!scanStdin("%127s %ld %lf", 3, Name.data(), &Time, &Value)) { + continue; } - } - return NULL; + auto NameEqual = [&Name](auto const& AllowedName) { return AllowedName == std::string(Name.data()); }; + auto Item = std::find_if(This->stdinMetrics().begin(), This->stdinMetrics().end(), NameEqual); + // metric name is allowed + if (Item != This->stdinMetrics().end()) { + This->insertCallback(Name.data(), Time, Value); + } + } } + +} // namespace firestarter::measurement \ No newline at end of file diff --git a/src/firestarter/Measurement/Metric/IPCEstimate.cpp 
b/src/firestarter/Measurement/Metric/IPCEstimate.cpp index a58f91bb..5cd49b88 100644 --- a/src/firestarter/Measurement/Metric/IPCEstimate.cpp +++ b/src/firestarter/Measurement/Metric/IPCEstimate.cpp @@ -19,72 +19,51 @@ * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ +#include "firestarter/Measurement/Metric/IPCEstimate.hpp" + #include #include -#include - -extern "C" { -#include -#include -} -static std::string errorString = ""; +auto IpcEstimateMetricData::fini() -> int32_t { + auto& Instance = instance(); -static void (*callback)(void *, const char *, int64_t, double) = nullptr; -static void *callback_arg = nullptr; - -static int32_t fini(void) { - callback = nullptr; - callback_arg = nullptr; + Instance.Callback = nullptr; + Instance.CallbackArg = nullptr; return EXIT_SUCCESS; } -static int32_t init(void) { - errorString = ""; +auto IpcEstimateMetricData::init() -> int32_t { + instance().ErrorString = ""; return EXIT_SUCCESS; } -static const char *get_error(void) { - const char *errorCString = errorString.c_str(); - return errorCString; +auto IpcEstimateMetricData::getError() -> const char* { + const char* ErrorCString = instance().ErrorString.c_str(); + return ErrorCString; } -static int32_t register_insert_callback(void (*c)(void *, const char *, int64_t, - double), - void *arg) { - callback = c; - callback_arg = arg; +auto IpcEstimateMetricData::registerInsertCallback(void (*C)(void*, const char*, int64_t, double), void* Arg) + -> int32_t { + auto& Instance = instance(); + + Instance.Callback = C; + Instance.CallbackArg = Arg; + return EXIT_SUCCESS; } -void ipc_estimate_metric_insert(double value) { - if (callback == nullptr || callback_arg == nullptr) { +void IpcEstimateMetricData::insertValue(double Value) { + auto& Instance = instance(); + + if (Instance.Callback == nullptr || Instance.CallbackArg == nullptr) { return; } - int64_t t = std::chrono::duration_cast( - 
std::chrono::high_resolution_clock::now().time_since_epoch()) - .count(); - - callback(callback_arg, "ipc-estimate", t, value); -} + const int64_t T = + std::chrono::duration_cast(std::chrono::high_resolution_clock::now().time_since_epoch()) + .count(); -metric_interface_t ipc_estimate_metric = { - .name = "ipc-estimate", - .type = {.absolute = 1, - .accumalative = 0, - .divide_by_thread_count = 0, - .insert_callback = 1, - .ignore_start_stop_delta = 1, - .__reserved = 0}, - .unit = "IPC", - .callback_time = 0, - .callback = nullptr, - .init = init, - .fini = fini, - .get_reading = nullptr, - .get_error = get_error, - .register_insert_callback = register_insert_callback, -}; + Instance.Callback(Instance.CallbackArg, "ipc-estimate", T, Value); +} \ No newline at end of file diff --git a/src/firestarter/Measurement/Metric/Perf.cpp b/src/firestarter/Measurement/Metric/Perf.cpp index 48f3120b..92a09cf1 100644 --- a/src/firestarter/Measurement/Metric/Perf.cpp +++ b/src/firestarter/Measurement/Metric/Perf.cpp @@ -19,81 +19,67 @@ * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ +#include "firestarter/Measurement/Metric/Perf.hpp" + +#include +#include #include #include extern "C" { -#include -#include - #include #include #include #include +} -#define PERF_EVENT_PARANOID "/proc/sys/kernel/perf_event_paranoid" - -struct read_format { - uint64_t nr; - struct { - uint64_t value; - uint64_t id; - } values[2]; -}; - -static std::string errorString = ""; - -static int cpu_cycles_fd = -1; -static int instructions_fd = -1; -static uint64_t cpu_cycles_id; -static uint64_t instructions_id; -static bool init_done = false; -static int32_t init_value; - -static struct read_format last; - -static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, - int cpu, int group_fd, unsigned long flags) { - return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); +namespace { +auto 
perfEventOpen(struct perf_event_attr* HwEvent, pid_t Pid, int Cpu, int GroupFd, uint64_t Flags) -> int { + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) + return static_cast(syscall(__NR_perf_event_open, HwEvent, Pid, Cpu, GroupFd, Flags)); } +} // namespace -static int32_t fini(void) { - if (!(cpu_cycles_fd < 0)) { - close(cpu_cycles_fd); - cpu_cycles_fd = -1; +auto PerfMetricData::fini() -> int32_t { + auto& Instance = instance(); + + if (!(Instance.CpuCyclesFd < 0)) { + close(Instance.CpuCyclesFd); + Instance.CpuCyclesFd = -1; } - if (!(instructions_fd < 0)) { - close(instructions_fd); - instructions_fd = -1; + if (!(Instance.InstructionsFd < 0)) { + close(Instance.InstructionsFd); + Instance.InstructionsFd = -1; } - init_done = false; + Instance.InitDone = false; return EXIT_SUCCESS; } -static int32_t init(void) { - if (init_done) { - return init_value; +auto PerfMetricData::init() -> int32_t { + auto& Instance = instance(); + + if (Instance.InitDone) { + return Instance.InitValue; } - if (access(PERF_EVENT_PARANOID, F_OK) == -1) { + if (access(PerfEventParanoidFile, F_OK) == -1) { // https://man7.org/linux/man-pages/man2/perf_event_open.2.html // The official way of knowing if perf_event_open() support is enabled // is checking for the existence of the file // /proc/sys/kernel/perf_event_paranoid. 
- errorString = - "syscall perf_event_open not supported or file " PERF_EVENT_PARANOID - " does not exist"; - init_value = EXIT_FAILURE; - init_done = true; + Instance.ErrorString = + "syscall perf_event_open not supported or file " + std::string(PerfEventParanoidFile) + " does not exist"; + Instance.InitValue = EXIT_FAILURE; + Instance.InitDone = true; return EXIT_FAILURE; } - struct perf_event_attr cpu_cycles_attr; - std::memset(&cpu_cycles_attr, 0, sizeof(struct perf_event_attr)); - cpu_cycles_attr.type = PERF_TYPE_HARDWARE; - cpu_cycles_attr.size = sizeof(struct perf_event_attr); - cpu_cycles_attr.config = PERF_COUNT_HW_CPU_CYCLES; - cpu_cycles_attr.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID; + struct perf_event_attr CpuCyclesAttr {}; + std::memset(&CpuCyclesAttr, 0, sizeof(struct perf_event_attr)); + CpuCyclesAttr.type = PERF_TYPE_HARDWARE; + CpuCyclesAttr.size = sizeof(struct perf_event_attr); + CpuCyclesAttr.config = PERF_COUNT_HW_CPU_CYCLES; + CpuCyclesAttr.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID; // https://man7.org/linux/man-pages/man2/perf_event_open.2.html // inherit // The inherit bit specifies that this counter should count @@ -113,166 +99,133 @@ static int32_t init(void) { // changed the check // - if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) // + if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)) - cpu_cycles_attr.inherit = 1; - cpu_cycles_attr.exclude_kernel = 1; - cpu_cycles_attr.exclude_hv = 1; - - if ((cpu_cycles_fd = perf_event_open( - &cpu_cycles_attr, - // pid == 0 and cpu == -1 - // This measures the calling process/thread on any CPU. - 0, -1, - // The group_fd argument allows event groups to be created. An event - // group has one event which is the group leader. The leader is - // created first, with group_fd = -1. The rest of the group members - // are created with subsequent perf_event_open() calls with group_fd - // being set to the file descriptor of the group leader. 
- -1, 0)) < 0) { + CpuCyclesAttr.inherit = 1; + CpuCyclesAttr.exclude_kernel = 1; + CpuCyclesAttr.exclude_hv = 1; + + Instance.CpuCyclesFd = perfEventOpen(&CpuCyclesAttr, + // pid == 0 and cpu == -1 + // This measures the calling process/thread on any CPU. + 0, -1, + // The group_fd argument allows event groups to be created. An event + // group has one event which is the group leader. The leader is + // created first, with group_fd = -1. The rest of the group members + // are created with subsequent perf_event_open() calls with group_fd + // being set to the file descriptor of the group leader. + -1, 0); + + if (Instance.CpuCyclesFd < 0) { fini(); - errorString = "perf_event_open failed for PERF_COUNT_HW_CPU_CYCLES"; - init_value = EXIT_FAILURE; - init_done = true; + Instance.ErrorString = "perf_event_open failed for PERF_COUNT_HW_CPU_CYCLES"; + Instance.InitValue = EXIT_FAILURE; + Instance.InitDone = true; return EXIT_FAILURE; } - ioctl(cpu_cycles_fd, PERF_EVENT_IOC_ID, &cpu_cycles_id); - - struct perf_event_attr instructions_attr; - std::memset(&instructions_attr, 0, sizeof(struct perf_event_attr)); - instructions_attr.type = PERF_TYPE_HARDWARE; - instructions_attr.size = sizeof(struct perf_event_attr); - instructions_attr.config = PERF_COUNT_HW_INSTRUCTIONS; - instructions_attr.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID; - instructions_attr.inherit = 1; - instructions_attr.exclude_kernel = 1; - instructions_attr.exclude_hv = 1; - - if ((instructions_fd = perf_event_open( - &instructions_attr, - // pid == 0 and cpu == -1 - // This measures the calling process/thread on any CPU. - 0, -1, - // The group_fd argument allows event groups to be created. An event - // group has one event which is the group leader. The leader is - // created first, with group_fd = -1. The rest of the group members - // are created with subsequent perf_event_open() calls with group_fd - // being set to the file descriptor of the group leader. 
- cpu_cycles_fd, 0)) < 0) { + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) + ioctl(Instance.CpuCyclesFd, PERF_EVENT_IOC_ID, &Instance.CpuCyclesId); + + struct perf_event_attr InstructionsAttr {}; + std::memset(&InstructionsAttr, 0, sizeof(struct perf_event_attr)); + InstructionsAttr.type = PERF_TYPE_HARDWARE; + InstructionsAttr.size = sizeof(struct perf_event_attr); + InstructionsAttr.config = PERF_COUNT_HW_INSTRUCTIONS; + InstructionsAttr.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID; + InstructionsAttr.inherit = 1; + InstructionsAttr.exclude_kernel = 1; + InstructionsAttr.exclude_hv = 1; + + Instance.InstructionsFd = perfEventOpen(&InstructionsAttr, + // pid == 0 and cpu == -1 + // This measures the calling process/thread on any CPU. + 0, -1, + // The group_fd argument allows event groups to be created. An event + // group has one event which is the group leader. The leader is + // created first, with group_fd = -1. The rest of the group members + // are created with subsequent perf_event_open() calls with group_fd + // being set to the file descriptor of the group leader. 
+ Instance.CpuCyclesFd, 0); + + if (Instance.InstructionsFd < 0) { fini(); - errorString = "perf_event_open failed for PERF_COUNT_HW_INSTRUCTIONS"; - init_value = EXIT_FAILURE; - init_done = true; + Instance.ErrorString = "perf_event_open failed for PERF_COUNT_HW_INSTRUCTIONS"; + Instance.InitValue = EXIT_FAILURE; + Instance.InitDone = true; return EXIT_FAILURE; } - ioctl(instructions_fd, PERF_EVENT_IOC_ID, &instructions_id); + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) + ioctl(Instance.InstructionsFd, PERF_EVENT_IOC_ID, &Instance.InstructionsId); - ioctl(cpu_cycles_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP); - ioctl(cpu_cycles_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP); + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) + ioctl(Instance.CpuCyclesFd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP); + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) + ioctl(Instance.CpuCyclesFd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP); - if (0 == read(cpu_cycles_fd, &last, sizeof(last))) { + if (0 == read(Instance.CpuCyclesFd, &Instance.Last, sizeof(Last))) { fini(); - errorString = "group read failed in init"; - init_value = EXIT_FAILURE; - init_done = true; + Instance.ErrorString = "group read failed in init"; + Instance.InitValue = EXIT_FAILURE; + Instance.InitDone = true; return EXIT_FAILURE; } - init_value = EXIT_SUCCESS; - init_done = true; + Instance.InitValue = EXIT_SUCCESS; + Instance.InitDone = true; return EXIT_SUCCESS; } -static uint64_t value_from_id(struct read_format *values, uint64_t id) { - for (decltype(values->nr) i = 0; i < values->nr; ++i) { - if (id == values->values[i].id) { - return values->values[i].value; +auto PerfMetricData::valueFromId(struct ReadFormat* Reader, uint64_t Id) -> uint64_t { + for (decltype(Reader->Nr) I = 0; I < Reader->Nr; ++I) { + assert(I < 2 && "Index is out of bounds"); + // NOLINTBEGIN(cppcoreguidelines-pro-bounds-constant-array-index) + if (Id == Reader->Values[I].Id) { + return Reader->Values[I].Value; } + 
// NOLINTEND(cppcoreguidelines-pro-bounds-constant-array-index) } return 0; } -static int32_t get_reading(double *ipc_value, double *freq_value) { +auto PerfMetricData::getReading(double* IpcValue, double* FreqValue) -> int32_t { + auto& Instance = instance(); - if (cpu_cycles_fd < 0 || instructions_fd < 0) { + if (Instance.CpuCyclesFd < 0 || Instance.InstructionsFd < 0) { fini(); return EXIT_FAILURE; } - struct read_format read_values; + struct ReadFormat ReadValues {}; - if (0 == read(cpu_cycles_fd, &read_values, sizeof(read_values))) { + if (0 == read(Instance.CpuCyclesFd, &ReadValues, sizeof(ReadValues))) { fini(); - errorString = "group read failed"; + Instance.ErrorString = "group read failed"; return EXIT_FAILURE; } - if (ipc_value != nullptr) { - uint64_t diff[2]; - diff[0] = value_from_id(&read_values, instructions_id) - - value_from_id(&last, instructions_id); - diff[1] = value_from_id(&read_values, cpu_cycles_id) - - value_from_id(&last, cpu_cycles_id); + if (IpcValue != nullptr) { + std::array Diff = { + valueFromId(&ReadValues, Instance.InstructionsId) - valueFromId(&Instance.Last, Instance.InstructionsId), + valueFromId(&ReadValues, Instance.CpuCyclesId) - valueFromId(&Instance.Last, Instance.CpuCyclesId)}; - std::memcpy(&last, &read_values, sizeof(last)); + std::memcpy(&Instance.Last, &ReadValues, sizeof(Last)); - *ipc_value = (double)diff[0] / (double)diff[1]; + *IpcValue = static_cast(Diff[0]) / static_cast(Diff[1]); } - if (freq_value != nullptr) { - *freq_value = (double)value_from_id(&read_values, cpu_cycles_id) / 1e9; + if (FreqValue != nullptr) { + *FreqValue = static_cast(valueFromId(&ReadValues, Instance.CpuCyclesId)) / 1e9; } return EXIT_SUCCESS; } -static int32_t get_reading_ipc(double *value) { - return get_reading(value, nullptr); -} +auto PerfMetricData::getReadingIpc(double* Value) -> int32_t { return getReading(Value, nullptr); } -static int32_t get_reading_freq(double *value) { - return get_reading(nullptr, value); -} - -static const 
char *get_error(void) { - const char *errorCString = errorString.c_str(); - return errorCString; -} -} +auto PerfMetricData::getReadingFreq(double* Value) -> int32_t { return getReading(nullptr, Value); } -metric_interface_t perf_ipc_metric = { - .name = "perf-ipc", - .type = {.absolute = 1, - .accumalative = 0, - .divide_by_thread_count = 0, - .insert_callback = 0, - .ignore_start_stop_delta = 0, - .__reserved = 0}, - .unit = "IPC", - .callback_time = 0, - .callback = nullptr, - .init = init, - .fini = fini, - .get_reading = get_reading_ipc, - .get_error = get_error, - .register_insert_callback = nullptr, -}; - -metric_interface_t perf_freq_metric = { - .name = "perf-freq", - .type = {.absolute = 0, - .accumalative = 1, - .divide_by_thread_count = 1, - .insert_callback = 0, - .ignore_start_stop_delta = 0, - .__reserved = 0}, - .unit = "GHz", - .callback_time = 0, - .callback = nullptr, - .init = init, - .fini = fini, - .get_reading = get_reading_freq, - .get_error = get_error, - .register_insert_callback = nullptr, -}; +auto PerfMetricData::getError() -> const char* { + const char* ErrorCString = instance().ErrorString.c_str(); + return ErrorCString; +} \ No newline at end of file diff --git a/src/firestarter/Measurement/Metric/RAPL.cpp b/src/firestarter/Measurement/Metric/RAPL.cpp index 5f6b4bd7..458b2643 100644 --- a/src/firestarter/Measurement/Metric/RAPL.cpp +++ b/src/firestarter/Measurement/Metric/RAPL.cpp @@ -19,55 +19,34 @@ * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ +#include "firestarter/Measurement/Metric/RAPL.hpp" + #include #include #include #include #include +#include #include extern "C" { -#include -#include - #include +} -#define RAPL_PATH "/sys/class/powercap" - -static std::string errorString = ""; - -struct reader_def { - char *path; - long long int last_reading; - long long int overflow; - long long int max; -}; - -struct reader_def_free { - void operator()(struct 
reader_def *def) { - if (def != nullptr) { - if (((void *)def->path) != nullptr) { - free((void *)def->path); - } - free((void *)def); - } - } -}; - -static std::vector> readers = {}; - -static int32_t fini(void) { - readers.clear(); +auto RaplMetricData::fini() -> int32_t { + instance().Readers.clear(); return EXIT_SUCCESS; } -static int32_t init(void) { - errorString = ""; +auto RaplMetricData::init() -> int32_t { + auto& Instance = instance(); - DIR *raplDir = opendir(RAPL_PATH); - if (raplDir == NULL) { - errorString = "Could not open " RAPL_PATH; + Instance.ErrorString = ""; + + DIR* RaplDir = opendir(RaplPath); + if (RaplDir == nullptr) { + Instance.ErrorString = "Could not open " + std::string(RaplPath); return EXIT_FAILURE; } @@ -76,111 +55,86 @@ static int32_t init(void) { // and finally package only. // contains an empty path if it is not found - std::string psysPath = ""; + std::string PsysPath; // a vector of all paths to package and dram - std::vector paths = {}; - - struct dirent *dir; - while ((dir = readdir(raplDir)) != NULL) { - std::stringstream path; - std::stringstream namePath; - path << RAPL_PATH << "/" << dir->d_name; - namePath << path.str() << "/name"; - - std::ifstream nameStream(namePath.str()); - if (!nameStream.good()) { + std::vector Paths = {}; + + struct dirent* Dir = nullptr; + + // As long as the DIR object (named RaplDir here) is not shared between threads this call is thread-safe: + // https://www.gnu.org/software/libc/manual/html_node/Reading_002fClosing-Directory.html + // NOLINTNEXTLINE(concurrency-mt-unsafe) + while ((Dir = readdir(RaplDir)) != nullptr) { + std::stringstream Path; + std::stringstream NamePath; + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-array-to-pointer-decay) + Path << RaplPath << "/" << Dir->d_name; + NamePath << Path.str() << "/name"; + + std::ifstream NameStream(NamePath.str()); + if (!NameStream.good()) { // an error opening the file occured continue; } - std::string name; - std::getline(nameStream, 
name); + std::string Name; + std::getline(NameStream, Name); - if (name == "psys") { + if (Name == "psys") { // found psys - psysPath = path.str(); - } else if (0 == name.rfind("package", 0) || name == "dram") { + PsysPath = Path.str(); + } else if (0 == Name.rfind("package", 0) || Name == "dram") { // find all package and dram - paths.push_back(path.str()); + Paths.push_back(Path.str()); } } - closedir(raplDir); + closedir(RaplDir); // make psys the only value if available - if (!psysPath.empty()) { - paths.clear(); - paths.push_back(psysPath); + if (!PsysPath.empty()) { + Paths.clear(); + Paths.push_back(PsysPath); } // paths now contains all interesting nodes - if (paths.size() == 0) { - errorString = "No valid entries in " RAPL_PATH; + if (Paths.empty()) { + Instance.ErrorString = "No valid entries in " + std::string(RaplPath); return EXIT_FAILURE; } - for (auto const &path : paths) { - std::stringstream energyUjPath; - energyUjPath << path << "/energy_uj"; - std::ifstream energyReadingStream(energyUjPath.str()); - if (!energyReadingStream.good()) { - errorString = "Could not read energy_uj"; + for (auto const& Path : Paths) { + std::stringstream EnergyUjPath; + EnergyUjPath << Path << "/energy_uj"; + std::ifstream EnergyReadingStream(EnergyUjPath.str()); + if (!EnergyReadingStream.good()) { + Instance.ErrorString = "Could not read energy_uj"; break; } - std::stringstream maxEnergyUjRangePath; - maxEnergyUjRangePath << path << "/max_energy_range_uj"; - std::ifstream maxEnergyReadingStream(maxEnergyUjRangePath.str()); - if (!maxEnergyReadingStream.good()) { - errorString = "Could not read max_energy_range_uj"; + std::stringstream MaxEnergyUjRangePath; + MaxEnergyUjRangePath << Path << "/max_energy_range_uj"; + std::ifstream MaxEnergyReadingStream(MaxEnergyUjRangePath.str()); + if (!MaxEnergyReadingStream.good()) { + Instance.ErrorString = "Could not read max_energy_range_uj"; break; } - unsigned long long reading; - unsigned long long max; - std::string buffer; 
- int read; + std::string Buffer; - std::getline(energyReadingStream, buffer); - read = std::sscanf(buffer.c_str(), "%llu", &reading); - - if (read == 0) { - std::stringstream ss; - ss << "Contents in file " << energyUjPath.str() - << " do not conform to mask (unsigned long long)"; - errorString = ss.str(); - break; - } + std::getline(EnergyReadingStream, Buffer); + const auto Reading = std::stoul(Buffer); - std::getline(maxEnergyReadingStream, buffer); - read = std::sscanf(buffer.c_str(), "%llu", &max); + std::getline(MaxEnergyReadingStream, Buffer); + const auto Max = std::stoul(Buffer); - if (read == 0) { - std::stringstream ss; - ss << "Contents in file " << maxEnergyUjRangePath.str() - << " do not conform to mask (unsigned long long)"; - errorString = ss.str(); - break; - } + auto Def = std::make_unique(/*Path=*/Path, /*LastReading=*/Reading, /*Overflow=*/0, /*Max=*/Max); - std::shared_ptr def( - reinterpret_cast( - malloc(sizeof(struct reader_def))), - reader_def_free()); - auto pathName = path.c_str(); - size_t size = (strlen(pathName) + 1) * sizeof(char); - void *name = malloc(size); - memcpy(name, pathName, size); - def->path = (char *)name; - def->max = max; - def->last_reading = reading; - def->overflow = 0; - - readers.push_back(def); + Instance.Readers.emplace_back(std::move(Def)); } - if (errorString.size() != 0) { + if (!Instance.ErrorString.empty()) { fini(); return EXIT_FAILURE; } @@ -188,60 +142,39 @@ static int32_t init(void) { return EXIT_SUCCESS; } -static int32_t get_reading(double *value) { - double finalReading = 0.0; +auto RaplMetricData::getReading(double* Value) -> int32_t { + double FinalReading = 0.0; - for (auto &def : readers) { - long long int reading; - std::string buffer; + for (auto& Def : instance().Readers) { + std::string Buffer; - std::stringstream energyUjPath; - energyUjPath << def->path << "/energy_uj"; - std::ifstream energyReadingStream(energyUjPath.str()); - std::getline(energyReadingStream, buffer); - 
std::sscanf(buffer.c_str(), "%llu", &reading); + std::stringstream EnergyUjPath; + EnergyUjPath << Def->Path << "/energy_uj"; + std::ifstream EnergyReadingStream(EnergyUjPath.str()); + std::getline(EnergyReadingStream, Buffer); + const auto Reading = std::stoll(Buffer); - if (reading < def->last_reading) { - def->overflow += 1; + if (Reading < Def->LastReading) { + Def->Overflow += 1; } - def->last_reading = reading; + Def->LastReading = Reading; - finalReading += - 1.0E-6 * (double)(def->overflow * def->max + def->last_reading); + FinalReading += 1.0E-6 * static_cast((Def->Overflow * Def->Max) + Def->LastReading); } - if (value != nullptr) { - *value = finalReading; + if (Value != nullptr) { + *Value = FinalReading; } return EXIT_SUCCESS; } -static const char *get_error(void) { - const char *errorCString = errorString.c_str(); - return errorCString; +auto RaplMetricData::getError() -> const char* { + const char* ErrorCString = instance().ErrorString.c_str(); + return ErrorCString; } // this function will be called periodically to make sure we do not miss an // overflow of the counter -static void callback(void) { get_reading(nullptr); } -} - -metric_interface_t rapl_metric = { - .name = "sysfs-powercap-rapl", - .type = {.absolute = 0, - .accumalative = 1, - .divide_by_thread_count = 0, - .insert_callback = 0, - .ignore_start_stop_delta = 0, - .__reserved = 0}, - .unit = "J", - .callback_time = 30000000, - .callback = callback, - .init = init, - .fini = fini, - .get_reading = get_reading, - .get_error = get_error, - .register_insert_callback = nullptr, -}; +void RaplMetricData::callback() { getReading(nullptr); } \ No newline at end of file diff --git a/src/firestarter/Measurement/Summary.cpp b/src/firestarter/Measurement/Summary.cpp index 590c4e01..1fecb99f 100644 --- a/src/firestarter/Measurement/Summary.cpp +++ b/src/firestarter/Measurement/Summary.cpp @@ -19,88 +19,82 @@ * Contact: daniel.hackenberg@tu-dresden.de 
*****************************************************************************/ -#include +#include "firestarter/Measurement/Summary.hpp" #include #include -using namespace firestarter::measurement; +namespace firestarter::measurement { // this functions borows a lot of code from // https://github.com/metricq/metricq-cpp/blob/master/tools/metricq-summary/src/summary.cpp -Summary Summary::calculate(std::vector::iterator begin, - std::vector::iterator end, - metric_type_t metricType, - unsigned long long numThreads) { - std::vector values = {}; - - // TODO: i would really like to make this code a bit more readable, but i - // could not find a way yet. - if (metricType.accumalative) { - TimeValue prev; - - if (begin != end) { - prev = *begin++; - for (auto it = begin; it != end; ++it) { - auto time_diff = - 1e-6 * - (double)std::chrono::duration_cast( - it->time - prev.time) - .count(); - auto value_diff = it->value - prev.value; - - double value = value_diff / time_diff; - - if (metricType.divide_by_thread_count) { - value /= numThreads; +auto Summary::calculate(std::vector::iterator Begin, std::vector::iterator End, + MetricType MetricType, uint64_t NumThreads) -> Summary { + std::vector Values; + + if (MetricType.Accumalative) { + TimeValue Prev; + + if (Begin != End) { + Prev = *Begin++; + for (auto It = Begin; It != End; ++It) { + auto TimeDiff = 1e-6 * static_cast( + std::chrono::duration_cast(It->Time - Prev.Time).count()); + auto ValueDiff = It->Value - Prev.Value; + + double Value = ValueDiff / TimeDiff; + + if (MetricType.DivideByThreadCount) { + Value /= static_cast(NumThreads); } - values.push_back(TimeValue(prev.time, value)); - prev = *it; + Values.emplace_back(Prev.Time, Value); + Prev = *It; } } - } else if (metricType.absolute) { - for (auto it = begin; it != end; ++it) { - double value = it->value; + } else if (MetricType.Absolute) { + for (auto It = Begin; It != End; ++It) { + double Value = It->Value; - if (metricType.divide_by_thread_count) { - 
value /= numThreads; + if (MetricType.DivideByThreadCount) { + Value /= static_cast(NumThreads); } - values.push_back(TimeValue(it->time, value)); + Values.emplace_back(It->Time, Value); } } else { assert(false); } - begin = values.begin(); - end = values.end(); + Begin = Values.begin(); + End = Values.end(); - Summary summary{}; + Summary SummaryVal{}; - summary.num_timepoints = std::distance(begin, end); + SummaryVal.NumTimepoints = std::distance(Begin, End); - if (summary.num_timepoints > 0) { + if (SummaryVal.NumTimepoints > 0) { - auto last = begin; - std::advance(last, summary.num_timepoints - 1); - summary.duration = std::chrono::duration_cast( - last->time - begin->time); + auto Last = Begin; + std::advance(Last, SummaryVal.NumTimepoints - 1); + SummaryVal.Duration = std::chrono::duration_cast(Last->Time - Begin->Time); - auto sum_over_nths = [&begin, end, summary](auto fn) { - double acc = 0.0; - for (auto it = begin; it != end; ++it) { - acc += fn(it->value); + auto SumOverNths = [&Begin, End, SummaryVal](auto Fn) { + double Acc = 0.0; + for (auto It = Begin; It != End; ++It) { + Acc += Fn(It->Value); } - return acc / summary.num_timepoints; + return Acc / static_cast(SummaryVal.NumTimepoints); }; - summary.average = sum_over_nths([](double v) { return v; }); - summary.stddev = std::sqrt(sum_over_nths([&summary](double v) { - double centered = v - summary.average; - return centered * centered; + SummaryVal.Average = SumOverNths([](double V) { return V; }); + SummaryVal.Stddev = std::sqrt(SumOverNths([&SummaryVal](double V) { + const auto Centered = V - SummaryVal.Average; + return Centered * Centered; })); } - return summary; + return SummaryVal; } + +} // namespace firestarter::measurement \ No newline at end of file diff --git a/src/firestarter/OneAPI/OneAPI.cpp b/src/firestarter/OneAPI/OneAPI.cpp index f09f79b0..3a5cfc4d 100644 --- a/src/firestarter/OneAPI/OneAPI.cpp +++ b/src/firestarter/OneAPI/OneAPI.cpp @@ -22,297 +22,275 @@ /* OneAPI for GPUs, 
based on CUDA component *****************************************************************************/ -#include -#include -#include +#include "firestarter/OneAPI/OneAPI.hpp" +#include "firestarter/Logging/Log.hpp" - -#include #include - +#include #include #include +#include #include -using namespace firestarter::oneapi; +namespace firestarter::oneapi { +namespace { -/* Random number generation helpers */ -template -void generate_random_data(size_t elems, T *v) -{ - for (size_t i = 0; i < elems; i++) - v[i] = double(std::rand()) / RAND_MAX; -} +/// Helper function to generate random floating point values between 0 and 1 in an array. +/// \tparam FloatingPointType The type of floating point value of the array. Either float or double. +/// \arg NumberOfElems The number of elements of the array. +/// \arg Array The array of floating point values which should be initialized with random data between 0 and 1. +template void fillArrayWithRandomFloats(size_t NumberOfElems, FloatingPointType* Array) { + static_assert(std::is_same_v || std::is_same_v, + "fillArrayWithRandomFloats: Template argument must be either float or double"); -template -void replicate_data(sycl::queue &Q, T *dst, size_t dst_elems, const T *src, size_t src_elems) -{ - firestarter::log::trace() << "replicate_data " << dst_elems << " elements from " << - src << " to " << dst ; - while (dst_elems > 0) { - auto copy_elems = std::min(dst_elems, src_elems); - Q.copy(src, dst, copy_elems); - dst += copy_elems; - dst_elems -= copy_elems; - } - Q.wait(); + for (size_t i = 0; i < NumberOfElems; i++) { + Array[i] = static_cast(std::rand()) / static_cast(RAND_MAX); + } } -static int get_precision(int device_index, int useDouble) { +template +void replicateData(sycl::queue& Q, FloatingPointType* Dst, size_t DstElems, const FloatingPointType* Src, + size_t SrcElems) { + static_assert(std::is_same_v || std::is_same_v, + "fillArrayWithRandomFloats: Template argument must be either float or double"); + + 
firestarter::log::trace() << "replicateData " << DstElems << " elements from " << Src << " to " + << Dst; + while (DstElems > 0) { + auto copy_elems = std::min(DstElems, SrcElems); + Q.copy(Src, Dst, copy_elems); + Dst += copy_elems; + DstElems -= copy_elems; + } + Q.wait(); +} - firestarter::log::trace() << "Checking useDouble " << useDouble; +int getPrecision(int DeviceIndex, int UseDouble) { + firestarter::log::trace() << "Checking UseDouble " << UseDouble; - if (!useDouble){ + if (!UseDouble) { return 0; } - int supports_double = 0; + int SupportsDouble = 0; - auto platforms = sycl::platform::get_platforms(); + auto Platforms = sycl::platform::get_platforms(); - if (platforms.empty()) { + if (Platforms.empty()) { firestarter::log::warn() << "No SYCL platforms found."; return -1; } // Choose a platform based on specific criteria (e.g., device type) - sycl::platform chosenPlatform; - auto nr_gpus = 0; - for (const auto &platform : platforms) { - firestarter::log::trace() << "Checking SYCL platform " << platform.get_info(); - auto devices = platform.get_devices(); - nr_gpus = 0; - for (const auto &device : devices) { - firestarter::log::trace() << "Checking SYCL device " << device.get_info(); - if (device.is_gpu()) { // Choose GPU, you can use other criteria + // TODO(Issue #75): We may select the incorrect platform with gpu devices of the wrong vendor/type. + sycl::platform ChosenPlatform; + auto NbGpus = 0; + for (const auto& Platform : Platforms) { + firestarter::log::trace() << "Checking SYCL platform " << Platform.get_info(); + auto Devices = Platform.get_devices(); + NbGpus = 0; + for (const auto& Device : Devices) { + firestarter::log::trace() << "Checking SYCL device " << Device.get_info(); + if (Device.is_gpu()) { // Choose GPU, you can use other criteria firestarter::log::trace() << " ... 
is GPU"; - chosenPlatform = platform; - nr_gpus++; + ChosenPlatform = Platform; + NbGpus++; } } } - if (!nr_gpus) { + if (!NbGpus) { firestarter::log::warn() << "No suitable platform with GPU found."; return -1; } // Get a list of devices for the chosen platform - firestarter::log::trace() << "Get support for double" - << " on device nr. " - << device_index; - auto devices = chosenPlatform.get_devices(); - if (devices[device_index].has(sycl::aspect::fp64)) - supports_double=1; + << " on device nr. " << DeviceIndex; + auto Devices = ChosenPlatform.get_devices(); + if (Devices[DeviceIndex].has(sycl::aspect::fp64)) + SupportsDouble = 1; - return supports_double; + return SupportsDouble; } -static int round_up(int num_to_round, int multiple) { - if (multiple == 0) { - return num_to_round; - } +template auto roundUp(int NumToRound) -> int { + static_assert(Multiple != 0, "Multiple may not be zero."); - int remainder = num_to_round % multiple; - if (remainder == 0) { - return num_to_round; + const int Remainder = NumToRound % Multiple; + if (Remainder == 0) { + return NumToRound; } - return num_to_round + multiple - remainder; + return NumToRound + Multiple - Remainder; } - // GPU index. Used to pin this thread to the GPU. -template -static void create_load(std::condition_variable &waitForInitCv, - std::mutex &waitForInitCvMutex, int device_index, - std::atomic &initCount, - volatile unsigned long long *loadVar, int matrixSize) { - static_assert( - std::is_same::value || std::is_same::value, - "create_load: Template argument T must be either float or double"); - - firestarter::log::trace() << "Starting OneAPI with given matrix size " - << matrixSize; - - size_t size_use = 0; - if (matrixSize > 0) { - size_use = matrixSize; - } +// The main difference to the CUDA/HIP version is that we do not run multiple iterations of C=A*B, just one single +// iteration. 
+template +void createLoad(std::condition_variable& WaitForInitCv, std::mutex& WaitForInitCvMutex, int DeviceIndex, + std::atomic& InitCount, const volatile firestarter::LoadThreadWorkType& LoadVar, + unsigned MatrixSize) { + static_assert(std::is_same::value || std::is_same::value, + "createLoad: Template argument T must be either float or double"); - size_t use_bytes; + firestarter::log::trace() << "Starting OneAPI with given matrix size " << MatrixSize; // reserving the GPU and initializing - firestarter::log::trace() << "Getting device nr. " << device_index; + firestarter::log::trace() << "Getting device nr. " << DeviceIndex; - auto platforms = sycl::platform::get_platforms(); + auto Platforms = sycl::platform::get_platforms(); - if (platforms.empty()) { + if (Platforms.empty()) { firestarter::log::warn() << "No SYCL platforms found."; return; } // Choose a platform based on specific criteria (e.g., device type) - sycl::platform chosenPlatform; - auto nr_gpus = 0; - for (const auto &platform : platforms) { - auto devices = platform.get_devices(); - nr_gpus = 0; - for (const auto &device : devices) { - if (device.is_gpu()) { // Choose GPU, you can use other criteria - chosenPlatform = platform; - nr_gpus++; - } + sycl::platform ChosenPlatform; + auto NbGpus = 0; + for (const auto& Platform : Platforms) { + auto Devices = Platform.get_devices(); + NbGpus = 0; + for (const auto& Device : Devices) { + if (Device.is_gpu()) { // Choose GPU, you can use other criteria + ChosenPlatform = Platform; + NbGpus++; + } } } - if (!nr_gpus) { + if (!NbGpus) { firestarter::log::warn() << "No suitable platform with GPU found."; return; } - // Get a list of devices for the chosen platform - auto devices = chosenPlatform.get_devices(); - + // Get a list of devices for the chosen platform + auto Devices = ChosenPlatform.get_devices(); - firestarter::log::trace() << "Creating SYCL queue for computation on device nr. 
" - << device_index; - auto chosenDevice = devices[device_index]; - sycl::queue device_queue(chosenDevice); + firestarter::log::trace() << "Creating SYCL queue for computation on device nr. " << DeviceIndex; + auto ChosenDevice = Devices[DeviceIndex]; + auto DeviceQueue = sycl::queue(ChosenDevice); - firestarter::log::trace() << "Get memory size on device nr. " << device_index; - + firestarter::log::trace() << "Get memory size on device nr. " << DeviceIndex; // getting information about the GPU memory - size_t memory_total = devices[device_index].get_info(); - - firestarter::log::trace() << "Get Memory info on device nr. " - << device_index - <<": has " << memory_total << " B global memory"; - - // check if the user has not set a matrix OR has set a too big matrixsite and - // if this is true: set a good matrixsize - if (!size_use || ((size_use * size_use * sizeof(T) * 3 > memory_total))) { - size_use = round_up((int)(0.8 * sqrt(((memory_total) / (sizeof(T) * 3)))), - 1024); // a multiple of 1024 works always well + size_t MemoryTotal = Devices[DeviceIndex].get_info(); + + firestarter::log::trace() << "Get Memory info on device nr. " << DeviceIndex << ": has " << MemoryTotal + << " B global memory"; + + // If the matrix size is not set or three square matrices with dim size of MatrixSize do not fit into the available + // memory, select the size so that 3 square matrices will fit into the available device memory where the dim size + // is a multiple of 1024. 
+ std::size_t MemorySize = sizeof(FloatingPointType) * MatrixSize * MatrixSize; + if (!MatrixSize || (MemorySize * 3 > MemoryTotal)) { + // a multiple of 1024 works always well + MatrixSize = roundUp<1024>(0.8 * std::sqrt(MemoryTotal / sizeof(FloatingPointType) / 3)); + MemorySize = sizeof(FloatingPointType) * MatrixSize * MatrixSize; } - firestarter::log::trace() << "Set OneAPI matrix size in B: " << size_use; - use_bytes =sizeof(T) * size_use * size_use * 3; - - + firestarter::log::trace() << "Set OneAPI matrix size in B: " << MatrixSize; /* Allocate A/B/C matrices */ - firestarter::log::trace() - << "Allocating memory on device nr. " - << device_index; - auto A = malloc_device(size_use * size_use, device_queue); - auto B = malloc_device(size_use * size_use, device_queue); - auto C = malloc_device(size_use * size_use, device_queue); + firestarter::log::trace() << "Allocating memory on device nr. " << DeviceIndex; + auto* A = sycl::malloc_device(MatrixSize * MatrixSize, DeviceQueue); + auto* B = sycl::malloc_device(MatrixSize * MatrixSize, DeviceQueue); + auto* C = sycl::malloc_device(MatrixSize * MatrixSize, DeviceQueue); /* Create 64 MB random data on Host */ - constexpr int rd_size = 1024*1024*64; - auto random_data = malloc_host(rd_size, device_queue); - generate_random_data(rd_size, random_data); + constexpr int RandomSize = 1024 * 1024 * 64; + auto* RandomData = sycl::malloc_host(RandomSize, DeviceQueue); + fillArrayWithRandomFloats(RandomSize, RandomData); - firestarter::log::trace() - << "Copy memory to device nr. " - << device_index; + firestarter::log::trace() << "Copy memory to device nr. 
" << DeviceIndex; /* fill A and B with random data */ - replicate_data(device_queue, A, size_use * size_use, random_data, rd_size); - replicate_data(device_queue, B, size_use * size_use, random_data, rd_size); + replicateData(DeviceQueue, A, MatrixSize * MatrixSize, RandomData, RandomSize); + replicateData(DeviceQueue, B, MatrixSize * MatrixSize, RandomData, RandomSize); { - std::lock_guard lk(waitForInitCvMutex); - -#define TO_MB(x) (unsigned long)(x / 1024 / 1024) - firestarter::log::info() - << " GPU " << device_index << "\n" - << " name: " << devices[device_index].get_info() << "\n" - << " memory: " << TO_MB(memory_total) << " MiB total (using " << TO_MB(use_bytes) - << " MiB)\n" - << " matrix size: " << size_use << "\n" - << " used precision: " - << ((sizeof(T) == sizeof(double)) ? "double" : "single"); -#undef TO_MB - - initCount++; + std::lock_guard lk(WaitForInitCvMutex); + + auto ToMiB = [](const size_t Val) { return Val / 1024 / 1024; }; + firestarter::log::info() << " GPU " << DeviceIndex << "\n" + << " name: " << Devices[DeviceIndex].get_info() + << "\n" + << " memory: " << ToMiB(MemoryTotal) << " MiB total (using " + << ToMiB(MemorySize) << " MiB)\n" + << " matrix size: " << MatrixSize << "\n" + << " used precision: " + << ((sizeof(FloatingPointType) == sizeof(double)) ? "double" : "single"); + + InitCount++; } - waitForInitCv.notify_all(); - - firestarter::log::trace() << "Run gemm on device nr. " << device_index; - /* With this, we could run multiple gemms ...*/ -/* auto run_gemms = [=, &device_queue](int runs) -> double { - using namespace oneapi::mkl; - for (int i = 0; i < runs; i++) - - return runs; - }; -*/ - while (*loadVar != LOAD_STOP) { - firestarter::log::trace() << "Run gemm on device nr. " << device_index; - oneapi::mkl::blas::gemm(device_queue, oneapi::mkl::transpose::N, oneapi::mkl::transpose::N, size_use, size_use, size_use, 1, A, size_use, B, size_use, 0, C, size_use); - firestarter::log::trace() << "wait gemm on device nr. 
" << device_index; - device_queue.wait_and_throw(); + WaitForInitCv.notify_all(); + + firestarter::log::trace() << "Run gemm on device nr. " << DeviceIndex; + while (LoadVar != firestarter::LoadThreadWorkType::LoadStop) { + firestarter::log::trace() << "Run gemm on device nr. " << DeviceIndex; + ::oneapi::mkl::blas::gemm(DeviceQueue, ::oneapi::mkl::transpose::N, ::oneapi::mkl::transpose::N, MatrixSize, + MatrixSize, MatrixSize, 1, A, MatrixSize, B, MatrixSize, 0, C, MatrixSize); + firestarter::log::trace() << "wait gemm on device nr. " << DeviceIndex; + DeviceQueue.wait_and_throw(); } - } -OneAPI::OneAPI(volatile unsigned long long *loadVar, bool useFloat, bool useDouble, - unsigned matrixSize, int gpus) { - std::thread t(OneAPI::initGpus, std::ref(_waitForInitCv), loadVar, useFloat, - useDouble, matrixSize, gpus); - _initThread = std::move(t); +} // namespace + +OneAPI::OneAPI(const volatile firestarter::LoadThreadWorkType& LoadVar, bool UseFloat, bool UseDouble, + unsigned MatrixSize, int Gpus) { + std::condition_variable WaitForInitCv; + std::mutex WaitForInitCvMutex; + + std::thread T(OneAPI::initGpus, std::ref(WaitForInitCv), std::cref(LoadVar), UseFloat, UseDouble, MatrixSize, Gpus); + InitThread = std::move(T); - std::unique_lock lk(_waitForInitCvMutex); + std::unique_lock Lk(WaitForInitCvMutex); // wait for gpus to initialize - _waitForInitCv.wait(lk); + WaitForInitCv.wait(Lk); } -void OneAPI::initGpus(std::condition_variable &cv, - volatile unsigned long long *loadVar, bool useFloat, - bool useDouble, unsigned matrixSize, int gpus) { - std::condition_variable waitForInitCv; - std::mutex waitForInitCvMutex; +void OneAPI::initGpus(std::condition_variable& WaitForInitCv, const volatile firestarter::LoadThreadWorkType& LoadVar, + bool UseFloat, bool UseDouble, unsigned MatrixSize, int Gpus) { + std::condition_variable GpuThreadsWaitForInitCv; + std::mutex GpuThreadsWaitForInitCvMutex; + std::vector GpuThreads; - if (gpus) { + if (Gpus != 0) { + auto Platforms 
= sycl::platform::get_platforms(); - auto platforms = sycl::platform::get_platforms(); - - if (platforms.empty()) { + if (Platforms.empty()) { std::cerr << "No SYCL platforms found." << std::endl; return; } // Choose a platform based on specific criteria (e.g., device type) - sycl::platform chosenPlatform; - auto devCount = 0; - for (const auto &platform : platforms) { - auto devices = platform.get_devices(); - devCount = 0; - for (const auto &device : devices) { - if (device.is_gpu()) { // Choose GPU, you can use other criteria - chosenPlatform = platform; - devCount++; - } + // TODO(Issue #75): We may select the incorrect platform with gpu devices of the wrong vendor/type. + auto DevCount = 0; + for (const auto& Platform : Platforms) { + auto Devices = Platform.get_devices(); + DevCount = 0; + for (const auto& Device : Devices) { + if (Device.is_gpu()) { // Choose GPU, you can use other criteria + DevCount++; + } } } - if (devCount) { - std::vector gpuThreads; - std::atomic initCount = 0; - int use_double; + if (DevCount) { + std::atomic InitCount = 0; + int UseDoubleConverted; - if (useFloat) { - use_double = 0; - } else if (useDouble) { - use_double = 1; + if (UseFloat) { + UseDoubleConverted = 0; + } else if (UseDouble) { + UseDoubleConverted = 1; } else { - use_double = 2; + UseDoubleConverted = 2; } firestarter::log::info() @@ -322,67 +300,58 @@ void OneAPI::initGpus(std::condition_variable &cv, << "\n graphics processor characteristics:"; // use all GPUs if the user gave no information about use_device - if (gpus < 0) { - gpus = devCount; + if (Gpus < 0) { + Gpus = DevCount; } - if (gpus > devCount) { - firestarter::log::warn() << "You requested more OneAPI devices than available."; - firestarter::log::warn() - << "FIRESTARTER will use " << devCount << " of the requested " - << gpus << " OneAPI device(s)"; - gpus = devCount; + + if (Gpus > DevCount) { + firestarter::log::warn() << "You requested more OneAPI devices than available. 
" + "Maybe you set OneAPI_VISIBLE_DEVICES?"; + firestarter::log::warn() << "FIRESTARTER will use " << DevCount << " of the requested " << Gpus + << " OneAPI device(s)"; + Gpus = DevCount; } { - std::lock_guard lk(waitForInitCvMutex); + const std::lock_guard Lk(GpuThreadsWaitForInitCvMutex); - for (int i = 0; i < gpus; ++i) { - // if there's a GPU in the system without Double Precision support, we - // have to correct this. - int precision = get_precision(i, use_double); - if (precision == -1){ + for (int I = 0; I < Gpus; ++I) { + const auto Precision = getPrecision(I, UseDoubleConverted); + if (Precision == -1) { firestarter::log::warn() << "This should not have happened. Could not get precision via SYCL."; } + void (*LoadFunc)(std::condition_variable&, std::mutex&, int, std::atomic&, + const volatile firestarter::LoadThreadWorkType&, unsigned) = + Precision ? createLoad : createLoad; - if (precision) { - firestarter::log::trace() << "Starting OneAPI GPU double workload."; - std::thread t(create_load, std::ref(waitForInitCv), - std::ref(waitForInitCvMutex), i, std::ref(initCount), - loadVar, (int)matrixSize); - gpuThreads.push_back(std::move(t)); - } else { - firestarter::log::trace() << "Starting OneAPI GPU float workload."; - std::thread t(create_load, std::ref(waitForInitCv), - std::ref(waitForInitCvMutex), i, std::ref(initCount), - loadVar, (int)matrixSize); - gpuThreads.push_back(std::move(t)); - } + std::thread T(LoadFunc, std::ref(GpuThreadsWaitForInitCv), std::ref(GpuThreadsWaitForInitCvMutex), I, + std::ref(InitCount), std::cref(LoadVar), MatrixSize); + GpuThreads.emplace_back(std::move(T)); } } { - std::unique_lock lk(waitForInitCvMutex); + std::unique_lock Lk(GpuThreadsWaitForInitCvMutex); // wait for all threads to initialize - waitForInitCv.wait(lk, [&] { return initCount == gpus; }); - } - - // notify that init is done - cv.notify_all(); - - /* join computation threads */ - for (auto &t : gpuThreads) { - t.join(); + GpuThreadsWaitForInitCv.wait(Lk, 
[&] { return InitCount == Gpus; }); } } else { - firestarter::log::info() - << " - No OneAPI devices. Just stressing CPU(s). Maybe use " - "FIRESTARTER instead of FIRESTARTER_OneAPI?"; - cv.notify_all(); + firestarter::log::info() << " - No OneAPI" + << " devices. Just stressing CPU(s). Maybe use " + "FIRESTARTER instead of FIRESTARTER_OneAPI?"; } } else { - firestarter::log::info() - << " --gpus 0 is set. Just stressing CPU(s). Maybe use " - "FIRESTARTER instead of FIRESTARTER_OneAPI?"; - cv.notify_all(); + firestarter::log::info() << " --gpus 0 is set. Just stressing CPU(s). Maybe use " + "FIRESTARTER instead of FIRESTARTER_OneAPI?"; + } + + // notify that init is done + WaitForInitCv.notify_all(); + + /* join computation threads */ + for (auto& Thread : GpuThreads) { + Thread.join(); } } + +} // namespace firestarter::oneapi \ No newline at end of file diff --git a/src/firestarter/Optimizer/Algorithm/NSGA2.cpp b/src/firestarter/Optimizer/Algorithm/NSGA2.cpp index 7c8a8146..c515e429 100644 --- a/src/firestarter/Optimizer/Algorithm/NSGA2.cpp +++ b/src/firestarter/Optimizer/Algorithm/NSGA2.cpp @@ -21,165 +21,161 @@ // This file borrows a lot of code from https://github.com/esa/pagmo2 -#include -#include -#include +#include "firestarter/Optimizer/Algorithm/NSGA2.hpp" +#include "firestarter/Logging/Log.hpp" +#include "firestarter/Optimizer/Individual.hpp" +#include "firestarter/Optimizer/Util/MultiObjective.hpp" #include +#include +#include #include -using namespace firestarter::optimizer::algorithm; +namespace firestarter::optimizer::algorithm { -NSGA2::NSGA2(unsigned gen, double cr, double m) - : Algorithm(), _gen(gen), _cr(cr), _m(m) { - if (cr >= 1. || cr < 0.) { +NSGA2::NSGA2(unsigned Gen, double Cr, double M) + : Gen(Gen) + , Cr(Cr) + , M(M) { + if (Cr >= 1. || Cr < 0.) 
{ throw std::invalid_argument("The crossover probability must be in the " "[0,1[ range, while a value of " + - std::to_string(cr) + " was detected"); + std::to_string(Cr) + " was detected"); } - if (m < 0. || m > 1.) { + if (M < 0. || M > 1.) { throw std::invalid_argument("The mutation probability must be in the [0,1] " "range, while a value of " + - std::to_string(m) + " was detected"); + std::to_string(M) + " was detected"); } } -void NSGA2::checkPopulation(firestarter::optimizer::Population const &pop, - std::size_t populationSize) { - const auto &prob = pop.problem(); - - if (!prob.isMO()) { - throw std::invalid_argument( - "NSGA2 is a multiobjective algorithms, while number of objectives is " + - std::to_string(prob.getNobjs())); +void NSGA2::check(firestarter::optimizer::Problem const& Prob, std::size_t PopulationSize) { + if (!Prob.isMO()) { + throw std::invalid_argument("NSGA2 is a multiobjective algorithms, while number of objectives is " + + std::to_string(Prob.getNobjs())); } - if (populationSize < 5u || (populationSize % 4 != 0u)) { + if (PopulationSize < 5U || (PopulationSize % 4 != 0U)) { throw std::invalid_argument("for NSGA-II at least 5 individuals in the " "population are needed and the " "population size must be a multiple of 4. 
" "Detected input population size is: " + - std::to_string(populationSize)); + std::to_string(PopulationSize)); } } -firestarter::optimizer::Population -NSGA2::evolve(firestarter::optimizer::Population &pop) { - const auto &prob = pop.problem(); - const auto bounds = prob.getBounds(); - auto NP = pop.size(); - auto fevals0 = prob.getFevals(); +auto NSGA2::evolve(firestarter::optimizer::Population& Pop) -> firestarter::optimizer::Population { + const auto& Prob = Pop.problem(); + const auto Bounds = Prob.getBounds(); + auto NP = Pop.size(); + auto Fevals0 = Prob.getFevals(); - this->checkPopulation( - const_cast(pop), NP); + this->check(Prob, NP); - std::random_device rd; - std::mt19937 rng(rd()); + std::random_device Rd; + std::mt19937 Rng(Rd()); - std::vector best_idx(NP), shuffle1(NP), shuffle2(NP); - Individual::size_type parent1_idx, parent2_idx; - std::pair children; + std::vector BestIdx(NP); + std::vector Shuffle1(NP); + std::vector Shuffle2(NP); + Individual::size_type Parent1Idx = 0; + Individual::size_type Parent2Idx = 0; + std::pair Children; - std::iota(shuffle1.begin(), shuffle1.end(), Individual::size_type(0)); - std::iota(shuffle2.begin(), shuffle2.end(), Individual::size_type(0)); + std::iota(Shuffle1.begin(), Shuffle1.end(), static_cast(0)); + std::iota(Shuffle2.begin(), Shuffle2.end(), static_cast(0)); { - std::stringstream ss; + std::stringstream Ss; - ss << std::endl << std::setw(7) << "Gen:" << std::setw(15) << "Fevals:"; - for (decltype(prob.getNobjs()) i = 0; i < prob.getNobjs(); ++i) { - ss << std::setw(15) << "ideal" << std::to_string(i + 1u) << ":"; + Ss << '\n' << std::setw(7) << "Gen:" << std::setw(15) << "Fevals:"; + for (decltype(Prob.getNobjs()) I = 0; I < Prob.getNobjs(); ++I) { + Ss << std::setw(15) << "ideal" << std::to_string(I + 1U) << ":"; } - firestarter::log::info() << ss.str(); + firestarter::log::info() << Ss.str(); } - for (decltype(_gen) gen = 1u; gen <= _gen; ++gen) { + for (auto I = 1U; I <= Gen; ++I) { { // Print the 
logs - std::vector idealPoint = util::ideal(pop.f()); - std::stringstream ss; + const auto IdealPoint = util::ideal(Pop.f()); + std::stringstream Ss; - ss << std::setw(7) << gen << std::setw(15) << prob.getFevals() - fevals0; - for (decltype(idealPoint.size()) i = 0; i < idealPoint.size(); ++i) { - ss << std::setw(15) << idealPoint[i]; + Ss << std::setw(7) << I << std::setw(15) << Prob.getFevals() - Fevals0; + for (const auto I : IdealPoint) { + Ss << std::setw(15) << I; } - firestarter::log::info() << ss.str(); + firestarter::log::info() << Ss.str(); } // At each generation we make a copy of the population into popnew - firestarter::optimizer::Population popnew(pop); + firestarter::optimizer::Population Popnew(Pop); // We create some pseudo-random permutation of the poulation indexes - std::random_shuffle(shuffle1.begin(), shuffle1.end()); - std::random_shuffle(shuffle2.begin(), shuffle2.end()); + std::shuffle(Shuffle1.begin(), Shuffle1.end(), Rng); + std::shuffle(Shuffle2.begin(), Shuffle2.end(), Rng); // We compute crowding distance and non dominated rank for the current // population - auto fnds_res = util::fast_non_dominated_sorting(pop.f()); - auto ndf = - std::get<0>(fnds_res); // non dominated fronts [[0,3,2],[1,5,6],[4],...] - std::vector pop_cd( - NP); // crowding distances of the whole population - auto ndr = - std::get<3>(fnds_res); // non domination rank [0,1,0,0,2,1,1, ... ] - for (const auto &front_idxs : ndf) { - if (front_idxs.size() == - 1u) { // handles the case where the front has collapsed to one point - pop_cd[front_idxs[0]] = std::numeric_limits::infinity(); - } else if (front_idxs.size() == 2u) { // handles the case where the front + auto FndsRes = util::fastNonDominatedSorting(Pop.f()); + auto Ndf = std::get<0>(FndsRes); // non dominated fronts [[0,3,2],[1,5,6],[4],...] + std::vector PopCd(NP); // crowding distances of the whole population + auto Ndr = std::get<3>(FndsRes); // non domination rank [0,1,0,0,2,1,1, ... 
] + for (const auto& FrontIdxs : Ndf) { + if (FrontIdxs.size() == 1U) { // handles the case where the front has collapsed to one point + PopCd[FrontIdxs[0]] = std::numeric_limits::infinity(); + } else if (FrontIdxs.size() == 2U) { // handles the case where the front // has collapsed to one point - pop_cd[front_idxs[0]] = std::numeric_limits::infinity(); - pop_cd[front_idxs[1]] = std::numeric_limits::infinity(); + PopCd[FrontIdxs[0]] = std::numeric_limits::infinity(); + PopCd[FrontIdxs[1]] = std::numeric_limits::infinity(); } else { - std::vector> front; - for (auto idx : front_idxs) { - front.push_back(pop.f()[idx]); + std::vector> Front; + Front.reserve(FrontIdxs.size()); + for (auto Idx : FrontIdxs) { + Front.push_back(Pop.f()[Idx]); } - auto cd = util::crowding_distance(front); - for (decltype(cd.size()) i = 0u; i < cd.size(); ++i) { - pop_cd[front_idxs[i]] = cd[i]; + auto Cd = util::crowdingDistance(Front); + for (decltype(Cd.size()) I = 0U; I < Cd.size(); ++I) { + PopCd[FrontIdxs[I]] = Cd[I]; } } } // We then loop thorugh all individuals with increment 4 to select two pairs // of parents that will each create 2 new offspring - for (decltype(NP) i = 0u; i < NP; i += 4) { + for (decltype(NP) I = 0U; I < NP; I += 4) { // We create two offsprings using the shuffled list 1 - parent1_idx = util::mo_tournament_selection(shuffle1[i], shuffle1[i + 1], - ndr, pop_cd, rng); - parent2_idx = util::mo_tournament_selection( - shuffle1[i + 2], shuffle1[i + 3], ndr, pop_cd, rng); - children = util::sbx_crossover(pop.x()[parent1_idx], pop.x()[parent2_idx], - _cr, rng); - util::polynomial_mutation(children.first, bounds, _m, rng); - util::polynomial_mutation(children.second, bounds, _m, rng); - - popnew.append(children.first); - popnew.append(children.second); + Parent1Idx = util::moTournamentSelection(Shuffle1[I], Shuffle1[I + 1], Ndr, PopCd, Rng); + Parent2Idx = util::moTournamentSelection(Shuffle1[I + 2], Shuffle1[I + 3], Ndr, PopCd, Rng); + Children = 
util::sbxCrossover(Pop.x()[Parent1Idx], Pop.x()[Parent2Idx], Cr, Rng); + util::polynomialMutation(Children.first, Bounds, M, Rng); + util::polynomialMutation(Children.second, Bounds, M, Rng); + + Popnew.append(Children.first); + Popnew.append(Children.second); // We repeat with the shuffled list 2 - parent1_idx = util::mo_tournament_selection(shuffle2[i], shuffle2[i + 1], - ndr, pop_cd, rng); - parent2_idx = util::mo_tournament_selection( - shuffle2[i + 2], shuffle2[i + 3], ndr, pop_cd, rng); - children = util::sbx_crossover(pop.x()[parent1_idx], pop.x()[parent2_idx], - _cr, rng); - util::polynomial_mutation(children.first, bounds, _m, rng); - util::polynomial_mutation(children.second, bounds, _m, rng); - - popnew.append(children.first); - popnew.append(children.second); - } // popnew now contains 2NP individuals - // This method returns the sorted N best individuals in the population + Parent1Idx = util::moTournamentSelection(Shuffle2[I], Shuffle2[I + 1], Ndr, PopCd, Rng); + Parent2Idx = util::moTournamentSelection(Shuffle2[I + 2], Shuffle2[I + 3], Ndr, PopCd, Rng); + Children = util::sbxCrossover(Pop.x()[Parent1Idx], Pop.x()[Parent2Idx], Cr, Rng); + util::polynomialMutation(Children.first, Bounds, M, Rng); + util::polynomialMutation(Children.second, Bounds, M, Rng); + + Popnew.append(Children.first); + Popnew.append(Children.second); + } + // Popnew now contains 2NP individuals + + // Save the best NP individuals in the population // according to the crowded comparison operator - best_idx = util::select_best_N_mo(popnew.f(), NP); - // We insert into the population - for (decltype(NP) i = 0; i < NP; ++i) { - pop.insert(i, popnew.x()[best_idx[i]], popnew.f()[best_idx[i]]); + BestIdx = util::selectBestNMo(Popnew.f(), NP); + for (decltype(NP) I = 0; I < NP; ++I) { + Pop.insert(I, Popnew.x()[BestIdx[I]], Popnew.f()[BestIdx[I]]); } } - return pop; + return Pop; } + +} // namespace firestarter::optimizer::algorithm \ No newline at end of file diff --git 
a/src/firestarter/Optimizer/OptimizerWorker.cpp b/src/firestarter/Optimizer/OptimizerWorker.cpp index 48819fd5..a82c1fa8 100644 --- a/src/firestarter/Optimizer/OptimizerWorker.cpp +++ b/src/firestarter/Optimizer/OptimizerWorker.cpp @@ -19,54 +19,56 @@ * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ -#include +#include "firestarter/Optimizer/OptimizerWorker.hpp" +#include "firestarter/Optimizer/Algorithm/NSGA2.hpp" #include +#include -using namespace firestarter::optimizer; +namespace firestarter::optimizer { -OptimizerWorker::OptimizerWorker( - std::unique_ptr &&algorithm, - firestarter::optimizer::Population &population, - std::string const &optimizationAlgorithm, unsigned individuals, - std::chrono::seconds const &preheat) - : _algorithm(std::move(algorithm)), _population(population), - _optimizationAlgorithm(optimizationAlgorithm), _individuals(individuals), - _preheat(preheat) { - pthread_create( - &this->workerThread, NULL, - reinterpret_cast(OptimizerWorker::optimizerThread), - this); +OptimizerWorker::OptimizerWorker(std::unique_ptr&& Algorithm, + std::unique_ptr&& Population, unsigned Individuals, + std::chrono::seconds const& Preheat) + : Algorithm(std::move(Algorithm)) + , Population(std::move(Population)) + , Individuals(Individuals) + , Preheat(Preheat) { + pthread_create(&this->WorkerThread, nullptr, OptimizerWorker::optimizerThread, this); } -void OptimizerWorker::kill() { +void OptimizerWorker::kill() const { // we ignore ESRCH errno if thread already exited - pthread_cancel(this->workerThread); + pthread_cancel(WorkerThread); } -void OptimizerWorker::join() { +void OptimizerWorker::join() const { // we ignore ESRCH errno if thread already exited - pthread_join(this->workerThread, NULL); + pthread_join(WorkerThread, nullptr); } -void *OptimizerWorker::optimizerThread(void *optimizerWorker) { - pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); +auto 
OptimizerWorker::optimizerThread(void* OptimizerWorker) -> void* { + // NOLINTBEGIN(cert-pos47-c,concurrency-thread-canceltype-asynchronous) + pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, nullptr); + // NOLINTEND(cert-pos47-c,concurrency-thread-canceltype-asynchronous) - auto _this = reinterpret_cast(optimizerWorker); + auto* This = static_cast(OptimizerWorker); #ifndef __APPLE__ pthread_setname_np(pthread_self(), "Optimizer"); #endif // heat the cpu before attempting to optimize - std::this_thread::sleep_for(_this->_preheat); + std::this_thread::sleep_for(This->Preheat); // For NSGA2 we start with a initial population - if (_this->_optimizationAlgorithm == "NSGA2") { - _this->_population.generateInitialPopulation(_this->_individuals); + if (dynamic_cast(This->Algorithm.get())) { + This->Population->generateInitialPopulation(This->Individuals); } - _this->_algorithm->evolve(_this->_population); + This->Algorithm->evolve(*This->Population); - return NULL; + return nullptr; } + +} // namespace firestarter::optimizer \ No newline at end of file diff --git a/src/firestarter/Optimizer/Population.cpp b/src/firestarter/Optimizer/Population.cpp index 7d3a7e1a..a5a21527 100644 --- a/src/firestarter/Optimizer/Population.cpp +++ b/src/firestarter/Optimizer/Population.cpp @@ -19,125 +19,106 @@ * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ -#include -#include +#include "firestarter/Optimizer/Population.hpp" +#include "firestarter/Logging/Log.hpp" +#include "firestarter/Optimizer/History.hpp" -#include #include -#include +#include -using namespace firestarter::optimizer; +namespace firestarter::optimizer { -void Population::generateInitialPopulation(std::size_t populationSize) { - firestarter::log::trace() << "Generating " << populationSize - << " random individuals for initial population."; +void Population::generateInitialPopulation(std::size_t PopulationSize) { + firestarter::log::trace() << 
"Generating " << PopulationSize << " random individuals for initial population."; - auto dims = this->problem().getDims(); - auto remaining = populationSize; + auto Dims = this->problem().getDims(); + auto Remaining = PopulationSize; - if (!(populationSize < dims)) { - for (decltype(dims) i = 0; i < dims; i++) { - Individual vec(dims, 0); - vec[i] = 1; - this->append(vec); + if (!(PopulationSize < Dims)) { + for (decltype(Dims) I = 0; I < Dims; I++) { + Individual Vec(Dims, 0); + Vec[I] = 1; + this->append(Vec); + Remaining--; } - - remaining -= dims; } else { - firestarter::log::trace() - << "Population size (" << std::to_string(populationSize) - << ") is less than size of problem dimension (" << std::to_string(dims) - << ")"; + firestarter::log::trace() << "Population size (" << std::to_string(PopulationSize) + << ") is less than size of problem dimension (" << std::to_string(Dims) << ")"; } - for (decltype(remaining) i = 0; i < remaining; i++) { + for (decltype(Remaining) I = 0; I < Remaining; I++) { this->append(this->getRandomIndividual()); } } -std::size_t Population::size() const { return _x.size(); } +auto Population::size() const -> std::size_t { return X.size(); } -void Population::append(Individual const &ind) { - assert(this->problem().getDims() == ind.size()); +void Population::append(Individual const& Ind) { + assert(this->problem().getDims() == Ind.size()); - std::map metrics; + std::map Metrics; // check if we already evaluated this individual - auto optional_metric = History::find(ind); - if (optional_metric.has_value()) { - metrics = optional_metric.value(); + const auto OptionalMetric = History::find(Ind); + if (OptionalMetric.has_value()) { + Metrics = OptionalMetric.value(); } else { - metrics = this->_problem->metrics(ind); + Metrics = this->ProblemPtr->metrics(Ind); } - auto fitness = this->_problem->fitness(metrics); + auto Fitness = this->ProblemPtr->fitness(Metrics); - this->append(ind, fitness); + this->append(Ind, Fitness); - if 
(!optional_metric.has_value()) { - History::append(ind, metrics); + if (!OptionalMetric.has_value()) { + History::append(Ind, Metrics); } } -void Population::append(Individual const &ind, std::vector const &fit) { - std::stringstream ss; - ss << " - Fitness: "; - for (auto const &v : fit) { - ss << v << " "; +void Population::append(Individual const& Ind, std::vector const& Fit) { + std::stringstream Ss; + Ss << " - Fitness: "; + for (auto const& V : Fit) { + Ss << V << " "; } - firestarter::log::trace() << ss.str(); + firestarter::log::trace() << Ss.str(); - assert(this->problem().getNobjs() == fit.size()); - assert(this->problem().getDims() == ind.size()); + assert(this->problem().getNobjs() == Fit.size()); + assert(this->problem().getDims() == Ind.size()); - this->_x.push_back(ind); - this->_f.push_back(fit); + this->X.push_back(Ind); + this->F.push_back(Fit); } -void Population::insert(std::size_t idx, Individual const &ind, - std::vector const &fit) { +void Population::insert(std::size_t Idx, Individual const& Ind, std::vector const& Fit) { // assert that population is big enough - assert(_x.size() > idx); + assert(X.size() > Idx); - _x[idx] = ind; - _f[idx] = fit; + X[Idx] = Ind; + F[Idx] = Fit; } -Individual Population::getRandomIndividual() { - auto dims = this->problem().getDims(); - auto const bounds = this->problem().getBounds(); - - firestarter::log::trace() << "Generating random individual of size: " << dims; +auto Population::getRandomIndividual() const -> Individual { + auto Dims = this->problem().getDims(); + auto const Bounds = this->problem().getBounds(); - Individual out(dims); + std::random_device Rd; + std::mt19937 Rng(Rd()); - for (decltype(dims) i = 0; i < dims; i++) { - auto const lb = std::get<0>(bounds[i]); - auto const ub = std::get<1>(bounds[i]); + firestarter::log::trace() << "Generating random individual of size: " << Dims; - out[i] = std::uniform_int_distribution(lb, ub)(this->gen); + Individual Out(Dims); - firestarter::log::trace() 
- << " - " << i << ": [" << lb << "," << ub << "]: " << out[i]; - } + for (decltype(Dims) I = 0; I < Dims; I++) { + auto const Lb = std::get<0>(Bounds[I]); + auto const Ub = std::get<1>(Bounds[I]); - return out; -} + Out[I] = std::uniform_int_distribution(Lb, Ub)(Rng); -std::optional Population::bestIndividual() const { - // return an empty vector if the problem is multi objective, as there is no - // single best individual - if (this->problem().isMO()) { - return {}; + firestarter::log::trace() << " - " << I << ": [" << Lb << "," << Ub << "]: " << Out[I]; } - // assert that we have individuals - assert(this->_x.size() > 0); - - auto best = std::max_element(this->_x.begin(), this->_x.end(), - [](auto a, auto b) { return a < b; }); - - assert(best != this->_x.end()); - - return *best; + return Out; } + +} // namespace firestarter::optimizer \ No newline at end of file diff --git a/src/firestarter/Optimizer/Util/MultiObjective.cpp b/src/firestarter/Optimizer/Util/MultiObjective.cpp index 2c87ba2f..7cae260a 100644 --- a/src/firestarter/Optimizer/Util/MultiObjective.cpp +++ b/src/firestarter/Optimizer/Util/MultiObjective.cpp @@ -21,7 +21,7 @@ // This file borrows a lot of code from https://github.com/esa/pagmo2 -#include +#include "firestarter/Optimizer/Util/MultiObjective.hpp" #include #include @@ -32,35 +32,30 @@ namespace firestarter::optimizer::util { // Less than compares floating point types placing nans after inf or before -inf // It is a useful function when calling e.g. 
std::sort to guarantee a weak // strict ordering and avoid an undefined behaviour -bool less_than_f(double a, double b) { - if (!std::isnan(a)) { - if (!std::isnan(b)) - return a < b; // a < b - else - return true; // a < nan - } else { - if (!std::isnan(b)) - return false; // nan < b - else - return false; // nan < nan +auto lessThanF(double A, double B) -> bool { + if (!std::isnan(A)) { + if (!std::isnan(B)) { + return A < B; // a < b + } + return true; // a < nan } + // nan < b or nan < nan + return false; } // Greater than compares floating point types placing nans after inf or before // -inf It is a useful function when calling e.g. std::sort to guarantee a weak // strict ordering and avoid an undefined behaviour -bool greater_than_f(double a, double b) { - if (!std::isnan(a)) { - if (!std::isnan(b)) - return a > b; // a > b - else - return false; // a > nan - } else { - if (!std::isnan(b)) - return true; // nan > b - else - return false; // nan > nan +auto greaterThanF(double A, double B) -> bool { + if (!std::isnan(A)) { + if (!std::isnan(B)) { + return A > B; // a > b + } + return false; // a > nan } + // nan > b -> true + // nan > nan -> false + return !std::isnan(B); } /// Pareto-dominance @@ -81,23 +76,22 @@ bool greater_than_f(double a, double b) { * @throws std::invalid_argument if the dimensions of the two objectives are * different */ -bool pareto_dominance(const std::vector &obj1, - const std::vector &obj2) { - if (obj1.size() != obj2.size()) { +auto paretoDominance(const std::vector& Obj1, const std::vector& Obj2) -> bool { + if (Obj1.size() != Obj2.size()) { throw std::invalid_argument( - "Different number of objectives found in input fitnesses: " + - std::to_string(obj1.size()) + " and " + std::to_string(obj2.size()) + - ". I cannot define dominance"); + "Different number of objectives found in input fitnesses: " + std::to_string(Obj1.size()) + " and " + + std::to_string(Obj2.size()) + ". 
I cannot define dominance"); } - bool found_strictly_dominating_dimension = false; - for (decltype(obj1.size()) i = 0u; i < obj1.size(); ++i) { - if (greater_than_f(obj2[i], obj1[i])) { + bool FoundStrictlyDominatingDimension = false; + for (decltype(Obj1.size()) I = 0U; I < Obj1.size(); ++I) { + if (greaterThanF(Obj2[I], Obj1[I])) { return false; - } else if (less_than_f(obj2[i], obj1[i])) { - found_strictly_dominating_dimension = true; + } + if (lessThanF(Obj2[I], Obj1[I])) { + FoundStrictlyDominatingDimension = true; } } - return found_strictly_dominating_dimension; + return FoundStrictlyDominatingDimension; } /// Fast non dominated sorting @@ -130,67 +124,63 @@ bool pareto_dominance(const std::vector &obj1, * * @throws std::invalid_argument If the size of \p points is not at least 2 */ -std::tuple>, - std::vector>, std::vector, - std::vector> -fast_non_dominated_sorting(const std::vector> &points) { - auto N = points.size(); +auto fastNonDominatedSorting(const std::vector>& Points) + -> std::tuple>, std::vector>, + std::vector, std::vector> { + auto N = Points.size(); // We make sure to have two points at least (one could also be allowed) - if (N < 2u) { - throw std::invalid_argument( - "At least two points are needed for fast_non_dominated_sorting: " + - std::to_string(N) + " detected."); + if (N < 2U) { + throw std::invalid_argument("At least two points are needed for fast_non_dominated_sorting: " + std::to_string(N) + + " detected."); } // Initialize the return values - std::vector> non_dom_fronts(1u); - std::vector> dom_list(N); - std::vector dom_count(N); - std::vector non_dom_rank(N); + std::vector> NonDomFronts(1U); + std::vector> DomList(N); + std::vector DomCount(N); + std::vector NonDomRank(N); // Start the fast non dominated sort algorithm - for (decltype(N) i = 0u; i < N; ++i) { - dom_list[i].clear(); - dom_count[i] = 0u; - for (decltype(N) j = 0u; j < i; ++j) { - if (pareto_dominance(points[i], points[j])) { - dom_list[i].push_back(j); - 
++dom_count[j]; - } else if (pareto_dominance(points[j], points[i])) { - dom_list[j].push_back(i); - ++dom_count[i]; + for (decltype(N) I = 0U; I < N; ++I) { + DomList[I].clear(); + DomCount[I] = 0U; + for (decltype(N) J = 0U; J < I; ++J) { + if (paretoDominance(Points[I], Points[J])) { + DomList[I].push_back(J); + ++DomCount[J]; + } else if (paretoDominance(Points[J], Points[I])) { + DomList[J].push_back(I); + ++DomCount[I]; } } } - for (decltype(N) i = 0u; i < N; ++i) { - if (dom_count[i] == 0u) { - non_dom_rank[i] = 0u; - non_dom_fronts[0].push_back(i); + for (decltype(N) I = 0U; I < N; ++I) { + if (DomCount[I] == 0U) { + NonDomRank[I] = 0U; + NonDomFronts[0].push_back(I); } } // we copy dom_count as we want to output its value at this point - auto dom_count_copy(dom_count); - auto current_front = non_dom_fronts[0]; - std::vector>::size_type front_counter(0u); - while (current_front.size() != 0u) { - std::vector next_front; - for (decltype(current_front.size()) p = 0u; p < current_front.size(); ++p) { - for (decltype(dom_list[current_front[p]].size()) q = 0u; - q < dom_list[current_front[p]].size(); ++q) { - --dom_count_copy[dom_list[current_front[p]][q]]; - if (dom_count_copy[dom_list[current_front[p]][q]] == 0u) { - non_dom_rank[dom_list[current_front[p]][q]] = front_counter + 1u; - next_front.push_back(dom_list[current_front[p]][q]); + auto DomCountCopy(DomCount); + auto CurrentFront = NonDomFronts[0]; + std::vector>::size_type FrontCounter(0U); + while (!CurrentFront.empty()) { + std::vector NextFront; + for (const auto& P : CurrentFront) { + for (const auto& Q : DomList[P]) { + --DomCountCopy[Q]; + if (DomCountCopy[Q] == 0U) { + NonDomRank[Q] = FrontCounter + 1U; + NextFront.push_back(Q); } } } - ++front_counter; - current_front = next_front; - if (current_front.size() != 0u) { - non_dom_fronts.push_back(current_front); + ++FrontCounter; + CurrentFront = NextFront; + if (!CurrentFront.empty()) { + NonDomFronts.push_back(CurrentFront); } } - return 
std::make_tuple(std::move(non_dom_fronts), std::move(dom_list), - std::move(dom_count), std::move(non_dom_rank)); + return std::make_tuple(std::move(NonDomFronts), std::move(DomList), std::move(DomCount), std::move(NonDomRank)); } /// Crowding distance @@ -218,69 +208,64 @@ fast_non_dominated_sorting(const std::vector> &points) { * @throws std::invalid_argument If points in \p non_dom_front do not all have * the same dimensionality */ -std::vector -crowding_distance(const std::vector> &non_dom_front) { - auto N = non_dom_front.size(); +auto crowdingDistance(const std::vector>& NonDomFront) -> std::vector { + auto N = NonDomFront.size(); // We make sure to have two points at least - if (N < 2u) { - throw std::invalid_argument( - "A non dominated front must contain at least two points: " + - std::to_string(N) + " detected."); + if (N < 2U) { + throw std::invalid_argument("A non dominated front must contain at least two points: " + std::to_string(N) + + " detected."); } - auto M = non_dom_front[0].size(); + auto M = NonDomFront[0].size(); // We make sure the first point of the input non dominated front contains at // least two objectives - if (M < 2u) { + if (M < 2U) { throw std::invalid_argument("Points in the non dominated front must " "contain at least two objectives: " + std::to_string(M) + " detected."); } // We make sure all points contain the same number of objectives - if (!std::all_of( - non_dom_front.begin(), non_dom_front.end(), - [M](const std::vector &item) { return item.size() == M; })) { + if (!std::all_of(NonDomFront.begin(), NonDomFront.end(), + [M](const std::vector& Item) { return Item.size() == M; })) { throw std::invalid_argument("A non dominated front must contain points of " "uniform dimensionality. 
Some " "different sizes were instead detected."); } - std::vector indexes(N); - std::iota(indexes.begin(), indexes.end(), std::size_t(0u)); - std::vector retval(N, 0.); - for (decltype(M) i = 0u; i < M; ++i) { - std::sort(indexes.begin(), indexes.end(), - [i, &non_dom_front](std::size_t idx1, std::size_t idx2) { - return less_than_f(non_dom_front[idx1][i], - non_dom_front[idx2][i]); - }); - retval[indexes[0]] = std::numeric_limits::infinity(); - retval[indexes[N - 1u]] = std::numeric_limits::infinity(); - double df = - non_dom_front[indexes[N - 1u]][i] - non_dom_front[indexes[0]][i]; - for (decltype(N - 2u) j = 1u; j < N - 1u; ++j) { - retval[indexes[j]] += (non_dom_front[indexes[j + 1u]][i] - - non_dom_front[indexes[j - 1u]][i]) / - df; + std::vector Indexes(N); + std::iota(Indexes.begin(), Indexes.end(), static_cast(0U)); + std::vector Retval(N, 0.); + for (decltype(M) I = 0U; I < M; ++I) { + std::sort(Indexes.begin(), Indexes.end(), [I, &NonDomFront](std::size_t Idx1, std::size_t Idx2) { + return lessThanF(NonDomFront[Idx1][I], NonDomFront[Idx2][I]); + }); + Retval[Indexes[0]] = std::numeric_limits::infinity(); + Retval[Indexes[N - 1U]] = std::numeric_limits::infinity(); + const double Df = NonDomFront[Indexes[N - 1U]][I] - NonDomFront[Indexes[0]][I]; + for (decltype(N - 2U) J = 1U; J < N - 1U; ++J) { + Retval[Indexes[J]] += (NonDomFront[Indexes[J + 1U]][I] - NonDomFront[Indexes[J - 1U]][I]) / Df; } } - return retval; + return Retval; } // Multi-objective tournament selection. Requires all sizes to be consistent. // Does not check if input is well formed. 
-std::vector::size_type mo_tournament_selection( - std::vector::size_type idx1, std::vector::size_type idx2, - const std::vector::size_type> &non_domination_rank, - const std::vector &crowding_d, std::mt19937 &mt) { - if (non_domination_rank[idx1] < non_domination_rank[idx2]) - return idx1; - if (non_domination_rank[idx1] > non_domination_rank[idx2]) - return idx2; - if (crowding_d[idx1] > crowding_d[idx2]) - return idx1; - if (crowding_d[idx1] < crowding_d[idx2]) - return idx2; - std::uniform_real_distribution<> drng(0., 1.); - return ((drng(mt) < 0.5) ? idx1 : idx2); +auto moTournamentSelection(std::vector::size_type Idx1, std::vector::size_type Idx2, + const std::vector::size_type>& NonDominationRank, + const std::vector& CrowdingD, std::mt19937& Mt) -> std::vector::size_type { + if (NonDominationRank[Idx1] < NonDominationRank[Idx2]) { + return Idx1; + } + if (NonDominationRank[Idx1] > NonDominationRank[Idx2]) { + return Idx2; + } + if (CrowdingD[Idx1] > CrowdingD[Idx2]) { + return Idx1; + } + if (CrowdingD[Idx1] < CrowdingD[Idx2]) { + return Idx2; + } + std::uniform_real_distribution<> Drng(0., 1.); + return ((Drng(Mt) < 0.5) ? Idx1 : Idx2); } // Implementation of the binary crossover. @@ -288,66 +273,56 @@ std::vector::size_type mo_tournament_selection( // otherwise Requires dimensions of the parent and bounds to be equal -> out of // bound reads. 
nix is the integer dimension (integer alleles assumed at the end // of the chromosome) -std::pair -sbx_crossover(const firestarter::optimizer::Individual &parent1, - const firestarter::optimizer::Individual &parent2, - const double p_cr, std::mt19937 &mt) { +auto sbxCrossover(const firestarter::optimizer::Individual& Parent1, const firestarter::optimizer::Individual& Parent2, + const double PCr, std::mt19937& Mt) + -> std::pair { // Decision vector dimensions - auto nix = parent1.size(); - firestarter::optimizer::Individual::size_type site1, site2; + auto Nix = Parent1.size(); // Initialize the child decision vectors - firestarter::optimizer::Individual child1 = parent1; - firestarter::optimizer::Individual child2 = parent2; + firestarter::optimizer::Individual Child1 = Parent1; + firestarter::optimizer::Individual Child2 = Parent2; // Random distributions - std::uniform_real_distribution<> drng(0., + std::uniform_real_distribution<> Drng(0., 1.); // to generate a number in [0, 1) // This implements a Simulated Binary Crossover SBX - if (drng(mt) < - p_cr) { // No crossever at all will happen with probability p_cr + if (Drng(Mt) < PCr) { // No crossever at all will happen with probability p_cr // This implements two-points crossover and applies it to the integer part // of the chromosome. 
- if (nix > 0u) { - std::uniform_int_distribution< - firestarter::optimizer::Individual::size_type> - ra_num(0, nix - 1u); - site1 = ra_num(mt); - site2 = ra_num(mt); - if (site1 > site2) { - std::swap(site1, site2); + if (Nix > 0U) { + std::uniform_int_distribution RaNum(0, Nix - 1U); + auto Site1 = RaNum(Mt); + auto Site2 = RaNum(Mt); + if (Site1 > Site2) { + std::swap(Site1, Site2); } - for (decltype(site2) j = site1; j <= site2; ++j) { - child1[j] = parent2[j]; - child2[j] = parent1[j]; + for (decltype(Site2) J = Site1; J <= Site2; ++J) { + Child1[J] = Parent2[J]; + Child2[J] = Parent1[J]; } } } - return std::make_pair(std::move(child1), std::move(child2)); + return std::make_pair(std::move(Child1), std::move(Child2)); } // Performs polynomial mutation. Requires all sizes to be consistent. Does not // check if input is well formed. p_m is the mutation probability -void polynomial_mutation( - firestarter::optimizer::Individual &child, - const std::vector> &bounds, const double p_m, - std::mt19937 &mt) { +void polynomialMutation(firestarter::optimizer::Individual& Child, + const std::vector>& Bounds, const double PM, std::mt19937& Mt) { // Decision vector dimensions - auto nix = child.size(); + auto Nix = Child.size(); // Random distributions - std::uniform_real_distribution<> drng(0., + std::uniform_real_distribution<> Drng(0., 1.); // to generate a number in [0, 1) // This implements the integer mutation for an individual - for (decltype(nix) j = 0; j < nix; ++j) { - if (drng(mt) < p_m) { + for (decltype(Nix) J = 0; J < Nix; ++J) { + if (Drng(Mt) < PM) { // We need to draw a random integer in [lb, ub]. 
- auto lb = std::get<0>(bounds[j]); - auto ub = std::get<1>(bounds[j]); - std::uniform_int_distribution< - firestarter::optimizer::Individual::size_type> - dist(lb, ub); - auto mutated = dist(mt); - child[j] = mutated; + auto Lb = std::get<0>(Bounds[J]); + auto Ub = std::get<1>(Bounds[J]); + std::uniform_int_distribution Dist(Lb, Ub); + auto Mutated = Dist(Mt); + Child[J] = Mutated; } } } @@ -384,61 +359,58 @@ void polynomial_mutation( * @throws unspecified all exceptions thrown by * pagmo::fast_non_dominated_sorting and pagmo::crowding_distance */ -std::vector -select_best_N_mo(const std::vector> &input_f, - std::size_t N) { - if (N == 0u) { // corner case +auto selectBestNMo(const std::vector>& InputF, std::size_t N) -> std::vector { + if (N == 0U) { // corner case return {}; } - if (input_f.size() == 0u) { // corner case + if (InputF.empty()) { // corner case return {}; } - if (input_f.size() == 1u) { // corner case - return {0u}; + if (InputF.size() == 1U) { // corner case + return {0U}; } - if (N >= input_f.size()) { // corner case - std::vector retval(input_f.size()); - std::iota(retval.begin(), retval.end(), std::size_t(0u)); - return retval; + if (N >= InputF.size()) { // corner case + std::vector Retval(InputF.size()); + std::iota(Retval.begin(), Retval.end(), static_cast(0U)); + return Retval; } - std::vector retval; - std::vector::size_type front_id(0u); + std::vector Retval; + std::vector::size_type FrontId(0U); // Run fast-non-dominated sorting - auto tuple = fast_non_dominated_sorting(input_f); + auto Tuple = fastNonDominatedSorting(InputF); // Insert all non dominated fronts if not more than N - for (const auto &front : std::get<0>(tuple)) { - if (retval.size() + front.size() <= N) { - for (auto i : front) { - retval.push_back(i); + for (const auto& Front : std::get<0>(Tuple)) { + if (Retval.size() + Front.size() <= N) { + for (auto I : Front) { + Retval.push_back(I); } - if (retval.size() == N) { - return retval; + if (Retval.size() == N) { + return 
Retval; } - ++front_id; + ++FrontId; } else { break; } } - auto front = std::get<0>(tuple)[front_id]; - std::vector> non_dom_fits(front.size()); + auto Front = std::get<0>(Tuple)[FrontId]; + std::vector> NonDomFits(Front.size()); // Run crowding distance for the front - for (decltype(front.size()) i = 0u; i < front.size(); ++i) { - non_dom_fits[i] = input_f[front[i]]; + for (decltype(Front.size()) I = 0U; I < Front.size(); ++I) { + NonDomFits[I] = InputF[Front[I]]; } - std::vector cds(crowding_distance(non_dom_fits)); + std::vector Cds(crowdingDistance(NonDomFits)); // We now have front and crowding distance, we sort the front w.r.t. the // crowding - std::vector idxs(front.size()); - std::iota(idxs.begin(), idxs.end(), std::size_t(0u)); - std::sort(idxs.begin(), idxs.end(), - [&cds](std::size_t idx1, std::size_t idx2) { - return greater_than_f(cds[idx1], cds[idx2]); - }); // Descending order1 - auto remaining = N - retval.size(); - for (decltype(remaining) i = 0u; i < remaining; ++i) { - retval.push_back(front[idxs[i]]); + std::vector Idxs(Front.size()); + std::iota(Idxs.begin(), Idxs.end(), static_cast(0U)); + std::sort(Idxs.begin(), Idxs.end(), [&Cds](std::size_t Idx1, std::size_t Idx2) { + return greaterThanF(Cds[Idx1], Cds[Idx2]); + }); // Descending order1 + auto Remaining = N - Retval.size(); + for (decltype(Remaining) I = 0U; I < Remaining; ++I) { + Retval.push_back(Front[Idxs[I]]); } - return retval; + return Retval; } /// Ideal point @@ -458,31 +430,30 @@ select_best_N_mo(const std::vector> &input_f, * @throws std::invalid_argument if the input objective vectors are not all of * the same size */ -std::vector ideal(const std::vector> &points) { +auto ideal(const std::vector>& Points) -> std::vector { // Corner case - if (points.size() == 0u) { + if (Points.empty()) { return {}; } // Sanity checks - auto M = points[0].size(); - for (const auto &f : points) { - if (f.size() != M) { + auto M = Points[0].size(); + for (const auto& F : Points) { + if (F.size() 
!= M) { throw std::invalid_argument("Input vector of objectives must contain " "fitness vector of equal dimension " + std::to_string(M)); } } // Actual algorithm - std::vector retval(M); - for (decltype(M) i = 0u; i < M; ++i) { - retval[i] = (*std::min_element( - points.begin(), points.end(), - [i](const std::vector &f1, const std::vector &f2) { - return util::greater_than_f(f1[i], f2[i]); - }))[i]; + std::vector Retval(M); + for (decltype(M) I = 0U; I < M; ++I) { + Retval[I] = (*std::min_element(Points.begin(), Points.end(), + [I](const std::vector& F1, const std::vector& F2) { + return util::greaterThanF(F1[I], F2[I]); + }))[I]; } - return retval; + return Retval; } } // namespace firestarter::optimizer::util diff --git a/include/firestarter/Measurement/Metric/Perf.h b/src/firestarter/SafeExit.cpp similarity index 75% rename from include/firestarter/Measurement/Metric/Perf.h rename to src/firestarter/SafeExit.cpp index 72221cca..4aed7a50 100644 --- a/include/firestarter/Measurement/Metric/Perf.h +++ b/src/firestarter/SafeExit.cpp @@ -19,10 +19,16 @@ * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ -#pragma once +#include "firestarter/SafeExit.hpp" -#include +#include -extern metric_interface_t perf_ipc_metric; +[[noreturn]] void firestarter::safeExit(const int Status) { + // This mutex is shared across all calls to safeExit, therefore also calls between different threads + static std::mutex ExitMutex; -extern metric_interface_t perf_freq_metric; + ExitMutex.lock(); + + // NOLINTNEXTLINE(concurrency-mt-unsafe) + std::exit(Status); +} \ No newline at end of file diff --git a/src/firestarter/WatchdogWorker.cpp b/src/firestarter/WatchdogWorker.cpp index 6a3f6b95..b5a73787 100644 --- a/src/firestarter/WatchdogWorker.cpp +++ b/src/firestarter/WatchdogWorker.cpp @@ -19,7 +19,7 @@ * Contact: daniel.hackenberg@tu-dresden.de 
*****************************************************************************/ -#include +#include "firestarter/Firestarter.hpp" #include #include @@ -28,11 +28,10 @@ #include #endif -using namespace firestarter; +namespace firestarter { -int Firestarter::watchdogWorker(std::chrono::microseconds period, - std::chrono::microseconds load, - std::chrono::seconds timeout) { +void Firestarter::watchdogWorker(std::chrono::microseconds Period, std::chrono::microseconds Load, + std::chrono::seconds Timeout) { using clock = std::chrono::high_resolution_clock; using nsec = std::chrono::nanoseconds; @@ -40,56 +39,53 @@ int Firestarter::watchdogWorker(std::chrono::microseconds period, using sec = std::chrono::seconds; // calculate idle time to be the rest of the period - auto idle = period - load; + auto Idle = Period - Load; // elapsed time - nsec time(0); + nsec Time(0); // do no enter the loop if we do not have to set the load level periodically, // at 0 or 100 load. - if (period > usec::zero()) { + if (Period > usec::zero()) { // this first time is critical as the period will be alligend from this // point - std::chrono::time_point startTime = clock::now(); + const auto StartTime = clock::now(); // this loop will set the load level periodically. 
for (;;) { - std::chrono::time_point currentTime = clock::now(); + const auto CurrentTime = clock::now(); // get the time already advanced in the current timeslice // this can happen if a load function does not terminates just on time - nsec advance = std::chrono::duration_cast(currentTime - startTime) % - std::chrono::duration_cast(period); + const auto Advance = + std::chrono::duration_cast(CurrentTime - StartTime) % std::chrono::duration_cast(Period); // subtract the advaned time from our timeslice by spilting it based on // the load level - nsec load_reduction = - (std::chrono::duration_cast(load).count() * advance) / - std::chrono::duration_cast(period).count(); - nsec idle_reduction = advance - load_reduction; + const auto LoadReduction = + (std::chrono::duration_cast(Load).count() * Advance) / std::chrono::duration_cast(Period).count(); + const auto IdleReduction = Advance - LoadReduction; // signal high load level - this->setLoad(LOAD_HIGH); + setLoad(LoadThreadWorkType::LoadHigh); // calculate values for nanosleep - nsec load_nsec = load - load_reduction; + const auto LoadNsec = Load - LoadReduction; // wait for time to be ellapsed with high load #ifdef ENABLE_VTRACING VT_USER_START("WD_HIGH"); #endif #ifdef ENABLE_SCOREP - SCOREP_USER_REGION_BY_NAME_BEGIN("WD_HIGH", - SCOREP_USER_REGION_TYPE_COMMON); + SCOREP_USER_REGION_BY_NAME_BEGIN("WD_HIGH", SCOREP_USER_REGION_TYPE_COMMON); #endif { - std::unique_lock lk(this->_watchdogTerminateMutex); + std::unique_lock Lk(WatchdogTerminateMutex); // abort waiting if we get the interrupt signal - this->_watchdogTerminateAlert.wait_for( - lk, load_nsec, [this]() { return this->_watchdog_terminate; }); + WatchdogTerminateAlert.wait_for(Lk, LoadNsec, []() { return WatchdogTerminate; }); // terminate on interrupt - if (this->_watchdog_terminate) { - return EXIT_SUCCESS; + if (WatchdogTerminate) { + return; } } #ifdef ENABLE_VTRACING @@ -100,27 +96,25 @@ int Firestarter::watchdogWorker(std::chrono::microseconds period, 
#endif // signal low load - this->setLoad(LOAD_LOW); + setLoad(LoadThreadWorkType::LoadLow); // calculate values for nanosleep - nsec idle_nsec = idle - idle_reduction; + const auto IdleNsec = Idle - IdleReduction; // wait for time to be ellapsed with low load #ifdef ENABLE_VTRACING VT_USER_START("WD_LOW"); #endif #ifdef ENABLE_SCOREP - SCOREP_USER_REGION_BY_NAME_BEGIN("WD_LOW", - SCOREP_USER_REGION_TYPE_COMMON); + SCOREP_USER_REGION_BY_NAME_BEGIN("WD_LOW", SCOREP_USER_REGION_TYPE_COMMON); #endif { - std::unique_lock lk(this->_watchdogTerminateMutex); + std::unique_lock Lk(WatchdogTerminateMutex); // abort waiting if we get the interrupt signal - this->_watchdogTerminateAlert.wait_for( - lk, idle_nsec, [this]() { return this->_watchdog_terminate; }); + WatchdogTerminateAlert.wait_for(Lk, IdleNsec, []() { return WatchdogTerminate; }); // terminate on interrupt - if (this->_watchdog_terminate) { - return EXIT_SUCCESS; + if (WatchdogTerminate) { + return; } } #ifdef ENABLE_VTRACING @@ -131,16 +125,15 @@ int Firestarter::watchdogWorker(std::chrono::microseconds period, #endif // increment elapsed time - time += period; + Time += Period; // exit when termination signal is received or timeout is reached { - std::lock_guard lk(this->_watchdogTerminateMutex); - if (this->_watchdog_terminate || - (timeout > sec::zero() && (time > timeout))) { - this->setLoad(LOAD_STOP); + const std::lock_guard Lk(WatchdogTerminateMutex); + if (WatchdogTerminate || (Timeout > sec::zero() && (Time > Timeout))) { + setLoad(LoadThreadWorkType::LoadStop); - return EXIT_SUCCESS; + return; } } } @@ -148,18 +141,15 @@ int Firestarter::watchdogWorker(std::chrono::microseconds period, // if timeout is set, sleep for this time and stop execution. // else return and wait for sigterm handler to request threads to stop. 
- if (timeout > sec::zero()) { + if (Timeout > sec::zero()) { { - std::unique_lock lk(Firestarter::_watchdogTerminateMutex); + std::unique_lock Lk(Firestarter::WatchdogTerminateMutex); // abort waiting if we get the interrupt signal - Firestarter::_watchdogTerminateAlert.wait_for( - lk, timeout, []() { return Firestarter::_watchdog_terminate; }); + Firestarter::WatchdogTerminateAlert.wait_for(Lk, Timeout, []() { return WatchdogTerminate; }); } - this->setLoad(LOAD_STOP); - - return EXIT_SUCCESS; + setLoad(LoadThreadWorkType::LoadStop); } - - return EXIT_SUCCESS; } + +} // namespace firestarter \ No newline at end of file diff --git a/tooling/clang-tidy.py b/tooling/clang-tidy.py new file mode 100755 index 00000000..b8ce7b5c --- /dev/null +++ b/tooling/clang-tidy.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import json +from pathlib import Path +import subprocess +import click +import multiprocessing +import sys +import typing +import random +from functools import partial + +# Find all source files from the compile commands database that are in a specific directory. 
def find_source_files_from_compile_commands(compile_commands_path: Path, sources_dir: Path) -> typing.List[Path]:
    """Collect the files of a compilation database that lie below a directory.

    Args:
        compile_commands_path: Location of the ``compile_commands.json`` file.
        sources_dir: Only entries whose ``file`` is located inside this
            directory (at any depth) are kept.

    Returns:
        The ``file`` values of the matching entries, in database order and
        exactly as they are stored in the JSON document.
    """
    with open(compile_commands_path, 'r', encoding='utf-8') as fp:
        compile_commands = json.load(fp)

    sources_root = Path(sources_dir)
    # Compare whole path components instead of raw string prefixes so that a
    # sibling directory such as ``src_old`` is not mistaken for ``src``.
    return [entry['file'] for entry in compile_commands if sources_root in Path(entry['file']).parents]


def split_in_chunks(chunk_size: int, input: typing.List[Path]) -> typing.List[typing.List[Path]]:
    """Split a list of paths into at most ``chunk_size`` contiguous sub-lists.

    Despite its name, ``chunk_size`` is the maximum *number of chunks* that is
    produced (the caller passes the number of worker processes). The parameter
    names are kept for backward compatibility.

    Args:
        chunk_size: Upper bound for the number of returned chunks; must be >= 1.
        input: The paths to distribute.

    Returns:
        The chunks in input order. An empty input yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than one.
    """
    if chunk_size < 1:
        raise ValueError(f'chunk_size must be at least 1, got {chunk_size}')
    if not input:
        # Without this guard the step of the range below would be zero.
        return []

    # Ceiling division: the number of items every chunk (except possibly the last) receives.
    length = -(-len(input) // chunk_size)
    return [input[i:i + length] for i in range(0, len(input), length)]


def run_clang_tidy(files: typing.List[Path], project_root_path: Path, build_root_path: Path, clang_tidy_file_path: Path) -> bytes:
    """Run one clang-tidy process on a set of files and return what it printed.

    Args:
        files: The files handed to clang-tidy.
        project_root_path: Working directory of the clang-tidy process.
        build_root_path: Directory passed via ``-p`` (holds the compilation database).
        clang_tidy_file_path: The configuration passed via ``--config-file``.

    Returns:
        The captured stdout of clang-tidy followed by a newline.
    """
    command_args = ['clang-tidy', '-extra-arg=-std=c++17', f'-p={build_root_path}', f'--config-file={clang_tidy_file_path}', '--format-style=file']
    command_args += files
    print(f'Starting {command_args}')

    # subprocess.run drains the pipe while waiting, so a large report cannot
    # block the child on a full pipe, and the output is returned no matter how
    # quickly the process terminates. The exit status is deliberately not
    # checked: only the captured output ends up in the report.
    completed = subprocess.run(command_args, stdout=subprocess.PIPE, cwd=project_root_path)
    return completed.stdout + b'\n'


def _find_required_file(directory: Path, file_name: str) -> Path:
    """Return ``directory / file_name`` or terminate the program if it does not exist."""
    print(f'Looking for {file_name} in {directory}')
    candidate = directory / file_name
    if not candidate.exists():
        sys.exit(f"Didn't find {file_name}. Aborting.")
    print(f'Found {candidate}')
    return candidate


@click.group()
def cli():
    """Command group that bundles the sub-commands of this script."""


@cli.command(help='Exits successfully if the report is empty')
@click.option('--build-root', help='The folder where the clang-tidy-report.txt is located.', required=True)
def check(build_root):
    """Exit with an error unless the clang-tidy report exists and contains only whitespace."""
    clang_tidy_report_path = _find_required_file(Path(build_root).absolute(), 'clang-tidy-report.txt')

    # The report is written as raw bytes, so it is read back the same way; this
    # avoids decoding errors on output that is not valid in the locale encoding.
    if clang_tidy_report_path.read_bytes().strip():
        sys.exit('Found content in clang-tidy-report.txt')
    print('No content in clang-tidy-report.txt')


@cli.command(help='Create the clang-tidy report')
@click.option('--project-root', default=Path(__file__).parent.parent.absolute(), help='The folder where the git repository is located.')
@click.option('--build-root', help='The folder where the compile_commands.json is located.', required=True)
@click.option('--cores', default=multiprocessing.cpu_count(), help='The number of clang-tidy processes to spawn.')
def clang_tidy_report(project_root, build_root, cores):
    """Run clang-tidy on all files below ``<project-root>/src`` and write ``clang-tidy-report.txt``."""
    project_root_path = Path(project_root).absolute()
    build_root_path = Path(build_root).absolute()
    src_path = project_root_path / 'src'

    compile_commands_path = _find_required_file(build_root_path, 'compile_commands.json')
    clang_tidy_file_path = _find_required_file(project_root_path, '.clang-tidy')

    files = find_source_files_from_compile_commands(compile_commands_path, src_path)
    print(f'Found {len(files)} source and header files.')

    print(f'Launching {cores} instances of clang-tidy in project root: {project_root_path}')

    # Shuffle files to improve runtime performance. Use seed 123 to keep it the same across runs.
    files_shuffled = files.copy()
    random.Random(123).shuffle(files_shuffled)

    # Spawn multiple Python worker processes that each start their own instance of clang-tidy.
    # Opening all clang-tidy processes from the same Python process caused problems with GitHub Actions.
    worker = partial(run_clang_tidy, project_root_path=project_root_path, build_root_path=build_root_path, clang_tidy_file_path=clang_tidy_file_path)
    with multiprocessing.Pool(cores) as pool:
        outputs = pool.map(worker, split_in_chunks(cores, files_shuffled))

    clang_tidy_report_file = build_root_path / 'clang-tidy-report.txt'
    print(f'Writing report to {clang_tidy_report_file}')
    with open(clang_tidy_report_file, 'wb') as fp:
        fp.write(b''.join(outputs))


if __name__ == '__main__':
    cli()