diff --git a/.clang-format b/.clang-format
new file mode 100644
index 00000000..656a3655
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,7 @@
+---
+BasedOnStyle: LLVM
+Language: Cpp
+BreakConstructorInitializersBeforeComma: 'true'
+AllowShortFunctionsOnASingleLine: All
+PointerAlignment: Left
+ColumnLimit: 120
\ No newline at end of file
diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 00000000..07e30621
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,64 @@
+---
+# Configure clang-tidy for this project.
+
+#  -bugprone-narrowing-conversions: too many false positives around
+#      `std::size_t`  vs. `*::difference_type`.
+
+#  -boost-use-ranges: crash of clangd https://github.com/llvm/llvm-project/issues/109037
+
+#  -readability-identifier-length length of at least 3 does not make sense for some variables
+
+#  -cppcoreguidelines-avoid-magic-numbers
+#  -readability-magic-numbers currently we have too many numbers in this code
+
+#  -bugprone-easily-swappable-parameters we are not using strong typedefs
+
+#  -readability-function-cognitive-complexity allow big functions
+
+Checks: >
+  -*,
+  boost-*,
+  bugprone-*,
+  cert-*,
+  clang-analyzer-*,
+  concurrency-*,
+  cppcoreguidelines-*,
+  google-*,
+  misc-*,
+  modernize-*,
+  performance-*,
+  portability-*,
+  readability-*,
+  -bugprone-narrowing-conversions,
+  -cppcoreguidelines-special-member-functions,
+  -boost-use-ranges,
+  -readability-identifier-length,
+  -cppcoreguidelines-avoid-magic-numbers,
+  -readability-magic-numbers,
+  -bugprone-easily-swappable-parameters,
+  -readability-function-cognitive-complexity
+  
+# Turn all the warnings from the checks above into errors.
+WarningsAsErrors: "*"
+
+HeaderFilterRegex: "include/firestarter/.*\\.(h|hpp)$"
+
+CheckOptions:
+  - { key: readability-identifier-naming.NamespaceCase,          value: lower_case }
+  - { key: readability-identifier-naming.ClassCase,              value: CamelCase  }
+  - { key: readability-identifier-naming.StructCase,             value: CamelCase  }
+  - { key: readability-identifier-naming.FunctionCase,           value: camelBack  }
+  - { key: readability-identifier-naming.MemberCase,        value: CamelCase }
+  - { key: readability-identifier-naming.VariableCase,           value: CamelCase }
+  - { key: readability-identifier-naming.EnumCase,           value: CamelCase }
+  - { key: readability-identifier-naming.ParameterCase,           value: CamelCase }
+  - { key: readability-identifier-naming.UnionCase,           value: CamelCase }
+  - { key: readability-identifier-naming.IgnoreMainLikeFunctions,  value: 1 }
+  - { key: readability-redundant-member-init.IgnoreBaseInCopyConstructors,  value: 1 }
+  - { key: modernize-use-default-member-init.UseAssignment,  value: 1 }
+  - { key: readability-implicit-bool-conversion.AllowIntegerConditions,  value: 1 }
+  - { key: readability-implicit-bool-conversion.AllowPointerConditions,  value: 1 }
+  - { key: readability-function-cognitive-complexity.IgnoreMacros,  value: 1 }
+  - { key: misc-non-private-member-variables-in-classes.IgnoreClassesWithAllMemberVariablesBeingPublic, value: "true" }
+  # disable warnings in asmjit
+  - { key: 'clang-analyzer-optin.cplusplus.UninitializedObject:IgnoreRecordsWithField', value: 'asmjit::Operand_::Signature' }
\ No newline at end of file
diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
new file mode 100644
index 00000000..d1806bac
--- /dev/null
+++ b/.git-blame-ignore-revs
@@ -0,0 +1 @@
+9732bdb59717274f666e9c1497289d1f9a0d7858
diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml
new file mode 100644
index 00000000..ef004c50
--- /dev/null
+++ b/.github/workflows/clang-format.yml
@@ -0,0 +1,19 @@
+name: clang-format-review
+
+# You can be more specific, but it currently only works on pull requests
+on: [push, pull_request]
+
+jobs:
+  clang-format:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install clang-format
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y clang-format
+      - name: Analyze
+        run: |
+          clang-format --dry-run --Werror -style=file $(find ./src/ -name '*.cpp' -print)
+          clang-format --dry-run --Werror -style=file $(find ./include/ -name '*.hpp' -print)
+          clang-format --dry-run --Werror -style=file $(find ./include/ -name '*.h' -print)
diff --git a/.github/workflows/clang-tidy.yml b/.github/workflows/clang-tidy.yml
new file mode 100644
index 00000000..051d72f9
--- /dev/null
+++ b/.github/workflows/clang-tidy.yml
@@ -0,0 +1,50 @@
+name: clang-tidy-review
+
+on: [push, pull_request]
+
+env:
+  PYTHONUNBUFFERED: 1
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        submodules: 'true'
+
+    # Ideally we would want to run the clang-tidy for every kind of build.
+    # This would make sure that we check all platform dependent code parts.
+    # Here we only test the standard linux build.
+    - name: Install python3 and libraries
+      run: |
+        sudo apt update
+        sudo apt install python3 python3-pip
+        pip install click
+
+    - name: Create build directory
+      run: |
+        mkdir build
+
+    - name: Run CMake configure (default)
+      run: |
+        cd build
+        cmake ..
+
+    - name: Build
+      run: |
+        cd build
+        make -j4
+
+    - name: Run clang-tidy
+      run: |
+        ./tooling/clang-tidy.py clang-tidy-report --build-root build --cores 4
+
+    - name: Print report
+      run: |
+        cat build/clang-tidy-report.txt
+
+    - name: Check if report is empty
+      run: |
+        ./tooling/clang-tidy.py check --build-root build
\ No newline at end of file
diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index 6b4c9178..b58b7e03 100644
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -21,6 +21,8 @@ jobs:
       run: |
         sudo rm -rf /usr/local/lib/android
         sudo rm -rf /usr/share/dotnet
+        sudo rm -rf /opt/ghc
+        sudo rm -rf /usr/local/.ghcup
 
     - name: Install g++-9 (if needed)
       if: matrix.compiler == 'g++-9'
diff --git a/.gitignore b/.gitignore
index c4fde123..e157a461 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@ result*
 *.swp
 *.swo
 build*/
+.cache/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ec61b97f..c8f580e4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,8 @@
 cmake_minimum_required(VERSION 3.22)
 project(FIRESTARTER)
 
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
 include(cmake/GitSubmoduleUpdate.cmake)
 
 # set FIRESTARTER version
diff --git a/include/firestarter/AlignedAlloc.hpp b/include/firestarter/AlignedAlloc.hpp
new file mode 100644
index 00000000..7c5714fb
--- /dev/null
+++ b/include/firestarter/AlignedAlloc.hpp
@@ -0,0 +1,78 @@
+/******************************************************************************
+ * FIRESTARTER - A Processor Stress Test Utility
+ * Copyright (C) 2024 TU Dresden, Center for Information Services and High
+ * Performance Computing
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/\>.
+ *
+ * Contact: daniel.hackenberg@tu-dresden.de
+ *****************************************************************************/
+
+#pragma once
+
+#include <cmath>
+#include <cstddef>
+#include <cstdlib>
+
+namespace firestarter {
+
+struct AlignedAlloc {
+private:
+  /// Round the size up to the next multiple of the alignment.
+  /// \arg Size The number to be rounded up.
+  /// \arg Alignment The number to whose multiple Size is rounded up. Must not be zero.
+  /// \returns Size rounded up to the next multiple of Alignment. Computed in std::size_t to avoid int overflow.
+  static auto padSize(const std::size_t Size, const std::size_t Alignment) -> std::size_t {
+    return Alignment * static_cast<std::size_t>(std::ceil(static_cast<double>(Size) / static_cast<double>(Alignment)));
+  };
+
+public:
+  /// Allocate memory with a given alignment. The size is automatically rounded up to the next multiple of the
+  /// alignment.
+  /// \arg Size The minimum required memory.
+  /// \arg Alignment The boundary to which the memory should be aligned. The default of 64B matches the size of a
+  /// cache line on most systems.
+  /// \returns The pointer to the allocated memory.
+  static auto malloc(const std::size_t Size, const std::size_t Alignment = 64) -> void* {
+    // NOLINTBEGIN(cppcoreguidelines-owning-memory)
+#if defined(__APPLE__)
+    return aligned_alloc(Alignment, padSize(Size, Alignment));
+#elif defined(__MINGW64__)
+    return _mm_malloc(padSize(Size, Alignment), Alignment);
+#elif defined(_MSC_VER)
+    return _aligned_malloc(padSize(Size, Alignment), Alignment);
+#else
+    return aligned_alloc(Alignment, padSize(Size, Alignment));
+#endif
+    // NOLINTEND(cppcoreguidelines-owning-memory)
+  };
+
+  /// Deallocate memory which has been allocated by the AlignedAlloc::malloc function.
+  /// \arg Ptr The pointer to the allocated memory.
+  static void free(void* Ptr) {
+    // NOLINTBEGIN(cppcoreguidelines-owning-memory,cppcoreguidelines-no-malloc)
+#if defined(__APPLE__)
+    ::free(Ptr);
+#elif defined(__MINGW64__)
+    _mm_free(Ptr);
+#elif defined(_MSC_VER)
+    _aligned_free(Ptr);
+#else
+    std::free(Ptr);
+#endif
+    // NOLINTEND(cppcoreguidelines-owning-memory,cppcoreguidelines-no-malloc)
+  };
+};
+
+} // namespace firestarter
diff --git a/include/firestarter/Config.hpp b/include/firestarter/Config.hpp
new file mode 100644
index 00000000..5c272401
--- /dev/null
+++ b/include/firestarter/Config.hpp
@@ -0,0 +1,124 @@
+/******************************************************************************
+ * FIRESTARTER - A Processor Stress Test Utility
+ * Copyright (C) 2024 TU Dresden, Center for Information Services and High
+ * Performance Computing
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/\>.
+ *
+ * Contact: daniel.hackenberg@tu-dresden.de
+ *****************************************************************************/
+
+#pragma once
+
+#include <chrono>
+#include <string>
+#include <vector>
+
+namespace firestarter {
+
+/// This struct contains the parsed config from the command line for Firestarter.
+struct Config {
+  /// The argument vector from the command line.
+  const char** Argv;
+
+  /// The timeout after which firestarter terminates. This is available in combination with optimization.
+  std::chrono::seconds Timeout{};
+  /// The period with which the low/high load routine is switched.
+  std::chrono::microseconds Period{};
+  /// The load in the range of 0 < Load <= Period, which controls how long of the period the high-load loop runs.
+  std::chrono::microseconds Load{};
+
+  /// The interval at which the registers will be dumped to the file.
+  std::chrono::seconds DumpRegistersTimeDelta = std::chrono::seconds(0);
+  /// The time to skip from the measurement start
+  std::chrono::milliseconds StartDelta = std::chrono::milliseconds(0);
+  /// The time to skip from the measurement stop
+  std::chrono::milliseconds StopDelta = std::chrono::milliseconds(0);
+  /// Metric values will be polled by the MeasurementInterval.
+  std::chrono::milliseconds MeasurementInterval = std::chrono::milliseconds(0);
+  /// How long the processor will be preheated before starting a measurement or optimization.
+  std::chrono::seconds Preheat{};
+  /// How long a measurement should take.
+  std::chrono::seconds EvaluationDuration{};
+
+  /// The crossover probability used in the NSGA2 optimization algorithm.
+  double Nsga2Cr;
+  /// The mutation probability used in the NSGA2 optimization algorithm.
+  double Nsga2M;
+
+  /// The names of the metrics that are read from stdin.
+  std::vector<std::string> StdinMetrics;
+  /// The paths to the metrics that are loaded using shared libraries.
+  std::vector<std::string> MetricPaths;
+  /// The list of metrics that are used for maximization. If a metric is prefixed with '-' it will be minimized.
+  std::vector<std::string> OptimizationMetrics;
+
+  /// The optional cpu bind that allows pinning to specific cpus.
+  std::string CpuBind;
+  /// The optional selected instruction groups. If this is empty the default will be chosen.
+  std::string InstructionGroups;
+  /// The file where the dump register feature will save its output to.
+  std::string DumpRegistersOutpath;
+  /// The name of the optimization algorithm.
+  std::string OptimizationAlgorithm;
+  /// The file where the data collected during optimization is saved.
+  std::string OptimizeOutfile;
+
+  /// The argument count from the command line.
+  int Argc;
+  /// The requested number of threads firestarter should run with. 0 means all threads.
+  unsigned RequestedNumThreads;
+  /// The selected function id. 0 means automatic selection.
+  unsigned FunctionId;
+  /// The line count of the payload. 0 means default.
+  unsigned LineCount = 0;
+  /// The number of gpus firestarter should stress, -1 means all gpus. NOTE(review): initializer is 0; confirm.
+  int Gpus = 0;
+  /// The matrix size which should be used. 0 means automatic detection.
+  unsigned GpuMatrixSize = 0;
+  /// The number of individuals that should be used for the optimization.
+  unsigned Individuals;
+  /// The number of generations that should be used for the optimization.
+  unsigned Generations;
+
+  /// If the function summary should be printed.
+  bool PrintFunctionSummary;
+  /// If the available instruction groups for a function should be printed.
+  bool ListInstructionGroups;
+  /// Allow payloads that are not supported on the current processor.
+  bool AllowUnavailablePayload = false;
+  /// Is the dump registers debug feature enabled?
+  bool DumpRegisters = false;
+  /// Is the error detection feature enabled?
+  bool ErrorDetection = false;
+  /// Should the GPUs use single precision? If neither GpuUseFloat nor GpuUseDouble is set, the precision will be
+  /// chosen automatically.
+  bool GpuUseFloat = false;
+  /// Should the GPUs use double precision? If neither GpuUseFloat nor GpuUseDouble is set, the precision will be
+  /// chosen automatically.
+  bool GpuUseDouble = false;
+  /// Should we print all available metrics?
+  bool ListMetrics = false;
+  /// Do we perform a measurement?
+  bool Measurement = false;
+  /// Do we perform optimization?
+  bool Optimize = false;
+
+  Config() = delete;
+
+  /// Parse the config from the command line arguments.
+  Config(int Argc, const char** Argv);
+};
+
+} // namespace firestarter
\ No newline at end of file
diff --git a/include/firestarter/Constants.hpp b/include/firestarter/Constants.hpp
index 419d8b6a..71a0d992 100644
--- a/include/firestarter/Constants.hpp
+++ b/include/firestarter/Constants.hpp
@@ -21,16 +21,112 @@
 
 #pragma once
 
-#define THREAD_WAIT 1
-#define THREAD_WORK 2
-#define THREAD_INIT 3
-#define THREAD_STOP 4
-#define THREAD_SWITCH 5
-#define THREAD_INIT_FAILURE 0xffffffff
-
-/* DO NOT CHANGE! the asm load-loop tests if load-variable is == 0 */
-#define LOAD_LOW 0
-/* DO NOT CHANGE! the asm load-loop continues until the load-variable is != 1 */
-#define LOAD_HIGH 1
-#define LOAD_STOP 2
-#define LOAD_SWITCH 4
+#include <cstdint>
+
+namespace firestarter {
+
+using EightBytesType = uint64_t;
+
+// We want enum to have the size of 8B. Disable the warnings for bigger enum size than needed.
+// NOLINTBEGIN(performance-enum-size)
+
+/// This enum describes the state of the load workers.
+enum class LoadThreadState : EightBytesType {
+  /// Idle
+  ThreadWait = 1,
+  /// Work loop (both low and high load)
+  ThreadWork = 2,
+  /// Init the thread
+  ThreadInit = 3,
+  /// Tell the thread to recompile the payload and reinitialize the data.
+  ThreadSwitch = 4
+};
+
+/// This enum describes the load that should be applied by firestarter.
+enum class LoadThreadWorkType : EightBytesType {
+  /* DO NOT CHANGE! the asm load-loop tests if load-variable is == 0 */
+  /// Apply low load
+  LoadLow = 0,
+  /* DO NOT CHANGE! the asm load-loop continues until the load-variable is != 1 */
+  /// Apply high load
+  LoadHigh = 1,
+  /// Exit the load loop and stop the execution of firestarter.
+  LoadStop = 2,
+  /// Exit the load loop.
+  LoadSwitch = 4
+};
+// NOLINTEND(performance-enum-size)
+
+/// This struct holds information about enabled or disabled compile time features for FIRESTARTER.
+struct FirestarterOptionalFeatures {
+  /// Do we have a build that enabled optimization?
+  bool OptimizationEnabled;
+  /// Do we have a build that enabled CUDA or HIP?
+  bool CudaEnabled;
+  /// Do we have a build that enabled OneAPI?
+  bool OneAPIEnabled;
+  /// Is error detection enabled?
+  bool ErrorDetectionEnabled;
+  /// Are debug features enabled?
+  bool DebugFeatureEnabled;
+  /// Is dumping registers enabled?
+  bool DumpRegisterEnabled;
+  /// Is the current build for X86?
+  bool IsX86;
+  /// Is the current build for Windows?
+  bool IsWin32;
+  /// Is the current build built with Windows MSC?
+  bool IsMsc;
+
+  /// Is one of the GPU features enabled?
+  [[nodiscard]] constexpr auto gpuEnabled() const -> bool { return CudaEnabled || OneAPIEnabled; }
+};
+
+// MSC only supports designated initializers from C++20
+static constexpr const FirestarterOptionalFeatures OptionalFeatures {
+#if defined(linux) || defined(__linux__)
+  /*OptimizationEnabled=*/true,
+#else
+  /*OptimizationEnabled=*/false,
+#endif
+
+#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_HIP)
+      /*CudaEnabled=*/true,
+#else
+      /*CudaEnabled=*/false,
+#endif
+
+#ifdef FIRESTARTER_BUILD_ONEAPI
+      /*OneAPIEnabled=*/true,
+#else
+      /*OneAPIEnabled=*/false,
+#endif
+
+      /*ErrorDetectionEnabled=*/true,
+
+#ifdef FIRESTARTER_DEBUG_FEATURES
+      /*DebugFeatureEnabled=*/true, /*DumpRegisterEnabled =*/true,
+#else
+      /*DebugFeatureEnabled=*/false, /*DumpRegisterEnabled =*/false,
+#endif
+
+#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+      /*IsX86=*/true,
+#else
+#error "FIRESTARTER is not implemented for this ISA"
+#endif
+
+#ifdef _WIN32
+      /*IsWin32=*/true,
+#else
+      /*IsWin32=*/false,
+#endif
+
+#ifdef _MSC_VER
+      /*IsMsc=*/true,
+#else
+      /*IsMsc=*/false,
+#endif
+};
+
+} // namespace firestarter
\ No newline at end of file
diff --git a/include/firestarter/Cuda/Cuda.hpp b/include/firestarter/Cuda/Cuda.hpp
index a2f281d9..396654c7 100644
--- a/include/firestarter/Cuda/Cuda.hpp
+++ b/include/firestarter/Cuda/Cuda.hpp
@@ -21,30 +21,61 @@
 
 #pragma once
 
+#include "firestarter/Constants.hpp"
+
 #include <condition_variable>
-#include <mutex>
 #include <thread>
-#include <vector>
 
 namespace firestarter::cuda {
 
+/// This class handles the workload on CUDA and HIP compatible GPUs. A gemm routine is used to stress them with a
+/// constant high load. This header does not include any CUDA or HIP specific headers to allow us to not guard the
+/// include of this header in other parts of the program.
 class Cuda {
 private:
-  std::thread _initThread;
-  std::condition_variable _waitForInitCv;
-  std::mutex _waitForInitCvMutex;
+  /// The thread used to initialize the gpus. This thread will wait until each thread that runs the gemm routine
+  /// joins.
+  std::thread InitThread;
 
-  static void initGpus(std::condition_variable &cv,
-                       volatile unsigned long long *loadVar, bool useFloat,
-                       bool useDouble, unsigned matrixSize, int gpus);
+  /// Spawns a thread for each of the selected gpus, initializes them and starts the execution of the gemm in parallel.
+  /// \arg WaitForInitCv The condition variable used to signal that all gpus are initialized.
+  /// \arg LoadVar A reference to the variable that controls the current load of Firestarter.
+  /// \arg UseFloat Set to true if we want to stress using single precision floating points.
+  /// \arg UseDouble Set to true if we want to stress using double precision floating points. If neither UseFloat nor
+  /// UseDouble is set the precision will be chosen automatically.
+  /// \arg MatrixSize Set to a specific matrix size which will be chosen for the gemm operation or set to 0 for
+  /// automatic selection.
+  /// \arg Gpus Select the number of gpus to stress or -1 for all.
+  static void initGpus(std::condition_variable& WaitForInitCv, const volatile firestarter::LoadThreadWorkType& LoadVar,
+                       bool UseFloat, bool UseDouble, unsigned MatrixSize, int Gpus);
 
 public:
-  Cuda(volatile unsigned long long *loadVar, bool useFloat, bool useDouble,
-       unsigned matrixSize, int gpus);
+  /// Initialize the cuda class. This starts a thread running the Cuda::initGpus function and waits until all gpus are
+  /// initialized.
+  /// \arg LoadVar A reference to the variable that controls the current load of Firestarter.
+  /// \arg UseFloat Set to true if we want to stress using single precision floating points.
+  /// \arg UseDouble Set to true if we want to stress using double precision floating points. If neither UseFloat nor
+  /// UseDouble is set the precision will be chosen automatically.
+  /// \arg MatrixSize Set to a specific matrix size which will be chosen for the gemm operation or set to 0 for
+  /// automatic selection.
+  /// \arg Gpus Select the number of gpus to stress or -1 for all.
+  Cuda(const volatile firestarter::LoadThreadWorkType& LoadVar, bool UseFloat, bool UseDouble, unsigned MatrixSize,
+       int Gpus)
+#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_HIP)
+      ;
+#else
+  {
+    (void)&LoadVar;
+    (void)UseFloat;
+    (void)UseDouble;
+    (void)MatrixSize;
+    (void)Gpus;
+  }
+#endif
 
   ~Cuda() {
-    if (_initThread.joinable()) {
-      _initThread.join();
+    if (InitThread.joinable()) {
+      InitThread.join();
     }
   }
 };
diff --git a/include/firestarter/Cuda/CudaHipCompat.hpp b/include/firestarter/Cuda/CudaHipCompat.hpp
new file mode 100644
index 00000000..f0543f4d
--- /dev/null
+++ b/include/firestarter/Cuda/CudaHipCompat.hpp
@@ -0,0 +1,774 @@
+/******************************************************************************
+ * FIRESTARTER - A Processor Stress Test Utility
+ * Copyright (C) 2024 TU Dresden, Center for Information Services and High
+ * Performance Computing
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Contact: daniel.hackenberg@tu-dresden.de
+ *****************************************************************************/
+
+// This file provides compatibility for the minor differences between the CUDA and HIP APIs. We do this by:
+// 1. Include the required header files for CUDA or HIP
+// 2. Define compatibility types between CUDA and HIP. This results in all enum names being the same in the source code.
+// These types are mapped to the ones with the correct prefix. These are cu and hip, CU and HIP, cuda and hip or CUDA
+// and HIP.
+// 3. Define functions that convert the error code enums into strings.
+// 4. Define compatibility functions for calls to CUDA, HIP or one of their libraries (blas, rand etc.)
+
+#pragma once
+
+#include "firestarter/Logging/Log.hpp"
+
+#include <cassert>
+#include <cstddef>
+#include <optional>
+#include <sstream>
+#include <type_traits>
+
+#ifdef FIRESTARTER_BUILD_CUDA
+
+#include <cublas_v2.h>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <curand_kernel.h>
+
+#elif defined(FIRESTARTER_BUILD_HIP)
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+#include <hipblas/hipblas.h>
+#include <hiprand_kernel.h>
+
+#endif
+
+namespace firestarter::cuda::compat {
+
+/// Use this function as a wrapper to all calls of CUDA or HIP functions. If an error occurred we abort and print the
+/// error code.
+/// \tparam T The type of the error code returned from calls to CUDA or HIP. This may be one of BlasStatusT, ErrorT,
+/// RandStatusT or CUresult.
+/// \arg TVal The error code returned from calls to CUDA or HIP.
+/// \arg File The file for the log message in which the error occurred.
+/// \arg Line The line for the log message in which the error occurred.
+/// \arg DeviceIndex if the CUDA or HIP call is associated to a specific device, the index of the device should be
+/// provided here for the log message.
+template <typename T> void accellSafeCall(T TVal, const char* File, int Line, std::optional<int> DeviceIndex = {});
+
+#ifdef FIRESTARTER_BUILD_CUDA
+// Start of CUDA compatibility types
+
+// NOLINTNEXTLINE(performance-enum-size)
+enum class BlasStatusT : std::underlying_type_t<cublasStatus_t> {
+  BLAS_STATUS_SUCCESS = CUBLAS_STATUS_SUCCESS,
+  BLAS_STATUS_NOT_INITIALIZED = CUBLAS_STATUS_NOT_INITIALIZED,
+  BLAS_STATUS_ALLOC_FAILED = CUBLAS_STATUS_ALLOC_FAILED,
+  BLAS_STATUS_INVALID_VALUE = CUBLAS_STATUS_INVALID_VALUE,
+  BLAS_STATUS_ARCH_MISMATCH = CUBLAS_STATUS_ARCH_MISMATCH,
+  BLAS_STATUS_MAPPING_ERROR = CUBLAS_STATUS_MAPPING_ERROR,
+  BLAS_STATUS_EXECUTION_FAILED = CUBLAS_STATUS_EXECUTION_FAILED,
+  BLAS_STATUS_INTERNAL_ERROR = CUBLAS_STATUS_INTERNAL_ERROR,
+  BLAS_STATUS_NOT_SUPPORTED = CUBLAS_STATUS_NOT_SUPPORTED,
+  BLAS_STATUS_LICENSE_ERROR = CUBLAS_STATUS_LICENSE_ERROR,
+};
+
+constexpr const char* AccelleratorString = "CUDA";
+
+// NOLINTNEXTLINE(performance-enum-size)
+enum class ErrorT : std::underlying_type_t<cudaError_t> {
+  Success = cudaSuccess,
+};
+
+// NOLINTNEXTLINE(performance-enum-size)
+enum class RandStatusT : std::underlying_type_t<curandStatus_t> {
+  RAND_STATUS_SUCCESS = CURAND_STATUS_SUCCESS,
+  RAND_STATUS_VERSION_MISMATCH = CURAND_STATUS_VERSION_MISMATCH,
+  RAND_STATUS_NOT_INITIALIZED = CURAND_STATUS_NOT_INITIALIZED,
+  RAND_STATUS_ALLOCATION_FAILED = CURAND_STATUS_ALLOCATION_FAILED,
+  RAND_STATUS_TYPE_ERROR = CURAND_STATUS_TYPE_ERROR,
+  RAND_STATUS_OUT_OF_RANGE = CURAND_STATUS_OUT_OF_RANGE,
+  RAND_STATUS_LENGTH_NOT_MULTIPLE = CURAND_STATUS_LENGTH_NOT_MULTIPLE,
+  RAND_STATUS_DOUBLE_PRECISION_REQUIRED = CURAND_STATUS_DOUBLE_PRECISION_REQUIRED,
+  RAND_STATUS_LAUNCH_FAILURE = CURAND_STATUS_LAUNCH_FAILURE,
+  RAND_STATUS_PREEXISTING_FAILURE = CURAND_STATUS_PREEXISTING_FAILURE,
+  RAND_STATUS_INITIALIZATION_FAILED = CURAND_STATUS_INITIALIZATION_FAILED,
+  RAND_STATUS_ARCH_MISMATCH = CURAND_STATUS_ARCH_MISMATCH,
+  RAND_STATUS_INTERNAL_ERROR = CURAND_STATUS_INTERNAL_ERROR,
+};
+
+using StreamOrContext = CUcontext;
+
+using DeviceProperties = cudaDeviceProp;
+
+using RandGenerator = curandGenerator_t;
+
+using BlasHandle = cublasHandle_t;
+
+using BlasStatus = cublasStatus_t;
+
+// NOLINTNEXTLINE(performance-enum-size)
+enum class BlasOperation : std::underlying_type_t<cublasOperation_t> {
+  BLAS_OP_N = CUBLAS_OP_N,
+  BLAS_OP_T = CUBLAS_OP_T,
+  BLAS_OP_C = CUBLAS_OP_C,
+};
+
+using BlasOperationT = cublasOperation_t;
+
+using CUResultOrHipErrorT = CUresult;
+
+#elif defined(FIRESTARTER_BUILD_HIP)
+// Start of HIP compatibility types
+
+// NOLINTNEXTLINE(performance-enum-size)
+enum class BlasStatusT : std::underlying_type_t<hipblasStatus_t> {
+  BLAS_STATUS_SUCCESS = HIPBLAS_STATUS_SUCCESS,
+  BLAS_STATUS_NOT_INITIALIZED = HIPBLAS_STATUS_NOT_INITIALIZED,
+  BLAS_STATUS_ALLOC_FAILED = HIPBLAS_STATUS_ALLOC_FAILED,
+  BLAS_STATUS_INVALID_VALUE = HIPBLAS_STATUS_INVALID_VALUE,
+  BLAS_STATUS_ARCH_MISMATCH = HIPBLAS_STATUS_ARCH_MISMATCH,
+  BLAS_STATUS_MAPPING_ERROR = HIPBLAS_STATUS_MAPPING_ERROR,
+  BLAS_STATUS_EXECUTION_FAILED = HIPBLAS_STATUS_EXECUTION_FAILED,
+  BLAS_STATUS_INTERNAL_ERROR = HIPBLAS_STATUS_INTERNAL_ERROR,
+  BLAS_STATUS_NOT_SUPPORTED = HIPBLAS_STATUS_NOT_SUPPORTED,
+  BLAS_STATUS_UNKNOWN = HIPBLAS_STATUS_UNKNOWN,
+  BLAS_STATUS_HANDLE_IS_NULLPTR = HIPBLAS_STATUS_HANDLE_IS_NULLPTR,
+  BLAS_STATUS_INVALID_ENUM = HIPBLAS_STATUS_INVALID_ENUM,
+};
+
+constexpr const char* AccelleratorString = "HIP";
+
+// NOLINTNEXTLINE(performance-enum-size)
+enum class ErrorT : std::underlying_type_t<hipError_t> {
+  Success = hipSuccess,
+};
+
+// NOLINTNEXTLINE(performance-enum-size)
+enum class RandStatusT : std::underlying_type_t<hiprandStatus_t> {
+  RAND_STATUS_SUCCESS = HIPRAND_STATUS_SUCCESS,
+  RAND_STATUS_VERSION_MISMATCH = HIPRAND_STATUS_VERSION_MISMATCH,
+  RAND_STATUS_NOT_INITIALIZED = HIPRAND_STATUS_NOT_INITIALIZED,
+  RAND_STATUS_ALLOCATION_FAILED = HIPRAND_STATUS_ALLOCATION_FAILED,
+  RAND_STATUS_TYPE_ERROR = HIPRAND_STATUS_TYPE_ERROR,
+  RAND_STATUS_OUT_OF_RANGE = HIPRAND_STATUS_OUT_OF_RANGE,
+  RAND_STATUS_LENGTH_NOT_MULTIPLE = HIPRAND_STATUS_LENGTH_NOT_MULTIPLE,
+  RAND_STATUS_DOUBLE_PRECISION_REQUIRED = HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED,
+  RAND_STATUS_LAUNCH_FAILURE = HIPRAND_STATUS_LAUNCH_FAILURE,
+  RAND_STATUS_PREEXISTING_FAILURE = HIPRAND_STATUS_PREEXISTING_FAILURE,
+  RAND_STATUS_INITIALIZATION_FAILED = HIPRAND_STATUS_INITIALIZATION_FAILED,
+  RAND_STATUS_ARCH_MISMATCH = HIPRAND_STATUS_ARCH_MISMATCH,
+  RAND_STATUS_INTERNAL_ERROR = HIPRAND_STATUS_INTERNAL_ERROR,
+  RAND_STATUS_NOT_IMPLEMENTED = HIPRAND_STATUS_NOT_IMPLEMENTED,
+};
+
+using StreamOrContext = hipStream_t;
+
+using DeviceProperties = hipDeviceProp_t;
+
+using RandGenerator = hiprandGenerator_t;
+
+using BlasHandle = hipblasHandle_t;
+
+using BlasStatus = hipblasStatus_t;
+
+// NOLINTNEXTLINE(performance-enum-size)
+enum class BlasOperation : std::underlying_type_t<hipblasOperation_t> {
+  BLAS_OP_N = HIPBLAS_OP_N,
+  BLAS_OP_T = HIPBLAS_OP_T,
+  BLAS_OP_C = HIPBLAS_OP_C,
+};
+
+using BlasOperationT = hipblasOperation_t;
+
+using CUResultOrHipErrorT = ErrorT;
+
+#else
+
+// Start of compatibility types for clangd
+
+// NOLINTNEXTLINE(performance-enum-size)
+enum class BlasStatusT {
+  BLAS_STATUS_SUCCESS = 0,
+};
+
+constexpr const char* AccelleratorString = "unknown";
+
+// NOLINTNEXTLINE(performance-enum-size)
+enum class ErrorT {
+  Success = 0,
+};
+
+// NOLINTNEXTLINE(performance-enum-size)
+enum class RandStatusT {
+  RAND_STATUS_SUCCESS = 0,
+};
+
+using StreamOrContext = void*;
+
+using DeviceProperties = void*;
+
+using RandGenerator = void*;
+
+using BlasHandle = void*;
+
+using BlasStatus = void*;
+
+// NOLINTNEXTLINE(performance-enum-size)
+enum class BlasOperation {
+  BLAS_OP_N,
+  BLAS_OP_T,
+  BLAS_OP_C,
+};
+
+using BlasOperationT = std::size_t;
+
+using CUResultOrHipErrorT = void*;
+
+#endif
+
+// abstracted function for both CUDA and HIP
+
+/// Get the error string from a call to CUDA or HIP libraries.
+/// \arg Error The error code that is returned by these calls.
+/// \return The error as a string. If neither CUDA nor HIP is built, "unknown" is returned.
+inline auto getErrorString(ErrorT Error) -> const char* {
+#ifdef FIRESTARTER_BUILD_CUDA
+  return cudaGetErrorString(static_cast<cudaError_t>(Error));
+#elif defined(FIRESTARTER_BUILD_HIP)
+  return hipGetErrorString(static_cast<hipError_t>(Error));
+#else
+  (void)Error;
+  return "unknown";
+#endif
+}
+
+/// Get the error string from a call to the CUDA or HIP blas library.
+/// \arg Status The status code that is returned by these calls.
+/// \return The error as a string. Status codes without a matching case yield "unknown".
+constexpr auto getErrorString(BlasStatusT Status) -> const char* {
+  switch (Status) {
+  case BlasStatusT::BLAS_STATUS_SUCCESS:
+    return "blas status: success";
+#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_HIP)
+  case BlasStatusT::BLAS_STATUS_NOT_INITIALIZED:
+    return "blas status: not initialized";
+  case BlasStatusT::BLAS_STATUS_ALLOC_FAILED:
+    return "blas status: alloc failed";
+  case BlasStatusT::BLAS_STATUS_INVALID_VALUE:
+    return "blas status: invalid value";
+  case BlasStatusT::BLAS_STATUS_ARCH_MISMATCH:
+    return "blas status: arch mismatch";
+  case BlasStatusT::BLAS_STATUS_MAPPING_ERROR:
+    return "blas status: mapping error";
+  case BlasStatusT::BLAS_STATUS_EXECUTION_FAILED:
+    return "blas status: execution failed";
+  case BlasStatusT::BLAS_STATUS_INTERNAL_ERROR:
+    return "blas status: internal error";
+  case BlasStatusT::BLAS_STATUS_NOT_SUPPORTED:
+    return "blas status: not supported";
+#endif
+#ifdef FIRESTARTER_BUILD_CUDA
+  case BlasStatusT::BLAS_STATUS_LICENSE_ERROR:
+    return "blas status: license error";
+#endif
+#ifdef FIRESTARTER_BUILD_HIP
+  case BlasStatusT::BLAS_STATUS_UNKNOWN:
+    return "blas status: unknown";
+  case BlasStatusT::BLAS_STATUS_HANDLE_IS_NULLPTR:
+    return "blas status: handle is null pointer";
+  case BlasStatusT::BLAS_STATUS_INVALID_ENUM:
+    return "blas status: invalid enum";
+#endif
+  default:
+    return "unknown";
+  }
+}
+
+/// Get the error string from a call to the CUDA or HIP random library.
+/// \arg Status The status code that is returned by these calls.
+/// \return The error as a string. Status codes without a matching case yield "unknown".
+constexpr auto getErrorString(RandStatusT Status) -> const char* {
+  switch (Status) {
+  case RandStatusT::RAND_STATUS_SUCCESS:
+    return "rand status: success";
+#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_HIP)
+  case RandStatusT::RAND_STATUS_VERSION_MISMATCH:
+    return "rand status: version mismatch";
+  case RandStatusT::RAND_STATUS_NOT_INITIALIZED:
+    return "rand status: not initialized";
+  case RandStatusT::RAND_STATUS_ALLOCATION_FAILED:
+    return "rand status: allocation failed";
+  case RandStatusT::RAND_STATUS_TYPE_ERROR:
+    return "rand status: type error";
+  case RandStatusT::RAND_STATUS_OUT_OF_RANGE:
+    return "rand status: out of range";
+  case RandStatusT::RAND_STATUS_LENGTH_NOT_MULTIPLE:
+    return "rand status: length not multiple";
+  case RandStatusT::RAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+    return "rand status: double precision required";
+  case RandStatusT::RAND_STATUS_LAUNCH_FAILURE:
+    return "rand status: launch failure";
+  case RandStatusT::RAND_STATUS_PREEXISTING_FAILURE:
+    return "rand status: preexisting failure";
+  case RandStatusT::RAND_STATUS_INITIALIZATION_FAILED:
+    return "rand status: initialization failed";
+  case RandStatusT::RAND_STATUS_ARCH_MISMATCH:
+    return "rand status: arch mismatch";
+  case RandStatusT::RAND_STATUS_INTERNAL_ERROR:
+    return "rand status: internal error";
+#endif
+#ifdef FIRESTARTER_BUILD_HIP
+  case RandStatusT::RAND_STATUS_NOT_IMPLEMENTED:
+    return "rand status: not implemented";
+#endif
+  default:
+    return "unknown";
+  }
+}
+
+#ifdef FIRESTARTER_BUILD_CUDA
+/// Get the error string from a call to CUDA library.
+/// \arg Result The status code that is returned by these calls.
+/// \return The error as a string.
+auto getErrorString(CUresult Result) -> const char* {
+  const char* ErrorString;
+  accellSafeCall(cuGetErrorName(Result, &ErrorString), __FILE__, __LINE__);
+  return ErrorString;
+}
+#else
+// Without CUDA: define stand-ins for CUresult and CUDA_SUCCESS, which accellSafeCall names in an if constexpr branch.
+
+enum class CUresult {};
+// NOLINTBEGIN(readability-identifier-naming)
+constexpr const int CUDA_SUCCESS = 0;
+// NOLINTEND(readability-identifier-naming)
+#endif
+
+template <typename T> void accellSafeCall(T TVal, const char* File, const int Line, std::optional<int> DeviceIndex) {
+  if constexpr (std::is_same_v<T, BlasStatusT>) {
+    if (TVal == BlasStatusT::BLAS_STATUS_SUCCESS) {
+      return;
+    }
+  } else if constexpr (std::is_same_v<T, ErrorT>) {
+    if (TVal == ErrorT::Success) {
+      return;
+    }
+  } else if constexpr (std::is_same_v<T, RandStatusT>) {
+    if (TVal == RandStatusT::RAND_STATUS_SUCCESS) {
+      return;
+    }
+  } else if constexpr (std::is_same_v<T, CUresult>) {
+#ifndef FIRESTARTER_BUILD_CUDA
+    static_assert(false, "Tried to call accellSafeCall with CUresult, but not building for CUDA.");
+#endif
+    if (TVal == CUDA_SUCCESS) {
+      return;
+    }
+  } else {
+    assert(false && "Tried to call accellSafeCall with an unknown type.");
+  }
+
+  std::stringstream Ss;
+  Ss << AccelleratorString << " error at " << File << ":" << Line
+     << ": error code = " << static_cast<std::underlying_type_t<T>>(TVal) << " (" << getErrorString(TVal) << ")";
+
+  if (DeviceIndex) {
+    Ss << ", device index: " << *DeviceIndex;
+  }
+
+  firestarter::log::error() << Ss.str();
+}
+
+/// Initialize the CUDA or HIP library. Wrapper to cuInit or hipInit.
+/// \arg Flags The flags forwarded to cuInit or hipInit.
+/// \returns The Error code returned from these calls.
+inline auto init(unsigned int Flags) -> CUResultOrHipErrorT {
+#ifdef FIRESTARTER_BUILD_CUDA
+  return cuInit(Flags);
+#elif defined(FIRESTARTER_BUILD_HIP)
+  return static_cast<CUResultOrHipErrorT>(hipInit(Flags));
+#else
+  (void)Flags;
+  static_assert(false, "Tried to call init, but neither building for CUDA nor HIP.");
+#endif
+}
+
+/// Get the number of GPU devices. Wrapper to cuDeviceGetCount or hipGetDeviceCount.
+/// \arg DevCount The reference to where the number of GPU devices will be written.
+/// \returns The Error code returned from these calls.
+inline auto getDeviceCount(int& DevCount) -> CUResultOrHipErrorT {
+#ifdef FIRESTARTER_BUILD_CUDA
+  return cuDeviceGetCount(&DevCount);
+#elif defined(FIRESTARTER_BUILD_HIP)
+  return static_cast<CUResultOrHipErrorT>(hipGetDeviceCount(&DevCount));
+#else
+  (void)DevCount;
+  static_assert(false, "Tried to call getDeviceCount, but neither building for CUDA nor HIP.");
+#endif
+}
+
+/// Create a context in case of CUDA or a stream in case of HIP on a specific device. It must be deleted with
+/// destroyContextOrStream. Failing calls to the underlying library are reported through accellSafeCall.
+/// \arg DeviceIndex The device on which to create the context or stream.
+/// \return The created context or stream.
+inline auto createContextOrStream(int DeviceIndex) -> StreamOrContext {
+  StreamOrContext Soc{};
+#ifdef FIRESTARTER_BUILD_CUDA
+  firestarter::log::trace() << "Creating " << AccelleratorString << " context for computation on device nr. "
+                            << DeviceIndex;
+  CUdevice Device;
+  accellSafeCall(cuDeviceGet(&Device, DeviceIndex), __FILE__, __LINE__, DeviceIndex);
+  accellSafeCall(cuCtxCreate(&Soc, 0, Device), __FILE__, __LINE__, DeviceIndex);
+
+  firestarter::log::trace() << "Set created " << AccelleratorString << " context on device nr. " << DeviceIndex;
+  accellSafeCall(cuCtxSetCurrent(Soc), __FILE__, __LINE__, DeviceIndex);
+#elif defined(FIRESTARTER_BUILD_HIP)
+  firestarter::log::trace() << "Creating " << AccelleratorString << " Stream for computation on device nr. "
+                            << DeviceIndex;
+  accellSafeCall(static_cast<ErrorT>(hipSetDevice(DeviceIndex)), __FILE__, __LINE__, DeviceIndex);
+  accellSafeCall(static_cast<ErrorT>(hipStreamCreate(&Soc)), __FILE__, __LINE__, DeviceIndex);
+#else
+  (void)DeviceIndex;
+  static_assert(false, "Tried to call createContextOrStream, but neither building for CUDA nor HIP.");
+#endif
+  return Soc;
+}
+
+/// Destroy the context (CUDA) or stream (HIP) with cuCtxDestroy and hipStreamDestroy respectively.
+/// \arg Soc The reference to the context or stream that is destroyed.
+/// \returns The Error code returned from these calls.
+inline auto destroyContextOrStream(StreamOrContext& Soc) -> CUResultOrHipErrorT {
+#ifdef FIRESTARTER_BUILD_CUDA
+  return static_cast<CUResultOrHipErrorT>(cuCtxDestroy(Soc));
+#elif defined(FIRESTARTER_BUILD_HIP)
+  return static_cast<CUResultOrHipErrorT>(hipStreamDestroy(Soc));
+#else
+  (void)Soc;
+  static_assert(false, "Tried to call destroyContextOrStream, but neither building for CUDA nor HIP.");
+#endif
+}
+
+/// Create a blas handle. Wrapper to cublasCreate or hipblasCreate.
+/// \arg BlasHandle The reference to a BlasHandle object which will be initialized.
+/// \returns The status code returned from these calls.
+inline auto blasCreate(BlasHandle& BlasHandle) -> BlasStatusT {
+#ifdef FIRESTARTER_BUILD_CUDA
+  return static_cast<BlasStatusT>(cublasCreate(&BlasHandle));
+#elif defined(FIRESTARTER_BUILD_HIP)
+  return static_cast<BlasStatusT>(hipblasCreate(&BlasHandle));
+#else
+  (void)BlasHandle;
+  static_assert(false, "Tried to call blasCreate, but neither building for CUDA nor HIP.");
+#endif
+}
+
+/// Destroy a blas handle. Wrapper to cublasDestroy or hipblasDestroy.
+/// \arg BlasHandle The reference to a BlasHandle object which will be destroyed.
+/// \returns The status code returned from these calls.
+inline auto blasDestroy(BlasHandle& BlasHandle) -> BlasStatusT {
+#ifdef FIRESTARTER_BUILD_CUDA
+  return static_cast<BlasStatusT>(cublasDestroy(BlasHandle));
+#elif defined(FIRESTARTER_BUILD_HIP)
+  return static_cast<BlasStatusT>(hipblasDestroy(BlasHandle));
+#else
+  (void)BlasHandle;
+  static_assert(false, "Tried to call blasDestroy, but neither building for CUDA nor HIP.");
+#endif
+}
+
+/// Get the properties of a specific GPU device. Wrapper to cudaGetDeviceProperties or hipGetDeviceProperties.
+/// \arg Property The reference to the properties that are retrieved.
+/// \arg DeviceIndex The index of the GPU device for which to retrieve the device properties.
+/// \returns The Error code returned from these calls.
+inline auto getDeviceProperties(DeviceProperties& Property, int DeviceIndex) -> ErrorT {
+#ifdef FIRESTARTER_BUILD_CUDA
+  return static_cast<ErrorT>(cudaGetDeviceProperties(&Property, DeviceIndex));
+#elif defined(FIRESTARTER_BUILD_HIP)
+  return static_cast<ErrorT>(hipGetDeviceProperties(&Property, DeviceIndex));
+#else
+  (void)Property;
+  (void)DeviceIndex;
+  static_assert(false, "Tried to call getDeviceProperties, but neither building for CUDA nor HIP.");
+#endif
+}
+
+/// Get the amount of available and total memory in the current CUDA or HIP context. Wrapper to cuMemGetInfo or
+/// hipMemGetInfo.
+/// \arg MemoryAvail The reference to the available memory that is retrieved.
+/// \arg MemoryTotal The reference to the total memory that is retrieved.
+/// \returns The Error code returned from these calls.
+inline auto memGetInfo(std::size_t& MemoryAvail, std::size_t& MemoryTotal) -> CUResultOrHipErrorT {
+#ifdef FIRESTARTER_BUILD_CUDA
+  return static_cast<CUResultOrHipErrorT>(cuMemGetInfo(&MemoryAvail, &MemoryTotal));
+#elif defined(FIRESTARTER_BUILD_HIP)
+  return static_cast<CUResultOrHipErrorT>(hipMemGetInfo(&MemoryAvail, &MemoryTotal));
+#else
+  (void)MemoryAvail;
+  (void)MemoryTotal;
+  static_assert(false, "Tried to call memGetInfo, but neither building for CUDA nor HIP.");
+#endif
+}
+
+/// Allocate device memory in the current CUDA or HIP context. Wrapper to cuMemAlloc or
+/// hipMalloc.
+/// \tparam FloatingPointType The type of the floating point used. Either float or double.
+/// \arg Ptr The address of the device pointer which is set by the malloc call.
+/// \arg MemorySize The memory that is allocated on the device in bytes.
+/// \returns The Error code returned from these calls.
+template <typename FloatingPointType>
+auto malloc(FloatingPointType** Ptr, std::size_t MemorySize) -> CUResultOrHipErrorT {
+#ifdef FIRESTARTER_BUILD_CUDA
+  return static_cast<CUResultOrHipErrorT>(cuMemAlloc(reinterpret_cast<CUdeviceptr*>(Ptr), MemorySize));
+#elif defined(FIRESTARTER_BUILD_HIP)
+  return static_cast<CUResultOrHipErrorT>(hipMalloc(Ptr, MemorySize));
+#else
+  (void)Ptr;
+  (void)MemorySize;
+  static_assert(false, "Tried to call malloc, but neither building for CUDA nor HIP.");
+#endif
+}
+
+/// Free device memory in the current CUDA or HIP context. Wrapper to cuMemFree or
+/// hipFree.
+/// \tparam FloatingPointType The type of the floating point used. Either float or double.
+/// \arg Ptr The device pointer which is passed to the free call.
+/// \returns The Error code returned from these calls.
+template <typename FloatingPointType> auto free(FloatingPointType* Ptr) -> CUResultOrHipErrorT {
+#ifdef FIRESTARTER_BUILD_CUDA
+  return static_cast<CUResultOrHipErrorT>(cuMemFree(reinterpret_cast<CUdeviceptr>(Ptr)));
+#elif defined(FIRESTARTER_BUILD_HIP)
+  return static_cast<CUResultOrHipErrorT>(hipFree(Ptr));
+#else
+  (void)Ptr;
+  static_assert(false, "Tried to call free, but neither building for CUDA nor HIP.");
+#endif
+}
+
+/// Create a pseudo random generator in the current CUDA or HIP context. Wrapper to curandCreateGenerator or
+/// hiprandCreateGenerator, called with the default pseudo random generator type.
+/// \arg RandomGen The reference to the random generator which is created by the calls.
+/// \returns The status code returned from these calls.
+inline auto randCreateGeneratorPseudoRandom(RandGenerator& RandomGen) -> RandStatusT {
+#ifdef FIRESTARTER_BUILD_CUDA
+  return static_cast<RandStatusT>(curandCreateGenerator(&RandomGen, CURAND_RNG_PSEUDO_DEFAULT));
+#elif defined(FIRESTARTER_BUILD_HIP)
+  return static_cast<RandStatusT>(hiprandCreateGenerator(&RandomGen, HIPRAND_RNG_PSEUDO_DEFAULT));
+#else
+  (void)RandomGen;
+  static_assert(false, "Tried to call randCreateGeneratorPseudoRandom, but neither building for CUDA nor HIP.");
+#endif
+}
+
+/// Set the pseudo random generator seed in the current CUDA or HIP context. Wrapper to
+/// curandSetPseudoRandomGeneratorSeed or hiprandSetPseudoRandomGeneratorSeed.
+/// \arg RandomGen The reference to the random generator.
+/// \arg Seed The seed used to initialize the pseudo random generator.
+/// \returns The status code returned from these calls.
+inline auto randSetPseudoRandomGeneratorSeed(RandGenerator& RandomGen, int Seed) -> RandStatusT {
+#ifdef FIRESTARTER_BUILD_CUDA
+  return static_cast<RandStatusT>(curandSetPseudoRandomGeneratorSeed(RandomGen, Seed));
+#elif defined(FIRESTARTER_BUILD_HIP)
+  return static_cast<RandStatusT>(hiprandSetPseudoRandomGeneratorSeed(RandomGen, Seed));
+#else
+  (void)RandomGen;
+  (void)Seed;
+  static_assert(false, "Tried to call randSetPseudoRandomGeneratorSeed, but neither building for CUDA nor HIP.");
+#endif
+}
+
+/// Initialize the provided memory with a specific number of uniform random floats. Wrapper to
+/// curandGenerateUniform or hiprandGenerateUniform.
+/// \arg RandomGen The reference to the random generator.
+/// \arg OutputPtr The device pointer to the memory which is initialized with the uniform random floats.
+/// \arg Num The number of uniform random floats.
+/// \returns The status code returned from these calls.
+inline auto randGenerateUniform(RandGenerator& RandomGen, float* OutputPtr, std::size_t Num) -> RandStatusT {
+#ifdef FIRESTARTER_BUILD_CUDA
+  return static_cast<RandStatusT>(curandGenerateUniform(RandomGen, OutputPtr, Num));
+#elif defined(FIRESTARTER_BUILD_HIP)
+  return static_cast<RandStatusT>(hiprandGenerateUniform(RandomGen, OutputPtr, Num));
+#else
+  (void)RandomGen;
+  (void)OutputPtr;
+  (void)Num;
+  static_assert(false, "Tried to call randGenerateUniform, but neither building for CUDA nor HIP.");
+#endif
+}
+
+/// Initialize the provided memory with a specific number of uniform random doubles. Wrapper to
+/// curandGenerateUniformDouble or hiprandGenerateUniformDouble.
+/// \arg RandomGen The reference to the random generator.
+/// \arg OutputPtr The device pointer to the memory which is initialized with the uniform random doubles.
+/// \arg Num The number of uniform random doubles.
+/// \returns The status code returned from these calls.
+inline auto randGenerateUniformDouble(RandGenerator& RandomGen, double* OutputPtr, std::size_t Num) -> RandStatusT {
+#ifdef FIRESTARTER_BUILD_CUDA
+  return static_cast<RandStatusT>(curandGenerateUniformDouble(RandomGen, OutputPtr, Num));
+#elif defined(FIRESTARTER_BUILD_HIP)
+  return static_cast<RandStatusT>(hiprandGenerateUniformDouble(RandomGen, OutputPtr, Num));
+#else
+  (void)RandomGen;
+  (void)OutputPtr;
+  (void)Num;
+  static_assert(false, "Tried to call randGenerateUniformDouble, but neither building for CUDA nor HIP.");
+#endif
+}
+
+/// Initialize the provided memory with a specific number of uniform random floating point values. Dispatches to
+/// randGenerateUniform (float) or randGenerateUniformDouble (double). Other types only trigger an assert at runtime.
+/// \tparam FloatPointType The floating point type that is used. Either float or double.
+/// \arg Generator The reference to the random generator.
+/// \arg OutputPtr The device pointer to the memory which is initialized with the uniform random values.
+/// \arg Num The number of uniform random values.
+/// \returns The status code returned from these calls.
+template <typename FloatPointType>
+auto generateUniform(RandGenerator& Generator, FloatPointType* OutputPtr, size_t Num) -> RandStatusT {
+  if constexpr (std::is_same_v<FloatPointType, float>) {
+    return randGenerateUniform(Generator, OutputPtr, Num);
+  } else if constexpr (std::is_same_v<FloatPointType, double>) {
+    return randGenerateUniformDouble(Generator, OutputPtr, Num);
+  } else {
+    assert(false && "generateUniform<FloatPointType>: Template argument must be either float or double");
+  }
+}
+
+/// Destroy a random generator in the current CUDA or HIP context. Wrapper to curandDestroyGenerator or
+/// hiprandDestroyGenerator.
+/// \arg RandomGen The reference to the random generator which should be destroyed.
+/// \returns The status code returned from these calls.
+inline auto randDestroyGenerator(RandGenerator& RandomGen) -> RandStatusT {
+#ifdef FIRESTARTER_BUILD_CUDA
+  return static_cast<RandStatusT>(curandDestroyGenerator(RandomGen));
+#elif defined(FIRESTARTER_BUILD_HIP)
+  return static_cast<RandStatusT>(hiprandDestroyGenerator(RandomGen));
+#else
+  (void)RandomGen;
+  static_assert(false, "Tried to call randDestroyGenerator, but neither building for CUDA nor HIP.");
+#endif
+}
+
+/// Copy memory from a device pointer to another device pointer. Wrapper to cuMemcpyDtoD or hipMemcpyDtoD.
+/// \arg DestinationPtr The destination device address.
+/// \arg SourcePtr The source device address.
+/// \arg Size The number of bytes to copy.
+/// \returns The Error code returned from these calls.
+template <typename FloatPointType>
+auto memcpyDtoD(FloatPointType* DestinationPtr, FloatPointType* SourcePtr, std::size_t Size) -> CUResultOrHipErrorT {
+#ifdef FIRESTARTER_BUILD_CUDA
+  return static_cast<CUResultOrHipErrorT>(
+      cuMemcpyDtoD(reinterpret_cast<CUdeviceptr>(DestinationPtr), reinterpret_cast<CUdeviceptr>(SourcePtr), Size));
+#elif defined(FIRESTARTER_BUILD_HIP)
+  return static_cast<CUResultOrHipErrorT>(hipMemcpyDtoD(DestinationPtr, SourcePtr, Size));
+#else
+  (void)DestinationPtr;
+  (void)SourcePtr;
+  (void)Size;
+  static_assert(false, "Tried to call memcpyDtoD, but neither building for CUDA nor HIP.");
+#endif
+}
+
+/// Block until the current device finished. Wrapper to cudaDeviceSynchronize or hipDeviceSynchronize.
+/// \returns The Error code returned from these calls.
+inline auto deviceSynchronize() -> ErrorT {
+#ifdef FIRESTARTER_BUILD_CUDA
+  return static_cast<ErrorT>(cudaDeviceSynchronize());
+#elif defined(FIRESTARTER_BUILD_HIP)
+  return static_cast<ErrorT>(hipDeviceSynchronize());
+#else
+  static_assert(false, "Tried to call deviceSynchronize, but neither building for CUDA nor HIP.");
+#endif
+}
+
+/// This function performs the matrix-matrix multiplication C = Alpha * op(A) * op(B) + Beta * C with op(A) and op(B)
+/// described by the selected operation for Transa and Transb. BlasOperation::BLAS_OP_N maps to op(X) = X,
+/// BlasOperation::BLAS_OP_T to op(X) = X transposed and BlasOperation::BLAS_OP_C to op(X) = conjugate transpose of X.
+/// It wraps (cu|hip)blas(S|D)gemm.
+/// \tparam FloatPointType The floating point type that is used. Either float or double.
+/// \arg Handle The blas handle
+/// \arg Transa The operation selected for op(A)
+/// \arg Transb The operation selected for op(B)
+/// \arg M Number of rows of matrix op(A) and C.
+/// \arg N Number of columns of matrix op(B) and C.
+/// \arg K Number of columns of op(A) and rows of op(B).
+/// \arg Alpha The scalar Alpha in the formula above.
+/// \arg A The matrix A in the formula above.
+/// \arg Lda Leading dimension of two-dimensional array used to store the matrix A.
+/// \arg B The matrix B in the formula above.
+/// \arg Ldb Leading dimension of two-dimensional array used to store matrix B.
+/// \arg Beta The scalar Beta in the formula above.
+/// \arg C The matrix C in the formula above. It is both an input and the output of the operation.
+/// \arg Ldc Leading dimension of a two-dimensional array used to store the matrix C.
+/// \returns The status code returned from these calls.
+template <typename FloatPointType>
+auto gemm(BlasHandle Handle, BlasOperation Transa, BlasOperation Transb, int M, int N, int K,
+          const FloatPointType& Alpha, const FloatPointType* A, int Lda, const FloatPointType* B, int Ldb,
+          const FloatPointType& Beta, FloatPointType* C, int Ldc) -> BlasStatusT {
+  if constexpr (std::is_same_v<FloatPointType, float>) {
+#ifdef FIRESTARTER_BUILD_CUDA
+    return static_cast<BlasStatusT>(cublasSgemm(Handle, static_cast<BlasOperationT>(Transa),
+                                                static_cast<BlasOperationT>(Transb), M, N, K, &Alpha, A, Lda, B, Ldb,
+                                                &Beta, C, Ldc));
+#elif defined(FIRESTARTER_BUILD_HIP)
+    return static_cast<BlasStatusT>(hipblasSgemm(Handle, static_cast<BlasOperationT>(Transa),
+                                                 static_cast<BlasOperationT>(Transb), M, N, K, &Alpha, A, Lda, B, Ldb,
+                                                 &Beta, C, Ldc));
+#endif
+  } else if constexpr (std::is_same_v<FloatPointType, double>) {
+#ifdef FIRESTARTER_BUILD_CUDA
+    return static_cast<BlasStatusT>(cublasDgemm(Handle, static_cast<BlasOperationT>(Transa),
+                                                static_cast<BlasOperationT>(Transb), M, N, K, &Alpha, A, Lda, B, Ldb,
+                                                &Beta, C, Ldc));
+#elif defined(FIRESTARTER_BUILD_HIP)
+    return static_cast<BlasStatusT>(hipblasDgemm(Handle, static_cast<BlasOperationT>(Transa),
+                                                 static_cast<BlasOperationT>(Transb), M, N, K, &Alpha, A, Lda, B, Ldb,
+                                                 &Beta, C, Ldc));
+#endif
+  } else {
+    (void)Handle;
+    (void)Transa;
+    (void)Transb;
+    (void)M;
+    (void)N;
+    (void)K;
+    (void)Alpha;
+    (void)A;
+    (void)Lda;
+    (void)B;
+    (void)Ldb;
+    (void)Beta;
+    (void)C;
+    (void)Ldc;
+    assert(false && "gemm<FloatPointType>: Template argument must be either float or double");
+  }
+
+#if not(defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_HIP))
+  (void)Handle;
+  (void)Transa;
+  (void)Transb;
+  (void)M;
+  (void)N;
+  (void)K;
+  (void)Alpha;
+  (void)A;
+  (void)Lda;
+  (void)B;
+  (void)Ldb;
+  (void)Beta;
+  (void)C;
+  (void)Ldc;
+  static_assert(false, "Tried to call gemm, but neither building for CUDA nor HIP.");
+#endif
+}
+
+} // namespace firestarter::cuda::compat
\ No newline at end of file
diff --git a/include/firestarter/DumpRegisterStruct.hpp b/include/firestarter/DumpRegisterStruct.hpp
index 7e80c111..63d4695e 100644
--- a/include/firestarter/DumpRegisterStruct.hpp
+++ b/include/firestarter/DumpRegisterStruct.hpp
@@ -21,22 +21,41 @@
 
 #pragma once
 
+#include "firestarter/Constants.hpp"
+
+#include <array>
+
 namespace firestarter {
 
 /* DO NOT CHANGE! the asm load-loop tests if it should dump the current register
  * content */
-enum DumpVariable : unsigned long long { Start = 0, Wait = 1 };
+// NOLINTBEGIN(performance-enum-size)
+/// This enum defines the variable that is used to control when the registers should be dumped.
+enum class DumpVariable : EightBytesType {
+  /// Start saving registers to memory
+  Start = 0,
+  /// When done, we change it to the Wait state. There we do nothing.
+  Wait = 1
+};
+// NOLINTEND(performance-enum-size)
 
-#define REGISTER_MAX_NUM 32
+/// The maximal number of SIMD registers. This is currently 32 for zmm registers.
+constexpr const auto RegisterMaxNum = 32;
+/// The maximal number of doubles in SIMD registers. This is currently 8 for zmm registers.
+constexpr const auto RegisterMaxSize = 8;
+/// The maximum number of doubles in SIMD registers multiplied with the maximum number of vector registers.
+constexpr const auto MaxNumberOfDoublesInVectorRegisters = RegisterMaxNum * RegisterMaxSize;
 
+/// This struct is used to do the communication between the high-load loop and the part of the program that saves the
+/// dumped registers to a file.
 struct DumpRegisterStruct {
-  // REGISTER_MAX_NUM cachelines
-  volatile double registerValues[REGISTER_MAX_NUM * 8];
-  // pad to use a whole cacheline
-  volatile unsigned long long padding[7];
-  volatile DumpVariable dumpVar;
+  /// This array will contain the dumped registers. It has the size of 32 cachelines. (8B doubles * 8 doubles in a
+  /// register * 32 registers)
+  std::array<double, MaxNumberOfDoublesInVectorRegisters> RegisterValues;
+  /// Pad the DumpVar to use a whole cacheline
+  std::array<EightBytesType, 7> Padding;
+  /// The variable that controls the execution of the dump register code in the high-load routine.
+  volatile DumpVariable DumpVar;
 };
 
-#undef REGISTER_MAX_NUM
-
 } // namespace firestarter
diff --git a/include/firestarter/DumpRegisterWorkerData.hpp b/include/firestarter/DumpRegisterWorkerData.hpp
index f7b721d4..e0bf01d4 100644
--- a/include/firestarter/DumpRegisterWorkerData.hpp
+++ b/include/firestarter/DumpRegisterWorkerData.hpp
@@ -21,42 +21,53 @@
 
 #pragma once
 
-#include <firestarter/DumpRegisterStruct.hpp>
-#include <firestarter/LoadWorkerData.hpp>
+#include "firestarter/LoadWorkerData.hpp"
+#include "firestarter/Logging/Log.hpp"
+#include "firestarter/WindowsCompat.hpp" // IWYU pragma: keep
 
 #include <chrono>
-
-#ifdef FIRESTARTER_DEBUG_FEATURES
+#include <utility>
 
 namespace firestarter {
 
+/// This class holds the data that is required for the worker thread that dumps the register contents to a file.
 class DumpRegisterWorkerData {
 public:
-  DumpRegisterWorkerData(std::shared_ptr<LoadWorkerData> loadWorkerData,
-                         std::chrono::seconds dumpTimeDelta,
-                         std::string dumpFilePath)
-      : loadWorkerData(loadWorkerData), dumpTimeDelta(dumpTimeDelta) {
-
-    if (dumpFilePath.empty()) {
-      char cwd[PATH_MAX];
-      if (getcwd(cwd, sizeof(cwd)) != NULL) {
-        this->dumpFilePath = cwd;
+  DumpRegisterWorkerData() = delete;
+
+  /// Initialize the DumpRegisterWorkerData.
+  /// \arg LoadWorkerDataPtr The shared pointer to the data of the thread whose registers should be dumped. We need it
+  /// to access the memory to which the registers are dumped as well as getting the size and count of registers.
+  /// \arg DumpTimeDelta Every this number of seconds the register content will be dumped.
+  /// \arg DumpFilePath The folder that is used to dump registers to. If the string is empty the current directory will
+  /// be chosen. If it cannot be determined /tmp is used. In this directory a file called hamming_distance.csv will be
+  /// created.
+  DumpRegisterWorkerData(std::shared_ptr<LoadWorkerData> LoadWorkerDataPtr, std::chrono::seconds DumpTimeDelta,
+                         const std::string& DumpFilePath)
+      : LoadWorkerDataPtr(std::move(LoadWorkerDataPtr))
+      , DumpTimeDelta(DumpTimeDelta) {
+    if (DumpFilePath.empty()) {
+      char* Pwd = get_current_dir_name(); // NOTE(review): glibc malloc()s this buffer; it is not freed here - confirm
+      if (Pwd) {
+        this->DumpFilePath = Pwd;
       } else {
         log::error() << "getcwd() failed. Set --dump-registers-outpath to /tmp";
-        this->dumpFilePath = "/tmp";
+        this->DumpFilePath = "/tmp";
       }
     } else {
-      this->dumpFilePath = dumpFilePath;
+      this->DumpFilePath = DumpFilePath;
     }
   }
 
-  ~DumpRegisterWorkerData() {}
+  ~DumpRegisterWorkerData() = default;
 
-  std::shared_ptr<LoadWorkerData> loadWorkerData;
-  const std::chrono::seconds dumpTimeDelta;
-  std::string dumpFilePath;
+  /// The shared pointer to the data of the thread whose registers should be dumped. We need it to access the memory to
+  /// which the registers are dumped as well as getting the size and count of registers.
+  std::shared_ptr<LoadWorkerData> LoadWorkerDataPtr;
+  /// Every this number of seconds the register content will be dumped.
+  const std::chrono::seconds DumpTimeDelta;
+  /// The folder in which the hamming_distance.csv file will be created.
+  std::string DumpFilePath;
 };
 
-} // namespace firestarter
-
-#endif
+} // namespace firestarter
\ No newline at end of file
diff --git a/include/firestarter/Environment/CPUTopology.hpp b/include/firestarter/Environment/CPUTopology.hpp
index dcb61e96..bf9a8d19 100644
--- a/include/firestarter/Environment/CPUTopology.hpp
+++ b/include/firestarter/Environment/CPUTopology.hpp
@@ -21,7 +21,9 @@
 
 #pragma once
 
+#include <cstdint>
 #include <list>
+#include <optional>
 #include <ostream>
 #include <sstream>
 #include <string>
@@ -32,54 +34,93 @@ extern "C" {
 
 namespace firestarter::environment {
 
+/// This class models the properties of a processor.
 class CPUTopology {
 public:
-  CPUTopology(std::string architecture);
+  explicit CPUTopology(std::string Architecture);
   virtual ~CPUTopology();
 
-  unsigned numThreads() const {
-    return _numThreadsPerCore * _numCoresTotal;
-  }
-  unsigned maxNumThreads() const;
-  unsigned numThreadsPerCore() const { return _numThreadsPerCore; }
-  unsigned numCoresTotal() const { return _numCoresTotal; }
-  unsigned numPackages() const { return _numPackages; }
+  friend auto operator<<(std::ostream& Stream, CPUTopology const& CpuTopologyRef) -> std::ostream&;
 
-  std::string const &architecture() const { return _architecture; }
-  virtual std::string const &vendor() const { return _vendor; }
-  virtual std::string const &processorName() const { return _processorName; }
-  virtual std::string const &model() const = 0;
+  /// The total number of hardware threads.
+  [[nodiscard]] auto numThreads() const -> unsigned { return NumThreadsPerCore * NumCoresTotal; }
+  /// The maximum os_index of all PUs plus 1 if we cannot determine the number of cpu kinds. Otherwise the maximum
+  /// number of PUs.
+  [[nodiscard]] auto maxNumThreads() const -> unsigned;
+  /// The number of threads per core, assuming this number is consistent across all cores.
+  [[nodiscard]] auto numThreadsPerCore() const -> unsigned { return NumThreadsPerCore; }
+  /// The total number of cores.
+  [[nodiscard]] auto numCoresTotal() const -> unsigned { return NumCoresTotal; }
+  /// The total number of packages.
+  [[nodiscard]] auto numPackages() const -> unsigned { return NumPackages; }
+  /// The CPU architecture e.g., x86_64
+  [[nodiscard]] auto architecture() const -> std::string const& { return Architecture; }
+  /// The CPU vendor i.e., Intel or AMD.
+  [[nodiscard]] virtual auto vendor() const -> std::string const& { return Vendor; }
+  /// The processor name, this includes the vendor specific name
+  [[nodiscard]] virtual auto processorName() const -> std::string const& { return ProcessorName; }
+  /// The model of the processor. With X86 this is the string of Family, Model and Stepping.
+  [[nodiscard]] virtual auto model() const -> std::string const& = 0;
 
-  // get the size of the L1i-cache in bytes
-  unsigned instructionCacheSize() const { return _instructionCacheSize; }
+  /// Getter for the L1i-cache size in bytes
+  [[nodiscard]] auto instructionCacheSize() const -> const auto& { return InstructionCacheSize; }
 
-  // return the cpu clockrate in Hz
-  virtual unsigned long long clockrate() const { return _clockrate; }
-  // return the cpu features
-  virtual std::list<std::string> const &features() const = 0;
+  /// Getter for the clockrate in Hz
+  [[nodiscard]] virtual auto clockrate() const -> uint64_t { return Clockrate; }
 
-  // get a timestamp
-  virtual unsigned long long timestamp() const = 0;
+  /// Getter for the list of CPU features
+  [[nodiscard]] virtual auto features() const -> std::list<std::string> const& = 0;
 
-  int getPkgIdFromPU(unsigned pu) const;
-  int getCoreIdFromPU(unsigned pu) const;
+  /// Get the current hardware timestamp
+  [[nodiscard]] virtual auto timestamp() const -> uint64_t = 0;
+
+  /// Get the logical index of the core that houses the PU which is described by the os index.
+  /// \arg Pu The os index of the thread.
+  /// \returns Optionally the logical index of the core that houses this hardware thread.
+  [[nodiscard]] auto getCoreIdFromPU(unsigned Pu) const -> std::optional<unsigned>;
+
+  /// Get the logical index of the package that houses the PU which is described by the os index.
+  /// \arg Pu The os index of the thread.
+  /// \returns Optionally the logical index of the package that houses this hardware thread.
+  [[nodiscard]] auto getPkgIdFromPU(unsigned Pu) const -> std::optional<unsigned>;
 
 protected:
-  std::string scalingGovernor() const;
-  std::ostream &print(std::ostream &stream) const;
+  /// Read the scaling_governor file of cpu0 on linux and return the contents as a string.
+  [[nodiscard]] static auto scalingGovernor() -> std::string;
+
+  /// Print the information about this processor to a stream.
+  [[nodiscard]] auto print(std::ostream& Stream) const -> std::ostream&;
 
 private:
-  static std::stringstream getFileAsStream(std::string const &filePath);
-
-  unsigned _numThreadsPerCore;
-  unsigned _numCoresTotal;
-  unsigned _numPackages;
-  std::string _architecture;
-  std::string _vendor = "";
-  std::string _processorName = "";
-  unsigned _instructionCacheSize = 0;
-  unsigned long long _clockrate = 0;
-  hwloc_topology_t topology;
+  /// The CPU vendor i.e., Intel or AMD.
+  std::string Vendor;
+
+  /// Helper function to open a filepath and return a stringstream with its contents.
+  /// \arg FilePath The file to open
+  /// \returns A stringstream with the contents of the file.
+  [[nodiscard]] static auto getFileAsStream(std::string const& FilePath) -> std::stringstream;
+
+  /// The number of threads per core, assuming this number is consistent across all cores.
+  unsigned NumThreadsPerCore;
+  /// The total number of cores.
+  unsigned NumCoresTotal;
+  /// The total number of packages.
+  unsigned NumPackages;
+
+  /// The CPU architecture e.g., x86_64
+  std::string Architecture;
+  /// The processor name, this includes the vendor specific name
+  std::string ProcessorName;
+  /// The optional size of the instruction cache per core.
+  std::optional<unsigned> InstructionCacheSize;
+  /// Clockrate of the CPU in Hz
+  uint64_t Clockrate = 0;
+  /// The hwloc topology that is used to query information about the processor.
+  hwloc_topology_t Topology{};
 };
 
+inline auto operator<<(std::ostream& Stream, CPUTopology const& CpuTopologyRef) -> std::ostream& {
+  return CpuTopologyRef.print(Stream);
+}
+
 } // namespace firestarter::environment
diff --git a/include/firestarter/Environment/Environment.hpp b/include/firestarter/Environment/Environment.hpp
index c76dc073..41446bde 100644
--- a/include/firestarter/Environment/Environment.hpp
+++ b/include/firestarter/Environment/Environment.hpp
@@ -21,74 +21,129 @@
 
 #pragma once
 
-#include <firestarter/Environment/CPUTopology.hpp>
-#include <firestarter/Environment/Platform/PlatformConfig.hpp>
-#include <firestarter/Environment/Platform/RuntimeConfig.hpp>
+#include "firestarter/Environment/CPUTopology.hpp"
+#include "firestarter/Environment/Platform/PlatformConfig.hpp"
 
 #include <cassert>
+#include <cstdint>
+#include <memory>
 #include <vector>
 
 namespace firestarter::environment {
 
+/// This class handles parsing of user input to FIRESTARTER, namely the number of threads used, the thread affinity, the
+/// selection of the correct high-load function, selection of the instruction groups and number of lines. It also
+/// handles printing useful information, provides interfaces to the PlatformConfig and the number of threads. It
+/// facilitates setting the cpu affinity in further parts of FIRESTARTER.
 class Environment {
 public:
-  Environment(CPUTopology *topology) : _topology(topology) {}
-  ~Environment() {
-    delete this->_topology;
-    if (_selectedConfig != nullptr) {
-      delete _selectedConfig;
-    }
-  }
+  Environment() = delete;
+  explicit Environment(std::unique_ptr<CPUTopology>&& Topology)
+      : Topology(std::move(Topology)) {}
+  virtual ~Environment() = default;
+
+  /// Parse the user input for the cpu affinity and the number of requested threads. If a CpuBind is provided we
+  /// evaluate it and set the number of threads and their affinity accordingly. This is only supported on linux and with
+  /// the FIRESTARTER_THREAD_AFFINITY build flag. This function will save the correct number of threads based on the
+  /// user input in RequestedNumThreads. It must be called for FIRESTARTER to function properly.
+  /// \arg RequestedNumThreads The number of threads that are requested by a user. If this is zero the number will be
+  /// automatically determined.
+  /// \arg CpuBind A string following the CPULIST format: "x,y,z", "x-y", "x-y/step", and any combination of the
+  /// above. We select the number of requested CPUs and their cpubind from this string.
+  void evaluateCpuAffinity(unsigned RequestedNumThreads, const std::string& CpuBind);
 
-  int evaluateCpuAffinity(unsigned requestedNumThreads, std::string cpuBind);
-  int setCpuAffinity(unsigned thread);
+  /// The worker threads are numbered from zero to RequestedNumThreads. Set the cpuaffinity of a calling thread based
+  /// on this index to the one that should be used according to the determined CpuBind list from the call to
+  /// evaluateCpuAffinity. This function will throw if it is called with an invalid index.
+  /// \arg Thread The index of the worker thread.
+  void setCpuAffinity(unsigned Thread) const;
+
+  /// Print a summary of the threads used by the workers. If thread affinity is supported (linux and compiled with the
+  /// FIRESTARTER_THREAD_AFFINITY flag), print which thread is pinned to which CPU.
   void printThreadSummary();
 
-  virtual void evaluateFunctions() = 0;
-  virtual int selectFunction(unsigned functionId,
-                             bool allowUnavailablePayload) = 0;
-  virtual int selectInstructionGroups(std::string groups) = 0;
+  /// Select a PlatformConfig based on its generated id. This function will throw if a payload is not available or the
+  /// id is incorrect. If id is zero we automatically select a matching PlatformConfig.
+  /// \arg FunctionId The id of the PlatformConfig that should be selected.
+  /// \arg AllowUnavailablePayload If true we will not throw if the PlatformConfig is not available.
+  virtual void selectFunction(unsigned FunctionId, bool AllowUnavailablePayload) = 0;
+
+  /// Parse the selected payload instruction groups and save them in the selected function. Throws if the input is
+  /// invalid.
+  /// \arg Groups The list of instruction groups that is in the format: multiple INSTRUCTION:VALUE pairs
+  /// comma-separated.
+  virtual void selectInstructionGroups(std::string Groups) = 0;
+
+  /// Print the available instruction groups of the selected function.
   virtual void printAvailableInstructionGroups() = 0;
-  virtual void setLineCount(unsigned lineCount) = 0;
+
+  /// Set the line count in the selected function.
+  /// \arg LineCount The maximum number of instructions that should be in the high-load loop.
+  virtual void setLineCount(unsigned LineCount) = 0;
+
+  /// Print a summary of the settings of the selected config.
   virtual void printSelectedCodePathSummary() = 0;
+
+  /// Print a list of the high-load functions and whether they are available on the current system.
   virtual void printFunctionSummary() = 0;
 
-  platform::RuntimeConfig &selectedConfig() const {
-#if defined(__clang__)
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wunused-value"
-#endif
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-value"
-    assert(("No RuntimeConfig selected", _selectedConfig != nullptr));
-#pragma GCC diagnostic pop
-#if defined(__clang__)
-#pragma clang diagnostic pop
-#endif
-    return *_selectedConfig;
+  /// Get the number of threads FIRESTARTER will run with.
+  [[nodiscard]] auto requestedNumThreads() const -> uint64_t { return RequestedNumThreads; }
+
+  /// Getter (which allows modifying) for the current platform config containing the payload, settings and the
+  /// associated name.
+  [[nodiscard]] virtual auto config() -> platform::PlatformConfig& {
+    assert(Config && "No PlatformConfig selected");
+    return *Config;
   }
 
-  unsigned long long requestedNumThreads() const {
-    return _requestedNumThreads;
+  /// Const getter for the current platform config containing the payload, settings and the associated name.
+  [[nodiscard]] virtual auto config() const -> const platform::PlatformConfig& {
+    assert(Config && "No PlatformConfig selected");
+    return *Config;
   }
 
-  CPUTopology const &topology() const {
-    assert(_topology != nullptr);
-    return *_topology;
+  /// Const getter for the current CPU topology.
+  [[nodiscard]] virtual auto topology() const -> const CPUTopology& {
+    assert(Topology && "Topology is a nullptr");
+    return *Topology;
   }
 
 protected:
-  platform::RuntimeConfig *_selectedConfig = nullptr;
-  CPUTopology *_topology = nullptr;
+  /// This function sets the selected config by taking ownership of the supplied PlatformConfig.
+  void setConfig(std::unique_ptr<platform::PlatformConfig>&& Config) { this->Config = std::move(Config); }
 
 private:
-  unsigned long long _requestedNumThreads;
+  /// The selected config that contains the payload, settings and the associated name.
+  std::unique_ptr<platform::PlatformConfig> Config;
+  /// The description of the current CPU.
+  std::unique_ptr<CPUTopology> Topology;
+
+  /// The number of threads FIRESTARTER is requested to run with. This will initially be set to zero, which will be
+  /// replaced by the maximum number of threads after calling evaluateCpuAffinity.
+  uint64_t RequestedNumThreads = 0;
 
-  // TODO: replace these functions with the builtins one from hwloc
-  int cpuAllowed(unsigned id);
-  int cpuSet(unsigned id);
+  // TODO(Issue #74): Use hwloc for cpu thread affinity.
+#if (defined(linux) || defined(__linux__)) && defined(FIRESTARTER_THREAD_AFFINITY)
+  /// Check if the Cpu is allowed to be used with the current program.
+  /// \arg Id The id of the CPU which is checked.
+  /// \returns true if the CPU with Id is allowed to be used by the program.
+  static auto cpuAllowed(unsigned Id) -> bool;
 
-  std::vector<unsigned> cpuBind;
+  /// Set the cpu affinity of the current thread to a specific CPU.
+  /// \arg Id The id of the CPU to which to pin the calling thread.
+  /// \returns 0 on success. See the man page for sched_setaffinity.
+  static auto cpuSet(unsigned Id) -> int;
+
+  /// Add a CPU to mask if this CPU is available on the current system or throw with an error.
+  /// \arg Cpu The id of the CPU to add to the mask.
+  /// \arg Mask The reference to the mask to add the cpu to.
+  void addCpuSet(unsigned Cpu, cpu_set_t& Mask) const;
+
+  /// The list of physical CPU ids that are requested to be used. The length of this list should match the number of
+  /// requested threads if it is not zero.
+  std::vector<unsigned> CpuBind;
+#endif
 };
 
 } // namespace firestarter::environment
diff --git a/include/firestarter/Environment/Payload/CompiledPayload.hpp b/include/firestarter/Environment/Payload/CompiledPayload.hpp
new file mode 100644
index 00000000..488c6c8d
--- /dev/null
+++ b/include/firestarter/Environment/Payload/CompiledPayload.hpp
@@ -0,0 +1,101 @@
+/******************************************************************************
+ * FIRESTARTER - A Processor Stress Test Utility
+ * Copyright (C) 2024 TU Dresden, Center for Information Services and High
+ * Performance Computing
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/\>.
+ *
+ * Contact: daniel.hackenberg@tu-dresden.de
+ *****************************************************************************/
+
+#pragma once
+
+#include "firestarter/Constants.hpp"
+#include "firestarter/Environment/Payload/PayloadStats.hpp"
+
+#include <chrono>
+#include <memory>
+#include <utility>
+
+namespace firestarter::environment::payload {
+
+class Payload;
+
+/// This class represents a payload that can be executed. It is created by calling compilePayload of the payload class
+/// with specific settings. It contains a reference to the init and low load functions (which do not change with payload
+/// settings) and the high load function which changes based on the settings. The stats of the high load function (nb.
+/// of flops, bytes of memory accessed and instructions) can also be retrieved.
+class CompiledPayload {
+public:
+  CompiledPayload() = delete;
+  virtual ~CompiledPayload() = default;
+
+  /// A unique ptr for the CompiledPayload with a custom deleter.
+  using UniquePtr = std::unique_ptr<CompiledPayload, void (*)(CompiledPayload*)>;
+
+  using HighLoadFunctionPtr = uint64_t (*)(double*, volatile LoadThreadWorkType*, uint64_t);
+
+  /// Getter for the stats of the high load function of the compiled payload
+  [[nodiscard]] auto stats() const -> const PayloadStats& { return Stats; };
+
+  /// Function to initialize the memory used by the high load function.
+  /// \arg MemoryAddr The pointer to the memory.
+  /// \arg BufferSize The number of doubles that is allocated in MemoryAddr.
+  void init(double* MemoryAddr, uint64_t BufferSize);
+
+  /// Function to produce a low load on the cpu.
+  /// \arg LoadVar The variable that controls the load. If this variable changes from LoadThreadWorkType::LowLoad to
+  /// something else this function will return.
+  /// \arg Period The period of the low/high load switching. This function may sleep a fraction of this period.
+  void lowLoadFunction(volatile LoadThreadWorkType& LoadVar, std::chrono::microseconds Period);
+
+  /// Function to produce high load on the cpu.
+  /// \arg MemoryAddr The pointer to the memory.
+  /// \arg LoadVar The variable that controls the load. If this variable changes from LoadThreadWorkType::LoadHigh to
+  /// something else this function will return.
+  /// \arg Iterations The current iteration counter. This number will be incremented for every iteration of the high
+  /// load loop.
+  /// \returns The iteration counter passed into this function plus the number of iterations of the high load loop.
+  [[nodiscard]] auto highLoadFunction(double* MemoryAddr, volatile LoadThreadWorkType& LoadVar, uint64_t Iterations)
+      -> uint64_t {
+    return HighLoadFunction(MemoryAddr, &LoadVar, Iterations);
+  }
+
+protected:
+  /// Constructor for the CompiledPayload.
+  /// \arg Stats The stats of the high load function from the payload.
+  /// \arg PayloadPtr A unique pointer to the payload class to allow calling the init and low load functions which do
+  /// not change based on different payload settings.
+  /// \arg HighLoadFunction The pointer to the compiled high load function.
+  CompiledPayload(const PayloadStats& Stats, std::unique_ptr<Payload>&& PayloadPtr,
+                  HighLoadFunctionPtr HighLoadFunction)
+      : Stats(Stats)
+      , PayloadPtr(std::move(PayloadPtr))
+      , HighLoadFunction(HighLoadFunction) {}
+
+  /// Getter for the pointer to the high load function. We need to access this pointer directly to free the associated
+  /// memory from asmjit.
+  [[nodiscard]] auto highLoadFunctionPtr() -> HighLoadFunctionPtr { return HighLoadFunction; }
+
+private:
+  /// The stats of the compiled payload.
+  PayloadStats Stats;
+  /// The pointer to the payload class to allow calling the init and low load functions which do not change based on
+  /// different payload settings.
+  std::unique_ptr<Payload> PayloadPtr;
+  /// The pointer to the compiled high load function.
+  HighLoadFunctionPtr HighLoadFunction;
+};
+
+} // namespace firestarter::environment::payload
\ No newline at end of file
diff --git a/include/firestarter/Environment/Payload/Payload.hpp b/include/firestarter/Environment/Payload/Payload.hpp
index 40246ac0..b5b17199 100644
--- a/include/firestarter/Environment/Payload/Payload.hpp
+++ b/include/firestarter/Environment/Payload/Payload.hpp
@@ -21,93 +21,86 @@
 
 #pragma once
 
-#include <initializer_list>
+#include "firestarter/Constants.hpp"
+#include "firestarter/Environment/CPUTopology.hpp"
+#include "firestarter/Environment/Payload/CompiledPayload.hpp"
+#include "firestarter/Environment/Payload/PayloadSettings.hpp"
+
+#include <chrono>
 #include <list>
 #include <string>
-#include <vector>
+#include <utility>
 
 namespace firestarter::environment::payload {
 
 class Payload {
 private:
-  std::string _name;
-  unsigned getSequenceStartCount(const std::vector<std::string> &sequence,
-                                 const std::string start);
+  /// The name of this payload. It is usually named by the CPU extension this payload uses e.g., SSE2 or FMA.
+  std::string Name;
+
+  /// The size of the SIMD registers in units of doubles (8B)
+  unsigned RegisterSize = 0;
+
+  /// The number of SIMD registers used by the payload
+  unsigned RegisterCount = 0;
 
 protected:
-  unsigned _flops;
-  unsigned _bytes;
-  // number of instructions in load loop
-  unsigned _instructions;
-  // size of used simd registers in bytes
-  unsigned _registerSize;
-  // number of used simd registers
-  unsigned _registerCount;
-
-  std::vector<std::string> generateSequence(
-      const std::vector<std::pair<std::string, unsigned>> &proportion);
-  unsigned getL2SequenceCount(const std::vector<std::string> &sequence) {
-    return getSequenceStartCount(sequence, "L2");
-  };
-  unsigned getL3SequenceCount(const std::vector<std::string> &sequence) {
-    return getSequenceStartCount(sequence, "L3");
-  };
-  unsigned getRAMSequenceCount(const std::vector<std::string> &sequence) {
-    return getSequenceStartCount(sequence, "RAM");
-  };
-
-  unsigned
-  getNumberOfSequenceRepetitions(const std::vector<std::string> &sequence,
-                                 const unsigned numberOfLines) {
-    if (sequence.size() == 0) {
-      return 0;
-    }
-    return numberOfLines / sequence.size();
-  };
-
-  unsigned getL2LoopCount(const std::vector<std::string> &sequence,
-                          const unsigned numberOfLines, const unsigned size,
-                          const unsigned threads);
-  unsigned getL3LoopCount(const std::vector<std::string> &sequence,
-                          const unsigned numberOfLines, const unsigned size,
-                          const unsigned threads);
-  unsigned getRAMLoopCount(const std::vector<std::string> &sequence,
-                           const unsigned numberOfLines, const unsigned size,
-                           const unsigned threads);
+  /// Function to initialize the memory used by the high load function.
+  /// \arg MemoryAddr The pointer to the memory.
+  /// \arg BufferSize The number of doubles that is allocated in MemoryAddr.
+  virtual void init(double* MemoryAddr, uint64_t BufferSize) const = 0;
+
+  /// Function to produce a low load on the cpu.
+  /// \arg LoadVar The variable that controls the load. If this variable changes from LoadThreadWorkType::LowLoad to
+  /// something else this function will return.
+  /// \arg Period The period of the low/high load switching. This function may sleep a fraction of this period.
+  virtual void lowLoadFunction(volatile LoadThreadWorkType& LoadVar, std::chrono::microseconds Period) const = 0;
 
 public:
-  Payload(std::string name, unsigned registerSize, unsigned registerCount)
-      : _name(name), _registerSize(registerSize),
-        _registerCount(registerCount) {}
-  virtual ~Payload() {}
-
-  const std::string &name() const { return _name; }
-  unsigned flops() const { return _flops; }
-  unsigned bytes() const { return _bytes; }
-  unsigned instructions() const { return _instructions; }
-  unsigned registerSize() const { return _registerSize; }
-  unsigned registerCount() const { return _registerCount; }
-
-  virtual bool isAvailable() const = 0;
-
-  virtual void lowLoadFunction(volatile unsigned long long *addrHigh,
-                               unsigned long long period) = 0;
-
-  virtual int compilePayload(
-      std::vector<std::pair<std::string, unsigned>> const &proportion,
-      unsigned instructionCacheSize,
-      std::list<unsigned> const &dataCacheBufferSize, unsigned ramBufferSize,
-      unsigned thread, unsigned numberOfLines, bool dumpRegisters,
-      bool errorDetection) = 0;
-  virtual std::list<std::string> getAvailableInstructions() const = 0;
-  virtual void init(unsigned long long *memoryAddr,
-                    unsigned long long bufferSize) = 0;
-  virtual unsigned long long
-  highLoadFunction(unsigned long long *addrMem,
-                   volatile unsigned long long *addrHigh,
-                   unsigned long long iterations) = 0;
-
-  virtual Payload *clone() const = 0;
+  Payload() = delete;
+
+  /// Abstract construction for the payload.
+  /// \arg Name The name of this payload. It is usually named by the CPU extension this payload uses e.g., SSE2 or FMA.
+  /// \arg RegisterSize The size of the SIMD registers in units of doubles (8B).
+  /// \arg RegisterCount The number of SIMD registers used by the payload.
+  Payload(std::string Name, unsigned RegisterSize, unsigned RegisterCount) noexcept
+      : Name(std::move(Name))
+      , RegisterSize(RegisterSize)
+      , RegisterCount(RegisterCount) {}
+  virtual ~Payload() = default;
+
+  // Allow init and lowLoadFunction functions to be accessed by the CompiledPayload class.
+  friend void CompiledPayload::init(double* MemoryAddr, uint64_t BufferSize);
+  friend void CompiledPayload::lowLoadFunction(volatile LoadThreadWorkType& LoadVar, std::chrono::microseconds Period);
+
+  /// Get the name of this payload. It is usually named by the CPU extension this payload uses e.g., SSE2 or FMA.
+  [[nodiscard]] auto name() const -> const std::string& { return Name; }
+
+  /// The size of the SIMD registers in units of doubles (8B)
+  [[nodiscard]] auto registerSize() const -> unsigned { return RegisterSize; }
+
+  /// The number of SIMD registers used by the payload
+  [[nodiscard]] auto registerCount() const -> unsigned { return RegisterCount; }
+
+  /// Check if this payload is available on the current system. This usually means that the cpu extensions are
+  /// available.
+  /// \arg Topology The CPUTopology that is used to check whether this payload is supported.
+  /// \returns true if the payload is supported on the given CPUTopology.
+  [[nodiscard]] virtual auto isAvailable(const CPUTopology& Topology) const -> bool = 0;
+
+  /// Compile this payload with supplied settings and optional features.
+  /// \arg Settings The settings for this payload e.g., the number of lines or the size of the caches.
+  /// \arg DumpRegisters Should the code to support dumping registers be baked into the high load routine of the
+  /// compiled payload.
+  /// \arg ErrorDetection Should the code to support error detection between threads be baked into the high load routine
+  /// of the compiled payload.
+  /// \returns The compiled payload that provides access to the init and load functions.
+  [[nodiscard]] virtual auto compilePayload(const PayloadSettings& Settings, bool DumpRegisters,
+                                            bool ErrorDetection) const -> CompiledPayload::UniquePtr = 0;
+
+  /// Get the available instruction items that are supported by this payload.
+  /// \returns The available instruction items that are supported by this payload.
+  [[nodiscard]] virtual auto getAvailableInstructions() const -> std::list<std::string> = 0;
 };
 
 } // namespace firestarter::environment::payload
diff --git a/include/firestarter/Environment/Payload/PayloadSettings.hpp b/include/firestarter/Environment/Payload/PayloadSettings.hpp
new file mode 100644
index 00000000..8438e9a6
--- /dev/null
+++ b/include/firestarter/Environment/Payload/PayloadSettings.hpp
@@ -0,0 +1,268 @@
+/******************************************************************************
+ * FIRESTARTER - A Processor Stress Test Utility
+ * Copyright (C) 2024 TU Dresden, Center for Information Services and High
+ * Performance Computing
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/\>.
+ *
+ * Contact: daniel.hackenberg@tu-dresden.de
+ *****************************************************************************/
+
+#pragma once
+
+#include <cassert>
+#include <cstddef>
+#include <initializer_list>
+#include <list>
+#include <optional>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace firestarter::environment::payload {
+
+/// This class represents the settings that can be changed in the high load routine of a payload.
+struct PayloadSettings {
+public:
+  using InstructionWithProportion = std::pair<std::string, unsigned>;
+
+private:
+  /// The number of threads for which this payload is available. Multiple ones may exist. The PayloadSettings are
+  /// concrete once this is set to contain only one element.
+  std::list<unsigned> Threads;
+
+  /// The size of the L1i cache per physical CPU core. This value may be empty.
+  std::optional<unsigned> InstructionCacheSize;
+
+  /// The size of the L1d,L2,...,L3 caches per physical CPU core.
+  std::list<unsigned> DataCacheBufferSize;
+
+  /// The selected size of the buffer that is in the RAM on the physical CPU core.
+  unsigned RamBufferSize;
+
+  /// The maximum number of instructions that should appear inside the high load routine.
+  unsigned Lines;
+
+  /// This represents the instructions in combination with the number of times they should appear in the generated
+  /// sequence.
+  std::vector<InstructionWithProportion> InstructionGroups;
+
+  /// Get the number of items in the sequence that start with a given string.
+  /// \arg Sequence The sequence that is analyzed.
+  /// \arg Start The string that contains the start of the item names that should be counted in the sequence.
+  /// \returns The number of items in the sequence that start with the supplied strings.
+  [[nodiscard]] static auto getSequenceStartCount(const std::vector<std::string>& Sequence, const std::string& Start)
+      -> unsigned;
+
+public:
+  PayloadSettings() = delete;
+
+  PayloadSettings(std::initializer_list<unsigned> Threads, std::initializer_list<unsigned> DataCacheBufferSize,
+                  unsigned RamBufferSize, unsigned Lines, std::vector<InstructionWithProportion>&& InstructionGroups)
+      : Threads(Threads)
+      , DataCacheBufferSize(DataCacheBufferSize)
+      , RamBufferSize(RamBufferSize)
+      , Lines(Lines)
+      , InstructionGroups(std::move(InstructionGroups)) {}
+
+  /// Generate a sequence of items interleaved with one another based on a supplied number of how many times each item
+  /// should appear in the resulting sequence.
+  /// \arg Proportion The mapping of items defined by a string and the number of times this item should appear in the
+  /// resulting sequence.
+  /// \returns The sequence that is generated from the supplied proportions.
+  [[nodiscard]] static auto generateSequence(const std::vector<InstructionWithProportion>& Proportion)
+      -> std::vector<std::string>;
+
+  /// Get the number of items in the sequence that start with "L2".
+  /// \arg Sequence The sequence that is analyzed.
+  /// \returns The number of items in the sequence that start with "L2".
+  [[nodiscard]] static auto getL2SequenceCount(const std::vector<std::string>& Sequence) -> unsigned {
+    return getSequenceStartCount(Sequence, "L2");
+  };
+
+  /// Get the number of items in the sequence that start with "L3".
+  /// \arg Sequence The sequence that is analyzed.
+  /// \returns The number of items in the sequence that start with "L3".
+  [[nodiscard]] static auto getL3SequenceCount(const std::vector<std::string>& Sequence) -> unsigned {
+    return getSequenceStartCount(Sequence, "L3");
+  };
+
+  /// Get the number of items in the sequence that start with "RAM".
+  /// \arg Sequence The sequence that is analyzed.
+  /// \returns The number of items in the sequence that start with "RAM".
+  [[nodiscard]] static auto getRAMSequenceCount(const std::vector<std::string>& Sequence) -> unsigned {
+    return getSequenceStartCount(Sequence, "RAM");
+  };
+
+  /// Get the maximum number of repetitions of the the supplied sequence so that the size of the sequence times the
+  /// number of repetitions is smaller equal to the number of lines. The number of repetitions is a unsigned number.
+  /// \arg Sequence The reference to the sequence that should be repeated multiple times
+  /// \arg NumberOfLines The maximum number of entries in the repeated sequence
+  /// \returns The number of repetitions of the sequence.
+  [[nodiscard]] static auto getNumberOfSequenceRepetitions(const std::vector<std::string>& Sequence,
+                                                           const unsigned NumberOfLines) -> unsigned {
+    // An empty sequence can never be repeated. Handling it first also keeps the division below away from a zero
+    // divisor.
+    const auto SequenceLength = Sequence.size();
+    return SequenceLength == 0 ? 0 : NumberOfLines / SequenceLength;
+  };
+
+  /// Get the number of accesses that can be made to 80% of the L2 cache size (each incrementing the pointer to the
+  /// cache) before the pointer needs to be reset to the original value. This assumes that each L2 item in the sequence
+  /// increments the pointer by one cache line (64B). It is also assumed that the number of accesses fit at least once
+  /// into this cache. This should always be the case on modern CPUs.
+  /// \arg Sequence The reference to the sequence.
+  /// \arg NumberOfLines The maximum number of entries in the repeated sequence.
+  /// \arg Size The size of the L2 Cache.
+  /// \returns The maximum number of iterations of the repeated sequence to fill up to 80% of the L2 cache.
+  [[nodiscard]] static auto getL2LoopCount(const std::vector<std::string>& Sequence, unsigned NumberOfLines,
+                                           unsigned Size) -> unsigned;
+
+  /// Get the number of accesses that can be made to 80% of the L3 cache size (each incrementing the pointer to the
+  /// cache) before the pointer needs to be reset to the original value. This assumes that each L3 item in the sequence
+  /// increments the pointer by one cache line (64B). See the note about assumptions on the size of the cache in the
+  /// documentation of getL2LoopCount.
+  /// \arg Sequence The reference to the sequence.
+  /// \arg NumberOfLines The maximum number of entries in the repeated sequence.
+  /// \arg Size The size of the L3 Cache.
+  /// \returns The maximum number of iterations of the repeated sequence to fill up to 80% of the L3 cache.
+  [[nodiscard]] static auto getL3LoopCount(const std::vector<std::string>& Sequence, unsigned NumberOfLines,
+                                           unsigned Size) -> unsigned;
+
+  /// Get the number of accesses that can be made to 100% of the RAM size (each incrementing the pointer to the ram)
+  /// before the pointer needs to be reset to the original value. This assumes that each RAM item in the sequence
+  /// increments the pointer by one cache line (64B). See the note about assumptions on the size of the cache in the
+  /// documentation of getL2LoopCount.
+  /// \arg Sequence The reference to the sequence.
+  /// \arg NumberOfLines The maximum number of entries in the repeated sequence.
+  /// \arg Size The size of the RAM.
+  /// \returns The maximum number of iterations of the repeated sequence to fill up to 100% of the RAM.
+  [[nodiscard]] static auto getRAMLoopCount(const std::vector<std::string>& Sequence, unsigned NumberOfLines,
+                                            unsigned Size) -> unsigned;
+
+  /// Are the payload settings concrete, i.e. can one specific payload be compiled with these settings. This is the
+  /// case if the list of available thread counts is reduced to a single element.
+  [[nodiscard]] auto isConcreate() const -> bool { return Threads.size() == 1; }
+
+  /// The number of threads which are available with the associated platform/payload.
+  [[nodiscard]] auto threads() const -> const auto& { return Threads; }
+
+  /// The concrete number of threads which is selected. Asserts that the settings are concrete.
+  [[nodiscard]] auto thread() const -> unsigned {
+    assert(isConcreate() && "Number of threads is not concreate.");
+    return Threads.front();
+  }
+
+  /// The available instruction cache size. This refers to the L1i-Cache on the physical CPU core.
+  [[nodiscard]] auto instructionCacheSize() const -> const auto& { return InstructionCacheSize; }
+
+  /// The size of the L1d,L2,...,L3 caches per physical CPU core.
+  [[nodiscard]] auto dataCacheBufferSize() const -> const auto& { return DataCacheBufferSize; }
+
+  /// The selected size of the buffer that is in the RAM on the physical CPU core.
+  [[nodiscard]] auto ramBufferSize() const -> auto{ return RamBufferSize; }
+
+  /// Return the total buffer size for the data caches and the ram per physical CPU core.
+  [[nodiscard]] auto totalBufferSize() const -> std::size_t {
+    // Seed the sum with the RAM buffer, then add the size of every data cache level on top of it.
+    std::size_t Sum = RamBufferSize;
+    for (const unsigned CacheBytes : DataCacheBufferSize) {
+      Sum += CacheBytes;
+    }
+    return Sum;
+  }
+
+  /// The number of instruction groups which should be used in the payload per physical CPU core.
+  [[nodiscard]] auto lines() const -> auto{ return Lines; }
+
+  /// The available instruction cache size. This refers to the L1i-Cache per thread on the physical CPU core.
+  [[nodiscard]] auto instructionCacheSizePerThread() const -> std::optional<unsigned> {
+    // Only dereference when a value is present (dereferencing an empty optional is UB); zero still yields empty.
+    if (!InstructionCacheSize.has_value() || *InstructionCacheSize == 0) {
+      return {};
+    }
+    return *InstructionCacheSize / thread();
+  }
+
+  /// The size of the L1d,L2,...,L3 caches per thread on the physical CPU core.
+  [[nodiscard]] auto dataCacheBufferSizePerThread() const -> std::list<unsigned> {
+    std::list<unsigned> PerThread;
+    for (const unsigned CacheBytes : DataCacheBufferSize) {
+      PerThread.push_back(CacheBytes / thread());
+    }
+    return PerThread;
+  }
+
+  /// The selected size of the buffer that is in the RAM per thread on the physical CPU core.
+  [[nodiscard]] auto ramBufferSizePerThread() const -> auto{ return RamBufferSize / thread(); }
+
+  /// Return the total buffer size for the data caches and the ram per thread on the physical CPU core.
+  [[nodiscard]] auto totalBufferSizePerThread() const -> std::size_t { return totalBufferSize() / thread(); }
+
+  /// The number of instruction groups which should be used in the payload per thread on the physical CPU core.
+  [[nodiscard]] auto linesPerThread() const -> auto{ return Lines / thread(); }
+
+  /// The vector of instruction groups with proportions.
+  [[nodiscard]] auto instructionGroups() const -> const auto& { return InstructionGroups; }
+
+  /// Generate a sequence of items interleaved with one another based on the instruction groups.
+  /// \returns The sequence that is generated from the supplied propotions in the instruction groups.
+  [[nodiscard]] auto sequence() const -> std::vector<std::string> { return generateSequence(instructionGroups()); }
+
+  /// The vector of used instructions that are saved in the instruction groups
+  [[nodiscard]] auto instructionGroupItems() const -> std::vector<std::string> {
+    std::vector<std::string> Names;
+    Names.reserve(InstructionGroups.size());
+    for (const auto& Group : InstructionGroups) {
+      Names.emplace_back(Group.first);
+    }
+    return Names;
+  }
+
+  /// Get the string that represents the instructions in combination with the number of times they should appear in the
+  /// sequence.
+  [[nodiscard]] auto getInstructionGroupsString() const -> std::string {
+    std::stringstream Stream;
+
+    // Write the separator in front of every entry except the first one. This yields the same comma separated list
+    // of NAME:VALUE entries without having to strip a trailing comma afterwards. An empty set of instruction groups
+    // results in an empty string.
+    const char* Separator = "";
+    for (auto const& [Name, Value] : InstructionGroups) {
+      Stream << Separator << Name << ":" << Value;
+      Separator = ",";
+    }
+
+    return Stream.str();
+  }
+
+  /// Make the settings concrete.
+  /// \arg InstructionCacheSize The detected size of the instruction cache.
+  /// \arg ThreadsPerCore The number of threads per physical CPU core.
+  void concretize(std::optional<unsigned> InstructionCacheSize, unsigned ThreadsPerCore) {
+    this->InstructionCacheSize = InstructionCacheSize;
+    this->Threads = {ThreadsPerCore};
+  }
+
+  /// Save the supplied instruction groups with their proportion in the payload settings.
+  /// \arg InstructionGroups The vector with pairs of instructions and proportions
+  void selectInstructionGroups(std::vector<InstructionWithProportion> const& InstructionGroups) {
+    this->InstructionGroups = InstructionGroups;
+  }
+
+  /// Save the line count in the payload settings.
+  void setLineCount(unsigned LineCount) { this->Lines = LineCount; }
+};
+
+} // namespace firestarter::environment::payload
diff --git a/include/firestarter/Measurement/Metric/RAPL.h b/include/firestarter/Environment/Payload/PayloadStats.hpp
similarity index 63%
rename from include/firestarter/Measurement/Metric/RAPL.h
rename to include/firestarter/Environment/Payload/PayloadStats.hpp
index d88e3d91..79b2b1e3 100644
--- a/include/firestarter/Measurement/Metric/RAPL.h
+++ b/include/firestarter/Environment/Payload/PayloadStats.hpp
@@ -1,6 +1,6 @@
 /******************************************************************************
  * FIRESTARTER - A Processor Stress Test Utility
- * Copyright (C) 2020-2023 TU Dresden, Center for Information Services and High
+ * Copyright (C) 2024 TU Dresden, Center for Information Services and High
  * Performance Computing
  *
  * This program is free software: you can redistribute it and/or modify
@@ -21,6 +21,18 @@
 
 #pragma once
 
-#include <firestarter/Measurement/MetricInterface.h>
+namespace firestarter::environment::payload {
 
-extern metric_interface_t rapl_metric;
+/// This struct represents the stats a compiled payload has.
+struct PayloadStats {
+  /// The number of flops computed per iteration of the high load routine.
+  unsigned Flops = 0;
+
+  /// The number of bytes accessed to the main memory per iteration of the high load routine.
+  unsigned Bytes = 0;
+
+  /// The number of instructions in load loop
+  unsigned Instructions = 0;
+};
+
+} // namespace firestarter::environment::payload
diff --git a/include/firestarter/Environment/Platform/PlatformConfig.hpp b/include/firestarter/Environment/Platform/PlatformConfig.hpp
index cbde3c68..40833b8c 100644
--- a/include/firestarter/Environment/Platform/PlatformConfig.hpp
+++ b/include/firestarter/Environment/Platform/PlatformConfig.hpp
@@ -21,83 +21,117 @@
 
 #pragma once
 
-#include <firestarter/Environment/Payload/Payload.hpp>
-#include <firestarter/Logging/Log.hpp>
-
-#include <algorithm>
-#include <initializer_list>
-#include <map>
-#include <sstream>
-#include <string>
+#include "firestarter/Environment/CPUTopology.hpp"
+#include "firestarter/Environment/Payload/Payload.hpp"
+#include "firestarter/Logging/Log.hpp"
 
 namespace firestarter::environment::platform {
 
+/// The payload in combination with settings and a short hand name for the specific microarchitecture this payload is
+/// designed for.
 class PlatformConfig {
 private:
-  std::string _name;
-  std::list<unsigned> _threads;
-  payload::Payload *_payload;
+  /// The name of this platform. This is usually a short hand for the CPU microarchitecture e.g., HSW_COREI or
+  /// HSW_XEONEP.
+  std::string Name;
 
-protected:
-  unsigned _instructionCacheSize;
-  std::list<unsigned> _dataCacheBufferSize;
-  unsigned _ramBufferSize;
-  unsigned _lines;
+  /// The settings for the associated payload.
+  payload::PayloadSettings Settings;
+
+  /// The payload this platform should execute.
+  std::shared_ptr<const payload::Payload> Payload;
 
 public:
-  PlatformConfig(std::string name, std::list<unsigned> threads,
-                 unsigned instructionCacheSize,
-                 std::initializer_list<unsigned> dataCacheBufferSize,
-                 unsigned ramBufferSize, unsigned lines,
-                 payload::Payload *payload)
-      : _name(name), _threads(threads), _payload(payload),
-        _instructionCacheSize(instructionCacheSize),
-        _dataCacheBufferSize(dataCacheBufferSize),
-        _ramBufferSize(ramBufferSize), _lines(lines) {}
-  virtual ~PlatformConfig() { delete _payload; }
-
-  const std::string &name() const { return _name; }
-  unsigned instructionCacheSize() const { return _instructionCacheSize; }
-  const std::list<unsigned> &dataCacheBufferSize() const {
-    return _dataCacheBufferSize;
-  }
-  unsigned ramBufferSize() const { return _ramBufferSize; }
-  unsigned lines() const { return _lines; }
-  payload::Payload const &payload() const { return *_payload; }
-
-  std::map<unsigned, std::string> getThreadMap() const {
-    std::map<unsigned, std::string> threadMap;
-
-    for (auto const &thread : _threads) {
-      std::stringstream functionName;
-      functionName << "FUNC_" << name() << "_" << payload().name() << "_"
-                   << thread << "T";
-      threadMap[thread] = functionName.str();
-    }
+  /// Getter for the name of the platform.
+  [[nodiscard]] auto name() const -> const auto& { return Name; }
 
-    return threadMap;
-  }
+  /// Getter for the settings of the platform.
+  [[nodiscard]] auto settings() const -> const auto& { return Settings; }
 
-  bool isAvailable() const { return payload().isAvailable(); }
+  /// Reference to the settings. This allows them to be overridden.
+  [[nodiscard]] auto settings() -> auto& { return Settings; }
 
-  virtual bool isDefault() const = 0;
+  /// Getter for the payload of the platform.
+  [[nodiscard]] auto payload() const -> const auto& { return Payload; }
 
-  virtual std::vector<std::pair<std::string, unsigned>>
-  getDefaultPayloadSettings() const = 0;
+  /// Check if this platform is available on the current system. This translates to whether the CPU extensions are
+  /// available for the payload that is used.
+  /// \arg Topology The reference to the CPUTopology that is used to check whether this platform is supported.
+  /// \returns true if the platform is supported on the given CPUTopology.
+  [[nodiscard]] auto isAvailable(const CPUTopology& Topology) const -> bool { return isAvailable(&Topology); }
 
-  std::string getDefaultPayloadSettingsString() const {
-    std::stringstream ss;
+  /// Check if this platform is available and the default on the current system.
+  /// \arg Topology The reference to the CPUTopology that is used to check whether this payload is supported.
+  /// \returns true if the platform is the default one for a given CPUTopology.
+  [[nodiscard]] auto isDefault(const CPUTopology& Topology) const -> bool { return isDefault(&Topology); }
 
-    for (auto const &[name, value] : this->getDefaultPayloadSettings()) {
-      ss << name << ":" << value << ",";
+protected:
+  /// Check if this platform is available on the current system. This translates to whether the CPU extensions are
+  /// available for the payload that is used.
+  /// \arg Topology The pointer to the CPUTopology that is used to check whether this platform is supported.
+  /// \returns true if the platform is supported on the given CPUTopology.
+  [[nodiscard]] virtual auto isAvailable(const CPUTopology* Topology) const -> bool {
+    return payload()->isAvailable(*Topology);
+  }
+
+  /// Check if this platform is available and the default on the current system.
+  /// \arg Topology The pointer to the CPUTopology that is used to check whether this payload is supported.
+  /// \returns true if the platform is the default one for a given CPUTopology.
+  [[nodiscard]] virtual auto isDefault(const CPUTopology*) const -> bool = 0;
+
+public:
+  PlatformConfig() = delete;
+
+  PlatformConfig(std::string Name, payload::PayloadSettings&& Settings,
+                 std::shared_ptr<const payload::Payload>&& Payload) noexcept
+      : Name(std::move(Name))
+      , Settings(std::move(Settings))
+      , Payload(std::move(Payload)) {}
+
+  virtual ~PlatformConfig() = default;
+
+  /// Clone the platform config.
+  [[nodiscard]] virtual auto clone() const -> std::unique_ptr<PlatformConfig> = 0;
+
+  /// Clone a concrete platform config.
+  /// \arg InstructionCacheSize The detected size of the instruction cache.
+  /// \arg ThreadsPerCore The number of threads per physical CPU core.
+  [[nodiscard]] virtual auto cloneConcreate(std::optional<unsigned> InstructionCacheSize, unsigned ThreadsPerCore) const
+      -> std::unique_ptr<PlatformConfig> = 0;
+
+  /// The function name for this platform config given a specific thread per core count.
+  /// \arg ThreadsPerCore The number of threads per core.
+  /// \returns The name of the function (a platform name, payload name and a specific thread per core count)
+  [[nodiscard]] auto functionName(unsigned ThreadsPerCore) const -> std::string {
+    return "FUNC_" + Name + "_" + Payload->name() + "_" + std::to_string(ThreadsPerCore) + "T";
+  };
+
+  /// Get the concreate functions name.
+  [[nodiscard]] auto functionName() const -> std::string {
+    assert(Settings.isConcreate() && "Settings must be concreate for a concreate function name");
+    return functionName(Settings.thread());
+  };
+
+  /// Print a summary for the selected platform/payload with given settings.
+  void printCodePathSummary() const {
+    assert(Settings.isConcreate() && "Setting must be concreate to print the code path summary.");
+
+    log::info() << "\n"
+                << "  Taking " << Payload->name() << " path optimized for " << Name << " - " << Settings.thread()
+                << " thread(s) per core\n"
+                << "  Used buffersizes per thread:";
+
+    if (Settings.instructionCacheSizePerThread()) {
+      log::info() << "    - L1i-Cache: " << *Settings.instructionCacheSizePerThread() << " Bytes";
     }
 
-    auto str = ss.str();
-    if (str.size() > 0) {
-      str.pop_back();
+    unsigned I = 1;
+    for (auto const& Bytes : Settings.dataCacheBufferSizePerThread()) {
+      log::info() << "    - L" << I << "d-Cache: " << Bytes << " Bytes";
+      I++;
     }
 
-    return str;
+    log::info() << "    - Memory: " << Settings.ramBufferSizePerThread() << " Bytes";
   }
 };
 
diff --git a/include/firestarter/Environment/Platform/RuntimeConfig.hpp b/include/firestarter/Environment/Platform/RuntimeConfig.hpp
deleted file mode 100644
index 2ed821ea..00000000
--- a/include/firestarter/Environment/Platform/RuntimeConfig.hpp
+++ /dev/null
@@ -1,131 +0,0 @@
-/******************************************************************************
- * FIRESTARTER - A Processor Stress Test Utility
- * Copyright (C) 2020-2023 TU Dresden, Center for Information Services and High
- * Performance Computing
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/\>.
- *
- * Contact: daniel.hackenberg@tu-dresden.de
- *****************************************************************************/
-
-#pragma once
-
-#include <firestarter/Environment/Platform/PlatformConfig.hpp>
-
-#include <cassert>
-
-namespace firestarter::environment::platform {
-
-class RuntimeConfig {
-private:
-  PlatformConfig const &_platformConfig;
-  std::unique_ptr<payload::Payload> _payload;
-  unsigned _thread;
-  std::vector<std::pair<std::string, unsigned>> _payloadSettings;
-  unsigned _instructionCacheSize;
-  std::list<unsigned> _dataCacheBufferSize;
-  unsigned _ramBufferSize;
-  unsigned _lines;
-
-public:
-  RuntimeConfig(PlatformConfig const &platformConfig, unsigned thread,
-                unsigned detectedInstructionCacheSize)
-      : _platformConfig(platformConfig), _payload(nullptr), _thread(thread),
-        _payloadSettings(platformConfig.getDefaultPayloadSettings()),
-        _instructionCacheSize(platformConfig.instructionCacheSize()),
-        _dataCacheBufferSize(platformConfig.dataCacheBufferSize()),
-        _ramBufferSize(platformConfig.ramBufferSize()),
-        _lines(platformConfig.lines()) {
-    if (detectedInstructionCacheSize != 0) {
-      this->_instructionCacheSize = detectedInstructionCacheSize;
-    }
-  };
-
-  RuntimeConfig(const RuntimeConfig &c)
-      : _platformConfig(c.platformConfig()),
-        _payload(c.platformConfig().payload().clone()), _thread(c.thread()),
-        _payloadSettings(c.payloadSettings()),
-        _instructionCacheSize(c.instructionCacheSize()),
-        _dataCacheBufferSize(c.dataCacheBufferSize()),
-        _ramBufferSize(c.ramBufferSize()), _lines(c.lines()) {}
-
-  ~RuntimeConfig() { _payload.reset(); }
-
-  PlatformConfig const &platformConfig() const { return _platformConfig; }
-  payload::Payload &payload() const {
-#if defined(__clang__)
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wunused-value"
-#endif
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-value"
-    assert(("Payload pointer is null. Each thread has to use it's own "
-            "RuntimeConfig",
-            _payload != nullptr));
-#pragma GCC diagnostic pop
-#if defined(__clang__)
-#pragma clang diagnostic pop
-#endif
-    return *_payload;
-  }
-  unsigned thread() const { return _thread; }
-  const std::vector<std::pair<std::string, unsigned>> &payloadSettings() const {
-    return _payloadSettings;
-  }
-  std::vector<std::string> payloadItems() const {
-    std::vector<std::string> items;
-    for (auto const &pair : _payloadSettings) {
-      items.push_back(pair.first);
-    }
-    return items;
-  }
-
-  unsigned instructionCacheSize() const { return _instructionCacheSize; }
-  const std::list<unsigned> &dataCacheBufferSize() const {
-    return _dataCacheBufferSize;
-  }
-  unsigned ramBufferSize() const { return _ramBufferSize; }
-  unsigned lines() const { return _lines; }
-
-  void setPayloadSettings(
-      std::vector<std::pair<std::string, unsigned>> const &payloadSettings) {
-    this->_payloadSettings = payloadSettings;
-  }
-
-  void setLineCount(unsigned lineCount) { this->_lines = lineCount; }
-
-  void printCodePathSummary() const {
-    log::info() << "\n"
-                << "  Taking " << platformConfig().payload().name()
-                << " path optimized for " << platformConfig().name() << " - "
-                << thread() << " thread(s) per core\n"
-                << "  Used buffersizes per thread:";
-
-    if (instructionCacheSize() != 0) {
-      log::info() << "    - L1i-Cache: " << instructionCacheSize() / thread()
-                  << " Bytes";
-    }
-
-    unsigned i = 1;
-    for (auto const &bytes : dataCacheBufferSize()) {
-      log::info() << "    - L" << i << "d-Cache: " << bytes / thread()
-                  << " Bytes";
-      i++;
-    }
-
-    log::info() << "    - Memory: " << ramBufferSize() / thread() << " Bytes";
-  }
-};
-
-} // namespace firestarter::environment::platform
diff --git a/include/firestarter/Environment/X86/Payload/AVX512Payload.hpp b/include/firestarter/Environment/X86/Payload/AVX512Payload.hpp
index b23f1b97..20bfc491 100644
--- a/include/firestarter/Environment/X86/Payload/AVX512Payload.hpp
+++ b/include/firestarter/Environment/X86/Payload/AVX512Payload.hpp
@@ -21,37 +21,50 @@
 
 #pragma once
 
-#include <firestarter/Environment/X86/Payload/X86Payload.hpp>
+#include "firestarter/Environment/X86/Payload/X86Payload.hpp"
 
 namespace firestarter::environment::x86::payload {
+
+/// This payload is designed for the AVX512 foundation CPU extension.
 class AVX512Payload final : public X86Payload {
 public:
-  AVX512Payload(asmjit::CpuFeatures const &supportedFeatures)
-      : X86Payload(supportedFeatures, {asmjit::CpuFeatures::X86::kAVX512_F},
-                   "AVX512", 8, 32) {}
-
-  int compilePayload(
-      std::vector<std::pair<std::string, unsigned>> const &proportion,
-      unsigned instructionCacheSize,
-      std::list<unsigned> const &dataCacheBufferSize, unsigned ramBufferSize,
-      unsigned thread, unsigned numberOfLines, bool dumpRegisters,
-      bool errorDetection) override;
-  std::list<std::string> getAvailableInstructions() const override;
-  void init(unsigned long long *memoryAddr,
-            unsigned long long bufferSize) override;
+  AVX512Payload() noexcept
+      : X86Payload(/*FeatureRequests=*/{asmjit::CpuFeatures::X86::kAVX512_F}, /*Name=*/"AVX512", /*RegisterSize=*/8,
+                   /*RegisterCount=*/32,
+                   /*InstructionFlops=*/
+                   {{"REG", 32},
+                    {"L1_L", 32},
+                    {"L1_BROADCAST", 16},
+                    {"L1_S", 16},
+                    {"L1_LS", 16},
+                    {"L2_L", 32},
+                    {"L2_S", 16},
+                    {"L2_LS", 16},
+                    {"L3_L", 32},
+                    {"L3_S", 16},
+                    {"L3_LS", 16},
+                    {"L3_P", 16},
+                    {"RAM_L", 32},
+                    {"RAM_S", 16},
+                    {"RAM_LS", 16},
+                    {"RAM_P", 16}},
+                   /*InstructionMemory=*/{{"RAM_L", 64}, {"RAM_S", 128}, {"RAM_LS", 128}, {"RAM_P", 64}}) {}
 
-  firestarter::environment::payload::Payload *clone() const override {
-    return new AVX512Payload(this->supportedFeatures());
-  };
+  /// Compile this payload with supplied settings and optional features.
+  /// \arg Settings The settings for this payload e.g., the number of lines or the size of the caches.
+  /// \arg DumpRegisters Should the code to support dumping registers be baked into the high load routine of the
+  /// compiled payload.
+  /// \arg ErrorDetection Should the code to support error detection between thread be baked into the high load routine
+  /// of the compiled payload.
+  /// \returns The compiled payload that provides access to the init and load functions.
+  [[nodiscard]] auto compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters,
+                                    bool ErrorDetection) const
+      -> environment::payload::CompiledPayload::UniquePtr override;
 
 private:
-  const std::map<std::string, unsigned> instructionFlops = {
-      {"REG", 32},   {"L1_L", 32},  {"L1_BROADCAST", 16}, {"L1_S", 16},
-      {"L1_LS", 16}, {"L2_L", 32},  {"L2_S", 16},         {"L2_LS", 16},
-      {"L3_L", 32},  {"L3_S", 16},  {"L3_LS", 16},        {"L3_P", 16},
-      {"RAM_L", 32}, {"RAM_S", 16}, {"RAM_LS", 16},       {"RAM_P", 16}};
-
-  const std::map<std::string, unsigned> instructionMemory = {
-      {"RAM_L", 64}, {"RAM_S", 128}, {"RAM_LS", 128}, {"RAM_P", 64}};
+  /// Function to initialize the memory used by the high load function.
+  /// \arg MemoryAddr The pointer to the memory.
+  /// \arg BufferSize The number of doubles that is allocated in MemoryAddr.
+  void init(double* MemoryAddr, uint64_t BufferSize) const override;
 };
 } // namespace firestarter::environment::x86::payload
diff --git a/include/firestarter/Environment/X86/Payload/AVXPayload.hpp b/include/firestarter/Environment/X86/Payload/AVXPayload.hpp
index 0a6e8014..24ef7a15 100644
--- a/include/firestarter/Environment/X86/Payload/AVXPayload.hpp
+++ b/include/firestarter/Environment/X86/Payload/AVXPayload.hpp
@@ -21,36 +21,49 @@
 
 #pragma once
 
-#include <firestarter/Environment/X86/Payload/X86Payload.hpp>
+#include "firestarter/Environment/X86/Payload/X86Payload.hpp"
 
 namespace firestarter::environment::x86::payload {
+
+/// This payload is designed for the AVX CPU extension.
 class AVXPayload final : public X86Payload {
 public:
-  AVXPayload(asmjit::CpuFeatures const &supportedFeatures)
-      : X86Payload(supportedFeatures, {asmjit::CpuFeatures::X86::kAVX}, "AVX",
-                   4, 16) {}
-
-  int compilePayload(
-      std::vector<std::pair<std::string, unsigned>> const &proportion,
-      unsigned instructionCacheSize,
-      std::list<unsigned> const &dataCacheBufferSize, unsigned ramBufferSize,
-      unsigned thread, unsigned numberOfLines, bool dumpRegisters,
-      bool errorDetection) override;
-  std::list<std::string> getAvailableInstructions() const override;
-  void init(unsigned long long *memoryAddr,
-            unsigned long long bufferSize) override;
+  AVXPayload()
+      : X86Payload(/*FeatureRequests=*/{asmjit::CpuFeatures::X86::kAVX}, /*Name=*/"AVX", /*RegisterSize=*/4,
+                   /*RegisterCount=*/16,
+                   /*InstructionFlops=*/
+                   {{"REG", 4},
+                    {"L1_L", 4},
+                    {"L1_S", 4},
+                    {"L1_LS", 4},
+                    {"L2_L", 4},
+                    {"L2_S", 4},
+                    {"L2_LS", 4},
+                    {"L3_L", 4},
+                    {"L3_S", 4},
+                    {"L3_LS", 4},
+                    {"L3_P", 4},
+                    {"RAM_L", 4},
+                    {"RAM_S", 4},
+                    {"RAM_LS", 4},
+                    {"RAM_P", 4}},
+                   /*InstructionMemory=*/{{"RAM_L", 64}, {"RAM_S", 128}, {"RAM_LS", 128}, {"RAM_P", 64}}) {}
 
-  firestarter::environment::payload::Payload *clone() const override {
-    return new AVXPayload(this->supportedFeatures());
-  };
+  /// Compile this payload with supplied settings and optional features.
+  /// \arg Settings The settings for this payload e.g., the number of lines or the size of the caches.
+  /// \arg DumpRegisters Should the code to support dumping registers be baked into the high load routine of the
+  /// compiled payload.
+  /// \arg ErrorDetection Should the code to support error detection between threads be baked into the high load routine
+  /// of the compiled payload.
+  /// \returns The compiled payload that provides access to the init and load functions.
+  [[nodiscard]] auto compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters,
+                                    bool ErrorDetection) const
+      -> environment::payload::CompiledPayload::UniquePtr override;
 
 private:
-  const std::map<std::string, unsigned> instructionFlops = {
-      {"REG", 4},  {"L1_L", 4},  {"L1_S", 4},  {"L1_LS", 4},  {"L2_L", 4},
-      {"L2_S", 4}, {"L2_LS", 4}, {"L3_L", 4},  {"L3_S", 4},   {"L3_LS", 4},
-      {"L3_P", 4}, {"RAM_L", 4}, {"RAM_S", 4}, {"RAM_LS", 4}, {"RAM_P", 4}};
-
-  const std::map<std::string, unsigned> instructionMemory = {
-      {"RAM_L", 64}, {"RAM_S", 128}, {"RAM_LS", 128}, {"RAM_P", 64}};
+  /// Function to initialize the memory used by the high load function.
+  /// \arg MemoryAddr The pointer to the memory.
+  /// \arg BufferSize The number of doubles that is allocated in MemoryAddr.
+  void init(double* MemoryAddr, uint64_t BufferSize) const override;
 };
 } // namespace firestarter::environment::x86::payload
diff --git a/include/firestarter/Environment/X86/Payload/CompiledX86Payload.hpp b/include/firestarter/Environment/X86/Payload/CompiledX86Payload.hpp
new file mode 100644
index 00000000..776f83f4
--- /dev/null
+++ b/include/firestarter/Environment/X86/Payload/CompiledX86Payload.hpp
@@ -0,0 +1,81 @@
+/******************************************************************************
+ * FIRESTARTER - A Processor Stress Test Utility
+ * Copyright (C) 2024 TU Dresden, Center for Information Services and High
+ * Performance Computing
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/\>.
+ *
+ * Contact: daniel.hackenberg@tu-dresden.de
+ *****************************************************************************/
+
+#pragma once
+
+#include "firestarter/Environment/Payload/CompiledPayload.hpp"
+#include "firestarter/Logging/Log.hpp"
+
+#include <asmjit/asmjit.h>
+#include <memory>
+
+namespace firestarter::environment::x86::payload {
+
+/// This class provides the functionality to compile a payload created with asmjit and create a unique pointer to the
+/// CompiledPayload class which can be used to execute the functions of this payload.
+class CompiledX86Payload final : public environment::payload::CompiledPayload {
+private:
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+  inline static asmjit::JitRuntime Runtime = asmjit::JitRuntime();
+
+  /// Custom deleter that releases the high load function from the asmjit runtime and then destroys the payload.
+  /// \arg Payload The pointer to this class. The object it points to is deleted by this function.
+  static void deleter(CompiledX86Payload* Payload) {
+    if (Payload && Payload->highLoadFunctionPtr()) {
+      Runtime.release(Payload->highLoadFunctionPtr());
+    }
+    delete Payload; // a custom deleter replaces the default delete, so the object must be destroyed here
+  }
+  /// Overload taking the base class pointer. It forwards to the CompiledX86Payload overload of this deleter.
+  static void deleter(CompiledPayload* Payload) { deleter(dynamic_cast<CompiledX86Payload*>(Payload)); }
+
+  /// Wrap the CompiledPayload class and forward all arguments.
+  /// \arg Stats The stats of the high load function from the payload.
+  /// \arg PayloadPtr A unique pointer to the payload class to allow calling the init and low load functions which do
+  /// not change based on different payload settings.
+  /// \arg HighLoadFunction The pointer to the compiled high load function.
+  CompiledX86Payload(const environment::payload::PayloadStats& Stats,
+                     std::unique_ptr<environment::payload::Payload>&& PayloadPtr, HighLoadFunctionPtr HighLoadFunction)
+      : CompiledPayload(Stats, std::move(PayloadPtr), HighLoadFunction) {}
+
+public:
+  CompiledX86Payload() = delete;
+  ~CompiledX86Payload() override = default;
+
+  /// Create a unique pointer to a compiled payload from payload stats and assembly in a code holder.
+  /// \tparam DerivedPayload The payload class from which the CodeHolder with the assembly was created.
+  /// \arg Stats The stats of the payload that is contained in the CodeHolder.
+  /// \arg Code The CodeHolder that contains the assembly instructions making up the payload. This will be added to the
+  /// JitRuntime and a pointer to the function will be provided to the CompiledPayload class.
+  /// \returns The unique pointer to the compiled payload.
+  template <class DerivedPayload>
+  [[nodiscard]] static auto create(environment::payload::PayloadStats Stats, asmjit::CodeHolder& Code) -> UniquePtr {
+    HighLoadFunctionPtr HighLoadFunction{};
+    const auto Err = Runtime.add(&HighLoadFunction, &Code);
+    if (Err) {
+      workerLog::error() << "Asmjit adding Assembler to JitRuntime failed";
+    }
+
+    return {new CompiledX86Payload(Stats, std::move(std::make_unique<DerivedPayload>()), HighLoadFunction), deleter};
+  }
+};
+
+} // namespace firestarter::environment::x86::payload
diff --git a/include/firestarter/Environment/X86/Payload/FMA4Payload.hpp b/include/firestarter/Environment/X86/Payload/FMA4Payload.hpp
index 47d8a778..f0e711f6 100644
--- a/include/firestarter/Environment/X86/Payload/FMA4Payload.hpp
+++ b/include/firestarter/Environment/X86/Payload/FMA4Payload.hpp
@@ -21,39 +21,49 @@
 
 #pragma once
 
-#include <firestarter/Environment/X86/Payload/X86Payload.hpp>
+#include "firestarter/Environment/X86/Payload/X86Payload.hpp"
 
 namespace firestarter::environment::x86::payload {
 
+/// This payload is designed for the FMA4 CPU extension.
 class FMA4Payload final : public X86Payload {
 public:
-  FMA4Payload(asmjit::CpuFeatures const &supportedFeatures)
-      : X86Payload(
-            supportedFeatures,
-            {asmjit::CpuFeatures::X86::kAVX, asmjit::CpuFeatures::X86::kFMA4},
-            "FMA4", 4, 16) {}
+  FMA4Payload() noexcept
+      : X86Payload(/*FeatureRequests=*/{asmjit::CpuFeatures::X86::kAVX, asmjit::CpuFeatures::X86::kFMA4},
+                   /*Name=*/"FMA4", /*RegisterSize=*/4, /*RegisterCount=*/16,
+                   /*InstructionFlops=*/
+                   {{"REG", 8},
+                    {"L1_L", 12},
+                    {"L1_S", 8},
+                    {"L1_LS", 8},
+                    {"L2_L", 8},
+                    {"L2_S", 4},
+                    {"L2_LS", 4},
+                    {"L3_L", 8},
+                    {"L3_S", 4},
+                    {"L3_LS", 4},
+                    {"L3_P", 4},
+                    {"RAM_L", 8},
+                    {"RAM_S", 4},
+                    {"RAM_LS", 4},
+                    {"RAM_P", 4}},
+                   /*InstructionMemory=*/{{"RAM_L", 64}, {"RAM_S", 128}, {"RAM_LS", 128}, {"RAM_P", 64}}) {}
 
-  int compilePayload(
-      std::vector<std::pair<std::string, unsigned>> const &proportion,
-      unsigned instructionCacheSize,
-      std::list<unsigned> const &dataCacheBufferSize, unsigned ramBufferSize,
-      unsigned thread, unsigned numberOfLines, bool dumpRegisters,
-      bool errorDetection) override;
-  std::list<std::string> getAvailableInstructions() const override;
-  void init(unsigned long long *memoryAddr,
-            unsigned long long bufferSize) override;
-
-  firestarter::environment::payload::Payload *clone() const override {
-    return new FMA4Payload(this->supportedFeatures());
-  };
+  /// Compile this payload with supplied settings and optional features.
+  /// \arg Settings The settings for this payload e.g., the number of lines or the size of the caches.
+  /// \arg DumpRegisters Should the code to support dumping registers be baked into the high load routine of the
+  /// compiled payload.
+  /// \arg ErrorDetection Should the code to support error detection between threads be baked into the high load routine
+  /// of the compiled payload.
+  /// \returns The compiled payload that provides access to the init and load functions.
+  [[nodiscard]] auto compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters,
+                                    bool ErrorDetection) const
+      -> environment::payload::CompiledPayload::UniquePtr override;
 
 private:
-  const std::map<std::string, unsigned> instructionFlops = {
-      {"REG", 8},  {"L1_L", 12}, {"L1_S", 8},  {"L1_LS", 8},  {"L2_L", 8},
-      {"L2_S", 4}, {"L2_LS", 4}, {"L3_L", 8},  {"L3_S", 4},   {"L3_LS", 4},
-      {"L3_P", 4}, {"RAM_L", 8}, {"RAM_S", 4}, {"RAM_LS", 4}, {"RAM_P", 4}};
-
-  const std::map<std::string, unsigned> instructionMemory = {
-      {"RAM_L", 64}, {"RAM_S", 128}, {"RAM_LS", 128}, {"RAM_P", 64}};
+  /// Function to initialize the memory used by the high load function.
+  /// \arg MemoryAddr The pointer to the memory.
+  /// \arg BufferSize The number of doubles that is allocated in MemoryAddr.
+  void init(double* MemoryAddr, uint64_t BufferSize) const override;
 };
-} // namespace firestarter::environment::x86::payload
+} // namespace firestarter::environment::x86::payload
\ No newline at end of file
diff --git a/include/firestarter/Environment/X86/Payload/FMAPayload.hpp b/include/firestarter/Environment/X86/Payload/FMAPayload.hpp
index 57ab455d..8280a5b2 100644
--- a/include/firestarter/Environment/X86/Payload/FMAPayload.hpp
+++ b/include/firestarter/Environment/X86/Payload/FMAPayload.hpp
@@ -21,40 +21,39 @@
 
 #pragma once
 
-#include <firestarter/Environment/X86/Payload/X86Payload.hpp>
+#include "firestarter/Environment/X86/Payload/X86Payload.hpp"
 
 namespace firestarter::environment::x86::payload {
+
+/// This payload is designed for the FMA CPU extension.
 class FMAPayload final : public X86Payload {
 public:
-  FMAPayload(asmjit::CpuFeatures const &supportedFeatures)
-      : X86Payload(supportedFeatures,
-                   {asmjit::CpuFeatures::X86::kAVX, asmjit::CpuFeatures::X86::kFMA},
-                   "FMA", 4, 16) {}
-
-  int compilePayload(
-      std::vector<std::pair<std::string, unsigned>> const &proportion,
-      unsigned instructionCacheSize,
-      std::list<unsigned> const &dataCacheBufferSize, unsigned ramBufferSize,
-      unsigned thread, unsigned numberOfLines, bool dumpRegisters,
-      bool errorDetection) override;
-  std::list<std::string> getAvailableInstructions() const override;
-  void init(unsigned long long *memoryAddr,
-            unsigned long long bufferSize) override;
+  FMAPayload() noexcept
+      : X86Payload(/*FeatureRequests=*/{asmjit::CpuFeatures::X86::kAVX, asmjit::CpuFeatures::X86::kFMA}, /*Name=*/"FMA",
+                   /*RegisterSize=*/4, /*RegisterCount=*/16,
+                   /*InstructionFlops=*/{{"REG", 16},  {"L1_L", 16},     {"L1_2L", 16},      {"L1_S", 8},
+                                         {"L1_LS", 8}, {"L1_LS_256", 8}, {"L1_2LS_256", 16}, {"L2_L", 16},
+                                         {"L2_S", 8},  {"L2_LS", 8},     {"L2_LS_256", 8},   {"L2_2LS_256", 16},
+                                         {"L3_L", 16}, {"L3_S", 8},      {"L3_LS", 8},       {"L3_LS_256", 8},
+                                         {"L3_P", 8},  {"RAM_L", 16},    {"RAM_S", 8},       {"RAM_LS", 8},
+                                         {"RAM_P", 8}},
+                   /*InstructionMemory=*/{{"RAM_L", 64}, {"RAM_S", 128}, {"RAM_LS", 128}, {"RAM_P", 64}}) {}
 
-  firestarter::environment::payload::Payload *clone() const override {
-    return new FMAPayload(this->supportedFeatures());
-  };
+  /// Compile this payload with supplied settings and optional features.
+  /// \arg Settings The settings for this payload e.g., the number of lines or the size of the caches.
+  /// \arg DumpRegisters Should the code to support dumping registers be baked into the high load routine of the
+  /// compiled payload.
+  /// \arg ErrorDetection Should the code to support error detection between threads be baked into the high load routine
+  /// of the compiled payload.
+  /// \returns The compiled payload that provides access to the init and load functions.
+  [[nodiscard]] auto compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters,
+                                    bool ErrorDetection) const
+      -> environment::payload::CompiledPayload::UniquePtr override;
 
 private:
-  const std::map<std::string, unsigned> instructionFlops = {
-      {"REG", 16},  {"L1_L", 16},     {"L1_2L", 16},      {"L1_S", 8},
-      {"L1_LS", 8}, {"L1_LS_256", 8}, {"L1_2LS_256", 16}, {"L2_L", 16},
-      {"L2_S", 8},  {"L2_LS", 8},     {"L2_LS_256", 8},   {"L2_2LS_256", 16},
-      {"L3_L", 16}, {"L3_S", 8},      {"L3_LS", 8},       {"L3_LS_256", 8},
-      {"L3_P", 8},  {"RAM_L", 16},    {"RAM_S", 8},       {"RAM_LS", 8},
-      {"RAM_P", 8}};
-
-  const std::map<std::string, unsigned> instructionMemory = {
-      {"RAM_L", 64}, {"RAM_S", 128}, {"RAM_LS", 128}, {"RAM_P", 64}};
+  /// Function to initialize the memory used by the high load function.
+  /// \arg MemoryAddr The pointer to the memory.
+  /// \arg BufferSize The number of doubles that is allocated in MemoryAddr.
+  void init(double* MemoryAddr, uint64_t BufferSize) const override;
 };
 } // namespace firestarter::environment::x86::payload
diff --git a/include/firestarter/Environment/X86/Payload/SSE2Payload.hpp b/include/firestarter/Environment/X86/Payload/SSE2Payload.hpp
index d02a28e9..557af0d4 100644
--- a/include/firestarter/Environment/X86/Payload/SSE2Payload.hpp
+++ b/include/firestarter/Environment/X86/Payload/SSE2Payload.hpp
@@ -21,36 +21,49 @@
 
 #pragma once
 
-#include <firestarter/Environment/X86/Payload/X86Payload.hpp>
+#include "firestarter/Environment/X86/Payload/X86Payload.hpp"
 
 namespace firestarter::environment::x86::payload {
+
+/// This payload is designed for the SSE2 CPU extension.
 class SSE2Payload final : public X86Payload {
 public:
-  SSE2Payload(asmjit::CpuFeatures const &supportedFeatures)
-      : X86Payload(supportedFeatures, {asmjit::CpuFeatures::X86::kSSE2},
-                   "SSE2", 2, 16) {}
-
-  int compilePayload(
-      std::vector<std::pair<std::string, unsigned>> const &proportion,
-      unsigned instructionCacheSize,
-      std::list<unsigned> const &dataCacheBufferSize, unsigned ramBufferSize,
-      unsigned thread, unsigned numberOfLines, bool dumpRegisters,
-      bool errorDetection) override;
-  std::list<std::string> getAvailableInstructions() const override;
-  void init(unsigned long long *memoryAddr,
-            unsigned long long bufferSize) override;
+  SSE2Payload() noexcept
+      : X86Payload(/*FeatureRequests=*/{asmjit::CpuFeatures::X86::kSSE2}, /*Name=*/"SSE2", /*RegisterSize=*/2,
+                   /*RegisterCount=*/16,
+                   /*InstructionFlops=*/
+                   {{"REG", 2},
+                    {"L1_L", 2},
+                    {"L1_S", 2},
+                    {"L1_LS", 2},
+                    {"L2_L", 2},
+                    {"L2_S", 2},
+                    {"L2_LS", 2},
+                    {"L3_L", 2},
+                    {"L3_S", 2},
+                    {"L3_LS", 2},
+                    {"L3_P", 2},
+                    {"RAM_L", 2},
+                    {"RAM_S", 2},
+                    {"RAM_LS", 2},
+                    {"RAM_P", 2}},
+                   /*InstructionMemory=*/{{"RAM_L", 64}, {"RAM_S", 128}, {"RAM_LS", 128}, {"RAM_P", 64}}) {}
 
-  firestarter::environment::payload::Payload *clone() const override {
-    return new SSE2Payload(this->supportedFeatures());
-  };
+  /// Compile this payload with supplied settings and optional features.
+  /// \arg Settings The settings for this payload e.g., the number of lines or the size of the caches.
+  /// \arg DumpRegisters Should the code to support dumping registers be baked into the high load routine of the
+  /// compiled payload.
+  /// \arg ErrorDetection Should the code to support error detection between threads be baked into the high load routine
+  /// of the compiled payload.
+  /// \returns The compiled payload that provides access to the init and load functions.
+  [[nodiscard]] auto compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters,
+                                    bool ErrorDetection) const
+      -> environment::payload::CompiledPayload::UniquePtr override;
 
 private:
-  const std::map<std::string, unsigned> instructionFlops = {
-      {"REG", 2},  {"L1_L", 2},  {"L1_S", 2},  {"L1_LS", 2},  {"L2_L", 2},
-      {"L2_S", 2}, {"L2_LS", 2}, {"L3_L", 2},  {"L3_S", 2},   {"L3_LS", 2},
-      {"L3_P", 2}, {"RAM_L", 2}, {"RAM_S", 2}, {"RAM_LS", 2}, {"RAM_P", 2}};
-
-  const std::map<std::string, unsigned> instructionMemory = {
-      {"RAM_L", 64}, {"RAM_S", 128}, {"RAM_LS", 128}, {"RAM_P", 64}};
+  /// Function to initialize the memory used by the high load function.
+  /// \arg MemoryAddr The pointer to the memory.
+  /// \arg BufferSize The number of doubles that is allocated in MemoryAddr.
+  void init(double* MemoryAddr, uint64_t BufferSize) const override;
 };
 } // namespace firestarter::environment::x86::payload
diff --git a/include/firestarter/Environment/X86/Payload/X86Payload.hpp b/include/firestarter/Environment/X86/Payload/X86Payload.hpp
index c0ebadc5..44d5bd4f 100644
--- a/include/firestarter/Environment/X86/Payload/X86Payload.hpp
+++ b/include/firestarter/Environment/X86/Payload/X86Payload.hpp
@@ -21,82 +21,544 @@
 
 #pragma once
 
-#include <firestarter/Environment/Payload/Payload.hpp>
-#include <firestarter/Logging/Log.hpp>
-
-#include <firestarter/DumpRegisterWorkerData.hpp>
-#include <firestarter/LoadWorkerData.hpp>
+#include "firestarter/Constants.hpp"          // IWYU pragma: keep
+#include "firestarter/DumpRegisterStruct.hpp" // IWYU pragma: keep
+#include "firestarter/Environment/Payload/Payload.hpp"
+#include "firestarter/Environment/X86/X86CPUTopology.hpp"
+#include "firestarter/LoadWorkerMemory.hpp"
+#include "firestarter/Logging/Log.hpp" // IWYU pragma: keep
 
 #include <asmjit/x86.h>
+#include <cassert>
+#include <cstdint>
+#include <map> // IWYU pragma: keep
+#include <type_traits>
+#include <utility>
 
-#define INIT_BLOCKSIZE 1024
+constexpr const auto InitBlocksize = 1024;
 
+/// This abstract class models a payload that can be compiled with settings and executed for X86 CPUs.
 namespace firestarter::environment::x86::payload {
 
 class X86Payload : public environment::payload::Payload {
 private:
-  // we can use this to check, if our platform support this payload
-  asmjit::CpuFeatures const &_supportedFeatures;
-  std::list<asmjit::CpuFeatures::X86::Id> featureRequests;
+  /// This list contains the features (CPU extensions) that are required to execute the payload.
+  std::list<asmjit::CpuFeatures::X86::Id> FeatureRequests;
 
-protected:
-  //  asmjit::CodeHolder code;
-  asmjit::JitRuntime rt;
-  // typedef int (*LoadFunction)(firestarter::ThreadData *);
-  typedef unsigned long long (*LoadFunction)(unsigned long long *,
-                                             volatile unsigned long long *,
-                                             unsigned long long);
-  LoadFunction loadFunction = nullptr;
-
-  asmjit::CpuFeatures const &supportedFeatures() const {
-    return this->_supportedFeatures;
-  }
+  /// The mapping from instructions to the number of flops per instruction. This map is required to have an entry for
+  /// every instruction.
+  std::map<std::string, unsigned> InstructionFlops;
 
-  template <class IterReg, class VectorReg>
-  void emitErrorDetectionCode(asmjit::x86::Builder &cb, IterReg iter_reg,
-                              asmjit::x86::Gpq addrHigh_reg,
-                              asmjit::x86::Gpq pointer_reg,
-                              asmjit::x86::Gpq temp_reg,
-                              asmjit::x86::Gpq temp_reg2);
+  /// The mapping from instructions to the size of main memory accesses per instruction. This map is not required to
+  /// contain all instructions.
+  std::map<std::string, unsigned> InstructionMemory;
 
 public:
-  X86Payload(asmjit::CpuFeatures const &supportedFeatures,
-             std::initializer_list<asmjit::CpuFeatures::X86::Id> featureRequests,
-             std::string name, unsigned registerSize, unsigned registerCount)
-      : Payload(name, registerSize, registerCount),
-        _supportedFeatures(supportedFeatures),
-        featureRequests(featureRequests) {}
-
-  bool isAvailable() const override {
-    bool available = true;
-
-    for (auto const &feature : featureRequests) {
-      available &= this->_supportedFeatures.has(feature);
+  /// Abstract constructor for a payload on X86 CPUs.
+  /// \arg FeatureRequests The list of features (CPU extensions) that are required to execute the payload.
+  /// \arg Name The name of this payload. It is usually named by the CPU extension this payload uses e.g., SSE2 or FMA.
+  /// \arg RegisterSize The size of the SIMD registers in units of doubles (8B).
+  /// \arg RegisterCount The number of SIMD registers used by the payload.
+  /// \arg InstructionFlops The mapping from instructions to the number of flops per instruction. This map is required
+  /// to have an entry for every instruction.
+  /// \arg InstructionMemory The mapping from instructions to the size of main memory accesses per instruction. This
+  /// map is not required to contain all instructions.
+  X86Payload(std::initializer_list<asmjit::CpuFeatures::X86::Id> FeatureRequests, std::string Name,
+             unsigned RegisterSize, unsigned RegisterCount, std::map<std::string, unsigned>&& InstructionFlops,
+             std::map<std::string, unsigned>&& InstructionMemory) noexcept
+      : Payload(std::move(Name), RegisterSize, RegisterCount)
+      , FeatureRequests(FeatureRequests)
+      , InstructionFlops(std::move(InstructionFlops))
+      , InstructionMemory(std::move(InstructionMemory)) {}
+
+private:
+  /// Check if this payload is available on the current system. This is equivalent to checking if the supplied Topology
+  /// contains all features that are in FeatureRequests.
+  /// \arg Topology The CPUTopology that is used to check whether this payload is supported.
+  /// \returns true if the payload is supported on the given CPUTopology.
+  [[nodiscard]] auto isAvailable(const CPUTopology& Topology) const -> bool final {
+    const auto* FinalTopology = dynamic_cast<const X86CPUTopology*>(&Topology);
+    assert(FinalTopology && "isAvailable not called with const X86CPUTopology*");
+
+    bool Available = true;
+
+    for (auto const& Feature : FeatureRequests) {
+      Available &= FinalTopology->featuresAsmjit().has(Feature);
     }
 
-    return available;
+    return Available;
   };
 
-    // A generic implemenation for all x86 payloads
-#if defined(__clang__)
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Woverloaded-virtual"
-#endif
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Woverloaded-virtual"
-  void init(unsigned long long *memoryAddr, unsigned long long bufferSize,
-            double firstValue, double lastValue);
-#pragma GCC diagnostic pop
-#if defined(__clang__)
-#pragma clang diagnostic pop
-#endif
-  // use cpuid and usleep as low load
-  void lowLoadFunction(volatile unsigned long long *addrHigh,
-                       unsigned long long period) override;
-
-  unsigned long long highLoadFunction(unsigned long long *addrMem,
-                                      volatile unsigned long long *addrHigh,
-                                      unsigned long long iterations) override;
+protected:
+  /// Emit the code to dump the xmm, ymm or zmm registers into memory for the dump registers feature.
+  /// \tparam Vec the type of the vector register used.
+  /// \arg Cb The asmjit code builder that is used to emit the assembler code.
+  /// \arg PointerReg the register containing the pointer into memory in LoadWorkerMemory that is used in the high-load
+  /// routine.
+  /// \arg VecPtr The function that is used to create a ptr to the vector register
+  template <class Vec>
+  void emitDumpRegisterCode(asmjit::x86::Builder& Cb, const asmjit::x86::Gpq& PointerReg,
+                            asmjit::x86::Mem (*VecPtr)(const asmjit::x86::Gp&, int32_t)) const {
+    constexpr const auto DumpRegisterStructRegisterValuesTopOffset =
+        -static_cast<int32_t>(LoadWorkerMemory::getMemoryOffset()) +
+        static_cast<int32_t>(offsetof(LoadWorkerMemory, ExtraVars.Drs.Padding));
+    constexpr const auto DumpRegisterStructDumpVariableOffset =
+        -static_cast<int32_t>(LoadWorkerMemory::getMemoryOffset()) +
+        static_cast<int32_t>(offsetof(LoadWorkerMemory, ExtraVars.Drs.DumpVar));
+
+    auto SkipRegistersDump = Cb.newLabel();
+
+    Cb.test(ptr_64(PointerReg, DumpRegisterStructDumpVariableOffset), asmjit::Imm(firestarter::DumpVariable::Wait));
+    Cb.jnz(SkipRegistersDump);
+
+    // dump all the vector registers
+    for (unsigned I = 0; I < registerCount(); I++) {
+      Cb.vmovapd(VecPtr(PointerReg,
+                        DumpRegisterStructRegisterValuesTopOffset - static_cast<int32_t>(registerSize() * 8 * (I + 1))),
+                 Vec(I));
+    }
+
+    // set read flag
+    Cb.mov(ptr_64(PointerReg, DumpRegisterStructDumpVariableOffset), asmjit::Imm(firestarter::DumpVariable::Wait));
+
+    Cb.bind(SkipRegistersDump);
+  }
+
+  /// Emit the code to detect errors between this and two other threads that execute the same payload concurrently. We
+  /// backup the registers in Mm2...Mm7. We will check every 0x4000 iterations. If the check did not succeed we write
+  /// the LoadThreadWorkType::LoadStop flag in the AddrHighReg and therefore abort as soon as we pass the check in the
+  /// high-load routine.
+  /// \tparam MaybeConstIterRegT The type of the iteration register. If this is Mm, we assume that Mm0 is used by the
+  /// payload and the other Mm1...Mm7 are free to use. If they are free we will use them to backup rax, rbx, rcx, rdx,
+  /// r8 and r9. Otherwise we push them on the stack.
+  /// \tparam MaybeConstVectorRegT This is the type of the vector register. It can be either Xmm, Ymm or Zmm. In case of
+  /// Xmm we backup xmm0 on the stack, in case of Ymm we backup ymm0 in Mm4...Mm7 and in case of Zmm we use zmm31 for
+  /// the backup. This register may not be used in the payload.
+  /// \arg Cb The asmjit code builder that is used to emit the assembler code.
+  /// \arg IterReg The register that holds the iteration counter of the high-load loop.
+  /// \arg AddrHighReg The register contains the pointer to the memory address where the LoadThreadWorkType is saved.
+  /// \arg PointerReg The register contains the pointer into memory in LoadWorkerMemory that is used in the high-load
+  /// routine.
+  /// \arg TempReg The first register that can be used to store temporary values.
+  /// \arg TempReg2 The second register that can be used to store temporary values.
+  template <class MaybeConstIterRegT, class MaybeConstVectorRegT>
+  void emitErrorDetectionCode(asmjit::x86::Builder& Cb, MaybeConstIterRegT& IterReg,
+                              const asmjit::x86::Gpq& AddrHighReg, const asmjit::x86::Gpq& PointerReg,
+                              const asmjit::x86::Gpq& TempReg, const asmjit::x86::Gpq& TempReg2) const {
+    using IterRegT = std::remove_const_t<MaybeConstIterRegT>;
+    using VectorRegT = std::remove_const_t<MaybeConstVectorRegT>;
+
+    // we don't want anything to break... so we use asserts for everything that
+    // could break it
+    static_assert(std::is_base_of_v<asmjit::x86::Vec, VectorRegT>, "VectorReg must be of asmjit::asmjit::x86::Vec");
+    static_assert(std::is_same_v<asmjit::x86::Xmm, VectorRegT> || std::is_same_v<asmjit::x86::Ymm, VectorRegT> ||
+                      std::is_same_v<asmjit::x86::Zmm, VectorRegT>,
+                  "VectorReg ist not of any supported type");
+    static_assert(std::is_same_v<asmjit::x86::Mm, IterRegT> || std::is_same_v<asmjit::x86::Gpq, IterRegT>,
+                  "IterReg is not of any supported type");
+
+    if constexpr (std::is_same_v<asmjit::x86::Mm, IterRegT>) {
+      assert(IterReg == asmjit::x86::mm0 && "iter_reg must be mm0");
+    }
+
+    assert(IterReg != TempReg && "iter_reg must be != temp_reg");
+    assert(TempReg != TempReg2 && "temp_reg must be != temp_reg2");
+    assert(TempReg != AddrHighReg && "temp_reg must be != addrHigh_reg");
+    assert(TempReg != PointerReg && "temp_reg must be != pointer_reg");
+
+    assert(IterReg != asmjit::x86::r8 && "iter_reg must be != r8");
+    assert(IterReg != asmjit::x86::r9 && "iter_reg must be != r9");
+    assert(IterReg != asmjit::x86::rax && "iter_reg must be != rax");
+    assert(IterReg != asmjit::x86::rbx && "iter_reg must be != rbx");
+    assert(IterReg != asmjit::x86::rcx && "iter_reg must be != rcx");
+    assert(IterReg != asmjit::x86::rdx && "iter_reg must be != rdx");
+
+    assert(TempReg != asmjit::x86::r8 && "temp_reg must be != r8");
+    assert(TempReg != asmjit::x86::r9 && "temp_reg must be != r9");
+    assert(TempReg != asmjit::x86::rax && "temp_reg must be != rax");
+    assert(TempReg != asmjit::x86::rbx && "temp_reg must be != rbx");
+    assert(TempReg != asmjit::x86::rcx && "temp_reg must be != rcx");
+    assert(TempReg != asmjit::x86::rdx && "temp_reg must be != rdx");
+
+    assert(TempReg2 != asmjit::x86::r8 && "temp_reg2 must be != r8");
+    assert(TempReg2 != asmjit::x86::r9 && "temp_reg2 must be != r9");
+    assert(TempReg2 != asmjit::x86::rax && "temp_reg2 must be != rax");
+    assert(TempReg2 != asmjit::x86::rbx && "temp_reg2 must be != rbx");
+    assert(TempReg2 != asmjit::x86::rcx && "temp_reg2 must be != rcx");
+    assert(TempReg2 != asmjit::x86::rdx && "temp_reg2 must be != rdx");
+
+    assert(AddrHighReg != asmjit::x86::r8 && "addrHigh_reg must be != r8");
+    assert(AddrHighReg != asmjit::x86::r9 && "addrHigh_reg must be != r9");
+    assert(AddrHighReg != asmjit::x86::rax && "addrHigh_reg must be != rax");
+    assert(AddrHighReg != asmjit::x86::rbx && "addrHigh_reg must be != rbx");
+    assert(AddrHighReg != asmjit::x86::rcx && "addrHigh_reg must be != rcx");
+    assert(AddrHighReg != asmjit::x86::rdx && "addrHigh_reg must be != rdx");
+
+    auto SkipErrorDetection = Cb.newLabel();
+
+    if constexpr (std::is_same_v<asmjit::x86::Mm, IterRegT>) {
+      Cb.movq(TempReg, IterReg);
+    } else {
+      Cb.mov(TempReg, IterReg);
+    }
+    // round about 50-100 Hz
+    // more or less, but this isn't really that relevant
+    Cb.and_(TempReg, asmjit::Imm(0x3fff));
+    Cb.test(TempReg, TempReg);
+    Cb.jnz(SkipErrorDetection);
+
+    Cb.mov(TempReg, asmjit::Imm(0xffffffff));
+
+    auto RegisterCount = registerCount();
+
+    // Create a backup of VectorReg(0)
+    if constexpr (std::is_same_v<asmjit::x86::Xmm, VectorRegT>) {
+      Cb.movq(TempReg2, asmjit::x86::xmm0);
+      Cb.push(TempReg2);
+      Cb.crc32(TempReg, TempReg2);
+      Cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0);
+      Cb.movq(TempReg2, asmjit::x86::xmm0);
+      Cb.push(TempReg2);
+      Cb.crc32(TempReg, TempReg2);
+
+    } else if constexpr (std::is_same_v<asmjit::x86::Ymm, VectorRegT> && std::is_same_v<asmjit::x86::Mm, IterRegT>) {
+      Cb.movq(TempReg2, asmjit::x86::xmm0);
+      Cb.movq(asmjit::x86::Mm(7), TempReg2);
+      Cb.crc32(TempReg, TempReg2);
+      Cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0);
+      Cb.movq(TempReg2, asmjit::x86::xmm0);
+      Cb.movq(asmjit::x86::Mm(6), TempReg2);
+      Cb.crc32(TempReg, TempReg2);
+
+      Cb.vextractf128(asmjit::x86::xmm0, asmjit::x86::ymm0, asmjit::Imm(1));
+
+      Cb.movq(TempReg2, asmjit::x86::xmm0);
+      Cb.movq(asmjit::x86::Mm(5), TempReg2);
+      Cb.crc32(TempReg, TempReg2);
+      Cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0);
+      Cb.movq(TempReg2, asmjit::x86::xmm0);
+      Cb.movq(asmjit::x86::Mm(4), TempReg2);
+      Cb.crc32(TempReg, TempReg2);
+    } else if constexpr (std::is_same_v<asmjit::x86::Zmm, VectorRegT> && std::is_same_v<asmjit::x86::Mm, IterRegT>) {
+      // We use vector registers zmm31 for our backup
+      Cb.vmovapd(asmjit::x86::zmm31, asmjit::x86::zmm0);
+      RegisterCount--;
+    }
+
+    // Calculate the hash of the remaining VectorReg
+    // use VectorReg(0) as a temporary place to unpack values
+    for (unsigned I = 1; I < RegisterCount; I++) {
+      if constexpr (std::is_same_v<asmjit::x86::Xmm, VectorRegT>) {
+        Cb.vmovapd(asmjit::x86::xmm0, asmjit::x86::Xmm(I));
+
+        Cb.movq(TempReg2, asmjit::x86::xmm0);
+        Cb.crc32(TempReg, TempReg2);
+        Cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0);
+        Cb.movq(TempReg2, asmjit::x86::xmm0);
+        Cb.crc32(TempReg, TempReg2);
+      } else if constexpr (std::is_same_v<asmjit::x86::Ymm, VectorRegT>) {
+        Cb.vmovapd(asmjit::x86::ymm0, asmjit::x86::Ymm(I));
+
+        Cb.movq(TempReg2, asmjit::x86::xmm0);
+        Cb.crc32(TempReg, TempReg2);
+        Cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0);
+        Cb.movq(TempReg2, asmjit::x86::xmm0);
+        Cb.crc32(TempReg, TempReg2);
+
+        Cb.vextractf128(asmjit::x86::xmm0, asmjit::x86::ymm0, asmjit::Imm(1));
+
+        Cb.movq(TempReg2, asmjit::x86::xmm0);
+        Cb.crc32(TempReg, TempReg2);
+        Cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0);
+        Cb.movq(TempReg2, asmjit::x86::xmm0);
+        Cb.crc32(TempReg, TempReg2);
+      } else if constexpr (std::is_same_v<asmjit::x86::Zmm, VectorRegT>) {
+        Cb.vmovapd(asmjit::x86::ymm0, asmjit::x86::Ymm(I));
+
+        Cb.movq(TempReg2, asmjit::x86::xmm0);
+        Cb.crc32(TempReg, TempReg2);
+        Cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0);
+        Cb.movq(TempReg2, asmjit::x86::xmm0);
+        Cb.crc32(TempReg, TempReg2);
+
+        Cb.vextractf128(asmjit::x86::xmm0, asmjit::x86::ymm0, asmjit::Imm(1));
+
+        Cb.movq(TempReg2, asmjit::x86::xmm0);
+        Cb.crc32(TempReg, TempReg2);
+        Cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0);
+        Cb.movq(TempReg2, asmjit::x86::xmm0);
+        Cb.crc32(TempReg, TempReg2);
+
+        Cb.vextractf32x4(asmjit::x86::xmm0, asmjit::x86::Zmm(I), asmjit::Imm(2));
+
+        Cb.movq(TempReg2, asmjit::x86::xmm0);
+        Cb.crc32(TempReg, TempReg2);
+        Cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0);
+        Cb.movq(TempReg2, asmjit::x86::xmm0);
+        Cb.crc32(TempReg, TempReg2);
+
+        Cb.vextractf32x4(asmjit::x86::xmm0, asmjit::x86::Zmm(I), asmjit::Imm(3));
+
+        Cb.movq(TempReg2, asmjit::x86::xmm0);
+        Cb.crc32(TempReg, TempReg2);
+        Cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0);
+        Cb.movq(TempReg2, asmjit::x86::xmm0);
+        Cb.crc32(TempReg, TempReg2);
+      }
+    }
+
+    // Restore VectorReg(0) from backup
+    if constexpr (std::is_same_v<asmjit::x86::Xmm, VectorRegT>) {
+      Cb.pop(TempReg2);
+      Cb.movq(asmjit::x86::xmm0, TempReg2);
+      Cb.movlhps(asmjit::x86::xmm0, asmjit::x86::xmm0);
+      Cb.pop(TempReg2);
+      Cb.pinsrw(asmjit::x86::xmm0, TempReg2.r32(), asmjit::Imm(0));
+      Cb.shr(TempReg2, asmjit::Imm(32));
+      Cb.movd(TempReg2.r32(), asmjit::x86::Mm(7));
+      Cb.pinsrw(asmjit::x86::xmm0, TempReg2.r32(), asmjit::Imm(1));
+    } else if constexpr (std::is_same_v<asmjit::x86::Ymm, VectorRegT> && std::is_same_v<asmjit::x86::Mm, IterRegT>) {
+      Cb.movq(TempReg2, asmjit::x86::Mm(5));
+      Cb.movq(asmjit::x86::xmm0, TempReg2);
+      Cb.movq(TempReg2, asmjit::x86::Mm(4));
+      Cb.pinsrq(asmjit::x86::xmm0, TempReg2, asmjit::Imm(1));
+
+      Cb.vinsertf128(asmjit::x86::ymm0, asmjit::x86::ymm0, asmjit::x86::xmm0, asmjit::Imm(1));
+
+      Cb.movq(TempReg2, asmjit::x86::Mm(7));
+      Cb.movq(asmjit::x86::xmm0, TempReg2);
+      Cb.movq(TempReg2, asmjit::x86::Mm(6));
+      Cb.pinsrq(asmjit::x86::xmm0, TempReg2, asmjit::Imm(1));
+    } else if constexpr (std::is_same_v<asmjit::x86::Zmm, VectorRegT> && std::is_same_v<asmjit::x86::Mm, IterRegT>) {
+      // We use vector registers zmm31 for our backup
+      Cb.vmovapd(asmjit::x86::zmm0, asmjit::x86::zmm31);
+    }
+
+    // before starting the communication, backup r8, r9, rax, rbx, rcx and rdx
+    if constexpr (std::is_same_v<asmjit::x86::Mm, IterRegT>) {
+      Cb.movq(asmjit::x86::Mm(7), asmjit::x86::rax);
+      Cb.movq(asmjit::x86::Mm(6), asmjit::x86::rbx);
+      Cb.movq(asmjit::x86::Mm(5), asmjit::x86::rcx);
+      Cb.movq(asmjit::x86::Mm(4), asmjit::x86::rdx);
+      Cb.movq(asmjit::x86::Mm(3), asmjit::x86::r8);
+      Cb.movq(asmjit::x86::Mm(2), asmjit::x86::r9);
+    } else {
+      Cb.push(asmjit::x86::rax);
+      Cb.push(asmjit::x86::rbx);
+      Cb.push(asmjit::x86::rcx);
+      Cb.push(asmjit::x86::rdx);
+      Cb.push(asmjit::x86::r8);
+      Cb.push(asmjit::x86::r9);
+    }
+
+    // do the actual communication
+    // temp_reg contains our hash
+
+    // save the pointer_reg. it might be any of r8, r9, rax, rbx, rcx or rdx
+    Cb.mov(TempReg2, PointerReg);
+
+    // Don't touch me!
+    // This synchronization and communication works even if the threads run at
+    // different (changing) speeds, with just one "lock cmpxchg16b". Brought to you
+    // by a few hours of headache for two people.
+    auto Communication = [&](const int32_t ErrorDetetectionStructOffset) {
+      const auto CommunicationOffset =
+          ErrorDetetectionStructOffset + static_cast<int32_t>(offsetof(ErrorDetectionStruct::OneSide, Communication));
+      const auto Local0Offset =
+          ErrorDetetectionStructOffset + static_cast<int32_t>(offsetof(ErrorDetectionStruct::OneSide, Locals[0]));
+      const auto Local1Offset =
+          ErrorDetetectionStructOffset + static_cast<int32_t>(offsetof(ErrorDetectionStruct::OneSide, Locals[1]));
+      const auto Local2Offset =
+          ErrorDetetectionStructOffset + static_cast<int32_t>(offsetof(ErrorDetectionStruct::OneSide, Locals[2]));
+      const auto Local3Offset =
+          ErrorDetetectionStructOffset + static_cast<int32_t>(offsetof(ErrorDetectionStruct::OneSide, Locals[3]));
+      const auto ErrorOffset =
+          ErrorDetetectionStructOffset + static_cast<int32_t>(offsetof(ErrorDetectionStruct::OneSide, Error));
+
+      // communication
+      Cb.mov(asmjit::x86::r8, asmjit::x86::ptr_64(TempReg2, CommunicationOffset));
+
+      // temp data
+      Cb.mov(asmjit::x86::r9, TempReg2);
+
+      Cb.mov(asmjit::x86::rdx, asmjit::x86::ptr_64(asmjit::x86::r9, Local0Offset));
+      Cb.mov(asmjit::x86::rax, asmjit::x86::ptr_64(asmjit::x86::r9, Local1Offset));
+
+      auto L0 = Cb.newLabel();
+      Cb.bind(L0);
+
+      // Atomically compare the data in the communication with the local data.
+      Cb.lock();
+      Cb.cmpxchg16b(asmjit::x86::ptr(asmjit::x86::r8));
+
+      auto L1 = Cb.newLabel();
+      Cb.jnz(L1);
+
+      // Communication had the same data as saved in locals 0 and 1. rcx, rbx saved in communication.
+      // Save written data rcx, rbx in locals 0 and 1.
+      Cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, Local0Offset), asmjit::x86::rcx);
+      Cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, Local1Offset), asmjit::x86::rbx);
+      Cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, Local2Offset), asmjit::Imm(0));
+      Cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, Local3Offset), asmjit::Imm(0));
+
+      Cb.mov(asmjit::x86::rax, asmjit::Imm(2));
+
+      auto L6 = Cb.newLabel();
+      Cb.jmp(L6);
+
+      Cb.bind(L1);
+
+      // Communication had different data than saved in locals 0 and 1. rdx, rax contain the data in communication.
+      // Compare the iteration counter of this and the other thread
+      Cb.cmp(asmjit::x86::rcx, asmjit::x86::rdx);
+
+      auto L2 = Cb.newLabel();
+      Cb.jle(L2);
+
+      // The current iteration counter is bigger than the counter of the other thread.
+      // Save the current counter and hash into our local storage.
+      Cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, Local0Offset), asmjit::x86::rcx);
+      Cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, Local1Offset), asmjit::x86::rbx);
+
+      // Repeat the lock cmpxchg16b routine until the other thread catches up.
+      Cb.jmp(L0);
+
+      Cb.bind(L2);
+
+      // The current iteration counter is less than or equal to the iteration counter of the other thread.
+
+      auto L3 = Cb.newLabel();
+
+      // Check if the read value from the other thread is saved locally.
+      Cb.cmp(asmjit::x86::ptr_64(asmjit::x86::r9, Local2Offset), asmjit::Imm(0));
+      Cb.jne(L3);
+      Cb.cmp(asmjit::x86::ptr_64(asmjit::x86::r9, Local3Offset), asmjit::Imm(0));
+      Cb.jne(L3);
+
+      // Save the last read value from the other thread into the local storage.
+      Cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, Local2Offset), asmjit::x86::rdx);
+      Cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, Local3Offset), asmjit::x86::rax);
+
+      Cb.bind(L3);
+
+      // Check if the id of the two threads are equal
+      Cb.cmp(asmjit::x86::rcx, asmjit::x86::ptr_64(asmjit::x86::r9, Local2Offset));
+      Cb.mov(asmjit::x86::rax, asmjit::Imm(4));
+      // If the iteration counter of this thread is smaller, skip this check. The other thread will wait for this one.
+      Cb.jne(L6);
+
+      // Compare the hashes and write the result
+      Cb.cmp(asmjit::x86::rbx, asmjit::x86::ptr_64(asmjit::x86::r9, Local3Offset));
+      auto L4 = Cb.newLabel();
+      Cb.jne(L4);
+
+      // Hash check succeeded.
+      Cb.mov(asmjit::x86::rax, asmjit::Imm(0));
+
+      auto L5 = Cb.newLabel();
+      Cb.jmp(L5);
+
+      Cb.bind(L4);
+
+      // Hash check failed
+      Cb.mov(asmjit::x86::rax, asmjit::Imm(1));
+
+      Cb.bind(L5);
+
+      Cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, Local2Offset), asmjit::Imm(0));
+      Cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, Local3Offset), asmjit::Imm(0));
+
+      Cb.bind(L6);
+
+      // if check failed
+      Cb.cmp(asmjit::x86::rax, asmjit::Imm(1));
+      auto L7 = Cb.newLabel();
+      Cb.jne(L7);
+
+      // write the error flag
+      Cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, ErrorOffset), asmjit::Imm(1));
+
+      // stop the execution after some time
+      Cb.mov(asmjit::x86::ptr_64(AddrHighReg), asmjit::Imm(LoadThreadWorkType::LoadStop));
+      Cb.mfence();
+
+      Cb.bind(L7);
+
+      auto L9 = Cb.newLabel();
+      Cb.jmp(L9);
+    };
+
+    constexpr const auto ErrorDetectionStructCommunicationLeftOffset =
+        -static_cast<int32_t>(LoadWorkerMemory::getMemoryOffset()) +
+        static_cast<int32_t>(offsetof(LoadWorkerMemory, ExtraVars.Eds.Left.Communication));
+    constexpr const auto ErrorDetectionStructCommunicationRightOffset =
+        -static_cast<int32_t>(LoadWorkerMemory::getMemoryOffset()) +
+        static_cast<int32_t>(offsetof(LoadWorkerMemory, ExtraVars.Eds.Right.Communication));
+
+    // left communication
+    // move hash
+    Cb.mov(asmjit::x86::rbx, TempReg);
+    // move iterations counter
+    if constexpr (std::is_same_v<asmjit::x86::Mm, IterRegT>) {
+      Cb.movq(asmjit::x86::rcx, IterReg);
+    } else {
+      Cb.mov(asmjit::x86::rcx, IterReg);
+    }
+
+    Communication(ErrorDetectionStructCommunicationLeftOffset);
+
+    // right communication
+    // move hash
+    Cb.mov(asmjit::x86::rbx, TempReg);
+    // move iterations counter
+    if constexpr (std::is_same_v<asmjit::x86::Mm, IterRegT>) {
+      Cb.movq(asmjit::x86::rcx, IterReg);
+    } else {
+      Cb.mov(asmjit::x86::rcx, IterReg);
+    }
+
+    Communication(ErrorDetectionStructCommunicationRightOffset);
+
+    // restore r8, r9, rax, rbx, rcx and rdx
+    if constexpr (std::is_same_v<asmjit::x86::Mm, IterRegT>) {
+      Cb.movq(asmjit::x86::rax, asmjit::x86::Mm(7));
+      Cb.movq(asmjit::x86::rbx, asmjit::x86::Mm(6));
+      Cb.movq(asmjit::x86::rcx, asmjit::x86::Mm(5));
+      Cb.movq(asmjit::x86::rdx, asmjit::x86::Mm(4));
+      Cb.movq(asmjit::x86::r8, asmjit::x86::Mm(3));
+      Cb.movq(asmjit::x86::r9, asmjit::x86::Mm(2));
+    } else {
+      Cb.pop(asmjit::x86::r9);
+      Cb.pop(asmjit::x86::r8);
+      Cb.pop(asmjit::x86::rdx);
+      Cb.pop(asmjit::x86::rcx);
+      Cb.pop(asmjit::x86::rbx);
+      Cb.pop(asmjit::x86::rax);
+    }
+
+    Cb.bind(SkipErrorDetection);
+  }
+
+  static void initMemory(double* MemoryAddr, uint64_t BufferSize, double FirstValue, double LastValue);
+
+  /// Produce a low load on the CPU.
+  /// \arg LoadVar The variable that controls the load. If this variable changes from LoadThreadWorkType::LowLoad to
+  /// something else, this function will return.
+  /// \arg Period The period of the low/high load switching. This function will sleep 1% of the Period and check if the
+  /// LoadVar changed.
+  void lowLoadFunction(volatile LoadThreadWorkType& LoadVar, std::chrono::microseconds Period) const final;
+
+  /// Get the available instruction items that are supported by this payload.
+  /// \returns The available instruction items that are supported by this payload, as a list of strings.
+  [[nodiscard]] auto getAvailableInstructions() const -> std::list<std::string> final;
+
+  /// Get the mapping from instructions to the number of flops per instruction. This map is required to have an entry
+  /// for every instruction. Returns a const reference to the InstructionFlops member.
+  [[nodiscard]] auto instructionFlops() const -> const auto& { return InstructionFlops; }
+
+  /// Get the mapping from instructions to the size of main memory accesses for this instruction. This map is not
+  /// required to contain all instructions. Returns a const reference to the InstructionMemory member.
+  [[nodiscard]] auto instructionMemory() const -> const auto& { return InstructionMemory; }
 };
 
 } // namespace firestarter::environment::x86::payload
diff --git a/include/firestarter/Environment/X86/Payload/ZENFMAPayload.hpp b/include/firestarter/Environment/X86/Payload/ZENFMAPayload.hpp
index a1776f37..5d624725 100644
--- a/include/firestarter/Environment/X86/Payload/ZENFMAPayload.hpp
+++ b/include/firestarter/Environment/X86/Payload/ZENFMAPayload.hpp
@@ -21,35 +21,34 @@
 
 #pragma once
 
-#include <firestarter/Environment/X86/Payload/X86Payload.hpp>
+#include "firestarter/Environment/X86/Payload/X86Payload.hpp"
 
 namespace firestarter::environment::x86::payload {
+
+/// This payload is designed for the FMA CPU extension in combination with the first generation Zen microarchitecture.
 class ZENFMAPayload final : public X86Payload {
 public:
-  ZENFMAPayload(asmjit::CpuFeatures const &supportedFeatures)
-      : X86Payload(
-            supportedFeatures,
-            {asmjit::CpuFeatures::X86::Id::kAVX, asmjit::CpuFeatures::X86::Id::kFMA},
-            "ZENFMA", 4, 16) {}
-
-  int compilePayload(
-      std::vector<std::pair<std::string, unsigned>> const &proportion,
-      unsigned instructionCacheSize,
-      std::list<unsigned> const &dataCacheBufferSize, unsigned ramBufferSize,
-      unsigned thread, unsigned numberOfLines, bool dumpRegisters,
-      bool errorDetection) override;
-  std::list<std::string> getAvailableInstructions() const override;
-  void init(unsigned long long *memoryAddr,
-            unsigned long long bufferSize) override;
+  ZENFMAPayload() noexcept // Forwards ZENFMA feature requests, register layout and instruction tables to X86Payload.
+      : X86Payload(/*FeatureRequests=*/{asmjit::CpuFeatures::X86::Id::kAVX, asmjit::CpuFeatures::X86::Id::kFMA},
+                   /*Name=*/"ZENFMA", /*RegisterSize=*/4, /*RegisterCount=*/16,
+                   /*InstructionFlops=*/{{"REG", 8}, {"L1_LS", 8}, {"L2_L", 8}, {"L3_L", 8}, {"RAM_L", 8}},
+                   /*InstructionMemory=*/{{"RAM_L", 64}}) {}
 
-  firestarter::environment::payload::Payload *clone() const override {
-    return new ZENFMAPayload(this->supportedFeatures());
-  };
+  /// Compile this payload with the supplied settings and optional features.
+  /// \arg Settings The settings for this payload, e.g., the number of lines or the size of the caches.
+  /// \arg DumpRegisters Should the code to support dumping registers be baked into the high load routine of the
+  /// compiled payload.
+  /// \arg ErrorDetection Should the code to support error detection between threads be baked into the high load
+  /// routine of the compiled payload.
+  /// \returns The compiled payload that provides access to the init and load functions.
+  [[nodiscard]] auto compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters,
+                                    bool ErrorDetection) const
+      -> environment::payload::CompiledPayload::UniquePtr override;
 
 private:
-  const std::map<std::string, unsigned> instructionFlops = {
-      {"REG", 8}, {"L1_LS", 8}, {"L2_L", 8}, {"L3_L", 8}, {"RAM_L", 8}};
-
-  const std::map<std::string, unsigned> instructionMemory = {{"RAM_L", 64}};
+  /// Initialize the memory used by the high load function.
+  /// \arg MemoryAddr The pointer to the memory.
+  /// \arg BufferSize The number of doubles that are allocated at MemoryAddr.
+  void init(double* MemoryAddr, uint64_t BufferSize) const override;
 };
 } // namespace firestarter::environment::x86::payload
diff --git a/include/firestarter/Environment/X86/Platform/BulldozerConfig.hpp b/include/firestarter/Environment/X86/Platform/BulldozerConfig.hpp
index 12a922b9..936b3601 100644
--- a/include/firestarter/Environment/X86/Platform/BulldozerConfig.hpp
+++ b/include/firestarter/Environment/X86/Platform/BulldozerConfig.hpp
@@ -21,24 +21,20 @@
 
 #pragma once
 
-#include <firestarter/Environment/X86/Payload/FMA4Payload.hpp>
-#include <firestarter/Environment/X86/Platform/X86PlatformConfig.hpp>
+#include "firestarter/Environment/X86/Payload/FMA4Payload.hpp"
+#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp"
 
 namespace firestarter::environment::x86::platform {
 class BulldozerConfig final : public X86PlatformConfig {
-
 public:
-  BulldozerConfig(asmjit::CpuFeatures const &supportedFeatures,
-                  unsigned family, unsigned model, unsigned threads)
-      : X86PlatformConfig("BLD_OPTERON", 21, {1, 2, 3}, {1}, 0,
-                          {16384, 1048576, 786432}, 104857600, 1536, family,
-                          model, threads,
-                          new payload::FMA4Payload(supportedFeatures)) {}
-
-  std::vector<std::pair<std::string, unsigned>>
-  getDefaultPayloadSettings() const override {
-    return std::vector<std::pair<std::string, unsigned>>(
-        {{"RAM_L", 1}, {"L3_L", 1}, {"L2_LS", 5}, {"L1_L", 90}, {"REG", 45}});
-  }
+  BulldozerConfig() noexcept // Describes BLD_OPTERON (family 21) and pairs it with a shared const FMA4Payload.
+      : X86PlatformConfig(
+            /*Name=*/"BLD_OPTERON", /*Family=*/21, /*Models=*/{1, 2, 3},
+            /*Settings=*/
+            environment::payload::PayloadSettings(
+                /*Threads=*/{1}, /*DataCacheBufferSize=*/{16384, 1048576, 786432}, /*RamBufferSize=*/104857600,
+                /*Lines=*/1536,
+                /*InstructionGroups=*/{{"RAM_L", 1}, {"L3_L", 1}, {"L2_LS", 5}, {"L1_L", 90}, {"REG", 45}}),
+            /*Payload=*/std::make_shared<const payload::FMA4Payload>()) {}
 };
 } // namespace firestarter::environment::x86::platform
diff --git a/include/firestarter/Environment/X86/Platform/HaswellConfig.hpp b/include/firestarter/Environment/X86/Platform/HaswellConfig.hpp
index f079ec18..768d3597 100644
--- a/include/firestarter/Environment/X86/Platform/HaswellConfig.hpp
+++ b/include/firestarter/Environment/X86/Platform/HaswellConfig.hpp
@@ -21,24 +21,20 @@
 
 #pragma once
 
-#include <firestarter/Environment/X86/Payload/FMAPayload.hpp>
-#include <firestarter/Environment/X86/Platform/X86PlatformConfig.hpp>
+#include "firestarter/Environment/X86/Payload/FMAPayload.hpp"
+#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp"
 
 namespace firestarter::environment::x86::platform {
 class HaswellConfig final : public X86PlatformConfig {
-
 public:
-  HaswellConfig(asmjit::CpuFeatures const &supportedFeatures, unsigned family,
-                unsigned model, unsigned threads)
-      : X86PlatformConfig("HSW_COREI", 6, {60, 61, 69, 70, 71}, {1, 2}, 0,
-                          {32768, 262144, 1572864}, 104857600, 1536, family,
-                          model, threads,
-                          new payload::FMAPayload(supportedFeatures)) {}
-
-  std::vector<std::pair<std::string, unsigned>>
-  getDefaultPayloadSettings() const override {
-    return std::vector<std::pair<std::string, unsigned>>(
-        {{"RAM_L", 2}, {"L3_LS", 3}, {"L2_LS", 9}, {"L1_LS", 90}, {"REG", 40}});
-  }
+  HaswellConfig() noexcept // Describes HSW_COREI (family 6) and pairs it with a shared const FMAPayload.
+      : X86PlatformConfig(
+            /*Name=*/"HSW_COREI", /*Family=*/6, /*Models=*/{60, 61, 69, 70, 71},
+            /*Settings=*/
+            environment::payload::PayloadSettings(
+                /*Threads=*/{1, 2}, /*DataCacheBufferSize=*/{32768, 262144, 1572864}, /*RamBufferSize=*/104857600,
+                /*Lines=*/1536,
+                /*InstructionGroups=*/{{"RAM_L", 2}, {"L3_LS", 3}, {"L2_LS", 9}, {"L1_LS", 90}, {"REG", 40}}),
+            /*Payload=*/std::make_shared<const payload::FMAPayload>()) {}
 };
 } // namespace firestarter::environment::x86::platform
diff --git a/include/firestarter/Environment/X86/Platform/HaswellEPConfig.hpp b/include/firestarter/Environment/X86/Platform/HaswellEPConfig.hpp
index df5a1927..23d2518f 100644
--- a/include/firestarter/Environment/X86/Platform/HaswellEPConfig.hpp
+++ b/include/firestarter/Environment/X86/Platform/HaswellEPConfig.hpp
@@ -21,27 +21,20 @@
 
 #pragma once
 
-#include <firestarter/Environment/X86/Payload/FMAPayload.hpp>
-#include <firestarter/Environment/X86/Platform/X86PlatformConfig.hpp>
+#include "firestarter/Environment/X86/Payload/FMAPayload.hpp"
+#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp"
 
 namespace firestarter::environment::x86::platform {
 class HaswellEPConfig final : public X86PlatformConfig {
-
 public:
-  HaswellEPConfig(asmjit::CpuFeatures const &supportedFeatures,
-                  unsigned family, unsigned model, unsigned threads)
-      : X86PlatformConfig("HSW_XEONEP", 6, {63, 79}, {1, 2}, 0,
-                          {32768, 262144, 2621440}, 104857600, 1536, family,
-                          model, threads,
-                          new payload::FMAPayload(supportedFeatures)) {}
-
-  std::vector<std::pair<std::string, unsigned>>
-  getDefaultPayloadSettings() const override {
-    return std::vector<std::pair<std::string, unsigned>>({{"RAM_L", 8},
-                                                          {"L3_LS", 1},
-                                                          {"L2_LS", 29},
-                                                          {"L1_LS", 100},
-                                                          {"REG", 100}});
-  }
+  HaswellEPConfig() noexcept // Describes HSW_XEONEP (family 6) and pairs it with a shared const FMAPayload.
+      : X86PlatformConfig(
+            /*Name=*/"HSW_XEONEP", /*Family=*/6, /*Models=*/{63, 79},
+            /*Settings=*/
+            environment::payload::PayloadSettings(
+                /*Threads=*/{1, 2}, /*DataCacheBufferSize=*/{32768, 262144, 2621440},
+                /*RamBufferSize=*/104857600, /*Lines=*/1536,
+                /*InstructionGroups=*/{{"RAM_L", 8}, {"L3_LS", 1}, {"L2_LS", 29}, {"L1_LS", 100}, {"REG", 100}}),
+            /*Payload=*/std::make_shared<const payload::FMAPayload>()) {}
 };
 } // namespace firestarter::environment::x86::platform
diff --git a/include/firestarter/Environment/X86/Platform/KnightsLandingConfig.hpp b/include/firestarter/Environment/X86/Platform/KnightsLandingConfig.hpp
index de520c56..f849c07b 100644
--- a/include/firestarter/Environment/X86/Platform/KnightsLandingConfig.hpp
+++ b/include/firestarter/Environment/X86/Platform/KnightsLandingConfig.hpp
@@ -21,24 +21,19 @@
 
 #pragma once
 
-#include <firestarter/Environment/X86/Payload/AVX512Payload.hpp>
-#include <firestarter/Environment/X86/Platform/X86PlatformConfig.hpp>
+#include "firestarter/Environment/X86/Payload/AVX512Payload.hpp"
+#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp"
 
 namespace firestarter::environment::x86::platform {
 class KnightsLandingConfig final : public X86PlatformConfig {
-
 public:
-  KnightsLandingConfig(asmjit::CpuFeatures const &supportedFeatures,
-                       unsigned family, unsigned model, unsigned threads)
-      : X86PlatformConfig("KNL_XEONPHI", 6, {87}, {4}, 0,
-                          {32768, 524288, 236279125}, 26214400, 1536, family,
-                          model, threads,
-                          new payload::AVX512Payload(supportedFeatures)) {}
-
-  std::vector<std::pair<std::string, unsigned>>
-  getDefaultPayloadSettings() const override {
-    return std::vector<std::pair<std::string, unsigned>>(
-        {{"RAM_P", 3}, {"L2_S", 8}, {"L1_L", 40}, {"REG", 10}});
-  }
+  KnightsLandingConfig() noexcept // Describes KNL_XEONPHI (family 6) and pairs it with a shared const AVX512Payload.
+      : X86PlatformConfig(/*Name=*/"KNL_XEONPHI", /*Family=*/6, /*Models=*/{87},
+                          /*Settings=*/
+                          environment::payload::PayloadSettings(
+                              /*Threads=*/{4}, /*DataCacheBufferSize=*/{32768, 524288, 236279125},
+                              /*RamBufferSize=*/26214400, /*Lines=*/1536,
+                              /*InstructionGroups=*/{{"RAM_P", 3}, {"L2_S", 8}, {"L1_L", 40}, {"REG", 10}}),
+                          /*Payload=*/std::make_shared<const payload::AVX512Payload>()) {}
 };
 } // namespace firestarter::environment::x86::platform
diff --git a/include/firestarter/Environment/X86/Platform/NaplesConfig.hpp b/include/firestarter/Environment/X86/Platform/NaplesConfig.hpp
index 0ad94682..abef11da 100644
--- a/include/firestarter/Environment/X86/Platform/NaplesConfig.hpp
+++ b/include/firestarter/Environment/X86/Platform/NaplesConfig.hpp
@@ -21,27 +21,20 @@
 
 #pragma once
 
-#include <firestarter/Environment/X86/Payload/ZENFMAPayload.hpp>
-#include <firestarter/Environment/X86/Platform/X86PlatformConfig.hpp>
+#include "firestarter/Environment/X86/Payload/ZENFMAPayload.hpp"
+#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp"
 
 namespace firestarter::environment::x86::platform {
 class NaplesConfig final : public X86PlatformConfig {
-
 public:
-  NaplesConfig(asmjit::CpuFeatures const &supportedFeatures, unsigned family,
-               unsigned model, unsigned threads)
-      : X86PlatformConfig("ZEN_EPYC", 23, {1, 8, 17, 24}, {1, 2}, 0,
-                          {65536, 524288, 2097152}, 104857600, 1536, family,
-                          model, threads,
-                          new payload::ZENFMAPayload(supportedFeatures)) {}
-
-  std::vector<std::pair<std::string, unsigned>>
-  getDefaultPayloadSettings() const override {
-    return std::vector<std::pair<std::string, unsigned>>({{"RAM_L", 3},
-                                                          {"L3_L", 14},
-                                                          {"L2_L", 75},
-                                                          {"L1_LS", 81},
-                                                          {"REG", 100}});
-  }
+  NaplesConfig() noexcept // Describes ZEN_EPYC (family 23) and pairs it with a shared const ZENFMAPayload.
+      : X86PlatformConfig(
+            /*Name=*/"ZEN_EPYC", /*Family=*/23, /*Models=*/{1, 8, 17, 24},
+            /*Settings=*/
+            environment::payload::PayloadSettings(
+                /*Threads=*/{1, 2}, /*DataCacheBufferSize=*/{65536, 524288, 2097152}, /*RamBufferSize=*/104857600,
+                /*Lines=*/1536,
+                /*InstructionGroups=*/{{"RAM_L", 3}, {"L3_L", 14}, {"L2_L", 75}, {"L1_LS", 81}, {"REG", 100}}),
+            /*Payload=*/std::make_shared<const payload::ZENFMAPayload>()) {}
 };
 } // namespace firestarter::environment::x86::platform
diff --git a/include/firestarter/Environment/X86/Platform/NehalemConfig.hpp b/include/firestarter/Environment/X86/Platform/NehalemConfig.hpp
index da7764d4..31374061 100644
--- a/include/firestarter/Environment/X86/Platform/NehalemConfig.hpp
+++ b/include/firestarter/Environment/X86/Platform/NehalemConfig.hpp
@@ -21,24 +21,19 @@
 
 #pragma once
 
-#include <firestarter/Environment/X86/Payload/SSE2Payload.hpp>
-#include <firestarter/Environment/X86/Platform/X86PlatformConfig.hpp>
+#include "firestarter/Environment/X86/Payload/SSE2Payload.hpp"
+#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp"
 
 namespace firestarter::environment::x86::platform {
 class NehalemConfig final : public X86PlatformConfig {
-
 public:
-  NehalemConfig(asmjit::CpuFeatures const &supportedFeatures, unsigned family,
-                unsigned model, unsigned threads)
-      : X86PlatformConfig("NHM_COREI", 6, {30, 37, 23}, {1, 2}, 0,
-                          {32768, 262144, 1572864}, 104857600, 1536, family,
-                          model, threads,
-                          new payload::SSE2Payload(supportedFeatures)) {}
-
-  std::vector<std::pair<std::string, unsigned>>
-  getDefaultPayloadSettings() const override {
-    return std::vector<std::pair<std::string, unsigned>>(
-        {{"RAM_P", 1}, {"L1_LS", 70}, {"REG", 2}});
-  }
+  NehalemConfig() noexcept // Describes NHM_COREI (family 6) and pairs it with a shared const SSE2Payload.
+      : X86PlatformConfig(
+            /*Name=*/"NHM_COREI", /*Family=*/6, /*Models=*/{30, 37, 23},
+            /*Settings=*/
+            environment::payload::PayloadSettings(/*Threads=*/{1, 2}, /*DataCacheBufferSize=*/{32768, 262144, 1572864},
+                                                  /*RamBufferSize=*/104857600, /*Lines=*/1536,
+                                                  /*InstructionGroups=*/{{"RAM_P", 1}, {"L1_LS", 70}, {"REG", 2}}),
+            /*Payload=*/std::make_shared<const payload::SSE2Payload>()) {}
 };
 } // namespace firestarter::environment::x86::platform
diff --git a/include/firestarter/Environment/X86/Platform/NehalemEPConfig.hpp b/include/firestarter/Environment/X86/Platform/NehalemEPConfig.hpp
index 06ac2f64..9a6a08bb 100644
--- a/include/firestarter/Environment/X86/Platform/NehalemEPConfig.hpp
+++ b/include/firestarter/Environment/X86/Platform/NehalemEPConfig.hpp
@@ -21,24 +21,19 @@
 
 #pragma once
 
-#include <firestarter/Environment/X86/Payload/SSE2Payload.hpp>
-#include <firestarter/Environment/X86/Platform/X86PlatformConfig.hpp>
+#include "firestarter/Environment/X86/Payload/SSE2Payload.hpp"
+#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp"
 
 namespace firestarter::environment::x86::platform {
 class NehalemEPConfig final : public X86PlatformConfig {
-
 public:
-  NehalemEPConfig(asmjit::CpuFeatures const &supportedFeatures,
-                  unsigned family, unsigned model, unsigned threads)
-      : X86PlatformConfig("NHM_XEONEP", 6, {26, 44}, {1, 2}, 0,
-                          {32768, 262144, 2097152}, 104857600, 1536, family,
-                          model, threads,
-                          new payload::SSE2Payload(supportedFeatures)) {}
-
-  std::vector<std::pair<std::string, unsigned>>
-  getDefaultPayloadSettings() const override {
-    return std::vector<std::pair<std::string, unsigned>>(
-        {{"RAM_P", 1}, {"L1_LS", 60}, {"REG", 2}});
-  }
+  NehalemEPConfig() noexcept
+      : X86PlatformConfig(/*Name=*/"NHM_XEONEP", /*Family=*/6, /*Models=*/{26, 44},
+                          /*Settings=*/
+                          environment::payload::PayloadSettings(
+                              /*Threads=*/{1, 2}, /*DataCacheBufferSize=*/{32768, 262144, 2097152},
+                              /*RamBufferSize=*/104857600, /*Lines=*/1536,
+                              /*InstructionGroups=*/{{"RAM_P", 1}, {"L1_LS", 60}, {"REG", 2}}),
+                          /*Payload=*/std::make_shared<const payload::SSE2Payload>()) {}
 };
 } // namespace firestarter::environment::x86::platform
diff --git a/include/firestarter/Environment/X86/Platform/RomeConfig.hpp b/include/firestarter/Environment/X86/Platform/RomeConfig.hpp
index f7569bf4..e70161d7 100644
--- a/include/firestarter/Environment/X86/Platform/RomeConfig.hpp
+++ b/include/firestarter/Environment/X86/Platform/RomeConfig.hpp
@@ -21,28 +21,21 @@
 
 #pragma once
 
-#include <firestarter/Environment/X86/Payload/FMAPayload.hpp>
-#include <firestarter/Environment/X86/Platform/X86PlatformConfig.hpp>
+#include "firestarter/Environment/X86/Payload/FMAPayload.hpp"
+#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp"
 
 namespace firestarter::environment::x86::platform {
 class RomeConfig final : public X86PlatformConfig {
-
 public:
-  RomeConfig(asmjit::CpuFeatures const &supportedFeatures, unsigned family,
-             unsigned model, unsigned threads)
-      : X86PlatformConfig("ZEN_2_EPYC", 23, {49}, {1, 2}, 0,
-                          {32768, 524288, 2097152}, 104857600, 1536, family,
-                          model, threads,
-                          new payload::FMAPayload(supportedFeatures)) {}
-
-  std::vector<std::pair<std::string, unsigned>>
-  getDefaultPayloadSettings() const override {
-    return std::vector<std::pair<std::string, unsigned>>({{"RAM_L", 10},
-                                                          {"L3_L", 25},
-                                                          {"L2_L", 91},
-                                                          {"L1_2LS_256", 72},
-                                                          {"L1_LS_256", 82},
-                                                          {"REG", 75}});
-  }
+  RomeConfig() noexcept
+      : X86PlatformConfig(
+            /*Name=*/"ZEN_2_EPYC", /*Family=*/23, /*Models=*/{49},
+            /*Settings=*/
+            environment::payload::PayloadSettings(
+                /*Threads=*/{1, 2}, /*DataCacheBufferSize=*/{32768, 524288, 2097152}, /*RamBufferSize=*/104857600,
+                /*Lines=*/1536,
+                /*InstructionGroups=*/
+                {{"RAM_L", 10}, {"L3_L", 25}, {"L2_L", 91}, {"L1_2LS_256", 72}, {"L1_LS_256", 82}, {"REG", 75}}),
+            /*Payload=*/std::make_shared<const payload::FMAPayload>()) {}
 };
 } // namespace firestarter::environment::x86::platform
diff --git a/include/firestarter/Environment/X86/Platform/SandyBridgeConfig.hpp b/include/firestarter/Environment/X86/Platform/SandyBridgeConfig.hpp
index 7e928c1f..b5c5b1c4 100644
--- a/include/firestarter/Environment/X86/Platform/SandyBridgeConfig.hpp
+++ b/include/firestarter/Environment/X86/Platform/SandyBridgeConfig.hpp
@@ -21,27 +21,20 @@
 
 #pragma once
 
-#include <firestarter/Environment/X86/Payload/AVXPayload.hpp>
-#include <firestarter/Environment/X86/Platform/X86PlatformConfig.hpp>
+#include "firestarter/Environment/X86/Payload/AVXPayload.hpp"
+#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp"
 
 namespace firestarter::environment::x86::platform {
 class SandyBridgeConfig final : public X86PlatformConfig {
-
 public:
-  SandyBridgeConfig(asmjit::CpuFeatures const &supportedFeatures,
-                    unsigned family, unsigned model, unsigned threads)
-      : X86PlatformConfig("SNB_COREI", 6, {42, 58}, {1, 2}, 0,
-                          {32768, 262144, 1572864}, 104857600, 1536, family,
-                          model, threads,
-                          new payload::AVXPayload(supportedFeatures)) {}
-
-  std::vector<std::pair<std::string, unsigned>>
-  getDefaultPayloadSettings() const override {
-    return std::vector<std::pair<std::string, unsigned>>({{"RAM_L", 2},
-                                                          {"L3_LS", 4},
-                                                          {"L2_LS", 10},
-                                                          {"L1_LS", 90},
-                                                          {"REG", 45}});
-  }
+  SandyBridgeConfig() noexcept
+      : X86PlatformConfig(
+            /*Name=*/"SNB_COREI", /*Family=*/6, /*Models=*/{42, 58},
+            /*Settings=*/
+            environment::payload::PayloadSettings(
+                /*Threads=*/{1, 2}, /*DataCacheBufferSize=*/{32768, 262144, 1572864}, /*RamBufferSize=*/104857600,
+                /*Lines=*/1536,
+                /*InstructionGroups=*/{{"RAM_L", 2}, {"L3_LS", 4}, {"L2_LS", 10}, {"L1_LS", 90}, {"REG", 45}}),
+            /*Payload=*/std::make_shared<const payload::AVXPayload>()) {}
 };
 } // namespace firestarter::environment::x86::platform
diff --git a/include/firestarter/Environment/X86/Platform/SandyBridgeEPConfig.hpp b/include/firestarter/Environment/X86/Platform/SandyBridgeEPConfig.hpp
index cb7fcb43..67048ba5 100644
--- a/include/firestarter/Environment/X86/Platform/SandyBridgeEPConfig.hpp
+++ b/include/firestarter/Environment/X86/Platform/SandyBridgeEPConfig.hpp
@@ -19,32 +19,22 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
-#ifndef INCLUDE_FIRESTARTER_ENVIRONMENT_X86_PLATFORM_SANDYBRIDGEEPCONFIG_H
-#define INCLUDE_FIRESTARTER_ENVIRONMENT_X86_PLATFORM_SANDYBRIDGEEPCONFIG_H
+#pragma once
 
-#include <firestarter/Environment/X86/Payload/AVXPayload.hpp>
-#include <firestarter/Environment/X86/Platform/X86PlatformConfig.hpp>
+#include "firestarter/Environment/X86/Payload/AVXPayload.hpp"
+#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp"
 
 namespace firestarter::environment::x86::platform {
 class SandyBridgeEPConfig final : public X86PlatformConfig {
-
 public:
-  SandyBridgeEPConfig(asmjit::CpuFeatures const &supportedFeatures,
-                      unsigned family, unsigned model, unsigned threads)
-      : X86PlatformConfig("SNB_XEONEP", 6, {45, 62}, {1, 2}, 0,
-                          {32768, 262144, 2621440}, 104857600, 1536, family,
-                          model, threads,
-                          new payload::AVXPayload(supportedFeatures)) {}
-
-  std::vector<std::pair<std::string, unsigned>>
-  getDefaultPayloadSettings() const override {
-    return std::vector<std::pair<std::string, unsigned>>({{"RAM_L", 3},
-                                                          {"L3_LS", 2},
-                                                          {"L2_LS", 10},
-                                                          {"L1_LS", 90},
-                                                          {"REG", 30}});
-  }
+  SandyBridgeEPConfig() noexcept
+      : X86PlatformConfig(
+            /*Name=*/"SNB_XEONEP", /*Family=*/6, /*Models=*/{45, 62},
+            /*Settings=*/
+            environment::payload::PayloadSettings(
+                /*Threads=*/{1, 2}, /*DataCacheBufferSize=*/{32768, 262144, 2621440}, /*RamBufferSize=*/104857600,
+                /*Lines=*/1536,
+                /*InstructionGroups=*/{{"RAM_L", 3}, {"L3_LS", 2}, {"L2_LS", 10}, {"L1_LS", 90}, {"REG", 30}}),
+            /*Payload=*/std::make_shared<const payload::AVXPayload>()) {}
 };
-} // namespace firestarter::environment::x86::platform
-
-#endif
+} // namespace firestarter::environment::x86::platform
\ No newline at end of file
diff --git a/include/firestarter/Environment/X86/Platform/SkylakeConfig.hpp b/include/firestarter/Environment/X86/Platform/SkylakeConfig.hpp
index aec85be8..8a109d11 100644
--- a/include/firestarter/Environment/X86/Platform/SkylakeConfig.hpp
+++ b/include/firestarter/Environment/X86/Platform/SkylakeConfig.hpp
@@ -19,32 +19,22 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
-#ifndef INCLUDE_FIRESTARTER_ENVIRONMENT_X86_PLATFORM_SKYLAKECONFIG_H
-#define INCLUDE_FIRESTARTER_ENVIRONMENT_X86_PLATFORM_SKYLAKECONFIG_H
+#pragma once
 
-#include <firestarter/Environment/X86/Payload/FMAPayload.hpp>
-#include <firestarter/Environment/X86/Platform/X86PlatformConfig.hpp>
+#include "firestarter/Environment/X86/Payload/FMAPayload.hpp"
+#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp"
 
 namespace firestarter::environment::x86::platform {
 class SkylakeConfig final : public X86PlatformConfig {
-
 public:
-  SkylakeConfig(asmjit::CpuFeatures const &supportedFeatures, unsigned family,
-                unsigned model, unsigned threads)
-      : X86PlatformConfig("SKL_COREI", 6, {78, 94}, {1, 2}, 0,
-                          {32768, 262144, 1572864}, 104857600, 1536, family,
-                          model, threads,
-                          new payload::FMAPayload(supportedFeatures)) {}
-
-  std::vector<std::pair<std::string, unsigned>>
-  getDefaultPayloadSettings() const override {
-    return std::vector<std::pair<std::string, unsigned>>({{"RAM_L", 3},
-                                                          {"L3_LS_256", 5},
-                                                          {"L2_LS_256", 18},
-                                                          {"L1_2LS_256", 78},
-                                                          {"REG", 40}});
-  }
+  SkylakeConfig() noexcept
+      : X86PlatformConfig(/*Name=*/"SKL_COREI", /*Family=*/6, /*Models=*/{78, 94},
+                          /*Settings=*/
+                          environment::payload::PayloadSettings(
+                              /*Threads=*/{1, 2}, /*DataCacheBufferSize=*/{32768, 262144, 1572864},
+                              /*RamBufferSize=*/104857600, /*Lines=*/1536,
+                              /*InstructionGroups=*/
+                              {{"RAM_L", 3}, {"L3_LS_256", 5}, {"L2_LS_256", 18}, {"L1_2LS_256", 78}, {"REG", 40}}),
+                          /*Payload=*/std::make_shared<const payload::FMAPayload>()) {}
 };
-} // namespace firestarter::environment::x86::platform
-
-#endif
+} // namespace firestarter::environment::x86::platform
\ No newline at end of file
diff --git a/include/firestarter/Environment/X86/Platform/SkylakeSPConfig.hpp b/include/firestarter/Environment/X86/Platform/SkylakeSPConfig.hpp
index be767d0b..864ebec9 100644
--- a/include/firestarter/Environment/X86/Platform/SkylakeSPConfig.hpp
+++ b/include/firestarter/Environment/X86/Platform/SkylakeSPConfig.hpp
@@ -21,31 +21,28 @@
 
 #pragma once
 
-#include <firestarter/Environment/X86/Payload/AVX512Payload.hpp>
-#include <firestarter/Environment/X86/Platform/X86PlatformConfig.hpp>
+#include "firestarter/Environment/X86/Payload/AVX512Payload.hpp"
+#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp"
 
 namespace firestarter::environment::x86::platform {
 class SkylakeSPConfig final : public X86PlatformConfig {
-
 public:
-  SkylakeSPConfig(asmjit::CpuFeatures const &supportedFeatures,
-                  unsigned family, unsigned model, unsigned threads)
-      : X86PlatformConfig("SKL_XEONEP", 6, {85}, {1, 2}, 0,
-                          {32768, 1048576, 1441792}, 1048576000, 1536, family,
-                          model, threads,
-                          new payload::AVX512Payload(supportedFeatures)) {}
-
-  std::vector<std::pair<std::string, unsigned>>
-  getDefaultPayloadSettings() const override {
-    return std::vector<std::pair<std::string, unsigned>>({{"RAM_S", 3},
-                                                          {"RAM_P", 1},
-                                                          {"L3_S", 1},
-                                                          {"L3_P", 1},
-                                                          {"L2_S", 4},
-                                                          {"L2_L", 70},
-                                                          {"L1_S", 0},
-                                                          {"L1_L", 40},
-                                                          {"REG", 140}});
-  }
+  SkylakeSPConfig() noexcept
+      : X86PlatformConfig(/*Name=*/"SKL_XEONEP", /*Family=*/6, /*Models=*/{85},
+                          /*Settings=*/
+                          environment::payload::PayloadSettings(/*Threads=*/{1, 2},
+                                                                /*DataCacheBufferSize=*/{32768, 1048576, 1441792},
+                                                                /*RamBufferSize=*/1048576000, /*Lines=*/1536,
+                                                                /*InstructionGroups=*/
+                                                                {{"RAM_S", 3},
+                                                                 {"RAM_P", 1},
+                                                                 {"L3_S", 1},
+                                                                 {"L3_P", 1},
+                                                                 {"L2_S", 4},
+                                                                 {"L2_L", 70},
+                                                                 {"L1_S", 0},
+                                                                 {"L1_L", 40},
+                                                                 {"REG", 140}}),
+                          /*Payload=*/std::make_shared<const payload::AVX512Payload>()) {}
 };
 } // namespace firestarter::environment::x86::platform
diff --git a/include/firestarter/Environment/X86/Platform/X86PlatformConfig.hpp b/include/firestarter/Environment/X86/Platform/X86PlatformConfig.hpp
index 45956f38..15d54638 100644
--- a/include/firestarter/Environment/X86/Platform/X86PlatformConfig.hpp
+++ b/include/firestarter/Environment/X86/Platform/X86PlatformConfig.hpp
@@ -21,38 +21,79 @@
 
 #pragma once
 
-#include <firestarter/Environment/Platform/PlatformConfig.hpp>
-#include <firestarter/Environment/X86/Payload/X86Payload.hpp>
+#include "firestarter/Environment/CPUTopology.hpp"
+#include "firestarter/Environment/Platform/PlatformConfig.hpp"
+#include "firestarter/Environment/X86/X86CPUTopology.hpp"
 
 namespace firestarter::environment::x86::platform {
 
+/// Models a platform config that is the default based on x86 CPU family and model ids.
 class X86PlatformConfig : public environment::platform::PlatformConfig {
 private:
-  unsigned _family;
-  std::list<unsigned> _models;
-  unsigned _currentFamily;
-  unsigned _currentModel;
-  unsigned _currentThreads;
+  /// The family id of the processor for which this is the default platform config.
+  unsigned Family;
+  /// The list of model ids in combination with the family for which this is the default platform config.
+  std::list<unsigned> Models;
 
 public:
-  X86PlatformConfig(std::string name, unsigned family,
-                    std::initializer_list<unsigned> models,
-                    std::initializer_list<unsigned> threads,
-                    unsigned instructionCacheSize,
-                    std::initializer_list<unsigned> dataCacheBufferSize,
-                    unsigned ramBuffersize, unsigned lines,
-                    unsigned currentFamily, unsigned currentModel,
-                    unsigned currentThreads, payload::X86Payload *payload)
-      : PlatformConfig(name, threads, instructionCacheSize, dataCacheBufferSize,
-                       ramBuffersize, lines, payload),
-        _family(family), _models(models), _currentFamily(currentFamily),
-        _currentModel(currentModel), _currentThreads(currentThreads) {}
-
-  bool isDefault() const override {
-    return _family == _currentFamily &&
-           (std::find(_models.begin(), _models.end(), _currentModel) !=
-            _models.end()) &&
-           isAvailable();
+  X86PlatformConfig(std::string Name, unsigned Family, std::list<unsigned>&& Models,
+                    environment::payload::PayloadSettings&& Settings,
+                    std::shared_ptr<const environment::payload::Payload>&& Payload) noexcept
+      : PlatformConfig(std::move(Name), std::move(Settings), std::move(Payload))
+      , Family(Family)
+      , Models(std::move(Models)) {}
+
+  /// Check if this platform is available on the current system. This translates to whether the CPU extensions are
+  /// available for the payload that is used.
+  /// \arg Topology The reference to the X86CPUTopology that is used to check against if this platform is supported.
+  /// \returns true if the platform is supported on the given X86CPUTopology.
+  [[nodiscard]] auto isAvailable(const X86CPUTopology& Topology) const -> bool { return isAvailable(&Topology); }
+
+  /// Check if this platform is available and the default on the current system.
+  /// \arg Topology The reference to the X86CPUTopology that is used to check against if this payload is supported.
+  /// \returns true if the platform is the default one for a given X86CPUTopology.
+  [[nodiscard]] auto isDefault(const X86CPUTopology& Topology) const -> bool { return isDefault(&Topology); }
+
+  /// Clone the platform config.
+  [[nodiscard]] auto clone() const -> std::unique_ptr<PlatformConfig> final {
+    auto Ptr = std::make_unique<X86PlatformConfig>(name(), Family, std::list<unsigned>(Models),
+                                                   environment::payload::PayloadSettings(settings()),
+                                                   std::shared_ptr(payload()));
+    return Ptr;
+  }
+
+  /// Clone a concrete platform config.
+  /// \arg InstructionCacheSize The detected size of the instructions cache.
+  /// \arg ThreadsPerCore The number of threads per physical CPU core.
+  [[nodiscard]] auto cloneConcreate(std::optional<unsigned> InstructionCacheSize, unsigned ThreadsPerCore) const
+      -> std::unique_ptr<PlatformConfig> final {
+    auto Ptr = clone();
+    Ptr->settings().concretize(InstructionCacheSize, ThreadsPerCore);
+    return Ptr;
+  }
+
+private:
+  /// Check if this platform is available on the current system. This translates to whether the CPU extensions are
+  /// available for the payload that is used.
+  /// \arg Topology The pointer to the CPUTopology that is used to check against if this platform is supported.
+  /// \returns true if the platform is supported on the given CPUTopology.
+  [[nodiscard]] auto isAvailable(const CPUTopology* Topology) const -> bool final {
+    return environment::platform::PlatformConfig::isAvailable(Topology);
+  }
+
+  /// Check if this platform is available and the default on the current system. This is done by checking if the family
+  /// id in the CPUTopology matches the one saved in Family and if the model id in the CPUTopology is contained in
+  /// Models.
+  /// \arg Topology The pointer to the CPUTopology that is used to check against if this payload is supported.
+  /// \returns true if the platform is the default one for a given CPUTopology.
+  [[nodiscard]] auto isDefault(const CPUTopology* Topology) const -> bool final {
+    const auto* FinalTopology = dynamic_cast<const X86CPUTopology*>(Topology);
+    assert(FinalTopology && "isDefault not called with const X86CPUTopology*");
+
+    // Check if the family of the topology matches the family of the config, if the model of the topology is contained
+    // in the models list of the config and if the config is available on the current platform.
+    return Family == FinalTopology->familyId() &&
+           (std::find(Models.begin(), Models.end(), FinalTopology->modelId()) != Models.end()) && isAvailable(Topology);
   }
 };
 
diff --git a/include/firestarter/Environment/X86/X86CPUTopology.hpp b/include/firestarter/Environment/X86/X86CPUTopology.hpp
index 44a02dc2..0a85d040 100644
--- a/include/firestarter/Environment/X86/X86CPUTopology.hpp
+++ b/include/firestarter/Environment/X86/X86CPUTopology.hpp
@@ -21,55 +21,68 @@
 
 #pragma once
 
-#include <firestarter/Environment/CPUTopology.hpp>
+#include "firestarter/Environment/CPUTopology.hpp"
 
 #include <asmjit/asmjit.h>
 
 namespace firestarter::environment::x86 {
 
+/// This class models the properties of a x86_64 processor.
 class X86CPUTopology final : public CPUTopology {
 public:
   X86CPUTopology();
 
-  friend std::ostream &operator<<(std::ostream &stream,
-                                  X86CPUTopology const &cpuTopology);
+  friend auto operator<<(std::ostream& Stream, X86CPUTopology const& CpuTopology) -> std::ostream&;
 
-  std::list<std::string> const &features() const override {
-    return this->featureList;
-  }
-  const asmjit::CpuFeatures& featuresAsmjit() const{
-    return this->cpuInfo.features();
-  }
+  /// Getter for the list of CPU features
+  [[nodiscard]] auto features() const -> std::list<std::string> const& override { return this->FeatureList; }
+  /// Getter for the CPU features class from asmjit
+  [[nodiscard]] auto featuresAsmjit() const -> const asmjit::CpuFeatures& { return this->CpuInfo.features(); }
 
-  std::string const &vendor() const override { return this->_vendor; }
-  std::string const &model() const override { return this->_model; }
+  /// Getter for the clockrate in Hz
+  [[nodiscard]] auto clockrate() const -> uint64_t override;
 
-  unsigned long long clockrate() const override;
+  /// Get the current hardware timestamp
+  [[nodiscard]] auto timestamp() const -> uint64_t override;
 
-  unsigned long long timestamp() const override;
-
-  unsigned familyId() const { return this->cpuInfo.familyId(); }
-  unsigned modelId() const { return this->cpuInfo.modelId(); }
-  unsigned stepping() const { return this->cpuInfo.stepping(); }
+  /// The family id of the x86 processor
+  [[nodiscard]] auto familyId() const -> unsigned { return this->CpuInfo.familyId(); }
+  /// The model id of the x86 processor
+  [[nodiscard]] auto modelId() const -> unsigned { return this->CpuInfo.modelId(); }
+  /// The stepping id of the x86 processor
+  [[nodiscard]] auto stepping() const -> unsigned { return this->CpuInfo.stepping(); }
+  /// The CPU vendor i.e., Intel or AMD.
+  [[nodiscard]] auto vendor() const -> std::string const& final { return Vendor; }
+  /// Get the string containing family, model and stepping ids.
+  [[nodiscard]] auto model() const -> std::string const& final { return Model; }
 
 private:
-  bool hasRdtsc() const { return this->_hasRdtsc; }
-  bool hasInvariantRdtsc() const { return this->_hasInvariantRdtsc; }
-  void cpuid(unsigned long long *a, unsigned long long *b,
-             unsigned long long *c, unsigned long long *d) const;
-
-  asmjit::CpuInfo cpuInfo;
-  std::list<std::string> featureList;
-
-  bool _hasRdtsc;
-  bool _hasInvariantRdtsc;
-  std::string _vendor;
-  std::string _model;
+  /// Does this processor support timestamp counters
+  [[nodiscard]] auto hasRdtsc() const -> bool { return this->HasRdtsc; }
+  /// Does this processor have invariant timestamp counters
+  [[nodiscard]] auto hasInvariantRdtsc() const -> bool { return this->HasInvariantRdtsc; }
+
+  /// A wrapper to the cpuid call to keep a consistent interface between Windows and other platforms.
+  static void cpuid(uint64_t* Rax, uint64_t* Rbx, uint64_t* Rcx, uint64_t* Rdx);
+
+  /// The asmjit CpuInfo for the current processor
+  asmjit::CpuInfo CpuInfo;
+  /// The list of CPU features that are supported by the current processor
+  std::list<std::string> FeatureList;
+
+  /// Does this processor support timestamp counters
+  bool HasRdtsc;
+  /// Does this processor have invariant timestamp counters
+  bool HasInvariantRdtsc;
+
+  /// The CPU vendor i.e., Intel or AMD.
+  std::string Vendor;
+  /// Model string containing family, model and stepping ids.
+  std::string Model;
 };
 
-inline std::ostream &operator<<(std::ostream &stream,
-                                X86CPUTopology const &cpuTopology) {
-  return cpuTopology.print(stream);
+inline auto operator<<(std::ostream& Stream, X86CPUTopology const& CpuTopology) -> std::ostream& {
+  return CpuTopology.print(Stream);
 }
 
-} // namespace firestarter::environment::x86
+} // namespace firestarter::environment::x86
\ No newline at end of file
diff --git a/include/firestarter/Environment/X86/X86Environment.hpp b/include/firestarter/Environment/X86/X86Environment.hpp
index 11ad940e..f4760f7e 100644
--- a/include/firestarter/Environment/X86/X86Environment.hpp
+++ b/include/firestarter/Environment/X86/X86Environment.hpp
@@ -21,91 +21,102 @@
 
 #pragma once
 
-#include <firestarter/Environment/Environment.hpp>
-#include <firestarter/Environment/X86/X86CPUTopology.hpp>
-
-#include <firestarter/Environment/X86/Platform/BulldozerConfig.hpp>
-#include <firestarter/Environment/X86/Platform/HaswellConfig.hpp>
-#include <firestarter/Environment/X86/Platform/HaswellEPConfig.hpp>
-#include <firestarter/Environment/X86/Platform/KnightsLandingConfig.hpp>
-#include <firestarter/Environment/X86/Platform/NaplesConfig.hpp>
-#include <firestarter/Environment/X86/Platform/NehalemConfig.hpp>
-#include <firestarter/Environment/X86/Platform/NehalemEPConfig.hpp>
-#include <firestarter/Environment/X86/Platform/RomeConfig.hpp>
-#include <firestarter/Environment/X86/Platform/SandyBridgeConfig.hpp>
-#include <firestarter/Environment/X86/Platform/SandyBridgeEPConfig.hpp>
-#include <firestarter/Environment/X86/Platform/SkylakeConfig.hpp>
-#include <firestarter/Environment/X86/Platform/SkylakeSPConfig.hpp>
-#include <firestarter/Environment/X86/Platform/X86PlatformConfig.hpp>
-
-#include <asmjit/asmjit.h>
-
-#include <functional>
-
-#define REGISTER(NAME)                                                         \
-  [](asmjit::CpuFeatures const &supportedFeatures, unsigned family,          \
-     unsigned model, unsigned threads) -> platform::X86PlatformConfig * {      \
-    return new platform::NAME(supportedFeatures, family, model, threads);      \
-  }
+#include "firestarter/Environment/Environment.hpp"
+#include "firestarter/Environment/X86/Platform/BulldozerConfig.hpp"
+#include "firestarter/Environment/X86/Platform/HaswellConfig.hpp"
+#include "firestarter/Environment/X86/Platform/HaswellEPConfig.hpp"
+#include "firestarter/Environment/X86/Platform/KnightsLandingConfig.hpp"
+#include "firestarter/Environment/X86/Platform/NaplesConfig.hpp"
+#include "firestarter/Environment/X86/Platform/NehalemConfig.hpp"
+#include "firestarter/Environment/X86/Platform/NehalemEPConfig.hpp"
+#include "firestarter/Environment/X86/Platform/RomeConfig.hpp"
+#include "firestarter/Environment/X86/Platform/SandyBridgeConfig.hpp"
+#include "firestarter/Environment/X86/Platform/SandyBridgeEPConfig.hpp"
+#include "firestarter/Environment/X86/Platform/SkylakeConfig.hpp"
+#include "firestarter/Environment/X86/Platform/SkylakeSPConfig.hpp"
+#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp"
 
 namespace firestarter::environment::x86 {
 
 class X86Environment final : public Environment {
 public:
-  X86Environment() : Environment(new X86CPUTopology()) {}
-
-  ~X86Environment() {
-    for (auto const &config : platformConfigs) {
-      delete config;
-    }
-    for (auto const &config : fallbackPlatformConfigs) {
-      delete config;
-    }
+  X86Environment()
+      : Environment(std::make_unique<X86CPUTopology>()) {}
+
+  /// Getter (which allows modifying) for the current platform config containing the payload, settings, the
+  /// associated name and the default X86 family and models.
+  [[nodiscard]] auto config() -> platform::X86PlatformConfig& final {
+    auto* X86PlatformConfig = dynamic_cast<platform::X86PlatformConfig*>(&Environment::config());
+    assert(X86PlatformConfig && "X86PlatformConfig is a nullptr");
+    return *X86PlatformConfig;
+  }
+
+  /// Const getter for the current platform config containing the payload, settings, the associated name and the default
+  /// X86 family and models.
+  [[nodiscard]] auto config() const -> const platform::X86PlatformConfig& final {
+    const auto* X86PlatformConfig = dynamic_cast<const platform::X86PlatformConfig*>(&Environment::config());
+    assert(X86PlatformConfig && "X86PlatformConfig is a nullptr");
+    return *X86PlatformConfig;
   }
 
-  X86CPUTopology const &topology() {
-    return *reinterpret_cast<X86CPUTopology *>(this->_topology);
+  /// Const getter for the current CPU topology with X86 specific modifications.
+  [[nodiscard]] auto topology() const -> const X86CPUTopology& final {
+    const auto* X86Topology = dynamic_cast<const X86CPUTopology*>(&Environment::topology());
+    assert(X86Topology && "X86Topology is a nullptr");
+    return *X86Topology;
   }
 
-  void evaluateFunctions() override;
-  int selectFunction(unsigned functionId,
-                     bool allowUnavailablePayload) override;
-  int selectInstructionGroups(std::string groups) override;
+  /// Select a PlatformConfig based on its generated id. This function will throw if a payload is not available or the
+  /// id is incorrect. If id is zero we automatically select a matching PlatformConfig.
+  /// \arg FunctionId The id of the PlatformConfig that should be selected.
+  /// \arg AllowUnavailablePayload If true we will not throw if the PlatformConfig is not available.
+  void selectFunction(unsigned FunctionId, bool AllowUnavailablePayload) override;
+
+  /// Parse the selected payload instruction groups and save them in the selected function. Throws if the input is
+  /// invalid.
+  /// \arg Groups The list of instruction groups that is in the format: multiple INSTRUCTION:VALUE pairs
+  /// comma-separated.
+  void selectInstructionGroups(std::string Groups) override;
+
+  /// Print the available instruction groups of the selected function.
   void printAvailableInstructionGroups() override;
-  void setLineCount(unsigned lineCount) override;
+
+  /// Set the line count in the selected function.
+  /// \arg LineCount The maximum number of instructions that should be in the high-load loop.
+  void setLineCount(unsigned LineCount) override;
+
+  /// Print a summary of the settings of the selected config.
   void printSelectedCodePathSummary() override;
+
+  /// Print a list of available high-load functions and if they are available on the current system. This includes all
+  /// PlatformConfigs in combination with all thread per core counts.
   void printFunctionSummary() override;
 
 private:
-  // The available function IDs are generated by iterating through this list of
-  // PlatformConfig. Add new PlatformConfig at the bottom to maintain stable
-  // IDs.
-  const std::list<std::function<platform::X86PlatformConfig *(
-      asmjit::CpuFeatures const &, unsigned, unsigned, unsigned)>>
-      platformConfigsCtor = {
-          REGISTER(KnightsLandingConfig), REGISTER(SkylakeConfig),
-          REGISTER(SkylakeSPConfig),      REGISTER(HaswellConfig),
-          REGISTER(HaswellEPConfig),      REGISTER(SandyBridgeConfig),
-          REGISTER(SandyBridgeEPConfig),  REGISTER(NehalemConfig),
-          REGISTER(NehalemEPConfig),      REGISTER(BulldozerConfig),
-          REGISTER(NaplesConfig),         REGISTER(RomeConfig)};
-
-  std::list<platform::X86PlatformConfig *> platformConfigs;
-
-  // List of fallback PlatformConfig. Add one for each x86 extension.
-  const std::list<std::function<platform::X86PlatformConfig *(
-      asmjit::CpuFeatures const &, unsigned, unsigned, unsigned)>>
-      fallbackPlatformConfigsCtor = {
-          REGISTER(SkylakeSPConfig),   // AVX512
-          REGISTER(BulldozerConfig),   // FMA4
-          REGISTER(HaswellConfig),     // FMA
-          REGISTER(SandyBridgeConfig), // AVX
-          REGISTER(NehalemConfig)      // SSE2
-      };
-
-  std::list<platform::X86PlatformConfig *> fallbackPlatformConfigs;
-
-#undef REGISTER
+  /// The list of available platform configs that is printed when supplying the --avail command line argument. The IDs
+  /// for these configs are generated by iterating through this list starting with 1. To maintain stable IDs in
+  /// FIRESTARTER new configs should be added to the bottom of the list.
+  const std::list<std::shared_ptr<platform::X86PlatformConfig>> PlatformConfigs = {
+      std::make_shared<platform::KnightsLandingConfig>(), std::make_shared<platform::SkylakeConfig>(),
+      std::make_shared<platform::SkylakeSPConfig>(),      std::make_shared<platform::HaswellConfig>(),
+      std::make_shared<platform::HaswellEPConfig>(),      std::make_shared<platform::SandyBridgeConfig>(),
+      std::make_shared<platform::SandyBridgeEPConfig>(),  std::make_shared<platform::NehalemConfig>(),
+      std::make_shared<platform::NehalemEPConfig>(),      std::make_shared<platform::BulldozerConfig>(),
+      std::make_shared<platform::NaplesConfig>(),         std::make_shared<platform::RomeConfig>()};
+
+  /// The list of configs that are fallbacks. If none of the PlatformConfigs is the default one on the current CPU, we
+  /// select the first one from this list that is available on the current system. If multiple configs can be available
+  /// on one system the one with higher priority should be at the top of this list. Modern X86 CPUs will support SSE2
+  /// therefore it is the last on the list. CPUs that support AVX512 will most certainly also support FMA and AVX,
+  /// AVX512 takes precedence. This list should contain one entry for each of the supported CPU extensions by the
+  /// FIRESTARTER payloads.
+  const std::list<std::shared_ptr<platform::X86PlatformConfig>> FallbackPlatformConfigs = {
+      std::make_shared<platform::SkylakeSPConfig>(),   // AVX512
+      std::make_shared<platform::BulldozerConfig>(),   // FMA4
+      std::make_shared<platform::HaswellConfig>(),     // FMA
+      std::make_shared<platform::SandyBridgeConfig>(), // AVX
+      std::make_shared<platform::NehalemConfig>()      // SSE2
+  };
 };
 
 } // namespace firestarter::environment::x86
diff --git a/include/firestarter/ErrorDetectionStruct.hpp b/include/firestarter/ErrorDetectionStruct.hpp
index 38bcbc6a..1fc3ad24 100644
--- a/include/firestarter/ErrorDetectionStruct.hpp
+++ b/include/firestarter/ErrorDetectionStruct.hpp
@@ -21,26 +21,31 @@
 
 #pragma once
 
+#include <cstdint>
+
 namespace firestarter {
 
+/// This struct is used for the error detection feature. The error detection works between two threads. The current one
+/// and one on the left. Analogous for the thread on the right. We hash the contents of the vector registers and compare
+/// them with the current iteration counter against the other threads.
 struct ErrorDetectionStruct {
-  // we have two cache lines (64B) containing each two 16B local variable and
-  // one ptr (8B)
-
-  // the pointer to 16B of communication
-  volatile unsigned long long *communicationLeft;
-  volatile unsigned long long localsLeft[4];
-  // if this variable is not 0, an error occured in the comparison with the left
-  // thread.
-  volatile unsigned long long errorLeft;
-  volatile unsigned long long paddingLeft[2];
+  struct OneSide {
+    /// The pointer to 16B of communication between the two threads which is used with lock cmpxchg16b
+    uint64_t* Communication;
+    /// The local variables that are used for the error detection algorithm
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
+    uint64_t Locals[4];
+    /// If this variable is not 0, an error occurred in the comparison with the other thread.
+    uint64_t Error;
+    /// Padding to fill up a cache line.
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
+    uint64_t Padding[2];
+  };
 
-  volatile unsigned long long *communicationRight;
-  volatile unsigned long long localsRight[4];
-  // if this variable is not 0, an error occured in the comparison with the
-  // right thread.
-  volatile unsigned long long errorRight;
-  volatile unsigned long long paddingRight[2];
+  /// The data that is used for the error detection algorithm between the current and the thread left to it.
+  OneSide Left;
+  /// The data that is used for the error detection algorithm between the current and the thread right to it.
+  OneSide Right;
 };
 
-} // namespace firestarter
+} // namespace firestarter
\ No newline at end of file
diff --git a/include/firestarter/Firestarter.hpp b/include/firestarter/Firestarter.hpp
index 31347dd2..a51feebb 100644
--- a/include/firestarter/Firestarter.hpp
+++ b/include/firestarter/Firestarter.hpp
@@ -21,36 +21,19 @@
 
 #pragma once
 
-#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_HIP)
-#include <firestarter/Cuda/Cuda.hpp>
-#endif
-
-#ifdef FIRESTARTER_BUILD_ONEAPI
-#include <firestarter/OneAPI/OneAPI.hpp>
-#endif
-
-
-
-#include <firestarter/Constants.hpp>
-
-#if defined(linux) || defined(__linux__)
-#include <firestarter/Measurement/MeasurementWorker.hpp>
-#include <firestarter/Optimizer/Algorithm.hpp>
-#include <firestarter/Optimizer/OptimizerWorker.hpp>
-#include <firestarter/Optimizer/Population.hpp>
-#endif
-
-#include <firestarter/DumpRegisterWorkerData.hpp>
-#include <firestarter/LoadWorkerData.hpp>
-
-#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) ||            \
-    defined(_M_X64)
-#include <firestarter/Environment/X86/X86Environment.hpp>
-#endif
+#include "firestarter/Config.hpp"
+#include "firestarter/Constants.hpp"
+#include "firestarter/Cuda/Cuda.hpp"
+#include "firestarter/DumpRegisterWorkerData.hpp"
+#include "firestarter/LoadWorkerData.hpp"
+#include "firestarter/Measurement/MeasurementWorker.hpp"
+#include "firestarter/OneAPI/OneAPI.hpp"
+#include "firestarter/Optimizer/Algorithm.hpp"
+#include "firestarter/Optimizer/OptimizerWorker.hpp"
+#include "firestarter/Optimizer/Population.hpp"
 
 #include <chrono>
 #include <condition_variable>
-#include <list>
 #include <memory>
 #include <mutex>
 #include <string>
@@ -64,141 +47,140 @@ extern "C" {
 
 namespace firestarter {
 
+/// This is the main class of firestarter and handles the execution of the program.
 class Firestarter {
 public:
-  Firestarter(const int argc, const char **argv,
-              std::chrono::seconds const &timeout, unsigned loadPercent,
-              std::chrono::microseconds const &period,
-              unsigned requestedNumThreads, std::string const &cpuBind,
-              bool printFunctionSummary, unsigned functionId,
-              bool listInstructionGroups, std::string const &instructionGroups,
-              unsigned lineCount, bool allowUnavailablePayload,
-              bool dumpRegisters,
-              std::chrono::seconds const &dumpRegistersTimeDelta,
-              std::string const &dumpRegistersOutpath, bool errorDetection,
-              int gpus, unsigned gpuMatrixSize, bool gpuUseFloat,
-              bool gpuUseDouble, bool listMetrics, bool measurement,
-              std::chrono::milliseconds const &startDelta,
-              std::chrono::milliseconds const &stopDelta,
-              std::chrono::milliseconds const &measurementInterval,
-              std::vector<std::string> const &metricPaths,
-              std::vector<std::string> const &stdinMetrics, bool optimize,
-              std::chrono::seconds const &preheat,
-              std::string const &optimizationAlgorithm,
-              std::vector<std::string> const &optimizationMetrics,
-              std::chrono::seconds const &evaluationDuration,
-              unsigned individuals, std::string const &optimizeOutfile,
-              unsigned generations, double nsga2_cr, double nsga2_m);
-
-  ~Firestarter();
+  Firestarter() = delete;
 
+  /// Read the config, validate and throw on problems with config. Setup everything that is required for the execution
+  /// of firestarter.
+  /// \arg ProvidedConfig The config for the execution of Firestarter
+  explicit Firestarter(Config&& ProvidedConfig);
+
+  ~Firestarter() = default;
+
+  /// This function takes care of the execution of firestarter. It will start the load on CPUs and GPUs.
   void mainThread();
 
 private:
-  const int _argc;
-  const char **_argv;
-  const std::chrono::seconds _timeout;
-  const unsigned _loadPercent;
-  std::chrono::microseconds _load;
-  std::chrono::microseconds _period;
-  const bool _dumpRegisters;
-  const std::chrono::seconds _dumpRegistersTimeDelta;
-  const std::string _dumpRegistersOutpath;
-  const bool _errorDetection;
-  const int _gpus;
-  const unsigned _gpuMatrixSize;
-  const bool _gpuUseFloat;
-  const bool _gpuUseDouble;
-  const std::chrono::milliseconds _startDelta;
-  const std::chrono::milliseconds _stopDelta;
-  const bool _measurement;
-  const bool _optimize;
-  const std::chrono::seconds _preheat;
-  const std::string _optimizationAlgorithm;
-  const std::vector<std::string> _optimizationMetrics;
-  const std::chrono::seconds _evaluationDuration;
-  const unsigned _individuals;
-  const std::string _optimizeOutfile;
-  const unsigned _generations;
-  const double _nsga2_cr;
-  const double _nsga2_m;
-
-#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) ||            \
-    defined(_M_X64)
-  environment::x86::X86Environment *_environment = nullptr;
-
-  environment::x86::X86Environment &environment() const {
-    return *_environment;
-  }
-#else
-#error "FIRESTARTER is not implemented for this ISA"
-#endif
+  const Config Cfg;
+
+  /// The class that handles setting up the payload for firestarter
+  std::unique_ptr<environment::Environment> Environment;
+  /// The class for execution of the gemm routine on Cuda or HIP GPUs.
+  std::unique_ptr<cuda::Cuda> Cuda;
+  /// The class for execution of the gemm routine on OneAPI GPUs.
+  std::unique_ptr<oneapi::OneAPI> Oneapi;
+  /// The pointer to the optimization algorithm that is used by the optimization functionality.
+  std::unique_ptr<firestarter::optimizer::Algorithm> Algorithm;
+  /// The thread that is used to dump register contents to a file.
+  std::thread DumpRegisterWorkerThread;
+  /// The shared pointer to the datastructure that handles the management of metrics, acquisition of metric data and
+  /// provides summaries of a time range of metric values.
+  std::shared_ptr<measurement::MeasurementWorker> MeasurementWorker;
+
+  /// The vector of thread handles for the load workers and shared pointers to their respective data.
+  std::vector<std::pair<std::thread, std::shared_ptr<LoadWorkerData>>> LoadThreads;
+  /// The vector of communication data, where each element is shared between two neighbouring threads for the error
+  /// detection feature.
+  std::vector<std::shared_ptr<uint64_t>> ErrorCommunication;
+
+  /// The population holding the problem that is used for the optimization feature.
+  std::unique_ptr<firestarter::optimizer::Population> Population;
+
+  // NOLINTBEGIN(cppcoreguidelines-avoid-non-const-global-variables)
+  // TODO(Issue #85): Currently we support one instance of the Firestarter class. Variables that need to be accessed
+  // from outside the class, e.g. in the sigterm handler are inline static.
+
+  /// The instance of the optimization worker that handles the execution of the optimization.
+  inline static std::unique_ptr<optimizer::OptimizerWorker> Optimizer;
+
+  /// Variable to control the termination of the watchdog
+  inline static bool WatchdogTerminate = false;
+  /// Condition variable for the WatchdogTerminate to allow notifying when sleeping for a specific time.
+  inline static std::condition_variable WatchdogTerminateAlert;
+  /// Mutex to guard access to WatchdogTerminate.
+  inline static std::mutex WatchdogTerminateMutex;
+
+  /// Variable to control the load of the threads
+  inline static volatile LoadThreadWorkType LoadVar = LoadThreadWorkType::LoadLow;
+
+  // NOLINTEND(cppcoreguidelines-avoid-non-const-global-variables)
+
+  /// Spawn the load workers and initialize them.
+  void initLoadWorkers();
+
+  /// Wait for the load worker to join
+  void joinLoadWorkers();
 
-#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_HIP)
-  std::unique_ptr<cuda::Cuda> _cuda;
-#endif
+  /// Print the error report for the error detection feature.
+  void printThreadErrorReport();
 
-#ifdef FIRESTARTER_BUILD_ONEAPI
-  std::unique_ptr<oneapi::OneAPI> _oneapi;
-#endif
+  /// Print the performance report. It contains the estimation of the FLOPS and main memory bandwidth.
+  void printPerformanceReport();
 
-#if defined(linux) || defined(__linux__)
-  inline static std::unique_ptr<optimizer::OptimizerWorker> _optimizer;
-  std::shared_ptr<measurement::MeasurementWorker> _measurementWorker;
-  std::unique_ptr<firestarter::optimizer::Algorithm> _algorithm;
-  firestarter::optimizer::Population _population;
-#endif
+  /// Set the load workers to the ThreadInit state.
+  void signalInit() { signalLoadWorkers(LoadThreadState::ThreadInit); }
 
-  // LoadThreadWorker.cpp
-  int initLoadWorkers(bool lowLoad, unsigned long long period);
-  void joinLoadWorkers();
-  void printThreadErrorReport();
-  void printPerformanceReport();
+  /// Set the load workers to the ThreadWork state.
+  void signalWork() { signalLoadWorkers(LoadThreadState::ThreadWork); };
 
-  void signalWork() { signalLoadWorkers(THREAD_WORK); };
+  /// Apply the new setting to all load workers and set them to the ThreadSwitch state.
+  /// \arg Setting The new setting to switch to.
+  void signalSwitch(std::vector<std::pair<std::string, unsigned>> const& Setting) {
+    struct SwitchLoad {
+      static void func() { LoadVar = LoadThreadWorkType::LoadSwitch; };
+    };
 
-  // WatchdogWorker.cpp
-  int watchdogWorker(std::chrono::microseconds period,
-                     std::chrono::microseconds load,
-                     std::chrono::seconds timeout);
+    for (auto& Thread : LoadThreads) {
+      auto Td = Thread.second;
 
-#ifdef FIRESTARTER_DEBUG_FEATURES
-  // DumpRegisterWorker.cpp
-  int initDumpRegisterWorker(std::chrono::seconds dumpTimeDelta,
-                             std::string dumpFilePath);
-  void joinDumpRegisterWorker();
-#endif
+      Td->config().settings().selectInstructionGroups(Setting);
+    }
 
-  // LoadThreadWorker.cpp
-  void signalLoadWorkers(int comm);
-  static void loadThreadWorker(std::shared_ptr<LoadWorkerData> td);
+    signalLoadWorkers(LoadThreadState::ThreadSwitch, SwitchLoad::func);
+  };
 
-#ifdef FIRESTARTER_DEBUG_FEATURES
-  // DumpRegisterWorker.cpp
-  static void dumpRegisterWorker(std::unique_ptr<DumpRegisterWorkerData> data);
-#endif
+  /// Execute a state change in the load worker threads. This should happen at the same time in all threads. First the
+  /// mutex in all threads are locked and then the state is updated and we wait until we get an acknowledgement from the
+  /// threads.
+  /// \arg State The new state of the threads.
+  /// \arg Function An optional function that will be executed after the state in all threads has been updated and
+  /// before we wait for the acknowledgement of the thread.
+  void signalLoadWorkers(LoadThreadState State, void (*Function)() = nullptr);
 
-  static void setLoad(unsigned long long value);
+  /// The function that is executed for each load thread.
+  /// \arg Td The shared pointer to the data that is required in this thread.
+  static void loadThreadWorker(const std::shared_ptr<LoadWorkerData>& Td);
 
-  static void sigalrmHandler(int signum);
-  static void sigtermHandler(int signum);
+  /// This function handles switching the load from high to low in a loop and stopping the execution if a timeout was
+  /// set.
+  /// \arg Period The period of the high/low switch. Set to zero to disable switching between a high and low load.
+  /// \arg Load The time of the period where high load is applied.
+  /// \arg Timeout The timeout after which firestarter stops. Set to zero to disable.
+  static void watchdogWorker(std::chrono::microseconds Period, std::chrono::microseconds Load,
+                             std::chrono::seconds Timeout);
 
-  // variables to control the termination of the watchdog
-  inline static bool _watchdog_terminate = false;
-  inline static std::condition_variable _watchdogTerminateAlert;
-  inline static std::mutex _watchdogTerminateMutex;
+  /// Start the thread to dump the registers of the first load thread to a file.
+  void initDumpRegisterWorker();
 
-  // variable to control the load of the threads
-  inline static volatile unsigned long long loadVar = LOAD_LOW;
+  /// Wait for the dump register thread to terminate.
+  void joinDumpRegisterWorker();
 
-  std::vector<std::pair<std::thread, std::shared_ptr<LoadWorkerData>>>
-      loadThreads;
+  /// The thread that dumps the registers of the first thread to a file.
+  /// \arg Data The data that is required for the worker thread to dump the register contents to a file.
+  static void dumpRegisterWorker(std::unique_ptr<DumpRegisterWorkerData> Data);
 
-  std::vector<std::shared_ptr<unsigned long long>> errorCommunication;
+  /// Set the load var to a specific value and update it with a memory fence across threads.
+  /// \arg Value The new load value.
+  static void setLoad(LoadThreadWorkType Value);
 
-#ifdef FIRESTARTER_DEBUG_FEATURES
-  std::thread dumpRegisterWorkerThread;
-#endif
+  /// Sigalarm handler does nothing.
+  static void sigalrmHandler(int Signum);
+
+  /// Sigterm handler stops the execution of firestarter
+  /// \arg Signum The signal number is ignored.
+  static void sigtermHandler(int Signum);
 };
 
 } // namespace firestarter
diff --git a/include/firestarter/Json/Summary.hpp b/include/firestarter/Json/Summary.hpp
index 540c4aed..e6f33e5d 100644
--- a/include/firestarter/Json/Summary.hpp
+++ b/include/firestarter/Json/Summary.hpp
@@ -21,24 +21,27 @@
 
 #pragma once
 
-#include <firestarter/Measurement/Summary.hpp>
+#include "firestarter/Measurement/Summary.hpp"
 
+/// Json serializer and deserializer for the firestarter::measurement::Summary struct
 namespace nlohmann {
 template <> struct adl_serializer<firestarter::measurement::Summary> {
-  static firestarter::measurement::Summary from_json(const json &j) {
-    return {j["num_timepoints"].get<size_t>(),
-            std::chrono::milliseconds(
-                j["duration"].get<std::chrono::milliseconds::rep>()),
-            j["average"].get<double>(), j["stddev"].get<double>()};
+  // functions for nlohmann json do not follow LLVM code style
+  // NOLINTBEGIN(readability-identifier-naming)
+  static auto from_json(const json& J) -> firestarter::measurement::Summary {
+    return {J["num_timepoints"].get<size_t>(),
+            std::chrono::milliseconds(J["duration"].get<std::chrono::milliseconds::rep>()), J["average"].get<double>(),
+            J["stddev"].get<double>()};
   }
 
-  static void to_json(json &j, firestarter::measurement::Summary s) {
-    j = json::object();
+  static void to_json(json& J, firestarter::measurement::Summary S) {
+    J = json::object();
 
-    j["num_timepoints"] = s.num_timepoints;
-    j["duration"] = s.duration.count();
-    j["average"] = s.average;
-    j["stddev"] = s.stddev;
+    J["num_timepoints"] = S.NumTimepoints;
+    J["duration"] = S.Duration.count();
+    J["average"] = S.Average;
+    J["stddev"] = S.Stddev;
   }
+  // NOLINTEND(readability-identifier-naming)
 };
 } // namespace nlohmann
diff --git a/include/firestarter/LoadWorkerData.hpp b/include/firestarter/LoadWorkerData.hpp
index ec70476f..1cf3dac3 100644
--- a/include/firestarter/LoadWorkerData.hpp
+++ b/include/firestarter/LoadWorkerData.hpp
@@ -21,108 +21,146 @@
 
 #pragma once
 
-#include <firestarter/Constants.hpp>
-#include <firestarter/DumpRegisterStruct.hpp>
-#include <firestarter/Environment/Environment.hpp>
-#include <firestarter/ErrorDetectionStruct.hpp>
+#include "firestarter/Constants.hpp"
+#include "firestarter/Environment/Environment.hpp"
+#include "firestarter/Environment/Platform/PlatformConfig.hpp"
+#include "firestarter/LoadWorkerMemory.hpp"
 
 #include <atomic>
+#include <cmath>
 #include <memory>
 #include <mutex>
-
-#define PAD_SIZE(size, align)                                                  \
-  align *(int)std::ceil((double)size / (double)align)
-
-#if defined(__APPLE__)
-#define ALIGNED_MALLOC(size, align) aligned_alloc(align, PAD_SIZE(size, align))
-#define ALIGNED_FREE free
-#elif defined(__MINGW64__)
-#define ALIGNED_MALLOC(size, align) _mm_malloc(PAD_SIZE(size, align), align)
-#define ALIGNED_FREE _mm_free
-#elif defined(_MSC_VER)
-#define ALIGNED_MALLOC(size, align)                                            \
-  _aligned_malloc(PAD_SIZE(size, align), align)
-#define ALIGNED_FREE _aligned_free
-#else
-#define ALIGNED_MALLOC(size, align)                                            \
-  std::aligned_alloc(align, PAD_SIZE(size, align))
-#define ALIGNED_FREE std::free
-#endif
+#include <utility>
 
 namespace firestarter {
 
+/// This class contains the information that is required to execute the load routines and change the payload during
+/// executions.
 class LoadWorkerData {
 public:
-  LoadWorkerData(int id, environment::Environment &environment,
-                 volatile unsigned long long *loadVar,
-                 unsigned long long period, bool dumpRegisters,
-                 bool errorDetection)
-      : addrHigh(loadVar), period(period), dumpRegisters(dumpRegisters),
-        errorDetection(errorDetection), _id(id), _environment(environment),
-        _config(new environment::platform::RuntimeConfig(
-            environment.selectedConfig())) {
-    // use REGISTER_MAX_NUM cache lines for the dumped registers
-    // and another cache line for the control variable.
-    // as we are doing aligned moves we only have the option to waste a whole
-    // cacheline
-    addrOffset = dumpRegisters
-                     ? sizeof(DumpRegisterStruct) / sizeof(unsigned long long)
-                     : 0;
-
-    addrOffset += errorDetection ? sizeof(ErrorDetectionStruct) /
-                                       sizeof(unsigned long long)
-                                 : 0;
-  }
+  /// This struct models parameters acquired during the execution of the high-load routine.
+  struct Metrics {
+    /// The number of iterations the high-load loop was executed.
+    std::atomic<uint64_t> Iterations{};
+    /// The start of the execution of the high-load loop.
+    std::atomic<uint64_t> StartTsc{};
+    /// The stop of the execution of the high-load loop.
+    std::atomic<uint64_t> StopTsc{};
+
+    auto operator=(const Metrics& Other) -> Metrics& {
+      if (this == &Other) {
+        return *this;
+      }
 
-  ~LoadWorkerData() {
-    delete _config;
-    if (addrMem - addrOffset != nullptr) {
-      ALIGNED_FREE(addrMem - addrOffset);
+      Iterations.store(Other.Iterations.load());
+      StartTsc.store(Other.StartTsc.load());
+      StopTsc.store(Other.StopTsc.load());
+      return *this;
     }
-  }
+  };
+
+  /// Create the datastructure that is shared between a load worker thread and firestarter.
+  /// \arg Id The id of the load worker thread. They are counted from 0 to the maximum number of threads - 1.
+  /// \arg Environment The reference to the environment which allows setting the thread affinity and getting the current
+  /// timestamp.
+  /// \arg LoadVar The variable that controls the execution of the load worker.
+  /// \arg Period Is used in combination with the LoadVar for the low load routine.
+  /// \arg DumpRegisters Should the code to support dumping registers be baked into the high load routine of the
+  /// compiled payload.
+  /// \arg ErrorDetection Should the code to support error detection between threads be baked into the high load routine
+  /// of the compiled payload.
+  LoadWorkerData(uint64_t Id, const environment::Environment& Environment, volatile LoadThreadWorkType& LoadVar,
+                 std::chrono::microseconds Period, bool DumpRegisters, bool ErrorDetection)
+      : LoadVar(LoadVar)
+      , Period(Period)
+      , DumpRegisters(DumpRegisters)
+      , ErrorDetection(ErrorDetection)
+      , Id(Id)
+      , Environment(Environment)
+      , Config(Environment.config().clone()) {}
 
-  void setErrorCommunication(
-      std::shared_ptr<unsigned long long> communicationLeft,
-      std::shared_ptr<unsigned long long> communicationRight) {
-    this->communicationLeft = communicationLeft;
-    this->communicationRight = communicationRight;
+  ~LoadWorkerData() = default;
+
+  /// Set the shared pointer to the memory shared between two threads for the communication required for the error
+  /// detection feature.
+  /// \arg CommunicationLeft The memory shared with the left thread.
+  /// \arg CommunicationRight The memory shared with the right thread.
+  void setErrorCommunication(std::shared_ptr<uint64_t> CommunicationLeft,
+                             std::shared_ptr<uint64_t> CommunicationRight) {
+    this->CommunicationLeft = std::move(CommunicationLeft);
+    this->CommunicationRight = std::move(CommunicationRight);
   }
 
-  int id() const { return _id; }
-  environment::Environment &environment() const { return _environment; }
-  environment::platform::RuntimeConfig &config() const { return *_config; }
+  /// Getter for the id of the thread.
+  [[nodiscard]] auto id() const -> uint64_t { return Id; }
+  /// Const getter for the environment.
+  [[nodiscard]] auto environment() const -> const environment::Environment& { return Environment; }
+  /// Getter for the current platform config.
+  [[nodiscard]] auto config() const -> environment::platform::PlatformConfig& { return *Config; }
+
+  /// Access the DumpRegisterStruct. Asserts when dumping registers is not enabled.
+  /// \returns a reference to the DumpRegisterStruct
+  [[nodiscard]] auto dumpRegisterStruct() const -> DumpRegisterStruct& {
+    assert(DumpRegisters && "Tried to access DumpRegisterStruct, but dumping registers is not enabled.");
+    return Memory->ExtraVars.Drs;
+  }
 
-  const ErrorDetectionStruct *errorDetectionStruct() const {
-    return reinterpret_cast<ErrorDetectionStruct *>(addrMem - addrOffset);
+  /// Access the ErrorDetectionStruct. Asserts when error detection is not enabled.
+  /// \returns a reference to the ErrorDetectionStruct
+  [[nodiscard]] auto errorDetectionStruct() const -> ErrorDetectionStruct& {
+    assert(ErrorDetection && "Tried to access ErrorDetectionStruct, but error detection is not enabled.");
+    return Memory->ExtraVars.Eds;
   }
 
-  int comm = THREAD_WAIT;
-  bool ack = false;
-  std::mutex mutex;
-  unsigned long long *addrMem = nullptr;
-  unsigned long long addrOffset;
-  volatile unsigned long long *addrHigh;
-  unsigned long long buffersizeMem;
-  unsigned long long iterations = 0;
-  // save the last iteration count when switching payloads
-  std::atomic<unsigned long long> lastIterations;
-  unsigned long long flops;
-  unsigned long long startTsc;
-  unsigned long long stopTsc;
-  std::atomic<unsigned long long> lastStartTsc;
-  std::atomic<unsigned long long> lastStopTsc;
+  /// The members in this struct are used for the communication between the main thread and the load thread.
+  struct Communication {
+    /// The state of the load worker.
+    LoadThreadState State = LoadThreadState::ThreadWait;
+    /// This variable will be set to true when the state change was acknowledged by the load thread.
+    bool Ack = false;
+    /// The mutex that is used to lock access to the Ack and State variables.
+    std::mutex Mutex;
+  } Communication;
+
+  /// The memory which is used by the load worker.
+  LoadWorkerMemory::UniquePtr Memory = {nullptr, nullptr};
+
+  /// The compiled payload which contains the pointers to the specific functions which are executed and some stats.
+  environment::payload::CompiledPayload::UniquePtr CompiledPayloadPtr = {nullptr, nullptr};
+
+  /// The variable that controls the execution of the load worker.
+  volatile LoadThreadWorkType& LoadVar;
+
+  /// The size of the buffer that is allocated in the load worker.
+  uint64_t BuffersizeMem{};
+
+  /// The collected metrics from the current execution of the LoadThreadState::ThreadWork state. Do not read from it.
+  Metrics CurrentRun;
+
+  /// The collected metrics from the last execution of the LoadThreadState::ThreadWork state.
+  Metrics LastRun;
+
   // period in usecs
   // used in low load routine to sleep 1/100th of this time
-  unsigned long long period;
-  bool dumpRegisters;
-  bool errorDetection;
-  std::shared_ptr<unsigned long long> communicationLeft;
-  std::shared_ptr<unsigned long long> communicationRight;
-
-private:
-  int _id;
-  environment::Environment &_environment;
-  environment::platform::RuntimeConfig *_config;
+  std::chrono::microseconds Period;
+
+  /// Should the code to support dumping registers be baked into the high load routine of the compiled payload.
+  bool DumpRegisters;
+
+  /// Should the code to support error detection between threads be baked into the high load routine of the compiled
+  /// payload.
+  bool ErrorDetection;
+  /// The pointer to the variable that is used for communication to the left thread for the error detection feature.
+  std::shared_ptr<uint64_t> CommunicationLeft;
+  /// The pointer to the variable that is used for communication to the right thread for the error detection feature.
+  std::shared_ptr<uint64_t> CommunicationRight;
+
+  /// The id of this load thread.
+  const uint64_t Id;
+  /// The reference to the environment which allows setting the thread affinity and getting the current timestamp.
+  const environment::Environment& Environment;
+  /// The config that is cloned from the environment for this specific load worker.
+  std::unique_ptr<environment::platform::PlatformConfig> Config;
 };
 
 } // namespace firestarter
diff --git a/include/firestarter/LoadWorkerMemory.hpp b/include/firestarter/LoadWorkerMemory.hpp
new file mode 100644
index 00000000..11493665
--- /dev/null
+++ b/include/firestarter/LoadWorkerMemory.hpp
@@ -0,0 +1,91 @@
+/******************************************************************************
+ * FIRESTARTER - A Processor Stress Test Utility
+ * Copyright (C) 2020-2023 TU Dresden, Center for Information Services and High
+ * Performance Computing
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/\>.
+ *
+ * Contact: daniel.hackenberg@tu-dresden.de
+ *****************************************************************************/
+
+#pragma once
+
+#include "firestarter/AlignedAlloc.hpp"
+#include "firestarter/DumpRegisterStruct.hpp"
+#include "firestarter/ErrorDetectionStruct.hpp"
+
+#include <memory>
+
+namespace firestarter {
+
+/// This struct is used to allocate the memory for the high-load routine.
+struct LoadWorkerMemory {
+private:
+  LoadWorkerMemory() = default;
+  ~LoadWorkerMemory() = default;
+
+  /// Function to deallocate the memory for this struct to be used with unique_ptr.
+  /// \arg Ptr The pointer to the memory
+  static void deallocate(void* Ptr) {
+    static_cast<LoadWorkerMemory*>(Ptr)->~LoadWorkerMemory();
+    AlignedAlloc::free(Ptr);
+  }
+
+public:
+  using UniquePtr = std::unique_ptr<LoadWorkerMemory, void (*)(void*)>;
+
+  /// The extra variables that are before the memory used for the calculation in the high-load routine. They are used
+  /// for optional FIRESTARTER features where further communication between the high-load routine is needed e.g., for
+  /// error detection or dumping registers.
+  struct ExtraLoadWorkerVariables {
+    /// The data for the dump registers functionality.
+    DumpRegisterStruct Drs;
+    /// The data for the error detection functionality.
+    ErrorDetectionStruct Eds;
+  } ExtraVars;
+
+  /// A placeholder to extract the address of the memory region with dynamic size which is used for the calculation in
+  /// the high-load routine. Do not write or read to this type directly.
+  EightBytesType DoNotUseAddrMem;
+
+  /// This padding makes sure that we are aligned to a cache line. The allocated memory will most probably reach beyond
+  /// this array.
+  std::array<EightBytesType, 7> DoNotUsePadding;
+
+  /// Get the pointer to the start of the memory used for computations.
+  /// \returns the pointer to the memory.
+  [[nodiscard]] auto getMemoryAddress() -> auto{
+    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
+    return reinterpret_cast<double*>(&DoNotUseAddrMem);
+  }
+
+  /// Get the offset to the memory which is used by the high-load functions
+  /// \returns the offset to the memory
+  [[nodiscard]] constexpr static auto getMemoryOffset() -> auto{ return offsetof(LoadWorkerMemory, DoNotUseAddrMem); }
+
+  /// Allocate the memory for the high-load thread on 64B cache line boundaries and return a unique_ptr.
+  /// \arg Bytes The number of bytes allocated for the array whose start address is returned by the getMemoryAddress
+  /// function.
+  /// \returns A unique_ptr to the memory for the high-load thread.
+  [[nodiscard]] static auto allocate(const std::size_t Bytes) -> UniquePtr {
+    // Allocate the memory for the ExtraLoadWorkerVariables (which are 64B aligned) and the data for the high-load
+    // routine which may not be 64B aligned.
+    static_assert(sizeof(ExtraLoadWorkerVariables) % 64 == 0,
+                  "ExtraLoadWorkerVariables is not a multiple of 64B i.e., multiple cachelines.");
+    auto* Ptr = AlignedAlloc::malloc(Bytes + sizeof(ExtraLoadWorkerVariables));
+    return {static_cast<LoadWorkerMemory*>(Ptr), deallocate};
+  }
+};
+
+} // namespace firestarter
diff --git a/include/firestarter/Logging/FirstWorkerThreadFilter.hpp b/include/firestarter/Logging/FirstWorkerThreadFilter.hpp
index af8b7ff1..4e501b2e 100644
--- a/include/firestarter/Logging/FirstWorkerThreadFilter.hpp
+++ b/include/firestarter/Logging/FirstWorkerThreadFilter.hpp
@@ -21,31 +21,29 @@
 
 #pragma once
 
-#include <nitro/log/log.hpp>
 #include <nitro/log/severity.hpp>
-
 #include <thread>
 
-namespace firestarter {
-
-namespace logging {
+namespace firestarter::logging {
 
+/// Logging filter for nitro to discard values that do not match a specific thread id.
 template <typename Record> class FirstWorkerThreadFilter {
 public:
-  typedef Record record_type;
+  using record_type = Record;
 
-  static void setFirstThread(std::thread::id newFirstThread) {
-    firstThread = newFirstThread;
-  }
+  /// Set the thread id from which records should not be discarded.
+  /// \arg NewFirstThread The specified thread.
+  static void setFirstThread(std::thread::id NewFirstThread) { FirstThread = NewFirstThread; }
 
-  bool filter(Record &r) const {
-    return r.std_thread_id() == firstThread ||
-           r.severity() >= nitro::log::severity_level::error;
+  /// Filter records. We keep records if they are from the specified thread or if the severity is at least error.
+  /// \arg R The record to filter.
+  /// \returns true if the record should be kept.
+  auto filter(Record& R) const -> bool {
+    return R.std_thread_id() == FirstThread || R.severity() >= nitro::log::severity_level::error;
   }
 
 private:
-  inline static std::thread::id firstThread{};
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+  inline static std::thread::id FirstThread{};
 };
-} // namespace logging
-
-} // namespace firestarter
+} // namespace firestarter::logging
diff --git a/include/firestarter/Logging/Log.hpp b/include/firestarter/Logging/Log.hpp
index f5b613c0..10090668 100644
--- a/include/firestarter/Logging/Log.hpp
+++ b/include/firestarter/Logging/Log.hpp
@@ -21,22 +21,19 @@
 
 #pragma once
 
-#include <firestarter/Logging/FirstWorkerThreadFilter.hpp>
-
-#include <nitro/log/log.hpp>
-#include <nitro/log/severity.hpp>
+#include "firestarter/Logging/FirstWorkerThreadFilter.hpp"
+#include "firestarter/SafeExit.hpp"
 
+#include <cstdlib>
+#include <iostream>
 #include <nitro/log/attribute/message.hpp>
 #include <nitro/log/attribute/severity.hpp>
 #include <nitro/log/attribute/std_thread_id.hpp>
 #include <nitro/log/attribute/timestamp.hpp>
-
 #include <nitro/log/filter/and_filter.hpp>
 #include <nitro/log/filter/severity_filter.hpp>
-
-#include <iomanip>
-#include <ios>
-#include <iostream>
+#include <nitro/log/log.hpp>
+#include <nitro/log/severity.hpp>
 #include <sstream>
 #include <string>
 
@@ -44,70 +41,81 @@ namespace firestarter {
 
 namespace logging {
 
+/// Sink to log Records with severity warn, error and fatal to stderr and all other Records to stdout. If a record
+/// has severity error or fatal we exit the program via safeExit(EXIT_FAILURE).
 class StdOut {
 public:
-  void sink(nitro::log::severity_level severity,
-            const std::string &formatted_record) {
-    switch (severity) {
+  static void sink(nitro::log::severity_level Severity, const std::string& FormattedRecord) {
+    switch (Severity) {
     case nitro::log::severity_level::warn:
     case nitro::log::severity_level::error:
     case nitro::log::severity_level::fatal:
-      std::cerr << formatted_record << std::endl << std::flush;
+      std::cerr << FormattedRecord << '\n' << std::flush;
       break;
     default:
-      std::cout << formatted_record << std::endl << std::flush;
+      std::cout << FormattedRecord << '\n' << std::flush;
       break;
     }
+
+    // Exit on error or fatal
+    if (Severity == nitro::log::severity_level::error || Severity == nitro::log::severity_level::fatal) {
+      safeExit(EXIT_FAILURE);
+    }
   }
 };
 
-using record = nitro::log::record<
-    nitro::log::severity_attribute, nitro::log::message_attribute,
-    nitro::log::timestamp_attribute, nitro::log::std_thread_id_attribute>;
+// NOLINTBEGIN(readability-identifier-naming)
+// The class may not be named Record since this is used as a template argument name in nitro which will cause errors
+// when compiling with MSC.
+using record = nitro::log::record<nitro::log::severity_attribute, nitro::log::message_attribute,
+                                  nitro::log::timestamp_attribute, nitro::log::std_thread_id_attribute>;
+// NOLINTEND(readability-identifier-naming)
 
-template <typename Record> class formater {
+template <typename Record>
+// NOLINTBEGIN(readability-identifier-naming)
+// The class may not be named Formater since this is used as a template argument name in nitro which will cause errors
+// when compiling with MSC. We will also write it with lower case and the correct spelling in case it gets renamed
+// correctly there.
+/// Format Record and add a string representing the severity in front.
+class formatter {
+  // NOLINTEND(readability-identifier-naming)
 public:
-  std::string format(Record &r) {
-    std::stringstream s;
+  auto format(Record& R) -> std::string {
+    std::stringstream S;
 
-    switch (r.severity()) {
+    switch (R.severity()) {
     case nitro::log::severity_level::warn:
-      s << "Warning: ";
+      S << "Warning: ";
       break;
     case nitro::log::severity_level::error:
-      s << "Error: ";
+      S << "Error: ";
       break;
     case nitro::log::severity_level::fatal:
-      s << "Fatal: ";
+      S << "Fatal: ";
       break;
     case nitro::log::severity_level::trace:
-      s << "Debug: ";
+      S << "Debug: ";
       break;
     default:
       break;
     }
 
-    s << r.message();
+    S << R.message();
 
-    return s.str();
+    return S.str();
   }
 };
 
-template <typename Record>
-using filter = nitro::log::filter::severity_filter<Record>;
+template <typename Record> using Filter = nitro::log::filter::severity_filter<Record>;
 
 template <typename Record>
-using workerFilter =
-    nitro::log::filter::and_filter<filter<Record>,
-                                   FirstWorkerThreadFilter<Record>>;
+using WorkerFilter = nitro::log::filter::and_filter<Filter<Record>, FirstWorkerThreadFilter<Record>>;
 
 } // namespace logging
 
-using log = nitro::log::logger<logging::record, logging::formater,
-                               firestarter::logging::StdOut, logging::filter>;
+using log = nitro::log::logger<logging::record, logging::formatter, firestarter::logging::StdOut, logging::Filter>;
 
 using workerLog =
-    nitro::log::logger<logging::record, logging::formater,
-                       firestarter::logging::StdOut, logging::workerFilter>;
+    nitro::log::logger<logging::record, logging::formatter, firestarter::logging::StdOut, logging::WorkerFilter>;
 
 } // namespace firestarter
diff --git a/include/firestarter/Measurement/MeasurementWorker.hpp b/include/firestarter/Measurement/MeasurementWorker.hpp
index 4fc8a6a1..a25c8da3 100644
--- a/include/firestarter/Measurement/MeasurementWorker.hpp
+++ b/include/firestarter/Measurement/MeasurementWorker.hpp
@@ -21,95 +21,118 @@
 
 #pragma once
 
-#include <firestarter/Logging/Log.hpp>
-#include <firestarter/Measurement/Summary.hpp>
-#include <firestarter/Measurement/TimeValue.hpp>
+#include "firestarter/Measurement/Metric/IPCEstimate.hpp"
+#include "firestarter/Measurement/Metric/Perf.hpp"
+#include "firestarter/Measurement/Metric/RAPL.hpp"
+#include "firestarter/Measurement/MetricInterface.h"
+#include "firestarter/Measurement/Summary.hpp"
+#include "firestarter/Measurement/TimeValue.hpp"
+#include "firestarter/WindowsCompat.hpp" // IWYU pragma: keep
 
 #include <chrono>
 #include <map>
 #include <mutex>
 
-extern "C" {
-#include <firestarter/Measurement/Metric/IPCEstimate.h>
-#include <firestarter/Measurement/Metric/Perf.h>
-#include <firestarter/Measurement/Metric/RAPL.h>
-#include <firestarter/Measurement/MetricInterface.h>
-
-#include <pthread.h>
-}
-
-void insertCallback(void *cls, const char *metricName, int64_t timeSinceEpoch,
-                    double value);
+void insertCallback(void* Cls, const char* MetricName, int64_t TimeSinceEpoch, double Value);
 
 namespace firestarter::measurement {
 
+/// This class handles the management of metrics, acquisition of metric data and provides summaries of a time range of
+/// metric values.
 class MeasurementWorker {
 private:
-  pthread_t workerThread;
-  pthread_t stdinThread;
+  /// The thread that handles the values that are read from metrics
+  pthread_t WorkerThread{};
+  /// The thread that handles the metric values that are read from stdin
+  pthread_t StdinThread{};
 
-  std::vector<metric_interface_t *> metrics = {
-      &rapl_metric, &perf_ipc_metric, &perf_freq_metric, &ipc_estimate_metric};
+  /// The vector of metrics that are available. Currently the following metrics are builtin: sysfs-powercap-rapl,
+  /// perf-ipc, perf-freq and ipc-estimate. Metrics provided through shared libraries are added to this list.
+  std::vector<const MetricInterface*> Metrics = {&RaplMetric, &PerfIpcMetric, &PerfFreqMetric, &IpcEstimateMetric};
 
-  std::mutex values_mutex;
-  std::map<std::string, std::vector<TimeValue>> values = {};
+  /// Mutex to access the Values map.
+  std::mutex ValuesMutex;
+  /// Map from metric name to the vector of timevalues of this metric.
+  std::map<std::string, std::vector<TimeValue>> Values;
 
-  static int *dataAcquisitionWorker(void *measurementWorker);
+  /// The thread function handles the timed polling of the metric values and saves them to the Value datastructure.
+  static auto dataAcquisitionWorker(void* MeasurementWorker) -> void*;
 
-  static int *stdinDataAcquisitionWorker(void *measurementWorker);
+  /// The thread function that handles the acquisition of the metric values from stdin and saves them to the Value
+  /// datastructure.
+  static auto stdinDataAcquisitionWorker(void* MeasurementWorker) -> void*;
 
-  const metric_interface_t *findMetricByName(std::string metricName);
+  /// Return the pointer to a metric from the Metrics vector that matches the supplied name.
+  /// \arg MetricName The name of the metric
+  /// \returns the pointer to the metric with the specified name or a nullptr
+  auto findMetricByName(std::string MetricName) -> const MetricInterface*;
 
-  std::chrono::milliseconds updateInterval;
+  /// We poll the values of all the metrics after this number of milliseconds.
+  std::chrono::milliseconds UpdateInterval;
 
-  std::chrono::high_resolution_clock::time_point startTime;
+  /// The start time of the measurement that should be summarized with the getValues function.
+  std::chrono::high_resolution_clock::time_point StartTime;
 
-  // some metric values have to be devided by this
-  const unsigned long long numThreads;
+  /// The number of threads FIRESTARTER runs with. This is required by some metrics.
+  const uint64_t NumThreads;
 
-  std::string availableMetricsString;
+  std::string AvailableMetricsString;
 
 #ifndef FIRESTARTER_LINK_STATIC
-  std::vector<void *> _metricDylibs = {};
+  /// The pointer to the metrics that are used for dynamic libraries. We need to save them separately here to call
+  /// dlclose later.
+  std::vector<void*> MetricDylibs;
 #endif
 
-  std::vector<std::string> _stdinMetrics = {};
+  /// The name of the metrics that are supplied from stdin.
+  std::vector<std::string> StdinMetrics;
 
 public:
-  // creates the worker thread
-  MeasurementWorker(std::chrono::milliseconds updateInterval,
-                    unsigned long long numThreads,
-                    std::vector<std::string> const &metricDylibs,
-                    std::vector<std::string> const &stdinMetrics);
-
-  // stops the worker threads
+  /// Initialize the measurement worker. It will spawn the threads for the polling of metric values.
+  /// \arg UpdateInterval The polling time for metric updates.
+  /// \arg NumThreads The number of threads FIRESTARTER is running with.
+  /// \arg MetricDylibsNames The vector of files which are passed to dlopen for using additional metrics from shared
+  /// libraries.
+  /// \arg StdinMetricsNames The vector of metric names that should be read in from stdin
+  MeasurementWorker(std::chrono::milliseconds UpdateInterval, uint64_t NumThreads,
+                    std::vector<std::string> const& MetricDylibsNames,
+                    std::vector<std::string> const& StdinMetricsNames);
+
+  /// Stops the worker threads
   ~MeasurementWorker();
 
-  std::string const &availableMetrics() const {
-    return this->availableMetricsString;
-  }
+  /// Get the formatting table of all metrics and if they are available
+  [[nodiscard]] auto availableMetrics() const -> std::string const& { return this->AvailableMetricsString; }
 
-  std::vector<std::string> const &stdinMetrics() { return _stdinMetrics; }
+  /// Get the names of all metrics that are read from stdin. Read-only accessor, hence const and [[nodiscard]].
+  [[nodiscard]] auto stdinMetrics() const -> std::vector<std::string> const& { return StdinMetrics; }
 
-  // returns a list of metrics
-  std::vector<std::string> metricNames();
+  /// Get the names of the metrics. This includes all metrics: builtins, from dynamic libraries and metrics from stdin.
+  auto metricNames() -> std::vector<std::string>;
 
-  // setup the selected metrics
-  // returns a vector with the names of inialized metrics
-  std::vector<std::string>
-  initMetrics(std::vector<std::string> const &metricNames);
+  /// Initialize the metrics with the provided names.
+  /// \arg MetricNames The metrics to initialize
+  /// \returns The vector of metrics that were successfully initialized.
+  auto initMetrics(std::vector<std::string> const& MetricNames) -> std::vector<std::string>;
 
-  // callback function for metrics
-  void insertCallback(const char *metricName, int64_t timeSinceEpoch,
-                      double value);
+  /// This function inserts a time value pair for a specific metric. This function will be provided to metrics to allow
+  /// them to push time value pairs.
+  /// \arg MetricName The name of the metric for which values are inserted
+  /// \arg TimeSinceEpoch The time since epoch of the time value pair
+  /// \arg Value The value of the time value pair
+  void insertCallback(const char* MetricName, int64_t TimeSinceEpoch, double Value);
 
-  // start the measurement
+  /// Set the StartTime to the current timestep
   void startMeasurement();
 
-  // get the measurement values begining from measurement start until now.
-  std::map<std::string, Summary> getValues(
-      std::chrono::milliseconds startDelta = std::chrono::milliseconds::zero(),
-      std::chrono::milliseconds stopDelta = std::chrono::milliseconds::zero());
+  /// Get the measurement values beginning from measurement start (set with startMeasurement) until the measurement
+  /// stop (now).
+  /// \arg StartDelta The time to skip from the measurement start
+  /// \arg StopDelta The time to skip from the measurement stop
+  /// \returns The map from all metrics to their respective summaries.
+  auto getValues(std::chrono::milliseconds StartDelta = std::chrono::milliseconds::zero(),
+                 std::chrono::milliseconds StopDelta = std::chrono::milliseconds::zero())
+      -> std::map<std::string, Summary>;
 };
 
 } // namespace firestarter::measurement
diff --git a/include/firestarter/Measurement/Metric/IPCEstimate.hpp b/include/firestarter/Measurement/Metric/IPCEstimate.hpp
new file mode 100644
index 00000000..52bc9cdb
--- /dev/null
+++ b/include/firestarter/Measurement/Metric/IPCEstimate.hpp
@@ -0,0 +1,91 @@
+/******************************************************************************
+ * FIRESTARTER - A Processor Stress Test Utility
+ * Copyright (C) 2021 TU Dresden, Center for Information Services and High
+ * Performance Computing
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/\>.
+ *
+ * Contact: daniel.hackenberg@tu-dresden.de
+ *****************************************************************************/
+
+#pragma once
+
+#include "firestarter/Measurement/MetricInterface.h"
+
+#include <string>
+
+/// The wrapper for the C interface to the IpcEstimateMetric metric.
+struct IpcEstimateMetricData {
+private:
+  IpcEstimateMetricData() = default;
+
+  /// The error string of this metric.
+  std::string ErrorString;
+
+  /// The saved callback to push the metric value.
+  void (*Callback)(void*, const char*, int64_t, double){};
+  /// The saved first argument for the callback.
+  void* CallbackArg{};
+
+public:
+  IpcEstimateMetricData(IpcEstimateMetricData const&) = delete;
+  void operator=(IpcEstimateMetricData const&) = delete;
+
+  /// Get the single, function-local static instance of this metric.
+  static auto instance() -> IpcEstimateMetricData& {
+    static IpcEstimateMetricData Instance;
+    return Instance;
+  }
+
+  /// Deinit the metric.
+  /// \returns EXIT_SUCCESS on success.
+  static auto fini() -> int32_t;
+
+  /// Init the metric.
+  /// \returns EXIT_SUCCESS on success.
+  static auto init() -> int32_t;
+
+  /// Get error in case return code not being EXIT_SUCCESS.
+  /// \returns The error string.
+  static auto getError() -> const char*;
+
+  /// The first argument is the function pointer to the callback. The first argument to this function pointer needs to
+  /// be filled with the second argument to this function.
+  /// The supplied function pointer needs to be called with the metric name for the second, a unix timestamp (time
+  /// since epoch) for the third and a metric value for the fourth argument. This allows the metric to provide values
+  /// in a pushing way in contrast to the pulling way of the GetReading function.
+  static auto registerInsertCallback(void (*C)(void*, const char*, int64_t, double), void* Arg) -> int32_t;
+
+  /// Push a value with the current timestamp.
+  /// \arg Value The metric value to push.
+  static void insertValue(double Value);
+};
+
+/// This metric provides the IPC estimate based on the estimated number of instructions and the runtime of the high
+/// load loop. The metric value is dependent on the frequency of the processor. It serves as an estimation of the IPC
+/// times the processor frequency.
+static constexpr const MetricInterface IpcEstimateMetric{
+    /*Name=*/"ipc-estimate",
+    /*Type=*/
+    {/*Absolute=*/1, /*Accumalative=*/0, /*DivideByThreadCount=*/0, /*InsertCallback=*/1, /*IgnoreStartStopDelta=*/1,
+     /*Reserved=*/0},
+    /*Unit=*/"IPC",
+    /*CallbackTime=*/0,
+    /*Callback=*/nullptr,
+    /*Init=*/IpcEstimateMetricData::init,
+    /*Fini=*/IpcEstimateMetricData::fini,
+    /*GetReading=*/nullptr,
+    /*GetError=*/IpcEstimateMetricData::getError,
+    /*RegisterInsertCallback=*/IpcEstimateMetricData::registerInsertCallback,
+};
\ No newline at end of file
diff --git a/include/firestarter/Measurement/Metric/Perf.hpp b/include/firestarter/Measurement/Metric/Perf.hpp
new file mode 100644
index 00000000..8e0e14c7
--- /dev/null
+++ b/include/firestarter/Measurement/Metric/Perf.hpp
@@ -0,0 +1,142 @@
+/******************************************************************************
+ * FIRESTARTER - A Processor Stress Test Utility
+ * Copyright (C) 2020-2023 TU Dresden, Center for Information Services and High
+ * Performance Computing
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/\>.
+ *
+ * Contact: daniel.hackenberg@tu-dresden.de
+ *****************************************************************************/
+
+#pragma once
+
+#include "firestarter/Measurement/MetricInterface.h"
+
+#include <array>
+#include <string>
+
+/// The wrapper for the C interface to the PerfIpcMetric and PerfFreqMetric metrics.
+class PerfMetricData {
+private:
+  PerfMetricData() = default;
+
+  static const constexpr char* PerfEventParanoidFile = "/proc/sys/kernel/perf_event_paranoid";
+
+  /// The datastructure that is read from the file descriptor provided by the perf_event_open syscall.
+  struct ReadFormat {
+    struct ValueAndId {
+      uint64_t Value;
+      uint64_t Id;
+    };
+
+    uint64_t Nr;
+    std::array<ValueAndId, 2> Values;
+  };
+
+  /// The error string of this metric.
+  std::string ErrorString;
+
+  /// The file descriptor of the perf_event_open syscall for the PERF_COUNT_HW_CPU_CYCLES event. This file descriptor
+  /// acts as the group for the other file descriptor.
+  int CpuCyclesFd = -1;
+  /// The file descriptor of the perf_event_open syscall for the PERF_COUNT_HW_INSTRUCTIONS event.
+  int InstructionsFd = -1;
+  /// The PERF_EVENT_IOC_ID for the cpu cycles file descriptor.
+  uint64_t CpuCyclesId{};
+  /// The PERF_EVENT_IOC_ID for the cpu instruction file descriptor.
+  uint64_t InstructionsId{};
+
+  /// The flag that stops init from being executed multiple times.
+  bool InitDone = false;
+  /// The value that is returned if the init function is called multiple times.
+  int32_t InitValue{};
+
+  /// Save the last read metric for the perf-ipc metric. This value will be updated when the perf-ipc metric is read.
+  struct ReadFormat Last {};
+
+  /// Get a reading of the perf-freq and perf-ipc metric. Pointers can be nullptr.
+  /// \arg IpcValue The pointer to which the value for ipc metric value will be saved.
+  /// \arg FreqValue The pointer to which the value for freq metric value will be saved.
+  /// \returns EXIT_SUCCESS if we got a new value.
+  static auto getReading(double* IpcValue, double* FreqValue) -> int32_t;
+
+public:
+  PerfMetricData(PerfMetricData const&) = delete;
+  void operator=(PerfMetricData const&) = delete;
+
+  /// Get the single, function-local static instance of this metric.
+  static auto instance() -> PerfMetricData& {
+    static PerfMetricData Instance;
+    return Instance;
+  }
+
+  /// Deinit the metric.
+  /// \returns EXIT_SUCCESS on success.
+  static auto fini() -> int32_t;
+
+  /// Init the metric.
+  /// \returns EXIT_SUCCESS on success.
+  static auto init() -> int32_t;
+
+  /// Read the value for a specific PERF_EVENT_IOC_ID out of the ReadFormat datastructure.
+  /// \arg Reader The ReadFormat datastructure from which the value will be extracted.
+  /// \arg Id The PERF_EVENT_IOC_ID of the metric that should be read.
+  static auto valueFromId(struct ReadFormat* Reader, uint64_t Id) -> uint64_t;
+
+  /// Get a reading of the perf-ipc metric.
+  /// \arg Value The pointer to which the value will be saved.
+  /// \returns EXIT_SUCCESS if we got a new value.
+  static auto getReadingIpc(double* Value) -> int32_t;
+
+  /// Get a reading of the perf-freq metric.
+  /// \arg Value The pointer to which the value will be saved.
+  /// \returns EXIT_SUCCESS if we got a new value.
+  static auto getReadingFreq(double* Value) -> int32_t;
+
+  /// Get error in case return code not being EXIT_SUCCESS.
+  /// \returns The error string.
+  static auto getError() -> const char*;
+};
+
+/// This metric provides IPC measurement of the program and all associated threads.
+static constexpr const MetricInterface PerfIpcMetric{
+    /*Name=*/"perf-ipc",
+    /*Type=*/
+    {/*Absolute=*/1, /*Accumalative=*/0, /*DivideByThreadCount=*/0, /*InsertCallback=*/0, /*IgnoreStartStopDelta=*/0,
+     /*Reserved=*/0},
+    /*Unit=*/"IPC",
+    /*CallbackTime=*/0,
+    /*Callback=*/nullptr,
+    /*Init=*/PerfMetricData::init,
+    /*Fini=*/PerfMetricData::fini,
+    /*GetReading=*/PerfMetricData::getReadingIpc,
+    /*GetError=*/PerfMetricData::getError,
+    /*RegisterInsertCallback=*/nullptr,
+};
+
+/// This metric provides a frequency measurement of the CPUs that are used to execute the program.
+static constexpr const MetricInterface PerfFreqMetric{
+    /*Name=*/"perf-freq",
+    /*Type=*/
+    {/*Absolute=*/0, /*Accumalative=*/1, /*DivideByThreadCount=*/1, /*InsertCallback=*/0, /*IgnoreStartStopDelta=*/0,
+     /*Reserved=*/0},
+    /*Unit=*/"GHz",
+    /*CallbackTime=*/0,
+    /*Callback=*/nullptr,
+    /*Init=*/PerfMetricData::init,
+    /*Fini=*/PerfMetricData::fini,
+    /*GetReading=*/PerfMetricData::getReadingFreq,
+    /*GetError=*/PerfMetricData::getError,
+    /*RegisterInsertCallback=*/nullptr,
+};
\ No newline at end of file
diff --git a/include/firestarter/Measurement/Metric/RAPL.hpp b/include/firestarter/Measurement/Metric/RAPL.hpp
new file mode 100644
index 00000000..59d4a822
--- /dev/null
+++ b/include/firestarter/Measurement/Metric/RAPL.hpp
@@ -0,0 +1,108 @@
+/******************************************************************************
+ * FIRESTARTER - A Processor Stress Test Utility
+ * Copyright (C) 2020-2023 TU Dresden, Center for Information Services and High
+ * Performance Computing
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/\>.
+ *
+ * Contact: daniel.hackenberg@tu-dresden.de
+ *****************************************************************************/
+
+#pragma once
+
+#include "firestarter/Measurement/MetricInterface.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+/// The wrapper for the C interface to the RaplMetric metric.
+class RaplMetricData {
+private:
+  /// Datastructure to hold the path of the sysfs rapl entry, the last reading (important to detect overflows), the
+  /// counter of the number of overflows and the maximum value that the reading will have.
+  struct ReaderDef {
+    ReaderDef() = delete;
+
+    ReaderDef(std::string Path, int64_t LastReading, int64_t Overflow, int64_t Max)
+        : Path(std::move(Path))
+        , LastReading(LastReading)
+        , Overflow(Overflow)
+        , Max(Max) {}
+
+    std::string Path;
+    int64_t LastReading;
+    int64_t Overflow;
+    int64_t Max;
+  };
+
+  /// The path to the sysfs rapl entries.
+  static constexpr const char* RaplPath = "/sys/class/powercap";
+
+  /// The error string of this metric.
+  std::string ErrorString;
+
+  /// The vector of readers that hold the path and read values from the sysfs rapl.
+  std::vector<std::unique_ptr<ReaderDef>> Readers;
+
+  RaplMetricData() = default;
+
+public:
+  RaplMetricData(RaplMetricData const&) = delete;
+  void operator=(RaplMetricData const&) = delete;
+
+  /// Get the single, function-local static instance of this metric.
+  static auto instance() -> RaplMetricData& {
+    static RaplMetricData Instance;
+    return Instance;
+  }
+
+  /// Deinit the metric.
+  /// \returns EXIT_SUCCESS on success.
+  static auto fini() -> int32_t;
+
+  /// Init the metric.
+  /// \returns EXIT_SUCCESS on success.
+  static auto init() -> int32_t;
+
+  /// Get a reading of the sysfs-powercap-rapl metric.
+  /// \arg Value The pointer to which the value will be saved.
+  /// \returns EXIT_SUCCESS if we got a new value.
+  static auto getReading(double* Value) -> int32_t;
+
+  /// Get error in case return code not being EXIT_SUCCESS.
+  /// \returns The error string.
+  static auto getError() -> const char*;
+
+  /// This function should be called every 30s. It will make sure that we do not miss an overflow of a counter and
+  /// therefore get a wrong reading.
+  static void callback();
+};
+
+/// This metric provides power measurements through the RAPL interface. Either psys measurement is chosen or if this is
+/// not available the sum of packages and drams.
+static constexpr const MetricInterface RaplMetric{
+    /*Name=*/"sysfs-powercap-rapl",
+    /*Type=*/
+    {/*Absolute=*/0, /*Accumalative=*/1, /*DivideByThreadCount=*/0, /*InsertCallback=*/0, /*IgnoreStartStopDelta=*/0,
+     /*Reserved=*/0},
+    /*Unit=*/"J",
+    /*CallbackTime=*/30000000,
+    /*Callback=*/RaplMetricData::callback,
+    /*Init=*/RaplMetricData::init,
+    /*Fini=*/RaplMetricData::fini,
+    /*GetReading=*/RaplMetricData::getReading,
+    /*GetError=*/RaplMetricData::getError,
+    /*RegisterInsertCallback=*/nullptr,
+};
\ No newline at end of file
diff --git a/include/firestarter/Measurement/MetricInterface.h b/include/firestarter/Measurement/MetricInterface.h
index dbea19e8..03f4872c 100644
--- a/include/firestarter/Measurement/MetricInterface.h
+++ b/include/firestarter/Measurement/MetricInterface.h
@@ -21,65 +21,84 @@
 
 #pragma once
 
+/// This file provides a C style interface to write metrics for FIRESTARTER and provide them as shared libraries.
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// NOLINTNEXTLINE(modernize-deprecated-headers)
 #include <stdint.h>
 
-// clang-format off
+// NOLINTBEGIN(modernize-use-using)
+
+/// Describe the type of the metric and how values need to be accumulated. Per default metrics are of pulling type where
+/// FIRESTARTER will pull the values through the GetReading function.
 typedef struct {
-  // Either set absolute or accumalative to specify the type of values from the
-  // metric.
-  uint32_t absolute : 1,
-           accumalative : 1,
-           // Set to divide metric values by thread count.
-           divide_by_thread_count : 1,
-           // Set to insert time-value pairs via callback function passed by
-           // register_insert_callback.
-           insert_callback : 1,
-					 // ignore the start and stop delta set by the user
-					 ignore_start_stop_delta : 1,
-           __reserved : 27;
-} metric_type_t;
-// clang-format on
-
-// Define `metric_interface_t metric` inside your shared library to be able to
-// load it during runtime.
+  uint32_t
+      /// Set this to 1 if the metric values provided are absolute.
+      Absolute : 1,
+      /// Set this to 1 if the metric values provided are accumulative.
+      Accumalative : 1,
+      /// Set this to 1 if the metric value needs to be divided by the number of threads.
+      DivideByThreadCount : 1,
+      /// Set this to 1 if the metric will provide time-value data in a pushing way through the RegisterInsertCallback
+      /// function.
+      InsertCallback : 1,
+      /// Set this to 1 if the accumulation of the metric should ignore the start/stop delta which are specified by the
+      /// user of FIRESTARTER.
+      IgnoreStartStopDelta : 1,
+      /// Reserved space to fill 32 bits
+      Reserved : 27;
+} MetricType;
+
+/// Define `MetricInterface Metric` inside your shared library to be able to load it during runtime.
 typedef struct {
-  // the name of the metric
-  const char *name;
+  /// The name of the metric
+  const char* Name;
+
+  /// Describes what the value of the metrics represents and how it needs to be accumulated.
+  MetricType Type;
 
-  // metric type with bitfield from metric_type_t
-  metric_type_t type;
+  /// The unit of the metric
+  const char* Unit;
 
-  // the unit of the metric
-  const char *unit;
+  /// The time in usecs after which the callback should be called again. Set to 0 to disable.
+  uint64_t CallbackTime;
 
-  uint64_t callback_time;
+  /// This function will be called every `CallbackTime` usecs. Disable by setting `CallbackTime` to 0.
+  void (*Callback)();
 
-  // This function will be called every `callback_time` usecs. Disable by
-  // setting `callback_time` to 0.
-  void (*callback)(void);
+  /// init the metric.
+  /// \returns EXIT_SUCCESS on success.
+  int32_t (*Init)();
 
-  // init the metric.
-  // returns EXIT_SUCCESS on success.
-  int32_t (*init)(void);
+  /// deinit the metric.
+  /// \returns EXIT_SUCCESS on success.
+  int32_t (*Fini)();
 
-  // deinit the metric.
-  // returns EXIT_SUCCESS on success.
-  int32_t (*fini)(void);
+  /// Get a reading of the metric. Set this function pointer to null if MetricType::InsertCallback is specified in the
+  /// Type.
+  /// \arg Value The pointer to which the value will be saved.
+  /// \returns EXIT_SUCCESS if we got a new value.
+  int32_t (*GetReading)(double* Value);
 
-  // Get a reading of the metric
-  // Return EXIT_SUCCESS if we got a new value.
-  // Set this function pointer to NULL if METRIC_INSERT_CALLBACK is specified.
-  int32_t (*get_reading)(double *value);
+  /// Get error in case return code not being EXIT_SUCCESS.
+  /// \returns The error string.
+  const char* (*GetError)();
 
-  // Get error in case return code not being EXIT_SUCCESS
-  const char *(*get_error)(void);
+  /// If MetricType::InsertCallback is specified in the Type this function will be used to pass the metric a callback
+  /// and the first argument to this callback.
+  /// The first argument is the function pointer to the callback. The first argument to this function pointer needs to
+  /// be filled with the second argument to this function.
+  /// The supplied function pointer needs to be called with the metric name for the second, a unix timestamp (time
+  /// since epoch) for the third and a metric value for the fourth argument. This allows the metric to provide values
+  /// in a pushing way in contrast to the pulling way of the GetReading function.
+  int32_t (*RegisterInsertCallback)(void (*)(void*, const char*, int64_t, double), void*);
 
-  // If METRIC_INSERT_CALLBACK is set in the type, this function will be passed
-  // a callback and the first argument for the callback.
-  // Further arguments of callback are the metric name, an unix timestamp (time
-  // since epoch) and a metric value.
-  int32_t (*register_insert_callback)(void (*)(void *, const char *, int64_t,
-                                               double),
-                                      void *);
+} MetricInterface;
+// NOLINTEND(modernize-use-using)
 
-} metric_interface_t;
+#ifdef __cplusplus
+} // extern "C"
+#endif
\ No newline at end of file
diff --git a/include/firestarter/Measurement/Summary.hpp b/include/firestarter/Measurement/Summary.hpp
index 23f819f0..05c5a925 100644
--- a/include/firestarter/Measurement/Summary.hpp
+++ b/include/firestarter/Measurement/Summary.hpp
@@ -21,30 +21,32 @@
 
 #pragma once
 
-#include <firestarter/Measurement/TimeValue.hpp>
+#include "firestarter/Measurement/MetricInterface.h"
+#include "firestarter/Measurement/TimeValue.hpp"
 
 #include <chrono>
 #include <nlohmann/json.hpp>
 #include <vector>
 
-extern "C" {
-#include <firestarter/Measurement/MetricInterface.h>
-}
-
 namespace firestarter::measurement {
 
+/// This struct summarizes multiple timevalues. The duration, number of time points, average and stddev are saved.
 struct Summary {
-
-  size_t num_timepoints;
-  std::chrono::milliseconds duration;
-
-  double average;
-  double stddev;
-
-  static Summary calculate(std::vector<TimeValue>::iterator begin,
-                           std::vector<TimeValue>::iterator end,
-                           metric_type_t metricType,
-                           unsigned long long numThreads);
+  size_t NumTimepoints;
+  std::chrono::milliseconds Duration;
+
+  double Average;
+  double Stddev;
+
+  /// Calculate the summary over a range of timevalues for a given metric and number of threads.
+  /// \arg Begin The start of the iterator
+  /// \arg End The end of the iterator
+  /// \arg MetricType This describes what each timevalue represents and how the metric needs to be calculated into a
+  /// summary.
+  /// \arg NumThreads The number of threads this metric was accumulated across.
+  /// \returns The summary over the range of timevalues from a specific metric.
+  static auto calculate(std::vector<TimeValue>::iterator Begin, std::vector<TimeValue>::iterator End,
+                        MetricType MetricType, uint64_t NumThreads) -> Summary;
 };
 
 } // namespace firestarter::measurement
diff --git a/include/firestarter/Measurement/TimeValue.hpp b/include/firestarter/Measurement/TimeValue.hpp
index eae3de23..8088385e 100644
--- a/include/firestarter/Measurement/TimeValue.hpp
+++ b/include/firestarter/Measurement/TimeValue.hpp
@@ -25,16 +25,16 @@
 
 namespace firestarter::measurement {
 
+/// This struct models a value that was captured at a specific timepoint.
 struct TimeValue {
-
   TimeValue() = default;
 
-  constexpr TimeValue(std::chrono::high_resolution_clock::time_point t,
-                      double v)
-      : time(t), value(v){};
+  constexpr TimeValue(std::chrono::high_resolution_clock::time_point Time, double Value)
+      : Time(Time)
+      , Value(Value){};
 
-  std::chrono::high_resolution_clock::time_point time;
-  double value;
+  std::chrono::high_resolution_clock::time_point Time;
+  double Value{};
 };
 
 } // namespace firestarter::measurement
diff --git a/include/firestarter/OneAPI/OneAPI.hpp b/include/firestarter/OneAPI/OneAPI.hpp
index cf939388..4022b8c4 100644
--- a/include/firestarter/OneAPI/OneAPI.hpp
+++ b/include/firestarter/OneAPI/OneAPI.hpp
@@ -21,32 +21,63 @@
 
 #pragma once
 
+#include "firestarter/Constants.hpp"
+
 #include <condition_variable>
-#include <mutex>
 #include <thread>
-#include <vector>
 
 namespace firestarter::oneapi {
 
+/// This class handles the workload on OneAPI compatible GPUs. A gemm routine is used to stress them with a
+/// constant high load. This header does not include any OneAPI specific headers to allow us to not guard the
+/// include of this header in other parts of the program.
 class OneAPI {
 private:
-  std::thread _initThread;
-  std::condition_variable _waitForInitCv;
-  std::mutex _waitForInitCvMutex;
+  /// The thread that is used to initialize the gpus. This thread will wait until each thread that runs the gemm routine
+  /// joins.
+  std::thread InitThread;
 
-  static void initGpus(std::condition_variable &cv,
-                       volatile unsigned long long *loadVar, bool useFloat,
-                       bool useDouble, unsigned matrixSize, int gpus);
+  /// Spawns a thread for each of the selected gpus, initializes them and starts the execution of the gemm in parallel.
+  /// \arg WaitForInitCv The condition variable used to signal that all gpus are initialized.
+  /// \arg LoadVar A reference to the variable that controls the current load of Firestarter.
+  /// \arg UseFloat Set to true if we want to stress using single precision floating points.
+  /// \arg UseDouble Set to true if we want to stress using double precision floating points. If neither UseFloat nor
+  /// UseDouble is set the precision will be chosen automatically.
+  /// \arg MatrixSize Set to a specific matrix size which will be chosen for the gemm operation or set to 0 for
+  /// automatic selection.
+  /// \arg Gpus Select the number of gpus to stress or -1 for all.
+  static void initGpus(std::condition_variable& WaitForInitCv, const volatile firestarter::LoadThreadWorkType& LoadVar,
+                       bool UseFloat, bool UseDouble, unsigned MatrixSize, int Gpus);
 
 public:
-  OneAPI(volatile unsigned long long *loadVar, bool useFloat, bool useDouble,
-       unsigned matrixSize, int gpus);
+  /// Initialize the OneAPI class. This will start a thread running the OneAPI::initGpus function and wait until all
+  /// gpus are initialized.
+  /// \arg LoadVar A reference to the variable that controls the current load of Firestarter.
+  /// \arg UseFloat Set to true if we want to stress using single precision floating points.
+  /// \arg UseDouble Set to true if we want to stress using double precision floating points. If neither UseFloat nor
+  /// UseDouble is set the precision will be chosen automatically.
+  /// \arg MatrixSize Set to a specific matrix size which will be chosen for the gemm operation or set to 0 for
+  /// automatic selection.
+  /// \arg Gpus Select the number of gpus to stress or -1 for all.
+  OneAPI(const volatile firestarter::LoadThreadWorkType& LoadVar, bool UseFloat, bool UseDouble, unsigned MatrixSize,
+         int Gpus)
+#if defined(FIRESTARTER_BUILD_ONEAPI)
+      ;
+#else
+  {
+    (void)&LoadVar;
+    (void)UseFloat;
+    (void)UseDouble;
+    (void)MatrixSize;
+    (void)Gpus;
+  }
+#endif
 
   ~OneAPI() {
-    if (_initThread.joinable()) {
-      _initThread.join();
+    if (InitThread.joinable()) {
+      InitThread.join();
     }
   }
 };
 
-} // namespace firestarter::oneapi
+} // namespace firestarter::oneapi
\ No newline at end of file
diff --git a/include/firestarter/Optimizer/Algorithm.hpp b/include/firestarter/Optimizer/Algorithm.hpp
index 14009183..be5d5961 100644
--- a/include/firestarter/Optimizer/Algorithm.hpp
+++ b/include/firestarter/Optimizer/Algorithm.hpp
@@ -21,19 +21,26 @@
 
 #pragma once
 
-#include <firestarter/Optimizer/Population.hpp>
+#include "firestarter/Optimizer/Population.hpp"
 
 namespace firestarter::optimizer {
 
+/// Abstract class to provide an interface for evolutionary optimization algorithms.
 class Algorithm {
 public:
-  Algorithm() {}
-  virtual ~Algorithm() {}
+  Algorithm() = default;
+  virtual ~Algorithm() = default;
 
-  virtual void checkPopulation(Population const &pop,
-                               std::size_t populationSize) = 0;
+  /// Check if the population size and the problem match the requirements of the algorithm. Asserts if these checks
+  /// fail.
+  /// \arg Prob The problem that should be optimized with this algorithm
+  /// \arg PopulationSize The initial size of the population that is used
+  virtual void check(Problem const& Prob, std::size_t PopulationSize) = 0;
 
-  virtual Population evolve(Population &pop) = 0;
+  /// Evolve the population across multiple iterations.
+  /// \arg Pop The initial population
+  /// \returns The final population after the optimization has run
+  virtual auto evolve(Population& Pop) -> Population = 0;
 };
 
 } // namespace firestarter::optimizer
diff --git a/include/firestarter/Optimizer/Algorithm/NSGA2.hpp b/include/firestarter/Optimizer/Algorithm/NSGA2.hpp
index c1825f73..6b395823 100644
--- a/include/firestarter/Optimizer/Algorithm/NSGA2.hpp
+++ b/include/firestarter/Optimizer/Algorithm/NSGA2.hpp
@@ -21,25 +21,46 @@
 
 #pragma once
 
-#include <firestarter/Optimizer/Algorithm.hpp>
+#include "firestarter/Optimizer/Algorithm.hpp"
 
 namespace firestarter::optimizer::algorithm {
 
+/// This class implements the NSGA2 evolutionary optimization algorithm.
+/// The NSGA2 algorithm, as described in "A fast and elitist multiobjective genetic algorithm: NSGA-II"
+/// (https://dl.acm.org/doi/10.1109/4235.996017), is a multiobjective algorithm allowing FIRESTARTER to optimize with
+/// two (or more) metrics. This is relevant because adding the IPC (instruction per cycle) metric supports the
+/// optimization algorithm to converge towards higher power consumption.
 class NSGA2 : public Algorithm {
 public:
-  NSGA2(unsigned gen, double cr, double m);
-  ~NSGA2() {}
+  /// Initialize the NSGA2 algorithm.
+  /// \arg Gen The number of generations that the algorithm uses to evolve its population.
+  /// \arg Cr The Crossover probability. Must be in range [0,1[
+  /// \arg M Mutation probability. Must be in range [0,1]
+  NSGA2(unsigned Gen, double Cr, double M);
+  ~NSGA2() override = default;
 
-  void checkPopulation(firestarter::optimizer::Population const &pop,
-                       std::size_t populationSize) override;
+  /// Check if the problem and population size match the requirements of NSGA2. We must have a multi-objective problem
+  /// and at least 5 and a multiple of 4 individuals in our population.
+  /// \arg Prob The problem that should be optimized with this algorithm
+  /// \arg PopulationSize The initial size of the population that is used
+  void check(firestarter::optimizer::Problem const& Prob, std::size_t PopulationSize) override;
 
-  firestarter::optimizer::Population
-  evolve(firestarter::optimizer::Population &pop) override;
+  /// Evolve the population across multiple iterations.
+  /// \arg Pop The initial population
+  /// \returns The final population after the optimization has run
+  auto evolve(firestarter::optimizer::Population& Pop) -> firestarter::optimizer::Population override;
 
 private:
-  unsigned _gen;
-  double _cr;
-  double _m;
+  // NOLINTBEGIN(cppcoreguidelines-avoid-const-or-ref-data-members)
+
+  /// The number of generations of the NSGA2 algorithm.
+  const unsigned Gen;
+  /// The crossover probability in the range [0,1[.
+  const double Cr;
+  /// The mutation probability in the range [0,1].
+  const double M;
+
+  // NOLINTEND(cppcoreguidelines-avoid-const-or-ref-data-members)
 };
 
 } // namespace firestarter::optimizer::algorithm
diff --git a/include/firestarter/Optimizer/History.hpp b/include/firestarter/Optimizer/History.hpp
index 9dec066d..10d635c1 100644
--- a/include/firestarter/Optimizer/History.hpp
+++ b/include/firestarter/Optimizer/History.hpp
@@ -21,10 +21,11 @@
 
 #pragma once
 
-#include <firestarter/Json/Summary.hpp>
-#include <firestarter/Logging/Log.hpp>
-#include <firestarter/Measurement/Summary.hpp>
-#include <firestarter/Optimizer/Individual.hpp>
+#include "firestarter/Json/Summary.hpp" // IWYU pragma: keep
+#include "firestarter/Logging/Log.hpp"
+#include "firestarter/Measurement/Summary.hpp"
+#include "firestarter/Optimizer/Individual.hpp"
+#include "firestarter/WindowsCompat.hpp" // IWYU pragma: keep
 
 #include <algorithm>
 #include <cassert>
@@ -32,291 +33,313 @@
 #include <ctime>
 #include <fstream>
 #include <iomanip>
-#include <iostream>
+#include <memory>
 #include <nlohmann/json.hpp>
 #include <optional>
-#include <tuple>
 #include <vector>
 
-extern "C" {
-#include <unistd.h>
-}
-
 namespace firestarter::optimizer {
 
+/// Singleton that keeps track of the history of evaluated individuals and their associated metric summaries.
 struct History {
 private:
-  // https://stackoverflow.com/questions/17074324/how-can-i-sort-two-vectors-in-the-same-way-with-criteria-that-uses-only-one-of/17074810#17074810
-  template <typename T, typename Compare>
-  inline static std::vector<std::size_t>
-  sortPermutation(const std::vector<T> &vec, Compare &compare) {
-    std::vector<std::size_t> p(vec.size());
-    std::iota(p.begin(), p.end(), 0);
-    std::sort(p.begin(), p.end(), [&](std::size_t i, std::size_t j) {
-      return compare(vec[i], vec[j]);
-    });
-    return p;
+  /// Find the permutation of a vector when sorting it with a supplied comparison function.
+  /// \tparam T The type of the vector elements
+  /// \tparam CompareT The type of the comparison function.
+  /// \arg Vec The const reference to vector that will be sorted.
+  /// \arg Compare The comparison function which will be used to sort the vector.
+  /// \returns The indices of how the vector would be sorted according to the comparison function.
+  template <typename T, typename CompareT>
+  static auto sortPermutation(const std::vector<T>& Vec, CompareT& Compare) -> std::vector<std::size_t> {
+    // https://stackoverflow.com/questions/17074324/how-can-i-sort-two-vectors-in-the-same-way-with-criteria-that-uses-only-one-of/17074810#17074810
+    std::vector<std::size_t> P(Vec.size());
+    std::iota(P.begin(), P.end(), 0);
+    std::sort(P.begin(), P.end(), [&](std::size_t I, std::size_t J) { return Compare(Vec[I], Vec[J]); });
+    return P;
   }
 
-  inline static void padding(std::stringstream &ss, std::size_t width,
-                             std::size_t taken, char c) {
-    for (std::size_t i = 0; i < (std::max)(width, taken) - taken; ++i) {
-      ss << c;
+  /// Add padding to a stringstream to fill it up to a maximum width.
+  /// \arg Ss The stringstream to add padding to.
+  /// \arg Width The maximum width until which should be padded.
+  /// \arg Taken The number of characters that are already filled up.
+  /// \arg C The character that should be used for padding.
+  static void padding(std::stringstream& Ss, std::size_t Width, std::size_t Taken, char C) {
+    for (std::size_t I = 0; I < (std::max)(Width, Taken) - Taken; ++I) {
+      Ss << C;
     }
   }
 
-  inline static int MAX_ELEMENT_PRINT_COUNT = 20;
-  inline static std::size_t MIN_COLUMN_WIDTH = 10;
+  /// The maximum number of elements that will be printed.
+  static constexpr const int MaxElementPrintCount = 20;
+  /// The minimum width of columns that are printed.
+  static constexpr const std::size_t MinColumnWidth = 10;
 
-  inline static std::vector<Individual> _x = {};
-  inline static std::vector<
-      std::map<std::string, firestarter::measurement::Summary>>
-      _f = {};
+  // NOLINTBEGIN(cppcoreguidelines-avoid-non-const-global-variables)
+  /// The vector of individuals that have been evaluated. This vector has the same size as F.
+  inline static std::vector<Individual> X = {};
+  /// The vector of metric summaries associated to the evaluated individuals. This vector has the same size as X.
+  inline static std::vector<std::map<std::string, firestarter::measurement::Summary>> F = {};
+  // NOLINTEND(cppcoreguidelines-avoid-non-const-global-variables)
 
 public:
-  inline static void append(
-      std::vector<unsigned> const &ind,
-      std::map<std::string, firestarter::measurement::Summary> const &metric) {
-    _x.push_back(ind);
-    _f.push_back(metric);
+  /// Append an evaluated individual to the history.
+  /// \arg Ind The individual to add.
+  /// \arg Metric The metric summaries for this individual.
+  static void append(std::vector<unsigned> const& Ind,
+                     std::map<std::string, firestarter::measurement::Summary> const& Metric) {
+    X.push_back(Ind);
+    F.push_back(Metric);
   }
 
-  inline static std::optional<
-      std::map<std::string, firestarter::measurement::Summary>>
-  find(std::vector<unsigned> const &individual) {
-    auto findEqual = [individual](auto const &ind) {
-      return ind == individual;
-    };
-    auto ind = std::find_if(_x.begin(), _x.end(), findEqual);
-    if (ind == _x.end()) {
+  /// Look up an individual in the history and return the metric summaries if it is in the history.
+  /// \arg Individual The individual which may already be evaluated.
+  /// \returns The metric summaries if the individual is in the history or std::nullopt otherwise.
+  static auto find(std::vector<unsigned> const& Individual)
+      -> std::optional<std::map<std::string, firestarter::measurement::Summary>> {
+    auto FindEqual = [&Individual](auto const& Ind) { return Ind == Individual; };
+    auto Ind = std::find_if(X.begin(), X.end(), FindEqual);
+    if (Ind == X.end()) {
       return {};
     }
-    auto dist = std::distance(_x.begin(), ind);
-    return _f[dist];
+    auto Dist = std::distance(X.begin(), Ind);
+    return F[Dist];
   }
 
-  inline static void
-  printBest(std::vector<std::string> const &optimizationMetrics,
-            std::vector<std::string> const &payloadItems) {
-    // TODO: print paretto front
+  /// Print the best individuals per metric. This will print a table with the average metric value and individuals per
+  /// metric.
+  /// \arg OptimizationMetrics The metrics for which the best individual should be printed.
+  /// \arg PayloadItems The instruction of the associated instruction groups used in the optimization.
+  static void printBest(std::vector<std::string> const& OptimizationMetrics,
+                        std::vector<std::string> const& PayloadItems) {
+    // TODO(Issue #76): print Pareto front
 
     // print the best 20 individuals for each metric in a format
     // where the user can give it to --run-instruction-groups directly
-    std::map<std::string, std::size_t> columnWidth;
+    std::map<std::string, std::size_t> ColumnWidth;
 
-    for (auto const &metric : optimizationMetrics) {
-      columnWidth[metric] = (std::max)(metric.size(), MIN_COLUMN_WIDTH);
-      firestarter::log::trace() << metric << ": " << columnWidth[metric];
+    for (auto const& Metric : OptimizationMetrics) {
+      ColumnWidth[Metric] = (std::max)(Metric.size(), MinColumnWidth);
+      firestarter::log::trace() << Metric << ": " << ColumnWidth[Metric];
     }
 
-    for (auto const &metric : optimizationMetrics) {
-      using SummaryMap =
-          std::map<std::string, firestarter::measurement::Summary>;
-      auto compareIndividual = [&metric](SummaryMap const &mapA,
-                                         SummaryMap const &mapB) {
-        auto summaryA = mapA.find(metric);
-        auto summaryB = mapB.find(metric);
-
-        if (summaryA == mapA.end() || summaryB == mapB.end()) {
-          summaryA = mapA.find(metric.substr(1));
-          summaryB = mapB.find(metric.substr(1));
-          assert(summaryA != mapA.end());
-          assert(summaryB != mapB.end());
-          return summaryA->second.average < summaryB->second.average;
+    for (auto const& Metric : OptimizationMetrics) {
+      using SummaryMap = std::map<std::string, firestarter::measurement::Summary>;
+      auto CompareIndividual = [&Metric](SummaryMap const& MapA, SummaryMap const& MapB) {
+        auto SummaryA = MapA.find(Metric);
+        auto SummaryB = MapB.find(Metric);
+
+        if (SummaryA == MapA.end() || SummaryB == MapB.end()) {
+          SummaryA = MapA.find(Metric.substr(1));
+          SummaryB = MapB.find(Metric.substr(1));
+          assert(SummaryA != MapA.end());
+          assert(SummaryB != MapB.end());
+          return SummaryA->second.Average < SummaryB->second.Average;
         }
 
-        assert(summaryA != mapA.end());
-        assert(summaryB != mapB.end());
-        return summaryA->second.average > summaryB->second.average;
+        assert(SummaryA != MapA.end());
+        assert(SummaryB != MapB.end());
+        return SummaryA->second.Average > SummaryB->second.Average;
       };
 
-      auto perm = sortPermutation(_f, compareIndividual);
+      auto Perm = sortPermutation(F, CompareIndividual);
 
-      auto formatIndividual =
-          [&payloadItems](std::vector<unsigned> const &individual) {
-            std::string result = "";
-            assert(payloadItems.size() == individual.size());
+      auto FormatIndividual = [&PayloadItems](std::vector<unsigned> const& Individual) {
+        std::string Result;
+        assert(PayloadItems.size() == Individual.size());
 
-            for (std::size_t i = 0; i < individual.size(); ++i) {
-              // skip zero values
-              if (individual[i] == 0) {
-                continue;
-              }
+        for (std::size_t I = 0; I < Individual.size(); ++I) {
+          // skip zero values
+          if (Individual[I] == 0) {
+            continue;
+          }
 
-              if (result.size() != 0) {
-                result += ",";
-              }
-              result += payloadItems[i] + ":" + std::to_string(individual[i]);
-            }
+          if (!Result.empty()) {
+            Result += ",";
+          }
+          Result += PayloadItems[I] + ":" + std::to_string(Individual[I]);
+        }
 
-            return result;
-          };
+        return Result;
+      };
 
-      auto begin = perm.begin();
-      auto end = perm.end();
+      auto Begin = Perm.begin();
+      auto End = Perm.end();
 
-      // stop printing at a max of MAX_ELEMENT_PRINT_COUNT
-      if (std::distance(begin, end) > MAX_ELEMENT_PRINT_COUNT) {
-        end = perm.begin();
-        std::advance(end, MAX_ELEMENT_PRINT_COUNT);
+      // stop printing at a max of MaxElementPrintCount
+      if (std::distance(Begin, End) > MaxElementPrintCount) {
+        End = Perm.begin();
+        std::advance(End, MaxElementPrintCount);
       }
 
       // print each of the best elements
-      std::size_t max = 0;
-      for (auto it = begin; it != end; ++it) {
-        max = (std::max)(max, formatIndividual(_x[*it]).size());
+      std::size_t Max = 0;
+      for (auto It = Begin; It != End; ++It) {
+        Max = (std::max)(Max, FormatIndividual(X[*It]).size());
       }
 
-      std::stringstream firstLine;
-      std::stringstream secondLine;
-      std::string ind = "INDIVIDUAL";
+      std::stringstream FirstLine;
+      std::stringstream SecondLine;
+      std::string const Ind = "INDIVIDUAL";
 
-      firstLine << "  " << ind;
-      padding(firstLine, max, ind.size(), ' ');
+      FirstLine << "  " << Ind;
+      padding(FirstLine, Max, Ind.size(), ' ');
 
-      secondLine << "  ";
-      padding(secondLine, (std::max)(max, ind.size()), 0, '-');
+      SecondLine << "  ";
+      padding(SecondLine, (std::max)(Max, Ind.size()), 0, '-');
 
-      for (auto const &metric : optimizationMetrics) {
-        auto width = columnWidth[metric];
+      for (auto const& Metric : OptimizationMetrics) {
+        auto Width = ColumnWidth[Metric];
 
-        firstLine << " | ";
-        secondLine << "---";
+        FirstLine << " | ";
+        SecondLine << "---";
 
-        firstLine << metric;
-        padding(firstLine, width, metric.size(), ' ');
-        padding(secondLine, width, 0, '-');
+        FirstLine << Metric;
+        padding(FirstLine, Width, Metric.size(), ' ');
+        padding(SecondLine, Width, 0, '-');
       }
 
-      std::stringstream ss;
+      std::stringstream Ss;
 
-      ss << "\n Best individuals sorted by metric " << metric << " "
-         << ((metric[0] == '-') ? "ascending" : "descending") << ":\n"
-         << firstLine.str() << "\n"
-         << secondLine.str() << "\n";
+      Ss << "\n Best individuals sorted by metric " << Metric << " "
+         << ((Metric[0] == '-') ? "ascending" : "descending") << ":\n"
+         << FirstLine.str() << "\n"
+         << SecondLine.str() << "\n";
 
       // print INDIVIDUAL | metric 1 | metric 2 | ... | metric N
-      for (auto it = begin; it != end; ++it) {
-        auto const fitness = _f[*it];
-        auto const ind = formatIndividual(_x[*it]);
+      for (auto It = Begin; It != End; ++It) {
+        auto const& Fitness = F[*It];
+        auto const Ind = FormatIndividual(X[*It]);
 
-        ss << "  " << ind;
-        padding(ss, max, ind.size(), ' ');
+        Ss << "  " << Ind;
+        padding(Ss, Max, Ind.size(), ' ');
 
-        for (auto const &metric : optimizationMetrics) {
-          auto width = columnWidth[metric];
-          std::string value;
+        for (auto const& Metric : OptimizationMetrics) {
+          auto Width = ColumnWidth[Metric];
+          std::string Value;
 
-          auto fitnessOfMetric = fitness.find(metric);
-          auto invertedMetric = metric.substr(1);
-          auto fitnessOfInvertedMetric = fitness.find(invertedMetric);
+          auto FitnessOfMetric = Fitness.find(Metric);
+          auto InvertedMetric = Metric.substr(1);
+          auto FitnessOfInvertedMetric = Fitness.find(InvertedMetric);
 
-          if (fitnessOfMetric != fitness.end()) {
-            value = std::to_string(fitnessOfMetric->second.average);
-          } else if (fitnessOfInvertedMetric != fitness.end()) {
-            value = std::to_string(fitnessOfInvertedMetric->second.average);
+          if (FitnessOfMetric != Fitness.end()) {
+            Value = std::to_string(FitnessOfMetric->second.Average);
+          } else if (FitnessOfInvertedMetric != Fitness.end()) {
+            Value = std::to_string(FitnessOfInvertedMetric->second.Average);
           } else {
             assert(false);
           }
 
-          ss << " | " << value;
-          padding(ss, width, value.size(), ' ');
+          Ss << " | " << Value;
+          padding(Ss, Width, Value.size(), ' ');
         }
-        ss << "\n";
+        Ss << "\n";
       }
 
-      ss << "\n";
+      Ss << "\n";
 
-      firestarter::log::info() << ss.str();
+      firestarter::log::info() << Ss.str();
     }
 
-    firestarter::log::info()
-        << "To run FIRESTARTER with the best individual of a given metric "
-           "use the command line argument "
-           "`--run-instruction-groups=INDIVIDUAL`";
+    firestarter::log::info() << "To run FIRESTARTER with the best individual of a given metric "
+                                "use the command line argument "
+                                "`--run-instruction-groups=INDIVIDUAL`";
   }
 
-  inline static void save(std::string const &path, std::string const &startTime,
-                          std::vector<std::string> const &payloadItems,
-                          const int argc, const char **argv) {
+  /// Save the history to a file. This function is not threadsafe as it calls History::getTime.
+  /// \arg Path The path of the output file. If it is empty, the file HOSTNAME_STARTTIME.json is created in the current
+  /// working directory, or in /tmp if that directory cannot be determined.
+  /// \arg StartTime The start time as a string which is saved in the json datastructure.
+  /// \arg PayloadItems The Vector of meta instructions which map to the vector of individuals.
+  /// \arg Argc The Argc of the executed program.
+  /// \arg Argv The Argv of the executed program.
+  static void save(std::string const& Path, std::string const& StartTime, std::vector<std::string> const& PayloadItems,
+                   const int Argc, const char** Argv) {
     using json = nlohmann::json;
 
-    json j = json::object();
+    json J = json::object();
 
-    j["individuals"] = json::array();
-    for (auto const &ind : _x) {
-      j["individuals"].push_back(ind);
+    J["individuals"] = json::array();
+    for (auto const& Ind : X) {
+      J["individuals"].push_back(Ind);
     }
 
-    j["metrics"] = json::array();
-    for (auto const &eval : _f) {
-      j["metrics"].push_back(eval);
+    J["metrics"] = json::array();
+    for (auto const& Eval : F) {
+      J["metrics"].push_back(Eval);
     }
 
+    // Initialize a string with length of 256 filled with null characters
+    auto Hostname = std::string(256, 0);
     // get the hostname
-    char cHostname[256];
-    std::string hostname;
-    if (0 != gethostname(cHostname, sizeof(cHostname))) {
-      hostname = "unknown";
-    } else {
-      hostname = cHostname;
+    if (0 != gethostname(Hostname.data(), Hostname.size())) {
+      Hostname = "unknown";
+    }
+
+    // Strip away any remaining null terminators
+    if (const auto Pos = Hostname.find('\0'); Pos != std::string::npos) {
+      Hostname.erase(Pos);
     }
 
-    j["hostname"] = hostname;
+    J["hostname"] = Hostname;
 
-    j["startTime"] = startTime;
-    j["endTime"] = getTime();
+    J["startTime"] = StartTime;
+    J["endTime"] = getTime();
 
     // save the payload items
-    j["payloadItems"] = json::array();
-    for (auto const &item : payloadItems) {
-      j["payloadItems"].push_back(item);
+    J["payloadItems"] = json::array();
+    for (auto const& Item : PayloadItems) {
+      J["payloadItems"].push_back(Item);
     }
 
     // save the arguments
-    j["args"] = json::array();
-    for (int i = 0; i < argc; ++i) {
-      j["args"].push_back(argv[i]);
+    J["args"] = json::array();
+    for (int I = 0; I < Argc; ++I) {
+      // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
+      J["args"].push_back(Argv[I]);
     }
 
     // dump the output
-    std::string s = j.dump();
+    const auto S = J.dump();
 
-    firestarter::log::trace() << s;
+    firestarter::log::trace() << S;
 
-    std::string outpath = path;
-    if (outpath.empty()) {
-      char *pwd = get_current_dir_name();
-      if (pwd) {
-        outpath = pwd;
-        free(pwd);
+    std::string Outpath = Path;
+    if (Outpath.empty()) {
+      // Wrap get_current_dir_name in a unique ptr, as it needs to get deleted by free when it is not used anymore.
+      const std::unique_ptr<char, void (*)(void*)> WrappedPwd = {get_current_dir_name(), free};
+      if (WrappedPwd) {
+        // Get the pointer captured in the WrappedPwd (not only the first char as would be with *WrappedPwd)
+        Outpath = WrappedPwd.get();
       } else {
         firestarter::log::warn() << "Could not find $PWD.";
-        outpath = "/tmp";
+        Outpath = "/tmp";
       }
-      outpath += "/" + hostname + "_" + startTime + ".json";
+      Outpath += "/" + Hostname + "_" + StartTime + ".json";
     }
 
-    firestarter::log::info() << "\nDumping output json in " << outpath;
+    firestarter::log::info() << "\nDumping output json in " << Outpath;
 
-    std::ofstream fp(outpath);
+    std::ofstream Fp(Outpath);
 
-    if (fp.bad()) {
-      firestarter::log::error() << "Could not open " << outpath;
+    if (Fp.bad()) {
+      firestarter::log::error() << "Could not open " << Outpath;
       return;
     }
 
-    fp << s;
+    Fp << S;
 
-    fp.close();
+    Fp.close();
   }
 
-  inline static std::string getTime() {
-    auto t = std::time(nullptr);
-    auto tm = *std::localtime(&t);
-    std::stringstream ss;
-    ss << std::put_time(&tm, "%F_%T%z");
-    return ss.str();
+  /// Get the current time in the local timezone as a string formatted by "%F_%T%z". This function is NOT threadsafe.
+  /// \returns The current time in local timezone as a formatted string.
+  static auto getTime() -> std::string {
+    const auto T = std::time(nullptr);
+    // NOLINTNEXTLINE(concurrency-mt-unsafe)
+    const auto* Tm = std::localtime(&T);
+    std::stringstream Ss;
+    Ss << std::put_time(Tm, "%F_%T%z");
+    return Ss.str();
   }
 };
 } // namespace firestarter::optimizer
diff --git a/include/firestarter/Optimizer/OptimizerWorker.hpp b/include/firestarter/Optimizer/OptimizerWorker.hpp
index 90eb80a5..17293ad3 100644
--- a/include/firestarter/Optimizer/OptimizerWorker.hpp
+++ b/include/firestarter/Optimizer/OptimizerWorker.hpp
@@ -19,42 +19,52 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
-#include <firestarter/Optimizer/Algorithm.hpp>
-#include <firestarter/Optimizer/Population.hpp>
+#include "firestarter/Optimizer/Algorithm.hpp"
+#include "firestarter/Optimizer/Population.hpp"
+#include "firestarter/WindowsCompat.hpp" // IWYU pragma: keep
 
 #include <chrono>
 #include <memory>
 
-extern "C" {
-#include <pthread.h>
-}
-
 namespace firestarter::optimizer {
 
+/// Class to run the optimization in another thread.
 class OptimizerWorker {
 public:
-  OptimizerWorker(
-      std::unique_ptr<firestarter::optimizer::Algorithm> &&algorithm,
-      firestarter::optimizer::Population &population,
-      std::string const &optimizationAlgorithm, unsigned individuals,
-      std::chrono::seconds const &preheat);
+  /// Start the optimization in another thread.
+  /// \arg Algorithm The algorithm that is used to optimize FIRESTARTER.
+  /// \arg Population The population containing the problem that will be used to optimize FIRESTARTER.
+  /// \arg Individuals The number of individuals for the initial population.
+  /// \arg Preheat The time we preheat before starting the optimization.
+  OptimizerWorker(std::unique_ptr<firestarter::optimizer::Algorithm>&& Algorithm,
+                  std::unique_ptr<firestarter::optimizer::Population>&& Population, unsigned Individuals,
+                  std::chrono::seconds const& Preheat);
 
-  ~OptimizerWorker() {}
+  ~OptimizerWorker() = default;
 
-  void join();
+  /// Join the optimization thread.
+  void join() const;
 
-  void kill();
+  /// Kill the optimization thread.
+  void kill() const;
 
 private:
-  static void *optimizerThread(void *optimizerWorker);
+  /// The thread worker that does the optimization.
+  /// \arg OptimizerWorker The pointer to the OptimizerWorker (this) datastructure.
+  /// \returns a nullptr
+  static auto optimizerThread(void* OptimizerWorker) -> void*;
 
-  std::unique_ptr<firestarter::optimizer::Algorithm> _algorithm;
-  firestarter::optimizer::Population _population;
-  std::string _optimizationAlgorithm;
-  unsigned _individuals;
-  std::chrono::seconds _preheat;
+  /// The algorithm that is used to optimize FIRESTARTER.
+  std::unique_ptr<firestarter::optimizer::Algorithm> Algorithm;
+  /// The population containing the problem that will be used to optimize FIRESTARTER.
+  std::unique_ptr<firestarter::optimizer::Population> Population;
+  /// The number of individuals for the initial population.
+  unsigned Individuals;
+  /// The time we preheat before starting the optimization.
+  std::chrono::seconds Preheat;
 
-  pthread_t workerThread;
+  /// The pthread that is used for the optimization.
+  pthread_t WorkerThread{};
 };
 
 } // namespace firestarter::optimizer
diff --git a/include/firestarter/Optimizer/Population.hpp b/include/firestarter/Optimizer/Population.hpp
index b02f451d..ac857e30 100644
--- a/include/firestarter/Optimizer/Population.hpp
+++ b/include/firestarter/Optimizer/Population.hpp
@@ -19,80 +19,74 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
-#ifndef FIRESTARTER_OPTIMIZER_POPULATION_HPP
-#define FIRESTARTER_OPTIMIZER_POPULATION_HPP
+#pragma once
 
-#include <firestarter/Optimizer/History.hpp>
-#include <firestarter/Optimizer/Individual.hpp>
-#include <firestarter/Optimizer/Problem.hpp>
+#include "firestarter/Optimizer/Individual.hpp"
+#include "firestarter/Optimizer/Problem.hpp"
 
 #include <cstring>
 #include <memory>
-#include <optional>
-#include <random>
-#include <tuple>
 #include <vector>
 
 namespace firestarter::optimizer {
 
+/// This class models the notion of a population used by the NSGA2 algorithm that contains a number of individuals with
+/// their associated fitness.
 class Population {
 public:
-  // Construct a population from a problem.
-  Population() = default;
+  Population() = delete;
 
-  Population(std::shared_ptr<Problem> &&problem)
-      : _problem(std::move(problem)), gen(rd()) {}
+  /// Construct a population from a problem.
+  explicit Population(std::shared_ptr<Problem>&& ProblemPtr)
+      : ProblemPtr(std::move(ProblemPtr)) {}
 
-  Population(Population &pop)
-      : _problem(pop._problem), _x(pop._x), _f(pop._f), gen(rd()) {}
+  ~Population() = default;
 
-  Population &operator=(Population const &pop) {
-    _problem = std::move(pop._problem);
-    _x = pop._x;
-    _f = pop._f;
-    gen = pop.gen;
+  /// Generate a supplied number of individuals and save them with their fitness in this datastructure. If the number is
+  /// less than the number of dimensions we fill them with random individuals. If it is at least the number of
+  /// dimensions, we first create individuals with one dimension equal to one and the rest equal to zero.
+  /// \arg PopulationSize The number of individuals to generate.
+  void generateInitialPopulation(std::size_t PopulationSize);
 
-    return *this;
-  }
+  /// The number of individuals in this population.
+  [[nodiscard]] auto size() const -> std::size_t;
 
-  ~Population() {}
+  /// Append one individual to the population. If a lookup of the fitness in the history is not successful, the
+  /// individual will be evaluated and the fitness saved.
+  /// \arg Ind The individual to be added to the population.
+  void append(Individual const& Ind);
 
-  void generateInitialPopulation(std::size_t populationSize = 0);
+  /// Insert an individual and an associated fitness at a specific index in the population.
+  /// \arg Idx On which index to insert in the population.
+  /// \arg Ind The individual to insert.
+  /// \arg Fit The fitness to insert.
+  void insert(std::size_t Idx, Individual const& Ind, std::vector<double> const& Fit);
 
-  std::size_t size() const;
+  /// Generate a random individual inside the bounds of the problem based on a non-deterministic generator.
+  /// \returns The random individual inside the bounds of the problem.
+  [[nodiscard]] auto getRandomIndividual() const -> Individual;
 
-  // add one individual to the population. fitness will be evaluated.
-  void append(Individual const &ind);
+  /// Const reference to the optimization problem.
+  [[nodiscard]] auto problem() const -> Problem const& { return *ProblemPtr; }
 
-  void insert(std::size_t idx, Individual const &ind,
-              std::vector<double> const &fit);
-
-  // get a random individual inside bounds of problem
-  Individual getRandomIndividual();
-
-  // returns the best individual in case of single-objective.
-  // return nothing in case of mutli-objective.
-  std::optional<Individual> bestIndividual() const;
-
-  Problem const &problem() const { return *_problem; }
-
-  std::vector<Individual> const &x() const { return _x; }
-  std::vector<std::vector<double>> const &f() const { return _f; }
+  /// Const reference to the vector of individuals.
+  [[nodiscard]] auto x() const -> std::vector<Individual> const& { return X; }
+  /// Const reference to the vector of fitnesses.
+  [[nodiscard]] auto f() const -> std::vector<std::vector<double>> const& { return F; }
 
 private:
-  // add one individual to the population with a fitness.
-  void append(Individual const &ind, std::vector<double> const &fit);
-
-  // our problem.
-  std::shared_ptr<Problem> _problem;
-
-  std::vector<Individual> _x;
-  std::vector<std::vector<double>> _f;
-
-  std::random_device rd;
-  std::mt19937 gen;
+  /// Append one individual with a given fitness to the population.
+  /// \arg Ind The individual to be appended to the population.
+  /// \arg Fit The fitness of the individual.
+  void append(Individual const& Ind, std::vector<double> const& Fit);
+
+  /// The optimization problem
+  std::shared_ptr<Problem> ProblemPtr;
+
+  /// The vector of individuals
+  std::vector<Individual> X;
+  /// The vector of fitnesses associated to each individual
+  std::vector<std::vector<double>> F;
 };
 
-} // namespace firestarter::optimizer
-
-#endif
+} // namespace firestarter::optimizer
\ No newline at end of file
diff --git a/include/firestarter/Optimizer/Problem.hpp b/include/firestarter/Optimizer/Problem.hpp
index f88b0bc3..bee3fdbb 100644
--- a/include/firestarter/Optimizer/Problem.hpp
+++ b/include/firestarter/Optimizer/Problem.hpp
@@ -21,8 +21,8 @@
 
 #pragma once
 
-#include <firestarter/Measurement/Summary.hpp>
-#include <firestarter/Optimizer/Individual.hpp>
+#include "firestarter/Measurement/Summary.hpp"
+#include "firestarter/Optimizer/Individual.hpp"
 
 #include <cstring>
 #include <map>
@@ -31,37 +31,50 @@
 
 namespace firestarter::optimizer {
 
+/// This class models the abstract problem which should be optimized. It provides the methods to evaluate an individual
+/// and calculate its fitness.
 class Problem {
+  /// The number of metric evaluations
+  uint64_t Fevals = 0;
+
 public:
-  Problem() : _fevals(0) {}
-  virtual ~Problem() {}
+  Problem() = default;
+  virtual ~Problem() = default;
 
-  // return the fitness for an individual
-  virtual std::map<std::string, firestarter::measurement::Summary>
-  metrics(Individual const &individual) = 0;
+  /// Perform an evaluation of the supplied individual. This returns a map from the metric name to their respective
+  /// summary. This function will increment the fevals.
+  /// \arg Individual The individual that should be evaluated.
+  /// \returns A map from metric name to the summary of this metric for the specific individual
+  virtual auto metrics(Individual const& Individual) -> std::map<std::string, firestarter::measurement::Summary> = 0;
 
-  virtual std::vector<double>
-  fitness(std::map<std::string, firestarter::measurement::Summary> const
-              &summaries) = 0;
+  /// Convert the result of one evaluation into a fitness (vector of doubles) for the supplied summaries
+  /// \arg Summaries The summaries of one evaluation.
+  /// \returns The fitness vector derived from the summaries. The size of this vector is equal to the number of
+  /// objectives.
+  [[nodiscard]] virtual auto fitness(std::map<std::string, firestarter::measurement::Summary> const& Summaries) const
+      -> std::vector<double> = 0;
 
-  // get the bounds of the problem
-  virtual std::vector<std::tuple<unsigned, unsigned>> getBounds() const = 0;
+  /// Get the bounds of the problem. For each dimension a min and max value is supplied.
+  /// \return The min and max bound per dimension.
+  [[nodiscard]] virtual auto getBounds() const -> std::vector<std::tuple<unsigned, unsigned>> = 0;
 
-  // get the number of dimensions of the problem
-  std::size_t getDims() const { return this->getBounds().size(); };
+  /// Get the number of dimensions of the problem.
+  /// \returns The number of dimensions.
+  [[nodiscard]] auto getDims() const -> std::size_t { return this->getBounds().size(); };
 
-  // get the number of objectives.
-  virtual std::size_t getNobjs() const = 0;
+  /// Get the number of optimization objectives for this problem.
+  /// \returns The number of objectives.
+  [[nodiscard]] virtual auto getNobjs() const -> std::size_t = 0;
 
-  // is the problem multiobjective
-  bool isMO() const { return this->getNobjs() > 1; };
+  /// Check if the problem is a multi-objective one.
+  [[nodiscard]] auto isMO() const -> bool { return this->getNobjs() > 1; };
 
-  // get the number of fitness evaluations
-  unsigned long long getFevals() const { return _fevals; };
+  /// Get the number of evaluations.
+  [[nodiscard]] auto getFevals() const -> uint64_t { return Fevals; };
 
 protected:
-  // number of fitness evaluations
-  unsigned long long _fevals;
+  /// Increment the number of evaluations.
+  void incrementFevals() { Fevals++; };
 };
 
 } // namespace firestarter::optimizer
diff --git a/include/firestarter/Optimizer/Problem/CLIArgumentProblem.hpp b/include/firestarter/Optimizer/Problem/CLIArgumentProblem.hpp
index 1ca0de58..4335a4f9 100644
--- a/include/firestarter/Optimizer/Problem/CLIArgumentProblem.hpp
+++ b/include/firestarter/Optimizer/Problem/CLIArgumentProblem.hpp
@@ -21,125 +21,148 @@
 
 #pragma once
 
-#include <firestarter/Optimizer/Problem.hpp>
+#include "firestarter/Measurement/MeasurementWorker.hpp"
+#include "firestarter/Optimizer/Problem.hpp"
 
 #include <cassert>
-#include <chrono>
-#include <cmath>
 #include <functional>
-#include <memory>
 #include <thread>
 #include <tuple>
 #include <utility>
 
 namespace firestarter::optimizer::problem {
 
+/// This class models the problem of optimizing firestarter on the fly. The evaluation of metrics is done by switching
+/// the settings of the high load routine and measuring the metric in the specified runtime.
 class CLIArgumentProblem final : public firestarter::optimizer::Problem {
+private:
+  /// The function which takes instruction groups and switches the payload in the high load function to the supplied
+  /// ones.
+  std::function<void(std::vector<std::pair<std::string, unsigned>> const&)> ChangePayloadFunction;
+  /// The shared pointer to the measurement infrastructure which will be used to get metric values.
+  std::shared_ptr<firestarter::measurement::MeasurementWorker> MeasurementWorker;
+  /// The metrics that are used in the optimization. They may have a dash at the start to allow them to be changed from
+  /// maximization to minimization.
+  std::vector<std::string> Metrics;
+  /// The duration of the measurement.
+  std::chrono::seconds Timeout;
+  /// The time to skip from the measurement start
+  std::chrono::milliseconds StartDelta;
+  /// The time to skip from the measurement stop
+  std::chrono::milliseconds StopDelta;
+  /// The vector of instruction groups that are used in the optimization for the payload.
+  std::vector<std::string> InstructionGroups;
 
 public:
-  CLIArgumentProblem(
-      std::function<void(std::vector<std::pair<std::string, unsigned>> const &)>
-          &&changePayloadFunction,
-      std::shared_ptr<firestarter::measurement::MeasurementWorker> const
-          &measurementWorker,
-      std::vector<std::string> const &metrics, std::chrono::seconds timeout,
-      std::chrono::milliseconds startDelta, std::chrono::milliseconds stopDelta,
-      std::vector<std::string> const &instructionGroups)
-      : _changePayloadFunction(changePayloadFunction),
-        _measurementWorker(measurementWorker), _metrics(metrics),
-        _timeout(timeout), _startDelta(startDelta), _stopDelta(stopDelta),
-        _instructionGroups(instructionGroups) {
-    assert(_metrics.size() != 0);
+  /// Constructor for the problem of optimizing firestarter on the fly.
+  /// \arg ChangePayloadFunction The function which takes instruction groups and switches the payload in the high load
+  /// function to the supplied ones.
+  /// \arg MeasurementWorker The shared pointer to the measurement infrastructure which will be used to get metric
+  /// values
+  /// \arg Metrics The metrics that are used in the optimization. They may have a dash at the start to allow them to be
+  /// changed from maximization to minimization.
+  /// \arg Timeout The duration of the measurement.
+  /// \arg StartDelta The time to skip from the measurement start
+  /// \arg StopDelta The time to skip from the measurement stop
+  /// \arg InstructionGroups The vector of instruction groups that are used in the optimization for the payload.
+  CLIArgumentProblem(std::function<void(std::vector<std::pair<std::string, unsigned>> const&)>&& ChangePayloadFunction,
+                     std::shared_ptr<firestarter::measurement::MeasurementWorker> MeasurementWorker,
+                     std::vector<std::string> const& Metrics, std::chrono::seconds Timeout,
+                     std::chrono::milliseconds StartDelta, std::chrono::milliseconds StopDelta,
+                     std::vector<std::string> InstructionGroups)
+      : ChangePayloadFunction(std::move(ChangePayloadFunction))
+      , MeasurementWorker(std::move(MeasurementWorker))
+      , Metrics(Metrics)
+      , Timeout(Timeout)
+      , StartDelta(StartDelta)
+      , StopDelta(StopDelta)
+      , InstructionGroups(std::move(InstructionGroups)) {
+    assert(!Metrics.empty());
   }
 
-  ~CLIArgumentProblem() {}
+  ~CLIArgumentProblem() override = default;
 
-  // return all available metrics for the individual
-  std::map<std::string, firestarter::measurement::Summary>
-  metrics(std::vector<unsigned> const &individual) override {
+  /// Evaluate the given individual by switching the current payload, doing the measurement and returning the results.
+  /// \arg Individual The individual that should be measured.
+  /// \returns The map from all metrics to their respective summaries for the measured individual.
+  auto metrics(std::vector<unsigned> const& Individual)
+      -> std::map<std::string, firestarter::measurement::Summary> override {
     // increment evaluation idx
-    _fevals++;
+    incrementFevals();
 
     // change the payload
-    assert(_instructionGroups.size() == individual.size());
-    std::vector<std::pair<std::string, unsigned>> payload = {};
-    auto it1 = _instructionGroups.begin();
-    auto it2 = individual.begin();
-    for (; it1 != _instructionGroups.end(); ++it1, ++it2) {
-      payload.push_back(std::make_pair(*it1, *it2));
+    assert(InstructionGroups.size() == Individual.size());
+    std::vector<std::pair<std::string, unsigned>> Payload = {};
+    auto It1 = InstructionGroups.begin();
+    auto It2 = Individual.begin();
+    for (; It1 != InstructionGroups.end(); ++It1, ++It2) {
+      Payload.emplace_back(*It1, *It2);
     }
-    _changePayloadFunction(payload);
+    ChangePayloadFunction(Payload);
 
     // start the measurement
-    // NOTE: starting the measurement must happen after switching to not mess up
-    // ipc-estimate metric
-    _measurementWorker->startMeasurement();
+    // NOTE: starting the measurement must happen after switching to not
+    // mess up ipc-estimate metric
+    MeasurementWorker->startMeasurement();
 
     // wait for the measurement to finish
-    std::this_thread::sleep_for(_timeout);
+    std::this_thread::sleep_for(Timeout);
 
-    // FIXME: this is an ugly workaround for the ipc-estimate metric
-    // changeing the payload triggers a write of the iteration counter of the
-    // last payload, which we use to estimate the ipc.
-    _changePayloadFunction(payload);
+    // TODO(Issue #82): This is an ugly workaround for the ipc-estimate metric.
+    // Changing the payload triggers a write of the iteration counter of
+    // the last payload, which we use to estimate the ipc.
+    ChangePayloadFunction(Payload);
 
     // return the results
-    return _measurementWorker->getValues(_startDelta, _stopDelta);
+    return MeasurementWorker->getValues(StartDelta, StopDelta);
   }
 
-  std::vector<double> fitness(
-      std::map<std::string, firestarter::measurement::Summary> const &summaries)
-      override {
-    std::vector<double> values = {};
-
-    for (auto const &metricName : _metrics) {
-      auto findName = [metricName](auto const &summary) {
-        auto invertedName = "-" + summary.first;
-        return metricName.compare(summary.first) == 0 ||
-               metricName.compare(invertedName) == 0;
+  /// Calculate the fitness based on the metric summaries of an individual. This will select the metrics that are
+  /// required for the optimization, round them and potentially invert the results if the optimization metric name
+  /// starts with a dash ('-').
+  /// \arg Summaries The metric values for all metrics for an individual
+  /// \return The vector containing the fitness for the metrics that are used in the optimization.
+  [[nodiscard]] auto fitness(std::map<std::string, firestarter::measurement::Summary> const& Summaries) const
+      -> std::vector<double> override {
+    std::vector<double> Values = {};
+
+    for (auto const& MetricName : Metrics) {
+      auto FindName = [MetricName](auto const& Summary) {
+        auto InvertedName = "-" + Summary.first;
+        return MetricName == Summary.first || MetricName == InvertedName;
       };
 
-      auto it = std::find_if(summaries.begin(), summaries.end(), findName);
+      auto It = std::find_if(Summaries.begin(), Summaries.end(), FindName);
 
-      if (it == summaries.end()) {
+      if (It == Summaries.end()) {
         continue;
       }
 
       // round to two decimal places after the comma
-      auto value = std::round(it->second.average * 100.0) / 100.0;
+      auto Value = std::round(It->second.Average * 100.0) / 100.0;
 
       // invert metric
-      if (metricName[0] == '-') {
-        value *= -1.0;
+      if (MetricName[0] == '-') {
+        Value *= -1.0;
       }
 
-      values.push_back(value);
+      Values.push_back(Value);
     }
 
-    return values;
+    return Values;
   }
 
-  // get the bounds of the problem
-  std::vector<std::tuple<unsigned, unsigned>> getBounds() const override {
-    std::vector<std::tuple<unsigned, unsigned>> vec(
-        _instructionGroups.size(), std::make_tuple<unsigned, unsigned>(0, 100));
+  /// Get the bounds of the problem. We currently fix these bounds to a range from 0 to 100 for every instruction.
+  /// \returns A vector the size of the number of instruction groups containing a tuple(0, 100).
+  [[nodiscard]] auto getBounds() const -> std::vector<std::tuple<unsigned, unsigned>> override {
+    std::vector<std::tuple<unsigned, unsigned>> Vec(InstructionGroups.size(),
+                                                    std::make_tuple<unsigned, unsigned>(0, 100));
 
-    return vec;
+    return Vec;
   }
 
-  // get the number of objectives.
-  std::size_t getNobjs() const override { return _metrics.size(); }
-
-private:
-  std::function<void(std::vector<std::pair<std::string, unsigned>> const &)>
-      _changePayloadFunction;
-  std::shared_ptr<firestarter::measurement::MeasurementWorker>
-      _measurementWorker;
-  std::vector<std::string> _metrics;
-  std::chrono::seconds _timeout;
-  std::chrono::milliseconds _startDelta;
-  std::chrono::milliseconds _stopDelta;
-  std::vector<std::string> _instructionGroups;
+  /// Get the number of optimization objectives.
+  [[nodiscard]] auto getNobjs() const -> std::size_t override { return Metrics.size(); }
 };
 
 } // namespace firestarter::optimizer::problem
diff --git a/include/firestarter/Optimizer/Util/MultiObjective.hpp b/include/firestarter/Optimizer/Util/MultiObjective.hpp
index 00701bfd..049d7be3 100644
--- a/include/firestarter/Optimizer/Util/MultiObjective.hpp
+++ b/include/firestarter/Optimizer/Util/MultiObjective.hpp
@@ -21,7 +21,7 @@
 
 #pragma once
 
-#include <firestarter/Optimizer/Individual.hpp>
+#include "firestarter/Optimizer/Individual.hpp"
 
 #include <random>
 #include <utility>
@@ -29,41 +29,31 @@
 
 namespace firestarter::optimizer::util {
 
-bool less_than_f(double a, double b);
+auto lessThanF(double A, double B) -> bool;
 
-bool greater_than_f(double a, double b);
+auto greaterThanF(double A, double B) -> bool;
 
-bool pareto_dominance(const std::vector<double> &obj1,
-                      const std::vector<double> &obj2);
+auto paretoDominance(const std::vector<double>& Obj1, const std::vector<double>& Obj2) -> bool;
 
-std::tuple<std::vector<std::vector<std::size_t>>,
-           std::vector<std::vector<std::size_t>>, std::vector<std::size_t>,
-           std::vector<std::size_t>>
-fast_non_dominated_sorting(const std::vector<std::vector<double>> &points);
+auto fastNonDominatedSorting(const std::vector<std::vector<double>>& Points)
+    -> std::tuple<std::vector<std::vector<std::size_t>>, std::vector<std::vector<std::size_t>>,
+                  std::vector<std::size_t>, std::vector<std::size_t>>;
 
-std::vector<double>
-crowding_distance(const std::vector<std::vector<double>> &non_dom_front);
+auto crowdingDistance(const std::vector<std::vector<double>>& NonDomFront) -> std::vector<double>;
 
-std::vector<double>::size_type mo_tournament_selection(
-    std::vector<double>::size_type idx1, std::vector<double>::size_type idx2,
-    const std::vector<std::vector<double>::size_type> &non_domination_rank,
-    const std::vector<double> &crowding_d, std::mt19937 &mt);
+auto moTournamentSelection(std::vector<double>::size_type Idx1, std::vector<double>::size_type Idx2,
+                           const std::vector<std::vector<double>::size_type>& NonDominationRank,
+                           const std::vector<double>& CrowdingD, std::mt19937& Mt) -> std::vector<double>::size_type;
 
-std::pair<firestarter::optimizer::Individual,
-          firestarter::optimizer::Individual>
-sbx_crossover(const firestarter::optimizer::Individual &parent1,
-              const firestarter::optimizer::Individual &parent2,
-              const double p_cr, std::mt19937 &mt);
+auto sbxCrossover(const firestarter::optimizer::Individual& Parent1, const firestarter::optimizer::Individual& Parent2,
+                  double PCr, std::mt19937& Mt)
+    -> std::pair<firestarter::optimizer::Individual, firestarter::optimizer::Individual>;
 
-void polynomial_mutation(
-    firestarter::optimizer::Individual &child,
-    const std::vector<std::tuple<unsigned, unsigned>> &bounds, const double p_m,
-    std::mt19937 &mt);
+void polynomialMutation(firestarter::optimizer::Individual& Child,
+                        const std::vector<std::tuple<unsigned, unsigned>>& Bounds, double PM, std::mt19937& Mt);
 
-std::vector<std::size_t>
-select_best_N_mo(const std::vector<std::vector<double>> &input_f,
-                 std::size_t N);
+auto selectBestNMo(const std::vector<std::vector<double>>& InputF, std::size_t N) -> std::vector<std::size_t>;
 
-std::vector<double> ideal(const std::vector<std::vector<double>> &points);
+auto ideal(const std::vector<std::vector<double>>& Points) -> std::vector<double>;
 
 } // namespace firestarter::optimizer::util
diff --git a/include/firestarter/Measurement/Metric/IPCEstimate.h b/include/firestarter/SafeExit.hpp
similarity index 79%
rename from include/firestarter/Measurement/Metric/IPCEstimate.h
rename to include/firestarter/SafeExit.hpp
index 2c14bb0d..68823831 100644
--- a/include/firestarter/Measurement/Metric/IPCEstimate.h
+++ b/include/firestarter/SafeExit.hpp
@@ -1,6 +1,6 @@
 /******************************************************************************
  * FIRESTARTER - A Processor Stress Test Utility
- * Copyright (C) 2021 TU Dresden, Center for Information Services and High
+ * Copyright (C) 2024 TU Dresden, Center for Information Services and High
  * Performance Computing
  *
  * This program is free software: you can redistribute it and/or modify
@@ -21,8 +21,10 @@
 
 #pragma once
 
-#include <firestarter/Measurement/MetricInterface.h>
+namespace firestarter {
 
-extern metric_interface_t ipc_estimate_metric;
+/// A thread safe wrapper to std::exit
+/// \arg Status The status passed to std::exit
+[[noreturn]] void safeExit(int Status);
 
-extern void ipc_estimate_metric_insert(double value);
+} // namespace firestarter
diff --git a/include/firestarter/WindowsCompat.hpp b/include/firestarter/WindowsCompat.hpp
new file mode 100644
index 00000000..11ef1329
--- /dev/null
+++ b/include/firestarter/WindowsCompat.hpp
@@ -0,0 +1,94 @@
+/******************************************************************************
+ * FIRESTARTER - A Processor Stress Test Utility
+ * Copyright (C) 2024 TU Dresden, Center for Information Services and High
+ * Performance Computing
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/\>.
+ *
+ * Contact: daniel.hackenberg@tu-dresden.de
+ *****************************************************************************/
+
+#pragma once
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#else
+
+/// Declare the _mm_mfence and __cpuid functions when we are not using MSC to enable the use of if constexpr instead of
+/// ifdefs.
+// NOLINTBEGIN(readability-identifier-naming,cert-dcl37-c,cert-dcl37-cpp,cert-dcl51-cpp,bugprone-reserved-identifier)
+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wunused-function"
+#elif defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+#if defined(__clang__)
+#include <emmintrin.h>
+#elif not(defined(__MINGW32__) || defined(__MINGW64__))
+void _mm_mfence() noexcept;
+#endif
+#if not(defined(__INTEL_LLVM_COMPILER))
+void __cpuid(int* /*unused*/, int /*unused*/) noexcept;
+#endif
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
+// NOLINTEND(readability-identifier-naming,cert-dcl37-c,cert-dcl37-cpp,cert-dcl51-cpp,bugprone-reserved-identifier)
+
+#endif
+
+#ifdef _WIN32
+// SIGALRM is not available on Windows
+#define SIGALRM 0
+
+#include <direct.h>
+static inline auto get_current_dir_name() -> char* { return _getcwd(nullptr, 0); }
+#elif defined(__APPLE__)
+#include <unistd.h>
+static inline auto get_current_dir_name() -> char* { return getcwd(nullptr, 0); }
+#else
+#include <unistd.h>
+#endif
+
+// correct include for gethostname
+#ifdef _MSC_VER
+#include <winsock.h>
+#else
+// NOLINTBEGIN(readability-duplicate-include)
+#include <unistd.h>
+// NOLINTEND(readability-duplicate-include)
+#endif
+
+// Make references in header files to pthread_t compatible to MSC. This will not make them functionally work.
+// We will be able to remove this hack once we transition from using pthread to std::thread
+#ifdef _MSC_VER
+struct Placeholder {};
+using pthread_t = Placeholder;
+#else
+extern "C" {
+#include <pthread.h>
+}
+#endif
+
+// Disable __asm__ __volatile__ in MSC
+// Static assert won't work, since if constexpr doesn't seem to work correctly
+#ifdef _MSC_VER
+#define __volatile__(X, ...)                                                                                           \
+  assert(false && "Attempted to use code path that uses the incorrect inline assembly macros for MSC.")
+#define __asm__
+#endif
\ No newline at end of file
diff --git a/lib/.clang-tidy b/lib/.clang-tidy
new file mode 100644
index 00000000..cf4dd00b
--- /dev/null
+++ b/lib/.clang-tidy
@@ -0,0 +1,4 @@
+---
+# Disable all clang-tidy checks for the lib folder
+
+Checks: '-*'
\ No newline at end of file
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 6136bb35..c0355fa0 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,13 +1,18 @@
 SET(FIRESTARTER_FILES
+	firestarter/Config.cpp
 	firestarter/Main.cpp
 	firestarter/Firestarter.cpp
 	firestarter/LoadWorker.cpp
+	firestarter/SafeExit.cpp
 	firestarter/WatchdogWorker.cpp
 	firestarter/DumpRegisterWorker.cpp
+	
+	firestarter/Environment/X86/Platform/X86PlatformConfig.cpp
 
 	firestarter/Environment/Environment.cpp
 	firestarter/Environment/CPUTopology.cpp
-	firestarter/Environment/Payload/Payload.cpp
+	firestarter/Environment/Payload/CompiledPayload.cpp
+	firestarter/Environment/Payload/PayloadSettings.cpp
 
 	# here starts the x86 specific code
 	firestarter/Environment/X86/X86Environment.cpp
diff --git a/src/firestarter/Config.cpp b/src/firestarter/Config.cpp
new file mode 100644
index 00000000..356580d5
--- /dev/null
+++ b/src/firestarter/Config.cpp
@@ -0,0 +1,392 @@
+/******************************************************************************
+ * FIRESTARTER - A Processor Stress Test Utility
+ * Copyright (C) 2024 TU Dresden, Center for Information Services and High
+ * Performance Computing
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/\>.
+ *
+ * Contact: daniel.hackenberg@tu-dresden.de
+ *****************************************************************************/
+
+#include "firestarter/Config.hpp"
+#include "firestarter/Constants.hpp"
+#include "firestarter/Logging/Log.hpp"
+
+#include <cxxopts.hpp>
+
+namespace {
+
+/// Emit the GPL redistribution notice through the info logger.
+void printCopyright() {
+  const char* const Notice = "This program is free software: you can redistribute it and/or modify\n"
+                             "it under the terms of the GNU General Public License as published by\n"
+                             "the Free Software Foundation, either version 3 of the License, or\n"
+                             "(at your option) any later version.\n"
+                             "\n"
+                             "You should have received a copy of the GNU General Public License\n"
+                             "along with this program.  If not, see <http://www.gnu.org/licenses/>.\n";
+
+  firestarter::log::info() << Notice;
+}
+
+/// Emit the GPL warranty disclaimer through the info logger.
+void printWarranty() {
+  const char* const Disclaimer = "This program is distributed in the hope that it will be useful,\n"
+                                 "but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
+                                 "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n"
+                                 "GNU General Public License for more details.\n\n"
+                                 "You should have received a copy of the GNU General Public License\n"
+                                 "along with this program.  If not, see <http://www.gnu.org/licenses/>.\n";
+  firestarter::log::info() << Disclaimer;
+}
+
+/// Log the usage text, restricted to one help group when Section is non-empty, followed by usage examples.
+/// \throws std::invalid_argument if Section does not name one of the help groups compiled into this build.
+void printHelp(cxxopts::Options const& Parser, std::string const& Section = "") {
+  std::vector<std::pair<std::string, std::string>> Options = {
+    {"information", "Information Options:\n"},
+    {"general", "General Options:\n"},
+    {"specialized-workloads", "Specialized workloads:\n"},
+#ifdef FIRESTARTER_DEBUG_FEATURES
+    {"debug", "Debugging:\n"},
+#endif
+#if defined(linux) || defined(__linux__)
+    {"measurement", "Measurement:\n"},
+    {"optimization", "Optimization:\n"}
+#endif
+  };
+  // Narrow the list to the requested section if one was given; an unknown section name is an error.
+  if (!Section.empty()) {
+    auto FindSection = [&Section](std::pair<std::string, std::string> const& Pair) { return Pair.first == Section; };
+    auto SectionsIt = std::find_if(Options.begin(), Options.end(), FindSection);
+    if (SectionsIt == Options.end()) {
+      throw std::invalid_argument("Section \"" + Section + "\" not found in help.");
+    }
+    Options = {*SectionsIt};
+  }
+
+  // clang-format off
+  firestarter::log::info()
+    << Parser.help(Options)
+    << "Examples:\n"
+    << "  ./FIRESTARTER                 starts FIRESTARTER without timeout\n"
+    << "  ./FIRESTARTER -t 300          starts a 5 minute run of FIRESTARTER\n"
+    << "  ./FIRESTARTER -l 50 -t 600    starts a 10 minute run of FIRESTARTER with\n"
+    << "                                50% high load and 50% idle time\n"
+    << (firestarter::OptionalFeatures.gpuEnabled() ?
+       "                                on CPUs and full load on GPUs\n"
+     : "")
+    << "  ./FIRESTARTER -l 75 -p 2000000\n"
+    << "                                starts FIRESTARTER with an interval length\n"
+    << "                                of 2 sec, 1.5s high load"
+    << (firestarter::OptionalFeatures.gpuEnabled() ?
+       " on CPUs and full load on GPUs\n"
+     : "\n")
+    << (firestarter::OptionalFeatures.OptimizationEnabled ?
+       "  ./FIRESTARTER --measurement --start-delta=300000 -t 900\n"
+       "                                starts FIRESTARTER measuring all available\n"
+       "                                metrics for 15 minutes disregarding the first\n"
+       "                                5 minutes and last two seconds (default to `--stop-delta`)\n"
+       "  ./FIRESTARTER -t 20 --optimize=NSGA2 --optimization-metric sysfs-powercap-rapl,perf-ipc\n"
+       "                                starts FIRESTARTER optimizing with the sysfs-powercap-rapl\n"
+       "                                and perf-ipc metric. The duration is 20s long. The default\n"
+       "                                instruction groups for the current platform will be used.\n"
+     : "")
+    ;
+  // clang-format on
+}
+
+} // namespace
+
+namespace firestarter {
+
+/// Parse the command line arguments into this Config.
+/// The options version, copyright, warranty and help end the process via safeExit(EXIT_SUCCESS).
+/// Any std::exception raised while parsing or validating prints the help and the error, then exits with failure.
+Config::Config(int Argc, const char** Argv)
+    : Argv(Argv)
+    , Argc(Argc) {
+  const auto* ExecutableName = *Argv;
+
+  cxxopts::Options Parser(ExecutableName);
+
+  const auto HelpDescription =
+      std::string("Display usage information. SECTION can be any of: information | general | specialized-workloads") +
+      (firestarter::OptionalFeatures.DebugFeatureEnabled ? " | debug" : "") +
+      (firestarter::OptionalFeatures.OptimizationEnabled ? "\n| measurement | optimization" : "");
+
+  const auto LoadDescription =
+      std::string("Set the percentage of high CPU load to LOAD\n(%) default: 100, valid values: 0 <= LOAD <=\n100, "
+                  "threads will be idle in the remaining time,\nfrequency of load changes is determined by -p.") +
+      (firestarter::OptionalFeatures.gpuEnabled() ? " This option does NOT influence the GPU\nworkload!" : "");
+
+  // clang-format off
+  Parser.add_options("information")
+    ("h,help", HelpDescription,
+      cxxopts::value<std::string>()->implicit_value(""), "SECTION")
+    ("v,version", "Display version information")
+    ("c,copyright", "Display copyright information")
+    ("w,warranty", "Display warranty information")
+    ("q,quiet", "Set log level to Warning")
+    ("r,report", "Display additional information (overridden by -q)")
+    ("debug", "Print debug output")
+    ("a,avail", "List available functions");
+
+  Parser.add_options("general")
+    ("i,function", "Specify integer ID of the load-function to be\nused (as listed by --avail)",
+      cxxopts::value<unsigned>()->default_value("0"), "ID");
+
+  if (firestarter::OptionalFeatures.gpuEnabled()) {
+    Parser.add_options("general")
+      ("f,usegpufloat", "Use single precision matrix multiplications\ninstead of default")
+      ("d,usegpudouble", "Use double precision matrix multiplications\ninstead of default")
+      ("g,gpus", "Number of gpus to use, default: -1 (all)",
+        cxxopts::value<int>()->default_value("-1"))
+      ("m,matrixsize", "Size of the matrix to calculate, default: 0 (maximum)",
+        cxxopts::value<unsigned>()->default_value("0"));
+  }
+
+  Parser.add_options("general")
+    ("t,timeout", "Set the timeout (seconds) after which FIRESTARTER\nterminates itself, default: 0 (no timeout)",
+      cxxopts::value<unsigned>()->default_value("0"), "TIMEOUT")
+    ("l,load", LoadDescription,
+      cxxopts::value<unsigned>()->default_value("100"), "LOAD")
+    ("p,period", "Set the interval length for CPUs to PERIOD\n(usec), default: 100000, each interval contains\na high load and an idle phase, the percentage\nof high load is defined by -l.",
+      cxxopts::value<unsigned>()->default_value("100000"), "PERIOD")
+    ("n,threads", "Specify the number of threads. Cannot be\ncombined with -b | --bind, which impicitly\nspecifies the number of threads.",
+      cxxopts::value<unsigned>()->default_value("0"), "COUNT")
+#if (defined(linux) || defined(__linux__)) && defined(FIRESTARTER_THREAD_AFFINITY)
+    ("b,bind", "Select certain CPUs. CPULIST format: \"x,y,z\",\n\"x-y\", \"x-y/step\", and any combination of the\nabove. Cannot be combined with -n | --threads.",
+      cxxopts::value<std::string>()->default_value(""), "CPULIST")
+#endif
+    ("error-detection", "Enable error detection. This aborts execution when the calculated data is corruped by errors. FIRESTARTER must run with 2 or more threads for this feature. Cannot be used with -l | --load and --optimize.");
+
+  Parser.add_options("specialized-workloads")
+    ("list-instruction-groups", "List the available instruction groups for the\npayload of the current platform.")
+    ("run-instruction-groups", "Run the payload with the specified\ninstruction groups. GROUPS format: multiple INST:VAL\npairs comma-seperated.",
+      cxxopts::value<std::string>()->default_value(""), "GROUPS")
+    ("set-line-count", "Set the number of lines for a payload.",
+      cxxopts::value<unsigned>());
+
+  if (firestarter::OptionalFeatures.DebugFeatureEnabled) {
+    Parser.add_options("debug")
+      ("allow-unavailable-payload", "")
+      ("dump-registers", "Dump the working registers on the first\nthread. Depending on the payload these are mm, xmm,\nymm or zmm. Only use it without a timeout and\n100 percent load. DELAY between dumps in secs. Cannot be used with --error-detection.",
+        cxxopts::value<unsigned>()->implicit_value("10"), "DELAY")
+      ("dump-registers-outpath", "Path for the dump of the output files. If\nPATH is not given, current working directory will\nbe used.",
+        cxxopts::value<std::string>()->default_value(""), "PATH");
+  }
+
+  if (firestarter::OptionalFeatures.OptimizationEnabled) {
+    Parser.add_options("measurement")
+      ("list-metrics", "List the available metrics.")
+#ifndef FIRESTARTER_LINK_STATIC
+      ("metric-path", "Add a path to a shared library representing an interface for a metric. This option can be specified multiple times.",
+        cxxopts::value<std::vector<std::string>>()->default_value(""))
+#endif
+      ("metric-from-stdin", "Add a metric NAME with values from stdin.\nFormat of input: \"NAME TIME_SINCE_EPOCH VALUE\\n\".\nTIME_SINCE_EPOCH is a int64 in nanoseconds. VALUE is a double. (Do not forget to flush\nlines!)",
+        cxxopts::value<std::vector<std::string>>(), "NAME")
+      ("measurement", "Start a measurement for the time specified by\n-t | --timeout. (The timeout must be greater\nthan the start and stop deltas.) Cannot be\ncombined with --optimize.")
+      ("measurement-interval", "Interval of measurements in milliseconds, default: 100",
+        cxxopts::value<unsigned>()->default_value("100"))
+      ("start-delta", "Cut of first N milliseconds of measurement, default: 5000",
+        cxxopts::value<unsigned>()->default_value("5000"), "N")
+      ("stop-delta", "Cut of last N milliseconds of measurement, default: 2000",
+        cxxopts::value<unsigned>()->default_value("2000"), "N")
+      ("preheat", "Preheat for N seconds, default: 240",
+        cxxopts::value<unsigned>()->default_value("240"), "N");
+
+    Parser.add_options("optimization")
+      ("optimize", "Run the optimization with one of these algorithms: NSGA2.\nCannot be combined with --measurement.",
+        cxxopts::value<std::string>())
+      ("optimize-outfile", "Dump the output of the optimization into this\nfile, default: $PWD/$HOSTNAME_$DATE.json",
+        cxxopts::value<std::string>())
+      ("optimization-metric", "Use a metric for optimization. Metrics listed\nwith cli argument --list-metrics or specified\nwith --metric-from-stdin are valid.",
+        cxxopts::value<std::vector<std::string>>())
+      ("individuals", "Number of individuals for the population. For\nNSGA2 specify at least 5 and a multiple of 4,\ndefault: 20",
+        cxxopts::value<unsigned>()->default_value("20"))
+      ("generations", "Number of generations, default: 20",
+        cxxopts::value<unsigned>()->default_value("20"))
+      ("nsga2-cr", "Crossover probability. Must be in range [0,1[\ndefault: 0.6",
+        cxxopts::value<double>()->default_value("0.6"))
+      ("nsga2-m", "Mutation probability. Must be in range [0,1]\ndefault: 0.4",
+        cxxopts::value<double>()->default_value("0.4"));
+  }
+  // clang-format on
+
+  try {
+    auto Options = Parser.parse(Argc, Argv);
+
+    if (static_cast<bool>(Options.count("quiet"))) {
+      firestarter::logging::Filter<firestarter::logging::record>::set_severity(nitro::log::severity_level::warn);
+    } else if (static_cast<bool>(Options.count("report"))) {
+      firestarter::logging::Filter<firestarter::logging::record>::set_severity(nitro::log::severity_level::debug);
+    } else if (static_cast<bool>(Options.count("debug"))) {
+      firestarter::logging::Filter<firestarter::logging::record>::set_severity(nitro::log::severity_level::trace);
+    } else {
+      firestarter::logging::Filter<firestarter::logging::record>::set_severity(nitro::log::severity_level::info);
+    }
+
+    if (static_cast<bool>(Options.count("version"))) {
+      safeExit(EXIT_SUCCESS);
+    }
+
+    if (static_cast<bool>(Options.count("copyright"))) {
+      printCopyright();
+      safeExit(EXIT_SUCCESS);
+    }
+
+    if (static_cast<bool>(Options.count("warranty"))) {
+      printWarranty();
+      safeExit(EXIT_SUCCESS);
+    }
+
+    firestarter::log::info() << "This program comes with ABSOLUTELY NO WARRANTY; for details run `" << ExecutableName
+                             << " -w`.\n"
+                             << "This is free software, and you are welcome to redistribute it\n"
+                             << "under certain conditions; run `" << ExecutableName << " -c` for details.\n";
+
+    if (static_cast<bool>(Options.count("help"))) {
+      auto Section = Options["help"].as<std::string>();
+
+      printHelp(Parser, Section);
+      safeExit(EXIT_SUCCESS);
+    }
+
+    Timeout = std::chrono::seconds(Options["timeout"].as<unsigned>());
+    const auto LoadPercent = Options["load"].as<unsigned>();
+    Period = std::chrono::microseconds(Options["period"].as<unsigned>());
+
+    if (LoadPercent > 100) {
+      throw std::invalid_argument("Option -l/--load may not be above 100.");
+    }
+
+    Load = (Period * LoadPercent) / 100;
+    if (LoadPercent == 100 || Load == std::chrono::microseconds::zero()) {
+      Period = std::chrono::microseconds::zero();
+    }
+
+    ErrorDetection = static_cast<bool>(Options.count("error-detection"));
+    if (ErrorDetection && LoadPercent != 100) {
+      throw std::invalid_argument("Option --error-detection may only be used with -l/--load equal 100.");
+    }
+
+    if (firestarter::OptionalFeatures.DebugFeatureEnabled) {
+      AllowUnavailablePayload = static_cast<bool>(Options.count("allow-unavailable-payload"));
+      DumpRegisters = static_cast<bool>(Options.count("dump-registers"));
+      if (DumpRegisters) {
+        DumpRegistersTimeDelta = std::chrono::seconds(Options["dump-registers"].as<unsigned>());
+        // Both conditions are required (no timeout and full load), so reject if either one is violated.
+        if (Timeout != std::chrono::microseconds::zero() || LoadPercent != 100) {
+          throw std::invalid_argument("Option --dump-registers may only be used without a timeout and full load.");
+        }
+        if (ErrorDetection) {
+          throw std::invalid_argument("Options --dump-registers and --error-detection cannot be used together.");
+        }
+      }
+    }
+
+    RequestedNumThreads = Options["threads"].as<unsigned>();
+
+#if (defined(linux) || defined(__linux__)) && defined(FIRESTARTER_THREAD_AFFINITY)
+    CpuBind = Options["bind"].as<std::string>();
+    if (!CpuBind.empty() && RequestedNumThreads != 0) {
+      throw std::invalid_argument("Options -b/--bind and -n/--threads cannot be used together.");
+    }
+#endif
+
+    if (firestarter::OptionalFeatures.gpuEnabled()) {
+      GpuUseFloat = static_cast<bool>(Options.count("usegpufloat"));
+      GpuUseDouble = static_cast<bool>(Options.count("usegpudouble"));
+
+      if (GpuUseFloat && GpuUseDouble) {
+        throw std::invalid_argument("Options -f/--usegpufloat and "
+                                    "-d/--usegpudouble cannot be used together.");
+      }
+
+      GpuMatrixSize = Options["matrixsize"].as<unsigned>();
+      if (GpuMatrixSize > 0 && GpuMatrixSize < 64) {
+        throw std::invalid_argument("Option -m/--matrixsize may not be below 64.");
+      }
+
+      Gpus = Options["gpus"].as<int>();
+    }
+
+    PrintFunctionSummary = static_cast<bool>(Options.count("avail"));
+
+    FunctionId = Options["function"].as<unsigned>();
+
+    ListInstructionGroups = static_cast<bool>(Options.count("list-instruction-groups"));
+    InstructionGroups = Options["run-instruction-groups"].as<std::string>();
+    if (static_cast<bool>(Options.count("set-line-count"))) {
+      LineCount = Options["set-line-count"].as<unsigned>();
+    }
+
+    if (firestarter::OptionalFeatures.OptimizationEnabled) {
+      StartDelta = std::chrono::milliseconds(Options["start-delta"].as<unsigned>());
+      StopDelta = std::chrono::milliseconds(Options["stop-delta"].as<unsigned>());
+      MeasurementInterval = std::chrono::milliseconds(Options["measurement-interval"].as<unsigned>());
+#ifndef FIRESTARTER_LINK_STATIC
+      MetricPaths = Options["metric-path"].as<std::vector<std::string>>();
+#endif
+      if (static_cast<bool>(Options.count("metric-from-stdin"))) {
+        StdinMetrics = Options["metric-from-stdin"].as<std::vector<std::string>>();
+      }
+      Measurement = static_cast<bool>(Options.count("measurement"));
+      ListMetrics = static_cast<bool>(Options.count("list-metrics"));
+      Optimize = static_cast<bool>(Options.count("optimize"));
+
+      if (Optimize) {
+        if (ErrorDetection) {
+          throw std::invalid_argument("Options --error-detection and --optimize cannot be used together.");
+        }
+        if (Measurement) {
+          throw std::invalid_argument("Options --measurement and --optimize cannot be used together.");
+        }
+        Preheat = std::chrono::seconds(Options["preheat"].as<unsigned>());
+        OptimizationAlgorithm = Options["optimize"].as<std::string>();
+        if (static_cast<bool>(Options.count("optimization-metric"))) {
+          OptimizationMetrics = Options["optimization-metric"].as<std::vector<std::string>>();
+        }
+        if (LoadPercent != 100) {
+          throw std::invalid_argument("Options -p | --period and -l | --load are "
+                                      "not compatible with --optimize.");
+        }
+        if (Timeout == std::chrono::seconds::zero()) {
+          throw std::invalid_argument("Option -t | --timeout must be specified for optimization.");
+        }
+        EvaluationDuration = Timeout;
+        // this will deactivate the watchdog worker
+        Timeout = std::chrono::seconds::zero();
+        Individuals = Options["individuals"].as<unsigned>();
+        if (static_cast<bool>(Options.count("optimize-outfile"))) {
+          OptimizeOutfile = Options["optimize-outfile"].as<std::string>();
+        }
+        Generations = Options["generations"].as<unsigned>();
+        Nsga2Cr = Options["nsga2-cr"].as<double>();
+        Nsga2M = Options["nsga2-m"].as<double>();
+
+        if (OptimizationAlgorithm != "NSGA2") {
+          throw std::invalid_argument("Option --optimize must be any of: NSGA2");
+        }
+      }
+    }
+  } catch (std::exception& E) {
+    printHelp(Parser);
+    firestarter::log::error() << E.what() << "\n";
+    // Do not continue with a partially initialized configuration after an argument error.
+    safeExit(EXIT_FAILURE);
+  }
+}
+} // namespace firestarter
\ No newline at end of file
diff --git a/src/firestarter/Cuda/Cuda.cpp b/src/firestarter/Cuda/Cuda.cpp
index e5abece9..9469073a 100644
--- a/src/firestarter/Cuda/Cuda.cpp
+++ b/src/firestarter/Cuda/Cuda.cpp
@@ -1,6 +1,6 @@
 /******************************************************************************
  * FIRESTARTER - A Processor Stress Test Utility
- * Copyright (C) 2020-2023 TU Dresden, Center for Information Services and High
+ * Copyright (C) 2020-2024 TU Dresden, Center for Information Services and High
  * Performance Computing
  *
  * This program is free software: you can redistribute it and/or modify
@@ -19,614 +19,284 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
-/* CUDA error checking based on CudaWrapper.h
- * https://github.com/ashwin/gDel3D/blob/master/GDelFlipping/src/gDel3D/GPU/CudaWrapper.h
- *
+/******************************************************************************
  * inspired by gpu_burn
  * http://wili.cc/blog/gpu-burn.html
  *****************************************************************************/
 
-#include <firestarter/Cuda/Cuda.hpp>
-#include <firestarter/LoadWorkerData.hpp>
-#include <firestarter/Logging/Log.hpp>
+#include "firestarter/Cuda/Cuda.hpp"
+#include "firestarter/Cuda/CudaHipCompat.hpp"
+#include "firestarter/Logging/Log.hpp"
 
-#ifdef FIRESTARTER_BUILD_CUDA
-  #include <cublas_v2.h>
-  #include <cuda.h>
-  #include <cuda_runtime_api.h>
-  #include <curand_kernel.h>
-  #define FS_ACCEL_PREFIX_LC_LONG cuda
-  #define FS_ACCEL_PREFIX_LC cu
-  #define FS_ACCEL_PREFIX_UC CU
-  #define FS_ACCEL_PREFIX_UC_LONG CUDA
-  #define FS_ACCEL_STRING "CUDA"
-#else
-  #ifdef FIRESTARTER_BUILD_HIP
-    #include <hipblas/hipblas.h>
-    #include <hip/hip_runtime.h>
-    #include <hip/hip_runtime_api.h>
-    #include <hiprand_kernel.h>
-  #define FS_ACCEL_PREFIX_LC_LONG hip
-  #define FS_ACCEL_PREFIX_LC hip
-  #define FS_ACCEL_PREFIX_UC HIP
-  #define FS_ACCEL_PREFIX_UC_LONG HIP
-  #define FS_ACCEL_STRING "HIP"
-  #else
-    #error "Attempting to compile file but neither CUDA nor HIP is used"
-  #endif
-#endif
-#define CONCAT_(prefix, suffix) prefix##suffix
-/// Concatenate `prefix, suffix` into `prefixsuffix`
-#define CONCAT(prefix, suffix) CONCAT_(prefix, suffix)
-//#define FS_ACCEL_ERROR_TYPE CONCAT(FS_ACCEL_PREFIX_LC_LONG,Error_t)
-//#define FS_ACCEL_BLAS_STATUS_TYPE cublasStatus_t
-//#define FS_ACCEL_RAND_STATUS_TYPE curandStatus_t
-
-#include <algorithm>
 #include <atomic>
+#include <cmath>
+#include <cstddef>
 #include <type_traits>
 
-#define ACCELL_SAFE_CALL(cuerr, dev_index)                                       \
-  accell_safe_call(cuerr, dev_index, __FILE__, __LINE__)
-#define SEED 123
-
-using namespace firestarter::cuda;
-
-// CUDA error checking
-static inline void accell_safe_call(CONCAT(FS_ACCEL_PREFIX_LC_LONG,Error_t) cuerr, int dev_index,
-                                  const char *file, const int line) {
-  if (cuerr != CONCAT(FS_ACCEL_PREFIX_LC_LONG,Success) && cuerr != 1) {
-    firestarter::log::error()
-        << FS_ACCEL_STRING" error at " << file << ":" << line << ": error code = " << cuerr
-        << " (" << CONCAT(FS_ACCEL_PREFIX_LC_LONG,GetErrorString)(cuerr)
-        << "), device index: " << dev_index;
-    exit(cuerr);
-  }
-
-  return;
-}
-
-static const char *_accellGetErrorEnum(CONCAT(FS_ACCEL_PREFIX_LC,blasStatus_t) error) {
-  switch (error) {
-  case CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_SUCCESS):
-    return FS_ACCEL_STRING"blas status: success";
-  case CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_NOT_INITIALIZED):
-    return FS_ACCEL_STRING"blas status: not initialized";
-  case CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_ALLOC_FAILED):
-    return FS_ACCEL_STRING"blas status: alloc failed";
-  case CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_INVALID_VALUE):
-    return FS_ACCEL_STRING"blas status: invalid value";
-  case CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_ARCH_MISMATCH):
-    return FS_ACCEL_STRING"blas status: arch mismatch";
-  case CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_MAPPING_ERROR):
-    return FS_ACCEL_STRING"blas status: mapping error";
-  case CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_EXECUTION_FAILED):
-    return FS_ACCEL_STRING"blas status: execution failed";
-  case CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_INTERNAL_ERROR):
-    return FS_ACCEL_STRING"blas status: internal error";
-  case CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_NOT_SUPPORTED):
-    return FS_ACCEL_STRING"blas status: not supported";
-#ifdef FIRESTARTER_BUILD_CUDA
-  case CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_LICENSE_ERROR):
-    return FS_ACCEL_STRING"blas status: license error";
-#endif
-#ifdef FIRESTARTER_BUILD_HIP
-    case CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_UNKNOWN):
-      return FS_ACCEL_STRING"blas status: unknown";
-    case CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_HANDLE_IS_NULLPTR):
-      return FS_ACCEL_STRING"blas status: handle is null pointer";
-    case CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_INVALID_ENUM):
-      return FS_ACCEL_STRING"blas status: invalid enum";
-#endif
-  }
-
-
-  return "<unknown>";
-}
-
-static inline void accell_safe_call(CONCAT(FS_ACCEL_PREFIX_LC,blasStatus_t) cuerr, int dev_index,
-                                  const char *file, const int line) {
-  if (cuerr != CONCAT(FS_ACCEL_PREFIX_UC,BLAS_STATUS_SUCCESS)) {
-    firestarter::log::error()
-        << FS_ACCEL_STRING"BLAS error at " << file << ":" << line
-        << ": error code = " << cuerr << " (" << _accellGetErrorEnum(cuerr)
-        << "), device index: " << dev_index;
-    exit(cuerr);
-  }
-
-  return;
-}
-
-#ifdef FIRESTARTER_BUILD_CUDA
-static inline void accell_safe_call(CONCAT(FS_ACCEL_PREFIX_UC,result) cuerr, int dev_index,
-                                  const char *file, const int line) {
-  if (cuerr != CONCAT(FS_ACCEL_PREFIX_UC_LONG,_SUCCESS)) {
-    const char *errorString;
-
-    ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC,GetErrorName)(cuerr, &errorString), dev_index);
-
-    firestarter::log::error()
-        << FS_ACCEL_STRING" error at " << file << ":" << line << ": error code = " << cuerr
-        << " (" << errorString << "), device index: " << dev_index;
-    exit(cuerr);
-  }
-
-  return;
-}
-#endif
-
-static const char *_accellrandGetErrorEnum(CONCAT(FS_ACCEL_PREFIX_LC,randStatus_t) cuerr) {
-  switch (cuerr) {
-    case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_SUCCESS):
-      return FS_ACCEL_STRING"rand status: success";
-    case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_VERSION_MISMATCH):
-      return FS_ACCEL_STRING"rand status: version mismatch";
-    case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_NOT_INITIALIZED):
-      return FS_ACCEL_STRING"rand status: not initialized";
-    case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_ALLOCATION_FAILED):
-      return FS_ACCEL_STRING"rand status: allocation failed";
-    case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_TYPE_ERROR):
-      return FS_ACCEL_STRING"rand status: type error";
-    case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_OUT_OF_RANGE):
-      return FS_ACCEL_STRING"rand status: out of range";
-    case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_LENGTH_NOT_MULTIPLE):
-      return FS_ACCEL_STRING"rand status: length not multiple";
-    case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_DOUBLE_PRECISION_REQUIRED):
-      return FS_ACCEL_STRING"rand status: double precision required";
-    case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_LAUNCH_FAILURE):
-      return FS_ACCEL_STRING"rand status: launch failure";
-    case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_PREEXISTING_FAILURE):
-      return FS_ACCEL_STRING"rand status: preexisting failure";
-    case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_INITIALIZATION_FAILED):
-      return FS_ACCEL_STRING"rand status: initialization failed";
-    case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_ARCH_MISMATCH):
-      return FS_ACCEL_STRING"rand status: arch mismatch";
-    case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_INTERNAL_ERROR):
-      return FS_ACCEL_STRING"rand status: internal error";
-#ifdef FIRESTARTER_BUILD_HIP
-  case CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_NOT_IMPLEMENTED):
-      return FS_ACCEL_STRING"rand status: not implemented";
-#endif
-  }
-
-  return "<unknown>";
-}
+namespace firestarter::cuda {
 
-static inline void accell_safe_call(CONCAT(FS_ACCEL_PREFIX_LC,randStatus_t) cuerr, int dev_index,
-                                  const char *file, const int line) {
-  if (cuerr != CONCAT(FS_ACCEL_PREFIX_UC,RAND_STATUS_SUCCESS)) {
-    firestarter::log::error()
-        << FS_ACCEL_STRING"RAND error at " << file << ":" << line
-        << ": error code = " << cuerr << " (" << _accellrandGetErrorEnum(cuerr)
-        << "), device index: " << dev_index;
-    exit(cuerr);
-  }
+constexpr const int Seed = 123;
 
-  return;
-}
+namespace {
 
-static int round_up(int num_to_round, int multiple) {
-  if (multiple == 0) {
-    return num_to_round;
-  }
+template <std::size_t Multiple> auto roundUp(int NumToRound) -> int {
+  static_assert(Multiple != 0, "Multiple may not be zero.");
 
-  int remainder = num_to_round % multiple;
-  if (remainder == 0) {
-    return num_to_round;
+  const int Remainder = NumToRound % Multiple;
+  if (Remainder == 0) {
+    return NumToRound;
   }
 
-  return num_to_round + multiple - remainder;
+  return NumToRound + Multiple - Remainder;
 }
 
-#ifdef FIRESTARTER_BUILD_CUDA
-static int get_precision(int useDouble, struct cudaDeviceProp properties) {
-#else
-#ifdef FIRESTARTER_BUILD_HIP
-static int get_precision(int useDouble, struct hipDeviceProp_t properties) {
-#endif
-#endif
+/// Convert the UseDouble input (0 -> single precision, 1 -> double precision, 2 -> automatic) to either 0 or 1 for
+/// float or double respectively. For CUDART_VERSION of at least 8000 and automatic selection, we check if the card
+/// has a singleToDoublePrecisionPerfRatio bigger than 3 and select float in this case, otherwise double. In all other
+/// cases automatic results in double.
+/// \arg UseDouble The input that specifies either single precision, double precision or automatic selection.
+/// \arg Properties The device properties.
+/// \return The selected precision, either 0 or 1 for float or double respectively.
+auto getPrecision(int UseDouble, const compat::DeviceProperties& Properties) -> int {
 #if (CUDART_VERSION >= 8000)
-// read precision ratio (dp/sp) of GPU to choose the right variant for maximum
-// workload
-  if (useDouble == 2 && properties.singleToDoublePrecisionPerfRatio > 3) {
+  // read precision ratio (dp/sp) of GPU to choose the right variant for maximum
+  // workload
+  if (UseDouble == 2 && Properties.singleToDoublePrecisionPerfRatio > 3) {
     return 0;
-  } else if (useDouble) {
+  }
+  if (UseDouble) {
     return 1;
-  } else {
-    return 0;
   }
-}
+  return 0;
 #else
-// as precision ratio is not supported return default/user input value
-  (void)properties;
+  // as precision ratio is not supported return default/user input value
+  (void)Properties;
 
-  if (useDouble) {
+  if (UseDouble) {
     return 1;
-  } else {
-    return 0;
   }
-}
-#endif
+  return 0;
 
-static int get_precision(int device_index, int useDouble) {
-  size_t memory_avail, memory_total;
-#ifdef FIRESTARTER_BUILD_CUDA
-  CUcontext context;
-  CUdevice device;
-  struct cudaDeviceProp properties;
-  ACCELL_SAFE_CALL(cuDeviceGet(&device, device_index), device_index);
-  ACCELL_SAFE_CALL(cuCtxCreate(&context, 0, device), device_index);
-  ACCELL_SAFE_CALL(cuCtxSetCurrent(context), device_index);
-#else
-#ifdef FIRESTARTER_BUILD_HIP
-  struct hipDeviceProp_t properties;
-  ACCELL_SAFE_CALL(hipSetDevice(device_index), device_index);
-#endif
 #endif
-  ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC,MemGetInfo)(&memory_avail, &memory_total), device_index);
-  ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC_LONG,GetDeviceProperties)(&properties, device_index),
-                 device_index);
+}
 
-  useDouble = get_precision(useDouble, properties);
+auto getPrecision(int DeviceIndex, int UseDouble) -> int {
+  std::size_t MemoryAvail{};
+  std::size_t MemoryTotal{};
+  compat::DeviceProperties Properties;
 
-  // we check for double precision support on the GPU and print errormsg, when
-  // the user wants to compute DP on a SP-only-Card.
-  if (useDouble && properties.major <= 1 && properties.minor <= 2) {
-    std::stringstream ss;
-    ss << FS_ACCEL_STRING" GPU " << device_index << ": " << properties.name << " ";
-
-    firestarter::log::error()
-        << ss.str() << "Doesn't support double precision.\n"
-        << ss.str() << "Compute Capability: " << properties.major << "."
-        << properties.minor << ". Requiered for double precision: >=1.3\n"
-        << ss.str()
-        << "Stressing with single precision instead. Maybe use -f parameter.";
-
-    useDouble = 0;
-  }
+  // NOLINTNEXTLINE(readability-qualified-auto)
+  auto StreamOrContext = compat::createContextOrStream(DeviceIndex);
 
-#ifdef FIRESTARTER_BUILD_CUDA
-  ACCELL_SAFE_CALL(cuCtxDestroy(context), device_index);
-#endif
+  compat::accellSafeCall(compat::memGetInfo(MemoryAvail, MemoryTotal), __FILE__, __LINE__, DeviceIndex);
+  compat::accellSafeCall(compat::getDeviceProperties(Properties, DeviceIndex), __FILE__, __LINE__, DeviceIndex);
 
-  return useDouble;
-}
+  UseDouble = getPrecision(UseDouble, Properties);
 
+  const bool DoubleNotSupported =
 #ifdef FIRESTARTER_BUILD_CUDA
-static int get_msize(int device_index, int useDouble) {
-  CUcontext context;
-  CUdevice device;
-  size_t memory_avail, memory_total;
-
-  ACCELL_SAFE_CALL(cuDeviceGet(&device, device_index), device_index);
-  ACCELL_SAFE_CALL(cuCtxCreate(&context, 0, device), device_index);
-  ACCELL_SAFE_CALL(cuCtxSetCurrent(context), device_index);
-  ACCELL_SAFE_CALL(cuMemGetInfo(&memory_avail, &memory_total), device_index);
-
-  ACCELL_SAFE_CALL(cuCtxDestroy(context), device_index);
-
-  return round_up(
-      (int)(0.8 * sqrt(((memory_avail) /
-                        ((useDouble ? sizeof(double) : sizeof(float)) * 3)))),
-      1024); // a multiple of 1024 works always well
-}
+      Properties.major <= 1 && Properties.minor <= 2;
+#else
+      false;
 #endif
 
-static CONCAT(FS_ACCEL_PREFIX_LC,blasStatus_t) gemm(
-                            CONCAT(FS_ACCEL_PREFIX_LC,blasHandle_t) handle,
-                            CONCAT(FS_ACCEL_PREFIX_LC,blasOperation_t) transa,
-                            CONCAT(FS_ACCEL_PREFIX_LC,blasOperation_t) transb,
-                            int &m, int &n, int &k,
-                            const float *alpha, const float *A, int &lda,
-                            const float *B, int &ldb, const float *beta,
-                            float *C, int &ldc) {
-  return CONCAT(FS_ACCEL_PREFIX_LC,blasSgemm)(handle, transa, transb, m, n, k,
-                                              alpha, A, lda, B, ldb,
-                                              beta, C, ldc);
-}
+  // Check for double precision support on the GPU and print an error message when
+  // the user wants to compute DP on an SP-only card.
+  if (UseDouble && DoubleNotSupported) {
+    std::stringstream Ss;
+    Ss << compat::AccelleratorString << " GPU " << DeviceIndex << ": " << Properties.name << " ";
 
-static CONCAT(FS_ACCEL_PREFIX_LC,blasStatus_t) gemm(
-                            CONCAT(FS_ACCEL_PREFIX_LC,blasHandle_t) handle,
-                            CONCAT(FS_ACCEL_PREFIX_LC,blasOperation_t) transa,
-                            CONCAT(FS_ACCEL_PREFIX_LC,blasOperation_t) transb,
-                            int &m, int &n, int &k,
-                            const double *alpha, const double *A, int &lda,
-                            const double *B, int &ldb, const double *beta,
-                            double *C, int &ldc) {
-  return CONCAT(FS_ACCEL_PREFIX_LC,blasDgemm)(handle, transa, transb, m, n, k,
-                                              alpha, A, lda, B, ldb,
-                                              beta, C, ldc);
-}
+    firestarter::log::error() << Ss.str() << "Doesn't support double precision.\n"
+                              << Ss.str() << "Compute Capability: " << Properties.major << "." << Properties.minor
+                              << ". Requiered for double precision: >=1.3\n"
+                              << Ss.str() << "Stressing with single precision instead. Maybe use -f parameter.";
 
-static CONCAT(FS_ACCEL_PREFIX_LC,randStatus_t) generateUniform(
-                            CONCAT(FS_ACCEL_PREFIX_LC,randGenerator_t) generator,
-                            float *outputPtr, size_t num) {
-  return CONCAT(FS_ACCEL_PREFIX_LC,randGenerateUniform)(generator, outputPtr, num);
-}
+    UseDouble = 0;
+  }
 
-static CONCAT(FS_ACCEL_PREFIX_LC,randStatus_t) generateUniform(
-                            CONCAT(FS_ACCEL_PREFIX_LC,randGenerator_t) generator,
-                            double *outputPtr, size_t num) {
-  return CONCAT(FS_ACCEL_PREFIX_LC,randGenerateUniformDouble)(generator, outputPtr, num);
+  compat::accellSafeCall(compat::destroyContextOrStream(StreamOrContext), __FILE__, __LINE__, DeviceIndex);
+
+  return UseDouble;
 }
 
 // GPU index. Used to pin this thread to the GPU.
-template <typename T>
-static void create_load(std::condition_variable &waitForInitCv,
-                        std::mutex &waitForInitCvMutex, int device_index,
-                        std::atomic<int> &initCount,
-                        volatile unsigned long long *loadVar, int matrixSize) {
-  static_assert(
-      std::is_same<T, float>::value || std::is_same<T, double>::value,
-      "create_load<T>: Template argument T must be either float or double");
-
-  int iterations, i;
-
-  firestarter::log::trace() << "Starting CUDA/HIP with given matrix size "
-                            << matrixSize;
-
-  size_t size_use = 0;
-  if (matrixSize > 0) {
-    size_use = matrixSize;
-  }
-
-  size_t use_bytes, memory_size;
-#ifdef FIRESTARTER_BUILD_CUDA
-  CUcontext context;
-  struct cudaDeviceProp properties;
-  CUdevice device;
-  cublasHandle_t cublas;
-#else
-#ifdef FIRESTARTER_BUILD_HIP
-  hipStream_t stream;
-  struct hipDeviceProp_t properties;
-  hipDevice_t device;
-  hipblasHandle_t cublas;
-#endif
-#endif
+// MatrixSize is the dimension of one square matrix; 0 selects a size based on the available device memory.
+template <typename FloatingPointType>
+void createLoad(std::condition_variable& WaitForInitCv, std::mutex& WaitForInitCvMutex, int DeviceIndex,
+                std::atomic<int>& InitCount, const volatile firestarter::LoadThreadWorkType& LoadVar,
+                unsigned MatrixSize) {
+  static_assert(std::is_same_v<FloatingPointType, float> || std::is_same_v<FloatingPointType, double>,
+                "create_load<FloatingPointType>: Template argument must be either float or double");
+
+  firestarter::log::trace() << "Starting " << compat::AccelleratorString << " with given matrix size " << MatrixSize;
+
+  compat::DeviceProperties Properties;
+  compat::BlasHandle Blas{};
   // reserving the GPU and initializing cublas
 
-  firestarter::log::trace() << "Getting " FS_ACCEL_STRING " device nr. " << device_index;
-  ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC,DeviceGet)(&device, device_index), device_index);
-
-#ifdef FIRESTARTER_BUILD_CUDA
-  firestarter::log::trace() << "Creating " FS_ACCEL_STRING " context for computation on device nr. "
-                     << device_index;
-  ACCELL_SAFE_CALL(cuCtxCreate(&context, 0, device), device_index);
-
-  firestarter::log::trace() << "Set created " FS_ACCEL_STRING " context on device nr. "
-                     << device_index;
-  ACCELL_SAFE_CALL(cuCtxSetCurrent(context), device_index);
-#else
-#ifdef FIRESTARTER_BUILD_HIP
-  firestarter::log::trace() << "Creating " FS_ACCEL_STRING " Stream for computation on device nr. "
-                     << device_index;
-  ACCELL_SAFE_CALL(hipSetDevice(device_index), device_index);
-  ACCELL_SAFE_CALL(hipStreamCreate(&stream), device_index);
-#endif
-#endif
+  // NOLINTNEXTLINE(readability-qualified-auto)
+  auto StreamOrContext = compat::createContextOrStream(DeviceIndex);
 
-  firestarter::log::trace() << "Create " FS_ACCEL_STRING " Blas on device nr. "
-                     << device_index;
-  ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC,blasCreate)(&cublas), device_index);
+  firestarter::log::trace() << "Create " << compat::AccelleratorString << " Blas on device nr. " << DeviceIndex;
+  compat::accellSafeCall(compat::blasCreate(Blas), __FILE__, __LINE__, DeviceIndex);
 
-  firestarter::log::trace() << "Get " FS_ACCEL_STRING " device properties (e.g., support for double)"
-                     << " on device nr. "
-                     << device_index;
-  ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC_LONG,GetDeviceProperties)(&properties, device_index),
-                 device_index);
+  firestarter::log::trace() << "Get " << compat::AccelleratorString << " device properties (e.g., support for double)"
+                            << " on device nr. " << DeviceIndex;
+  compat::accellSafeCall(compat::getDeviceProperties(Properties, DeviceIndex), __FILE__, __LINE__, DeviceIndex);
 
   // getting information about the GPU memory
-  size_t memory_avail, memory_total;
-  ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC,MemGetInfo)(&memory_avail, &memory_total), device_index);
-
-  firestarter::log::trace() << "Get " FS_ACCEL_STRING " Memory info on device nr. "
-                     << device_index
-                     <<": " << memory_avail << " B avail. from "
-                     << memory_total << " B total";
-
-  // defining memory pointers
-#ifdef FIRESTARTER_BUILD_CUDA
-  CUdeviceptr a_data_ptr;
-  CUdeviceptr b_data_ptr;
-  CUdeviceptr c_data_ptr;
-#else
-#ifdef FIRESTARTER_BUILD_HIP
-  T* a_data_ptr;
-  T* b_data_ptr;
-  T* c_data_ptr;
-#endif
-#endif
-
-  // check if the user has not set a matrix OR has set a too big matrixsite and
-  // if this is true: set a good matrixsize
-  if (!size_use || ((size_use * size_use * sizeof(T) * 3 > memory_avail))) {
-    size_use = round_up((int)(0.8 * sqrt(((memory_avail) / (sizeof(T) * 3)))),
-                        1024); // a multiple of 1024 works always well
+  std::size_t MemoryAvail{};
+  std::size_t MemoryTotal{};
+  compat::accellSafeCall(compat::memGetInfo(MemoryAvail, MemoryTotal), __FILE__, __LINE__, DeviceIndex);
+  firestarter::log::trace() << "Get " << compat::AccelleratorString << " emory info on device nr. " << DeviceIndex
+                            << ": " << MemoryAvail << " B avail. from " << MemoryTotal << " B total";
+
+  // Defining memory pointers. ADataPtr and BDataPtr will each point to one square matrix. CDataPtr may point to one
+  // or multiple square matrices.
+  FloatingPointType* ADataPtr{};
+  FloatingPointType* BDataPtr{};
+  FloatingPointType* CDataPtr{};
+
+  // If the matrix size is not set or three square matrices with dim size of MatrixSize do not fit into the available
+  // memory, select the size so that 3 square matrices will fit into the available device memory where the dim size
+  // is a multiple of 1024. There may be edge cases with small device memory that result in matrices that are not
+  // multiples of 1024.
+  std::size_t MemorySize = sizeof(FloatingPointType) * MatrixSize * MatrixSize;
+  if (!MatrixSize || (MemorySize * 3 > MemoryAvail)) {
+    // a multiple of 1024 works always well
+    MatrixSize = roundUp<1024>(0.8 * std::sqrt(MemoryAvail / sizeof(FloatingPointType) / 3));
+    MemorySize = sizeof(FloatingPointType) * MatrixSize * MatrixSize;
   }
-  firestarter::log::trace() << "Set " FS_ACCEL_STRING " matrix size: " << matrixSize;
-  use_bytes = (size_t)((T)memory_avail);
-  memory_size = sizeof(T) * size_use * size_use;
-  iterations = (use_bytes - 2 * memory_size) / memory_size; // = 1;
 
-  firestarter::log::trace()
-      << "Allocating " FS_ACCEL_STRING " memory on device nr. "
-      << device_index;
+  firestarter::log::trace() << "Set " << compat::AccelleratorString << " matrix size: " << MatrixSize;
+  // Calculate the number of C matrices based on the available memory and the matrix size in bytes.
+  const auto Iterations = (MemoryAvail - 2 * MemorySize) / MemorySize;
+  // Used memory is two times the matrix size in bytes (matrices A and B) plus the size of all matrices in C.
+  const auto UseBytes = (2 + Iterations) * MemorySize;
 
-  // allocating memory on the GPU
-#ifdef FIRESTARTER_BUILD_CUDA
-  ACCELL_SAFE_CALL(cuMemAlloc(&a_data_ptr, memory_size), device_index);
-  ACCELL_SAFE_CALL(cuMemAlloc(&b_data_ptr, memory_size), device_index);
-  ACCELL_SAFE_CALL(cuMemAlloc(&c_data_ptr, iterations * memory_size),
-                 device_index);
-#else
-#ifdef FIRESTARTER_BUILD_HIP
-  ACCELL_SAFE_CALL(hipMalloc(&a_data_ptr, memory_size), device_index);
-  ACCELL_SAFE_CALL(hipMalloc(&b_data_ptr, memory_size), device_index);
-  ACCELL_SAFE_CALL(hipMalloc(&c_data_ptr, iterations * memory_size),
-                 device_index);
-#endif
-#endif
+  firestarter::log::trace() << "Allocating " << compat::AccelleratorString << " memory on device nr. " << DeviceIndex;
 
-  firestarter::log::trace() << "Allocated " FS_ACCEL_STRING " memory on device nr. "
-                     << device_index
-                     <<". A: " << a_data_ptr << "(Size: "
-                     << memory_size << "B)"
-                     << "\n";
-
-  firestarter::log::trace() << "Allocated " FS_ACCEL_STRING " memory on device nr. "
-                     << device_index
-                     <<". B: " << b_data_ptr << "(Size: "
-                     << memory_size << "B)"
-                     << "\n";
-  firestarter::log::trace() << "Allocated " FS_ACCEL_STRING " memory on device nr. "
-                     << device_index
-                     <<". C: " << c_data_ptr << "(Size: "
-                     << iterations * memory_size << "B)"
-                     << "\n";
-
-  firestarter::log::trace() << "Initializing " FS_ACCEL_STRING " matrices a, b on device nr. "
-                            << device_index
-                            << ". Using "
-                            << size_use * size_use
-                            << " elements of size "
-                            << sizeof(T) << " Byte";
+  // allocating memory on the GPU
+  compat::accellSafeCall(compat::malloc<FloatingPointType>(&ADataPtr, MemorySize), __FILE__, __LINE__, DeviceIndex);
+  compat::accellSafeCall(compat::malloc<FloatingPointType>(&BDataPtr, MemorySize), __FILE__, __LINE__, DeviceIndex);
+  compat::accellSafeCall(compat::malloc<FloatingPointType>(&CDataPtr, Iterations * MemorySize), __FILE__, __LINE__,
+                         DeviceIndex);
+
+  firestarter::log::trace() << "Allocated " << compat::AccelleratorString << " memory on device nr. " << DeviceIndex
+                            << ". A: " << ADataPtr << " (Size: " << MemorySize << "B)"
+                            << "\n";
+  firestarter::log::trace() << "Allocated " << compat::AccelleratorString << " memory on device nr. " << DeviceIndex
+                            << ". B: " << BDataPtr << " (Size: " << MemorySize << "B)"
+                            << "\n";
+  firestarter::log::trace() << "Allocated " << compat::AccelleratorString << " memory on device nr. " << DeviceIndex
+                            << ". C: " << CDataPtr << " (Size: " << Iterations * MemorySize << "B)"
+                            << "\n";
+
+  firestarter::log::trace() << "Initializing " << compat::AccelleratorString << " matrices a, b on device nr. "
+                            << DeviceIndex << ". Using " << MatrixSize * MatrixSize << " elements of size "
+                            << sizeof(FloatingPointType) << " Byte";
   // initialize matrix A and B on the GPU with random values
-  CONCAT(FS_ACCEL_PREFIX_LC,randGenerator_t) random_gen;
-  ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC,randCreateGenerator)(
-                              &random_gen,
-                              CONCAT(FS_ACCEL_PREFIX_UC,RAND_RNG_PSEUDO_DEFAULT)),
-                  device_index);
-  ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC,randSetPseudoRandomGeneratorSeed)(
-                              random_gen, SEED),
-                   device_index);
-  ACCELL_SAFE_CALL(
-      generateUniform(random_gen, (T *)a_data_ptr, size_use * size_use),
-      device_index);
-  ACCELL_SAFE_CALL(
-      generateUniform(random_gen, (T *)b_data_ptr, size_use * size_use),
-      device_index);
-  ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC,randDestroyGenerator)(random_gen),
-                   device_index);
+  {
+    compat::RandGenerator RandomGen{};
+    compat::accellSafeCall(compat::randCreateGeneratorPseudoRandom(RandomGen), __FILE__, __LINE__, DeviceIndex);
+    compat::accellSafeCall(compat::randSetPseudoRandomGeneratorSeed(RandomGen, Seed), __FILE__, __LINE__, DeviceIndex);
+    compat::accellSafeCall(compat::generateUniform<FloatingPointType>(RandomGen, ADataPtr, MatrixSize * MatrixSize),
+                           __FILE__, __LINE__, DeviceIndex);
+    compat::accellSafeCall(compat::generateUniform<FloatingPointType>(RandomGen, BDataPtr, MatrixSize * MatrixSize),
+                           __FILE__, __LINE__, DeviceIndex);
+    compat::accellSafeCall(compat::randDestroyGenerator(RandomGen), __FILE__, __LINE__, DeviceIndex);
+  }
 
   // initialize c_data_ptr with copies of A
-  for (i = 0; i < iterations; i++) {
-      firestarter::log::trace() << "Initializing " FS_ACCEL_STRING " matrix c-"
-                                << i
-                                << " by copying "
-                                << memory_size
-                                << " byte from "
-                                << a_data_ptr
-                                << " to "
-                                << c_data_ptr + (size_t)(i * size_use * size_use * (float)sizeof(T)/(float)sizeof(c_data_ptr))
-                                << "\n";
-    ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC,MemcpyDtoD)(
-                                c_data_ptr + (size_t)(i * size_use * size_use * (float)sizeof(T)/(float)sizeof(c_data_ptr)),
-                                a_data_ptr, memory_size),
-                   device_index);
+  for (std::size_t I = 0; I < Iterations; I++) {
+    // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
+    auto DestinationPtr = CDataPtr + (I * MatrixSize * MatrixSize);
+    firestarter::log::trace() << "Initializing " << compat::AccelleratorString << " matrix c-" << I << " by copying "
+                              << MemorySize << " byte from " << ADataPtr << " to " << DestinationPtr << "\n";
+    compat::accellSafeCall(compat::memcpyDtoD<FloatingPointType>(DestinationPtr, ADataPtr, MemorySize), __FILE__,
+                           __LINE__, DeviceIndex);
   }
 
   // save gpuvar->init_count and sys.out
   {
-    std::lock_guard<std::mutex> lk(waitForInitCvMutex);
-
-#define TO_MB(x) (unsigned long)(x / 1024 / 1024)
-  firestarter::log::info()
-      << "   GPU " << device_index << "\n"
-      << "    name:           " << properties.name << "\n"
-      << "    memory:         " << TO_MB(memory_avail) << "/"
-      << TO_MB(memory_total) << " MiB available (using " << TO_MB(use_bytes)
-      << " MiB)\n"
-      << "    matrix size:    " << size_use << "\n"
-      << "    used precision: "
-      << ((sizeof(T) == sizeof(double)) ? "double" : "single");
-#undef TO_MB
-
-    initCount++;
+    const std::lock_guard<std::mutex> Lk(WaitForInitCvMutex);
+
+    auto ToMiB = [](const size_t Val) { return Val / 1024 / 1024; };
+    firestarter::log::info() << "   GPU " << DeviceIndex << "\n"
+                             << "    name:           " << Properties.name << "\n"
+                             << "    memory:         " << ToMiB(MemoryAvail) << "/" << ToMiB(MemoryTotal)
+                             << " MiB available (using " << ToMiB(UseBytes) << " MiB)\n"
+                             << "    matrix size:    " << MatrixSize << "\n"
+                             << "    used precision: "
+                             << ((sizeof(FloatingPointType) == sizeof(double)) ? "double" : "single");
+
+    InitCount++;
   }
-  waitForInitCv.notify_all();
+  WaitForInitCv.notify_all();
 
-  const T alpha = 1.0;
-  const T beta = 0.0;
+  const FloatingPointType Alpha = 1.0;
+  const FloatingPointType Beta = 0.0;
 
-  int size_use_i = size_use;
   // actual stress begins here
-  while (*loadVar != LOAD_STOP) {
-    for (i = 0; i < iterations; i++) {
-      ACCELL_SAFE_CALL(gemm(
-                          cublas,
-                          CONCAT(FS_ACCEL_PREFIX_UC,BLAS_OP_N),
-                          CONCAT(FS_ACCEL_PREFIX_UC,BLAS_OP_N),
-                          size_use_i, size_use_i,
-                          size_use_i, &alpha, (const T *)a_data_ptr, size_use_i,
-                          (const T *)b_data_ptr, size_use_i, &beta,
-                          (T *)c_data_ptr + i * size_use * size_use, size_use_i),
-                     device_index);
-      ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC_LONG,DeviceSynchronize)(),
-                       device_index);
+  while (LoadVar != firestarter::LoadThreadWorkType::LoadStop) {
+    for (std::size_t I = 0; I < Iterations; I++) {
+      // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
+      auto CSectionPtr = CDataPtr + (I * MatrixSize * MatrixSize);
+      compat::accellSafeCall(compat::gemm<FloatingPointType>(Blas, compat::BlasOperation::BLAS_OP_N,
+                                                             compat::BlasOperation::BLAS_OP_N, MatrixSize, MatrixSize,
+                                                             MatrixSize, Alpha, ADataPtr, MatrixSize, BDataPtr,
+                                                             MatrixSize, Beta, CSectionPtr, MatrixSize),
+                             __FILE__, __LINE__, DeviceIndex);
+      compat::accellSafeCall(compat::deviceSynchronize(), __FILE__, __LINE__, DeviceIndex);
     }
   }
 
-#ifdef FIRESTARTER_BUILD_CUDA
-  ACCELL_SAFE_CALL(cuMemFree(a_data_ptr), device_index);
-  ACCELL_SAFE_CALL(cuMemFree(b_data_ptr), device_index);
-  ACCELL_SAFE_CALL(cuMemFree(c_data_ptr), device_index);
-#else
-#ifdef FIRESTARTER_BUILD_HIP
-  ACCELL_SAFE_CALL(hipFree(a_data_ptr), device_index);
-  ACCELL_SAFE_CALL(hipFree(b_data_ptr), device_index);
-  ACCELL_SAFE_CALL(hipFree(c_data_ptr), device_index);
-#endif
-#endif
-  ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC,blasDestroy)(cublas), device_index);
-#ifdef FIRESTARTER_BUILD_CUDA
-  ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC,CtxDestroy)(context), device_index);
-#else
-#ifdef FIRESTARTER_BUILD_HIP
-  ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC,StreamDestroy)(stream), device_index);
-#endif
-#endif
+  compat::accellSafeCall(compat::free<FloatingPointType>(ADataPtr), __FILE__, __LINE__, DeviceIndex);
+  compat::accellSafeCall(compat::free<FloatingPointType>(BDataPtr), __FILE__, __LINE__, DeviceIndex);
+  compat::accellSafeCall(compat::free<FloatingPointType>(CDataPtr), __FILE__, __LINE__, DeviceIndex);
+
+  compat::accellSafeCall(compat::blasDestroy(Blas), __FILE__, __LINE__, DeviceIndex);
+
+  compat::accellSafeCall(compat::destroyContextOrStream(StreamOrContext), __FILE__, __LINE__, DeviceIndex);
 }
 
-Cuda::Cuda(volatile unsigned long long *loadVar, bool useFloat, bool useDouble,
-           unsigned matrixSize, int gpus) {
-  std::thread t(Cuda::initGpus, std::ref(_waitForInitCv), loadVar, useFloat,
-                useDouble, matrixSize, gpus);
-  _initThread = std::move(t);
+}; // namespace
+
+Cuda::Cuda(const volatile firestarter::LoadThreadWorkType& LoadVar, bool UseFloat, bool UseDouble, unsigned MatrixSize,
+           int Gpus) {
+  std::condition_variable WaitForInitCv;
+  std::mutex WaitForInitCvMutex;
 
-  std::unique_lock<std::mutex> lk(_waitForInitCvMutex);
+  std::thread T(Cuda::initGpus, std::ref(WaitForInitCv), std::cref(LoadVar), UseFloat, UseDouble, MatrixSize, Gpus);
+  InitThread = std::move(T);
+
+  std::unique_lock<std::mutex> Lk(WaitForInitCvMutex);
   // wait for gpus to initialize
-  _waitForInitCv.wait(lk);
+  WaitForInitCv.wait(Lk);
 }
 
-void Cuda::initGpus(std::condition_variable &cv,
-                    volatile unsigned long long *loadVar, bool useFloat,
-                    bool useDouble, unsigned matrixSize, int gpus) {
-  std::condition_variable waitForInitCv;
-  std::mutex waitForInitCvMutex;
+void Cuda::initGpus(std::condition_variable& WaitForInitCv, const volatile firestarter::LoadThreadWorkType& LoadVar,
+                    bool UseFloat, bool UseDouble, unsigned MatrixSize, int Gpus) {
+  std::condition_variable GpuThreadsWaitForInitCv;
+  std::mutex GpuThreadsWaitForInitCvMutex;
+  std::vector<std::thread> GpuThreads;
 
-  if (gpus) {
-    ACCELL_SAFE_CALL(CONCAT(FS_ACCEL_PREFIX_LC,Init)(0), -1);
-    int devCount;
-#ifdef FIRESTARTER_BUILD_CUDA
-    ACCELL_SAFE_CALL(cuDeviceGetCount(&devCount), -1);
-#else
-#ifdef FIRESTARTER_BUILD_HIP
-    ACCELL_SAFE_CALL(hipGetDeviceCount(&devCount), -1);
-#endif
-#endif
+  if (Gpus != 0) {
+    compat::accellSafeCall(compat::init(0), __FILE__, __LINE__);
+
+    int DevCount{};
+    compat::accellSafeCall(compat::getDeviceCount(DevCount), __FILE__, __LINE__);
 
-    if (devCount) {
-      std::vector<std::thread> gpuThreads;
-      std::atomic<int> initCount = 0;
-      int use_double;
+    if (DevCount) {
+      std::atomic<int> InitCount = 0;
+      int UseDoubleConverted{};
 
-      if (useFloat) {
-        use_double = 0;
-      } else if (useDouble) {
-        use_double = 1;
+      if (UseFloat) {
+        UseDoubleConverted = 0;
+      } else if (UseDouble) {
+        UseDoubleConverted = 1;
       } else {
-        use_double = 2;
+        UseDoubleConverted = 2;
       }
 
       firestarter::log::info()
@@ -636,65 +306,61 @@ void Cuda::initGpus(std::condition_variable &cv,
           << "\n  graphics processor characteristics:";
 
       // use all GPUs if the user gave no information about use_device
-      if (gpus < 0) {
-        gpus = devCount;
+      if (Gpus < 0) {
+        Gpus = DevCount;
       }
 
-      if (gpus > devCount) {
-        firestarter::log::warn()
-            << "You requested more " FS_ACCEL_STRING " devices than available. "
-               "Maybe you set " FS_ACCEL_STRING "_VISIBLE_DEVICES?";
-        firestarter::log::warn()
-            << "FIRESTARTER will use " << devCount << " of the requested "
-            << gpus << " " FS_ACCEL_STRING " device(s)";
-        gpus = devCount;
+      if (Gpus > DevCount) {
+        firestarter::log::warn() << "You requested more " << compat::AccelleratorString
+                                 << " devices than available. "
+                                    "Maybe you set "
+                                 << compat::AccelleratorString << "_VISIBLE_DEVICES?";
+        firestarter::log::warn() << "FIRESTARTER will use " << DevCount << " of the requested " << Gpus << " "
+                                 << compat::AccelleratorString << " device(s)";
+        Gpus = DevCount;
       }
 
       {
-        std::lock_guard<std::mutex> lk(waitForInitCvMutex);
+        const std::lock_guard<std::mutex> Lk(GpuThreadsWaitForInitCvMutex);
 
-        for (int i = 0; i < gpus; ++i) {
+        for (int I = 0; I < Gpus; ++I) {
           // if there's a GPU in the system without Double Precision support, we
           // have to correct this.
-          int precision = get_precision(i, use_double);
-
-          if (precision) {
-            std::thread t(create_load<double>, std::ref(waitForInitCv),
-                          std::ref(waitForInitCvMutex), i, std::ref(initCount),
-                          loadVar, (int)matrixSize);
-            gpuThreads.push_back(std::move(t));
-          } else {
-            std::thread t(create_load<float>, std::ref(waitForInitCv),
-                          std::ref(waitForInitCvMutex), i, std::ref(initCount),
-                          loadVar, (int)matrixSize);
-            gpuThreads.push_back(std::move(t));
-          }
+          const auto Precision = getPrecision(I, UseDoubleConverted);
+          void (*LoadFunc)(std::condition_variable&, std::mutex&, int, std::atomic<int>&,
+                           const volatile firestarter::LoadThreadWorkType&, unsigned) =
+              Precision ? createLoad<double> : createLoad<float>;
+
+          std::thread T(LoadFunc, std::ref(GpuThreadsWaitForInitCv), std::ref(GpuThreadsWaitForInitCvMutex), I,
+                        std::ref(InitCount), std::cref(LoadVar), MatrixSize);
+          GpuThreads.emplace_back(std::move(T));
         }
       }
 
       {
-        std::unique_lock<std::mutex> lk(waitForInitCvMutex);
+        std::unique_lock<std::mutex> Lk(GpuThreadsWaitForInitCvMutex);
         // wait for all threads to initialize
-        waitForInitCv.wait(lk, [&] { return initCount == gpus; });
-      }
-
-      // notify that init is done
-      cv.notify_all();
-
-      /* join computation threads */
-      for (auto &t : gpuThreads) {
-        t.join();
+        GpuThreadsWaitForInitCv.wait(Lk, [&] { return InitCount == Gpus; });
       }
     } else {
-      firestarter::log::info()
-          << "    - No " FS_ACCEL_STRING " devices. Just stressing CPU(s). Maybe use "
-             "FIRESTARTER instead of FIRESTARTER_" FS_ACCEL_STRING "?";
-      cv.notify_all();
+      firestarter::log::info() << "    - No " << compat::AccelleratorString
+                               << " devices. Just stressing CPU(s). Maybe use "
+                                  "FIRESTARTER instead of FIRESTARTER_"
+                               << compat::AccelleratorString << "?";
     }
   } else {
-    firestarter::log::info()
-        << "    --gpus 0 is set. Just stressing CPU(s). Maybe use "
-           "FIRESTARTER instead of FIRESTARTER_" FS_ACCEL_STRING "?";
-    cv.notify_all();
+    firestarter::log::info() << "    --gpus 0 is set. Just stressing CPU(s). Maybe use "
+                                "FIRESTARTER instead of FIRESTARTER_"
+                             << compat::AccelleratorString << "?";
+  }
+
+  // notify that init is done
+  WaitForInitCv.notify_all();
+
+  /* join computation threads */
+  for (auto& Thread : GpuThreads) {
+    Thread.join();
   }
 }
+
+} // namespace firestarter::cuda
\ No newline at end of file
diff --git a/src/firestarter/DumpRegisterWorker.cpp b/src/firestarter/DumpRegisterWorker.cpp
index 3f7ab6a9..127d0f1d 100644
--- a/src/firestarter/DumpRegisterWorker.cpp
+++ b/src/firestarter/DumpRegisterWorker.cpp
@@ -19,30 +19,25 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
-#ifdef FIRESTARTER_DEBUG_FEATURES
-
-#include <firestarter/Firestarter.hpp>
-#include <firestarter/Logging/Log.hpp>
+#include "firestarter/Firestarter.hpp"
 
 #include <fstream>
 #include <sstream>
 #include <thread>
 
-using namespace firestarter;
-
 namespace {
-static unsigned hammingDistance(unsigned long long x, unsigned long long y) {
-  unsigned dist = 0;
+auto hammingDistance(uint64_t X, uint64_t Y) -> unsigned {
+  unsigned Dist = 0;
 
-  for (unsigned long long val = x ^ y; val > 0; val >>= 1) {
-    dist += val & 1;
+  for (uint64_t Val = X ^ Y; Val > 0; Val >>= 1) {
+    Dist += Val & 1;
   }
 
-  return dist;
+  return Dist;
 }
 
-static std::string registerNameBySize(unsigned registerSize) {
-  switch (registerSize) {
+auto registerNameBySize(unsigned RegisterSize) -> std::string {
+  switch (RegisterSize) {
   case 2:
     return "xmm";
   case 4:
@@ -55,141 +50,120 @@ static std::string registerNameBySize(unsigned registerSize) {
 }
 } // namespace
 
-int Firestarter::initDumpRegisterWorker(std::chrono::seconds dumpTimeDelta,
-                                        std::string dumpFilePath) {
+namespace firestarter {
 
-  auto data = std::make_unique<DumpRegisterWorkerData>(
-      this->loadThreads.begin()->second, dumpTimeDelta, dumpFilePath);
+void Firestarter::initDumpRegisterWorker() {
+  // Create the data for the worker thread. The thread will dump the register contents periodically and calculate the
+  // hamming distance between dumps.
+  auto Data = std::make_unique<DumpRegisterWorkerData>(this->LoadThreads.begin()->second, Cfg.DumpRegistersTimeDelta,
+                                                       Cfg.DumpRegistersOutpath);
 
-  this->dumpRegisterWorkerThread =
-      std::thread(Firestarter::dumpRegisterWorker, std::move(data));
-
-  return EXIT_SUCCESS;
+  // Spawn the thread.
+  DumpRegisterWorkerThread = std::thread(Firestarter::dumpRegisterWorker, std::move(Data));
 }
 
-void Firestarter::joinDumpRegisterWorker() {
-  this->dumpRegisterWorkerThread.join();
-}
-
-void Firestarter::dumpRegisterWorker(
-    std::unique_ptr<DumpRegisterWorkerData> data) {
+void Firestarter::joinDumpRegisterWorker() { this->DumpRegisterWorkerThread.join(); }
 
+void Firestarter::dumpRegisterWorker(std::unique_ptr<DumpRegisterWorkerData> Data) {
+#if defined(linux) || defined(__linux__)
   pthread_setname_np(pthread_self(), "DumpRegWorker");
+#endif
 
-  int registerCount = data->loadWorkerData->config().payload().registerCount();
-  int registerSize = data->loadWorkerData->config().payload().registerSize();
-  std::string registerPrefix = registerNameBySize(registerSize);
-  auto offset = sizeof(DumpRegisterStruct) / sizeof(unsigned long long);
-
-  auto dumpRegisterStruct = reinterpret_cast<DumpRegisterStruct *>(
-      data->loadWorkerData->addrMem - offset);
+  const auto RegisterCount = Data->LoadWorkerDataPtr->config().payload()->registerCount();
+  const auto RegisterSize = Data->LoadWorkerDataPtr->config().payload()->registerSize();
+  const auto Offset = RegisterCount * RegisterSize;
+  const std::string RegisterPrefix = registerNameBySize(RegisterSize);
 
-  auto dumpVar = reinterpret_cast<volatile unsigned long long *>(
-      &dumpRegisterStruct->dumpVar);
+  auto& DumpRegisterStructRef = Data->LoadWorkerDataPtr->Memory->ExtraVars.Drs;
+  auto& DumpVar = DumpRegisterStructRef.DumpVar;
   // memory of simd variables is before the padding
-  volatile unsigned long long *dumpMemAddr =
-      dumpRegisterStruct->padding - registerCount * registerSize;
-
-  // TODO: maybe use aligned_malloc to make memcpy more efficient and don't
-  // interrupt the workload as much?
-  unsigned long long *last = reinterpret_cast<unsigned long long *>(
-      malloc(sizeof(unsigned long long) * offset));
-  unsigned long long *current = reinterpret_cast<unsigned long long *>(
-      malloc(sizeof(unsigned long long) * offset));
-
-  if (last == nullptr || current == nullptr) {
-    log::error() << "Malloc failed in Firestarter::dumpRegisterWorker";
-    exit(ENOMEM);
-  }
+  const auto* DumpMemAddr = DumpRegisterStructRef.Padding.data() - Offset;
+
+  // allocate contiguous memory that fits the register contents
+  auto Last = std::vector<uint64_t>(Offset);
 
-  std::stringstream dumpFilePath;
-  dumpFilePath << data->dumpFilePath;
+  std::stringstream DumpFilePath;
+  DumpFilePath << Data->DumpFilePath;
 #if defined(__MINGW32__) || defined(__MINGW64__)
-  dumpFilePath << "\\";
+  DumpFilePath << "\\";
 #else
-  dumpFilePath << "/";
+  DumpFilePath << "/";
 #endif
-  dumpFilePath << "hamming_distance.csv";
-  auto dumpFile = std::ofstream(dumpFilePath.str());
+  DumpFilePath << "hamming_distance.csv";
+  auto DumpFile = std::ofstream(DumpFilePath.str());
 
   // dump the header to the csv file
-  dumpFile << "total_hamming_distance,";
-  for (int i = 0; i < registerCount; i++) {
-    for (int j = 0; j < registerSize; j++) {
-      dumpFile << registerPrefix << i << "[" << j << "]";
+  DumpFile << "total_hamming_distance,";
+  for (auto I = 0U; I < RegisterCount; I++) {
+    for (auto J = 0U; J < RegisterSize; J++) {
+      DumpFile << RegisterPrefix << I << "[" << J << "]";
 
-      if (j != registerSize - 1) {
-        dumpFile << ",";
+      if (J != RegisterSize - 1) {
+        DumpFile << ",";
       }
     }
 
-    if (i != registerCount - 1) {
-      dumpFile << ",";
+    if (I != RegisterCount - 1) {
+      DumpFile << ",";
     }
   }
-  dumpFile << std::endl << std::flush;
+  DumpFile << '\n' << std::flush;
 
   // do not output the hamming distance for the first run
-  bool skipFirst = true;
+  bool SkipFirst = true;
 
   // continue until stop and dump the registers every data->dumpTimeDelta
   // seconds
-  for (; *data->loadWorkerData->addrHigh != LOAD_STOP;) {
+  for (; Data->LoadWorkerDataPtr->LoadVar != LoadThreadWorkType::LoadStop;) {
     // signal the thread to dump its largest SIMD registers
-    *dumpVar = DumpVariable::Start;
+    DumpVar = DumpVariable::Start;
     __asm__ __volatile__("mfence;");
-    while (*dumpVar == DumpVariable::Start) {
+    while (DumpVar == DumpVariable::Start) {
       std::this_thread::sleep_for(std::chrono::milliseconds(10));
     }
 
+    auto Current = std::vector<uint64_t>(Offset);
     // copy the register content to minimize the interruption of the load worker
-    std::memcpy(current, (void *)dumpMemAddr,
-                sizeof(unsigned long long) * offset);
+    std::memcpy(Current.data(), DumpMemAddr, Current.size() * sizeof(decltype(Current)::value_type));
 
     // skip the first output, as we first have to get some valid values for last
-    if (!skipFirst) {
+    if (!SkipFirst) {
       // calculate the total hamming distance
-      int totalHammingDistance = 0;
-      for (int i = 0; i < registerCount * registerSize; i++) {
-        totalHammingDistance += hammingDistance(current[i], last[i]);
+      auto TotalHammingDistance = 0U;
+      for (auto I = 0U; I < RegisterCount * RegisterSize; I++) {
+        TotalHammingDistance += hammingDistance(Current[I], Last[I]);
       }
 
-      dumpFile << totalHammingDistance << ",";
+      DumpFile << TotalHammingDistance << ",";
 
       // dump the hamming distance of each double (last, current) pair
-      for (int i = registerCount - 1; i >= 0; i--) {
-        // auto registerNum = registerCount - 1 - i;
-
-        for (auto j = 0; j < registerSize; j++) {
-          auto index = registerSize * i + j;
-          auto hd = static_cast<unsigned long long>(
-              hammingDistance(current[index], last[index]));
-
-          dumpFile << hd;
-          if (j != registerSize - 1) {
-            dumpFile << ",";
+      for (int I = static_cast<int>(RegisterCount) - 1; I >= 0; I--) {
+        for (auto J = 0U; J < RegisterSize; J++) {
+          auto Index = (RegisterSize * I) + J;
+          auto Hd = static_cast<uint64_t>(hammingDistance(Current[Index], Last[Index]));
+
+          DumpFile << Hd;
+          if (J != RegisterSize - 1) {
+            DumpFile << ",";
           }
         }
 
-        if (i != 0) {
-          dumpFile << ",";
+        if (I != 0) {
+          DumpFile << ",";
         }
       }
 
-      dumpFile << std::endl << std::flush;
+      DumpFile << '\n' << std::flush;
     } else {
-      skipFirst = false;
+      SkipFirst = false;
     }
 
-    std::memcpy(last, current, sizeof(unsigned long long) * offset);
+    Last = std::move(Current);
 
-    std::this_thread::sleep_for(std::chrono::seconds(data->dumpTimeDelta));
+    std::this_thread::sleep_for(std::chrono::seconds(Data->DumpTimeDelta));
   }
 
-  dumpFile.close();
-
-  free(last);
-  free(current);
+  DumpFile.close();
 }
 
-#endif
+} // namespace firestarter
\ No newline at end of file
diff --git a/src/firestarter/Environment/CPUTopology.cpp b/src/firestarter/Environment/CPUTopology.cpp
index d7fb4bf0..a7acf3f2 100644
--- a/src/firestarter/Environment/CPUTopology.cpp
+++ b/src/firestarter/Environment/CPUTopology.cpp
@@ -19,127 +19,117 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
-#include <firestarter/Environment/CPUTopology.hpp>
-#include <firestarter/Logging/Log.hpp>
+#include "firestarter/Environment/CPUTopology.hpp"
+#include "firestarter/Logging/Log.hpp"
 
 #include <array>
 #include <fstream>
 #include <regex>
+#include <utility>
 
-extern "C" {
-#include <stdio.h>
-}
-
-using namespace firestarter::environment;
+namespace firestarter::environment {
 
-std::ostream &CPUTopology::print(std::ostream &stream) const {
-  stream << "  system summary:\n"
-         << "    number of processors:        " << this->numPackages() << "\n"
-         << "    number of cores (total)):    " << this->numCoresTotal() << "\n"
-         << "  (this includes only cores in the cgroup)"  << "\n"
-         << "    number of threads per core:  " << this->numThreadsPerCore()
+auto CPUTopology::print(std::ostream& Stream) const -> std::ostream& {
+  Stream << "  system summary:\n"
+         << "    number of processors:        " << numPackages() << "\n"
+         << "    number of cores (total)):    " << numCoresTotal() << "\n"
+         << "  (this includes only cores in the cgroup)"
          << "\n"
-         << "    total number of threads:     " << this->numThreads() << "\n\n";
+         << "    number of threads per core:  " << numThreadsPerCore() << "\n"
+         << "    total number of threads:     " << numThreads() << "\n\n";
 
-  std::stringstream ss;
+  std::stringstream Ss;
 
-  for (auto const &ent : this->features()) {
-    ss << ent << " ";
+  for (auto const& Entry : features()) {
+    Ss << Entry << " ";
   }
 
-  stream << "  processor characteristics:\n"
-         << "    architecture:       " << this->architecture() << "\n"
-         << "    vendor:             " << this->vendor() << "\n"
-         << "    processor-name:     " << this->processorName() << "\n"
-         << "    model:              " << this->model() << "\n"
-         << "    frequency:          " << this->clockrate() / 1000000
-         << " MHz\n"
-         << "    supported features: " << ss.str() << "\n"
+  Stream << "  processor characteristics:\n"
+         << "    architecture:       " << architecture() << "\n"
+         << "    vendor:             " << vendor() << "\n"
+         << "    processor-name:     " << processorName() << "\n"
+         << "    model:              " << model() << "\n"
+         << "    frequency:          " << clockrate() / 1000000 << " MHz\n"
+         << "    supported features: " << Ss.str() << "\n"
          << "    Caches:";
 
-  std::vector<hwloc_obj_type_t> caches = {
-      HWLOC_OBJ_L1CACHE,  HWLOC_OBJ_L1ICACHE, HWLOC_OBJ_L2CACHE,
-      HWLOC_OBJ_L2ICACHE, HWLOC_OBJ_L3CACHE,  HWLOC_OBJ_L3ICACHE,
-      HWLOC_OBJ_L4CACHE,  HWLOC_OBJ_L5CACHE,
+  const std::vector<hwloc_obj_type_t> Caches = {
+      HWLOC_OBJ_L1CACHE, HWLOC_OBJ_L1ICACHE, HWLOC_OBJ_L2CACHE, HWLOC_OBJ_L2ICACHE,
+      HWLOC_OBJ_L3CACHE, HWLOC_OBJ_L3ICACHE, HWLOC_OBJ_L4CACHE, HWLOC_OBJ_L5CACHE,
   };
 
-  std::vector<std::string> cacheStrings = {};
+  for (hwloc_obj_type_t const& Cache : Caches) {
+    std::stringstream Ss;
 
-  for (hwloc_obj_type_t const &cache : caches) {
-    int width;
-    char string[128];
-    int shared;
-    hwloc_obj_t cacheObj;
-    std::stringstream ss;
+    auto Width = hwloc_get_nbobjs_by_type(Topology, Cache);
 
-    width = hwloc_get_nbobjs_by_type(this->topology, cache);
+    if (Width >= 1) {
+      Ss << "\n      - ";
 
-    if (width >= 1) {
-      ss << "\n      - ";
+      auto* CacheObj = hwloc_get_obj_by_type(Topology, Cache, 0);
+      std::array<char, 128> String{};
+      auto* StringPtr = String.data();
+      hwloc_obj_type_snprintf(StringPtr, sizeof(String), CacheObj, 0);
 
-      cacheObj = hwloc_get_obj_by_type(this->topology, cache, 0);
-      hwloc_obj_type_snprintf(string, sizeof(string), cacheObj, 0);
-
-      switch (cacheObj->attr->cache.type) {
+      switch (CacheObj->attr->cache.type) {
       case HWLOC_OBJ_CACHE_DATA:
-        ss << "Level " << cacheObj->attr->cache.depth << " Data";
+        Ss << "Level " << CacheObj->attr->cache.depth << " Data";
         break;
       case HWLOC_OBJ_CACHE_INSTRUCTION:
-        ss << "Level " << cacheObj->attr->cache.depth << " Instruction";
+        Ss << "Level " << CacheObj->attr->cache.depth << " Instruction";
         break;
       case HWLOC_OBJ_CACHE_UNIFIED:
       default:
-        ss << "Unified Level " << cacheObj->attr->cache.depth;
+        Ss << "Unified Level " << CacheObj->attr->cache.depth;
         break;
       }
 
-      ss << " Cache, " << cacheObj->attr->cache.size / 1024 << " KiB, "
-         << cacheObj->attr->cache.linesize << " B Cacheline, ";
+      Ss << " Cache, " << CacheObj->attr->cache.size / 1024 << " KiB, " << CacheObj->attr->cache.linesize
+         << " B Cacheline, ";
 
-      switch (cacheObj->attr->cache.associativity) {
+      switch (CacheObj->attr->cache.associativity) {
       case -1:
-        ss << "full";
+        Ss << "full";
         break;
       case 0:
-        ss << "unknown";
+        Ss << "unknown";
         break;
       default:
-        ss << cacheObj->attr->cache.associativity << "-way set";
+        Ss << CacheObj->attr->cache.associativity << "-way set";
         break;
       }
 
-      ss << " associative, ";
+      Ss << " associative, ";
 
-      shared = this->numThreads() / width;
+      auto Shared = numThreads() / Width;
 
-      if (shared > 1) {
-        ss << "shared among " << shared << " threads.";
+      if (Shared > 1) {
+        Ss << "shared among " << Shared << " threads.";
       } else {
-        ss << "per thread.";
+        Ss << "per thread.";
       }
 
-      stream << ss.str();
+      Stream << Ss.str();
     }
   }
 
-  return stream;
+  return Stream;
 }
 
-CPUTopology::CPUTopology(std::string architecture)
-    : _architecture(architecture) {
+CPUTopology::CPUTopology(std::string Architecture)
+    : Architecture(std::move(Architecture)) {
 
-  hwloc_topology_init(&this->topology);
+  hwloc_topology_init(&Topology);
 
   // do not filter icaches
-  hwloc_topology_set_cache_types_filter(this->topology,
-                                        HWLOC_TYPE_FILTER_KEEP_ALL);
+  hwloc_topology_set_cache_types_filter(Topology, HWLOC_TYPE_FILTER_KEEP_ALL);
 
-  hwloc_topology_load(this->topology);
+  hwloc_topology_load(Topology);
 
   // check for hybrid processor
-  int nr_cpukinds = hwloc_cpukinds_get_nr(this->topology, 0);
+  const auto NrCpukinds = hwloc_cpukinds_get_nr(Topology, 0);
 
-  switch (nr_cpukinds) {
+  switch (NrCpukinds) {
   case -1:
     log::warn() << "Hybrid core check failed";
     break;
@@ -147,292 +137,273 @@ CPUTopology::CPUTopology(std::string architecture)
     log::warn() << "Hybrid core check read no information";
     break;
   default:
-    log::trace() << "Number of CPU kinds:" << nr_cpukinds;
+    log::trace() << "Number of CPU kinds:" << NrCpukinds;
   }
-  if (nr_cpukinds > 1) {
+  if (NrCpukinds > 1) {
     log::warn() << "FIRESTARTER detected a hybrid CPU set-up";
   }
   // get number of packages
-  int depth = hwloc_get_type_depth(this->topology, HWLOC_OBJ_PACKAGE);
+  int Depth = hwloc_get_type_depth(Topology, HWLOC_OBJ_PACKAGE);
 
-  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN) {
-    this->_numPackages = 1;
+  if (Depth == HWLOC_TYPE_DEPTH_UNKNOWN) {
+    NumPackages = 1;
     log::warn() << "Could not get number of packages";
   } else {
-    this->_numPackages = hwloc_get_nbobjs_by_depth(this->topology, depth);
+    NumPackages = hwloc_get_nbobjs_by_depth(Topology, Depth);
   }
 
-    log::trace() << "Number of Packages:" << this->_numPackages;
+  log::trace() << "Number of Packages:" << NumPackages;
   // get number of cores per package
-  depth = hwloc_get_type_depth(this->topology, HWLOC_OBJ_CORE);
+  Depth = hwloc_get_type_depth(Topology, HWLOC_OBJ_CORE);
 
-  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN) {
-    this->_numCoresTotal = 1;
+  if (Depth == HWLOC_TYPE_DEPTH_UNKNOWN) {
+    NumCoresTotal = 1;
     log::warn() << "Could not get number of cores";
   } else {
-    this->_numCoresTotal =
-        hwloc_get_nbobjs_by_depth(this->topology, depth);
-    if ( this->_numCoresTotal == 0 ) {
+    NumCoresTotal = hwloc_get_nbobjs_by_depth(Topology, Depth);
+    if (NumCoresTotal == 0) {
       log::warn() << "Could not get number of cores";
-      this->_numCoresTotal = 1;
+      NumCoresTotal = 1;
     }
   }
-  log::trace() << "Number of Cores:" << this->_numCoresTotal;
+  log::trace() << "Number of Cores:" << NumCoresTotal;
 
   // get number of threads per core
-  depth = hwloc_get_type_depth(this->topology, HWLOC_OBJ_PU);
+  Depth = hwloc_get_type_depth(Topology, HWLOC_OBJ_PU);
 
-  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN) {
-    this->_numThreadsPerCore = 1;
+  if (Depth == HWLOC_TYPE_DEPTH_UNKNOWN) {
+    NumThreadsPerCore = 1;
     log::warn() << "Could not get number of threads";
   } else {
-    this->_numThreadsPerCore =
-        hwloc_get_nbobjs_by_depth(this->topology, depth) /
-        this->_numCoresTotal ;
-    if ( this->_numThreadsPerCore == 0 ) {
+    NumThreadsPerCore = hwloc_get_nbobjs_by_depth(Topology, Depth) / NumCoresTotal;
+    if (NumThreadsPerCore == 0) {
       log::warn() << "Could not get number of threads per core";
-      this->_numThreadsPerCore = 1;
+      NumThreadsPerCore = 1;
     }
   }
 
   // get vendor, processor name and clockrate for linux
 #if defined(linux) || defined(__linux__)
-  auto procCpuinfo = this->getFileAsStream("/proc/cpuinfo");
-  std::string line;
-  std::string clockrate = "0";
-
-  while (std::getline(procCpuinfo, line, '\n')) {
-    const std::regex vendorIdRe("^vendor_id.*:\\s*(.*)\\s*$");
-    const std::regex modelNameRe("^model name.*:\\s*(.*)\\s*$");
-    const std::regex cpuMHzRe("^cpu MHz.*:\\s*(.*)\\s*$");
-    std::smatch vendorIdM;
-    std::smatch modelNameM;
-    std::smatch cpuMHzM;
-
-    if (std::regex_match(line, vendorIdM, vendorIdRe)) {
-      this->_vendor = vendorIdM[1].str();
+  {
+    auto ProcCpuinfo = getFileAsStream("/proc/cpuinfo");
+    std::string Line;
+    std::string ClockrateStr = "0";
+
+    while (std::getline(ProcCpuinfo, Line, '\n')) {
+      const std::regex VendorIdRe("^vendor_id.*:\\s*(.*)\\s*$");
+      const std::regex ModelNameRe("^model name.*:\\s*(.*)\\s*$");
+      const std::regex CpuMHzRe("^cpu MHz.*:\\s*(.*)\\s*$");
+      std::smatch VendorIdMatch;
+      std::smatch ModelNameMatch;
+      std::smatch CpuMHzMatch;
+
+      if (std::regex_match(Line, VendorIdMatch, VendorIdRe)) {
+        Vendor = VendorIdMatch[1].str();
+      }
+
+      if (std::regex_match(Line, ModelNameMatch, ModelNameRe)) {
+        ProcessorName = ModelNameMatch[1].str();
+      }
+
+      if (std::regex_match(Line, CpuMHzMatch, CpuMHzRe)) {
+        ClockrateStr = CpuMHzMatch[1].str();
+      }
     }
 
-    if (std::regex_match(line, modelNameM, modelNameRe)) {
-      this->_processorName = modelNameM[1].str();
+    if (Vendor.empty()) {
+      log::warn() << "Could not determine vendor from /proc/cpuinfo";
     }
 
-    if (std::regex_match(line, cpuMHzM, cpuMHzRe)) {
-      clockrate = cpuMHzM[1].str();
+    if (ProcessorName.empty()) {
+      log::warn() << "Could not determine processor-name from /proc/cpuinfo";
     }
-  }
 
-  if (this->_vendor == "") {
-    log::warn() << "Could determine vendor from /proc/cpuinfo";
-  }
+    if (ClockrateStr == "0") {
+      firestarter::log::warn() << "Can't determine clockrate from /proc/cpuinfo";
+    } else {
+      firestarter::log::trace() << "Clockrate from /proc/cpuinfo is " << ClockrateStr;
+      Clockrate = static_cast<uint64_t>(1000000U) * std::stoi(ClockrateStr);
+    }
 
-  if (this->_processorName == "") {
-    log::warn() << "Could determine processor-name from /proc/cpuinfo";
-  }
+    auto Governor = scalingGovernor();
+    if (!Governor.empty()) {
 
-  if (clockrate == "0") {
-    firestarter::log::warn() << "Can't determine clockrate from /proc/cpuinfo";
-  } else {
-    firestarter::log::trace()
-        << "Clockrate from /proc/cpuinfo is " << clockrate;
-    this->_clockrate = 1e6 * std::stoi(clockrate);
-  }
+      auto ScalingCurFreq = getFileAsStream("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq").str();
+      auto CpuinfoCurFreq = getFileAsStream("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq").str();
+      auto ScalingMaxFreq = getFileAsStream("/sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq").str();
+      auto CpuinfoMaxFreq = getFileAsStream("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq").str();
 
-  auto governor = this->scalingGovernor();
-  if (!governor.empty()) {
-
-    auto scalingCurFreq =
-        this->getFileAsStream(
-                "/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq")
-            .str();
-    auto cpuinfoCurFreq =
-        this->getFileAsStream(
-                "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq")
-            .str();
-    auto scalingMaxFreq =
-        this->getFileAsStream(
-                "/sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq")
-            .str();
-    auto cpuinfoMaxFreq =
-        this->getFileAsStream(
-                "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq")
-            .str();
-
-    if (governor.compare("performance") || governor.compare("powersave")) {
-      if (scalingCurFreq.empty()) {
-        if (!cpuinfoCurFreq.empty()) {
-          clockrate = cpuinfoCurFreq;
+      if (Governor == "performance" || Governor == "powersave") {
+        if (ScalingCurFreq.empty()) {
+          if (!CpuinfoCurFreq.empty()) {
+            ClockrateStr = CpuinfoCurFreq;
+          }
+        } else {
+          ClockrateStr = ScalingCurFreq;
         }
       } else {
-        clockrate = scalingCurFreq;
-      }
-    } else {
-      if (scalingMaxFreq.empty()) {
-        if (!cpuinfoMaxFreq.empty()) {
-          clockrate = cpuinfoMaxFreq;
+        if (ScalingMaxFreq.empty()) {
+          if (!CpuinfoMaxFreq.empty()) {
+            ClockrateStr = CpuinfoMaxFreq;
+          }
+        } else {
+          ClockrateStr = ScalingMaxFreq;
         }
-      } else {
-        clockrate = scalingMaxFreq;
       }
-    }
 
-    this->_clockrate = 1e3 * std::stoi(clockrate);
+      Clockrate = static_cast<uint64_t>(1000U) * std::stoi(ClockrateStr);
+    }
   }
 #endif
 
   // try to detect processor name for macos
 #ifdef __APPLE__
-  // use sysctl to detect the name
-  std::array<char, 128> buffer;
-  auto cmd = "sysctl -n machdep.cpu.brand_string";
-  std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd, "r"), pclose);
-  if (!pipe) {
-    log::warn() << "Could not determine processor-name";
-  }
-  if (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) {
-    auto str = std::string(buffer.data());
-    str.erase(std::remove(str.begin(), str.end(), '\n'), str.end());
-    this->_processorName = str;
+  {
+    // use sysctl to detect the name
+    std::array<char, 128> Buffer{};
+    const auto* Cmd = "sysctl -n machdep.cpu.brand_string";
+    std::unique_ptr<FILE, decltype(&pclose)> Pipe(popen(Cmd, "r"), pclose);
+    if (!Pipe) {
+      log::warn() << "Could not determine processor-name";
+    }
+    if (fgets(Buffer.data(), Buffer.size(), Pipe.get()) != nullptr) {
+      auto Str = std::string(Buffer.data());
+      Str.erase(std::remove(Str.begin(), Str.end(), '\n'), Str.end());
+      ProcessorName = Str;
+    }
   }
 #endif
 
 // try to detect processor name for windows
 #ifdef _WIN32
-  // use wmic
-  std::array<char, 128> buffer;
-  auto cmd = "wmic cpu get name";
-  std::unique_ptr<FILE, decltype(&_pclose)> pipe(_popen(cmd, "r"), _pclose);
-  if (!pipe) {
-    log::warn() << "Could not determine processor-name";
-  }
-  auto line = 0;
-  while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) {
-    if (line != 1) {
-      line++;
-      continue;
+  {
+    // use wmic
+    std::array<char, 128> Buffer{};
+    const auto* Cmd = "wmic cpu get name";
+    std::unique_ptr<FILE, decltype(&_pclose)> Pipe(_popen(Cmd, "r"), _pclose);
+    if (!Pipe) {
+      log::warn() << "Could not determine processor-name";
     }
+    auto Line = 0;
+    while (fgets(Buffer.data(), Buffer.size(), Pipe.get()) != nullptr) {
+      if (Line != 1) {
+        Line++;
+        continue;
+      }
 
-    auto str = std::string(buffer.data());
-    str.erase(std::remove(str.begin(), str.end(), '\n'), str.end());
-    this->_processorName = str;
+      auto Str = std::string(Buffer.data());
+      Str.erase(std::remove(Str.begin(), Str.end(), '\n'), Str.end());
+      ProcessorName = Str;
+    }
   }
 #endif
 
   // get L1i-Cache size
-  int width = hwloc_get_nbobjs_by_type(this->topology, HWLOC_OBJ_L1ICACHE);
+  const auto Width = hwloc_get_nbobjs_by_type(Topology, HWLOC_OBJ_L1ICACHE);
 
-  if (width >= 1) {
-    hwloc_obj_t cacheObj =
-        hwloc_get_obj_by_type(this->topology, HWLOC_OBJ_L1ICACHE, 0);
-    this->_instructionCacheSize = cacheObj->attr->cache.size;
+  if (Width >= 1) {
+    hwloc_obj_t CacheObj = hwloc_get_obj_by_type(Topology, HWLOC_OBJ_L1ICACHE, 0);
+    InstructionCacheSize = CacheObj->attr->cache.size;
   }
 }
 
-CPUTopology::~CPUTopology() { hwloc_topology_destroy(this->topology); }
+CPUTopology::~CPUTopology() { hwloc_topology_destroy(Topology); }
 
-std::stringstream CPUTopology::getFileAsStream(std::string const &filePath) {
-  std::ifstream file(filePath);
-  std::stringstream ss;
+auto CPUTopology::getFileAsStream(std::string const& FilePath) -> std::stringstream {
+  std::ifstream File(FilePath);
+  std::stringstream Ss;
 
-  if (!file.is_open()) {
-    log::trace() << "Could not open " << filePath;
+  if (!File.is_open()) {
+    log::trace() << "Could not open " << FilePath;
   } else {
-    ss << file.rdbuf();
-    file.close();
+    Ss << File.rdbuf();
+    File.close();
   }
 
-  return ss;
+  return Ss;
 }
 
-std::string CPUTopology::scalingGovernor() const {
-  return this
-      ->getFileAsStream("/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor")
-      .str();
+auto CPUTopology::scalingGovernor() -> std::string {
+  return getFileAsStream("/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor").str();
 }
 
-int CPUTopology::getCoreIdFromPU(unsigned pu) const {
-  int width;
-  hwloc_obj_t obj;
+auto CPUTopology::getCoreIdFromPU(unsigned Pu) const -> std::optional<unsigned> {
+  auto Width = hwloc_get_nbobjs_by_type(Topology, HWLOC_OBJ_PU);
 
-  width = hwloc_get_nbobjs_by_type(this->topology, HWLOC_OBJ_PU);
-
-  if (width >= 1) {
-    for (int i = 0; i < width; i++) {
-      obj = hwloc_get_obj_by_type(this->topology, HWLOC_OBJ_PU, i);
-      if (obj->os_index == pu) {
-        for (; obj; obj = obj->parent) {
-          if (obj->type == HWLOC_OBJ_CORE) {
-            return obj->logical_index;
+  if (Width >= 1) {
+    for (int I = 0; I < Width; I++) {
+      auto* Obj = hwloc_get_obj_by_type(Topology, HWLOC_OBJ_PU, I);
+      if (Obj->os_index == Pu) {
+        for (; Obj; Obj = Obj->parent) {
+          if (Obj->type == HWLOC_OBJ_CORE) {
+            return Obj->logical_index;
           }
         }
       }
     }
   }
 
-  return -1;
+  return {};
 }
 
-int CPUTopology::getPkgIdFromPU(unsigned pu) const {
-  int width;
-  hwloc_obj_t obj;
-
-  width = hwloc_get_nbobjs_by_type(this->topology, HWLOC_OBJ_PU);
+auto CPUTopology::getPkgIdFromPU(unsigned Pu) const -> std::optional<unsigned> {
+  auto Width = hwloc_get_nbobjs_by_type(Topology, HWLOC_OBJ_PU);
 
-  if (width >= 1) {
-    for (int i = 0; i < width; i++) {
-      obj = hwloc_get_obj_by_type(this->topology, HWLOC_OBJ_PU, i);
-      if (obj->os_index == pu) {
-        for (; obj; obj = obj->parent) {
-          if (obj->type == HWLOC_OBJ_PACKAGE) {
-            return obj->logical_index;
+  if (Width >= 1) {
+    for (int I = 0; I < Width; I++) {
+      auto* Obj = hwloc_get_obj_by_type(Topology, HWLOC_OBJ_PU, I);
+      if (Obj->os_index == Pu) {
+        for (; Obj; Obj = Obj->parent) {
+          if (Obj->type == HWLOC_OBJ_PACKAGE) {
+            return Obj->logical_index;
           }
         }
       }
     }
   }
 
-  return -1;
+  return {};
 }
 
-unsigned CPUTopology::maxNumThreads() const {
-  unsigned max = 0;
+auto CPUTopology::maxNumThreads() const -> unsigned {
+  unsigned Max = 0;
 
   // There might be more then one kind of cores
-  int nr_cpukinds = hwloc_cpukinds_get_nr(this->topology, 0);
+  const auto NrCpukinds = hwloc_cpukinds_get_nr(Topology, 0);
 
   // fallback in case this did not work ... can happen on some platforms
   // already printed a warning earlier
-  if (nr_cpukinds < 1) {
-    hwloc_obj_t obj;
-    int width = hwloc_get_nbobjs_by_type(this->topology, HWLOC_OBJ_PU);
-    unsigned max = 0;
-
-    for (int i = 0; i < width; i++) {
-      obj = hwloc_get_obj_by_type(this->topology, HWLOC_OBJ_PU, i);
-      max = max < obj->os_index ? obj->os_index : max;
+  if (NrCpukinds < 1) {
+    auto Width = hwloc_get_nbobjs_by_type(Topology, HWLOC_OBJ_PU);
+    unsigned Max = 0;
+
+    for (int I = 0; I < Width; I++) {
+      auto* Obj = hwloc_get_obj_by_type(Topology, HWLOC_OBJ_PU, I);
+      Max = (std::max)(Max, Obj->os_index);
     }
 
-    return max + 1;
+    return Max + 1;
   }
 
   // Allocate bitmap to get CPUs later
-  hwloc_bitmap_t bitmap = hwloc_bitmap_alloc();
-  if (bitmap == NULL) {
+  hwloc_bitmap_t Bitmap = hwloc_bitmap_alloc();
+  if (Bitmap == nullptr) {
     log::error() << "Could not allocate memory for CPU bitmap";
     return 1;
   }
 
   // Find CPUs per kind
-  for (int kind_index = 0; kind_index < nr_cpukinds; kind_index++) {
-    int result = hwloc_cpukinds_get_info(this->topology, kind_index, bitmap,
-                                         NULL, NULL, NULL, 0);
-    if (result) {
-      log::warn() << "Could not get information for CPU kind " << kind_index;
+  for (int KindIndex = 0; KindIndex < NrCpukinds; KindIndex++) {
+    const auto Result = hwloc_cpukinds_get_info(Topology, KindIndex, Bitmap, nullptr, nullptr, nullptr, 0);
+    if (Result) {
+      log::warn() << "Could not get information for CPU kind " << KindIndex;
     }
-    max += hwloc_bitmap_weight(bitmap);
+    Max += hwloc_bitmap_weight(Bitmap);
   }
 
-  hwloc_bitmap_free(bitmap);
+  hwloc_bitmap_free(Bitmap);
 
-  return max;
+  return Max;
 }
+
+}; // namespace firestarter::environment
\ No newline at end of file
diff --git a/src/firestarter/Environment/Environment.cpp b/src/firestarter/Environment/Environment.cpp
index d827ee83..9d3f81c7 100644
--- a/src/firestarter/Environment/Environment.cpp
+++ b/src/firestarter/Environment/Environment.cpp
@@ -19,232 +19,204 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
-#include <firestarter/Environment/Environment.hpp>
-#include <firestarter/Logging/Log.hpp>
+#include "firestarter/Environment/Environment.hpp"
+#include "firestarter/Logging/Log.hpp"
 
-#include <iterator>
 #include <regex>
+#include <stdexcept>
 #include <string>
 
-using namespace firestarter::environment;
+namespace firestarter::environment {
 
-#if (defined(linux) || defined(__linux__)) &&                                  \
-    defined(FIRESTARTER_THREAD_AFFINITY)
+#if (defined(linux) || defined(__linux__)) && defined(FIRESTARTER_THREAD_AFFINITY)
 
 extern "C" {
 #include <sched.h>
 }
 
-// this code is from the C version of FIRESTARTER
-// TODO: replace this with cpu affinity of hwloc
-#define ADD_CPU_SET(cpu, cpuset)                                               \
-  do {                                                                         \
-    if (this->cpuAllowed(cpu)) {                                               \
-      CPU_SET(cpu, &cpuset);                                                   \
-    } else {                                                                   \
-      if (cpu >= this->topology().numThreads()) {                              \
-        log::error() << "The given bind argument (-b/--bind) includes CPU "    \
-                     << cpu << " that is not available on this system.";       \
-      } else {                                                                 \
-        log::error() << "The given bind argument (-b/--bind) cannot "          \
-                        "be implemented with the cpuset given from the OS\n"   \
-                     << "This can be caused by the taskset tool, cgroups, "    \
-                        "the batch system, or similar mechanisms.\n"           \
-                     << "Please fix the argument to match the restrictions.";  \
-      }                                                                        \
-      return EACCES;                                                           \
-    }                                                                          \
-  } while (0)
-
-int Environment::cpuSet(unsigned id) {
-  cpu_set_t mask;
-
-  CPU_ZERO(&mask);
-  CPU_SET(id, &mask);
-
-  return sched_setaffinity(0, sizeof(cpu_set_t), &mask);
+auto Environment::cpuSet(unsigned Id) -> int {
+  cpu_set_t Mask;
+  // Build an affinity mask that contains the CPU with the given id and nothing else.
+  CPU_ZERO(&Mask);
+  CPU_SET(Id, &Mask);
+  // A pid of 0 addresses the calling thread; the raw sched_setaffinity result (0 on success) is returned.
+  return sched_setaffinity(0, sizeof(cpu_set_t), &Mask);
 }
 
-int Environment::cpuAllowed(unsigned id) {
-  cpu_set_t mask;
+auto Environment::cpuAllowed(unsigned Id) -> bool { // true if Id is in the calling thread's affinity mask
+  cpu_set_t Mask;
 
-  CPU_ZERO(&mask);
+  CPU_ZERO(&Mask);
 
-  if (!sched_getaffinity(0, sizeof(cpu_set_t), &mask)) {
-    return CPU_ISSET(id, &mask);
+  if (!sched_getaffinity(0, sizeof(cpu_set_t), &Mask)) { // a result of 0 means the mask was read
+    return CPU_ISSET(Id, &Mask);
   }
 
-  return 0;
+  return false; // the affinity mask could not be queried, so the CPU is reported as not allowed
 }
-#endif
 
-int Environment::evaluateCpuAffinity(unsigned requestedNumThreads,
-                                     std::string cpuBind) {
-#if not((defined(linux) || defined(__linux__)) &&                              \
-        defined(FIRESTARTER_THREAD_AFFINITY))
-  (void)cpuBind;
+void Environment::addCpuSet(unsigned Cpu, cpu_set_t& Mask) const {
+  // Adds Cpu to Mask when cpuAllowed() accepts it; any other id is rejected with std::invalid_argument.
+  if (!cpuAllowed(Cpu)) {
+    if (Cpu >= topology().numThreads()) {
+      throw std::invalid_argument("The given bind argument (-b/--bind) includes CPU " + std::to_string(Cpu) +
+                                  " that is not available on this system.");
+    }
+    throw std::invalid_argument("The given bind argument (-b/--bind) cannot "
+                                "be implemented with the cpuset given from the OS\n"
+                                "This can be caused by the taskset tool, cgroups, "
+                                "the batch system, or similar mechanisms.\n"
+                                "Please fix the argument to match the restrictions.");
+  }
+  CPU_SET(Cpu, &Mask);
+}
 #endif
 
-  if (requestedNumThreads > 0 &&
-      requestedNumThreads > this->topology().numThreads()) {
+void Environment::evaluateCpuAffinity(unsigned RequestedNumThreads, const std::string& CpuBind) {
+  if (RequestedNumThreads > 0 && RequestedNumThreads > topology().numThreads()) { // warns only
     log::warn() << "Not enough CPUs for requested number of threads";
   }
 
-#if (defined(linux) || defined(__linux__)) &&                                  \
-    defined(FIRESTARTER_THREAD_AFFINITY)
-  cpu_set_t cpuset;
+#if (defined(linux) || defined(__linux__)) && defined(FIRESTARTER_THREAD_AFFINITY)
+  cpu_set_t Cpuset; // CPUs selected for the threads; copied into this->CpuBind at the end
 
-  CPU_ZERO(&cpuset);
+  CPU_ZERO(&Cpuset);
 
-  if (cpuBind.empty()) {
+  if (CpuBind.empty()) {
     // no cpu binding defined
 
     // use all CPUs if not defined otherwise
-    if (requestedNumThreads == 0) {
-      for (unsigned i = 0; i < this->topology().maxNumThreads(); i++) {
-        if (this->cpuAllowed(i)) {
-          CPU_SET(i, &cpuset);
-          requestedNumThreads++;
+    if (RequestedNumThreads == 0) {
+      for (unsigned I = 0; I < topology().maxNumThreads(); I++) {
+        if (cpuAllowed(I)) {
+          CPU_SET(I, &Cpuset);
+          RequestedNumThreads++;
         }
       }
     } else {
       // if -n / --threads is set
-      unsigned cpu_count = 0;
-      for (unsigned i = 0; i < this->topology().maxNumThreads(); i++) {
+      unsigned CpuCount = 0;
+      for (unsigned I = 0; I < topology().maxNumThreads(); I++) {
         // skip if cpu is not available
-        if (!this->cpuAllowed(i)) {
+        if (!cpuAllowed(I)) {
           continue;
         }
-        ADD_CPU_SET(i, cpuset);
-        cpu_count++;
+        addCpuSet(I, Cpuset);
+        CpuCount++;
         // we reached the desired amounts of threads
-        if (cpu_count >= requestedNumThreads) {
+        if (CpuCount >= RequestedNumThreads) {
           break;
         }
       }
       // requested to many threads
-      if (cpu_count < requestedNumThreads) {
-        log::error() << "You are requesting more threads than "
-                        "there are CPUs available in the given cpuset.\n"
-                     << "This can be caused by the taskset tool, cgrous, "
-                        "the batch system, or similar mechanisms.\n"
-                     << "Please fix the -n/--threads argument to match the "
-                        "restrictions.";
-        return EACCES;
+      if (CpuCount < RequestedNumThreads) {
+        throw std::invalid_argument("You are requesting more threads than "
+                                    "there are CPUs available in the given cpuset.\n"
+                                    "This can be caused by the taskset tool, cgroups, "
+                                    "the batch system, or similar mechanisms.\n"
+                                    "Please fix the -n/--threads argument to match the "
+                                    "restrictions.");
       }
     }
   } else {
+    RequestedNumThreads = 0;
+    // An explicit CPU list overrides any requested thread count: one thread is counted per CPU added below.
     // parse CPULIST for binding
-    const std::string delimiter = ",";
-    const std::regex re("^(?:(\\d+)(?:-([1-9]\\d*)(?:\\/([1-9]\\d*))?)?)$");
+    const auto Delimiter = ',';
+    const std::regex Re(R"(^(?:(\d+)(?:-([1-9]\d*)(?:\/([1-9]\d*))?)?)$)"); // x, x-y or x-y/s
 
-    std::stringstream ss(cpuBind);
+    std::stringstream Ss(CpuBind);
 
-    while (ss.good()) {
-      std::string token;
-      std::smatch m;
-      std::getline(ss, token, ',');
-      ;
+    while (Ss.good()) {
+      std::string Token;
+      std::smatch M;
+      std::getline(Ss, Token, Delimiter);
 
-      if (std::regex_match(token, m, re)) {
-        unsigned long x, y, s;
+      if (std::regex_match(Token, M, Re)) {
+        uint64_t Y = 0; // last CPU of the range; equals X when no "-y" part is given
+        uint64_t S = 0; // stride between CPUs; 1 when no "/s" part is given
 
-        x = std::stoul(m[1].str());
-        if (m[2].matched) {
-          y = std::stoul(m[2].str());
+        auto X = std::stoul(M[1].str());
+        if (M[2].matched) {
+          Y = std::stoul(M[2].str());
         } else {
-          y = x;
+          Y = X;
         }
-        if (m[3].matched) {
-          s = std::stoul(m[3].str());
+        if (M[3].matched) {
+          S = std::stoul(M[3].str());
         } else {
-          s = 1;
+          S = 1;
         }
-        if (y < x) {
-          log::error() << "y has to be >= x in x-y expressions of CPU list: "
-                       << token;
-          return EXIT_FAILURE;
+        if (Y < X) {
+          throw std::invalid_argument("y has to be >= x in x-y expressions of CPU list: " + Token);
         }
-        for (unsigned long i = x; i <= y; i += s) {
-          ADD_CPU_SET(i, cpuset);
-          requestedNumThreads++;
+        for (auto I = X; I <= Y; I += S) {
+          RequestedNumThreads += (CPU_ISSET(I, &Cpuset) ? 0U : 1U); // a CPU listed twice gets only one thread
+          addCpuSet(I, Cpuset);
         }
       } else {
-        log::error() << "Invalid symbols in CPU list: " << token;
-        return EXIT_FAILURE;
+        throw std::invalid_argument("Invalid symbols in CPU list: " + Token);
       }
     }
   }
-#else
-  if (requestedNumThreads == 0) {
-    requestedNumThreads = this->topology().maxNumThreads();
-  }
-#endif
 
-  if (requestedNumThreads == 0) {
-    log::error() << "Found no usable CPUs!";
-    return 127;
+  if (RequestedNumThreads == 0) {
+    throw std::invalid_argument("Found no usable CPUs!");
   }
-#if (defined(linux) || defined(__linux__)) &&                                  \
-    defined(FIRESTARTER_THREAD_AFFINITY)
-  else {
-    for (unsigned i = 0; i < this->topology().maxNumThreads(); i++) {
-      if (CPU_ISSET(i, &cpuset)) {
-        this->cpuBind.push_back(i);
-      }
+
+  // Save the ids of the threads.
+  for (unsigned I = 0; I < topology().maxNumThreads(); I++) {
+    if (CPU_ISSET(I, &Cpuset)) {
+      this->CpuBind.push_back(I);
     }
   }
-#endif
+#else
+  (void)CpuBind;
 
-  if (requestedNumThreads > this->topology().maxNumThreads()) {
-    requestedNumThreads = this->topology().maxNumThreads();
+  if (RequestedNumThreads == 0) {
+    RequestedNumThreads = topology().maxNumThreads();
   }
+#endif
 
-  this->_requestedNumThreads = requestedNumThreads;
-
-  return EXIT_SUCCESS;
+  // Limit the number of threads to the maximum on the CPU.
+  this->RequestedNumThreads = (std::min)(RequestedNumThreads, topology().maxNumThreads());
 }
 
 void Environment::printThreadSummary() {
-  log::info() << "\n  using " << this->requestedNumThreads() << " threads";
-
-#if (defined(linux) || defined(__linux__)) &&                                  \
-    defined(FIRESTARTER_THREAD_AFFINITY)
-  bool printCoreIdInfo = false;
-  size_t i = 0;
-
-  std::vector<unsigned> cpuBind(this->cpuBind);
-  cpuBind.resize(this->requestedNumThreads());
-  for (auto const &bind : cpuBind) {
-    int coreId = this->topology().getCoreIdFromPU(bind);
-    int pkgId = this->topology().getPkgIdFromPU(bind);
-
-    if (coreId != -1 && pkgId != -1) {
-      log::info() << "    - Thread " << i << " run on CPU " << bind << ", core "
-                  << coreId << " in package: " << pkgId;
-      printCoreIdInfo = true;
+  log::info() << "\n  using " << requestedNumThreads() << " threads";
+  // The per-thread listing below is only compiled into Linux builds with thread affinity support.
+#if (defined(linux) || defined(__linux__)) && defined(FIRESTARTER_THREAD_AFFINITY)
+  bool PrintCoreIdInfo = false;
+  size_t I = 0;
+  // Work on a copy of the stored binding, resized to exactly one CPU id per requested thread.
+  std::vector<unsigned> CpuBind(this->CpuBind);
+  CpuBind.resize(requestedNumThreads());
+  for (auto const& Bind : CpuBind) {
+    const auto CoreId = topology().getCoreIdFromPU(Bind);
+    const auto PkgId = topology().getPkgIdFromPU(Bind);
+    // Only threads for which both ids are available get a line in the summary.
+    if (CoreId && PkgId) {
+      log::info() << "    - Thread " << I << " run on CPU " << Bind << ", core " << *CoreId
+                  << " in package: " << *PkgId;
+      PrintCoreIdInfo = true;
     }
 
-    i++;
+    I++;
   }
 
-  if (printCoreIdInfo) {
-    log::info()
-        << "  The cores are numbered using the logical_index from hwloc.";
+  if (PrintCoreIdInfo) {
+    log::info() << "  The cores are numbered using the logical_index from hwloc.";
   }
 #endif
 }
 
-int Environment::setCpuAffinity(unsigned thread) {
-  if (thread >= this->requestedNumThreads()) {
-    log::error() << "Trying to set more CPUs than available.";
-    return EXIT_FAILURE;
+void Environment::setCpuAffinity(unsigned Thread) const { // binds the caller to CpuBind[Thread]
+  if (Thread >= requestedNumThreads()) {
+    throw std::invalid_argument("Trying to set more CPUs than available.");
   }
 
-#if (defined(linux) || defined(__linux__)) &&                                  \
-    defined(FIRESTARTER_THREAD_AFFINITY)
-  this->cpuSet(this->cpuBind.at(thread));
+#if (defined(linux) || defined(__linux__)) && defined(FIRESTARTER_THREAD_AFFINITY)
+  cpuSet(CpuBind.at(Thread)); // NOTE(review): the result of cpuSet() is discarded - confirm this is intended
 #endif
-
-  return EXIT_SUCCESS;
 }
+}; // namespace firestarter::environment
\ No newline at end of file
diff --git a/src/firestarter/Environment/Payload/CompiledPayload.cpp b/src/firestarter/Environment/Payload/CompiledPayload.cpp
new file mode 100644
index 00000000..33183d7a
--- /dev/null
+++ b/src/firestarter/Environment/Payload/CompiledPayload.cpp
@@ -0,0 +1,33 @@
+/******************************************************************************
+ * FIRESTARTER - A Processor Stress Test Utility
+ * Copyright (C) 2020 TU Dresden, Center for Information Services and High
+ * Performance Computing
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/\>.
+ *
+ * Contact: daniel.hackenberg@tu-dresden.de
+ *****************************************************************************/
+
+#include "firestarter/Environment/Payload/CompiledPayload.hpp"
+#include "firestarter/Environment/Payload/Payload.hpp"
+
+namespace firestarter::environment::payload {
+// Forwards the call unchanged to init() of the payload object held in PayloadPtr.
+void CompiledPayload::init(double* MemoryAddr, uint64_t BufferSize) { PayloadPtr->init(MemoryAddr, BufferSize); }
+// Forwards the call unchanged to lowLoadFunction() of the payload object held in PayloadPtr.
+void CompiledPayload::lowLoadFunction(volatile LoadThreadWorkType& LoadVar, std::chrono::microseconds Period) {
+  PayloadPtr->lowLoadFunction(LoadVar, Period);
+}
+
+}; // namespace firestarter::environment::payload
\ No newline at end of file
diff --git a/src/firestarter/Environment/Payload/Payload.cpp b/src/firestarter/Environment/Payload/Payload.cpp
deleted file mode 100644
index 68cfc547..00000000
--- a/src/firestarter/Environment/Payload/Payload.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-/******************************************************************************
- * FIRESTARTER - A Processor Stress Test Utility
- * Copyright (C) 2020 TU Dresden, Center for Information Services and High
- * Performance Computing
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/\>.
- *
- * Contact: daniel.hackenberg@tu-dresden.de
- *****************************************************************************/
-
-#include <algorithm>
-#include <cmath>
-
-#include <firestarter/Environment/Payload/Payload.hpp>
-
-using namespace firestarter::environment::payload;
-
-unsigned
-Payload::getSequenceStartCount(const std::vector<std::string> &sequence,
-                               const std::string start) {
-  unsigned i = 0;
-
-  for (const auto &item : sequence) {
-    if (0 == item.rfind(start, 0)) {
-      i++;
-    }
-  }
-
-  return i;
-}
-
-std::vector<std::string> Payload::generateSequence(
-    std::vector<std::pair<std::string, unsigned>> const &proportions) {
-  std::vector<std::pair<std::string, unsigned>> prop = proportions;
-
-  prop.erase(std::remove_if(prop.begin(), prop.end(),
-                            [](auto const &pair) { return pair.second == 0; }),
-             prop.end());
-
-  std::vector<std::string> sequence = {};
-
-  if (prop.size() == 0) {
-    return sequence;
-  }
-
-  auto it = prop.begin();
-  auto insertIt = sequence.begin();
-
-  sequence.insert(insertIt, it->second, it->first);
-
-  for (++it; it != prop.end(); ++it) {
-    for (unsigned i = 0; i < it->second; i++) {
-      insertIt = sequence.begin();
-      std::advance(insertIt, 1 + floor(i * (sequence.size() + it->second - i) /
-                                       (float)it->second));
-      sequence.insert(insertIt, it->first);
-    }
-  }
-
-  return sequence;
-}
-
-unsigned Payload::getL2LoopCount(const std::vector<std::string> &sequence,
-                                 const unsigned numberOfLines,
-                                 const unsigned size, const unsigned threads) {
-  if (this->getL2SequenceCount(sequence) == 0) {
-    return 0;
-  }
-  return (0.8 * size / 64 / threads /
-          (this->getL2SequenceCount(sequence) *
-           this->getNumberOfSequenceRepetitions(sequence,
-                                                numberOfLines / threads)));
-}
-
-unsigned Payload::getL3LoopCount(const std::vector<std::string> &sequence,
-                                 const unsigned numberOfLines,
-                                 const unsigned size, const unsigned threads) {
-  if (this->getL3SequenceCount(sequence) == 0) {
-    return 0;
-  }
-  return (0.8 * size / 64 / threads /
-          (this->getL3SequenceCount(sequence) *
-           this->getNumberOfSequenceRepetitions(sequence,
-                                                numberOfLines / threads)));
-}
-
-unsigned Payload::getRAMLoopCount(const std::vector<std::string> &sequence,
-                                  const unsigned numberOfLines,
-                                  const unsigned size, const unsigned threads) {
-  if (this->getRAMSequenceCount(sequence) == 0) {
-    return 0;
-  }
-  return (1.0 * size / 64 / threads /
-          (this->getRAMSequenceCount(sequence) *
-           this->getNumberOfSequenceRepetitions(sequence,
-                                                numberOfLines / threads)));
-}
diff --git a/src/firestarter/Environment/Payload/PayloadSettings.cpp b/src/firestarter/Environment/Payload/PayloadSettings.cpp
new file mode 100644
index 00000000..25ca4ea4
--- /dev/null
+++ b/src/firestarter/Environment/Payload/PayloadSettings.cpp
@@ -0,0 +1,98 @@
+/******************************************************************************
+ * FIRESTARTER - A Processor Stress Test Utility
+ * Copyright (C) 2020 TU Dresden, Center for Information Services and High
+ * Performance Computing
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/\>.
+ *
+ * Contact: daniel.hackenberg@tu-dresden.de
+ *****************************************************************************/
+
+#include "firestarter/Environment/Payload/PayloadSettings.hpp"
+
+#include <algorithm>
+#include <cmath>
+
+namespace firestarter::environment::payload {
+
+auto PayloadSettings::getSequenceStartCount(const std::vector<std::string>& Sequence, const std::string& Start)
+    -> unsigned {
+  // Returns how many entries of Sequence begin with Start.
+  //
+  // rfind(Start, 0) may only match at index 0, so a result of 0 is a prefix test that never scans the rest
+  // of the string. An empty Start is a prefix of every entry and therefore counts all of them.
+  const auto HasPrefix = [&Start](const std::string& Entry) { return Entry.rfind(Start, 0) == 0; };
+
+  // count_if yields a signed difference_type; narrow it to the unsigned return type.
+  const auto Matches = std::count_if(Sequence.begin(), Sequence.end(), HasPrefix);
+  return static_cast<unsigned>(Matches);
+}
+
+auto PayloadSettings::generateSequence(std::vector<PayloadSettings::InstructionWithProportion> const& Proportions)
+    -> std::vector<std::string> {
+  // Builds the instruction group sequence: the first group with a non-zero proportion forms the base run and
+  // every later group is spread over the sequence built so far. Groups with a proportion of 0 are ignored;
+  // the result is empty when no group remains.
+  std::vector<std::pair<std::string, unsigned>> Groups;
+  for (auto const& Entry : Proportions) {
+    if (Entry.second != 0) {
+      Groups.push_back(Entry);
+    }
+  }
+
+  std::vector<std::string> Result;
+  if (Groups.empty()) {
+    return Result;
+  }
+  Result.assign(Groups.front().second, Groups.front().first);
+  for (auto Group = Groups.begin() + 1; Group != Groups.end(); ++Group) {
+    for (unsigned K = 0; K < Group->second; K++) {
+      // Result grows with every insertion, so the slot is derived from its current size.
+      auto Slot = Result.begin();
+      std::advance(Slot, 1 + std::floor(static_cast<float>(K * (Result.size() + Group->second - K)) /
+                                        static_cast<float>(Group->second)));
+      Result.insert(Slot, Group->first);
+    }
+  }
+  return Result;
+}
+
+auto PayloadSettings::getL2LoopCount(const std::vector<std::string>& Sequence, const unsigned NumberOfLines,
+                                     const unsigned Size) -> unsigned {
+  // 0 if getL2SequenceCount() is 0, else 0.8 * Size / 64 / (count * sequence repetitions), truncated to unsigned.
+  const auto Accesses = getL2SequenceCount(Sequence);
+  return Accesses == 0 ? 0U
+                       : static_cast<unsigned>(0.8 * Size / 64 /
+                                               (Accesses * getNumberOfSequenceRepetitions(Sequence, NumberOfLines)));
+}
+
+auto PayloadSettings::getL3LoopCount(const std::vector<std::string>& Sequence, const unsigned NumberOfLines,
+                                     const unsigned Size) -> unsigned {
+  // 0 if getL3SequenceCount() is 0, else 0.8 * Size / 64 / (count * sequence repetitions), truncated to unsigned.
+  const auto Accesses = getL3SequenceCount(Sequence);
+  return Accesses == 0 ? 0U
+                       : static_cast<unsigned>(0.8 * Size / 64 /
+                                               (Accesses * getNumberOfSequenceRepetitions(Sequence, NumberOfLines)));
+}
+
+auto PayloadSettings::getRAMLoopCount(const std::vector<std::string>& Sequence, const unsigned NumberOfLines,
+                                      const unsigned Size) -> unsigned {
+  // 0 if getRAMSequenceCount() is 0, else 1.0 * Size / 64 / (count * sequence repetitions), truncated to unsigned.
+  const auto Accesses = getRAMSequenceCount(Sequence);
+  return Accesses == 0 ? 0U
+                       : static_cast<unsigned>(1.0 * Size / 64 /
+                                               (Accesses * getNumberOfSequenceRepetitions(Sequence, NumberOfLines)));
+}
+
+}; // namespace firestarter::environment::payload
\ No newline at end of file
diff --git a/src/firestarter/Environment/X86/Payload/AVX512Payload.cpp b/src/firestarter/Environment/X86/Payload/AVX512Payload.cpp
index 9316ed39..f52a5410 100644
--- a/src/firestarter/Environment/X86/Payload/AVX512Payload.cpp
+++ b/src/firestarter/Environment/X86/Payload/AVX512Payload.cpp
@@ -19,432 +19,373 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
-#include <firestarter/Environment/X86/Payload/AVX512Payload.hpp>
-
-using namespace firestarter::environment::x86::payload;
-using namespace asmjit;
-using namespace asmjit::x86;
-
-int AVX512Payload::compilePayload(
-    std::vector<std::pair<std::string, unsigned>> const &proportion,
-    unsigned instructionCacheSize,
-    std::list<unsigned> const &dataCacheBufferSize, unsigned ramBufferSize,
-    unsigned thread, unsigned numberOfLines, bool dumpRegisters,
-    bool errorDetection) {
+#include "firestarter/Environment/X86/Payload/AVX512Payload.hpp"
+#include "firestarter/Environment/X86/Payload/CompiledX86Payload.hpp"
+
+namespace firestarter::environment::x86::payload {
+
+auto AVX512Payload::compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters,
+                                   bool ErrorDetection) const -> environment::payload::CompiledPayload::UniquePtr {
+  using Imm = asmjit::Imm;
+  using Zmm = asmjit::x86::Zmm;
+  // NOLINTBEGIN(readability-identifier-naming)
+  constexpr asmjit::x86::Mem (*zmmword_ptr)(const asmjit::x86::Gp&, int32_t) = asmjit::x86::zmmword_ptr;
+  constexpr auto zmm0 = asmjit::x86::zmm0;
+  constexpr auto zmm1 = asmjit::x86::zmm1;
+  constexpr auto zmm2 = asmjit::x86::zmm2;
+  // NOLINTEND(readability-identifier-naming)
 
   // Compute the sequence of instruction groups and the number of its repetions
   // to reach the desired size
-  auto sequence = this->generateSequence(proportion);
-  auto repetitions =
-      this->getNumberOfSequenceRepetitions(sequence, numberOfLines / thread);
+  auto Sequence = Settings.sequence();
+  auto Repetitions =
+      environment::payload::PayloadSettings::getNumberOfSequenceRepetitions(Sequence, Settings.linesPerThread());
 
   // compute count of flops and memory access for performance report
-  unsigned flops = 0;
-  unsigned bytes = 0;
+  environment::payload::PayloadStats Stats;
 
-  for (const auto &item : sequence) {
-    auto it = this->instructionFlops.find(item);
+  for (const auto& Item : Sequence) {
+    auto It = instructionFlops().find(Item);
 
-    if (it == this->instructionFlops.end()) {
-      workerLog::error() << "Instruction group " << item << " undefined in "
-                         << name() << ".";
-      return EXIT_FAILURE;
+    if (It == instructionFlops().end()) {
+      workerLog::error() << "Instruction group " << Item << " undefined in " << name() << ".";
     }
 
-    flops += it->second;
+    Stats.Flops += It->second;
 
-    it = this->instructionMemory.find(item);
+    It = instructionMemory().find(Item);
 
-    if (it != this->instructionMemory.end()) {
-      bytes += it->second;
+    if (It != instructionMemory().end()) {
+      Stats.Bytes += It->second;
     }
   }
 
-  this->_flops = repetitions * flops;
-  this->_bytes = repetitions * bytes;
-  this->_instructions = repetitions * sequence.size() * 4 + 6;
+  Stats.Flops *= Repetitions;
+  Stats.Bytes *= Repetitions;
+  Stats.Instructions = Repetitions * Sequence.size() * 4 + 6;
 
   // calculate the buffer sizes
-  auto l1i_cache_size = instructionCacheSize / thread;
-  auto dataCacheBufferSizeIterator = dataCacheBufferSize.begin();
-  auto l1_size = *dataCacheBufferSizeIterator / thread;
-  std::advance(dataCacheBufferSizeIterator, 1);
-  auto l2_size = *dataCacheBufferSizeIterator / thread;
-  std::advance(dataCacheBufferSizeIterator, 1);
-  auto l3_size = *dataCacheBufferSizeIterator / thread;
-  auto ram_size = ramBufferSize / thread;
+  const auto L1iCacheSize = Settings.instructionCacheSizePerThread();
+  const auto DataCacheBufferSizes = Settings.dataCacheBufferSizePerThread();
+  auto DataCacheBufferSizeIterator = DataCacheBufferSizes.begin();
+  const auto L1Size = *DataCacheBufferSizeIterator;
+  std::advance(DataCacheBufferSizeIterator, 1);
+  const auto L2Size = *DataCacheBufferSizeIterator;
+  std::advance(DataCacheBufferSizeIterator, 1);
+  const auto L3Size = *DataCacheBufferSizeIterator;
+  const auto RamSize = Settings.ramBufferSizePerThread();
 
   // calculate the reset counters for the buffers
-  auto l2_loop_count =
-      getL2LoopCount(sequence, numberOfLines, l2_size * thread, thread);
-  auto l3_loop_count =
-      getL3LoopCount(sequence, numberOfLines, l3_size * thread, thread);
-  auto ram_loop_count =
-      getRAMLoopCount(sequence, numberOfLines, ram_size * thread, thread);
-
-  CodeHolder code;
-  code.init(this->rt.environment());
-
-  if (nullptr != this->loadFunction) {
-    this->rt.release(&this->loadFunction);
-  }
-
-  Builder cb(&code);
-  cb.addDiagnosticOptions(
-    asmjit::DiagnosticOptions::kValidateAssembler | 
-    asmjit::DiagnosticOptions::kValidateIntermediate );
-
-  auto pointer_reg = rax;
-  auto l1_addr = rbx;
-  auto l2_addr = rcx;
-  auto l3_addr = r8;
-  auto ram_addr = r9;
-  auto l2_count_reg = r10;
-  auto l3_count_reg = r11;
-  auto ram_count_reg = r12;
-  auto temp_reg = r13;
-  auto temp_reg2 = rbp;
-  auto offset_reg = r14;
-  auto addrHigh_reg = r15;
-  auto iter_reg = mm0;
-  auto shift_reg = std::vector<Gp>({rdi, rsi, rdx});
-  auto shift_reg32 = std::vector<Gp>({edi, esi, edx});
-  auto nr_shift_regs = 3;
-  auto mul_regs = 3;
-  auto add_regs = 24;
-  auto alt_dst_regs = 5;
-  auto ram_reg = zmm30;
-
-  FuncDetail func;
-  func.init(FuncSignatureT<unsigned long long, unsigned long long *,
-                           volatile unsigned long long *, unsigned long long>(
-                CallConvId::kCDecl),
-            this->rt.environment());
-
-  FuncFrame frame;
-  frame.init(func);
+  const auto L2LoopCount =
+      environment::payload::PayloadSettings::getL2LoopCount(Sequence, Settings.linesPerThread(), L2Size);
+  const auto L3LoopCount =
+      environment::payload::PayloadSettings::getL3LoopCount(Sequence, Settings.linesPerThread(), L3Size);
+  const auto RamLoopCount =
+      environment::payload::PayloadSettings::getRAMLoopCount(Sequence, Settings.linesPerThread(), RamSize);
+
+  asmjit::CodeHolder Code;
+  Code.init(asmjit::Environment::host());
+
+  asmjit::x86::Builder Cb(&Code);
+  Cb.addDiagnosticOptions(asmjit::DiagnosticOptions::kValidateAssembler |
+                          asmjit::DiagnosticOptions::kValidateIntermediate);
+
+  const auto PointerReg = asmjit::x86::rax;
+  const auto L1Addr = asmjit::x86::rbx;
+  const auto L2Addr = asmjit::x86::rcx;
+  const auto L3Addr = asmjit::x86::r8;
+  const auto RamAddr = asmjit::x86::r9;
+  const auto L2CountReg = asmjit::x86::r10;
+  const auto L3CountReg = asmjit::x86::r11;
+  const auto RamCountReg = asmjit::x86::r12;
+  const auto TempReg = asmjit::x86::r13;
+  const auto TempReg2 = asmjit::x86::rbp;
+  const auto OffsetReg = asmjit::x86::r14;
+  const auto AddrHighReg = asmjit::x86::r15;
+  const auto IterReg = asmjit::x86::mm0;
+  const auto ShiftReg = std::vector<asmjit::x86::Gp>({asmjit::x86::rdi, asmjit::x86::rsi, asmjit::x86::rdx});
+  const auto ShiftReg32 = std::vector<asmjit::x86::Gp>({asmjit::x86::edi, asmjit::x86::esi, asmjit::x86::edx});
+  const auto NrShiftRegs = 3;
+  const auto MulRegs = 3;
+  const auto AddRegs = 22;
+  const auto AltDstRegs = 5;
+  const auto RamReg = asmjit::x86::zmm30;
+
+  asmjit::FuncDetail Func;
+  Func.init(asmjit::FuncSignature::build<uint64_t, double*, volatile LoadThreadWorkType*, uint64_t>(
+                asmjit::CallConvId::kCDecl),
+            Code.environment());
+
+  asmjit::FuncFrame Frame;
+  Frame.init(Func);
 
   // make zmm registers dirty
-  for (int i = 0; i < 32; i++) {
-    frame.addDirtyRegs(Zmm(i));
+  for (auto I = 0U; I < 32U; I++) {
+    Frame.addDirtyRegs(Zmm(I));
   }
-  for (int i = 0; i < 8; i++) {
-    frame.addDirtyRegs(Mm(i));
+  for (auto I = 0U; I < 8U; I++) {
+    Frame.addDirtyRegs(asmjit::x86::Mm(I));
   }
   // make all other used registers dirty except RAX
-  frame.addDirtyRegs(l1_addr, l2_addr, l3_addr, ram_addr, l2_count_reg,
-                     l3_count_reg, ram_count_reg, temp_reg, offset_reg,
-                     addrHigh_reg, iter_reg, ram_addr);
-  for (const auto &reg : shift_reg) {
-    frame.addDirtyRegs(reg);
+  Frame.addDirtyRegs(L1Addr, L2Addr, L3Addr, RamAddr, L2CountReg, L3CountReg, RamCountReg, TempReg, TempReg2, OffsetReg,
+                     AddrHighReg, IterReg, RamAddr);
+  for (const auto& Reg : ShiftReg) {
+    Frame.addDirtyRegs(Reg);
   }
 
-  FuncArgsAssignment args(&func);
+  asmjit::FuncArgsAssignment Args(&Func);
   // FIXME: asmjit assigment to mm0 does not seem to be supported
-  args.assignAll(pointer_reg, addrHigh_reg, temp_reg);
-  args.updateFuncFrame(frame);
-  frame.finalize();
+  Args.assignAll(PointerReg, AddrHighReg, TempReg);
+  Args.updateFuncFrame(Frame);
+  Frame.finalize();
 
-  cb.emitProlog(frame);
-  cb.emitArgsAssignment(frame, args);
+  Cb.emitProlog(Frame);
+  Cb.emitArgsAssignment(Frame, Args);
 
   // FIXME: movq from temp_reg to iter_reg
-  cb.movq(iter_reg, temp_reg);
+  Cb.movq(IterReg, TempReg);
 
   // stop right away if low load is selected
-  auto FunctionExit = cb.newLabel();
+  auto FunctionExit = Cb.newLabel();
 
-  cb.mov(temp_reg, ptr_64(addrHigh_reg));
-  cb.test(temp_reg, temp_reg);
-  cb.jz(FunctionExit);
+  Cb.mov(TempReg, ptr_64(AddrHighReg));
+  Cb.test(TempReg, TempReg);
+  Cb.jz(FunctionExit);
 
-  cb.mov(offset_reg,
+  Cb.mov(OffsetReg,
          Imm(64)); // increment after each cache/memory access
   // Initialize registers for shift operations
-  for (auto const &reg : shift_reg32) {
-    cb.mov(reg, Imm(0xAAAAAAAA));
+  for (auto const& Reg : ShiftReg32) {
+    Cb.mov(Reg, Imm(0xAAAAAAAA));
   }
   // Initialize AVX512-Registers for FMA Operations
-  cb.vmovapd(zmm0, zmmword_ptr(pointer_reg));
-  cb.vmovapd(zmm1, zmmword_ptr(pointer_reg, 64));
-  cb.vmovapd(zmm2, zmmword_ptr(pointer_reg, 128));
-  auto add_start = mul_regs;
-  auto add_end = mul_regs + add_regs - 1;
-  auto trans_start = add_regs + mul_regs;
-  auto trans_end = add_regs + mul_regs + alt_dst_regs - 1;
-  for (int i = add_start; i <= trans_end; i++) {
-    cb.vmovapd(Zmm(i), zmmword_ptr(pointer_reg, 256 + i * 64));
-  }
-  cb.mov(l1_addr, pointer_reg); // address for L1-buffer
-  cb.mov(l2_addr, pointer_reg);
-  cb.add(l2_addr, Imm(l1_size)); // address for L2-buffer
-  cb.mov(l3_addr, pointer_reg);
-  cb.add(l3_addr, Imm(l2_size)); // address for L3-buffer
-  cb.mov(ram_addr, pointer_reg);
-  cb.add(ram_addr, Imm(l3_size)); // address for RAM-buffer
-  cb.mov(l2_count_reg, Imm(l2_loop_count));
-  workerLog::trace() << "reset counter for L2-buffer with "
-                     << l2_loop_count
-                     << " cache line accesses per loop ("
-		     << l2_size/1024
-                     << ") KiB";
-  cb.mov(l3_count_reg, Imm(l3_loop_count));
-  workerLog::trace() << "reset counter for L3-buffer with "
-                     << l3_loop_count
-                     << " cache line accesses per loop ("
-		     << l3_size/1024
-                     << ") KiB";
-  cb.mov(ram_count_reg, Imm(ram_loop_count));
-  workerLog::trace() << "reset counter for RAM-buffer with "
-                     << ram_loop_count
-                     << " cache line accesses per loop ("
-		     << ram_size/1024
-                     << ") KiB";
-
-  cb.align(AlignMode::kCode, 64);
-
-  auto Loop = cb.newLabel();
-  cb.bind(Loop);
-
-  auto shift_pos = 0;
-  bool left = false;
-  auto add_dest = add_start + 1;
-  auto mov_dst = trans_start;
-  auto mov_src = mov_dst + 1;
-  unsigned l1_offset = 0;
-
-#define L1_INCREMENT()                                                         \
-  l1_offset += 64;                                                             \
-  if (l1_offset < l1_size * 0.5) {                                             \
-    cb.add(l1_addr, offset_reg);                                               \
-  } else {                                                                     \
-    l1_offset = 0;                                                             \
-    cb.mov(l1_addr, pointer_reg);                                              \
+  Cb.vmovapd(zmm0, zmmword_ptr(PointerReg, 0));
+  Cb.vmovapd(zmm1, zmmword_ptr(PointerReg, 64));
+  Cb.vmovapd(zmm2, zmmword_ptr(PointerReg, 128));
+  auto AddStart = MulRegs;
+  auto AddEnd = MulRegs + AddRegs - 1;
+  auto TransStart = AddRegs + MulRegs;
+  auto TransEnd = AddRegs + MulRegs + AltDstRegs - 1;
+  for (auto I = AddStart; I <= TransEnd; I++) {
+    Cb.vmovapd(Zmm(I), zmmword_ptr(PointerReg, 256 + (I * 64)));
   }
-
-#define L2_INCREMENT() cb.add(l2_addr, offset_reg)
-
-#define L3_INCREMENT() cb.add(l3_addr, offset_reg)
-
-#define RAM_INCREMENT() cb.add(ram_addr, offset_reg)
-
-  for (unsigned count = 0; count < repetitions; count++) {
-    for (const auto &item : sequence) {
-      if (item == "REG") {
-        cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2);
-        cb.vfmadd231pd(Zmm(mov_dst), zmm2, zmm1);
-        cb.xor_(shift_reg[(shift_pos + nr_shift_regs - 1) % nr_shift_regs],
-                temp_reg);
-        mov_dst++;
-      } else if (item == "L1_L") {
-        cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2);
-        cb.vfmadd231pd(Zmm(add_dest), zmm1, zmmword_ptr(l1_addr, 64));
-        L1_INCREMENT();
-      } else if (item == "L1_BROADCAST") {
-        cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2);
-        cb.vbroadcastsd(Zmm(add_dest), ptr_64(l1_addr, 64));
-        L1_INCREMENT();
-      } else if (item == "L1_S") {
-        cb.vmovapd(zmmword_ptr(l1_addr, 64), Zmm(add_dest));
-        cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2);
-        L1_INCREMENT();
-      } else if (item == "L1_LS") {
-        cb.vmovapd(zmmword_ptr(l1_addr, 64), Zmm(add_dest));
-        cb.vfmadd231pd(Zmm(add_dest), zmm0, zmmword_ptr(l1_addr, 128));
-        L1_INCREMENT();
-      } else if (item == "L2_L") {
-        cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2);
-        cb.vfmadd231pd(Zmm(add_dest), zmm1, zmmword_ptr(l2_addr, 64));
-        L2_INCREMENT();
-      } else if (item == "L2_S") {
-        cb.vmovapd(zmmword_ptr(l2_addr, 64), Zmm(add_dest));
-        cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2);
-        L2_INCREMENT();
-      } else if (item == "L2_LS") {
-        cb.vmovapd(zmmword_ptr(l2_addr, 64), Zmm(add_dest));
-        cb.vfmadd231pd(Zmm(add_dest), zmm0, zmmword_ptr(l2_addr, 128));
-        L2_INCREMENT();
-      } else if (item == "L3_L") {
-        cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2);
-        cb.vfmadd231pd(Zmm(add_dest), zmm1, zmmword_ptr(l3_addr, 64));
-        L3_INCREMENT();
-      } else if (item == "L3_S") {
-        cb.vmovapd(zmmword_ptr(l3_addr, 64), Zmm(add_dest));
-        cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2);
-        L3_INCREMENT();
-      } else if (item == "L3_LS") {
-        cb.vmovapd(zmmword_ptr(l3_addr, 64), Zmm(add_dest));
-        cb.vfmadd231pd(Zmm(add_dest), zmm0, zmmword_ptr(l3_addr, 128));
-        L3_INCREMENT();
-      } else if (item == "L3_P") {
-        cb.vfmadd231pd(Zmm(add_dest), zmm0, zmmword_ptr(l1_addr, 64));
-        cb.prefetcht2(ptr(l3_addr));
-        L3_INCREMENT();
-      } else if (item == "RAM_L") {
-        cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2);
-        cb.vfmadd231pd(ram_reg, zmm1, zmmword_ptr(ram_addr, 64));
-        RAM_INCREMENT();
-      } else if (item == "RAM_S") {
-        cb.vmovapd(zmmword_ptr(ram_addr, 64), Zmm(add_dest));
-        cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2);
-        RAM_INCREMENT();
-      } else if (item == "RAM_LS") {
-        cb.vmovapd(zmmword_ptr(ram_addr, 64), Zmm(add_dest));
-        cb.vfmadd231pd(Zmm(add_dest), zmm0, zmmword_ptr(ram_addr, 128));
-        RAM_INCREMENT();
-      } else if (item == "RAM_P") {
-        cb.vfmadd231pd(Zmm(add_dest), zmm0, zmmword_ptr(l1_addr, 64));
-        cb.prefetcht2(ptr(ram_addr));
-        RAM_INCREMENT();
+  Cb.mov(L1Addr, PointerReg); // address for L1-buffer
+  Cb.mov(L2Addr, PointerReg);
+  Cb.add(L2Addr, Imm(L1Size)); // address for L2-buffer
+  Cb.mov(L3Addr, PointerReg);
+  Cb.add(L3Addr, Imm(L2Size)); // address for L3-buffer
+  Cb.mov(RamAddr, PointerReg);
+  Cb.add(RamAddr, Imm(L3Size)); // address for RAM-buffer
+  Cb.mov(L2CountReg, Imm(L2LoopCount));
+  workerLog::trace() << "reset counter for L2-buffer with " << L2LoopCount << " cache line accesses per loop ("
+                     << L2Size / 1024 << ") KiB";
+  Cb.mov(L3CountReg, Imm(L3LoopCount));
+  workerLog::trace() << "reset counter for L3-buffer with " << L3LoopCount << " cache line accesses per loop ("
+                     << L3Size / 1024 << ") KiB";
+  Cb.mov(RamCountReg, Imm(RamLoopCount));
+  workerLog::trace() << "reset counter for RAM-buffer with " << RamLoopCount << " cache line accesses per loop ("
+                     << RamSize / 1024 << ") KiB";
+
+  Cb.align(asmjit::AlignMode::kCode, 64);
+
+  auto Loop = Cb.newLabel();
+  Cb.bind(Loop);
+
+  auto ShiftPos = 0;
+  bool Left = false;
+  auto AddDest = AddStart + 1;
+  auto MovDst = TransStart;
+  unsigned L1Offset = 0;
+
+  const auto L1Increment = [&Cb, &L1Offset, &L1Size, &L1Addr, &OffsetReg, &PointerReg]() {
+    L1Offset += 64;
+    if (L1Offset < L1Size * 0.5) {
+      Cb.add(L1Addr, OffsetReg);
+    } else {
+      L1Offset = 0;
+      Cb.mov(L1Addr, PointerReg);
+    }
+  };
+  const auto L2Increment = [&Cb, &L2Addr, &OffsetReg]() { Cb.add(L2Addr, OffsetReg); };
+  const auto L3Increment = [&Cb, &L3Addr, &OffsetReg]() { Cb.add(L3Addr, OffsetReg); };
+  const auto RamIncrement = [&Cb, &RamAddr, &OffsetReg]() { Cb.add(RamAddr, OffsetReg); };
+
+  for (auto Count = 0U; Count < Repetitions; Count++) {
+    for (const auto& Item : Sequence) {
+      if (Item == "REG") {
+        Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmm2);
+        Cb.vfmadd231pd(Zmm(MovDst), zmm2, zmm1);
+        Cb.xor_(ShiftReg[(ShiftPos + NrShiftRegs - 1) % NrShiftRegs], TempReg);
+        MovDst++;
+      } else if (Item == "L1_L") {
+        Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmm2);
+        Cb.vfmadd231pd(Zmm(AddDest), zmm1, zmmword_ptr(L1Addr, 64));
+        L1Increment();
+      } else if (Item == "L1_BROADCAST") {
+        Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmm2);
+        Cb.vbroadcastsd(Zmm(AddDest), ptr_64(L1Addr, 64));
+        L1Increment();
+      } else if (Item == "L1_S") {
+        Cb.vmovapd(zmmword_ptr(L1Addr, 64), Zmm(AddDest));
+        Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmm2);
+        L1Increment();
+      } else if (Item == "L1_LS") {
+        Cb.vmovapd(zmmword_ptr(L1Addr, 64), Zmm(AddDest));
+        Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmmword_ptr(L1Addr, 128));
+        L1Increment();
+      } else if (Item == "L2_L") {
+        Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmm2);
+        Cb.vfmadd231pd(Zmm(AddDest), zmm1, zmmword_ptr(L2Addr, 64));
+        L2Increment();
+      } else if (Item == "L2_S") {
+        Cb.vmovapd(zmmword_ptr(L2Addr, 64), Zmm(AddDest));
+        Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmm2);
+        L2Increment();
+      } else if (Item == "L2_LS") {
+        Cb.vmovapd(zmmword_ptr(L2Addr, 64), Zmm(AddDest));
+        Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmmword_ptr(L2Addr, 128));
+        L2Increment();
+      } else if (Item == "L3_L") {
+        Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmm2);
+        Cb.vfmadd231pd(Zmm(AddDest), zmm1, zmmword_ptr(L3Addr, 64));
+        L3Increment();
+      } else if (Item == "L3_S") {
+        Cb.vmovapd(zmmword_ptr(L3Addr, 64), Zmm(AddDest));
+        Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmm2);
+        L3Increment();
+      } else if (Item == "L3_LS") {
+        Cb.vmovapd(zmmword_ptr(L3Addr, 64), Zmm(AddDest));
+        Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmmword_ptr(L3Addr, 128));
+        L3Increment();
+      } else if (Item == "L3_P") {
+        Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmmword_ptr(L1Addr, 64));
+        Cb.prefetcht2(ptr(L3Addr));
+        L3Increment();
+      } else if (Item == "RAM_L") {
+        Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmm2);
+        Cb.vfmadd231pd(RamReg, zmm1, zmmword_ptr(RamAddr, 64));
+        RamIncrement();
+      } else if (Item == "RAM_S") {
+        Cb.vmovapd(zmmword_ptr(RamAddr, 64), Zmm(AddDest));
+        Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmm2);
+        RamIncrement();
+      } else if (Item == "RAM_LS") {
+        Cb.vmovapd(zmmword_ptr(RamAddr, 64), Zmm(AddDest));
+        Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmmword_ptr(RamAddr, 128));
+        RamIncrement();
+      } else if (Item == "RAM_P") {
+        Cb.vfmadd231pd(Zmm(AddDest), zmm0, zmmword_ptr(L1Addr, 64));
+        Cb.prefetcht2(ptr(RamAddr));
+        RamIncrement();
       } else {
-        workerLog::error() << "Instruction group " << item << " not found in "
-                           << this->name() << ".";
-        return EXIT_FAILURE;
+        workerLog::error() << "Instruction group " << Item << " not found in " << name() << ".";
       }
 
-      if (left) {
-        cb.shr(shift_reg32[shift_pos], Imm(1));
+      if (Left) {
+        Cb.shr(ShiftReg32[ShiftPos], Imm(1));
       } else {
-        cb.shl(shift_reg32[shift_pos], Imm(1));
+        Cb.shl(ShiftReg32[ShiftPos], Imm(1));
       }
-      add_dest++;
-      if (add_dest > add_end) {
-        add_dest = add_start;
+      AddDest++;
+      if (AddDest > AddEnd) {
+        AddDest = AddStart;
       }
-      if (mov_dst > trans_end) {
-        mov_dst = trans_start;
+      if (MovDst > TransEnd) {
+        MovDst = TransStart;
       }
-      mov_src++;
-      if (mov_src > trans_end) {
-        mov_src = trans_start;
-      }
-      shift_pos++;
-      if (shift_pos == nr_shift_regs) {
-        shift_pos = 0;
-        left = !left;
+      ShiftPos++;
+      if (ShiftPos == NrShiftRegs) {
+        ShiftPos = 0;
+        Left = !Left;
       }
     }
   }
 
-  cb.movq(temp_reg, iter_reg); // restore iteration counter
-  if (this->getRAMSequenceCount(sequence) > 0) {
+  Cb.movq(TempReg, IterReg); // restore iteration counter
+  if (environment::payload::PayloadSettings::getRAMSequenceCount(Sequence) > 0) {
     // reset RAM counter
-    auto NoRamReset = cb.newLabel();
-
-    cb.sub(ram_count_reg, Imm(1));
-    cb.jnz(NoRamReset);
-    cb.mov(ram_count_reg, Imm(ram_loop_count));
-    cb.mov(ram_addr, pointer_reg);
-    cb.add(ram_addr, Imm(l3_size));
-    cb.bind(NoRamReset);
+    auto NoRamReset = Cb.newLabel();
+
+    Cb.sub(RamCountReg, Imm(1));
+    Cb.jnz(NoRamReset);
+    Cb.mov(RamCountReg, Imm(RamLoopCount));
+    Cb.mov(RamAddr, PointerReg);
+    Cb.add(RamAddr, Imm(L3Size));
+    Cb.bind(NoRamReset);
     // adds always two instruction
-    this->_instructions += 2;
+    Stats.Instructions += 2;
   }
-  cb.inc(temp_reg); // increment iteration counter
-  if (this->getL2SequenceCount(sequence) > 0) {
+  Cb.inc(TempReg); // increment iteration counter
+  if (environment::payload::PayloadSettings::getL2SequenceCount(Sequence) > 0) {
     // reset L2-Cache counter
-    auto NoL2Reset = cb.newLabel();
-
-    cb.sub(l2_count_reg, Imm(1));
-    cb.jnz(NoL2Reset);
-    cb.mov(l2_count_reg, Imm(l2_loop_count));
-    cb.mov(l2_addr, pointer_reg);
-    cb.add(l2_addr, Imm(l1_size));
-    cb.bind(NoL2Reset);
+    auto NoL2Reset = Cb.newLabel();
+
+    Cb.sub(L2CountReg, Imm(1));
+    Cb.jnz(NoL2Reset);
+    Cb.mov(L2CountReg, Imm(L2LoopCount));
+    Cb.mov(L2Addr, PointerReg);
+    Cb.add(L2Addr, Imm(L1Size));
+    Cb.bind(NoL2Reset);
     // adds always two instruction
-    this->_instructions += 2;
+    Stats.Instructions += 2;
   }
-  cb.movq(iter_reg, temp_reg); // store iteration counter
-  if (this->getL3SequenceCount(sequence) > 0) {
+  Cb.movq(IterReg, TempReg); // store iteration counter
+  if (environment::payload::PayloadSettings::getL3SequenceCount(Sequence) > 0) {
     // reset L3-Cache counter
-    auto NoL3Reset = cb.newLabel();
-
-    cb.sub(l3_count_reg, Imm(1));
-    cb.jnz(NoL3Reset);
-    cb.mov(l3_count_reg, Imm(l3_loop_count));
-    cb.mov(l3_addr, pointer_reg);
-    cb.add(l3_addr, Imm(l2_size));
-    cb.bind(NoL3Reset);
+    auto NoL3Reset = Cb.newLabel();
+
+    Cb.sub(L3CountReg, Imm(1));
+    Cb.jnz(NoL3Reset);
+    Cb.mov(L3CountReg, Imm(L3LoopCount));
+    Cb.mov(L3Addr, PointerReg);
+    Cb.add(L3Addr, Imm(L2Size));
+    Cb.bind(NoL3Reset);
     // adds always two instruction
-    this->_instructions += 2;
+    Stats.Instructions += 2;
   }
-  cb.mov(l1_addr, pointer_reg);
-
-  if (dumpRegisters) {
-    auto SkipRegistersDump = cb.newLabel();
+  Cb.mov(L1Addr, PointerReg);
 
-    cb.test(ptr_64(pointer_reg, -8), Imm(firestarter::DumpVariable::Wait));
-    cb.jnz(SkipRegistersDump);
-
-    // dump all the ymm register
-    for (int i = 0; i < (int)this->registerCount(); i++) {
-      cb.vmovapd(
-          zmmword_ptr(pointer_reg, -64 - this->registerSize() * 8 * (i + 1)),
-          Zmm(i));
-    }
-
-    // set read flag
-    cb.mov(ptr_64(pointer_reg, -8), Imm(firestarter::DumpVariable::Wait));
-
-    cb.bind(SkipRegistersDump);
+  if (DumpRegisters) {
+    emitDumpRegisterCode<Zmm>(Cb, PointerReg, zmmword_ptr);
   }
 
-  if (errorDetection) {
-    this->emitErrorDetectionCode<decltype(iter_reg), Zmm>(
-        cb, iter_reg, addrHigh_reg, pointer_reg, temp_reg, temp_reg2);
+  if (ErrorDetection) {
+    emitErrorDetectionCode<decltype(IterReg), Zmm>(Cb, IterReg, AddrHighReg, PointerReg, TempReg, TempReg2);
   }
 
-  cb.test(ptr_64(addrHigh_reg), Imm(LOAD_HIGH));
-  cb.jnz(Loop);
+  Cb.test(ptr_64(AddrHighReg), Imm(LoadThreadWorkType::LoadHigh));
+  Cb.jnz(Loop);
 
-  cb.bind(FunctionExit);
+  Cb.bind(FunctionExit);
 
-  cb.movq(rax, iter_reg);
+  Cb.movq(asmjit::x86::rax, IterReg);
 
-  cb.emitEpilog(frame);
+  Cb.emitEpilog(Frame);
 
-  cb.finalize();
+  Cb.finalize();
 
-  // String sb;
-  // cb.dump(sb);
-
-  Error err = this->rt.add(&this->loadFunction, &code);
-  if (err) {
-    workerLog::error() << "Asmjit adding Assembler to JitRuntime failed in "
-                       << __FILE__ << " at " << __LINE__;
-    return EXIT_FAILURE;
-  }
+  auto CompiledPayloadPtr = CompiledX86Payload::create<AVX512Payload>(Stats, Code);
 
   // skip if we could not determine cache size
-  if (l1i_cache_size != 0) {
-    auto loopSize = code.labelOffset(FunctionExit) - code.labelOffset(Loop);
-    auto instructionCachePercentage = 100 * loopSize / l1i_cache_size;
+  if (L1iCacheSize) {
+    auto LoopSize = Code.labelOffset(FunctionExit) - Code.labelOffset(Loop);
+    auto InstructionCachePercentage = 100 * LoopSize / *L1iCacheSize;
 
-    if (loopSize > l1i_cache_size) {
+    if (LoopSize > *L1iCacheSize) {
       workerLog::warn() << "Work-loop is bigger than the L1i-Cache.";
     }
 
-    workerLog::trace() << "Using " << loopSize << " of " << l1i_cache_size
-                       << " Bytes (" << instructionCachePercentage
+    workerLog::trace() << "Using " << LoopSize << " of " << *L1iCacheSize << " Bytes (" << InstructionCachePercentage
                        << "%) from the L1i-Cache for the work-loop.";
-    workerLog::trace() << "Sequence size: " << sequence.size();
-    workerLog::trace() << "Repetition count: " << repetitions;
+    workerLog::trace() << "Sequence size: " << Sequence.size();
+    workerLog::trace() << "Repetition count: " << Repetitions;
   }
 
-  return EXIT_SUCCESS;
+  return CompiledPayloadPtr;
 }
 
-std::list<std::string> AVX512Payload::getAvailableInstructions() const {
-  std::list<std::string> instructions;
-
-  transform(this->instructionFlops.begin(), this->instructionFlops.end(),
-            back_inserter(instructions),
-            [](const auto &item) { return item.first; });
-
-  return instructions;
+void AVX512Payload::init(double* MemoryAddr, uint64_t BufferSize) const { // forwards to X86Payload::initMemory
+  X86Payload::initMemory(MemoryAddr, BufferSize, 0.27948995982e-4, 0.27948995982e-4); // same constant for both args
 }
 
-void AVX512Payload::init(unsigned long long *memoryAddr,
-                         unsigned long long bufferSize) {
-  X86Payload::init(memoryAddr, bufferSize, 0.27948995982e-4, 0.27948995982e-4);
-}
+} // namespace firestarter::environment::x86::payload
\ No newline at end of file
diff --git a/src/firestarter/Environment/X86/Payload/AVXPayload.cpp b/src/firestarter/Environment/X86/Payload/AVXPayload.cpp
index b6899025..b20a85f7 100644
--- a/src/firestarter/Environment/X86/Payload/AVXPayload.cpp
+++ b/src/firestarter/Environment/X86/Payload/AVXPayload.cpp
@@ -19,475 +19,403 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
-#include <firestarter/Environment/X86/Payload/AVXPayload.hpp>
-#include <firestarter/Logging/Log.hpp>
-
-#include <iterator>
-#include <utility>
-
-using namespace firestarter::environment::x86::payload;
-using namespace asmjit;
-using namespace asmjit::x86;
-
-int AVXPayload::compilePayload(
-    std::vector<std::pair<std::string, unsigned>> const &proportion,
-    unsigned instructionCacheSize,
-    std::list<unsigned> const &dataCacheBufferSize, unsigned ramBufferSize,
-    unsigned thread, unsigned numberOfLines, bool dumpRegisters,
-    bool errorDetection) {
+#include "firestarter/Environment/X86/Payload/AVXPayload.hpp"
+#include "firestarter/Environment/X86/Payload/CompiledX86Payload.hpp"
+
+namespace firestarter::environment::x86::payload {
+
+auto AVXPayload::compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters,
+                                bool ErrorDetection) const -> environment::payload::CompiledPayload::UniquePtr {
+  using Imm = asmjit::Imm;
+  using Mm = asmjit::x86::Mm;
+  using Xmm = asmjit::x86::Xmm;
+  using Ymm = asmjit::x86::Ymm;
+  // NOLINTNEXTLINE(readability-identifier-naming)
+  constexpr asmjit::x86::Mem (*xmmword_ptr)(const asmjit::x86::Gp&, int32_t) = asmjit::x86::xmmword_ptr;
+
   // Compute the sequence of instruction groups and the number of its repetions
   // to reach the desired size
-  auto sequence = this->generateSequence(proportion);
-  auto repetitions =
-      this->getNumberOfSequenceRepetitions(sequence, numberOfLines / thread);
+  auto Sequence = Settings.sequence();
+  auto Repetitions =
+      environment::payload::PayloadSettings::getNumberOfSequenceRepetitions(Sequence, Settings.linesPerThread());
 
   // compute count of flops and memory access for performance report
-  unsigned flops = 0;
-  unsigned bytes = 0;
+  environment::payload::PayloadStats Stats;
 
-  for (const auto &item : sequence) {
-    auto it = this->instructionFlops.find(item);
+  for (const auto& Item : Sequence) {
+    auto It = instructionFlops().find(Item);
 
-    if (it == this->instructionFlops.end()) {
-      workerLog::error() << "Instruction group " << item << " undefined in "
-                         << name() << ".";
-      return EXIT_FAILURE;
+    if (It == instructionFlops().end()) {
+      workerLog::error() << "Instruction group " << Item << " undefined in " << name() << ".";
     }
 
-    flops += it->second;
+    Stats.Flops += It->second;
 
-    it = this->instructionMemory.find(item);
+    It = instructionMemory().find(Item);
 
-    if (it != this->instructionMemory.end()) {
-      bytes += it->second;
+    if (It != instructionMemory().end()) {
+      Stats.Bytes += It->second;
     }
   }
 
-  this->_flops = repetitions * flops;
-  this->_bytes = repetitions * bytes;
-  this->_instructions = repetitions * sequence.size() * 2 + 4;
+  Stats.Flops *= Repetitions;
+  Stats.Bytes *= Repetitions;
+  Stats.Instructions = Repetitions * Sequence.size() * 2 + 4;
 
   // calculate the buffer sizes
-  auto l1i_cache_size = instructionCacheSize / thread;
-  auto dataCacheBufferSizeIterator = dataCacheBufferSize.begin();
-  auto l1_size = *dataCacheBufferSizeIterator / thread;
-  std::advance(dataCacheBufferSizeIterator, 1);
-  auto l2_size = *dataCacheBufferSizeIterator / thread;
-  std::advance(dataCacheBufferSizeIterator, 1);
-  auto l3_size = *dataCacheBufferSizeIterator / thread;
-  auto ram_size = ramBufferSize / thread;
+  const auto L1iCacheSize = Settings.instructionCacheSizePerThread();
+  const auto DataCacheBufferSizes = Settings.dataCacheBufferSizePerThread();
+  auto DataCacheBufferSizeIterator = DataCacheBufferSizes.begin();
+  const auto L1Size = *DataCacheBufferSizeIterator;
+  std::advance(DataCacheBufferSizeIterator, 1);
+  const auto L2Size = *DataCacheBufferSizeIterator;
+  std::advance(DataCacheBufferSizeIterator, 1);
+  const auto L3Size = *DataCacheBufferSizeIterator;
+  const auto RamSize = Settings.ramBufferSizePerThread();
 
   // calculate the reset counters for the buffers
-  auto l2_loop_count =
-      getL2LoopCount(sequence, numberOfLines, l2_size * thread, thread);
-  auto l3_loop_count =
-      getL3LoopCount(sequence, numberOfLines, l3_size * thread, thread);
-  auto ram_loop_count =
-      getRAMLoopCount(sequence, numberOfLines, ram_size * thread, thread);
-
-  CodeHolder code;
-  code.init(this->rt.environment());
-
-  if (nullptr != this->loadFunction) {
-    this->rt.release(&this->loadFunction);
-  }
-
-  Builder cb(&code);
-  cb.addDiagnosticOptions(
-    asmjit::DiagnosticOptions::kValidateAssembler | 
-    asmjit::DiagnosticOptions::kValidateIntermediate );
-
-  auto pointer_reg = rax;
-  auto l1_addr = rbx;
-  auto l2_addr = rcx;
-  auto l3_addr = rdx;
-  auto ram_addr = rdi;
-  auto l2_count_reg = r8;
-  auto l3_count_reg = r9;
-  auto ram_count_reg = r10;
-  auto temp_reg = r11;
-  auto temp_reg2 = rbp;
-  auto offset_reg = r12;
-  auto addrHigh_reg = r13;
-  auto iter_reg = r14;
-  auto shift_regs = 6;
-  auto add_regs = 10;
-  auto trans_regs = 6;
-
-  FuncDetail func;
-  func.init(FuncSignatureT<unsigned long long, unsigned long long *,
-                           volatile unsigned long long *, unsigned long long>(
-                CallConvId::kCDecl),
-            this->rt.environment());
-
-  FuncFrame frame;
-  frame.init(func);
+  const auto L2LoopCount =
+      environment::payload::PayloadSettings::getL2LoopCount(Sequence, Settings.linesPerThread(), L2Size);
+  const auto L3LoopCount =
+      environment::payload::PayloadSettings::getL3LoopCount(Sequence, Settings.linesPerThread(), L3Size);
+  const auto RamLoopCount =
+      environment::payload::PayloadSettings::getRAMLoopCount(Sequence, Settings.linesPerThread(), RamSize);
+
+  asmjit::CodeHolder Code;
+  Code.init(asmjit::Environment::host());
+
+  asmjit::x86::Builder Cb(&Code);
+  Cb.addDiagnosticOptions(asmjit::DiagnosticOptions::kValidateAssembler |
+                          asmjit::DiagnosticOptions::kValidateIntermediate);
+
+  const auto PointerReg = asmjit::x86::rax;
+  const auto L1Addr = asmjit::x86::rbx;
+  const auto L2Addr = asmjit::x86::rcx;
+  const auto L3Addr = asmjit::x86::rdx;
+  const auto RamAddr = asmjit::x86::rdi;
+  const auto L2CountReg = asmjit::x86::r8;
+  const auto L3CountReg = asmjit::x86::r9;
+  const auto RamCountReg = asmjit::x86::r10;
+  const auto TempReg = asmjit::x86::r11;
+  const auto TempReg2 = asmjit::x86::rbp;
+  const auto OffsetReg = asmjit::x86::r12;
+  const auto AddrHighReg = asmjit::x86::r13;
+  const auto IterReg = asmjit::x86::r14;
+  const auto ShiftRegs = 6;
+  const auto AddRegs = 10;
+  const auto TransRegs = 6;
+
+  asmjit::FuncDetail Func;
+  Func.init(asmjit::FuncSignature::build<uint64_t, double*, volatile LoadThreadWorkType*, uint64_t>(
+                asmjit::CallConvId::kCDecl),
+            Code.environment());
+
+  asmjit::FuncFrame Frame;
+  Frame.init(Func);
 
   // make xmm registers dirty
-  for (int i = 0; i < 16; i++) {
-    frame.addDirtyRegs(Ymm(i));
+  for (auto I = 0U; I < 16U; I++) {
+    Frame.addDirtyRegs(Ymm(I));
   }
   // make mmx registers dirty
-  for (int i = 0; i < 8; i++) {
-    frame.addDirtyRegs(Mm(i));
+  for (auto I = 0U; I < 8U; I++) {
+    Frame.addDirtyRegs(Mm(I));
   }
   // make all other used registers dirty except RAX
-  frame.addDirtyRegs(l1_addr, l2_addr, l3_addr, ram_addr, l2_count_reg,
-                     l3_count_reg, ram_count_reg, temp_reg, temp_reg2,
-                     offset_reg, addrHigh_reg, iter_reg);
+  Frame.addDirtyRegs(L1Addr, L2Addr, L3Addr, RamAddr, L2CountReg, L3CountReg, RamCountReg, TempReg, TempReg2, OffsetReg,
+                     AddrHighReg, IterReg);
 
-  FuncArgsAssignment args(&func);
-  args.assignAll(pointer_reg, addrHigh_reg, iter_reg);
-  args.updateFuncFrame(frame);
-  frame.finalize();
+  asmjit::FuncArgsAssignment Args(&Func);
+  Args.assignAll(PointerReg, AddrHighReg, IterReg);
+  Args.updateFuncFrame(Frame);
+  Frame.finalize();
 
-  cb.emitProlog(frame);
-  cb.emitArgsAssignment(frame, args);
+  Cb.emitProlog(Frame);
+  Cb.emitArgsAssignment(Frame, Args);
 
   // stop right away if low load is selected
-  auto FunctionExit = cb.newLabel();
+  auto FunctionExit = Cb.newLabel();
 
-  cb.mov(temp_reg, ptr_64(addrHigh_reg));
-  cb.test(temp_reg, temp_reg);
-  cb.jz(FunctionExit);
+  Cb.mov(TempReg, ptr_64(AddrHighReg));
+  Cb.test(TempReg, TempReg);
+  Cb.jz(FunctionExit);
 
-  cb.mov(offset_reg,
+  Cb.mov(OffsetReg,
          Imm(64)); // increment after each cache/memory access
 
   // Initialize AVX-Registers for Addition
-  auto add_start = 0;
-  auto add_end = add_regs - 1;
-  auto trans_start = add_regs;
-  auto trans_end = add_regs + trans_regs - 1;
-  if (add_regs > 0) {
-    for (int i = add_start; i <= add_end; i++) {
-      cb.vmovapd(Ymm(i), ymmword_ptr(pointer_reg, 32 * i));
+  auto AddStart = 0;
+  auto AddEnd = AddRegs - 1;
+  auto TransStart = AddRegs;
+  auto TransEnd = AddRegs + TransRegs - 1;
+  if (AddRegs > 0) {
+    for (auto I = AddStart; I <= AddEnd; I++) {
+      Cb.vmovapd(Ymm(I), ymmword_ptr(PointerReg, 32 * I));
     }
   }
 
   // Initialize MMX-Registers for shift operations
-  auto shift_start = 0;
-  auto shift_end = shift_regs - 1;
-  if (shift_regs > 1) {
-    cb.mov(temp_reg, Imm(0x5555555555555555));
-    cb.movq(Mm(shift_start), temp_reg);
-    for (int i = shift_start + 1; i <= shift_end; i++) {
-      cb.movq(Mm(i), Mm(shift_start));
+  auto ShiftStart = 0;
+  auto ShiftEnd = ShiftRegs - 1;
+  if (ShiftRegs > 1) {
+    Cb.mov(TempReg, Imm(0x5555555555555555));
+    Cb.movq(Mm(ShiftStart), TempReg);
+    for (auto I = ShiftStart + 1; I <= ShiftEnd; I++) {
+      Cb.movq(Mm(I), Mm(ShiftStart));
     }
   }
 
   // Initialize AVX-Registers for Transfer-Operations
-  if (trans_regs > 0) {
-    if (trans_start % 2 == 0) {
-      cb.mov(temp_reg, Imm(0x0F0F0F0F0F0F0F0F));
+  if (TransRegs > 0) {
+    if (TransStart % 2 == 0) {
+      Cb.mov(TempReg, Imm(0x0F0F0F0F0F0F0F0F));
     } else {
-      cb.mov(temp_reg, Imm(0xF0F0F0F0F0F0F0F0));
+      Cb.mov(TempReg, Imm(0xF0F0F0F0F0F0F0F0));
     }
-    cb.pinsrq(Xmm(trans_start), temp_reg, Imm(0));
-    cb.pinsrq(Xmm(trans_start), temp_reg, Imm(1));
-    cb.vinsertf128(Ymm(trans_start), Ymm(trans_start), Xmm(trans_start),
-                   Imm(1));
-    for (int i = trans_start + 1; i <= trans_end; i++) {
-      if (i % 2 == 0) {
-        cb.shr(temp_reg, Imm(4));
+    Cb.pinsrq(Xmm(TransStart), TempReg, Imm(0));
+    Cb.pinsrq(Xmm(TransStart), TempReg, Imm(1));
+    Cb.vinsertf128(Ymm(TransStart), Ymm(TransStart), Xmm(TransStart), Imm(1));
+    for (auto I = TransStart + 1; I <= TransEnd; I++) {
+      if (I % 2 == 0) {
+        Cb.shr(TempReg, Imm(4));
       } else {
-        cb.shl(temp_reg, Imm(4));
+        Cb.shl(TempReg, Imm(4));
       }
-      cb.pinsrq(Xmm(i), temp_reg, Imm(0));
-      cb.pinsrq(Xmm(i), temp_reg, Imm(1));
-      cb.vinsertf128(Ymm(i), Ymm(i), Xmm(i), Imm(1));
+      Cb.pinsrq(Xmm(I), TempReg, Imm(0));
+      Cb.pinsrq(Xmm(I), TempReg, Imm(1));
+      Cb.vinsertf128(Ymm(I), Ymm(I), Xmm(I), Imm(1));
     }
   }
 
-  cb.mov(l1_addr, pointer_reg); // address for L1-buffer
-  cb.mov(l2_addr, pointer_reg);
-  cb.add(l2_addr, Imm(l1_size)); // address for L2-buffer
-  cb.mov(l3_addr, pointer_reg);
-  cb.add(l3_addr, Imm(l2_size)); // address for L3-buffer
-  cb.mov(ram_addr, pointer_reg);
-  cb.add(ram_addr, Imm(l3_size)); // address for RAM-buffer
-  cb.mov(l2_count_reg, Imm(l2_loop_count));
-  workerLog::trace() << "reset counter for L2-buffer with "
-                     << l2_loop_count
-                     << " cache line accesses per loop ("
-		     << l2_size/1024
-                     << ") KiB";
-  cb.mov(l3_count_reg, Imm(l3_loop_count));
-  workerLog::trace() << "reset counter for L3-buffer with "
-                     << l3_loop_count
-                     << " cache line accesses per loop ("
-		     << l3_size/1024
-                     << ") KiB";
-  cb.mov(ram_count_reg, Imm(ram_loop_count));
-  workerLog::trace() << "reset counter for RAM-buffer with "
-                     << ram_loop_count
-                     << " cache line accesses per loop ("
-		     << ram_size/1024
-                     << ") KiB";
-
-  cb.align(AlignMode::kCode, 64);
-
-  auto Loop = cb.newLabel();
-  cb.bind(Loop);
-
-  auto left = false;
-  auto shift_dst = shift_start;
-  auto add_dest = add_start + 1;
-  auto mov_dst = trans_start;
-  auto mov_src = mov_dst + 1;
-  unsigned l1_offset = 0;
-
-#define L1_INCREMENT()                                                         \
-  l1_offset += 64;                                                             \
-  if (l1_offset < l1_size * 0.5) {                                             \
-    cb.add(l1_addr, offset_reg);                                               \
-  } else {                                                                     \
-    l1_offset = 0;                                                             \
-    cb.mov(l1_addr, pointer_reg);                                              \
-  }
-
-#define L2_INCREMENT() cb.add(l2_addr, offset_reg);
-
-#define L3_INCREMENT() cb.add(l3_addr, offset_reg)
-
-#define RAM_INCREMENT() cb.add(ram_addr, offset_reg)
-
-  for (unsigned count = 0; count < repetitions; count++) {
-    for (const auto &item : sequence) {
-      if (item == "REG") {
-        cb.vaddpd(
-            Ymm(add_dest), Ymm(add_dest),
-            Ymm(add_start + (add_dest - add_start + add_regs + 1) % add_regs));
-        cb.vmovdqa(Ymm(mov_dst), Ymm(mov_src));
-      } else if (item == "L1_L") {
-        cb.vaddpd(Ymm(add_dest), Ymm(add_dest), ymmword_ptr(l1_addr, 32));
-        L1_INCREMENT();
-      } else if (item == "L1_S") {
-        cb.vaddpd(
-            Ymm(add_dest), Ymm(add_dest),
-            Ymm(add_start + (add_dest - add_start + add_regs - 1) % add_regs));
-        cb.vmovapd(xmmword_ptr(l1_addr, 32), Xmm(add_dest));
-        L1_INCREMENT();
-        this->_instructions++;
-      } else if (item == "L1_LS") {
-        cb.vaddpd(Ymm(add_dest), Ymm(add_dest), ymmword_ptr(l1_addr, 32));
-        cb.vmovapd(xmmword_ptr(l1_addr, 64), Xmm(add_dest));
-        L1_INCREMENT();
-        this->_instructions++;
-      } else if (item == "L2_L") {
-        cb.vaddpd(Ymm(add_dest), Ymm(add_dest), ymmword_ptr(l2_addr, 64));
-        L2_INCREMENT();
-      } else if (item == "L2_S") {
-        cb.vaddpd(
-            Ymm(add_dest), Ymm(add_dest),
-            Ymm(add_start + (add_dest - add_start + add_regs - 1) % add_regs));
-        cb.vmovapd(xmmword_ptr(l2_addr, 64), Xmm(add_dest));
-        L2_INCREMENT();
-        this->_instructions++;
-      } else if (item == "L2_LS") {
-        cb.vaddpd(Ymm(add_dest), Ymm(add_dest), ymmword_ptr(l2_addr, 64));
-        cb.vmovapd(xmmword_ptr(l2_addr, 96), Xmm(add_dest));
-        L2_INCREMENT();
-        this->_instructions++;
-      } else if (item == "L3_L") {
-        cb.vaddpd(Ymm(add_dest), Ymm(add_dest), ymmword_ptr(l3_addr, 64));
-        L3_INCREMENT();
-      } else if (item == "L3_S") {
-        cb.vaddpd(
-            Ymm(add_dest), Ymm(add_dest),
-            Ymm(add_start + (add_dest - add_start + add_regs - 1) % add_regs));
-        cb.vmovapd(xmmword_ptr(l3_addr, 96), Xmm(add_dest));
-        L3_INCREMENT();
-        this->_instructions++;
-      } else if (item == "L3_LS") {
-        cb.vaddpd(Ymm(add_dest), Ymm(add_dest), ymmword_ptr(l3_addr, 64));
-        cb.vmovapd(xmmword_ptr(l3_addr, 96), Xmm(add_dest));
-        L3_INCREMENT();
-        this->_instructions++;
-      } else if (item == "L3_P") {
-        cb.vaddpd(Ymm(add_dest), Ymm(add_dest), ymmword_ptr(l1_addr, 32));
-        cb.prefetcht0(ptr(l3_addr));
-        L3_INCREMENT();
-        this->_instructions++;
-      } else if (item == "RAM_L") {
-        cb.vaddpd(Ymm(add_dest), Ymm(add_dest), ymmword_ptr(ram_addr, 64));
-        RAM_INCREMENT();
-      } else if (item == "RAM_S") {
-        cb.vaddpd(
-            Ymm(add_dest), Ymm(add_dest),
-            Ymm(add_start + (add_dest - add_start + add_regs - 1) % add_regs));
-        cb.vmovapd(xmmword_ptr(ram_addr, 64), Xmm(add_dest));
-        RAM_INCREMENT();
-        this->_instructions++;
-      } else if (item == "RAM_LS") {
-        cb.vaddpd(Ymm(add_dest), Ymm(add_dest), ymmword_ptr(l3_addr, 64));
-        cb.vmovapd(xmmword_ptr(ram_addr, 64), Xmm(add_dest));
-        RAM_INCREMENT();
-        this->_instructions++;
-      } else if (item == "RAM_P") {
-        cb.vaddpd(Ymm(add_dest), Ymm(add_dest), ymmword_ptr(l1_addr, 32));
-        cb.prefetcht2(ptr(ram_addr));
-        RAM_INCREMENT();
-        this->_instructions++;
+  Cb.mov(L1Addr, PointerReg); // address for L1-buffer
+  Cb.mov(L2Addr, PointerReg);
+  Cb.add(L2Addr, Imm(L1Size)); // address for L2-buffer
+  Cb.mov(L3Addr, PointerReg);
+  Cb.add(L3Addr, Imm(L2Size)); // address for L3-buffer
+  Cb.mov(RamAddr, PointerReg);
+  Cb.add(RamAddr, Imm(L3Size)); // address for RAM-buffer
+  Cb.mov(L2CountReg, Imm(L2LoopCount));
+  workerLog::trace() << "reset counter for L2-buffer with " << L2LoopCount << " cache line accesses per loop ("
+                     << L2Size / 1024 << ") KiB";
+  Cb.mov(L3CountReg, Imm(L3LoopCount));
+  workerLog::trace() << "reset counter for L3-buffer with " << L3LoopCount << " cache line accesses per loop ("
+                     << L3Size / 1024 << ") KiB";
+  Cb.mov(RamCountReg, Imm(RamLoopCount));
+  workerLog::trace() << "reset counter for RAM-buffer with " << RamLoopCount << " cache line accesses per loop ("
+                     << RamSize / 1024 << ") KiB";
+
+  Cb.align(asmjit::AlignMode::kCode, 64);
+
+  auto Loop = Cb.newLabel();
+  Cb.bind(Loop);
+
+  auto Left = false;
+  auto ShiftDest = ShiftStart;
+  auto AddDest = AddStart + 1;
+  auto MovDest = TransStart;
+  auto MovSrc = MovDest + 1;
+  unsigned L1Offset = 0;
+
+  const auto L1Increment = [&Cb, &L1Offset, &L1Size, &L1Addr, &OffsetReg, &PointerReg]() { // emits the L1 step
+    L1Offset += 64; // host-side counter, updated while generating code
+    if (L1Offset < L1Size * 0.5) { // counter still below half of L1Size
+      Cb.add(L1Addr, OffsetReg); // emit: L1Addr += OffsetReg
+    } else {
+      L1Offset = 0; // wrap: restart the host-side counter
+      Cb.mov(L1Addr, PointerReg); // emit: L1Addr = PointerReg
+    }
+  };
+  const auto L2Increment = [&Cb, &L2Addr, &OffsetReg]() { Cb.add(L2Addr, OffsetReg); }; // emit only, no wrap
+  const auto L3Increment = [&Cb, &L3Addr, &OffsetReg]() { Cb.add(L3Addr, OffsetReg); }; // emit only, no wrap
+  const auto RamIncrement = [&Cb, &RamAddr, &OffsetReg]() { Cb.add(RamAddr, OffsetReg); }; // emit only, no wrap
+
+  for (auto Count = 0U; Count < Repetitions; Count++) {
+    for (const auto& Item : Sequence) {
+      if (Item == "REG") {
+        Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), Ymm(AddStart + ((AddDest - AddStart + AddRegs + 1) % AddRegs)));
+        Cb.vmovdqa(Ymm(MovDest), Ymm(MovSrc));
+      } else if (Item == "L1_L") {
+        Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), ymmword_ptr(L1Addr, 32));
+        L1Increment();
+      } else if (Item == "L1_S") {
+        Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), Ymm(AddStart + ((AddDest - AddStart + AddRegs - 1) % AddRegs)));
+        Cb.vmovapd(xmmword_ptr(L1Addr, 32), Xmm(AddDest));
+        L1Increment();
+        Stats.Instructions++;
+      } else if (Item == "L1_LS") {
+        Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), ymmword_ptr(L1Addr, 32));
+        Cb.vmovapd(xmmword_ptr(L1Addr, 64), Xmm(AddDest));
+        L1Increment();
+        Stats.Instructions++;
+      } else if (Item == "L2_L") {
+        Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), ymmword_ptr(L2Addr, 64));
+        L2Increment();
+      } else if (Item == "L2_S") {
+        Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), Ymm(AddStart + ((AddDest - AddStart + AddRegs - 1) % AddRegs)));
+        Cb.vmovapd(xmmword_ptr(L2Addr, 64), Xmm(AddDest));
+        L2Increment();
+        Stats.Instructions++;
+      } else if (Item == "L2_LS") {
+        Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), ymmword_ptr(L2Addr, 64));
+        Cb.vmovapd(xmmword_ptr(L2Addr, 96), Xmm(AddDest));
+        L2Increment();
+        Stats.Instructions++;
+      } else if (Item == "L3_L") {
+        Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), ymmword_ptr(L3Addr, 64));
+        L3Increment();
+      } else if (Item == "L3_S") {
+        Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), Ymm(AddStart + ((AddDest - AddStart + AddRegs - 1) % AddRegs)));
+        Cb.vmovapd(xmmword_ptr(L3Addr, 96), Xmm(AddDest));
+        L3Increment();
+        Stats.Instructions++;
+      } else if (Item == "L3_LS") {
+        Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), ymmword_ptr(L3Addr, 64));
+        Cb.vmovapd(xmmword_ptr(L3Addr, 96), Xmm(AddDest));
+        L3Increment();
+        Stats.Instructions++;
+      } else if (Item == "L3_P") {
+        Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), ymmword_ptr(L1Addr, 32));
+        Cb.prefetcht0(ptr(L3Addr));
+        L3Increment();
+        Stats.Instructions++;
+      } else if (Item == "RAM_L") {
+        Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), ymmword_ptr(RamAddr, 64));
+        RamIncrement();
+      } else if (Item == "RAM_S") {
+        Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), Ymm(AddStart + ((AddDest - AddStart + AddRegs - 1) % AddRegs)));
+        Cb.vmovapd(xmmword_ptr(RamAddr, 64), Xmm(AddDest));
+        RamIncrement();
+        Stats.Instructions++;
+      } else if (Item == "RAM_LS") {
+        Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), ymmword_ptr(L3Addr, 64));
+        Cb.vmovapd(xmmword_ptr(RamAddr, 64), Xmm(AddDest));
+        RamIncrement();
+        Stats.Instructions++;
+      } else if (Item == "RAM_P") {
+        Cb.vaddpd(Ymm(AddDest), Ymm(AddDest), ymmword_ptr(L1Addr, 32));
+        Cb.prefetcht2(ptr(RamAddr));
+        RamIncrement();
+        Stats.Instructions++;
       } else {
-        workerLog::error() << "Instruction group " << item << " not found in "
-                           << this->name() << ".";
-        return EXIT_FAILURE;
+        workerLog::error() << "Instruction group " << Item << " not found in " << name() << ".";
       }
 
-      if (shift_regs > 1) {
-        this->_instructions++;
-        if (left) {
-          cb.psrlw(Mm(shift_start + (shift_dst - shift_start + 3) % shift_regs),
-                   Mm(shift_dst));
+      if (ShiftRegs > 1) {
+        Stats.Instructions++;
+        if (Left) {
+          Cb.psrlw(Mm(ShiftStart + ((ShiftDest - ShiftStart + 3) % ShiftRegs)), Mm(ShiftDest));
         } else {
-          cb.psllw(Mm(shift_start + (shift_dst - shift_start + 3) % shift_regs),
-                   Mm(shift_dst));
+          Cb.psllw(Mm(ShiftStart + ((ShiftDest - ShiftStart + 3) % ShiftRegs)), Mm(ShiftDest));
         }
       }
 
-      add_dest++;
-      if (add_dest > add_end) {
+      AddDest++;
+      if (AddDest > AddEnd) {
         // DO NOT REMOVE the + 1. It serves for the good of ymm0. If it was to
         // be overriden, the values in the other registers would rise up to inf.
-        add_dest = add_start + 1;
+        AddDest = AddStart + 1;
       }
-      mov_dst++;
-      if (mov_dst > trans_end) {
-        mov_dst = trans_start;
+      MovDest++;
+      if (MovDest > TransEnd) {
+        MovDest = TransStart;
       }
-      mov_src++;
-      if (mov_src > trans_end) {
-        mov_src = trans_start;
+      MovSrc++;
+      if (MovSrc > TransEnd) {
+        MovSrc = TransStart;
       }
-      if (shift_regs > 1) {
-        shift_dst++;
-        if (shift_dst > shift_end) {
-          shift_dst = shift_start;
-          left = !left;
+      if (ShiftRegs > 1) {
+        ShiftDest++;
+        if (ShiftDest > ShiftEnd) {
+          ShiftDest = ShiftStart;
+          Left = !Left;
         }
       }
     }
   }
 
-  if (this->getRAMSequenceCount(sequence) > 0) {
+  if (environment::payload::PayloadSettings::getRAMSequenceCount(Sequence) > 0) {
     // reset RAM counter
-    auto NoRamReset = cb.newLabel();
-
-    cb.sub(ram_count_reg, Imm(1));
-    cb.jnz(NoRamReset);
-    cb.mov(ram_count_reg, Imm(ram_loop_count));
-    cb.mov(ram_addr, pointer_reg);
-    cb.add(ram_addr, Imm(l3_size));
-    cb.bind(NoRamReset);
+    auto NoRamReset = Cb.newLabel();
+
+    Cb.sub(RamCountReg, Imm(1));
+    Cb.jnz(NoRamReset);
+    Cb.mov(RamCountReg, Imm(RamLoopCount));
+    Cb.mov(RamAddr, PointerReg);
+    Cb.add(RamAddr, Imm(L3Size));
+    Cb.bind(NoRamReset);
     // adds always two instruction
-    this->_instructions += 2;
+    Stats.Instructions += 2;
   }
-  if (this->getL2SequenceCount(sequence) > 0) {
+  if (environment::payload::PayloadSettings::getL2SequenceCount(Sequence) > 0) {
     // reset L2-Cache counter
-    auto NoL2Reset = cb.newLabel();
-
-    cb.sub(l2_count_reg, Imm(1));
-    cb.jnz(NoL2Reset);
-    cb.mov(l2_count_reg, Imm(l2_loop_count));
-    cb.mov(l2_addr, pointer_reg);
-    cb.add(l2_addr, Imm(l1_size));
-    cb.bind(NoL2Reset);
+    auto NoL2Reset = Cb.newLabel();
+
+    Cb.sub(L2CountReg, Imm(1));
+    Cb.jnz(NoL2Reset);
+    Cb.mov(L2CountReg, Imm(L2LoopCount));
+    Cb.mov(L2Addr, PointerReg);
+    Cb.add(L2Addr, Imm(L1Size));
+    Cb.bind(NoL2Reset);
     // adds always two instruction
-    this->_instructions += 2;
+    Stats.Instructions += 2;
   }
-  if (this->getL3SequenceCount(sequence) > 0) {
+  if (environment::payload::PayloadSettings::getL3SequenceCount(Sequence) > 0) {
     // reset L3-Cache counter
-    auto NoL3Reset = cb.newLabel();
-
-    cb.sub(l3_count_reg, Imm(1));
-    cb.jnz(NoL3Reset);
-    cb.mov(l3_count_reg, Imm(l3_loop_count));
-    cb.mov(l3_addr, pointer_reg);
-    cb.add(l3_addr, Imm(l2_size));
-    cb.bind(NoL3Reset);
+    auto NoL3Reset = Cb.newLabel();
+
+    Cb.sub(L3CountReg, Imm(1));
+    Cb.jnz(NoL3Reset);
+    Cb.mov(L3CountReg, Imm(L3LoopCount));
+    Cb.mov(L3Addr, PointerReg);
+    Cb.add(L3Addr, Imm(L2Size));
+    Cb.bind(NoL3Reset);
     // adds always two instruction
-    this->_instructions += 2;
+    Stats.Instructions += 2;
   }
-  cb.inc(iter_reg); // increment iteration counter
-  cb.mov(l1_addr, pointer_reg);
-
-  if (dumpRegisters) {
-    auto SkipRegistersDump = cb.newLabel();
+  Cb.inc(IterReg); // increment iteration counter
+  Cb.mov(L1Addr, PointerReg);
 
-    cb.test(ptr_64(pointer_reg, -8), Imm(firestarter::DumpVariable::Wait));
-    cb.jnz(SkipRegistersDump);
-
-    // dump all the ymm register
-    for (int i = 0; i < (int)this->registerCount(); i++) {
-      cb.vmovapd(
-          ymmword_ptr(pointer_reg, -64 - this->registerSize() * 8 * (i + 1)),
-          Ymm(i));
-    }
-
-    // set read flag
-    cb.mov(ptr_64(pointer_reg, -8), Imm(firestarter::DumpVariable::Wait));
-
-    cb.bind(SkipRegistersDump);
+  if (DumpRegisters) {
+    emitDumpRegisterCode<Ymm>(Cb, PointerReg, asmjit::x86::ymmword_ptr);
   }
 
-  if (errorDetection) {
-    this->emitErrorDetectionCode<decltype(iter_reg), Ymm>(
-        cb, iter_reg, addrHigh_reg, pointer_reg, temp_reg, temp_reg2);
+  if (ErrorDetection) {
+    emitErrorDetectionCode<decltype(IterReg), Ymm>(Cb, IterReg, AddrHighReg, PointerReg, TempReg, TempReg2);
   }
 
-  cb.test(ptr_64(addrHigh_reg), Imm(LOAD_HIGH));
-  cb.jnz(Loop);
+  Cb.test(ptr_64(AddrHighReg), Imm(LoadThreadWorkType::LoadHigh));
+  Cb.jnz(Loop);
 
-  cb.bind(FunctionExit);
+  Cb.bind(FunctionExit);
 
-  cb.mov(rax, iter_reg); // restore iteration counter
+  Cb.mov(asmjit::x86::rax, IterReg); // restore iteration counter
 
-  cb.emitEpilog(frame);
+  Cb.emitEpilog(Frame);
 
-  cb.finalize();
+  Cb.finalize();
 
-  // String sb;
-  // cb.dump(sb);
-
-  Error err = this->rt.add(&this->loadFunction, &code);
-  if (err) {
-    workerLog::error() << "Asmjit adding Assembler to JitRuntime failed in "
-                       << __FILE__ << " at " << __LINE__;
-    return EXIT_FAILURE;
-  }
+  auto CompiledPayloadPtr = CompiledX86Payload::create<AVXPayload>(Stats, Code);
 
   // skip if we could not determine cache size
-  if (l1i_cache_size != 0) {
-    auto loopSize = code.labelOffset(FunctionExit) - code.labelOffset(Loop);
-    auto instructionCachePercentage = 100 * loopSize / l1i_cache_size;
+  if (L1iCacheSize) {
+    auto LoopSize = Code.labelOffset(FunctionExit) - Code.labelOffset(Loop);
+    auto InstructionCachePercentage = 100 * LoopSize / *L1iCacheSize;
 
-    if (loopSize > l1i_cache_size) {
+    if (LoopSize > *L1iCacheSize) {
       workerLog::warn() << "Work-loop is bigger than the L1i-Cache.";
     }
 
-    workerLog::trace() << "Using " << loopSize << " of " << l1i_cache_size
-                       << " Bytes (" << instructionCachePercentage
+    workerLog::trace() << "Using " << LoopSize << " of " << *L1iCacheSize << " Bytes (" << InstructionCachePercentage
                        << "%) from the L1i-Cache for the work-loop.";
-    workerLog::trace() << "Sequence size: " << sequence.size();
-    workerLog::trace() << "Repetition count: " << repetitions;
+    workerLog::trace() << "Sequence size: " << Sequence.size();
+    workerLog::trace() << "Repetition count: " << Repetitions;
   }
 
-  return EXIT_SUCCESS;
+  return CompiledPayloadPtr;
 }
 
-std::list<std::string> AVXPayload::getAvailableInstructions() const {
-  std::list<std::string> instructions;
-
-  transform(this->instructionFlops.begin(), this->instructionFlops.end(),
-            back_inserter(instructions),
-            [](const auto &item) { return item.first; });
-
-  return instructions;
+void AVXPayload::init(double* MemoryAddr, uint64_t BufferSize) const { // delegates to X86Payload::initMemory
+  X86Payload::initMemory(MemoryAddr, BufferSize, 1.654738925401e-10, 1.654738925401e-15);
 }
 
-void AVXPayload::init(unsigned long long *memoryAddr,
-                      unsigned long long bufferSize) {
-  X86Payload::init(memoryAddr, bufferSize, 1.654738925401e-10,
-                   1.654738925401e-15);
-}
+} // namespace firestarter::environment::x86::payload
\ No newline at end of file
diff --git a/src/firestarter/Environment/X86/Payload/FMA4Payload.cpp b/src/firestarter/Environment/X86/Payload/FMA4Payload.cpp
index 32e81752..202d34c7 100644
--- a/src/firestarter/Environment/X86/Payload/FMA4Payload.cpp
+++ b/src/firestarter/Environment/X86/Payload/FMA4Payload.cpp
@@ -19,459 +19,376 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
-#include <firestarter/Environment/X86/Payload/FMA4Payload.hpp>
-#include <firestarter/Logging/Log.hpp>
-
-#include <iterator>
-#include <utility>
-
-using namespace firestarter::environment::x86::payload;
-using namespace asmjit;
-using namespace asmjit::x86;
-
-int FMA4Payload::compilePayload(
-    std::vector<std::pair<std::string, unsigned>> const &proportion,
-    unsigned instructionCacheSize,
-    std::list<unsigned> const &dataCacheBufferSize, unsigned ramBufferSize,
-    unsigned thread, unsigned numberOfLines, bool dumpRegisters,
-    bool errorDetection) {
+#include "firestarter/Environment/X86/Payload/FMA4Payload.hpp"
+#include "firestarter/Environment/X86/Payload/CompiledX86Payload.hpp"
+
+namespace firestarter::environment::x86::payload {
+
+auto FMA4Payload::compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters,
+                                 bool ErrorDetection) const -> environment::payload::CompiledPayload::UniquePtr {
+  using Imm = asmjit::Imm;
+  using Xmm = asmjit::x86::Xmm;
+  // NOLINTBEGIN(readability-identifier-naming)
+  constexpr asmjit::x86::Mem (*xmmword_ptr)(const asmjit::x86::Gp&, int32_t) = asmjit::x86::xmmword_ptr;
+  constexpr auto xmm0 = asmjit::x86::xmm0;
+  constexpr auto xmm1 = asmjit::x86::xmm1;
+  // NOLINTEND(readability-identifier-naming)
+
   // Compute the sequence of instruction groups and the number of its repetions
   // to reach the desired size
-  auto sequence = this->generateSequence(proportion);
-  auto repetitions =
-      this->getNumberOfSequenceRepetitions(sequence, numberOfLines / thread);
+  auto Sequence = Settings.sequence();
+  auto Repetitions =
+      environment::payload::PayloadSettings::getNumberOfSequenceRepetitions(Sequence, Settings.linesPerThread());
 
   // compute count of flops and memory access for performance report
-  unsigned flops = 0;
-  unsigned bytes = 0;
+  environment::payload::PayloadStats Stats;
 
-  for (const auto &item : sequence) {
-    auto it = this->instructionFlops.find(item);
+  for (const auto& Item : Sequence) {
+    auto It = instructionFlops().find(Item);
 
-    if (it == this->instructionFlops.end()) {
-      workerLog::error() << "Instruction group " << item << " undefined in "
-                         << name() << ".";
-      return EXIT_FAILURE;
+    if (It == instructionFlops().end()) {
+      workerLog::error() << "Instruction group " << Item << " undefined in " << name() << ".";
     }
 
-    flops += it->second;
+    Stats.Flops += It->second;
 
-    it = this->instructionMemory.find(item);
+    It = instructionMemory().find(Item);
 
-    if (it != this->instructionMemory.end()) {
-      bytes += it->second;
+    if (It != instructionMemory().end()) {
+      Stats.Bytes += It->second;
     }
   }
 
-  this->_flops = repetitions * flops;
-  this->_bytes = repetitions * bytes;
-  this->_instructions = repetitions * sequence.size() * 4 + 6;
+  Stats.Flops *= Repetitions;
+  Stats.Bytes *= Repetitions;
+  Stats.Instructions = Repetitions * Sequence.size() * 4 + 6;
 
   // calculate the buffer sizes
-  auto l1i_cache_size = instructionCacheSize / thread;
-  auto dataCacheBufferSizeIterator = dataCacheBufferSize.begin();
-  auto l1_size = *dataCacheBufferSizeIterator / thread;
-  std::advance(dataCacheBufferSizeIterator, 1);
-  auto l2_size = *dataCacheBufferSizeIterator / thread;
-  std::advance(dataCacheBufferSizeIterator, 1);
-  auto l3_size = *dataCacheBufferSizeIterator / thread;
-  auto ram_size = ramBufferSize / thread;
+  const auto L1iCacheSize = Settings.instructionCacheSizePerThread();
+  const auto DataCacheBufferSizes = Settings.dataCacheBufferSizePerThread();
+  auto DataCacheBufferSizeIterator = DataCacheBufferSizes.begin();
+  const auto L1Size = *DataCacheBufferSizeIterator;
+  std::advance(DataCacheBufferSizeIterator, 1);
+  const auto L2Size = *DataCacheBufferSizeIterator;
+  std::advance(DataCacheBufferSizeIterator, 1);
+  const auto L3Size = *DataCacheBufferSizeIterator;
+  const auto RamSize = Settings.ramBufferSizePerThread();
 
   // calculate the reset counters for the buffers
-  auto l2_loop_count =
-      getL2LoopCount(sequence, numberOfLines, l2_size * thread, thread);
-  auto l3_loop_count =
-      getL3LoopCount(sequence, numberOfLines, l3_size * thread, thread);
-  auto ram_loop_count =
-      getRAMLoopCount(sequence, numberOfLines, ram_size * thread, thread);
-
-  CodeHolder code;
-  code.init(this->rt.environment());
-
-  if (nullptr != this->loadFunction) {
-    this->rt.release(&this->loadFunction);
-  }
-
-  Builder cb(&code);
-  cb.addDiagnosticOptions(
-    asmjit::DiagnosticOptions::kValidateAssembler | 
-    asmjit::DiagnosticOptions::kValidateIntermediate );
-
-  auto pointer_reg = rax;
-  auto l1_addr = rbx;
-  auto l2_addr = rcx;
-  auto l3_addr = r8;
-  auto ram_addr = r9;
-  auto l2_count_reg = r10;
-  auto l3_count_reg = r11;
-  auto ram_count_reg = r12;
-  auto temp_reg = r13;
-  auto temp_reg2 = rbp;
-  auto offset_reg = r14;
-  auto addrHigh_reg = r15;
-  auto iter_reg = mm0;
-  auto shift_reg = std::vector<Gp>({rdi, rsi, rdx});
-  auto shift_reg32 = std::vector<Gp>({edi, esi, edx});
-  auto nr_shift_regs = 3;
-  auto mul_regs = 2;
-  auto add_regs = 9;
-  auto alt_dst_regs = 3;
-  auto ram_reg = xmm15;
-
-  FuncDetail func;
-  func.init(FuncSignatureT<unsigned long long, unsigned long long *,
-                           volatile unsigned long long *, unsigned long long>(
-                CallConvId::kCDecl),
-            this->rt.environment());
-
-  FuncFrame frame;
-  frame.init(func);
+  const auto L2LoopCount =
+      environment::payload::PayloadSettings::getL2LoopCount(Sequence, Settings.linesPerThread(), L2Size);
+  const auto L3LoopCount =
+      environment::payload::PayloadSettings::getL3LoopCount(Sequence, Settings.linesPerThread(), L3Size);
+  const auto RamLoopCount =
+      environment::payload::PayloadSettings::getRAMLoopCount(Sequence, Settings.linesPerThread(), RamSize);
+
+  asmjit::CodeHolder Code;
+  Code.init(asmjit::Environment::host());
+
+  asmjit::x86::Builder Cb(&Code);
+  Cb.addDiagnosticOptions(asmjit::DiagnosticOptions::kValidateAssembler |
+                          asmjit::DiagnosticOptions::kValidateIntermediate);
+
+  const auto PointerReg = asmjit::x86::rax;
+  const auto L1Addr = asmjit::x86::rbx;
+  const auto L2Addr = asmjit::x86::rcx;
+  const auto L3Addr = asmjit::x86::r8;
+  const auto RamAddr = asmjit::x86::r9;
+  const auto L2CountReg = asmjit::x86::r10;
+  const auto L3CountReg = asmjit::x86::r11;
+  const auto RamCountReg = asmjit::x86::r12;
+  const auto TempReg = asmjit::x86::r13;
+  const auto TempReg2 = asmjit::x86::rbp;
+  const auto OffsetReg = asmjit::x86::r14;
+  const auto AddrHighReg = asmjit::x86::r15;
+  const auto IterReg = asmjit::x86::mm0;
+  const auto ShiftReg = std::vector<asmjit::x86::Gp>({asmjit::x86::rdi, asmjit::x86::rsi, asmjit::x86::rdx});
+  const auto ShiftReg32 = std::vector<asmjit::x86::Gp>({asmjit::x86::edi, asmjit::x86::esi, asmjit::x86::edx});
+  const auto NbShiftRegs = 3;
+  const auto MulRegs = 2;
+  const auto AddRegs = 9;
+  const auto AltDestRegs = 3;
+  const auto RamReg = asmjit::x86::xmm15;
+
+  asmjit::FuncDetail Func;
+  Func.init(asmjit::FuncSignature::build<uint64_t, double*, volatile LoadThreadWorkType*, uint64_t>(
+                asmjit::CallConvId::kCDecl),
+            Code.environment());
+
+  asmjit::FuncFrame Frame;
+  Frame.init(Func);
 
   // make (x|y)mm registers dirty
-  for (int i = 0; i < 16; i++) {
-    frame.addDirtyRegs(Ymm(i));
+  for (auto I = 0; I < 16; I++) {
+    Frame.addDirtyRegs(asmjit::x86::Ymm(I));
   }
-  for (int i = 0; i < 8; i++) {
-    frame.addDirtyRegs(Mm(i));
+  for (auto I = 0; I < 8; I++) {
+    Frame.addDirtyRegs(asmjit::x86::Mm(I));
   }
   // make all other used registers dirty except RAX
-  frame.addDirtyRegs(l1_addr, l2_addr, l3_addr, ram_addr, l2_count_reg,
-                     l3_count_reg, ram_count_reg, temp_reg, temp_reg2,
-                     offset_reg, addrHigh_reg, iter_reg, ram_addr);
-  for (const auto &reg : shift_reg) {
-    frame.addDirtyRegs(reg);
+  Frame.addDirtyRegs(L1Addr, L2Addr, L3Addr, RamAddr, L2CountReg, L3CountReg, RamCountReg, TempReg, TempReg2, OffsetReg,
+                     AddrHighReg, IterReg, RamAddr);
+  for (const auto& Reg : ShiftReg) {
+    Frame.addDirtyRegs(Reg);
   }
 
-  FuncArgsAssignment args(&func);
+  asmjit::FuncArgsAssignment Args(&Func);
   // FIXME: asmjit assigment to mm0 does not seem to be supported
-  args.assignAll(pointer_reg, addrHigh_reg, temp_reg);
-  args.updateFuncFrame(frame);
-  frame.finalize();
+  Args.assignAll(PointerReg, AddrHighReg, TempReg);
+  Args.updateFuncFrame(Frame);
+  Frame.finalize();
 
-  cb.emitProlog(frame);
-  cb.emitArgsAssignment(frame, args);
+  Cb.emitProlog(Frame);
+  Cb.emitArgsAssignment(Frame, Args);
 
   // FIXME: movq from temp_reg to iter_reg
-  cb.movq(iter_reg, temp_reg);
+  Cb.movq(IterReg, TempReg);
 
   // stop right away if low load is selected
-  auto FunctionExit = cb.newLabel();
+  auto FunctionExit = Cb.newLabel();
 
-  cb.mov(temp_reg, ptr_64(addrHigh_reg));
-  cb.test(temp_reg, temp_reg);
-  cb.jz(FunctionExit);
+  Cb.mov(TempReg, ptr_64(AddrHighReg));
+  Cb.test(TempReg, TempReg);
+  Cb.jz(FunctionExit);
 
-  cb.mov(offset_reg,
+  Cb.mov(OffsetReg,
          Imm(64)); // increment after each cache/memory access
   // Initialize registers for shift operations
-  for (auto const &reg : shift_reg32) {
-    cb.mov(reg, Imm(0xAAAAAAAA));
+  for (auto const& Reg : ShiftReg32) {
+    Cb.mov(Reg, Imm(0xAAAAAAAA));
   }
   // Initialize AVX-Registers for FMA4 Operations
-  cb.vmovapd(ymm0, ymmword_ptr(pointer_reg));
-  cb.vmovapd(ymm1, ymmword_ptr(pointer_reg));
-  auto add_start = mul_regs;
-  auto add_end = mul_regs + add_regs - 1;
-  auto trans_start = add_regs + mul_regs;
-  auto trans_end = add_regs + mul_regs + alt_dst_regs - 1;
-  for (int i = add_start; i <= trans_end; i++) {
-    cb.vmovapd(Ymm(i), ymmword_ptr(pointer_reg, 256 + i * 32));
+  Cb.vmovapd(asmjit::x86::ymm0, asmjit::x86::ymmword_ptr(PointerReg));
+  Cb.vmovapd(asmjit::x86::ymm1, asmjit::x86::ymmword_ptr(PointerReg));
+  auto AddStart = MulRegs;
+  auto AddEnd = MulRegs + AddRegs - 1;
+  auto TransStart = AddRegs + MulRegs;
+  auto TransEnd = AddRegs + MulRegs + AltDestRegs - 1;
+  for (auto I = AddStart; I <= TransEnd; I++) {
+    Cb.vmovapd(asmjit::x86::Ymm(I), asmjit::x86::ymmword_ptr(PointerReg, 256 + (I * 32)));
   }
-  cb.mov(l1_addr, pointer_reg); // address for L1-buffer
-  cb.mov(l2_addr, pointer_reg);
-  cb.add(l2_addr, Imm(l1_size)); // address for L2-buffer
-  cb.mov(l3_addr, pointer_reg);
-  cb.add(l3_addr, Imm(l2_size)); // address for L3-buffer
-  cb.mov(ram_addr, pointer_reg);
-  cb.add(ram_addr, Imm(l3_size)); // address for RAM-buffer
-  cb.mov(l2_count_reg, Imm(l2_loop_count));
-  workerLog::trace() << "reset counter for L2-buffer with "
-                     << l2_loop_count
-                     << " cache line accesses per loop ("
-		     << l2_size/1024
-                     << ") KiB";
-  cb.mov(l3_count_reg, Imm(l3_loop_count));
-  workerLog::trace() << "reset counter for L3-buffer with "
-                     << l3_loop_count
-                     << " cache line accesses per loop ("
-		     << l3_size/1024
-                     << ") KiB";
-  cb.mov(ram_count_reg, Imm(ram_loop_count));
-  workerLog::trace() << "reset counter for RAM-buffer with "
-                     << ram_loop_count
-                     << " cache line accesses per loop ("
-		     << ram_size/1024
-                     << ") KiB";
-
-  cb.align(AlignMode::kCode, 64);
-
-  auto Loop = cb.newLabel();
-  cb.bind(Loop);
-
-  auto shift_pos = 0;
-  bool left = false;
-  auto add_dest = add_start + 1;
-  auto mov_dst = trans_start;
-  auto mov_src = mov_dst + 1;
-  unsigned l1_offset = 0;
-
-#define L1_INCREMENT()                                                         \
-  l1_offset += 64;                                                             \
-  if (l1_offset < l1_size * 0.5) {                                             \
-    cb.add(l1_addr, offset_reg);                                               \
-  } else {                                                                     \
-    l1_offset = 0;                                                             \
-    cb.mov(l1_addr, pointer_reg);                                              \
-  }
-
-#define L2_INCREMENT() cb.add(l2_addr, offset_reg);
-
-#define L3_INCREMENT() cb.add(l3_addr, offset_reg)
-
-#define RAM_INCREMENT() cb.add(ram_addr, offset_reg)
-
-  for (unsigned count = 0; count < repetitions; count++) {
-    for (const auto &item : sequence) {
-      if (item == "REG") {
-        cb.vfmaddpd(
-            Xmm(add_dest), Xmm(add_dest), xmm0,
-            Xmm(add_start + (add_dest - add_start + add_regs + 1) % add_regs));
-        cb.vfmaddpd(
-            Xmm(mov_dst), Xmm(mov_dst), xmm1,
-            Xmm(add_start + (add_dest - add_start + add_regs + 2) % add_regs));
-        cb.xor_(shift_reg[(shift_pos + nr_shift_regs - 1) % nr_shift_regs],
-                temp_reg);
-        mov_dst++;
-      } else if (item == "L1_L") {
-        cb.vfmaddpd(
-            Xmm(add_dest), Xmm(add_dest), xmm0,
-            Xmm(add_start + (add_dest - add_start + add_regs + 1) % add_regs));
-        cb.vfmaddpd(Ymm(add_dest), Ymm(add_dest), ymm1,
-                    ymmword_ptr(l1_addr, 32));
-        L1_INCREMENT();
-      } else if (item == "L1_S") {
-        cb.vmovapd(xmmword_ptr(l1_addr, 32), Xmm(add_dest));
-        cb.vfmaddpd(
-            Ymm(add_dest), Ymm(add_dest), ymm0,
-            Ymm(add_start + (add_dest - add_start + add_regs + 1) % add_regs));
-        L1_INCREMENT();
-      } else if (item == "L1_LS") {
-        cb.vmovapd(xmmword_ptr(l1_addr, 64), Xmm(add_dest));
-        cb.vfmaddpd(Ymm(add_dest), Ymm(add_dest), ymm0,
-                    ymmword_ptr(l1_addr, 32));
-        L1_INCREMENT();
-      } else if (item == "L2_L") {
-        cb.vfmaddpd(
-            Xmm(add_dest), Xmm(add_dest), xmm0,
-            Xmm(add_start + (add_dest - add_start + add_regs + 1) % add_regs));
-        cb.vfmaddpd(Xmm(add_dest), Xmm(add_dest), xmm1,
-                    xmmword_ptr(l2_addr, 64));
-        L2_INCREMENT();
-      } else if (item == "L2_S") {
-        cb.vmovapd(xmmword_ptr(l2_addr, 64), Xmm(add_dest));
-        cb.vfmaddpd(
-            Xmm(add_dest), Xmm(add_dest), xmm0,
-            Xmm(add_start + (add_dest - add_start + add_regs + 1) % add_regs));
-        L2_INCREMENT();
-      } else if (item == "L2_LS") {
-        cb.vmovapd(xmmword_ptr(l2_addr, 96), Xmm(add_dest));
-        cb.vfmaddpd(Xmm(add_dest), Xmm(add_dest), xmm0,
-                    xmmword_ptr(l2_addr, 64));
-        L2_INCREMENT();
-      } else if (item == "L3_L") {
-        cb.vfmaddpd(
-            Xmm(add_dest), Xmm(add_dest), xmm0,
-            Xmm(add_start + (add_dest - add_start + add_regs + 1) % add_regs));
-        cb.vfmaddpd(Xmm(add_dest), Xmm(add_dest), xmm1,
-                    xmmword_ptr(l3_addr, 64));
-        L3_INCREMENT();
-      } else if (item == "L3_S") {
-        cb.vmovapd(xmmword_ptr(l3_addr, 96), Xmm(add_dest));
-        cb.vfmaddpd(
-            Xmm(add_dest), Xmm(add_dest), xmm0,
-            Xmm(add_start + (add_dest - add_start + add_regs + 1) % add_regs));
-        L3_INCREMENT();
-      } else if (item == "L3_LS") {
-        cb.vmovapd(xmmword_ptr(l3_addr, 96), Xmm(add_dest));
-        cb.vfmaddpd(Xmm(add_dest), Xmm(add_dest), xmm0,
-                    xmmword_ptr(l3_addr, 64));
-        L3_INCREMENT();
-      } else if (item == "L3_P") {
-        cb.vfmaddpd(Xmm(add_dest), Xmm(add_dest), xmm0,
-                    xmmword_ptr(l1_addr, 32));
-        cb.prefetcht2(ptr(l3_addr));
-        L3_INCREMENT();
-      } else if (item == "RAM_L") {
-        cb.vfmaddpd(
-            Xmm(add_dest), Xmm(add_dest), xmm0,
-            Xmm(add_start + (add_dest - add_start + add_regs + 1) % add_regs));
-        cb.vfmaddpd(ram_reg, ram_reg, xmm1, xmmword_ptr(ram_addr, 64));
-        RAM_INCREMENT();
-      } else if (item == "RAM_S") {
-        cb.vmovapd(xmmword_ptr(ram_addr, 64), Xmm(add_dest));
-        cb.vfmaddpd(
-            Xmm(add_dest), Xmm(add_dest), xmm0,
-            Xmm(add_start + (add_dest - add_start + add_regs + 1) % add_regs));
-        RAM_INCREMENT();
-      } else if (item == "RAM_LS") {
-        cb.vmovapd(xmmword_ptr(ram_addr, 64), Xmm(add_dest));
-        cb.vfmaddpd(Xmm(add_dest), Xmm(add_dest), xmm0,
-                    xmmword_ptr(ram_addr, 32));
-        RAM_INCREMENT();
-      } else if (item == "RAM_P") {
-        cb.vfmaddpd(Xmm(add_dest), Xmm(add_dest), xmm0,
-                    xmmword_ptr(l1_addr, 32));
-        cb.prefetcht2(ptr(ram_addr));
-        RAM_INCREMENT();
+  Cb.mov(L1Addr, PointerReg); // address for L1-buffer
+  Cb.mov(L2Addr, PointerReg);
+  Cb.add(L2Addr, Imm(L1Size)); // address for L2-buffer
+  Cb.mov(L3Addr, PointerReg);
+  Cb.add(L3Addr, Imm(L2Size)); // address for L3-buffer
+  Cb.mov(RamAddr, PointerReg);
+  Cb.add(RamAddr, Imm(L3Size)); // address for RAM-buffer
+  Cb.mov(L2CountReg, Imm(L2LoopCount));
+  workerLog::trace() << "reset counter for L2-buffer with " << L2LoopCount << " cache line accesses per loop ("
+                     << L2Size / 1024 << ") KiB";
+  Cb.mov(L3CountReg, Imm(L3LoopCount));
+  workerLog::trace() << "reset counter for L3-buffer with " << L3LoopCount << " cache line accesses per loop ("
+                     << L3Size / 1024 << ") KiB";
+  Cb.mov(RamCountReg, Imm(RamLoopCount));
+  workerLog::trace() << "reset counter for RAM-buffer with " << RamLoopCount << " cache line accesses per loop ("
+                     << RamSize / 1024 << ") KiB";
+
+  Cb.align(asmjit::AlignMode::kCode, 64);
+
+  auto Loop = Cb.newLabel();
+  Cb.bind(Loop);
+
+  auto ShiftPos = 0;
+  bool Left = false;
+  auto AddDest = AddStart + 1;
+  auto MovDest = TransStart;
+  auto MovSrc = MovDest + 1;
+  unsigned L1Offset = 0;
+
+  const auto L1Increment = [&Cb, &L1Offset, &L1Size, &L1Addr, &OffsetReg, &PointerReg]() {
+    L1Offset += 64;
+    if (L1Offset < L1Size * 0.5) {
+      Cb.add(L1Addr, OffsetReg);
+    } else {
+      L1Offset = 0;
+      Cb.mov(L1Addr, PointerReg);
+    }
+  };
+  const auto L2Increment = [&Cb, &L2Addr, &OffsetReg]() { Cb.add(L2Addr, OffsetReg); };
+  const auto L3Increment = [&Cb, &L3Addr, &OffsetReg]() { Cb.add(L3Addr, OffsetReg); };
+  const auto RamIncrement = [&Cb, &RamAddr, &OffsetReg]() { Cb.add(RamAddr, OffsetReg); };
+
+  for (auto Count = 0U; Count < Repetitions; Count++) {
+    for (const auto& Item : Sequence) {
+      if (Item == "REG") {
+        Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm0, Xmm(AddStart + ((AddDest - AddStart + AddRegs + 1) % AddRegs)));
+        Cb.vfmaddpd(Xmm(MovDest), Xmm(MovDest), xmm1, Xmm(AddStart + ((AddDest - AddStart + AddRegs + 2) % AddRegs)));
+        Cb.xor_(ShiftReg[(ShiftPos + NbShiftRegs - 1) % NbShiftRegs], TempReg);
+        MovDest++;
+      } else if (Item == "L1_L") {
+        Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm0, Xmm(AddStart + ((AddDest - AddStart + AddRegs + 1) % AddRegs)));
+        Cb.vfmaddpd(asmjit::x86::Ymm(AddDest), asmjit::x86::Ymm(AddDest), asmjit::x86::ymm1,
+                    asmjit::x86::ymmword_ptr(L1Addr, 32));
+        L1Increment();
+      } else if (Item == "L1_S") {
+        Cb.vmovapd(xmmword_ptr(L1Addr, 32), Xmm(AddDest));
+        Cb.vfmaddpd(asmjit::x86::Ymm(AddDest), asmjit::x86::Ymm(AddDest), asmjit::x86::ymm0,
+                    asmjit::x86::Ymm(AddStart + ((AddDest - AddStart + AddRegs + 1) % AddRegs)));
+        L1Increment();
+      } else if (Item == "L1_LS") {
+        Cb.vmovapd(xmmword_ptr(L1Addr, 64), Xmm(AddDest));
+        Cb.vfmaddpd(asmjit::x86::Ymm(AddDest), asmjit::x86::Ymm(AddDest), asmjit::x86::ymm0,
+                    asmjit::x86::ymmword_ptr(L1Addr, 32));
+        L1Increment();
+      } else if (Item == "L2_L") {
+        Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm0, Xmm(AddStart + ((AddDest - AddStart + AddRegs + 1) % AddRegs)));
+        Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm1, xmmword_ptr(L2Addr, 64));
+        L2Increment();
+      } else if (Item == "L2_S") {
+        Cb.vmovapd(xmmword_ptr(L2Addr, 64), Xmm(AddDest));
+        Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm0, Xmm(AddStart + ((AddDest - AddStart + AddRegs + 1) % AddRegs)));
+        L2Increment();
+      } else if (Item == "L2_LS") {
+        Cb.vmovapd(xmmword_ptr(L2Addr, 96), Xmm(AddDest));
+        Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm0, xmmword_ptr(L2Addr, 64));
+        L2Increment();
+      } else if (Item == "L3_L") {
+        Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm0, Xmm(AddStart + ((AddDest - AddStart + AddRegs + 1) % AddRegs)));
+        Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm1, xmmword_ptr(L3Addr, 64));
+        L3Increment();
+      } else if (Item == "L3_S") {
+        Cb.vmovapd(xmmword_ptr(L3Addr, 96), Xmm(AddDest));
+        Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm0, Xmm(AddStart + ((AddDest - AddStart + AddRegs + 1) % AddRegs)));
+        L3Increment();
+      } else if (Item == "L3_LS") {
+        Cb.vmovapd(xmmword_ptr(L3Addr, 96), Xmm(AddDest));
+        Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm0, xmmword_ptr(L3Addr, 64));
+        L3Increment();
+      } else if (Item == "L3_P") {
+        Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm0, xmmword_ptr(L1Addr, 32));
+        Cb.prefetcht2(asmjit::x86::ptr(L3Addr));
+        L3Increment();
+      } else if (Item == "RAM_L") {
+        Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm0, Xmm(AddStart + ((AddDest - AddStart + AddRegs + 1) % AddRegs)));
+        Cb.vfmaddpd(RamReg, RamReg, xmm1, xmmword_ptr(RamAddr, 64));
+        RamIncrement();
+      } else if (Item == "RAM_S") {
+        Cb.vmovapd(xmmword_ptr(RamAddr, 64), Xmm(AddDest));
+        Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm0, Xmm(AddStart + ((AddDest - AddStart + AddRegs + 1) % AddRegs)));
+        RamIncrement();
+      } else if (Item == "RAM_LS") {
+        Cb.vmovapd(xmmword_ptr(RamAddr, 64), Xmm(AddDest));
+        Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm0, xmmword_ptr(RamAddr, 32));
+        RamIncrement();
+      } else if (Item == "RAM_P") {
+        Cb.vfmaddpd(Xmm(AddDest), Xmm(AddDest), xmm0, xmmword_ptr(L1Addr, 32));
+        Cb.prefetcht2(asmjit::x86::ptr(RamAddr));
+        RamIncrement();
       } else {
-        workerLog::error() << "Instruction group " << item << " not found in "
-                           << this->name() << ".";
-        return EXIT_FAILURE;
+        workerLog::error() << "Instruction group " << Item << " not found in " << name() << ".";
       }
 
-      if (left) {
-        cb.shr(shift_reg32[shift_pos], Imm(1));
+      if (Left) {
+        Cb.shr(ShiftReg32[ShiftPos], Imm(1));
       } else {
-        cb.shl(shift_reg32[shift_pos], Imm(1));
+        Cb.shl(ShiftReg32[ShiftPos], Imm(1));
       }
-      add_dest++;
-      if (add_dest > add_end) {
-        add_dest = add_start;
+      AddDest++;
+      if (AddDest > AddEnd) {
+        AddDest = AddStart;
       }
-      if (mov_dst > trans_end) {
-        mov_dst = trans_start;
+      if (MovDest > TransEnd) {
+        MovDest = TransStart;
       }
-      mov_src++;
-      if (mov_src > trans_end) {
-        mov_src = trans_start;
+      MovSrc++;
+      if (MovSrc > TransEnd) {
+        MovSrc = TransStart;
       }
-      shift_pos++;
-      if (shift_pos == nr_shift_regs) {
-        shift_pos = 0;
-        left = !left;
+      ShiftPos++;
+      if (ShiftPos == NbShiftRegs) {
+        ShiftPos = 0;
+        Left = !Left;
       }
     }
   }
 
-  cb.movq(temp_reg, iter_reg); // restore iteration counter
-  if (this->getRAMSequenceCount(sequence) > 0) {
+  Cb.movq(TempReg, IterReg); // restore iteration counter
+  if (environment::payload::PayloadSettings::getRAMSequenceCount(Sequence) > 0) {
     // reset RAM counter
-    auto NoRamReset = cb.newLabel();
-
-    cb.sub(ram_count_reg, Imm(1));
-    cb.jnz(NoRamReset);
-    cb.mov(ram_count_reg, Imm(ram_loop_count));
-    cb.mov(ram_addr, pointer_reg);
-    cb.add(ram_addr, Imm(l3_size));
-    cb.bind(NoRamReset);
+    auto NoRamReset = Cb.newLabel();
+
+    Cb.sub(RamCountReg, Imm(1));
+    Cb.jnz(NoRamReset);
+    Cb.mov(RamCountReg, Imm(RamLoopCount));
+    Cb.mov(RamAddr, PointerReg);
+    Cb.add(RamAddr, Imm(L3Size));
+    Cb.bind(NoRamReset);
     // adds always two instruction
-    this->_instructions += 2;
+    Stats.Instructions += 2;
   }
-  cb.inc(temp_reg); // increment iteration counter
-  if (this->getL2SequenceCount(sequence) > 0) {
+  Cb.inc(TempReg); // increment iteration counter
+  if (environment::payload::PayloadSettings::getL2SequenceCount(Sequence) > 0) {
     // reset L2-Cache counter
-    auto NoL2Reset = cb.newLabel();
-
-    cb.sub(l2_count_reg, Imm(1));
-    cb.jnz(NoL2Reset);
-    cb.mov(l2_count_reg, Imm(l2_loop_count));
-    cb.mov(l2_addr, pointer_reg);
-    cb.add(l2_addr, Imm(l1_size));
-    cb.bind(NoL2Reset);
+    auto NoL2Reset = Cb.newLabel();
+
+    Cb.sub(L2CountReg, Imm(1));
+    Cb.jnz(NoL2Reset);
+    Cb.mov(L2CountReg, Imm(L2LoopCount));
+    Cb.mov(L2Addr, PointerReg);
+    Cb.add(L2Addr, Imm(L1Size));
+    Cb.bind(NoL2Reset);
     // adds always two instruction
-    this->_instructions += 2;
+    Stats.Instructions += 2;
   }
-  cb.movq(iter_reg, temp_reg); // store iteration counter
-  if (this->getL3SequenceCount(sequence) > 0) {
+  Cb.movq(IterReg, TempReg); // store iteration counter
+  if (environment::payload::PayloadSettings::getL3SequenceCount(Sequence) > 0) {
     // reset L3-Cache counter
-    auto NoL3Reset = cb.newLabel();
-
-    cb.sub(l3_count_reg, Imm(1));
-    cb.jnz(NoL3Reset);
-    cb.mov(l3_count_reg, Imm(l3_loop_count));
-    cb.mov(l3_addr, pointer_reg);
-    cb.add(l3_addr, Imm(l2_size));
-    cb.bind(NoL3Reset);
+    auto NoL3Reset = Cb.newLabel();
+
+    Cb.sub(L3CountReg, Imm(1));
+    Cb.jnz(NoL3Reset);
+    Cb.mov(L3CountReg, Imm(L3LoopCount));
+    Cb.mov(L3Addr, PointerReg);
+    Cb.add(L3Addr, Imm(L2Size));
+    Cb.bind(NoL3Reset);
     // adds always two instruction
-    this->_instructions += 2;
+    Stats.Instructions += 2;
   }
-  cb.mov(l1_addr, pointer_reg);
-
-  if (dumpRegisters) {
-    auto SkipRegistersDump = cb.newLabel();
+  Cb.mov(L1Addr, PointerReg);
 
-    cb.test(ptr_64(pointer_reg, -8), Imm(firestarter::DumpVariable::Wait));
-    cb.jnz(SkipRegistersDump);
-
-    // dump all the ymm register
-    for (int i = 0; i < (int)this->registerCount(); i++) {
-      cb.vmovapd(
-          ymmword_ptr(pointer_reg, -64 - this->registerSize() * 8 * (i + 1)),
-          Ymm(i));
-    }
-
-    // set read flag
-    cb.mov(ptr_64(pointer_reg, -8), Imm(firestarter::DumpVariable::Wait));
-
-    cb.bind(SkipRegistersDump);
+  if (DumpRegisters) {
+    emitDumpRegisterCode<asmjit::x86::Ymm>(Cb, PointerReg, asmjit::x86::ymmword_ptr);
   }
 
-  if (errorDetection) {
-    this->emitErrorDetectionCode<decltype(iter_reg), Ymm>(
-        cb, iter_reg, addrHigh_reg, pointer_reg, temp_reg, temp_reg2);
+  if (ErrorDetection) {
+    emitErrorDetectionCode<decltype(IterReg), asmjit::x86::Ymm>(Cb, IterReg, AddrHighReg, PointerReg, TempReg,
+                                                                TempReg2);
   }
 
-  cb.test(ptr_64(addrHigh_reg), Imm(LOAD_HIGH));
-  cb.jnz(Loop);
+  Cb.test(asmjit::x86::ptr_64(AddrHighReg), Imm(LoadThreadWorkType::LoadHigh));
+  Cb.jnz(Loop);
 
-  cb.bind(FunctionExit);
+  Cb.bind(FunctionExit);
 
-  cb.movq(rax, iter_reg);
+  Cb.movq(asmjit::x86::rax, IterReg);
 
-  cb.emitEpilog(frame);
+  Cb.emitEpilog(Frame);
 
-  cb.finalize();
+  Cb.finalize();
 
-  // String sb;
-  // cb.dump(sb);
-
-  Error err = this->rt.add(&this->loadFunction, &code);
-  if (err) {
-    workerLog::error() << "Asmjit adding Assembler to JitRuntime failed in "
-                       << __FILE__ << " at " << __LINE__;
-    return EXIT_FAILURE;
-  }
+  auto CompiledPayloadPtr = CompiledX86Payload::create<FMA4Payload>(Stats, Code);
 
   // skip if we could not determine cache size
-  if (l1i_cache_size != 0) {
-    auto loopSize = code.labelOffset(FunctionExit) - code.labelOffset(Loop);
-    auto instructionCachePercentage = 100 * loopSize / l1i_cache_size;
+  if (L1iCacheSize) {
+    auto LoopSize = Code.labelOffset(FunctionExit) - Code.labelOffset(Loop);
+    auto InstructionCachePercentage = 100 * LoopSize / *L1iCacheSize;
 
-    if (loopSize > l1i_cache_size) {
+    if (LoopSize > *L1iCacheSize) {
       workerLog::warn() << "Work-loop is bigger than the L1i-Cache.";
     }
 
-    workerLog::trace() << "Using " << loopSize << " of " << l1i_cache_size
-                       << " Bytes (" << instructionCachePercentage
+    workerLog::trace() << "Using " << LoopSize << " of " << *L1iCacheSize << " Bytes (" << InstructionCachePercentage
                        << "%) from the L1i-Cache for the work-loop.";
-    workerLog::trace() << "Sequence size: " << sequence.size();
-    workerLog::trace() << "Repetition count: " << repetitions;
+    workerLog::trace() << "Sequence size: " << Sequence.size();
+    workerLog::trace() << "Repetition count: " << Repetitions;
   }
 
-  return EXIT_SUCCESS;
+  return CompiledPayloadPtr;
 }
 
-std::list<std::string> FMA4Payload::getAvailableInstructions() const {
-  std::list<std::string> instructions;
-
-  transform(this->instructionFlops.begin(), this->instructionFlops.end(),
-            back_inserter(instructions),
-            [](const auto &item) { return item.first; });
-
-  return instructions;
+void FMA4Payload::init(double* MemoryAddr, uint64_t BufferSize) const { // forwards to X86Payload::initMemory
+  X86Payload::initMemory(MemoryAddr, BufferSize, 0.27948995982e-4, 0.27948995982e-4); // same constant passed twice
+}
 
-void FMA4Payload::init(unsigned long long *memoryAddr,
-                       unsigned long long bufferSize) {
-  X86Payload::init(memoryAddr, bufferSize, 0.27948995982e-4, 0.27948995982e-4);
-}
+} // namespace firestarter::environment::x86::payload
\ No newline at end of file
diff --git a/src/firestarter/Environment/X86/Payload/FMAPayload.cpp b/src/firestarter/Environment/X86/Payload/FMAPayload.cpp
index e3087c01..cec0021a 100644
--- a/src/firestarter/Environment/X86/Payload/FMAPayload.cpp
+++ b/src/firestarter/Environment/X86/Payload/FMAPayload.cpp
@@ -19,468 +19,411 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
-#include <firestarter/Environment/X86/Payload/FMAPayload.hpp>
-#include <firestarter/Logging/Log.hpp>
-
-#include <iterator>
-#include <utility>
-
-using namespace firestarter::environment::x86::payload;
-using namespace asmjit;
-using namespace asmjit::x86;
-
-int FMAPayload::compilePayload(
-    std::vector<std::pair<std::string, unsigned>> const &proportion,
-    unsigned instructionCacheSize,
-    std::list<unsigned> const &dataCacheBufferSize, unsigned ramBufferSize,
-    unsigned thread, unsigned numberOfLines, bool dumpRegisters,
-    bool errorDetection) {
+#include "firestarter/Environment/X86/Payload/FMAPayload.hpp"
+#include "firestarter/Environment/X86/Payload/CompiledX86Payload.hpp"
+
+namespace firestarter::environment::x86::payload {
+
+auto FMAPayload::compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters,
+                                bool ErrorDetection) const -> environment::payload::CompiledPayload::UniquePtr {
+  using Imm = asmjit::Imm;
+  using Xmm = asmjit::x86::Xmm;
+  using Ymm = asmjit::x86::Ymm;
+  // NOLINTBEGIN(readability-identifier-naming)
+  constexpr asmjit::x86::Mem (*ymmword_ptr)(const asmjit::x86::Gp&, int32_t) = asmjit::x86::ymmword_ptr;
+  constexpr auto ymm0 = asmjit::x86::ymm0;
+  constexpr auto ymm1 = asmjit::x86::ymm1;
+  constexpr auto ymm2 = asmjit::x86::ymm2;
+  // NOLINTEND(readability-identifier-naming)
+
   // Compute the sequence of instruction groups and the number of its repetions
   // to reach the desired size
-  auto sequence = this->generateSequence(proportion);
-  auto repetitions =
-      this->getNumberOfSequenceRepetitions(sequence, numberOfLines / thread);
+  auto Sequence = Settings.sequence();
+  auto Repetitions =
+      environment::payload::PayloadSettings::getNumberOfSequenceRepetitions(Sequence, Settings.linesPerThread());
 
   // compute count of flops and memory access for performance report
-  unsigned flops = 0;
-  unsigned bytes = 0;
+  environment::payload::PayloadStats Stats;
 
-  for (const auto &item : sequence) {
-    auto it = this->instructionFlops.find(item);
+  for (const auto& Item : Sequence) {
+    auto It = instructionFlops().find(Item);
 
-    if (it == this->instructionFlops.end()) {
-      workerLog::error() << "Instruction group " << item << " undefined in "
-                         << name() << ".";
-      return EXIT_FAILURE;
+    if (It == instructionFlops().end()) {
+      workerLog::error() << "Instruction group " << Item << " undefined in " << name() << ".";
     }
 
-    flops += it->second;
+    Stats.Flops += It->second;
 
-    it = this->instructionMemory.find(item);
+    It = instructionMemory().find(Item);
 
-    if (it != this->instructionMemory.end()) {
-      bytes += it->second;
+    if (It != instructionMemory().end()) {
+      Stats.Bytes += It->second;
     }
   }
 
-  this->_flops = repetitions * flops;
-  this->_bytes = repetitions * bytes;
-  this->_instructions = repetitions * sequence.size() * 4 + 6;
+  Stats.Flops *= Repetitions;
+  Stats.Bytes *= Repetitions;
+  Stats.Instructions = Repetitions * Sequence.size() * 4 + 6;
 
   // calculate the buffer sizes
-  auto l1i_cache_size = instructionCacheSize / thread;
-  auto dataCacheBufferSizeIterator = dataCacheBufferSize.begin();
-  auto l1_size = *dataCacheBufferSizeIterator / thread;
-  std::advance(dataCacheBufferSizeIterator, 1);
-  auto l2_size = *dataCacheBufferSizeIterator / thread;
-  std::advance(dataCacheBufferSizeIterator, 1);
-  auto l3_size = *dataCacheBufferSizeIterator / thread;
-  auto ram_size = ramBufferSize / thread;
+  const auto L1iCacheSize = Settings.instructionCacheSizePerThread();
+  const auto DataCacheBufferSizes = Settings.dataCacheBufferSizePerThread();
+  auto DataCacheBufferSizeIterator = DataCacheBufferSizes.begin();
+  const auto L1Size = *DataCacheBufferSizeIterator;
+  std::advance(DataCacheBufferSizeIterator, 1);
+  const auto L2Size = *DataCacheBufferSizeIterator;
+  std::advance(DataCacheBufferSizeIterator, 1);
+  const auto L3Size = *DataCacheBufferSizeIterator;
+  const auto RamSize = Settings.ramBufferSizePerThread();
 
   // calculate the reset counters for the buffers
-  auto l2_loop_count =
-      getL2LoopCount(sequence, numberOfLines, l2_size * thread, thread);
-  auto l3_loop_count =
-      getL3LoopCount(sequence, numberOfLines, l3_size * thread, thread);
-  auto ram_loop_count =
-      getRAMLoopCount(sequence, numberOfLines, ram_size * thread, thread);
-
-  CodeHolder code;
-  code.init(this->rt.environment());
-
-  if (nullptr != this->loadFunction) {
-    this->rt.release(&this->loadFunction);
-  }
-
-  Builder cb(&code);
-  cb.addDiagnosticOptions(
-    asmjit::DiagnosticOptions::kValidateAssembler | 
-    asmjit::DiagnosticOptions::kValidateIntermediate );
-
-  auto pointer_reg = rax;
-  auto l1_addr = rbx;
-  auto l2_addr = rcx;
-  auto l3_addr = r8;
-  auto ram_addr = r9;
-  auto l2_count_reg = r10;
-  auto l3_count_reg = r11;
-  auto ram_count_reg = r12;
-  auto temp_reg = r13;
-  auto temp_reg2 = rbp;
-  auto offset_reg = r14;
-  auto addrHigh_reg = r15;
-  auto iter_reg = mm0;
-  auto shift_reg = std::vector<Gp>({rdi, rsi, rdx});
-  auto shift_reg32 = std::vector<Gp>({edi, esi, edx});
-  auto nr_shift_regs = 3;
-  auto mul_regs = 3;
-  auto add_regs = 9;
-  auto alt_dst_regs = 3;
-  auto ram_reg = ymm15;
-
-  FuncDetail func;
-  func.init(FuncSignatureT<unsigned long long, unsigned long long *,
-                           volatile unsigned long long *, unsigned long long>(
-                CallConvId::kCDecl),
-            this->rt.environment());
-
-  FuncFrame frame;
-  frame.init(func);
+  const auto L2LoopCount =
+      environment::payload::PayloadSettings::getL2LoopCount(Sequence, Settings.linesPerThread(), L2Size);
+  const auto L3LoopCount =
+      environment::payload::PayloadSettings::getL3LoopCount(Sequence, Settings.linesPerThread(), L3Size);
+  const auto RamLoopCount =
+      environment::payload::PayloadSettings::getRAMLoopCount(Sequence, Settings.linesPerThread(), RamSize);
+
+  asmjit::CodeHolder Code;
+  Code.init(asmjit::Environment::host());
+
+  asmjit::x86::Builder Cb(&Code);
+  Cb.addDiagnosticOptions(asmjit::DiagnosticOptions::kValidateAssembler |
+                          asmjit::DiagnosticOptions::kValidateIntermediate);
+
+  const auto PointerReg = asmjit::x86::rax;
+  const auto L1Addr = asmjit::x86::rbx;
+  const auto L2Addr = asmjit::x86::rcx;
+  const auto L3Addr = asmjit::x86::r8;
+  const auto RamAddr = asmjit::x86::r9;
+  const auto L2CountReg = asmjit::x86::r10;
+  const auto L3CountReg = asmjit::x86::r11;
+  const auto RamCountReg = asmjit::x86::r12;
+  const auto TempReg = asmjit::x86::r13;
+  const auto TempReg2 = asmjit::x86::rbp;
+  const auto OffsetReg = asmjit::x86::r14;
+  const auto AddrHighReg = asmjit::x86::r15;
+  const auto IterReg = asmjit::x86::mm0;
+  const auto ShiftRegs = std::vector<asmjit::x86::Gp>({asmjit::x86::rdi, asmjit::x86::rsi, asmjit::x86::rdx});
+  const auto ShiftRegs32 = std::vector<asmjit::x86::Gp>({asmjit::x86::edi, asmjit::x86::esi, asmjit::x86::edx});
+  const auto NbShiftRegs = 3;
+  const auto MulRegs = 3;
+  const auto AddRegs = 9;
+  const auto AltDestRegs = 3;
+  const auto RamReg = asmjit::x86::ymm15;
+
+  asmjit::FuncDetail Func;
+  Func.init(asmjit::FuncSignature::build<uint64_t, double*, volatile LoadThreadWorkType*, uint64_t>(
+                asmjit::CallConvId::kCDecl),
+            Code.environment());
+
+  asmjit::FuncFrame Frame;
+  Frame.init(Func);
 
   // make (x|y)mm registers dirty
-  for (int i = 0; i < 16; i++) {
-    frame.addDirtyRegs(Ymm(i));
+  for (auto I = 0U; I < 16U; I++) {
+    Frame.addDirtyRegs(Ymm(I));
   }
-  for (int i = 0; i < 8; i++) {
-    frame.addDirtyRegs(Mm(i));
+  for (auto I = 0U; I < 8U; I++) {
+    Frame.addDirtyRegs(asmjit::x86::Mm(I));
   }
   // make all other used registers dirty except RAX
-  frame.addDirtyRegs(l1_addr, l2_addr, l3_addr, ram_addr, l2_count_reg,
-                     l3_count_reg, ram_count_reg, temp_reg, temp_reg2,
-                     offset_reg, addrHigh_reg, iter_reg, ram_addr);
-  for (const auto &reg : shift_reg) {
-    frame.addDirtyRegs(reg);
+  Frame.addDirtyRegs(L1Addr, L2Addr, L3Addr, RamAddr, L2CountReg, L3CountReg, RamCountReg, TempReg, TempReg2, OffsetReg,
+                     AddrHighReg, IterReg, RamAddr);
+  for (const auto& Reg : ShiftRegs) {
+    Frame.addDirtyRegs(Reg);
   }
 
-  FuncArgsAssignment args(&func);
+  asmjit::FuncArgsAssignment Args(&Func);
   // FIXME: asmjit assigment to mm0 does not seem to be supported
-  args.assignAll(pointer_reg, addrHigh_reg, temp_reg);
-  args.updateFuncFrame(frame);
-  frame.finalize();
+  Args.assignAll(PointerReg, AddrHighReg, TempReg);
+  Args.updateFuncFrame(Frame);
+  Frame.finalize();
 
-  cb.emitProlog(frame);
-  cb.emitArgsAssignment(frame, args);
+  Cb.emitProlog(Frame);
+  Cb.emitArgsAssignment(Frame, Args);
 
   // FIXME: movq from temp_reg to iter_reg
-  cb.movq(iter_reg, temp_reg);
+  Cb.movq(IterReg, TempReg);
 
   // stop right away if low load is selected
-  auto FunctionExit = cb.newLabel();
+  auto FunctionExit = Cb.newLabel();
 
-  cb.mov(temp_reg, ptr_64(addrHigh_reg));
-  cb.test(temp_reg, temp_reg);
-  cb.jz(FunctionExit);
+  Cb.mov(TempReg, ptr_64(AddrHighReg));
+  Cb.test(TempReg, TempReg);
+  Cb.jz(FunctionExit);
 
-  cb.mov(offset_reg,
+  Cb.mov(OffsetReg,
          Imm(64)); // increment after each cache/memory access
   // Initialize registers for shift operations
-  for (auto const &reg : shift_reg32) {
-    cb.mov(reg, Imm(0xAAAAAAAA));
+  for (auto const& Reg : ShiftRegs32) {
+    Cb.mov(Reg, Imm(0xAAAAAAAA));
   }
   // Initialize AVX-Registers for FMA Operations
-  cb.vmovapd(ymm0, ymmword_ptr(pointer_reg));
-  cb.vmovapd(ymm1, ymmword_ptr(pointer_reg, 32));
-  cb.vmovapd(ymm2, ymmword_ptr(pointer_reg, 64));
-  auto add_start = mul_regs;
-  auto add_end = mul_regs + add_regs - 1;
-  auto trans_start = add_regs + mul_regs;
-  auto trans_end = add_regs + mul_regs + alt_dst_regs - 1;
-  for (int i = add_start; i <= trans_end; i++) {
-    cb.vmovapd(Ymm(i), ymmword_ptr(pointer_reg, 256 + i * 32));
-  }
-  cb.mov(l1_addr, pointer_reg); // address for L1-buffer
-  cb.mov(l2_addr, pointer_reg);
-  cb.add(l2_addr, Imm(l1_size)); // address for L2-buffer
-  cb.mov(l3_addr, pointer_reg);
-  cb.add(l3_addr, Imm(l2_size)); // address for L3-buffer
-  cb.mov(ram_addr, pointer_reg);
-  cb.add(ram_addr, Imm(l3_size)); // address for RAM-buffer
-  cb.mov(l2_count_reg, Imm(l2_loop_count));
-  workerLog::trace() << "reset counter for L2-buffer with "
-                     << l2_loop_count
-                     << " cache line accesses per loop ("
-		     << l2_size/1024
-                     << ") KiB";
-  cb.mov(l3_count_reg, Imm(l3_loop_count));
-  workerLog::trace() << "reset counter for L3-buffer with "
-                     << l3_loop_count
-                     << " cache line accesses per loop ("
-		     << l3_size/1024
-                     << ") KiB";
-  cb.mov(ram_count_reg, Imm(ram_loop_count));
-  workerLog::trace() << "reset counter for RAM-buffer with "
-                     << ram_loop_count
-                     << " cache line accesses per loop ("
-		     << ram_size/1024
-                     << ") KiB";
-
-  cb.align(AlignMode::kCode, 64);
-
-  auto Loop = cb.newLabel();
-  cb.bind(Loop);
-
-  auto shift_pos = 0;
-  bool left = false;
-  auto add_dest = add_start + 1;
-  auto mov_dst = trans_start;
-  auto mov_src = mov_dst + 1;
-  unsigned l1_offset = 0;
-
-#define L1_INCREMENT_TIMES(n)                                                  \
-  l1_offset += n * 64;                                                         \
-  if (l1_offset < l1_size * 0.5) {                                             \
-    cb.add(l1_addr, offset_reg);                                               \
-  } else {                                                                     \
-    l1_offset = 0;                                                             \
-    cb.mov(l1_addr, pointer_reg);                                              \
-  }
-
-#define L1_INCREMENT() L1_INCREMENT_TIMES(1)
-
-#define L2_INCREMENT_TIMES(n)                                                  \
-  if (n == 1) {                                                                \
-    cb.add(l2_addr, offset_reg);                                               \
-  } else {                                                                     \
-    cb.add(l2_addr, n * 64);                                                   \
+  Cb.vmovapd(ymm0, ymmword_ptr(PointerReg, 0));
+  Cb.vmovapd(ymm1, ymmword_ptr(PointerReg, 32));
+  Cb.vmovapd(ymm2, ymmword_ptr(PointerReg, 64));
+  auto AddStart = MulRegs;
+  auto AddEnd = MulRegs + AddRegs - 1;
+  auto TransStart = AddRegs + MulRegs;
+  auto TransEnd = AddRegs + MulRegs + AltDestRegs - 1;
+  for (auto I = AddStart; I <= TransEnd; I++) {
+    Cb.vmovapd(Ymm(I), ymmword_ptr(PointerReg, 256 + (I * 32)));
   }
-
-#define L2_INCREMENT() L2_INCREMENT_TIMES(1)
-
-#define L3_INCREMENT() cb.add(l3_addr, offset_reg)
-
-#define RAM_INCREMENT() cb.add(ram_addr, offset_reg)
-
-  for (unsigned count = 0; count < repetitions; count++) {
-    for (const auto &item : sequence) {
-      if (item == "REG") {
-        cb.vfmadd231pd(Ymm(add_dest), ymm0, ymm2);
-        cb.vfmadd231pd(Ymm(mov_dst), ymm2, ymm1);
-        cb.xor_(shift_reg[(shift_pos + nr_shift_regs - 1) % nr_shift_regs],
-                temp_reg);
-        mov_dst++;
-      } else if (item == "L1_L") {
-        cb.vfmadd231pd(Ymm(add_dest), ymm0, ymm2);
-        cb.vfmadd231pd(Ymm(add_dest), ymm1, ymmword_ptr(l1_addr, 32));
-        L1_INCREMENT();
-      } else if (item == "L1_2L") {
-        cb.vfmadd231pd(Ymm(add_dest), ymm0, ymmword_ptr(l1_addr, 32));
-        cb.vfmadd231pd(Ymm(mov_dst), ymm1, ymmword_ptr(l1_addr, 64));
-        L1_INCREMENT();
-      } else if (item == "L1_S") {
-        cb.vmovapd(xmmword_ptr(l1_addr, 32), Xmm(add_dest));
-        cb.vfmadd231pd(Ymm(add_dest), ymm0, ymm2);
-        L1_INCREMENT();
-      } else if (item == "L1_LS") {
-        cb.vmovapd(xmmword_ptr(l1_addr, 64), Xmm(add_dest));
-        cb.vfmadd231pd(Ymm(add_dest), ymm0, ymmword_ptr(l1_addr, 32));
-        L1_INCREMENT();
-      } else if (item == "L1_LS_256") {
-        cb.vfmadd231pd(Ymm(add_dest), ymm0, ymmword_ptr(l1_addr, 64));
-        cb.vmovapd(ymmword_ptr(l1_addr, 32), Ymm(add_dest));
-        L1_INCREMENT();
-      } else if (item == "L1_2LS_256") {
-        cb.vfmadd231pd(Ymm(add_dest), ymm0, ymmword_ptr(l1_addr, 64));
-        cb.vfmadd231pd(Ymm(mov_dst), ymm1, ymmword_ptr(l1_addr, 96));
-        cb.vmovapd(ymmword_ptr(l1_addr, 32), Ymm(add_dest));
-        L1_INCREMENT_TIMES(2);
-      } else if (item == "L2_L") {
-        cb.vfmadd231pd(Ymm(add_dest), ymm0, ymm2);
-        cb.vfmadd231pd(Ymm(add_dest), ymm1, ymmword_ptr(l2_addr, 64));
-        L2_INCREMENT();
-      } else if (item == "L2_S") {
-        cb.vmovapd(xmmword_ptr(l2_addr, 64), Xmm(add_dest));
-        cb.vfmadd231pd(Ymm(add_dest), ymm0, ymm2);
-        L2_INCREMENT();
-      } else if (item == "L2_LS") {
-        cb.vmovapd(xmmword_ptr(l2_addr, 96), Xmm(add_dest));
-        cb.vfmadd231pd(Ymm(add_dest), ymm0, ymmword_ptr(l2_addr, 64));
-        L2_INCREMENT();
-      } else if (item == "L2_LS_256") {
-        cb.vmovapd(ymmword_ptr(l2_addr, 96), Ymm(add_dest));
-        cb.vfmadd231pd(Ymm(add_dest), ymm0, ptr(l2_addr, 64));
-        L2_INCREMENT();
-      } else if (item == "L2_2LS_256") {
-        cb.vfmadd231pd(Ymm(add_dest), ymm0, ptr(l2_addr, 64));
-        cb.vfmadd231pd(Ymm(mov_dst), ymm1, ptr(l2_addr, 96));
-        cb.vmovapd(ymmword_ptr(l2_addr, 32), Ymm(add_dest));
-        L2_INCREMENT_TIMES(2);
-      } else if (item == "L3_L") {
-        cb.vfmadd231pd(Ymm(add_dest), ymm0, ymm2);
-        cb.vfmadd231pd(Ymm(add_dest), ymm1, ymmword_ptr(l3_addr, 64));
-        L3_INCREMENT();
-      } else if (item == "L3_S") {
-        cb.vmovapd(xmmword_ptr(l3_addr, 96), Xmm(add_dest));
-        cb.vfmadd231pd(Ymm(add_dest), ymm0, ymm2);
-        L3_INCREMENT();
-      } else if (item == "L3_LS") {
-        cb.vmovapd(xmmword_ptr(l3_addr, 96), Xmm(add_dest));
-        cb.vfmadd231pd(Ymm(add_dest), ymm0, ymmword_ptr(l3_addr, 64));
-        L3_INCREMENT();
-      } else if (item == "L3_LS_256") {
-        cb.vmovapd(ymmword_ptr(l3_addr, 96), Ymm(add_dest));
-        cb.vfmadd231pd(Ymm(add_dest), ymm0, ymmword_ptr(l3_addr, 64));
-        L3_INCREMENT();
-      } else if (item == "L3_P") {
-        cb.vfmadd231pd(Ymm(add_dest), ymm0, ymmword_ptr(l1_addr, 32));
-        cb.prefetcht2(ptr(l3_addr));
-        L3_INCREMENT();
-      } else if (item == "RAM_L") {
-        cb.vfmadd231pd(Ymm(add_dest), ymm0, ymm2);
-        cb.vfmadd231pd(ram_reg, ymm1, ymmword_ptr(ram_addr, 64));
-        RAM_INCREMENT();
-      } else if (item == "RAM_S") {
-        cb.vmovapd(xmmword_ptr(ram_addr, 64), Xmm(add_dest));
-        cb.vfmadd231pd(Ymm(add_dest), ymm0, ymm2);
-        RAM_INCREMENT();
-      } else if (item == "RAM_LS") {
-        cb.vmovapd(xmmword_ptr(ram_addr, 64), Xmm(add_dest));
-        cb.vfmadd231pd(Ymm(add_dest), ymm0, ymmword_ptr(ram_addr, 32));
-        RAM_INCREMENT();
-      } else if (item == "RAM_P") {
-        cb.vfmadd231pd(Ymm(add_dest), ymm0, ymmword_ptr(l1_addr, 32));
-        cb.prefetcht2(ptr(ram_addr));
-        RAM_INCREMENT();
+  Cb.mov(L1Addr, PointerReg); // address for L1-buffer
+  Cb.mov(L2Addr, PointerReg);
+  Cb.add(L2Addr, Imm(L1Size)); // address for L2-buffer
+  Cb.mov(L3Addr, PointerReg);
+  Cb.add(L3Addr, Imm(L2Size)); // address for L3-buffer
+  Cb.mov(RamAddr, PointerReg);
+  Cb.add(RamAddr, Imm(L3Size)); // address for RAM-buffer
+  Cb.mov(L2CountReg, Imm(L2LoopCount));
+  workerLog::trace() << "reset counter for L2-buffer with " << L2LoopCount << " cache line accesses per loop ("
+                     << L2Size / 1024 << ") KiB";
+  Cb.mov(L3CountReg, Imm(L3LoopCount));
+  workerLog::trace() << "reset counter for L3-buffer with " << L3LoopCount << " cache line accesses per loop ("
+                     << L3Size / 1024 << ") KiB";
+  Cb.mov(RamCountReg, Imm(RamLoopCount));
+  workerLog::trace() << "reset counter for RAM-buffer with " << RamLoopCount << " cache line accesses per loop ("
+                     << RamSize / 1024 << ") KiB";
+
+  Cb.align(asmjit::AlignMode::kCode, 64);
+
+  auto Loop = Cb.newLabel();
+  Cb.bind(Loop);
+
+  auto ShiftPos = 0;
+  bool Left = false;
+  auto AddDest = AddStart + 1;
+  auto MovDest = TransStart;
+  auto MovSrc = MovDest + 1;
+  unsigned L1Offset = 0;
+
+  const auto L1IncrementTimes = [&Cb, &L1Offset, &L1Size, &L1Addr, &OffsetReg, &PointerReg](unsigned Times) {
+    L1Offset += Times * 64;
+    if (L1Offset < L1Size * 0.5) {
+      Cb.add(L1Addr, OffsetReg);
+    } else {
+      L1Offset = 0;
+      Cb.mov(L1Addr, PointerReg);
+    }
+  };
+  const auto L1Increment = [&L1IncrementTimes] { L1IncrementTimes(1); };
+  const auto L2IncrementTimes = [&Cb, &L2Addr, &OffsetReg](unsigned Times) {
+    if (Times == 1) {
+      Cb.add(L2Addr, OffsetReg);
+    } else {
+      Cb.add(L2Addr, Times * 64);
+    }
+  };
+  const auto L2Increment = [&L2IncrementTimes] { L2IncrementTimes(1); };
+  const auto L3Increment = [&Cb, &L3Addr, &OffsetReg]() { Cb.add(L3Addr, OffsetReg); };
+  const auto RamIncrement = [&Cb, &RamAddr, &OffsetReg]() { Cb.add(RamAddr, OffsetReg); };
+
+  for (auto Count = 0U; Count < Repetitions; Count++) {
+    for (const auto& Item : Sequence) {
+      if (Item == "REG") {
+        Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymm2);
+        Cb.vfmadd231pd(Ymm(MovDest), ymm2, ymm1);
+        Cb.xor_(ShiftRegs[(ShiftPos + NbShiftRegs - 1) % NbShiftRegs], TempReg);
+        MovDest++;
+      } else if (Item == "L1_L") {
+        Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymm2);
+        Cb.vfmadd231pd(Ymm(AddDest), ymm1, ymmword_ptr(L1Addr, 32));
+        L1Increment();
+      } else if (Item == "L1_2L") {
+        Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymmword_ptr(L1Addr, 32));
+        Cb.vfmadd231pd(Ymm(MovDest), ymm1, ymmword_ptr(L1Addr, 64));
+        L1Increment();
+      } else if (Item == "L1_S") {
+        Cb.vmovapd(xmmword_ptr(L1Addr, 32), Xmm(AddDest));
+        Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymm2);
+        L1Increment();
+      } else if (Item == "L1_LS") {
+        Cb.vmovapd(xmmword_ptr(L1Addr, 64), Xmm(AddDest));
+        Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymmword_ptr(L1Addr, 32));
+        L1Increment();
+      } else if (Item == "L1_LS_256") {
+        Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymmword_ptr(L1Addr, 64));
+        Cb.vmovapd(ymmword_ptr(L1Addr, 32), Ymm(AddDest));
+        L1Increment();
+      } else if (Item == "L1_2LS_256") {
+        Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymmword_ptr(L1Addr, 64));
+        Cb.vfmadd231pd(Ymm(MovDest), ymm1, ymmword_ptr(L1Addr, 96));
+        Cb.vmovapd(ymmword_ptr(L1Addr, 32), Ymm(AddDest));
+        L1IncrementTimes(2);
+      } else if (Item == "L2_L") {
+        Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymm2);
+        Cb.vfmadd231pd(Ymm(AddDest), ymm1, ymmword_ptr(L2Addr, 64));
+        L2Increment();
+      } else if (Item == "L2_S") {
+        Cb.vmovapd(xmmword_ptr(L2Addr, 64), Xmm(AddDest));
+        Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymm2);
+        L2Increment();
+      } else if (Item == "L2_LS") {
+        Cb.vmovapd(xmmword_ptr(L2Addr, 96), Xmm(AddDest));
+        Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymmword_ptr(L2Addr, 64));
+        L2Increment();
+      } else if (Item == "L2_LS_256") {
+        Cb.vmovapd(ymmword_ptr(L2Addr, 96), Ymm(AddDest));
+        Cb.vfmadd231pd(Ymm(AddDest), ymm0, ptr(L2Addr, 64));
+        L2Increment();
+      } else if (Item == "L2_2LS_256") {
+        Cb.vfmadd231pd(Ymm(AddDest), ymm0, ptr(L2Addr, 64));
+        Cb.vfmadd231pd(Ymm(MovDest), ymm1, ptr(L2Addr, 96));
+        Cb.vmovapd(ymmword_ptr(L2Addr, 32), Ymm(AddDest));
+        L2IncrementTimes(2);
+      } else if (Item == "L3_L") {
+        Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymm2);
+        Cb.vfmadd231pd(Ymm(AddDest), ymm1, ymmword_ptr(L3Addr, 64));
+        L3Increment();
+      } else if (Item == "L3_S") {
+        Cb.vmovapd(xmmword_ptr(L3Addr, 96), Xmm(AddDest));
+        Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymm2);
+        L3Increment();
+      } else if (Item == "L3_LS") {
+        Cb.vmovapd(xmmword_ptr(L3Addr, 96), Xmm(AddDest));
+        Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymmword_ptr(L3Addr, 64));
+        L3Increment();
+      } else if (Item == "L3_LS_256") {
+        Cb.vmovapd(ymmword_ptr(L3Addr, 96), Ymm(AddDest));
+        Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymmword_ptr(L3Addr, 64));
+        L3Increment();
+      } else if (Item == "L3_P") {
+        Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymmword_ptr(L1Addr, 32));
+        Cb.prefetcht2(ptr(L3Addr));
+        L3Increment();
+      } else if (Item == "RAM_L") {
+        Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymm2);
+        Cb.vfmadd231pd(RamReg, ymm1, ymmword_ptr(RamAddr, 64));
+        RamIncrement();
+      } else if (Item == "RAM_S") {
+        Cb.vmovapd(xmmword_ptr(RamAddr, 64), Xmm(AddDest));
+        Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymm2);
+        RamIncrement();
+      } else if (Item == "RAM_LS") {
+        Cb.vmovapd(xmmword_ptr(RamAddr, 64), Xmm(AddDest));
+        Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymmword_ptr(RamAddr, 32));
+        RamIncrement();
+      } else if (Item == "RAM_P") {
+        Cb.vfmadd231pd(Ymm(AddDest), ymm0, ymmword_ptr(L1Addr, 32));
+        Cb.prefetcht2(ptr(RamAddr));
+        RamIncrement();
       } else {
-        workerLog::error() << "Instruction group " << item << " not found in "
-                           << this->name() << ".";
-        return EXIT_FAILURE;
+        workerLog::error() << "Instruction group " << Item << " not found in " << name() << ".";
       }
 
-      if (item != "L1_2LS_256" && item != "L2_2LS_256") {
-        if (left) {
-          cb.shr(shift_reg32[shift_pos], Imm(1));
+      if (Item != "L1_2LS_256" && Item != "L2_2LS_256") {
+        if (Left) {
+          Cb.shr(ShiftRegs32[ShiftPos], Imm(1));
         } else {
-          cb.shl(shift_reg32[shift_pos], Imm(1));
+          Cb.shl(ShiftRegs32[ShiftPos], Imm(1));
         }
       }
-      add_dest++;
-      if (add_dest > add_end) {
-        add_dest = add_start;
+      AddDest++;
+      if (AddDest > AddEnd) {
+        AddDest = AddStart;
       }
-      if (mov_dst > trans_end) {
-        mov_dst = trans_start;
+      if (MovDest > TransEnd) {
+        MovDest = TransStart;
       }
-      mov_src++;
-      if (mov_src > trans_end) {
-        mov_src = trans_start;
+      MovSrc++;
+      if (MovSrc > TransEnd) {
+        MovSrc = TransStart;
       }
-      shift_pos++;
-      if (shift_pos == nr_shift_regs) {
-        shift_pos = 0;
-        left = !left;
+      ShiftPos++;
+      if (ShiftPos == NbShiftRegs) {
+        ShiftPos = 0;
+        Left = !Left;
       }
     }
   }
 
-  cb.movq(temp_reg, iter_reg); // restore iteration counter
-  if (this->getRAMSequenceCount(sequence) > 0) {
+  Cb.movq(TempReg, IterReg); // restore iteration counter
+  if (environment::payload::PayloadSettings::getRAMSequenceCount(Sequence) > 0) {
     // reset RAM counter
-    auto NoRamReset = cb.newLabel();
-
-    cb.sub(ram_count_reg, Imm(1));
-    cb.jnz(NoRamReset);
-    cb.mov(ram_count_reg, Imm(ram_loop_count));
-    cb.mov(ram_addr, pointer_reg);
-    cb.add(ram_addr, Imm(l3_size));
-    cb.bind(NoRamReset);
+    auto NoRamReset = Cb.newLabel();
+
+    Cb.sub(RamCountReg, Imm(1));
+    Cb.jnz(NoRamReset);
+    Cb.mov(RamCountReg, Imm(RamLoopCount));
+    Cb.mov(RamAddr, PointerReg);
+    Cb.add(RamAddr, Imm(L3Size));
+    Cb.bind(NoRamReset);
     // adds always two instruction
-    this->_instructions += 2;
+    Stats.Instructions += 2;
   }
-  cb.inc(temp_reg); // increment iteration counter
-  if (this->getL2SequenceCount(sequence) > 0) {
+  Cb.inc(TempReg); // increment iteration counter
+  if (environment::payload::PayloadSettings::getL2SequenceCount(Sequence) > 0) {
     // reset L2-Cache counter
-    auto NoL2Reset = cb.newLabel();
-
-    cb.sub(l2_count_reg, Imm(1));
-    cb.jnz(NoL2Reset);
-    cb.mov(l2_count_reg, Imm(l2_loop_count));
-    cb.mov(l2_addr, pointer_reg);
-    cb.add(l2_addr, Imm(l1_size));
-    cb.bind(NoL2Reset);
+    auto NoL2Reset = Cb.newLabel();
+
+    Cb.sub(L2CountReg, Imm(1));
+    Cb.jnz(NoL2Reset);
+    Cb.mov(L2CountReg, Imm(L2LoopCount));
+    Cb.mov(L2Addr, PointerReg);
+    Cb.add(L2Addr, Imm(L1Size));
+    Cb.bind(NoL2Reset);
     // adds always two instruction
-    this->_instructions += 2;
+    Stats.Instructions += 2;
   }
-  cb.movq(iter_reg, temp_reg); // store iteration counter
-  if (this->getL3SequenceCount(sequence) > 0) {
+  Cb.movq(IterReg, TempReg); // store iteration counter
+  if (environment::payload::PayloadSettings::getL3SequenceCount(Sequence) > 0) {
     // reset L3-Cache counter
-    auto NoL3Reset = cb.newLabel();
-
-    cb.sub(l3_count_reg, Imm(1));
-    cb.jnz(NoL3Reset);
-    cb.mov(l3_count_reg, Imm(l3_loop_count));
-    cb.mov(l3_addr, pointer_reg);
-    cb.add(l3_addr, Imm(l2_size));
-    cb.bind(NoL3Reset);
+    auto NoL3Reset = Cb.newLabel();
+
+    Cb.sub(L3CountReg, Imm(1));
+    Cb.jnz(NoL3Reset);
+    Cb.mov(L3CountReg, Imm(L3LoopCount));
+    Cb.mov(L3Addr, PointerReg);
+    Cb.add(L3Addr, Imm(L2Size));
+    Cb.bind(NoL3Reset);
     // adds always two instruction
-    this->_instructions += 2;
+    Stats.Instructions += 2;
   }
-  cb.mov(l1_addr, pointer_reg);
+  Cb.mov(L1Addr, PointerReg);
 
-  if (dumpRegisters) {
-    auto SkipRegistersDump = cb.newLabel();
-
-    cb.test(ptr_64(pointer_reg, -8), Imm(firestarter::DumpVariable::Wait));
-    cb.jnz(SkipRegistersDump);
-
-    // dump all the ymm register
-    for (int i = 0; i < (int)this->registerCount(); i++) {
-      cb.vmovapd(
-          ymmword_ptr(pointer_reg, -64 - this->registerSize() * 8 * (i + 1)),
-          Ymm(i));
-    }
-
-    // set read flag
-    cb.mov(ptr_64(pointer_reg, -8), Imm(firestarter::DumpVariable::Wait));
-
-    cb.bind(SkipRegistersDump);
+  if (DumpRegisters) {
+    emitDumpRegisterCode<Ymm>(Cb, PointerReg, ymmword_ptr);
   }
 
-  if (errorDetection) {
-    this->emitErrorDetectionCode<decltype(iter_reg), Ymm>(
-        cb, iter_reg, addrHigh_reg, pointer_reg, temp_reg, temp_reg2);
+  if (ErrorDetection) {
+    emitErrorDetectionCode<decltype(IterReg), Ymm>(Cb, IterReg, AddrHighReg, PointerReg, TempReg, TempReg2);
   }
 
-  cb.test(ptr_64(addrHigh_reg), Imm(LOAD_HIGH));
-  cb.jnz(Loop);
+  Cb.test(ptr_64(AddrHighReg), Imm(LoadThreadWorkType::LoadHigh));
+  Cb.jnz(Loop);
 
-  cb.bind(FunctionExit);
+  Cb.bind(FunctionExit);
 
-  cb.movq(rax, iter_reg);
+  Cb.movq(asmjit::x86::rax, IterReg);
 
-  cb.emitEpilog(frame);
+  Cb.emitEpilog(Frame);
 
-  cb.finalize();
+  Cb.finalize();
 
-  // String sb;
-  // cb.dump(sb);
-
-  Error err = this->rt.add(&this->loadFunction, &code);
-  if (err) {
-    workerLog::error() << "Asmjit adding Assembler to JitRuntime failed in "
-                       << __FILE__ << " at " << __LINE__;
-    return EXIT_FAILURE;
-  }
+  auto CompiledPayloadPtr = CompiledX86Payload::create<FMAPayload>(Stats, Code);
 
   // skip if we could not determine cache size
-  if (l1i_cache_size != 0) {
-    auto loopSize = code.labelOffset(FunctionExit) - code.labelOffset(Loop);
-    auto instructionCachePercentage = 100 * loopSize / l1i_cache_size;
+  if (L1iCacheSize) {
+    auto LoopSize = Code.labelOffset(FunctionExit) - Code.labelOffset(Loop);
+    auto InstructionCachePercentage = 100 * LoopSize / *L1iCacheSize;
 
-    if (loopSize > l1i_cache_size) {
+    if (LoopSize > *L1iCacheSize) {
       workerLog::warn() << "Work-loop is bigger than the L1i-Cache.";
     }
 
-    workerLog::trace() << "Using " << loopSize << " of " << l1i_cache_size
-                       << " Bytes (" << instructionCachePercentage
+    workerLog::trace() << "Using " << LoopSize << " of " << *L1iCacheSize << " Bytes (" << InstructionCachePercentage
                        << "%) from the L1i-Cache for the work-loop.";
-    workerLog::trace() << "Sequence size: " << sequence.size();
-    workerLog::trace() << "Repetition count: " << repetitions;
+    workerLog::trace() << "Sequence size: " << Sequence.size();
+    workerLog::trace() << "Repetition count: " << Repetitions;
   }
 
-  return EXIT_SUCCESS;
+  return CompiledPayloadPtr;
 }
 
-std::list<std::string> FMAPayload::getAvailableInstructions() const {
-  std::list<std::string> instructions;
-
-  transform(this->instructionFlops.begin(), this->instructionFlops.end(),
-            back_inserter(instructions),
-            [](const auto &item) { return item.first; });
-
-  return instructions;
+void FMAPayload::init(double* MemoryAddr, uint64_t BufferSize) const { // forwards to X86Payload::initMemory
+  X86Payload::initMemory(MemoryAddr, BufferSize, 0.27948995982e-4, 0.27948995982e-4); // same constant twice
 }
 
-void FMAPayload::init(unsigned long long *memoryAddr,
-                      unsigned long long bufferSize) {
-  X86Payload::init(memoryAddr, bufferSize, 0.27948995982e-4, 0.27948995982e-4);
-}
+} // namespace firestarter::environment::x86::payload
\ No newline at end of file
diff --git a/src/firestarter/Environment/X86/Payload/SSE2Payload.cpp b/src/firestarter/Environment/X86/Payload/SSE2Payload.cpp
index d22880d1..fc77c8e1 100644
--- a/src/firestarter/Environment/X86/Payload/SSE2Payload.cpp
+++ b/src/firestarter/Environment/X86/Payload/SSE2Payload.cpp
@@ -19,466 +19,394 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
-#include <firestarter/Environment/X86/Payload/SSE2Payload.hpp>
-#include <firestarter/Logging/Log.hpp>
-
-#include <iterator>
-#include <utility>
-
-using namespace firestarter::environment::x86::payload;
-using namespace asmjit;
-using namespace asmjit::x86;
-
-int SSE2Payload::compilePayload(
-    std::vector<std::pair<std::string, unsigned>> const &proportion,
-    unsigned instructionCacheSize,
-    std::list<unsigned> const &dataCacheBufferSize, unsigned ramBufferSize,
-    unsigned thread, unsigned numberOfLines, bool dumpRegisters,
-    bool errorDetection) {
+#include "firestarter/Environment/X86/Payload/SSE2Payload.hpp"
+#include "firestarter/Environment/X86/Payload/CompiledX86Payload.hpp"
+
+namespace firestarter::environment::x86::payload {
+
+auto SSE2Payload::compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters,
+                                 bool ErrorDetection) const -> environment::payload::CompiledPayload::UniquePtr {
+  using Imm = asmjit::Imm;
+  using Mm = asmjit::x86::Mm;
+  using Xmm = asmjit::x86::Xmm;
+  // NOLINTNEXTLINE(readability-identifier-naming)
+  constexpr asmjit::x86::Mem (*xmmword_ptr)(const asmjit::x86::Gp&, int32_t) = asmjit::x86::xmmword_ptr;
+
   // Compute the sequence of instruction groups and the number of its repetions
   // to reach the desired size
-  auto sequence = this->generateSequence(proportion);
-  auto repetitions =
-      this->getNumberOfSequenceRepetitions(sequence, numberOfLines / thread);
+  auto Sequence = Settings.sequence();
+  auto Repetitions =
+      environment::payload::PayloadSettings::getNumberOfSequenceRepetitions(Sequence, Settings.linesPerThread());
 
   // compute count of flops and memory access for performance report
-  unsigned flops = 0;
-  unsigned bytes = 0;
+  environment::payload::PayloadStats Stats;
 
-  for (const auto &item : sequence) {
-    auto it = this->instructionFlops.find(item);
+  for (const auto& Item : Sequence) {
+    auto It = instructionFlops().find(Item);
 
-    if (it == this->instructionFlops.end()) {
-      workerLog::error() << "Instruction group " << item << " undefined in "
-                         << name() << ".";
-      return EXIT_FAILURE;
+    if (It == instructionFlops().end()) {
+      workerLog::error() << "Instruction group " << Item << " undefined in " << name() << ".";
     }
 
-    flops += it->second;
+    Stats.Flops += It->second;
 
-    it = this->instructionMemory.find(item);
+    It = instructionMemory().find(Item);
 
-    if (it != this->instructionMemory.end()) {
-      bytes += it->second;
+    if (It != instructionMemory().end()) {
+      Stats.Bytes += It->second;
     }
   }
 
-  this->_flops = repetitions * flops;
-  this->_bytes = repetitions * bytes;
-  this->_instructions = repetitions * sequence.size() * 2 + 4;
+  Stats.Flops *= Repetitions;
+  Stats.Bytes *= Repetitions;
+  Stats.Instructions = Repetitions * Sequence.size() * 2 + 4;
 
   // calculate the buffer sizes
-  auto l1i_cache_size = instructionCacheSize / thread;
-  auto dataCacheBufferSizeIterator = dataCacheBufferSize.begin();
-  auto l1_size = *dataCacheBufferSizeIterator / thread;
-  std::advance(dataCacheBufferSizeIterator, 1);
-  auto l2_size = *dataCacheBufferSizeIterator / thread;
-  std::advance(dataCacheBufferSizeIterator, 1);
-  auto l3_size = *dataCacheBufferSizeIterator / thread;
-  auto ram_size = ramBufferSize / thread;
+  const auto L1iCacheSize = Settings.instructionCacheSizePerThread();
+  const auto DataCacheBufferSizes = Settings.dataCacheBufferSizePerThread();
+  auto DataCacheBufferSizeIterator = DataCacheBufferSizes.begin();
+  const auto L1Size = *DataCacheBufferSizeIterator;
+  std::advance(DataCacheBufferSizeIterator, 1);
+  const auto L2Size = *DataCacheBufferSizeIterator;
+  std::advance(DataCacheBufferSizeIterator, 1);
+  const auto L3Size = *DataCacheBufferSizeIterator;
+  const auto RamSize = Settings.ramBufferSizePerThread();
 
   // calculate the reset counters for the buffers
-  auto l2_loop_count =
-      getL2LoopCount(sequence, numberOfLines, l2_size * thread, thread);
-  auto l3_loop_count =
-      getL3LoopCount(sequence, numberOfLines, l3_size * thread, thread);
-  auto ram_loop_count =
-      getRAMLoopCount(sequence, numberOfLines, ram_size * thread, thread);
-
-  CodeHolder code;
-  code.init(this->rt.environment());
-
-  if (nullptr != this->loadFunction) {
-    this->rt.release(&this->loadFunction);
-  }
-
-  Builder cb(&code);
-  cb.addDiagnosticOptions(
-    asmjit::DiagnosticOptions::kValidateAssembler | 
-    asmjit::DiagnosticOptions::kValidateIntermediate );
-
-  auto pointer_reg = rax;
-  auto l1_addr = rbx;
-  auto l2_addr = rcx;
-  auto l3_addr = rdx;
-  auto ram_addr = rdi;
-  auto l2_count_reg = r8;
-  auto l3_count_reg = r9;
-  auto ram_count_reg = r10;
-  auto temp_reg = r11;
-  auto temp_reg2 = rbp;
-  auto offset_reg = r12;
-  auto addrHigh_reg = r13;
-  auto iter_reg = r14;
-  auto mov_regs = 0;
-  auto add_regs = 14;
-  auto trans_regs = 2;
-
-  FuncDetail func;
-  func.init(FuncSignatureT<unsigned long long, unsigned long long *,
-                           volatile unsigned long long *, unsigned long long>(
-                CallConvId::kCDecl),
-            this->rt.environment());
-
-  FuncFrame frame;
-  frame.init(func);
+  const auto L2LoopCount =
+      environment::payload::PayloadSettings::getL2LoopCount(Sequence, Settings.linesPerThread(), L2Size);
+  const auto L3LoopCount =
+      environment::payload::PayloadSettings::getL3LoopCount(Sequence, Settings.linesPerThread(), L3Size);
+  const auto RamLoopCount =
+      environment::payload::PayloadSettings::getRAMLoopCount(Sequence, Settings.linesPerThread(), RamSize);
+
+  asmjit::CodeHolder Code;
+  Code.init(asmjit::Environment::host());
+
+  asmjit::x86::Builder Cb(&Code);
+  Cb.addDiagnosticOptions(asmjit::DiagnosticOptions::kValidateAssembler |
+                          asmjit::DiagnosticOptions::kValidateIntermediate);
+
+  const auto PointerReg = asmjit::x86::rax;
+  const auto L1Addr = asmjit::x86::rbx;
+  const auto L2Addr = asmjit::x86::rcx;
+  const auto L3Addr = asmjit::x86::rdx;
+  const auto RamAddr = asmjit::x86::rdi;
+  const auto L2CountReg = asmjit::x86::r8;
+  const auto L3CountReg = asmjit::x86::r9;
+  const auto RamCountReg = asmjit::x86::r10;
+  const auto TempReg = asmjit::x86::r11;
+  const auto TempReg2 = asmjit::x86::rbp;
+  const auto OffsetReg = asmjit::x86::r12;
+  const auto AddrHighReg = asmjit::x86::r13;
+  const auto IterReg = asmjit::x86::r14;
+  constexpr const auto MovRegs = 0;
+  const auto AddRegs = 14;
+  const auto TransRegs = 2;
+
+  asmjit::FuncDetail Func;
+  Func.init(asmjit::FuncSignature::build<uint64_t, double*, volatile LoadThreadWorkType*, uint64_t>(
+                asmjit::CallConvId::kCDecl),
+            Code.environment());
+
+  asmjit::FuncFrame Frame;
+  Frame.init(Func);
 
   // make xmm registers dirty
-  for (int i = 0; i < 16; i++) {
-    frame.addDirtyRegs(Xmm(i));
+  for (auto I = 0U; I < 16U; I++) {
+    Frame.addDirtyRegs(Xmm(I));
   }
   // make mmx registers dirty
-  for (int i = 0; i < 8; i++) {
-    frame.addDirtyRegs(Mm(i));
+  for (auto I = 0U; I < 8U; I++) {
+    Frame.addDirtyRegs(Mm(I));
   }
   // make all other used registers dirty except RAX
-  frame.addDirtyRegs(l1_addr, l2_addr, l3_addr, ram_addr, l2_count_reg,
-                     l3_count_reg, ram_count_reg, temp_reg, temp_reg2,
-                     offset_reg, addrHigh_reg, iter_reg);
+  Frame.addDirtyRegs(L1Addr, L2Addr, L3Addr, RamAddr, L2CountReg, L3CountReg, RamCountReg, TempReg, TempReg2, OffsetReg,
+                     AddrHighReg, IterReg);
 
-  FuncArgsAssignment args(&func);
-  args.assignAll(pointer_reg, addrHigh_reg, iter_reg);
-  args.updateFuncFrame(frame);
-  frame.finalize();
+  asmjit::FuncArgsAssignment Args(&Func);
+  Args.assignAll(PointerReg, AddrHighReg, IterReg);
+  Args.updateFuncFrame(Frame);
+  Frame.finalize();
 
-  cb.emitProlog(frame);
-  cb.emitArgsAssignment(frame, args);
+  Cb.emitProlog(Frame);
+  Cb.emitArgsAssignment(Frame, Args);
 
   // stop right away if low load is selected
-  auto FunctionExit = cb.newLabel();
+  auto FunctionExit = Cb.newLabel();
 
-  cb.mov(temp_reg, ptr_64(addrHigh_reg));
-  cb.test(temp_reg, temp_reg);
-  cb.jz(FunctionExit);
+  Cb.mov(TempReg, ptr_64(AddrHighReg));
+  Cb.test(TempReg, TempReg);
+  Cb.jz(FunctionExit);
 
-  cb.mov(offset_reg,
+  Cb.mov(OffsetReg,
          Imm(64)); // increment after each cache/memory access
 
   // Initialize SSE-Registers for Addition
-  auto add_start = 0;
-  auto add_end = add_regs - 1;
-  auto trans_start = add_regs;
-  auto trans_end = add_regs + trans_regs - 1;
-  if (add_regs > 0) {
-    for (int i = add_start; i <= add_end; i++) {
-      cb.movapd(Xmm(i), xmmword_ptr(pointer_reg, 32 * i));
+  const auto AddStart = 0;
+  const auto AddEnd = AddRegs - 1;
+  const auto TransStart = AddRegs;
+  const auto TransEnd = AddRegs + TransRegs - 1;
+  if (AddRegs > 0) {
+    for (auto I = AddStart; I <= AddEnd; I++) {
+      Cb.movapd(Xmm(I), xmmword_ptr(PointerReg, 32 * I));
     }
   }
 
   // Initialize MMX-Registers for shift operations
-  auto mov_start = 0;
-  auto mov_end = mov_regs - 1;
-  if (mov_regs > 0) {
-    cb.mov(temp_reg, Imm(0x5555555555555555));
-    cb.movq(Mm(mov_start), temp_reg);
-    for (int i = mov_start + 1; i <= mov_end; i++) {
-      cb.movq(Mm(i), Mm(mov_start));
+  const auto MovStart = 0;
+  const auto MovEnd = MovRegs - 1;
+  if (MovRegs > 0) {
+    Cb.mov(TempReg, Imm(0x5555555555555555));
+    Cb.movq(Mm(MovStart), TempReg);
+    for (auto I = MovStart + 1; I <= MovEnd; I++) {
+      Cb.movq(Mm(I), Mm(MovStart));
     }
   }
 
   // Initialize SSE-Registers for Transfer-Operations
-  if (trans_regs > 0) {
-    if (trans_start % 2 == 0) {
-      cb.mov(temp_reg, Imm(0x0F0F0F0F0F0F0F0F));
+  if (TransRegs > 0) {
+    if (TransStart % 2 == 0) {
+      Cb.mov(TempReg, Imm(0x0F0F0F0F0F0F0F0F));
     } else {
-      cb.mov(temp_reg, Imm(0xF0F0F0F0F0F0F0F0));
+      Cb.mov(TempReg, Imm(0xF0F0F0F0F0F0F0F0));
     }
-    cb.pinsrq(Xmm(trans_start), temp_reg, Imm(0));
-    cb.pinsrq(Xmm(trans_start), temp_reg, Imm(1));
-    for (int i = trans_start + 1; i <= trans_end; i++) {
-      if (i % 2 == 0) {
-        cb.shr(temp_reg, Imm(4));
+    Cb.pinsrq(Xmm(TransStart), TempReg, Imm(0));
+    Cb.pinsrq(Xmm(TransStart), TempReg, Imm(1));
+    for (auto I = TransStart + 1; I <= TransEnd; I++) {
+      if (I % 2 == 0) {
+        Cb.shr(TempReg, Imm(4));
       } else {
-        cb.shl(temp_reg, Imm(4));
+        Cb.shl(TempReg, Imm(4));
       }
-      cb.pinsrq(Xmm(i), temp_reg, Imm(0));
-      cb.pinsrq(Xmm(i), temp_reg, Imm(1));
+      Cb.pinsrq(Xmm(I), TempReg, Imm(0));
+      Cb.pinsrq(Xmm(I), TempReg, Imm(1));
     }
   }
 
-  cb.mov(l1_addr, pointer_reg); // address for L1-buffer
-  cb.mov(l2_addr, pointer_reg);
-  cb.add(l2_addr, Imm(l1_size)); // address for L2-buffer
-  cb.mov(l3_addr, pointer_reg);
-  cb.add(l3_addr, Imm(l2_size)); // address for L3-buffer
-  cb.mov(ram_addr, pointer_reg);
-  cb.add(ram_addr, Imm(l3_size)); // address for RAM-buffer
-  cb.mov(l2_count_reg, Imm(l2_loop_count));
-  workerLog::trace() << "reset counter for L2-buffer with "
-                     << l2_loop_count
-                     << " cache line accesses per loop ("
-		     << l2_size/1024
-                     << ") KiB";
-  cb.mov(l3_count_reg, Imm(l3_loop_count));
-  workerLog::trace() << "reset counter for L3-buffer with "
-                     << l3_loop_count
-                     << " cache line accesses per loop ("
-		     << l3_size/1024
-                     << ") KiB";
-  cb.mov(ram_count_reg, Imm(ram_loop_count));
-  workerLog::trace() << "reset counter for RAM-buffer with "
-                     << ram_loop_count
-                     << " cache line accesses per loop ("
-		     << ram_size/1024
-                     << ") KiB";
-
-  cb.align(AlignMode::kCode, 64);
-
-  auto Loop = cb.newLabel();
-  cb.bind(Loop);
-
-  auto movq_dst = mov_start;
-  auto add_dest = add_start + 1;
-  auto mov_dst = trans_start;
-  auto mov_src = mov_dst + 1;
-  unsigned l1_offset = 0;
-
-#define L1_INCREMENT()                                                         \
-  l1_offset += 64;                                                             \
-  if (l1_offset < l1_size * 0.5) {                                             \
-    cb.add(l1_addr, offset_reg);                                               \
-  } else {                                                                     \
-    l1_offset = 0;                                                             \
-    cb.mov(l1_addr, pointer_reg);                                              \
-  }
-
-#define L2_INCREMENT() cb.add(l2_addr, offset_reg);
-
-#define L3_INCREMENT() cb.add(l3_addr, offset_reg)
-
-#define RAM_INCREMENT() cb.add(ram_addr, offset_reg)
-
-  for (unsigned count = 0; count < repetitions; count++) {
-    for (const auto &item : sequence) {
-      if (item == "REG") {
-        cb.addpd(
-            Xmm(add_dest),
-            Xmm(add_start + (add_dest - add_start + add_regs + 1) % add_regs));
-        cb.movdqa(Xmm(mov_dst), Xmm(mov_src));
-      } else if (item == "L1_L") {
-        cb.addpd(Xmm(add_dest), xmmword_ptr(l1_addr, 32));
-        L1_INCREMENT();
-      } else if (item == "L1_S") {
-        cb.addpd(
-            Xmm(add_dest),
-            Xmm(add_start + (add_dest - add_start + add_regs - 1) % add_regs));
-        cb.movapd(xmmword_ptr(l1_addr, 32), Xmm(add_dest));
-        L1_INCREMENT();
-        this->_instructions++;
-      } else if (item == "L1_LS") {
-        cb.addpd(Xmm(add_dest), xmmword_ptr(l1_addr, 32));
-        cb.movapd(xmmword_ptr(l1_addr, 64), Xmm(add_dest));
-        L1_INCREMENT();
-        this->_instructions++;
-      } else if (item == "L2_L") {
-        cb.addpd(Xmm(add_dest), xmmword_ptr(l2_addr, 64));
-        L2_INCREMENT();
-      } else if (item == "L2_S") {
-        cb.addpd(
-            Xmm(add_dest),
-            Xmm(add_start + (add_dest - add_start + add_regs - 1) % add_regs));
-        cb.movapd(xmmword_ptr(l2_addr, 64), Xmm(add_dest));
-        L2_INCREMENT();
-        this->_instructions++;
-      } else if (item == "L2_LS") {
-        cb.addpd(Xmm(add_dest), xmmword_ptr(l2_addr, 64));
-        cb.movapd(xmmword_ptr(l2_addr, 96), Xmm(add_dest));
-        L2_INCREMENT();
-        this->_instructions++;
-      } else if (item == "L3_L") {
-        cb.addpd(Xmm(add_dest), xmmword_ptr(l3_addr, 64));
-        L3_INCREMENT();
-      } else if (item == "L3_S") {
-        cb.addpd(
-            Xmm(add_dest),
-            Xmm(add_start + (add_dest - add_start + add_regs - 1) % add_regs));
-        cb.movapd(xmmword_ptr(l3_addr, 96), Xmm(add_dest));
-        L3_INCREMENT();
-        this->_instructions++;
-      } else if (item == "L3_LS") {
-        cb.addpd(Xmm(add_dest), xmmword_ptr(l3_addr, 64));
-        cb.movapd(xmmword_ptr(l3_addr, 96), Xmm(add_dest));
-        L3_INCREMENT();
-        this->_instructions++;
-      } else if (item == "L3_P") {
-        cb.addpd(Xmm(add_dest), xmmword_ptr(l1_addr, 32));
-        cb.prefetcht0(ptr(l3_addr));
-        L3_INCREMENT();
-        this->_instructions++;
-      } else if (item == "RAM_L") {
-        cb.addpd(Xmm(add_dest), xmmword_ptr(ram_addr, 64));
-        RAM_INCREMENT();
-      } else if (item == "RAM_S") {
-        cb.addpd(
-            Xmm(add_dest),
-            Xmm(add_start + (add_dest - add_start + add_regs - 1) % add_regs));
-        cb.movapd(xmmword_ptr(ram_addr, 64), Xmm(add_dest));
-        RAM_INCREMENT();
-        this->_instructions++;
-      } else if (item == "RAM_LS") {
-        cb.addpd(Xmm(add_dest), xmmword_ptr(l3_addr, 64));
-        cb.movapd(xmmword_ptr(ram_addr, 64), Xmm(add_dest));
-        RAM_INCREMENT();
-        this->_instructions++;
-      } else if (item == "RAM_P") {
-        cb.addpd(Xmm(add_dest), xmmword_ptr(l1_addr, 32));
-        cb.prefetcht2(ptr(ram_addr));
-        RAM_INCREMENT();
-        this->_instructions++;
+  Cb.mov(L1Addr, PointerReg); // address for L1-buffer
+  Cb.mov(L2Addr, PointerReg);
+  Cb.add(L2Addr, Imm(L1Size)); // address for L2-buffer
+  Cb.mov(L3Addr, PointerReg);
+  Cb.add(L3Addr, Imm(L2Size)); // address for L3-buffer
+  Cb.mov(RamAddr, PointerReg);
+  Cb.add(RamAddr, Imm(L3Size)); // address for RAM-buffer
+  Cb.mov(L2CountReg, Imm(L2LoopCount));
+  workerLog::trace() << "reset counter for L2-buffer with " << L2LoopCount << " cache line accesses per loop ("
+                     << L2Size / 1024 << ") KiB";
+  Cb.mov(L3CountReg, Imm(L3LoopCount));
+  workerLog::trace() << "reset counter for L3-buffer with " << L3LoopCount << " cache line accesses per loop ("
+                     << L3Size / 1024 << ") KiB";
+  Cb.mov(RamCountReg, Imm(RamLoopCount));
+  workerLog::trace() << "reset counter for RAM-buffer with " << RamLoopCount << " cache line accesses per loop ("
+                     << RamSize / 1024 << ") KiB";
+
+  Cb.align(asmjit::AlignMode::kCode, 64);
+
+  auto Loop = Cb.newLabel();
+  Cb.bind(Loop);
+
+  auto MovqDest = MovStart;
+  auto AddDest = AddStart + 1;
+  auto MovDest = TransStart;
+  auto MovSrc = MovDest + 1;
+  unsigned L1Offset = 0;
+
+  const auto L1Increment = [&Cb, &L1Offset, &L1Size, &L1Addr, &OffsetReg, &PointerReg]() {
+    L1Offset += 64;
+    if (L1Offset < L1Size * 0.5) {
+      Cb.add(L1Addr, OffsetReg);
+    } else {
+      L1Offset = 0;
+      Cb.mov(L1Addr, PointerReg);
+    }
+  };
+  const auto L2Increment = [&Cb, &L2Addr, &OffsetReg]() { Cb.add(L2Addr, OffsetReg); };
+  const auto L3Increment = [&Cb, &L3Addr, &OffsetReg]() { Cb.add(L3Addr, OffsetReg); };
+  const auto RamIncrement = [&Cb, &RamAddr, &OffsetReg]() { Cb.add(RamAddr, OffsetReg); };
+
+  for (auto Count = 0U; Count < Repetitions; Count++) {
+    for (const auto& Item : Sequence) {
+      if (Item == "REG") {
+        Cb.addpd(Xmm(AddDest), Xmm(AddStart + ((AddDest - AddStart + AddRegs + 1) % AddRegs)));
+        Cb.movdqa(Xmm(MovDest), Xmm(MovSrc));
+      } else if (Item == "L1_L") {
+        Cb.addpd(Xmm(AddDest), xmmword_ptr(L1Addr, 32));
+        L1Increment();
+      } else if (Item == "L1_S") {
+        Cb.addpd(Xmm(AddDest), Xmm(AddStart + ((AddDest - AddStart + AddRegs - 1) % AddRegs)));
+        Cb.movapd(xmmword_ptr(L1Addr, 32), Xmm(AddDest));
+        L1Increment();
+        Stats.Instructions++;
+      } else if (Item == "L1_LS") {
+        Cb.addpd(Xmm(AddDest), xmmword_ptr(L1Addr, 32));
+        Cb.movapd(xmmword_ptr(L1Addr, 64), Xmm(AddDest));
+        L1Increment();
+        Stats.Instructions++;
+      } else if (Item == "L2_L") {
+        Cb.addpd(Xmm(AddDest), xmmword_ptr(L2Addr, 64));
+        L2Increment();
+      } else if (Item == "L2_S") {
+        Cb.addpd(Xmm(AddDest), Xmm(AddStart + ((AddDest - AddStart + AddRegs - 1) % AddRegs)));
+        Cb.movapd(xmmword_ptr(L2Addr, 64), Xmm(AddDest));
+        L2Increment();
+        Stats.Instructions++;
+      } else if (Item == "L2_LS") {
+        Cb.addpd(Xmm(AddDest), xmmword_ptr(L2Addr, 64));
+        Cb.movapd(xmmword_ptr(L2Addr, 96), Xmm(AddDest));
+        L2Increment();
+        Stats.Instructions++;
+      } else if (Item == "L3_L") {
+        Cb.addpd(Xmm(AddDest), xmmword_ptr(L3Addr, 64));
+        L3Increment();
+      } else if (Item == "L3_S") {
+        Cb.addpd(Xmm(AddDest), Xmm(AddStart + ((AddDest - AddStart + AddRegs - 1) % AddRegs)));
+        Cb.movapd(xmmword_ptr(L3Addr, 96), Xmm(AddDest));
+        L3Increment();
+        Stats.Instructions++;
+      } else if (Item == "L3_LS") {
+        Cb.addpd(Xmm(AddDest), xmmword_ptr(L3Addr, 64));
+        Cb.movapd(xmmword_ptr(L3Addr, 96), Xmm(AddDest));
+        L3Increment();
+        Stats.Instructions++;
+      } else if (Item == "L3_P") {
+        Cb.addpd(Xmm(AddDest), xmmword_ptr(L1Addr, 32));
+        Cb.prefetcht0(ptr(L3Addr));
+        L3Increment();
+        Stats.Instructions++;
+      } else if (Item == "RAM_L") {
+        Cb.addpd(Xmm(AddDest), xmmword_ptr(RamAddr, 64));
+        RamIncrement();
+      } else if (Item == "RAM_S") {
+        Cb.addpd(Xmm(AddDest), Xmm(AddStart + ((AddDest - AddStart + AddRegs - 1) % AddRegs)));
+        Cb.movapd(xmmword_ptr(RamAddr, 64), Xmm(AddDest));
+        RamIncrement();
+        Stats.Instructions++;
+      } else if (Item == "RAM_LS") {
+        Cb.addpd(Xmm(AddDest), xmmword_ptr(L3Addr, 64));
+        Cb.movapd(xmmword_ptr(RamAddr, 64), Xmm(AddDest));
+        RamIncrement();
+        Stats.Instructions++;
+      } else if (Item == "RAM_P") {
+        Cb.addpd(Xmm(AddDest), xmmword_ptr(L1Addr, 32));
+        Cb.prefetcht2(ptr(RamAddr));
+        RamIncrement();
+        Stats.Instructions++;
       } else {
-        workerLog::error() << "Instruction group " << item << " not found in "
-                           << this->name() << ".";
-        return EXIT_FAILURE;
+        workerLog::error() << "Instruction group " << Item << " not found in " << name() << ".";
       }
 
-      if (mov_regs > 0) {
-        this->_instructions++;
-        cb.movq(
-            Mm(mov_start + (movq_dst - mov_start + mov_regs - 1) % mov_regs),
-            Mm(movq_dst));
+      if constexpr (MovRegs > 0) {
+        Stats.Instructions++;
+        Cb.movq(Mm(MovStart + ((MovqDest - MovStart + MovRegs - 1) % MovRegs)), Mm(MovqDest));
       }
 
-      add_dest++;
-      if (add_dest > add_end) {
+      AddDest++;
+      if (AddDest > AddEnd) {
         // DO NOT REMOVE the + 1. It serves for the good of ymm0. If it was to
         // be overriden, the values in the other registers would rise up to inf.
-        add_dest = add_start + 1;
+        AddDest = AddStart + 1;
       }
-      mov_dst++;
-      if (mov_dst > trans_end) {
-        mov_dst = trans_start;
+      MovDest++;
+      if (MovDest > TransEnd) {
+        MovDest = TransStart;
       }
-      mov_src++;
-      if (mov_src > trans_end) {
-        mov_src = trans_start;
+      MovSrc++;
+      if (MovSrc > TransEnd) {
+        MovSrc = TransStart;
       }
-      if (mov_regs > 0) {
-        movq_dst++;
-        if (movq_dst > mov_end) {
-          movq_dst = mov_start;
+      if (MovRegs > 0) {
+        MovqDest++;
+        if (MovqDest > MovEnd) {
+          MovqDest = MovStart;
         }
       }
     }
   }
 
-  if (this->getRAMSequenceCount(sequence) > 0) {
+  if (environment::payload::PayloadSettings::getRAMSequenceCount(Sequence) > 0) {
     // reset RAM counter
-    auto NoRamReset = cb.newLabel();
-
-    cb.sub(ram_count_reg, Imm(1));
-    cb.jnz(NoRamReset);
-    cb.mov(ram_count_reg, Imm(ram_loop_count));
-    cb.mov(ram_addr, pointer_reg);
-    cb.add(ram_addr, Imm(l3_size));
-    cb.bind(NoRamReset);
+    auto NoRamReset = Cb.newLabel();
+
+    Cb.sub(RamCountReg, Imm(1));
+    Cb.jnz(NoRamReset);
+    Cb.mov(RamCountReg, Imm(RamLoopCount));
+    Cb.mov(RamAddr, PointerReg);
+    Cb.add(RamAddr, Imm(L3Size));
+    Cb.bind(NoRamReset);
     // adds always two instruction
-    this->_instructions += 2;
+    Stats.Instructions += 2;
   }
-  if (this->getL2SequenceCount(sequence) > 0) {
+  if (environment::payload::PayloadSettings::getL2SequenceCount(Sequence) > 0) {
     // reset L2-Cache counter
-    auto NoL2Reset = cb.newLabel();
-
-    cb.sub(l2_count_reg, Imm(1));
-    cb.jnz(NoL2Reset);
-    cb.mov(l2_count_reg, Imm(l2_loop_count));
-    cb.mov(l2_addr, pointer_reg);
-    cb.add(l2_addr, Imm(l1_size));
-    cb.bind(NoL2Reset);
+    auto NoL2Reset = Cb.newLabel();
+
+    Cb.sub(L2CountReg, Imm(1));
+    Cb.jnz(NoL2Reset);
+    Cb.mov(L2CountReg, Imm(L2LoopCount));
+    Cb.mov(L2Addr, PointerReg);
+    Cb.add(L2Addr, Imm(L1Size));
+    Cb.bind(NoL2Reset);
     // adds always two instruction
-    this->_instructions += 2;
+    Stats.Instructions += 2;
   }
-  if (this->getL3SequenceCount(sequence) > 0) {
+  if (environment::payload::PayloadSettings::getL3SequenceCount(Sequence) > 0) {
     // reset L3-Cache counter
-    auto NoL3Reset = cb.newLabel();
-
-    cb.sub(l3_count_reg, Imm(1));
-    cb.jnz(NoL3Reset);
-    cb.mov(l3_count_reg, Imm(l3_loop_count));
-    cb.mov(l3_addr, pointer_reg);
-    cb.add(l3_addr, Imm(l2_size));
-    cb.bind(NoL3Reset);
+    auto NoL3Reset = Cb.newLabel();
+
+    Cb.sub(L3CountReg, Imm(1));
+    Cb.jnz(NoL3Reset);
+    Cb.mov(L3CountReg, Imm(L3LoopCount));
+    Cb.mov(L3Addr, PointerReg);
+    Cb.add(L3Addr, Imm(L2Size));
+    Cb.bind(NoL3Reset);
     // adds always two instruction
-    this->_instructions += 2;
+    Stats.Instructions += 2;
   }
-  cb.inc(iter_reg); // increment iteration counter
-  cb.mov(l1_addr, pointer_reg);
-
-  if (dumpRegisters) {
-    auto SkipRegistersDump = cb.newLabel();
+  Cb.inc(IterReg); // increment iteration counter
+  Cb.mov(L1Addr, PointerReg);
 
-    cb.test(ptr_64(pointer_reg, -8), Imm(firestarter::DumpVariable::Wait));
-    cb.jnz(SkipRegistersDump);
-
-    // dump all the xmm register
-    for (int i = 0; i < (int)this->registerCount(); i++) {
-      cb.movapd(
-          xmmword_ptr(pointer_reg, -64 - this->registerSize() * 8 * (i + 1)),
-          Xmm(i));
-    }
-
-    // set read flag
-    cb.mov(ptr_64(pointer_reg, -8), Imm(firestarter::DumpVariable::Wait));
-
-    cb.bind(SkipRegistersDump);
+  if (DumpRegisters) {
+    emitDumpRegisterCode<Xmm>(Cb, PointerReg, xmmword_ptr);
   }
 
-  if (errorDetection) {
-    this->emitErrorDetectionCode<decltype(iter_reg), Xmm>(
-        cb, iter_reg, addrHigh_reg, pointer_reg, temp_reg, temp_reg2);
+  if (ErrorDetection) {
+    emitErrorDetectionCode<decltype(IterReg), Xmm>(Cb, IterReg, AddrHighReg, PointerReg, TempReg, TempReg2);
   }
 
-  cb.test(ptr_64(addrHigh_reg), Imm(LOAD_HIGH));
-  cb.jnz(Loop);
+  Cb.test(ptr_64(AddrHighReg), Imm(LoadThreadWorkType::LoadHigh));
+  Cb.jnz(Loop);
 
-  cb.bind(FunctionExit);
+  Cb.bind(FunctionExit);
 
-  cb.mov(rax, iter_reg); // restore iteration counter
+  Cb.mov(asmjit::x86::rax, IterReg); // restore iteration counter
 
-  cb.emitEpilog(frame);
+  Cb.emitEpilog(Frame);
 
-  cb.finalize();
+  Cb.finalize();
 
-  // String sb;
-  // cb.dump(sb);
-
-  Error err = this->rt.add(&this->loadFunction, &code);
-  if (err) {
-    workerLog::error() << "Asmjit adding Assembler to JitRuntime failed in "
-                       << __FILE__ << " at " << __LINE__;
-    return EXIT_FAILURE;
-  }
+  auto CompiledPayloadPtr = CompiledX86Payload::create<SSE2Payload>(Stats, Code);
 
   // skip if we could not determine cache size
-  if (l1i_cache_size != 0) {
-    auto loopSize = code.labelOffset(FunctionExit) - code.labelOffset(Loop);
-    auto instructionCachePercentage = 100 * loopSize / l1i_cache_size;
+  if (L1iCacheSize) {
+    auto LoopSize = Code.labelOffset(FunctionExit) - Code.labelOffset(Loop);
+    auto InstructionCachePercentage = 100 * LoopSize / *L1iCacheSize;
 
-    if (loopSize > l1i_cache_size) {
+    if (LoopSize > *L1iCacheSize) {
       workerLog::warn() << "Work-loop is bigger than the L1i-Cache.";
     }
 
-    workerLog::trace() << "Using " << loopSize << " of " << l1i_cache_size
-                       << " Bytes (" << instructionCachePercentage
+    workerLog::trace() << "Using " << LoopSize << " of " << *L1iCacheSize << " Bytes (" << InstructionCachePercentage
                        << "%) from the L1i-Cache for the work-loop.";
-    workerLog::trace() << "Sequence size: " << sequence.size();
-    workerLog::trace() << "Repetition count: " << repetitions;
+    workerLog::trace() << "Sequence size: " << Sequence.size();
+    workerLog::trace() << "Repetition count: " << Repetitions;
   }
 
-  return EXIT_SUCCESS;
+  return CompiledPayloadPtr;
 }
 
-std::list<std::string> SSE2Payload::getAvailableInstructions() const {
-  std::list<std::string> instructions;
-
-  transform(this->instructionFlops.begin(), this->instructionFlops.end(),
-            back_inserter(instructions),
-            [](const auto &item) { return item.first; });
-
-  return instructions;
+void SSE2Payload::init(double* MemoryAddr, uint64_t BufferSize) const { // forwards to the shared X86 initializer
+  X86Payload::initMemory(MemoryAddr, BufferSize, /*FirstValue=*/1.654738925401e-10, /*LastValue=*/1.654738925401e-15);
+}
 
-void SSE2Payload::init(unsigned long long *memoryAddr,
-                       unsigned long long bufferSize) {
-  X86Payload::init(memoryAddr, bufferSize, 1.654738925401e-10,
-                   1.654738925401e-15);
-}
+} // namespace firestarter::environment::x86::payload
\ No newline at end of file
diff --git a/src/firestarter/Environment/X86/Payload/X86Payload.cpp b/src/firestarter/Environment/X86/Payload/X86Payload.cpp
index 42a2fa5b..296d1052 100644
--- a/src/firestarter/Environment/X86/Payload/X86Payload.cpp
+++ b/src/firestarter/Environment/X86/Payload/X86Payload.cpp
@@ -19,468 +19,76 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
+#include "firestarter/Environment/X86/Payload/X86Payload.hpp"
+#include "firestarter/Constants.hpp"
+#include "firestarter/WindowsCompat.hpp"
+
+#include <cassert>
 #include <chrono>
 #include <thread>
-#include <type_traits>
-
-#ifdef _MSC_VER
-#include <array>
-#include <intrin.h>
-#endif
-
-#include <firestarter/Environment/X86/Payload/X86Payload.hpp>
 
-using namespace firestarter::environment::x86::payload;
+namespace firestarter::environment::x86::payload {
 
-void X86Payload::lowLoadFunction(volatile unsigned long long *addrHigh,
-                                 unsigned long long period) {
-  int nap;
-#ifdef _MSC_VER
-  std::array<int, 4> cpuid;
-#endif
+void X86Payload::lowLoadFunction(volatile LoadThreadWorkType& LoadVar, std::chrono::microseconds Period) const {
+  auto Nap = Period / 100;
 
-  nap = period / 100;
-#ifndef _MSC_VER
-  __asm__ __volatile__("mfence;"
-                       "cpuid;" ::
-                           : "eax", "ebx", "ecx", "edx");
-#else
-  _mm_mfence();
-  __cpuid(cpuid.data(), 0);
-#endif
-  // while signal low load
-  while (*addrHigh == LOAD_LOW) {
-#ifndef _MSC_VER
-    __asm__ __volatile__("mfence;"
-                         "cpuid;" ::
-                             : "eax", "ebx", "ecx", "edx");
-#else
+  if constexpr (firestarter::OptionalFeatures.IsMsc) {
+    std::array<int, 4> Cpuid{};
     _mm_mfence();
-    __cpuid(cpuid.data(), 0);
-#endif
-    std::this_thread::sleep_for(std::chrono::microseconds(nap));
-#ifndef _MSC_VER
+    __cpuid(Cpuid.data(), 0);
+  } else {
     __asm__ __volatile__("mfence;"
                          "cpuid;" ::
                              : "eax", "ebx", "ecx", "edx");
-#else
-    _mm_mfence();
-    __cpuid(cpuid.data(), 0);
-#endif
-  }
-}
-
-void X86Payload::init(unsigned long long *memoryAddr,
-                      unsigned long long bufferSize, double firstValue,
-                      double lastValue) {
-  unsigned long long i = 0;
-
-  for (; i < INIT_BLOCKSIZE; i++)
-    *((double *)(memoryAddr + i)) = 0.25 + (double)i * 8.0 * firstValue;
-  for (; i <= bufferSize - INIT_BLOCKSIZE; i += INIT_BLOCKSIZE)
-    std::memcpy(memoryAddr + i, memoryAddr + i - INIT_BLOCKSIZE,
-                sizeof(unsigned long long) * INIT_BLOCKSIZE);
-  for (; i < bufferSize; i++)
-    *((double *)(memoryAddr + i)) = 0.25 + (double)i * 8.0 * lastValue;
-}
-
-unsigned long long
-X86Payload::highLoadFunction(unsigned long long *addrMem,
-                             volatile unsigned long long *addrHigh,
-                             unsigned long long iterations) {
-  return this->loadFunction(addrMem, addrHigh, iterations);
-}
-
-// add MM regs to dirty regs
-// zmm31 is used for backup if VectorReg is of type asmjit::x86::Zmm
-template <class IterReg, class VectorReg>
-void X86Payload::emitErrorDetectionCode(asmjit::x86::Builder &cb,
-                                        IterReg iter_reg,
-                                        asmjit::x86::Gpq addrHigh_reg,
-                                        asmjit::x86::Gpq pointer_reg,
-                                        asmjit::x86::Gpq temp_reg,
-                                        asmjit::x86::Gpq temp_reg2) {
-  // we don't want anything to break... so we use asserts for everything that
-  // could break it
-  static_assert(std::is_base_of<asmjit::x86::Vec, VectorReg>::value,
-                "VectorReg must be of asmjit::asmjit::x86::Vec");
-  static_assert(std::is_same<asmjit::x86::Xmm, VectorReg>::value ||
-                    std::is_same<asmjit::x86::Ymm, VectorReg>::value ||
-                    std::is_same<asmjit::x86::Zmm, VectorReg>::value,
-                "VectorReg ist not of any supported type");
-  static_assert(std::is_same<asmjit::x86::Mm, IterReg>::value ||
-                    std::is_same<asmjit::x86::Gpq, IterReg>::value,
-                "IterReg is not of any supported type");
-
-  if constexpr (std::is_same<asmjit::x86::Mm, IterReg>::value) {
-    assert((iter_reg == asmjit::x86::mm0, "iter_reg must be mm0"));
-  }
-
-  assert((iter_reg != temp_reg, "iter_reg must be != temp_reg"));
-  assert((temp_reg != temp_reg2, "temp_reg must be != temp_reg2"));
-  assert((temp_reg != addrHigh_reg, "temp_reg must be != addrHigh_reg"));
-  assert((temp_reg != pointer_reg, "temp_reg must be != pointer_reg"));
-
-  assert((iter_reg != asmjit::x86::r8, "iter_reg must be != r8"));
-  assert((iter_reg != asmjit::x86::r9, "iter_reg must be != r9"));
-  assert((iter_reg != asmjit::x86::rax, "iter_reg must be != rax"));
-  assert((iter_reg != asmjit::x86::rbx, "iter_reg must be != rbx"));
-  assert((iter_reg != asmjit::x86::rcx, "iter_reg must be != rcx"));
-  assert((iter_reg != asmjit::x86::rdx, "iter_reg must be != rdx"));
-
-  assert((temp_reg != asmjit::x86::r8, "temp_reg must be != r8"));
-  assert((temp_reg != asmjit::x86::r9, "temp_reg must be != r9"));
-  assert((temp_reg != asmjit::x86::rax, "temp_reg must be != rax"));
-  assert((temp_reg != asmjit::x86::rbx, "temp_reg must be != rbx"));
-  assert((temp_reg != asmjit::x86::rcx, "temp_reg must be != rcx"));
-  assert((temp_reg != asmjit::x86::rdx, "temp_reg must be != rdx"));
-
-  assert((temp_reg2 != asmjit::x86::r8, "temp_reg2 must be != r8"));
-  assert((temp_reg2 != asmjit::x86::r9, "temp_reg2 must be != r9"));
-  assert((temp_reg2 != asmjit::x86::rax, "temp_reg2 must be != rax"));
-  assert((temp_reg2 != asmjit::x86::rbx, "temp_reg2 must be != rbx"));
-  assert((temp_reg2 != asmjit::x86::rcx, "temp_reg2 must be != rcx"));
-  assert((temp_reg2 != asmjit::x86::rdx, "temp_reg2 must be != rdx"));
-
-  assert((addrHigh_reg != asmjit::x86::r8, "addrHigh_reg must be != r8"));
-  assert((addrHigh_reg != asmjit::x86::r9, "addrHigh_reg must be != r9"));
-  assert((addrHigh_reg != asmjit::x86::rax, "addrHigh_reg must be != rax"));
-  assert((addrHigh_reg != asmjit::x86::rbx, "addrHigh_reg must be != rbx"));
-  assert((addrHigh_reg != asmjit::x86::rcx, "addrHigh_reg must be != rcx"));
-  assert((addrHigh_reg != asmjit::x86::rdx, "addrHigh_reg must be != rdx"));
-
-  auto SkipErrorDetection = cb.newLabel();
-
-  if constexpr (std::is_same<asmjit::x86::Mm, IterReg>::value) {
-    cb.movq(temp_reg, iter_reg);
-  } else {
-    cb.mov(temp_reg, iter_reg);
-  }
-  // round about 50-100 Hz
-  // more or less, but this isn't really that relevant
-  cb.and_(temp_reg, asmjit::Imm(0x3fff));
-  cb.test(temp_reg, temp_reg);
-  cb.jnz(SkipErrorDetection);
-
-  cb.mov(temp_reg, asmjit::Imm(0xffffffff));
-
-  int registerCount = (int)this->registerCount();
-
-  // Create a backup of VectorReg(0)
-  if constexpr (std::is_same<asmjit::x86::Xmm, VectorReg>::value) {
-    cb.movq(temp_reg2, asmjit::x86::xmm0);
-    cb.push(temp_reg2);
-    cb.crc32(temp_reg, temp_reg2);
-    cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0);
-    cb.movq(temp_reg2, asmjit::x86::xmm0);
-    cb.push(temp_reg2);
-    cb.crc32(temp_reg, temp_reg2);
-
-  } else if constexpr (std::is_same<asmjit::x86::Ymm, VectorReg>::value &&
-                       std::is_same<asmjit::x86::Mm, IterReg>::value) {
-    cb.movq(temp_reg2, asmjit::x86::xmm0);
-    cb.movq(asmjit::x86::Mm(7), temp_reg2);
-    cb.crc32(temp_reg, temp_reg2);
-    cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0);
-    cb.movq(temp_reg2, asmjit::x86::xmm0);
-    cb.movq(asmjit::x86::Mm(6), temp_reg2);
-    cb.crc32(temp_reg, temp_reg2);
-
-    cb.vextractf128(asmjit::x86::xmm0, asmjit::x86::ymm0, asmjit::Imm(1));
-
-    cb.movq(temp_reg2, asmjit::x86::xmm0);
-    cb.movq(asmjit::x86::Mm(5), temp_reg2);
-    cb.crc32(temp_reg, temp_reg2);
-    cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0);
-    cb.movq(temp_reg2, asmjit::x86::xmm0);
-    cb.movq(asmjit::x86::Mm(4), temp_reg2);
-    cb.crc32(temp_reg, temp_reg2);
-  } else if constexpr (std::is_same<asmjit::x86::Zmm, VectorReg>::value &&
-                       std::is_same<asmjit::x86::Mm, IterReg>::value) {
-    // We use vector registers zmm31 for our backup
-    cb.vmovapd(asmjit::x86::zmm31, asmjit::x86::zmm0);
-    registerCount--;
   }
 
-  // Calculate the hash of the remaining VectorReg
-  // use VectorReg(0) as a temporary place to unpack values
-  for (int i = 1; i < registerCount; i++) {
-    if constexpr (std::is_same<asmjit::x86::Xmm, VectorReg>::value) {
-      cb.vmovapd(asmjit::x86::xmm0, asmjit::x86::Xmm(i));
-
-      cb.movq(temp_reg2, asmjit::x86::xmm0);
-      cb.crc32(temp_reg, temp_reg2);
-      cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0);
-      cb.movq(temp_reg2, asmjit::x86::xmm0);
-      cb.crc32(temp_reg, temp_reg2);
-    } else if constexpr (std::is_same<asmjit::x86::Ymm, VectorReg>::value) {
-      cb.vmovapd(asmjit::x86::ymm0, asmjit::x86::Ymm(i));
-
-      cb.movq(temp_reg2, asmjit::x86::xmm0);
-      cb.crc32(temp_reg, temp_reg2);
-      cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0);
-      cb.movq(temp_reg2, asmjit::x86::xmm0);
-      cb.crc32(temp_reg, temp_reg2);
-
-      cb.vextractf128(asmjit::x86::xmm0, asmjit::x86::ymm0, asmjit::Imm(1));
-
-      cb.movq(temp_reg2, asmjit::x86::xmm0);
-      cb.crc32(temp_reg, temp_reg2);
-      cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0);
-      cb.movq(temp_reg2, asmjit::x86::xmm0);
-      cb.crc32(temp_reg, temp_reg2);
-    } else if constexpr (std::is_same<asmjit::x86::Zmm, VectorReg>::value) {
-      cb.vmovapd(asmjit::x86::ymm0, asmjit::x86::Ymm(i));
-
-      cb.movq(temp_reg2, asmjit::x86::xmm0);
-      cb.crc32(temp_reg, temp_reg2);
-      cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0);
-      cb.movq(temp_reg2, asmjit::x86::xmm0);
-      cb.crc32(temp_reg, temp_reg2);
-
-      cb.vextractf128(asmjit::x86::xmm0, asmjit::x86::ymm0, asmjit::Imm(1));
-
-      cb.movq(temp_reg2, asmjit::x86::xmm0);
-      cb.crc32(temp_reg, temp_reg2);
-      cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0);
-      cb.movq(temp_reg2, asmjit::x86::xmm0);
-      cb.crc32(temp_reg, temp_reg2);
-
-      cb.vextractf32x4(asmjit::x86::xmm0, asmjit::x86::Zmm(i), asmjit::Imm(2));
-
-      cb.movq(temp_reg2, asmjit::x86::xmm0);
-      cb.crc32(temp_reg, temp_reg2);
-      cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0);
-      cb.movq(temp_reg2, asmjit::x86::xmm0);
-      cb.crc32(temp_reg, temp_reg2);
-
-      cb.vextractf32x4(asmjit::x86::xmm0, asmjit::x86::Zmm(i), asmjit::Imm(3));
-
-      cb.movq(temp_reg2, asmjit::x86::xmm0);
-      cb.crc32(temp_reg, temp_reg2);
-      cb.movhlps(asmjit::x86::xmm0, asmjit::x86::xmm0);
-      cb.movq(temp_reg2, asmjit::x86::xmm0);
-      cb.crc32(temp_reg, temp_reg2);
+  // Poll until LoadVar stops signalling low load; each pass sleeps for Nap between two fence sequences.
+  while (LoadVar == LoadThreadWorkType::LoadLow) {
+    if constexpr (firestarter::OptionalFeatures.IsMsc) {
+      std::array<int, 4> Cpuid{}; // receives the cpuid output; the values are never read
+      _mm_mfence();
+      __cpuid(Cpuid.data(), 0);
+    } else {
+      __asm__ __volatile__("mfence;"
+                           "cpuid;" ::
+                               : "eax", "ebx", "ecx", "edx"); // clobber list: registers written by cpuid
+    }
+    std::this_thread::sleep_for(Nap); // Nap is Period / 100, computed at the top of this function
+    if constexpr (firestarter::OptionalFeatures.IsMsc) {
+      std::array<int, 4> Cpuid{}; // receives the cpuid output; the values are never read
+      _mm_mfence();
+      __cpuid(Cpuid.data(), 0);
+    } else {
+      __asm__ __volatile__("mfence;"
+                           "cpuid;" ::
+                               : "eax", "ebx", "ecx", "edx"); // clobber list: registers written by cpuid
     }
   }
+}
 
-  // Restore VectorReg(0) from backup
-  if constexpr (std::is_same<asmjit::x86::Xmm, VectorReg>::value) {
-    cb.pop(temp_reg2);
-    cb.movq(asmjit::x86::xmm0, temp_reg2);
-    cb.movlhps(asmjit::x86::xmm0, asmjit::x86::xmm0);
-    cb.pop(temp_reg2);
-    cb.pinsrw(asmjit::x86::xmm0, temp_reg2.r32(), asmjit::Imm(0));
-    cb.shr(temp_reg2, asmjit::Imm(32));
-    cb.movd(temp_reg2.r32(), asmjit::x86::Mm(7));
-    cb.pinsrw(asmjit::x86::xmm0, temp_reg2.r32(), asmjit::Imm(1));
-  } else if constexpr (std::is_same<asmjit::x86::Ymm, VectorReg>::value &&
-                       std::is_same<asmjit::x86::Mm, IterReg>::value) {
-    cb.movq(temp_reg2, asmjit::x86::Mm(5));
-    cb.movq(asmjit::x86::xmm0, temp_reg2);
-    cb.movq(temp_reg2, asmjit::x86::Mm(4));
-    cb.pinsrq(asmjit::x86::xmm0, temp_reg2, asmjit::Imm(1));
-
-    cb.vinsertf128(asmjit::x86::ymm0, asmjit::x86::ymm0, asmjit::x86::xmm0,
-                   asmjit::Imm(1));
-
-    cb.movq(temp_reg2, asmjit::x86::Mm(7));
-    cb.movq(asmjit::x86::xmm0, temp_reg2);
-    cb.movq(temp_reg2, asmjit::x86::Mm(6));
-    cb.pinsrq(asmjit::x86::xmm0, temp_reg2, asmjit::Imm(1));
-  } else if constexpr (std::is_same<asmjit::x86::Zmm, VectorReg>::value &&
-                       std::is_same<asmjit::x86::Mm, IterReg>::value) {
-    // We use vector registers zmm31 for our backup
-    cb.vmovapd(asmjit::x86::zmm0, asmjit::x86::zmm31);
-  }
+void X86Payload::initMemory(double* MemoryAddr, uint64_t BufferSize, double FirstValue, double LastValue) {
+  uint64_t I = 0; // element index into MemoryAddr; carried across the three fill loops below
 
-  // before starting the communication, backup r8, r9, rax, rbx, rcx and rdx
-  if constexpr (std::is_same<asmjit::x86::Mm, IterReg>::value) {
-    cb.movq(asmjit::x86::Mm(7), asmjit::x86::rax);
-    cb.movq(asmjit::x86::Mm(6), asmjit::x86::rbx);
-    cb.movq(asmjit::x86::Mm(5), asmjit::x86::rcx);
-    cb.movq(asmjit::x86::Mm(4), asmjit::x86::rdx);
-    cb.movq(asmjit::x86::Mm(3), asmjit::x86::r8);
-    cb.movq(asmjit::x86::Mm(2), asmjit::x86::r9);
-  } else {
-    cb.push(asmjit::x86::rax);
-    cb.push(asmjit::x86::rbx);
-    cb.push(asmjit::x86::rcx);
-    cb.push(asmjit::x86::rdx);
-    cb.push(asmjit::x86::r8);
-    cb.push(asmjit::x86::r9);
+  // NOLINTBEGIN(cppcoreguidelines-pro-bounds-pointer-arithmetic)
+  for (; I < InitBlocksize && I < BufferSize; I++) { // seed the first block, never past the end of the buffer
+    MemoryAddr[I] = 0.25 + static_cast<double>(I) * 8.0 * FirstValue;
   }
-
-  // do the actual communication
-  // temp_reg contains our hash
-
-  // save the pointer_reg. it might be any of r8, r9, rax, rbx, rcx or rdx
-  cb.mov(temp_reg2, pointer_reg);
-
-  // Don't touch me!
-  // This sychronization and communication works even if the threads run at
-  // different (changing) speed, with just one "lock cmpxchg16b" Brought to you
-  // by a few hours of headache for two people.
-  auto communication = [&](auto offset) {
-    // communication
-    cb.mov(asmjit::x86::r8, asmjit::x86::ptr_64(temp_reg2, offset));
-
-    // temp data
-    cb.mov(asmjit::x86::r9, temp_reg2);
-    cb.add(asmjit::x86::r9, asmjit::Imm(offset + 8));
-
-    cb.mov(asmjit::x86::rdx, asmjit::x86::ptr_64(asmjit::x86::r9, 0));
-    cb.mov(asmjit::x86::rax, asmjit::x86::ptr_64(asmjit::x86::r9, 8));
-
-    auto L0 = cb.newLabel();
-    cb.bind(L0);
-
-    cb.lock();
-    cb.cmpxchg16b(asmjit::x86::ptr(asmjit::x86::r8));
-
-    auto L1 = cb.newLabel();
-    cb.jnz(L1);
-
-    cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, 0), asmjit::x86::rcx);
-    cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, 8), asmjit::x86::rbx);
-    cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, 16), asmjit::Imm(0));
-    cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, 24), asmjit::Imm(0));
-
-    cb.mov(asmjit::x86::rax, asmjit::Imm(2));
-
-    auto L6 = cb.newLabel();
-    cb.jmp(L6);
-
-    cb.bind(L1);
-
-    cb.cmp(asmjit::x86::rcx, asmjit::x86::rdx);
-
-    auto L2 = cb.newLabel();
-    cb.jle(L2);
-
-    cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, 0), asmjit::x86::rcx);
-    cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, 8), asmjit::x86::rbx);
-
-    cb.jmp(L0);
-
-    cb.bind(L2);
-
-    auto L3 = cb.newLabel();
-
-    cb.cmp(asmjit::x86::ptr_64(asmjit::x86::r9, 16), asmjit::Imm(0));
-    cb.jne(L3);
-    cb.cmp(asmjit::x86::ptr_64(asmjit::x86::r9, 24), asmjit::Imm(0));
-    cb.jne(L3);
-
-    cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, 16), asmjit::x86::rdx);
-    cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, 24), asmjit::x86::rax);
-
-    cb.bind(L3);
-
-    cb.cmp(asmjit::x86::rcx, asmjit::x86::ptr_64(asmjit::x86::r9, 16));
-    cb.mov(asmjit::x86::rax, asmjit::Imm(4));
-    cb.jne(L6);
-
-    cb.cmp(asmjit::x86::rbx, asmjit::x86::ptr_64(asmjit::x86::r9, 24));
-    auto L4 = cb.newLabel();
-    cb.jne(L4);
-
-    cb.mov(asmjit::x86::rax, asmjit::Imm(0));
-
-    auto L5 = cb.newLabel();
-    cb.jmp(L5);
-
-    cb.bind(L4);
-
-    cb.mov(asmjit::x86::rax, asmjit::Imm(1));
-
-    cb.bind(L5);
-
-    cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, 16), asmjit::Imm(0));
-    cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, 24), asmjit::Imm(0));
-
-    cb.bind(L6);
-
-    // if check failed
-    cb.cmp(asmjit::x86::rax, asmjit::Imm(1));
-    auto L7 = cb.newLabel();
-    cb.jne(L7);
-
-    // write the error flag
-    cb.mov(asmjit::x86::ptr_64(asmjit::x86::r9, 32), asmjit::Imm(1));
-
-    // stop the execution after some time
-    cb.mov(asmjit::x86::ptr_64(addrHigh_reg), asmjit::Imm(LOAD_STOP));
-    cb.mfence();
-
-    cb.bind(L7);
-
-    auto L9 = cb.newLabel();
-    cb.jmp(L9);
-  };
-
-  // left communication
-  // move hash
-  cb.mov(asmjit::x86::rbx, temp_reg);
-  // move iterations counter
-  if constexpr (std::is_same<asmjit::x86::Mm, IterReg>::value) {
-    cb.movq(asmjit::x86::rcx, iter_reg);
-  } else {
-    cb.mov(asmjit::x86::rcx, iter_reg);
+  for (; I + InitBlocksize <= BufferSize; I += InitBlocksize) { // copy the previous block; sum form cannot wrap
+    std::memcpy(MemoryAddr + I, MemoryAddr + I - InitBlocksize, sizeof(double) * InitBlocksize);
   }
-
-  communication(-128);
-
-  // right communication
-  // move hash
-  cb.mov(asmjit::x86::rbx, temp_reg);
-  // move iterations counter
-  if constexpr (std::is_same<asmjit::x86::Mm, IterReg>::value) {
-    cb.movq(asmjit::x86::rcx, iter_reg);
-  } else {
-    cb.mov(asmjit::x86::rcx, iter_reg);
+  for (; I < BufferSize; I++) { // remaining tail, shorter than one block, is computed from LastValue
+    MemoryAddr[I] = 0.25 + static_cast<double>(I) * 8.0 * LastValue;
   }
+  // NOLINTEND(cppcoreguidelines-pro-bounds-pointer-arithmetic)
+}
 
-  communication(-64);
+auto X86Payload::getAvailableInstructions() const -> std::list<std::string> {
+  std::list<std::string> Instructions; // receives the .first member of every InstructionFlops entry
 
-  // restore r8, r9, rax, rbx, rcx and rdx
-  if constexpr (std::is_same<asmjit::x86::Mm, IterReg>::value) {
-    cb.movq(asmjit::x86::rax, asmjit::x86::Mm(7));
-    cb.movq(asmjit::x86::rbx, asmjit::x86::Mm(6));
-    cb.movq(asmjit::x86::rcx, asmjit::x86::Mm(5));
-    cb.movq(asmjit::x86::rdx, asmjit::x86::Mm(4));
-    cb.movq(asmjit::x86::r8, asmjit::x86::Mm(3));
-    cb.movq(asmjit::x86::r9, asmjit::x86::Mm(2));
-  } else {
-    cb.pop(asmjit::x86::r9);
-    cb.pop(asmjit::x86::r8);
-    cb.pop(asmjit::x86::rdx);
-    cb.pop(asmjit::x86::rcx);
-    cb.pop(asmjit::x86::rbx);
-    cb.pop(asmjit::x86::rax);
-  }
+  std::transform(InstructionFlops.begin(), InstructionFlops.end(), std::back_inserter(Instructions),
+                 [](const auto& Item) { return Item.first; }); // qualified: do not depend on ADL to find these
 
-  cb.bind(SkipErrorDetection);
+  return Instructions;
 }
 
-template void
-X86Payload::emitErrorDetectionCode<asmjit::x86::Gpq, asmjit::x86::Xmm>(
-    asmjit::x86::Builder &cb, asmjit::x86::Gpq iter_reg,
-    asmjit::x86::Gpq addrHigh_reg, asmjit::x86::Gpq pointer_reg,
-    asmjit::x86::Gpq temp_reg, asmjit::x86::Gpq temp_reg2);
-template void
-X86Payload::emitErrorDetectionCode<asmjit::x86::Gpq, asmjit::x86::Ymm>(
-    asmjit::x86::Builder &cb, asmjit::x86::Gpq iter_reg,
-    asmjit::x86::Gpq addrHigh_reg, asmjit::x86::Gpq pointer_reg,
-    asmjit::x86::Gpq temp_reg, asmjit::x86::Gpq temp_reg2);
-
-template void
-X86Payload::emitErrorDetectionCode<asmjit::x86::Mm, asmjit::x86::Ymm>(
-    asmjit::x86::Builder &cb, asmjit::x86::Mm iter_reg,
-    asmjit::x86::Gpq addrHigh_reg, asmjit::x86::Gpq pointer_reg,
-    asmjit::x86::Gpq temp_reg, asmjit::x86::Gpq temp_reg2);
-template void
-X86Payload::emitErrorDetectionCode<asmjit::x86::Mm, asmjit::x86::Zmm>(
-    asmjit::x86::Builder &cb, asmjit::x86::Mm iter_reg,
-    asmjit::x86::Gpq addrHigh_reg, asmjit::x86::Gpq pointer_reg,
-    asmjit::x86::Gpq temp_reg, asmjit::x86::Gpq temp_reg2);
+} // namespace firestarter::environment::x86::payload
\ No newline at end of file
diff --git a/src/firestarter/Environment/X86/Payload/ZENFMAPayload.cpp b/src/firestarter/Environment/X86/Payload/ZENFMAPayload.cpp
index 9e99ca2d..4857f82d 100644
--- a/src/firestarter/Environment/X86/Payload/ZENFMAPayload.cpp
+++ b/src/firestarter/Environment/X86/Payload/ZENFMAPayload.cpp
@@ -19,423 +19,361 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
-#include <firestarter/Environment/X86/Payload/ZENFMAPayload.hpp>
-#include <firestarter/Logging/Log.hpp>
-
-#include <iterator>
-#include <utility>
-
-using namespace firestarter::environment::x86::payload;
-using namespace asmjit;
-using namespace asmjit::x86;
-
-int ZENFMAPayload::compilePayload(
-    std::vector<std::pair<std::string, unsigned>> const &proportion,
-    unsigned instructionCacheSize,
-    std::list<unsigned> const &dataCacheBufferSize, unsigned ramBufferSize,
-    unsigned thread, unsigned numberOfLines, bool dumpRegisters,
-    bool errorDetection) {
+#include "firestarter/Environment/X86/Payload/ZENFMAPayload.hpp"
+#include "firestarter/Environment/X86/Payload/CompiledX86Payload.hpp"
+
+namespace firestarter::environment::x86::payload {
+
+auto ZENFMAPayload::compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters,
+                                   bool ErrorDetection) const -> environment::payload::CompiledPayload::UniquePtr {
+  using Imm = asmjit::Imm;
+  using Xmm = asmjit::x86::Xmm;
+  using Ymm = asmjit::x86::Ymm;
+  // NOLINTNEXTLINE(readability-identifier-naming)
+  constexpr asmjit::x86::Mem (*xmmword_ptr)(const asmjit::x86::Gp&, int32_t) = asmjit::x86::xmmword_ptr;
+
   // Compute the sequence of instruction groups and the number of its repetions
   // to reach the desired size
-  auto sequence = this->generateSequence(proportion);
-  auto repetitions =
-      this->getNumberOfSequenceRepetitions(sequence, numberOfLines / thread);
+  auto Sequence = Settings.sequence();
+  auto Repetitions =
+      environment::payload::PayloadSettings::getNumberOfSequenceRepetitions(Sequence, Settings.linesPerThread());
 
   // compute count of flops and memory access for performance report
-  unsigned flops = 0;
-  unsigned bytes = 0;
+  environment::payload::PayloadStats Stats;
 
-  for (const auto &item : sequence) {
-    auto it = this->instructionFlops.find(item);
+  for (const auto& Item : Sequence) {
+    auto It = instructionFlops().find(Item);
 
-    if (it == this->instructionFlops.end()) {
-      workerLog::error() << "Instruction group " << item << " undefined in "
-                         << name() << ".";
-      return EXIT_FAILURE;
+    if (It == instructionFlops().end()) {
+      workerLog::error() << "Instruction group " << Item << " undefined in " << name() << ".";
     }
 
-    flops += it->second;
+    Stats.Flops += It->second;
 
-    it = this->instructionMemory.find(item);
+    It = instructionMemory().find(Item);
 
-    if (it != this->instructionMemory.end()) {
-      bytes += it->second;
+    if (It != instructionMemory().end()) {
+      Stats.Bytes += It->second;
     }
   }
 
-  this->_flops = repetitions * flops;
-  this->_bytes = repetitions * bytes;
-  this->_instructions = repetitions * sequence.size() * 4 + 6;
+  Stats.Flops *= Repetitions;
+  Stats.Bytes *= Repetitions;
+  Stats.Instructions = Repetitions * Sequence.size() * 4 + 6;
 
   // calculate the buffer sizes
-  auto l1i_cache_size = instructionCacheSize / thread;
-  auto dataCacheBufferSizeIterator = dataCacheBufferSize.begin();
-  auto l1_size = *dataCacheBufferSizeIterator / thread;
-  std::advance(dataCacheBufferSizeIterator, 1);
-  auto l2_size = *dataCacheBufferSizeIterator / thread;
-  std::advance(dataCacheBufferSizeIterator, 1);
-  auto l3_size = *dataCacheBufferSizeIterator / thread;
-  auto ram_size = ramBufferSize / thread;
+  const auto L1iCacheSize = Settings.instructionCacheSizePerThread();
+  const auto DataCacheBufferSizes = Settings.dataCacheBufferSizePerThread();
+  auto DataCacheBufferSizeIterator = DataCacheBufferSizes.begin();
+  const auto L1Size = *DataCacheBufferSizeIterator;
+  std::advance(DataCacheBufferSizeIterator, 1);
+  const auto L2Size = *DataCacheBufferSizeIterator;
+  std::advance(DataCacheBufferSizeIterator, 1);
+  const auto L3Size = *DataCacheBufferSizeIterator;
+  const auto RamSize = Settings.ramBufferSizePerThread();
 
   // calculate the reset counters for the buffers
-  auto l2_loop_count =
-      getL2LoopCount(sequence, numberOfLines, l2_size * thread, thread);
-  auto l3_loop_count =
-      getL3LoopCount(sequence, numberOfLines, l3_size * thread, thread);
-  auto ram_loop_count =
-      getRAMLoopCount(sequence, numberOfLines, ram_size * thread, thread);
-
-  CodeHolder code;
-  code.init(this->rt.environment());
-
-  if (nullptr != this->loadFunction) {
-    this->rt.release(&this->loadFunction);
-  }
-
-  Builder cb(&code);
-  cb.addDiagnosticOptions(
-    asmjit::DiagnosticOptions::kValidateAssembler | 
-    asmjit::DiagnosticOptions::kValidateIntermediate );
-
-  auto pointer_reg = rax;
-  auto l1_addr = rbx;
-  auto l2_addr = rcx;
-  auto l3_addr = r8;
-  auto ram_addr = r9;
-  auto l2_count_reg = r10;
-  auto l3_count_reg = r11;
-  auto ram_count_reg = r12;
-  auto temp_reg = r13;
-  auto temp_reg2 = rbp;
-  auto offset_reg = r14;
-  auto addrHigh_reg = r15;
-  auto iter_reg = mm0;
-  auto shift_reg = std::vector<Gp>({rdi, rsi, rdx});
-  auto nr_shift_regs = 3;
-  auto nr_add_regs = 11;
-  auto ram_reg = ymm15;
-
-  FuncDetail func;
-  func.init(FuncSignatureT<unsigned long long, unsigned long long *,
-                           volatile unsigned long long *, unsigned long long>(
-                CallConvId::kCDecl),
-            this->rt.environment());
-
-  FuncFrame frame;
-  frame.init(func);
+  const auto L2LoopCount =
+      environment::payload::PayloadSettings::getL2LoopCount(Sequence, Settings.linesPerThread(), L2Size);
+  const auto L3LoopCount =
+      environment::payload::PayloadSettings::getL3LoopCount(Sequence, Settings.linesPerThread(), L3Size);
+  const auto RamLoopCount =
+      environment::payload::PayloadSettings::getRAMLoopCount(Sequence, Settings.linesPerThread(), RamSize);
+
+  asmjit::CodeHolder Code;
+  Code.init(asmjit::Environment::host());
+
+  asmjit::x86::Builder Cb(&Code);
+  Cb.addDiagnosticOptions(asmjit::DiagnosticOptions::kValidateAssembler |
+                          asmjit::DiagnosticOptions::kValidateIntermediate);
+
+  auto PointerReg = asmjit::x86::rax;
+  auto L1Addr = asmjit::x86::rbx;
+  auto L2Addr = asmjit::x86::rcx;
+  auto L3Addr = asmjit::x86::r8;
+  auto RamAddr = asmjit::x86::r9;
+  auto L2CountReg = asmjit::x86::r10;
+  auto L3CountReg = asmjit::x86::r11;
+  auto RamCountReg = asmjit::x86::r12;
+  auto TempReg = asmjit::x86::r13;
+  auto TempReg2 = asmjit::x86::rbp;
+  auto OffsetReg = asmjit::x86::r14;
+  auto AddrHighReg = asmjit::x86::r15;
+  auto IterReg = asmjit::x86::mm0;
+  auto ShiftRegs = std::vector<asmjit::x86::Gp>({asmjit::x86::rdi, asmjit::x86::rsi, asmjit::x86::rdx});
+  auto NbShiftRegs = 3;
+  auto NbAddRegs = 11;
+  auto RamReg = asmjit::x86::ymm15;
+
+  asmjit::FuncDetail Func;
+  Func.init(asmjit::FuncSignature::build<uint64_t, double*, volatile LoadThreadWorkType*, uint64_t>(
+                asmjit::CallConvId::kCDecl),
+            Code.environment());
+
+  asmjit::FuncFrame Frame;
+  Frame.init(Func);
 
   // make (x|y)mm registers dirty
-  for (int i = 0; i < 16; i++) {
-    frame.addDirtyRegs(Ymm(i));
+  for (auto I = 0U; I < 16U; I++) {
+    Frame.addDirtyRegs(asmjit::x86::Ymm(I));
   }
-  for (int i = 0; i < 8; i++) {
-    frame.addDirtyRegs(Mm(i));
+  for (auto I = 0U; I < 8U; I++) {
+    Frame.addDirtyRegs(asmjit::x86::Mm(I));
   }
   // make all other used registers dirty except RAX
-  frame.addDirtyRegs(l1_addr, l2_addr, l3_addr, ram_addr, l2_count_reg,
-                     l3_count_reg, ram_count_reg, temp_reg, temp_reg2,
-                     offset_reg, addrHigh_reg, iter_reg, ram_addr);
-  for (const auto &reg : shift_reg) {
-    frame.addDirtyRegs(reg);
+  Frame.addDirtyRegs(L1Addr, L2Addr, L3Addr, RamAddr, L2CountReg, L3CountReg, RamCountReg, TempReg, TempReg2, OffsetReg,
+                     AddrHighReg, IterReg, RamAddr);
+  for (const auto& Reg : ShiftRegs) {
+    Frame.addDirtyRegs(Reg);
   }
 
-  FuncArgsAssignment args(&func);
+  asmjit::FuncArgsAssignment Args(&Func);
   // FIXME: asmjit assigment to mm0 does not seem to be supported
-  args.assignAll(pointer_reg, addrHigh_reg, temp_reg);
-  args.updateFuncFrame(frame);
-  frame.finalize();
+  Args.assignAll(PointerReg, AddrHighReg, TempReg);
+  Args.updateFuncFrame(Frame);
+  Frame.finalize();
 
-  cb.emitProlog(frame);
-  cb.emitArgsAssignment(frame, args);
+  Cb.emitProlog(Frame);
+  Cb.emitArgsAssignment(Frame, Args);
 
   // FIXME: movq from temp_reg to iter_reg
-  cb.movq(iter_reg, temp_reg);
+  Cb.movq(IterReg, TempReg);
 
   // stop right away if low load is selected
-  auto FunctionExit = cb.newLabel();
+  auto FunctionExit = Cb.newLabel();
 
-  cb.mov(temp_reg, ptr_64(addrHigh_reg));
-  cb.test(temp_reg, temp_reg);
-  cb.jz(FunctionExit);
+  Cb.mov(TempReg, ptr_64(AddrHighReg));
+  Cb.test(TempReg, TempReg);
+  Cb.jz(FunctionExit);
 
-  cb.mov(offset_reg,
+  Cb.mov(OffsetReg,
          Imm(64)); // increment after each cache/memory access
   // Initialize registers for shift operations
-  for (auto const &reg : shift_reg) {
-    cb.mov(reg, Imm(0xAAAAAAAAAAAAAAAA));
+  for (auto const& Reg : ShiftRegs) {
+    Cb.mov(Reg, Imm(0xAAAAAAAAAAAAAAAA));
   }
   // Initialize AVX-Registers for FMA Operations
-  cb.vmovapd(ymm0, ymmword_ptr(pointer_reg));
-  cb.vmovapd(ymm1, ymmword_ptr(pointer_reg, 32));
+  Cb.vmovapd(asmjit::x86::ymm0, ymmword_ptr(PointerReg));
+  Cb.vmovapd(asmjit::x86::ymm1, ymmword_ptr(PointerReg, 32));
 
-  auto add_regs_start = 2;
-  auto add_regs_end = add_regs_start + nr_add_regs - 1;
-  for (int i = add_regs_start; i <= add_regs_end; i++) {
-    cb.vmovapd(Ymm(i), ymmword_ptr(pointer_reg, 256 + i * 32));
+  auto AddRegsStart = 2;
+  auto AddRegsEnd = AddRegsStart + NbAddRegs - 1;
+  for (auto I = AddRegsStart; I <= AddRegsEnd; I++) {
+    Cb.vmovapd(Ymm(I), ymmword_ptr(PointerReg, 256 + (I * 32)));
   }
 
   // Initialize xmm14 for shift operation
   // cb.mov(temp_reg, Imm(1));
   // cb.movd(temp_reg, Xmm(14));
-  cb.movd(shift_reg[0], Xmm(13));
-  cb.vbroadcastss(Xmm(13), Xmm(13));
-  cb.vmovapd(Xmm(14), Xmm(13));
-  cb.vpsrlq(Xmm(14), Xmm(14), Imm(1));
-
-  cb.mov(l1_addr, pointer_reg); // address for L1-buffer
-  cb.mov(l2_addr, pointer_reg);
-  cb.add(l2_addr, Imm(l1_size)); // address for L2-buffer
-  cb.mov(l3_addr, pointer_reg);
-  cb.add(l3_addr, Imm(l2_size)); // address for L3-buffer
-  cb.mov(ram_addr, pointer_reg);
-  cb.add(ram_addr, Imm(l3_size)); // address for RAM-buffer
-  cb.mov(l2_count_reg, Imm(l2_loop_count));
-  workerLog::trace() << "reset counter for L2-buffer with "
-                     << l2_loop_count
-                     << " cache line accesses per loop ("
-		     << l2_size/1024
-                     << ") KiB";
-  cb.mov(l3_count_reg, Imm(l3_loop_count));
-  workerLog::trace() << "reset counter for L3-buffer with "
-                     << l3_loop_count
-                     << " cache line accesses per loop ("
-		     << l3_size/1024
-                     << ") KiB";
-  cb.mov(ram_count_reg, Imm(ram_loop_count));
-  workerLog::trace() << "reset counter for RAM-buffer with "
-                     << ram_loop_count
-                     << " cache line accesses per loop ("
-		     << ram_size/1024
-                     << ") KiB";
-
-  cb.align(AlignMode::kCode, 64);
-
-  auto Loop = cb.newLabel();
-  cb.bind(Loop);
-
-  auto shift_pos = 0;
-  bool left = false;
-  auto itemCount = 0;
-  auto add_dest = add_regs_start;
-  unsigned l1_offset = 0;
-
-#define L1_INCREMENT()                                                         \
-  l1_offset += 64;                                                             \
-  if (l1_offset < l1_size * 0.5) {                                             \
-    cb.add(l1_addr, offset_reg);                                               \
-  } else {                                                                     \
-    l1_offset = 0;                                                             \
-    cb.mov(l1_addr, pointer_reg);                                              \
-  }
-
-#define L2_INCREMENT() cb.add(l2_addr, offset_reg);
-
-#define L3_INCREMENT() cb.add(l3_addr, offset_reg)
-
-#define RAM_INCREMENT() cb.add(ram_addr, offset_reg)
+  Cb.movd(ShiftRegs[0], Xmm(13));
+  Cb.vbroadcastss(Xmm(13), Xmm(13));
+  Cb.vmovapd(Xmm(14), Xmm(13));
+  Cb.vpsrlq(Xmm(14), Xmm(14), Imm(1));
+
+  Cb.mov(L1Addr, PointerReg); // address for L1-buffer
+  Cb.mov(L2Addr, PointerReg);
+  Cb.add(L2Addr, Imm(L1Size)); // address for L2-buffer
+  Cb.mov(L3Addr, PointerReg);
+  Cb.add(L3Addr, Imm(L2Size)); // address for L3-buffer
+  Cb.mov(RamAddr, PointerReg);
+  Cb.add(RamAddr, Imm(L3Size)); // address for RAM-buffer
+  Cb.mov(L2CountReg, Imm(L2LoopCount));
+  workerLog::trace() << "reset counter for L2-buffer with " << L2LoopCount << " cache line accesses per loop ("
+                     << L2Size / 1024 << ") KiB";
+  Cb.mov(L3CountReg, Imm(L3LoopCount));
+  workerLog::trace() << "reset counter for L3-buffer with " << L3LoopCount << " cache line accesses per loop ("
+                     << L3Size / 1024 << ") KiB";
+  Cb.mov(RamCountReg, Imm(RamLoopCount));
+  workerLog::trace() << "reset counter for RAM-buffer with " << RamLoopCount << " cache line accesses per loop ("
+                     << RamSize / 1024 << ") KiB";
+
+  Cb.align(asmjit::AlignMode::kCode, 64);
+
+  auto Loop = Cb.newLabel();
+  Cb.bind(Loop);
+
+  auto ShiftPos = 0;
+  bool Left = false;
+  unsigned ItemCount = 0;
+  auto AddDest = AddRegsStart;
+  unsigned L1Offset = 0;
+
+  const auto L1Increment = [&Cb, &L1Offset, &L1Size, &L1Addr, &OffsetReg, &PointerReg]() {
+    L1Offset += 64;
+    if (L1Offset < L1Size * 0.5) {
+      Cb.add(L1Addr, OffsetReg);
+    } else {
+      L1Offset = 0;
+      Cb.mov(L1Addr, PointerReg);
+    }
+  };
+  const auto L2Increment = [&Cb, &L2Addr, &OffsetReg]() { Cb.add(L2Addr, OffsetReg); };
+  const auto L3Increment = [&Cb, &L3Addr, &OffsetReg]() { Cb.add(L3Addr, OffsetReg); };
+  const auto RamIncrement = [&Cb, &RamAddr, &OffsetReg]() { Cb.add(RamAddr, OffsetReg); };
 
-  for (unsigned count = 0; count < repetitions; count++) {
-    for (const auto &item : sequence) {
+  for (auto Count = 0U; Count < Repetitions; Count++) {
+    for (const auto& Item : Sequence) {
 
       // swap second and third param of fma instruction to force bitchanges on
       // the pipes to its execution units
-      Ymm secondParam;
-      Ymm thirdParam;
-      if (0 == itemCount % 2) {
-        secondParam = ymm0;
-        thirdParam = ymm1;
+      Ymm SecondParam;
+      Ymm ThirdParam;
+      if (0 == ItemCount % 2) {
+        SecondParam = asmjit::x86::ymm0;
+        ThirdParam = asmjit::x86::ymm1;
       } else {
-        secondParam = ymm1;
-        thirdParam = ymm0;
+        SecondParam = asmjit::x86::ymm1;
+        ThirdParam = asmjit::x86::ymm0;
       }
 
-      if (item == "REG") {
-        cb.vfmadd231pd(Ymm(add_dest), secondParam, thirdParam);
-        cb.xor_(temp_reg,
-                shift_reg[(shift_pos + nr_shift_regs - 1) % nr_shift_regs]);
-        if (left) {
-          cb.shr(shift_reg[shift_pos], Imm(1));
+      if (Item == "REG") {
+        Cb.vfmadd231pd(Ymm(AddDest), SecondParam, ThirdParam);
+        Cb.xor_(TempReg, ShiftRegs[(ShiftPos + NbShiftRegs - 1) % NbShiftRegs]);
+        if (Left) {
+          Cb.shr(ShiftRegs[ShiftPos], Imm(1));
         } else {
-          cb.shl(shift_reg[shift_pos], Imm(1));
+          Cb.shl(ShiftRegs[ShiftPos], Imm(1));
         }
-      } else if (item == "L1_LS") {
-        cb.vfmadd231pd(Ymm(add_dest), secondParam, ymmword_ptr(l1_addr, 32));
-        cb.vmovapd(xmmword_ptr(l1_addr, 64), Xmm(add_dest));
-        L1_INCREMENT();
-      } else if (item == "L2_L") {
-        cb.vfmadd231pd(Ymm(add_dest), secondParam, ymmword_ptr(l2_addr, 64));
-        cb.xor_(temp_reg,
-                shift_reg[(shift_pos + nr_shift_regs - 1) % nr_shift_regs]);
-        L2_INCREMENT();
-      } else if (item == "L3_L") {
-        cb.vfmadd231pd(Ymm(add_dest), secondParam, ymmword_ptr(l3_addr, 64));
-        cb.xor_(temp_reg,
-                shift_reg[(shift_pos + nr_shift_regs - 1) % nr_shift_regs]);
-        L3_INCREMENT();
-      } else if (item == "RAM_L") {
-        cb.vfmadd231pd(Ymm(ram_reg), secondParam, ymmword_ptr(ram_addr, 32));
-        cb.xor_(temp_reg,
-                shift_reg[(shift_pos + nr_shift_regs - 1) % nr_shift_regs]);
-        RAM_INCREMENT();
+      } else if (Item == "L1_LS") {
+        Cb.vfmadd231pd(Ymm(AddDest), SecondParam, ymmword_ptr(L1Addr, 32));
+        Cb.vmovapd(xmmword_ptr(L1Addr, 64), Xmm(AddDest));
+        L1Increment();
+      } else if (Item == "L2_L") {
+        Cb.vfmadd231pd(Ymm(AddDest), SecondParam, ymmword_ptr(L2Addr, 64));
+        Cb.xor_(TempReg, ShiftRegs[(ShiftPos + NbShiftRegs - 1) % NbShiftRegs]);
+        L2Increment();
+      } else if (Item == "L3_L") {
+        Cb.vfmadd231pd(Ymm(AddDest), SecondParam, ymmword_ptr(L3Addr, 64));
+        Cb.xor_(TempReg, ShiftRegs[(ShiftPos + NbShiftRegs - 1) % NbShiftRegs]);
+        L3Increment();
+      } else if (Item == "RAM_L") {
+        Cb.vfmadd231pd(Ymm(RamReg), SecondParam, ymmword_ptr(RamAddr, 32));
+        Cb.xor_(TempReg, ShiftRegs[(ShiftPos + NbShiftRegs - 1) % NbShiftRegs]);
+        RamIncrement();
       } else {
-        workerLog::error() << "Instruction group " << item << " not found in "
-                           << this->name() << ".";
-        return EXIT_FAILURE;
+        workerLog::error() << "Instruction group " << Item << " not found in " << name() << ".";
       }
 
       // make sure the shifts do could end up shifting out the data one end.
-      if (itemCount < (int)(sequence.size() * repetitions -
-                            (sequence.size() * repetitions) % 4)) {
-        switch (itemCount % 4) {
+      if (ItemCount < (Sequence.size() * Repetitions) - ((Sequence.size() * Repetitions) % 4)) {
+        // all cases are covered
+        // NOLINTNEXTLINE(bugprone-switch-missing-default-case)
+        switch (ItemCount % 4) {
         case 0:
-          cb.vpsrlq(Xmm(13), Xmm(13), Imm(1));
+          Cb.vpsrlq(Xmm(13), Xmm(13), Imm(1));
           break;
         case 1:
-          cb.vpsllq(Xmm(14), Xmm(14), Imm(1));
+          Cb.vpsllq(Xmm(14), Xmm(14), Imm(1));
           break;
         case 2:
-          cb.vpsllq(Xmm(13), Xmm(13), Imm(1));
+          Cb.vpsllq(Xmm(13), Xmm(13), Imm(1));
           break;
         case 3:
-          cb.vpsrlq(Xmm(14), Xmm(14), Imm(1));
+          Cb.vpsrlq(Xmm(14), Xmm(14), Imm(1));
           break;
         }
       }
 
-      itemCount++;
+      ItemCount++;
 
-      add_dest++;
-      if (add_dest > add_regs_end) {
-        add_dest = add_regs_start;
+      AddDest++;
+      if (AddDest > AddRegsEnd) {
+        AddDest = AddRegsStart;
       }
 
-      shift_pos++;
-      if (shift_pos == nr_shift_regs) {
-        shift_pos = 0;
-        left = !left;
+      ShiftPos++;
+      if (ShiftPos == NbShiftRegs) {
+        ShiftPos = 0;
+        Left = !Left;
       }
     }
   }
 
-  cb.movq(temp_reg, iter_reg); // restore iteration counter
-  if (this->getRAMSequenceCount(sequence) > 0) {
+  Cb.movq(TempReg, IterReg); // restore iteration counter
+  if (environment::payload::PayloadSettings::getRAMSequenceCount(Sequence) > 0) {
     // reset RAM counter
-    auto NoRamReset = cb.newLabel();
-
-    cb.sub(ram_count_reg, Imm(1));
-    cb.jnz(NoRamReset);
-    cb.mov(ram_count_reg, Imm(ram_loop_count));
-    cb.mov(ram_addr, pointer_reg);
-    cb.add(ram_addr, Imm(l3_size));
-    cb.bind(NoRamReset);
+    auto NoRamReset = Cb.newLabel();
+
+    Cb.sub(RamCountReg, Imm(1));
+    Cb.jnz(NoRamReset);
+    Cb.mov(RamCountReg, Imm(RamLoopCount));
+    Cb.mov(RamAddr, PointerReg);
+    Cb.add(RamAddr, Imm(L3Size));
+    Cb.bind(NoRamReset);
     // adds always two instruction
-    this->_instructions += 2;
+    Stats.Instructions += 2;
   }
-  cb.inc(temp_reg); // increment iteration counter
-  if (this->getL2SequenceCount(sequence) > 0) {
+  Cb.inc(TempReg); // increment iteration counter
+  if (environment::payload::PayloadSettings::getL2SequenceCount(Sequence) > 0) {
     // reset L2-Cache counter
-    auto NoL2Reset = cb.newLabel();
-
-    cb.sub(l2_count_reg, Imm(1));
-    cb.jnz(NoL2Reset);
-    cb.mov(l2_count_reg, Imm(l2_loop_count));
-    cb.mov(l2_addr, pointer_reg);
-    cb.add(l2_addr, Imm(l1_size));
-    cb.bind(NoL2Reset);
+    auto NoL2Reset = Cb.newLabel();
+
+    Cb.sub(L2CountReg, Imm(1));
+    Cb.jnz(NoL2Reset);
+    Cb.mov(L2CountReg, Imm(L2LoopCount));
+    Cb.mov(L2Addr, PointerReg);
+    Cb.add(L2Addr, Imm(L1Size));
+    Cb.bind(NoL2Reset);
     // adds always two instruction
-    this->_instructions += 2;
+    Stats.Instructions += 2;
   }
-  cb.movq(iter_reg, temp_reg); // store iteration counter
-  if (this->getL3SequenceCount(sequence) > 0) {
+  Cb.movq(IterReg, TempReg); // store iteration counter
+  if (environment::payload::PayloadSettings::getL3SequenceCount(Sequence) > 0) {
     // reset L3-Cache counter
-    auto NoL3Reset = cb.newLabel();
-
-    cb.sub(l3_count_reg, Imm(1));
-    cb.jnz(NoL3Reset);
-    cb.mov(l3_count_reg, Imm(l3_loop_count));
-    cb.mov(l3_addr, pointer_reg);
-    cb.add(l3_addr, Imm(l2_size));
-    cb.bind(NoL3Reset);
+    auto NoL3Reset = Cb.newLabel();
+
+    Cb.sub(L3CountReg, Imm(1));
+    Cb.jnz(NoL3Reset);
+    Cb.mov(L3CountReg, Imm(L3LoopCount));
+    Cb.mov(L3Addr, PointerReg);
+    Cb.add(L3Addr, Imm(L2Size));
+    Cb.bind(NoL3Reset);
     // adds always two instruction
-    this->_instructions += 2;
+    Stats.Instructions += 2;
   }
-  cb.mov(l1_addr, pointer_reg);
+  Cb.mov(L1Addr, PointerReg);
 
-  if (dumpRegisters) {
-    auto SkipRegistersDump = cb.newLabel();
-
-    cb.test(ptr_64(pointer_reg, -8), Imm(firestarter::DumpVariable::Wait));
-    cb.jnz(SkipRegistersDump);
-
-    // dump all the ymm register
-    for (int i = 0; i < (int)this->registerCount(); i++) {
-      cb.vmovapd(
-          ymmword_ptr(pointer_reg, -64 - this->registerSize() * 8 * (i + 1)),
-          Ymm(i));
-    }
-
-    // set read flag
-    cb.mov(ptr_64(pointer_reg, -8), Imm(firestarter::DumpVariable::Wait));
-
-    cb.bind(SkipRegistersDump);
+  if (DumpRegisters) {
+    emitDumpRegisterCode<Ymm>(Cb, PointerReg, asmjit::x86::ymmword_ptr);
   }
 
-  if (errorDetection) {
-    this->emitErrorDetectionCode<decltype(iter_reg), Ymm>(
-        cb, iter_reg, addrHigh_reg, pointer_reg, temp_reg, temp_reg2);
+  if (ErrorDetection) {
+    emitErrorDetectionCode<decltype(IterReg), Ymm>(Cb, IterReg, AddrHighReg, PointerReg, TempReg, TempReg2);
   }
 
-  cb.test(ptr_64(addrHigh_reg), Imm(LOAD_HIGH));
-  cb.jnz(Loop);
+  Cb.test(ptr_64(AddrHighReg), Imm(LoadThreadWorkType::LoadHigh));
+  Cb.jnz(Loop);
 
-  cb.bind(FunctionExit);
+  Cb.bind(FunctionExit);
 
-  cb.movq(rax, iter_reg);
+  Cb.movq(asmjit::x86::rax, IterReg);
 
-  cb.emitEpilog(frame);
+  Cb.emitEpilog(Frame);
 
-  cb.finalize();
+  Cb.finalize();
 
-  // String sb;
-  // cb.dump(sb);
-
-  Error err = this->rt.add(&this->loadFunction, &code);
-  if (err) {
-    workerLog::error() << "Asmjit adding Assembler to JitRuntime failed in "
-                       << __FILE__ << " at " << __LINE__;
-    return EXIT_FAILURE;
-  }
+  auto CompiledPayloadPtr = CompiledX86Payload::create<ZENFMAPayload>(Stats, Code);
 
   // skip if we could not determine cache size
-  if (l1i_cache_size != 0) {
-    auto loopSize = code.labelOffset(FunctionExit) - code.labelOffset(Loop);
-    auto instructionCachePercentage = 100 * loopSize / l1i_cache_size;
+  if (L1iCacheSize) {
+    auto LoopSize = Code.labelOffset(FunctionExit) - Code.labelOffset(Loop);
+    auto InstructionCachePercentage = 100 * LoopSize / *L1iCacheSize;
 
-    if (loopSize > l1i_cache_size) {
+    if (LoopSize > *L1iCacheSize) {
       workerLog::warn() << "Work-loop is bigger than the L1i-Cache.";
     }
 
-    workerLog::trace() << "Using " << loopSize << " of " << l1i_cache_size
-                       << " Bytes (" << instructionCachePercentage
+    workerLog::trace() << "Using " << LoopSize << " of " << *L1iCacheSize << " Bytes (" << InstructionCachePercentage
                        << "%) from the L1i-Cache for the work-loop.";
-    workerLog::trace() << "Sequence size: " << sequence.size();
-    workerLog::trace() << "Repetition count: " << repetitions;
+    workerLog::trace() << "Sequence size: " << Sequence.size();
+    workerLog::trace() << "Repetition count: " << Repetitions;
   }
 
-  return EXIT_SUCCESS;
+  return CompiledPayloadPtr;
 }
 
-std::list<std::string> ZENFMAPayload::getAvailableInstructions() const {
-  std::list<std::string> instructions;
-
-  transform(this->instructionFlops.begin(), this->instructionFlops.end(),
-            back_inserter(instructions),
-            [](const auto &item) { return item.first; });
-
-  return instructions;
+void ZENFMAPayload::init(double* MemoryAddr, uint64_t BufferSize) const {
+  X86Payload::initMemory(MemoryAddr, BufferSize, 0.27948995982e-4, 0.27948995982e-4);
 }
 
-void ZENFMAPayload::init(unsigned long long *memoryAddr,
-                         unsigned long long bufferSize) {
-  X86Payload::init(memoryAddr, bufferSize, 0.27948995982e-4, 0.27948995982e-4);
-}
+} // namespace firestarter::environment::x86::payload
\ No newline at end of file
diff --git a/src/firestarter/Environment/X86/Platform/X86PlatformConfig.cpp b/src/firestarter/Environment/X86/Platform/X86PlatformConfig.cpp
new file mode 100644
index 00000000..fa4d4399
--- /dev/null
+++ b/src/firestarter/Environment/X86/Platform/X86PlatformConfig.cpp
@@ -0,0 +1,27 @@
+/******************************************************************************
+ * FIRESTARTER - A Processor Stress Test Utility
+ * Copyright (C) 2024 TU Dresden, Center for Information Services and High
+ * Performance Computing
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/\>.
+ *
+ * Contact: daniel.hackenberg@tu-dresden.de
+ *****************************************************************************/
+
+// This file exists to get an entry in the compile commands database. Clangd will interpolate the include directories
+// for header files based on the source file with the best matching score. This file should be the best score for the
+// included header. Therefore we should not see any errors in this file for missing includes. For more information
+// look in the LLVM code base: clang/lib/Tooling/InterpolatingCompilationDatabase.cpp
+
+#include "firestarter/Environment/X86/Platform/X86PlatformConfig.hpp"
\ No newline at end of file
diff --git a/src/firestarter/Environment/X86/X86CPUTopology.cpp b/src/firestarter/Environment/X86/X86CPUTopology.cpp
index 8b8abe2b..64d64cfb 100644
--- a/src/firestarter/Environment/X86/X86CPUTopology.cpp
+++ b/src/firestarter/Environment/X86/X86CPUTopology.cpp
@@ -19,8 +19,8 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
-#include <firestarter/Environment/X86/X86CPUTopology.hpp>
-#include <firestarter/Logging/Log.hpp>
+#include "firestarter/Environment/X86/X86CPUTopology.hpp"
+#include "firestarter/Logging/Log.hpp"
 
 #include <ctime>
 
@@ -31,99 +31,100 @@
 #pragma intrinsic(__rdtsc)
 #endif
 
-using namespace firestarter::environment::x86;
+namespace firestarter::environment::x86 {
 
 X86CPUTopology::X86CPUTopology()
-    : CPUTopology("x86_64"), cpuInfo(asmjit::CpuInfo::host()),
-      _vendor(this->cpuInfo.vendor()) {
-
-  std::stringstream ss;
-  ss << "Family " << this->familyId() << ", Model " << this->modelId()
-     << ", Stepping " << this->stepping();
-  this->_model = ss.str();
+    : CPUTopology("x86_64")
+    , CpuInfo(asmjit::CpuInfo::host())
+    , Vendor(CpuInfo.vendor()) {
+
+  {
+    std::stringstream Ss;
+    Ss << "Family " << familyId() << ", Model " << modelId() << ", Stepping " << stepping();
+    Model = Ss.str();
+  }
 
-  for (int i = 0; i <= (int)asmjit::CpuFeatures::X86::Id::kMaxValue; i++) {
-    if (!this->cpuInfo.hasFeature(i)) {
+  for (auto FeatureId = 0; FeatureId <= asmjit::CpuFeatures::X86::Id::kMaxValue; FeatureId++) {
+    if (!CpuInfo.hasFeature(FeatureId)) {
       continue;
     }
 
-    asmjit::String sb;
+    asmjit::String Sb;
 
-    auto error = asmjit::Formatter::formatFeature(sb, this->cpuInfo.arch(), i);
-    if (error != asmjit::ErrorCode::kErrorOk) {
-      log::warn() << "Formatting cpu features got asmjit error: " << error;
+    auto Error = asmjit::Formatter::formatFeature(Sb, CpuInfo.arch(), FeatureId);
+    if (Error != asmjit::ErrorCode::kErrorOk) {
+      log::warn() << "Formatting cpu features got asmjit error: " << Error;
     }
 
-    this->featureList.push_back(std::string(sb.data()));
+    FeatureList.emplace_back(Sb.data());
   }
 
-  unsigned long long a = 0, b = 0, c = 0, d = 0;
+  uint64_t Rax = 0;
+  uint64_t Rbx = 0;
+  uint64_t Rcx = 0;
+  uint64_t Rdx = 0;
 
   // check if we have rdtsc
-  this->cpuid(&a, &b, &c, &d);
-  if (a >= 1) {
-    a = 1;
-    this->cpuid(&a, &b, &c, &d);
-    if ((int)d & (1 << 4)) {
-      this->_hasRdtsc = true;
-    } else {
-      this->_hasRdtsc = false;
-    }
+  cpuid(&Rax, &Rbx, &Rcx, &Rdx);
+  if (Rax >= 1) {
+    Rax = 1;
+    cpuid(&Rax, &Rbx, &Rcx, &Rdx);
+    HasRdtsc = (Rdx & (1 << 4)) != 0;
   }
 
   // check if we have invarant rdtsc
-  if (this->hasRdtsc()) {
-    a = 0, b = 0, c = 0, d = 0;
+  if (hasRdtsc()) {
+    Rax = 0, Rbx = 0, Rcx = 0, Rdx = 0;
 
-    this->_hasInvariantRdtsc = true;
+    HasInvariantRdtsc = true;
 
     /* TSCs are usable if CPU supports only one frequency in C0 (no
        speedstep/Cool'n'Quite)
        or if multiple frequencies are available and the constant/invariant TSC
        feature flag is set */
 
-    if (0 == this->vendor().compare("INTEL")) {
+    if ("INTEL" == vendor()) {
       /*check if Powermanagement and invariant TSC are supported*/
-      a = 1;
-      this->cpuid(&a, &b, &c, &d);
+      Rax = 1;
+      cpuid(&Rax, &Rbx, &Rcx, &Rdx);
       /* no Frequency control */
-      if ((!(d & (1 << 22))) && (!(c & (1 << 7)))) {
-        this->_hasInvariantRdtsc = true;
+      if ((!(Rdx & (1 << 22))) && (!(Rcx & (1 << 7)))) {
+        HasInvariantRdtsc = true;
       } else {
-        a = 0x80000000;
-        this->cpuid(&a, &b, &c, &d);
-        if (a >= 0x80000007) {
-          a = 0x80000007;
-          this->cpuid(&a, &b, &c, &d);
+        Rax = 0x80000000;
+        cpuid(&Rax, &Rbx, &Rcx, &Rdx);
+        if (Rax >= 0x80000007) {
+          Rax = 0x80000007;
+          cpuid(&Rax, &Rbx, &Rcx, &Rdx);
           /* invariant TSC */
-          if (d & (1 << 8)) {
-            this->_hasInvariantRdtsc = true;
+          if (Rdx & (1 << 8)) {
+            HasInvariantRdtsc = true;
           }
         }
       }
     }
 
-    if (0 == this->vendor().compare("AMD")) {
+    if ("AMD" == vendor()) {
       /*check if Powermanagement and invariant TSC are supported*/
-      a = 0x80000000;
-      this->cpuid(&a, &b, &c, &d);
-      if (a >= 0x80000007) {
-        a = 0x80000007;
-        this->cpuid(&a, &b, &c, &d);
+      Rax = 0x80000000;
+      cpuid(&Rax, &Rbx, &Rcx, &Rdx);
+      if (Rax >= 0x80000007) {
+        Rax = 0x80000007;
+        cpuid(&Rax, &Rbx, &Rcx, &Rdx);
 
         /* no Frequency control */
-        if ((!(d & (1 << 7))) && (!(d & (1 << 1)))) {
-          this->_hasInvariantRdtsc = true;
+        if ((!(Rdx & (1 << 7))) && (!(Rdx & (1 << 1)))) {
+          HasInvariantRdtsc = true;
         }
         /* invariant TSC */
-        if (d & (1 << 8)) {
-          this->_hasInvariantRdtsc = true;
+        if (Rdx & (1 << 8)) {
+          HasInvariantRdtsc = true;
         }
       }
       /* assuming no frequency control if cpuid does not provide the extended
          function to test for it */
       else {
-        this->_hasInvariantRdtsc = true;
+        HasInvariantRdtsc = true;
       }
     }
   }
@@ -133,123 +134,124 @@ X86CPUTopology::X86CPUTopology()
 // only constant TSCs will be used (i.e. power management indepent TSCs)
 // save frequency in highest P-State or use generic fallback if no invarient TSC
 // is available
-unsigned long long X86CPUTopology::clockrate() const {
-  typedef std::chrono::high_resolution_clock Clock;
-  typedef std::chrono::microseconds ticks;
+auto X86CPUTopology::clockrate() const -> uint64_t { // estimates TSC ticks per second
+  using ClockT = std::chrono::high_resolution_clock;
+  using TicksT = std::chrono::microseconds; // wall-clock differences below are in microseconds
 
-  unsigned long long start1_tsc, start2_tsc, end1_tsc, end2_tsc;
-  unsigned long long time_diff;
-  unsigned long long clock_lower_bound, clock_upper_bound, clock;
-  unsigned long long clockrate = 0;
-  int i, num_measurements = 0, min_measurements;
+  uint64_t Clockrate = 0;       // 0 means no sample has been accepted yet
+  uint64_t MinMeasurements = 0; // set below: 5, or 20 when built for __APPLE__ / _WIN32
 
-  Clock::time_point start_time, end_time;
+  ClockT::time_point StartTime;
+  ClockT::time_point EndTime;
 
 #if not(defined(__APPLE__) || defined(_WIN32))
-  auto governor = this->scalingGovernor();
-  if (governor.empty()) {
+  auto Governor = scalingGovernor();
+  if (Governor.empty()) { // no governor name available: use the base-class value instead of measuring
     return CPUTopology::clockrate();
   }
 
   /* non invariant TSCs can be used if CPUs run at fixed frequency */
-  if (!this->hasInvariantRdtsc() && governor.compare("performance") &&
-      governor.compare("powersave")) {
+  if (!hasInvariantRdtsc() && Governor != "performance" && Governor != "powersave") {
     return CPUTopology::clockrate();
   }
 
-  min_measurements = 5;
+  MinMeasurements = 5;
 #else
-  min_measurements = 20;
+  MinMeasurements = 20;
 #endif
 
-  i = 3;
+  for (uint64_t NumMeasurements = 0, TimeDiff = 0, Duration = 3; TimeDiff < 10000 || NumMeasurements < MinMeasurements;
+       Duration += 2) { // stops once the last run took >= 10000 us and enough samples were accepted
+    uint64_t End1Tsc = 0;
+    uint64_t End2Tsc = 0;
 
-  do {
     // start timestamp
-    start1_tsc = this->timestamp();
-    start_time = Clock::now();
-    start2_tsc = this->timestamp();
+    const uint64_t Start1Tsc = timestamp();
+    StartTime = ClockT::now();
+    const uint64_t Start2Tsc = timestamp();
 
-    // waiting
-    do {
-      end1_tsc = this->timestamp();
-    } while (end1_tsc < start2_tsc + 1000000 * i); /* busy waiting */
+    // busy wait until 1000000 * Duration TSC ticks have passed since Start2Tsc
+    for (; End1Tsc < Start2Tsc + 1000000 * Duration;) {
+      End1Tsc = timestamp();
+    }
 
     // end timestamp
-    do {
-      end1_tsc = this->timestamp();
-      end_time = Clock::now();
-      end2_tsc = this->timestamp();
+    End1Tsc = timestamp(); // TSC is sampled directly before and after the wall-clock read
+    EndTime = ClockT::now();
+    End2Tsc = timestamp();
 
-      time_diff =
-          std::chrono::duration_cast<ticks>(end_time - start_time).count();
-    } while (0 == time_diff);
+    TimeDiff = std::chrono::duration_cast<TicksT>(EndTime - StartTime).count(); // microseconds
 
-    clock_lower_bound = (((end1_tsc - start2_tsc) * 1000000) / (time_diff));
-    clock_upper_bound = (((end2_tsc - start1_tsc) * 1000000) / (time_diff));
+    // measurement not long enough (<= 2000 us); this also keeps the divisions below away from zero
+    if (TimeDiff <= 2000) {
+      continue;
+    }
 
     // if both values differ significantly, the measurement could have been
     // interrupted between 2 rdtsc's
-    if (((double)clock_lower_bound > (((double)clock_upper_bound) * 0.999)) &&
-        ((time_diff) > 2000)) {
-      num_measurements++;
-      clock = (clock_lower_bound + clock_upper_bound) / 2;
-      if (clockrate == 0)
-        clockrate = clock;
+    const uint64_t ClockLowerBound = (((End1Tsc - Start2Tsc) * 1000000) / (TimeDiff));
+    const uint64_t ClockUpperBound = (((End2Tsc - Start1Tsc) * 1000000) / (TimeDiff));
+
+    if (static_cast<double>(ClockLowerBound) > ((static_cast<double>(ClockUpperBound)) * 0.999)) {
+      NumMeasurements++; // lower bound is above 99.9% of the upper bound: accept the sample
+      const uint64_t Clock = (ClockLowerBound + ClockUpperBound) / 2;
+      const bool ClockrateUpdateCondition = Clockrate == 0 ||
 #ifndef _WIN32
-      else if (clock < clockrate)
-        clockrate = clock;
+                                            Clock < Clockrate; // keep the smallest accepted sample
 #else
-      else if (clock > clockrate)
-        clockrate = clock;
+                                            Clock > Clockrate; // keep the largest accepted sample
 #endif
+      if (ClockrateUpdateCondition) {
+        Clockrate = Clock;
+      }
     }
-    i += 2;
-  } while (((time_diff) < 10000) || (num_measurements < min_measurements));
+  }
 
-  return clockrate;
+  return Clockrate;
 }
 
-unsigned long long X86CPUTopology::timestamp() const {
-#ifndef _MSC_VER
-  unsigned long long reg_a, reg_d;
-#else
-  unsigned long long i;
-#endif
-
-  if (!this->hasRdtsc()) {
+auto X86CPUTopology::timestamp() const -> uint64_t { // current TSC value, or 0 when hasRdtsc() is false
+  if (!hasRdtsc()) {
     return 0;
   }
 
 #ifndef _MSC_VER
+  // NOLINTBEGIN(misc-const-correctness)
+  uint64_t Rax = 0; // receives the low 32 bits of the counter
+  uint64_t Rdx = 0; // receives the high 32 bits of the counter
+  // NOLINTEND(misc-const-correctness)
+  __asm__ __volatile__("rdtsc;" : "=a"(Rax), "=d"(Rdx));
+  return (Rdx << 32) | (Rax & 0xffffffffULL); // combine both halves into one 64-bit value
 #else
-  i = __rdtsc();
-  return i;
+  return __rdtsc();
 #endif
 }
 
-void X86CPUTopology::cpuid(unsigned long long *a, unsigned long long *b,
-                           unsigned long long *c, unsigned long long *d) const {
+void X86CPUTopology::cpuid(uint64_t* Rax, uint64_t* Rbx, uint64_t* Rcx, uint64_t* Rdx) { // all four are in/out
 #ifndef _MSC_VER
-  unsigned long long reg_a, reg_b, reg_c, reg_d;
-
+  // NOLINTBEGIN(misc-const-correctness)
+  uint64_t RaxOut = 0; // the *Out variables are written only by the asm statement below
+  uint64_t RbxOut = 0;
+  uint64_t RcxOut = 0;
+  uint64_t RdxOut = 0;
+  // NOLINTEND(misc-const-correctness)
   __asm__ __volatile__("cpuid;"
-                       : "=a"(reg_a), "=b"(reg_b), "=c"(reg_c), "=d"(reg_d)
-                       : "a"(*a), "b"(*b), "c"(*c), "d"(*d));
-  *a = reg_a;
-  *b = reg_b;
-  *c = reg_c;
-  *d = reg_d;
+                       : "=a"(RaxOut), "=b"(RbxOut), "=c"(RcxOut), "=d"(RdxOut)
+                       : "a"(*Rax), "b"(*Rbx), "c"(*Rcx), "d"(*Rdx));
+  *Rax = RaxOut; // hand the register contents back through the caller's pointers
+  *Rbx = RbxOut;
+  *Rcx = RcxOut;
+  *Rdx = RdxOut;
 #else
   std::array<int, 4> cpuid;
 
-  __cpuidex(cpuid.data(), *a, *c);
+  __cpuidex(cpuid.data(), *Rax, *Rcx); // only *Rax (leaf) and *Rcx (subleaf) are used as inputs here
 
-  *a = cpuid[0];
-  *b = cpuid[1];
-  *c = cpuid[2];
-  *d = cpuid[3];
+  *Rax = static_cast<uint32_t>(cpuid[0]); // zero-extend: int -> uint64_t would sign-extend values with bit 31 set
+  *Rbx = static_cast<uint32_t>(cpuid[1]);
+  *Rcx = static_cast<uint32_t>(cpuid[2]);
+  *Rdx = static_cast<uint32_t>(cpuid[3]);
 #endif
 }
+
+} // namespace firestarter::environment::x86
\ No newline at end of file
diff --git a/src/firestarter/Environment/X86/X86Environment.cpp b/src/firestarter/Environment/X86/X86Environment.cpp
index d981358d..3ecd89c1 100644
--- a/src/firestarter/Environment/X86/X86Environment.cpp
+++ b/src/firestarter/Environment/X86/X86Environment.cpp
@@ -19,201 +19,155 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
-#include <firestarter/Environment/X86/X86Environment.hpp>
-#include <firestarter/Logging/Log.hpp>
+#include "firestarter/Environment/X86/X86Environment.hpp"
+#include "firestarter/Logging/Log.hpp"
 
 #include <algorithm>
 #include <cstdio>
+#include <iomanip>
 #include <regex>
 
-using namespace firestarter::environment::x86;
+namespace firestarter::environment::x86 {
 
-void X86Environment::evaluateFunctions() {
-  for (auto ctor : this->platformConfigsCtor) {
-    // add asmjit for model and family detection
-    this->platformConfigs.push_back(
-        ctor(this->topology().featuresAsmjit(), this->topology().familyId(),
-             this->topology().modelId(), this->topology().numThreadsPerCore()));
-  }
-
-  for (auto ctor : this->fallbackPlatformConfigsCtor) {
-    this->fallbackPlatformConfigs.push_back(
-        ctor(this->topology().featuresAsmjit(), this->topology().familyId(),
-             this->topology().modelId(), this->topology().numThreadsPerCore()));
-  }
-}
-
-int X86Environment::selectFunction(unsigned functionId,
-                                   bool allowUnavailablePayload) {
-  unsigned id = 1;
-  std::string defaultPayloadName("");
+void X86Environment::selectFunction(unsigned FunctionId, bool AllowUnavailablePayload) { // throws std::invalid_argument
+  unsigned Id = 1; // 1-based id, counted across every config's threads-per-core entries
+  std::optional<std::string> DefaultPayloadName; // set if a default config lacks this machine's threads per core
 
   // if functionId is 0 get the default or fallback
-  for (auto config : this->platformConfigs) {
-    for (auto const &[thread, functionName] : config->getThreadMap()) {
+  for (const auto& PlatformConfigPtr : PlatformConfigs) {
+    for (auto const& ThreadsPerCore : PlatformConfigPtr->settings().threads()) {
       // the selected function
-      if (id == functionId) {
-        if (!config->isAvailable()) {
-          log::error() << "Function " << functionId << " (\"" << functionName
-                       << "\") requires " << config->payload().name()
-                       << ", which is not supported by the processor.";
-          if (!allowUnavailablePayload) {
-            return EXIT_FAILURE;
+      if (Id == FunctionId) {
+        if (!PlatformConfigPtr->isAvailable(topology())) {
+          const auto ErrorString = "Function " + std::to_string(FunctionId) + " (\"" +
+                                   PlatformConfigPtr->functionName(ThreadsPerCore) + "\") requires " +
+                                   PlatformConfigPtr->payload()->name() + ", which is not supported by the processor.";
+          if (AllowUnavailablePayload) { // warn only; the function is still selected below
+            log::warn() << ErrorString;
+          } else {
+            throw std::invalid_argument(ErrorString);
           }
         }
         // found function
-        this->_selectedConfig =
-            new ::firestarter::environment::platform::RuntimeConfig(
-                *config, thread, this->topology().instructionCacheSize());
-        return EXIT_SUCCESS;
+        setConfig(PlatformConfigPtr->cloneConcreate(topology().instructionCacheSize(), ThreadsPerCore));
+        return;
       }
       // default function
-      if (0 == functionId && config->isDefault()) {
-        if (thread == this->topology().numThreadsPerCore()) {
-          this->_selectedConfig =
-              new ::firestarter::environment::platform::RuntimeConfig(
-                  *config, thread, this->topology().instructionCacheSize());
-          return EXIT_SUCCESS;
-        } else {
-          defaultPayloadName = config->payload().name();
+      if (0 == FunctionId && PlatformConfigPtr->isDefault(topology())) {
+        if (ThreadsPerCore == topology().numThreadsPerCore()) {
+          setConfig(PlatformConfigPtr->cloneConcreate(topology().instructionCacheSize(), ThreadsPerCore));
+          return;
         }
+        DefaultPayloadName = PlatformConfigPtr->payload()->name(); // remembered for the warning below
       }
-      id++;
+      Id++;
     }
   }
 
   // no default found
   // use fallback
-  if (0 == functionId) {
-    if (!defaultPayloadName.empty()) {
+  if (0 == FunctionId) {
+    if (DefaultPayloadName) {
       // default payload available, but number of threads per core is not
       // supported
-      log::warn() << "No " << defaultPayloadName << " code path for "
-                  << this->topology().numThreadsPerCore()
+      log::warn() << "No " << *DefaultPayloadName << " code path for " << topology().numThreadsPerCore()
                   << " threads per core!";
     }
-    log::warn() << this->topology().vendor() << " " << this->topology().model()
+    log::warn() << topology().vendor() << " " << topology().model()
                 << " is not supported by this version of FIRESTARTER!\n"
                 << "Check project website for updates.";
 
     // loop over available implementation and check if they are marked as
     // fallback
-    for (auto config : this->fallbackPlatformConfigs) {
-      if (config->isAvailable()) {
-        auto selectedThread = 0;
-        auto selectedFunctionName = std::string("");
-        for (auto const &[thread, functionName] : config->getThreadMap()) {
-          if (thread == this->topology().numThreadsPerCore()) {
-            selectedThread = thread;
-            selectedFunctionName = functionName;
+    for (const auto& FallbackPlatformConfigPtr : FallbackPlatformConfigs) {
+      if (FallbackPlatformConfigPtr->isAvailable(topology())) {
+        std::optional<unsigned> SelectedThreadsPerCore;
+        // prefer the entry whose threads-per-core count equals topology().numThreadsPerCore()
+        for (auto const& ThreadsPerCore : FallbackPlatformConfigPtr->settings().threads()) {
+          if (ThreadsPerCore == topology().numThreadsPerCore()) {
+            SelectedThreadsPerCore = ThreadsPerCore;
           }
         }
-        if (selectedThread == 0) {
-          selectedThread = config->getThreadMap().begin()->first;
-          selectedFunctionName = config->getThreadMap().begin()->second;
+        // Otherwise select the first available thread per core count
+        if (!SelectedThreadsPerCore) {
+          SelectedThreadsPerCore = FallbackPlatformConfigPtr->settings().threads().front();
         }
-        this->_selectedConfig =
-            new ::firestarter::environment::platform::RuntimeConfig(
-                *config, selectedThread,
-                this->topology().instructionCacheSize());
-        log::warn() << "Using function " << selectedFunctionName
+        setConfig(
+            FallbackPlatformConfigPtr->cloneConcreate(topology().instructionCacheSize(), *SelectedThreadsPerCore));
+        log::warn() << "Using function " << FallbackPlatformConfigPtr->functionName(*SelectedThreadsPerCore)
                     << " as fallback.\n"
                     << "You can use the parameter --function to try other "
                        "functions.";
-        return EXIT_SUCCESS;
+        return;
       }
     }
 
     // no fallback found
-    log::error() << "No fallback implementation found for available ISA "
-                    "extensions.";
-    return EXIT_FAILURE;
+    throw std::invalid_argument("No fallback implementation found for available ISA "
+                                "extensions.");
   }
 
-  log::error() << "unknown function id: " << functionId
-               << ", see --avail for available ids";
-  return EXIT_FAILURE;
+  throw std::invalid_argument("unknown function id: " + std::to_string(FunctionId) + ", see --avail for available ids");
 }
 
-int X86Environment::selectInstructionGroups(std::string groups) {
-  const std::string delimiter = ",";
-  const std::regex re("^(\\w+):(\\d+)$");
-  const auto availableInstructionGroups = this->selectedConfig()
-                                              .platformConfig()
-                                              .payload()
-                                              .getAvailableInstructions();
-
-  std::stringstream ss(groups);
-  std::vector<std::pair<std::string, unsigned>> payloadSettings = {};
-
-  while (ss.good()) {
-    std::string token;
-    std::smatch m;
-    std::getline(ss, token, ',');
-
-    if (std::regex_match(token, m, re)) {
-      if (std::find(availableInstructionGroups.begin(),
-                    availableInstructionGroups.end(),
-                    m[1].str()) == availableInstructionGroups.end()) {
-        log::error()
-            << "Invalid instruction-group: " << m[1].str()
-            << "\n       --run-instruction-groups format: multiple INST:VAL "
-               "pairs comma-seperated";
-        return EXIT_FAILURE;
+void X86Environment::selectInstructionGroups(std::string Groups) { // throws std::invalid_argument on bad input
+  const auto Delimiter = ',';
+  const std::regex Re("^(\\w+):(\\d+)$"); // one token: NAME:COUNT
+  const auto AvailableInstructionGroups = config().payload()->getAvailableInstructions();
+
+  std::stringstream Ss(Groups);
+  std::vector<std::pair<std::string, unsigned>> PayloadSettings = {};
+
+  while (Ss.good()) { // split on Delimiter; every token has to match Re
+    std::string Token;
+    std::smatch M;
+    std::getline(Ss, Token, Delimiter);
+
+    if (std::regex_match(Token, M, Re)) {
+      if (std::find(AvailableInstructionGroups.begin(), AvailableInstructionGroups.end(), M[1].str()) ==
+          AvailableInstructionGroups.end()) {
+        throw std::invalid_argument("Invalid instruction-group: " + M[1].str() +
+                                    "\n       --run-instruction-groups format: multiple INST:VAL "
+                                    "pairs comma-seperated");
       }
-      int num = std::stoul(m[2].str());
-      if (num == 0) {
-        log::error()
-            << "instruction-group VAL may not contain number 0"
-            << "\n       --run-instruction-groups format: multiple INST:VAL "
-               "pairs comma-seperated";
-        return EXIT_FAILURE;
+      auto Num = std::stoul(M[2].str()); // NOTE(review): unsigned long, narrowed to unsigned when stored below
+      if (Num == 0) {
+        throw std::invalid_argument("instruction-group VAL may not contain number 0"
+                                    "\n       --run-instruction-groups format: multiple INST:VAL "
+                                    "pairs comma-seperated");
       }
-      payloadSettings.push_back(std::make_pair(m[1].str(), num));
+      PayloadSettings.emplace_back(M[1].str(), Num);
     } else {
-      log::error()
-          << "Invalid symbols in instruction-group: " << token
-          << "\n       --run-instruction-groups format: multiple INST:VAL "
-             "pairs comma-seperated";
-      return EXIT_FAILURE;
+      throw std::invalid_argument("Invalid symbols in instruction-group: " + Token +
+                                  "\n       --run-instruction-groups format: multiple INST:VAL "
+                                  "pairs comma-seperated");
     }
   }
 
-  this->selectedConfig().setPayloadSettings(payloadSettings);
-
-  log::info() << "  Running custom instruction group: " << groups;
+  config().settings().selectInstructionGroups(PayloadSettings); // applied only after every token validated
 
-  return EXIT_SUCCESS;
+  log::info() << "  Running custom instruction group: " << Groups;
 }
 
 void X86Environment::printAvailableInstructionGroups() {
-  std::stringstream ss;
+  std::stringstream Ss; // collects the instruction names as a comma-separated list
 
-  for (auto const &item : this->selectedConfig()
-                              .platformConfig()
-                              .payload()
-                              .getAvailableInstructions()) {
-    ss << item << ",";
+  for (auto const& Item : config().payload()->getAvailableInstructions()) {
+    Ss << Item << ","; // every item is followed by a comma, including the last one
   }
 
-  auto s = ss.str();
-  if (s.size() > 0) {
-    s.pop_back();
+  auto S = Ss.str();
+  if (!S.empty()) {
+    S.pop_back(); // drop the trailing comma written by the loop
   }
 
-  log::info() << " available instruction-groups for payload "
-              << this->selectedConfig().platformConfig().payload().name()
-              << ":\n"
-              << "  " << s;
+  log::info() << " available instruction-groups for payload " << config().payload()->name() << ":\n"
+              << "  " << S;
 }
 
-void X86Environment::setLineCount(unsigned lineCount) {
-  this->selectedConfig().setLineCount(lineCount);
-}
+void X86Environment::setLineCount(unsigned LineCount) { config().settings().setLineCount(LineCount); }
 
-void X86Environment::printSelectedCodePathSummary() {
-  this->selectedConfig().printCodePathSummary();
-}
+void X86Environment::printSelectedCodePathSummary() { config().printCodePathSummary(); }
 
 void X86Environment::printFunctionSummary() {
   log::info() << " available load-functions:\n"
@@ -224,21 +178,19 @@ void X86Environment::printFunctionSummary() {
                  "-------------------------------------------------------------"
                  "-----------------------------";
 
-  unsigned id = 1;
-
-  for (auto const &config : this->platformConfigs) {
-    for (auto const &[thread, functionName] : config->getThreadMap()) {
-      const char *available = config->isAvailable() ? "yes" : "no";
-      const char *fmt = "  %4u | %-30s | %-24s | %s";
-      int sz =
-          std::snprintf(nullptr, 0, fmt, id, functionName.c_str(), available,
-                        config->getDefaultPayloadSettingsString().c_str());
-      std::vector<char> buf(sz + 1);
-      std::snprintf(&buf[0], buf.size(), fmt, id, functionName.c_str(),
-                    available,
-                    config->getDefaultPayloadSettingsString().c_str());
-      log::info() << std::string(&buf[0]);
-      id++;
+  auto Id = 1U;
+
+  for (auto const& Config : PlatformConfigs) {
+    for (auto const& ThreadsPerCore : Config->settings().threads()) {
+      const char* Available = Config->isAvailable(topology()) ? "yes" : "no";
+      const auto& FunctionName = Config->functionName(ThreadsPerCore);
+      const auto& InstructionGroupsString = Config->settings().getInstructionGroupsString();
+
+      log::info() << "  " << std::right << std::setw(4) << Id << " | " << std::left << std::setw(30) << FunctionName
+                  << " | " << std::left << std::setw(24) << Available << " | " << InstructionGroupsString;
+      Id++;
     }
   }
 }
+
+} // namespace firestarter::environment::x86
\ No newline at end of file
diff --git a/src/firestarter/Firestarter.cpp b/src/firestarter/Firestarter.cpp
index 5fb58ad4..379e2039 100644
--- a/src/firestarter/Firestarter.cpp
+++ b/src/firestarter/Firestarter.cpp
@@ -19,438 +19,279 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
-#include <firestarter/Firestarter.hpp>
-#include <firestarter/Logging/Log.hpp>
-#if defined(linux) || defined(__linux__)
-#include <firestarter/Optimizer/Algorithm/NSGA2.hpp>
-#include <firestarter/Optimizer/History.hpp>
-#include <firestarter/Optimizer/Problem/CLIArgumentProblem.hpp>
-extern "C" {
-#include <firestarter/Measurement/Metric/IPCEstimate.h>
-}
-#endif
+#include "firestarter/Firestarter.hpp"
+#include "firestarter/Environment/X86/X86Environment.hpp"
+#include "firestarter/Logging/Log.hpp"
+#include "firestarter/Measurement/Metric/IPCEstimate.hpp"
+#include "firestarter/Optimizer/Algorithm/NSGA2.hpp"
+#include "firestarter/Optimizer/History.hpp"
+#include "firestarter/Optimizer/Problem/CLIArgumentProblem.hpp"
 
 #include <csignal>
-#include <functional>
-#include <thread>
-
-#ifdef _MSC_VER
-#include <intrin.h>
-#endif
-
-using namespace firestarter;
-
-Firestarter::Firestarter(
-    const int argc, const char **argv, std::chrono::seconds const &timeout,
-    unsigned loadPercent, std::chrono::microseconds const &period,
-    unsigned requestedNumThreads, std::string const &cpuBind,
-    bool printFunctionSummary, unsigned functionId, bool listInstructionGroups,
-    std::string const &instructionGroups, unsigned lineCount,
-    bool allowUnavailablePayload, bool dumpRegisters,
-    std::chrono::seconds const &dumpRegistersTimeDelta,
-    std::string const &dumpRegistersOutpath, bool errorDetection, int gpus,
-    unsigned gpuMatrixSize, bool gpuUseFloat, bool gpuUseDouble,
-    bool listMetrics, bool measurement,
-    std::chrono::milliseconds const &startDelta,
-    std::chrono::milliseconds const &stopDelta,
-    std::chrono::milliseconds const &measurementInterval,
-    std::vector<std::string> const &metricPaths,
-    std::vector<std::string> const &stdinMetrics, bool optimize,
-    std::chrono::seconds const &preheat,
-    std::string const &optimizationAlgorithm,
-    std::vector<std::string> const &optimizationMetrics,
-    std::chrono::seconds const &evaluationDuration, unsigned individuals,
-    std::string const &optimizeOutfile, unsigned generations, double nsga2_cr,
-    double nsga2_m)
-    : _argc(argc), _argv(argv), _timeout(timeout), _loadPercent(loadPercent),
-      _period(period), _dumpRegisters(dumpRegisters),
-      _dumpRegistersTimeDelta(dumpRegistersTimeDelta),
-      _dumpRegistersOutpath(dumpRegistersOutpath),
-      _errorDetection(errorDetection), _gpus(gpus),
-      _gpuMatrixSize(gpuMatrixSize), _gpuUseFloat(gpuUseFloat),
-      _gpuUseDouble(gpuUseDouble), _startDelta(startDelta),
-      _stopDelta(stopDelta), _measurement(measurement), _optimize(optimize),
-      _preheat(preheat), _optimizationAlgorithm(optimizationAlgorithm),
-      _optimizationMetrics(optimizationMetrics),
-      _evaluationDuration(evaluationDuration), _individuals(individuals),
-      _optimizeOutfile(optimizeOutfile), _generations(generations),
-      _nsga2_cr(nsga2_cr), _nsga2_m(nsga2_m) {
-  int returnCode;
-
-  _load = (_period * _loadPercent) / 100;
-  if (_loadPercent == 100 || _load == std::chrono::microseconds::zero()) {
-    _period = std::chrono::microseconds::zero();
-  }
+#include <cstdlib>
+#include <memory>
+
+namespace firestarter {
 
-#if defined(linux) || defined(__linux__)
-#else
-  (void)listMetrics;
-  (void)measurementInterval;
-  (void)metricPaths;
-  (void)stdinMetrics;
-#endif
-
-#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) ||            \
-    defined(_M_X64)
-  this->_environment = new environment::x86::X86Environment();
-#endif
-
-  if (EXIT_SUCCESS != (returnCode = this->environment().evaluateCpuAffinity(
-                           requestedNumThreads, cpuBind))) {
-    std::exit(returnCode);
+Firestarter::Firestarter(Config&& ProvidedConfig)
+    : Cfg(std::move(ProvidedConfig)) {
+  if constexpr (firestarter::OptionalFeatures.IsX86) {
+    Environment = std::make_unique<environment::x86::X86Environment>();
   }
 
-#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) ||            \
-    defined(_M_X64)
-  // Error detection uses crc32 instruction added by the SSE4.2 extension to x86
-  if (_errorDetection) {
-    if (!_environment->topology().featuresAsmjit().has(asmjit::CpuFeatures::X86::kSSE4_2)) {
-      throw std::invalid_argument("Option --error-detection requires the crc32 "
-                                  "instruction added with SSE_4_2.\n");
+  Environment->evaluateCpuAffinity(Cfg.RequestedNumThreads, Cfg.CpuBind);
+
+  if constexpr (firestarter::OptionalFeatures.IsX86) {
+    // Error detection uses crc32 instruction added by the SSE4.2 extension to x86
+    if (Cfg.ErrorDetection) {
+      const auto& X86Env = *dynamic_cast<environment::x86::X86Environment*>(Environment.get());
+      if (!X86Env.topology().featuresAsmjit().has(asmjit::CpuFeatures::X86::kSSE4_2)) {
+        throw std::invalid_argument("Option --error-detection requires the crc32 "
+                                    "instruction added with SSE_4_2.\n");
+      }
     }
   }
-#endif
 
-  if (_errorDetection && this->environment().requestedNumThreads() < 2) {
-    throw std::invalid_argument(
-        "Option --error-detection must run with 2 or more threads. Number of "
-        "threads is " +
-        std::to_string(this->environment().requestedNumThreads()) + "\n");
+  if (Cfg.ErrorDetection && Environment->requestedNumThreads() < 2) {
+    throw std::invalid_argument("Option --error-detection must run with 2 or more threads. Number of "
+                                "threads is " +
+                                std::to_string(Environment->requestedNumThreads()) + "\n");
   }
 
-  this->environment().evaluateFunctions();
-
-  if (printFunctionSummary) {
-    this->environment().printFunctionSummary();
-    std::exit(EXIT_SUCCESS);
+  if (Cfg.PrintFunctionSummary) {
+    Environment->printFunctionSummary();
+    safeExit(EXIT_SUCCESS);
   }
 
-  if (EXIT_SUCCESS != (returnCode = this->environment().selectFunction(
-                           functionId, allowUnavailablePayload))) {
-    std::exit(returnCode);
-  }
+  Environment->selectFunction(Cfg.FunctionId, Cfg.AllowUnavailablePayload);
 
-  if (listInstructionGroups) {
-    this->environment().printAvailableInstructionGroups();
-    std::exit(EXIT_SUCCESS);
+  if (Cfg.ListInstructionGroups) {
+    Environment->printAvailableInstructionGroups();
+    safeExit(EXIT_SUCCESS);
   }
 
-  if (!instructionGroups.empty()) {
-    if (EXIT_SUCCESS !=
-        (returnCode =
-             this->environment().selectInstructionGroups(instructionGroups))) {
-      std::exit(returnCode);
-    }
+  if (!Cfg.InstructionGroups.empty()) {
+    Environment->selectInstructionGroups(Cfg.InstructionGroups);
   }
 
-  if (lineCount != 0) {
-    this->environment().setLineCount(lineCount);
+  if (Cfg.LineCount != 0) {
+    Environment->setLineCount(Cfg.LineCount);
   }
 
-#if defined(linux) || defined(__linux__)
-  if (_measurement || listMetrics || _optimize) {
-    _measurementWorker = std::make_shared<measurement::MeasurementWorker>(
-        measurementInterval, this->environment().requestedNumThreads(),
-        metricPaths, stdinMetrics);
+  if constexpr (firestarter::OptionalFeatures.OptimizationEnabled) {
+    if (Cfg.Measurement || Cfg.ListMetrics || Cfg.Optimize) {
+      MeasurementWorker = std::make_shared<measurement::MeasurementWorker>(
+          Cfg.MeasurementInterval, Environment->requestedNumThreads(), Cfg.MetricPaths, Cfg.StdinMetrics);
 
-    if (listMetrics) {
-      log::info() << _measurementWorker->availableMetrics();
-      std::exit(EXIT_SUCCESS);
-    }
+      if (Cfg.ListMetrics) {
+        log::info() << MeasurementWorker->availableMetrics();
+        safeExit(EXIT_SUCCESS);
+      }
+
+      // init all metrics
+      auto All = MeasurementWorker->metricNames();
+      auto Initialized = MeasurementWorker->initMetrics(All);
 
-    // init all metrics
-    auto all = _measurementWorker->metricNames();
-    auto initialized = _measurementWorker->initMetrics(all);
+      if (Initialized.empty()) {
+        throw std::invalid_argument("No metrics initialized");
+      }
 
-    if (initialized.size() == 0) {
-      log::error() << "No metrics initialized";
-      std::exit(EXIT_FAILURE);
+      // every requested optimization metric must exist and be initialized; a leading '-' on the name is accepted
+      for (auto const& OptimizationMetric : Cfg.OptimizationMetrics) {
+        auto NameEqual = [OptimizationMetric](auto const& Name) {
+          auto InvertedName = "-" + Name;
+          return Name == OptimizationMetric || InvertedName == OptimizationMetric;
+        };
+        // metric name is not found
+        if (std::find_if(All.begin(), All.end(), NameEqual) == All.end()) {
+          throw std::invalid_argument("Metric \"" + OptimizationMetric + "\" does not exist.");
+        }
+        // metric has not initialized properly
+        if (std::find_if(Initialized.begin(), Initialized.end(), NameEqual) == Initialized.end()) {
+          throw std::invalid_argument("Metric \"" + OptimizationMetric + "\" failed to initialize.");
+        }
+      }
     }
 
-    // check if selected metrics are initialized
-    for (auto const &optimizationMetric : optimizationMetrics) {
-      auto nameEqual = [optimizationMetric](auto const &name) {
-        auto invertedName = "-" + name;
-        return name.compare(optimizationMetric) == 0 ||
-               invertedName.compare(optimizationMetric) == 0;
+    if (Cfg.Optimize) {
+      auto ApplySettings = [this](std::vector<std::pair<std::string, unsigned>> const& Setting) {
+        using Clock = std::chrono::high_resolution_clock;
+        auto Start = Clock::now();
+
+        signalSwitch(Setting);
+
+        LoadVar = LoadThreadWorkType::LoadHigh;
+
+        signalWork();
+
+        uint64_t StartTimestamp = (std::numeric_limits<uint64_t>::max)();
+        uint64_t StopTimestamp = 0;
+
+        for (auto const& Thread : LoadThreads) {
+          auto Td = Thread.second;
+
+          StartTimestamp = std::min<uint64_t>(StartTimestamp, Td->LastRun.StartTsc);
+          StopTimestamp = std::max<uint64_t>(StopTimestamp, Td->LastRun.StopTsc);
+        }
+
+        for (auto const& Thread : LoadThreads) {
+          auto Td = Thread.second;
+          IpcEstimateMetricData::insertValue(
+              static_cast<double>(Td->LastRun.Iterations) *
+              static_cast<double>(LoadThreads.front().second->CompiledPayloadPtr->stats().Instructions) /
+              static_cast<double>(StopTimestamp - StartTimestamp));
+        }
+
+        auto End = Clock::now();
+
+        log::trace() << "Switching payload took "
+                     << std::chrono::duration_cast<std::chrono::milliseconds>(End - Start).count() << "ms";
       };
-      // metric name is not found
-      if (std::find_if(all.begin(), all.end(), nameEqual) == all.end()) {
-        log::error() << "Metric \"" << optimizationMetric
-                     << "\" does not exist.";
-        std::exit(EXIT_FAILURE);
-      }
-      // metric has not initialized properly
-      if (std::find_if(initialized.begin(), initialized.end(), nameEqual) ==
-          initialized.end()) {
-        log::error() << "Metric \"" << optimizationMetric
-                     << "\" failed to initialize.";
-        std::exit(EXIT_FAILURE);
+
+      auto Prob = std::make_shared<firestarter::optimizer::problem::CLIArgumentProblem>(
+          std::move(ApplySettings), MeasurementWorker, Cfg.OptimizationMetrics, Cfg.EvaluationDuration, Cfg.StartDelta,
+          Cfg.StopDelta, Environment->config().settings().instructionGroupItems());
+
+      Population = std::make_unique<firestarter::optimizer::Population>(std::move(Prob));
+
+      if (Cfg.OptimizationAlgorithm == "NSGA2") {
+        Algorithm =
+            std::make_unique<firestarter::optimizer::algorithm::NSGA2>(Cfg.Generations, Cfg.Nsga2Cr, Cfg.Nsga2M);
+      } else {
+        throw std::invalid_argument("Algorithm " + Cfg.OptimizationAlgorithm + " unknown.");
       }
-    }
-  }
 
-  if (_optimize) {
-    auto applySettings = std::bind(
-        [this](std::vector<std::pair<std::string, unsigned>> const &setting) {
-          using Clock = std::chrono::high_resolution_clock;
-          auto start = Clock::now();
-
-          for (auto &thread : this->loadThreads) {
-            auto td = thread.second;
-
-            td->config().setPayloadSettings(setting);
-          }
-
-          for (auto const &thread : this->loadThreads) {
-            auto td = thread.second;
-
-            td->mutex.lock();
-          }
-
-          for (auto const &thread : this->loadThreads) {
-            auto td = thread.second;
-
-            td->comm = THREAD_SWITCH;
-            td->mutex.unlock();
-          }
-
-          this->loadVar = LOAD_SWITCH;
-
-          for (auto const &thread : this->loadThreads) {
-            auto td = thread.second;
-            bool ack;
-
-            do {
-              td->mutex.lock();
-              ack = td->ack;
-              td->mutex.unlock();
-            } while (!ack);
-
-            td->mutex.lock();
-            td->ack = false;
-            td->mutex.unlock();
-          }
-
-          this->loadVar = LOAD_HIGH;
-
-          this->signalWork();
-
-          unsigned long long startTimestamp = 0xffffffffffffffff;
-          unsigned long long stopTimestamp = 0;
-
-          for (auto const &thread : this->loadThreads) {
-            auto td = thread.second;
-
-            if (startTimestamp > td->lastStartTsc) {
-              startTimestamp = td->lastStartTsc;
-            }
-            if (stopTimestamp < td->lastStopTsc) {
-              stopTimestamp = td->lastStopTsc;
-            }
-          }
-
-          for (auto const &thread : this->loadThreads) {
-            auto td = thread.second;
-            ipc_estimate_metric_insert(
-                (double)td->lastIterations *
-                (double)this->loadThreads.front()
-                    .second->config()
-                    .payload()
-                    .instructions() /
-                (double)(stopTimestamp - startTimestamp));
-          }
-
-          auto end = Clock::now();
-
-          log::trace() << "Switching payload took "
-                       << std::chrono::duration_cast<std::chrono::milliseconds>(
-                              end - start)
-                              .count()
-                       << "ms";
-        },
-        std::placeholders::_1);
-
-    auto prob =
-        std::make_shared<firestarter::optimizer::problem::CLIArgumentProblem>(
-            std::move(applySettings), _measurementWorker, _optimizationMetrics,
-            _evaluationDuration, _startDelta, _stopDelta,
-            this->environment().selectedConfig().payloadItems());
-
-    _population = firestarter::optimizer::Population(std::move(prob));
-
-    if (_optimizationAlgorithm == "NSGA2") {
-      _algorithm = std::make_unique<firestarter::optimizer::algorithm::NSGA2>(
-          _generations, _nsga2_cr, _nsga2_m);
-    } else {
-      throw std::invalid_argument("Algorithm " + _optimizationAlgorithm +
-                                  " unknown.");
+      Algorithm->check(Population->problem(), Cfg.Individuals);
     }
-
-    _algorithm->checkPopulation(
-        static_cast<firestarter::optimizer::Population const &>(_population),
-        _individuals);
   }
-#endif
 
-  this->environment().printSelectedCodePathSummary();
+  Environment->printSelectedCodePathSummary();
 
-  log::info() << this->environment().topology();
+  log::info() << Environment->topology();
 
   // setup thread with either high or low load configured at the start
   // low loads has to know the length of the period
-  if (EXIT_SUCCESS != (returnCode = this->initLoadWorkers((_loadPercent == 0),
-                                                          _period.count()))) {
-    std::exit(returnCode);
-  }
+  initLoadWorkers();
 
   // add some signal handler for aborting FIRESTARTER
-#ifndef _WIN32
-  std::signal(SIGALRM, Firestarter::sigalrmHandler);
-#endif
-
-  std::signal(SIGTERM, Firestarter::sigtermHandler);
-  std::signal(SIGINT, Firestarter::sigtermHandler);
-}
-
-Firestarter::~Firestarter() {
-#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_HIP)
-  _cuda.reset();
-#endif
-#ifdef FIRESTARTER_BUILD_ONEAPI
-  _oneapi.reset();
-#endif
+  if constexpr (!firestarter::OptionalFeatures.IsWin32) {
+    (void)std::signal(SIGALRM, Firestarter::sigalrmHandler);
+  }
 
-  delete _environment;
+  (void)std::signal(SIGTERM, Firestarter::sigtermHandler);
+  (void)std::signal(SIGINT, Firestarter::sigtermHandler);
 }
 
 void Firestarter::mainThread() {
-  this->environment().printThreadSummary();
+  Environment->printThreadSummary();
 
-#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_HIP)
-  _cuda = std::make_unique<cuda::Cuda>(&this->loadVar, _gpuUseFloat,
-                                       _gpuUseDouble, _gpuMatrixSize, _gpus);
-#endif
+  Cuda = std::make_unique<cuda::Cuda>(LoadVar, Cfg.GpuUseFloat, Cfg.GpuUseDouble, Cfg.GpuMatrixSize, Cfg.Gpus);
+  Oneapi = std::make_unique<oneapi::OneAPI>(LoadVar, Cfg.GpuUseFloat, Cfg.GpuUseDouble, Cfg.GpuMatrixSize, Cfg.Gpus);
 
-#ifdef FIRESTARTER_BUILD_ONEAPI
-  _oneapi = std::make_unique<oneapi::OneAPI>(&this->loadVar, _gpuUseFloat,
-                                       _gpuUseDouble, _gpuMatrixSize, _gpus);
-#endif
-
-
-#if defined(linux) || defined(__linux__)
-  // if measurement is enabled, start it here
-  if (_measurement) {
-    _measurementWorker->startMeasurement();
+  if constexpr (firestarter::OptionalFeatures.OptimizationEnabled) {
+    // if measurement is enabled, start it here
+    if (Cfg.Measurement) {
+      MeasurementWorker->startMeasurement();
+    }
   }
-#endif
 
-  this->signalWork();
+  signalWork();
 
-#ifdef FIRESTARTER_DEBUG_FEATURES
-  if (_dumpRegisters) {
-    int returnCode;
-    if (EXIT_SUCCESS != (returnCode = this->initDumpRegisterWorker(
-                             _dumpRegistersTimeDelta, _dumpRegistersOutpath))) {
-      std::exit(returnCode);
+  if constexpr (firestarter::OptionalFeatures.DumpRegisterEnabled) {
+    if (Cfg.DumpRegisters) {
+      initDumpRegisterWorker();
     }
   }
-#endif
 
   // worker thread for load control
-  this->watchdogWorker(_period, _load, _timeout);
+  watchdogWorker(Cfg.Period, Cfg.Load, Cfg.Timeout);
 
-#if defined(linux) || defined(__linux__)
-  // check if optimization is selected
-  if (_optimize) {
-    auto startTime = optimizer::History::getTime();
+  if constexpr (firestarter::OptionalFeatures.OptimizationEnabled) {
+    // check if optimization is selected
+    if (Cfg.Optimize) {
+      auto StartTime = optimizer::History::getTime();
 
-    Firestarter::_optimizer = std::make_unique<optimizer::OptimizerWorker>(
-        std::move(_algorithm), _population, _optimizationAlgorithm,
-        _individuals, _preheat);
+      Firestarter::Optimizer = std::make_unique<optimizer::OptimizerWorker>(std::move(Algorithm), std::move(Population),
+                                                                            Cfg.Individuals, Cfg.Preheat);
 
-    // wait here until optimizer thread terminates
-    Firestarter::_optimizer->join();
+      // wait here until optimizer thread terminates
+      Firestarter::Optimizer->join();
+      Firestarter::Optimizer.reset();
 
-    auto payloadItems = this->environment().selectedConfig().payloadItems();
+      auto PayloadItems = Environment->config().settings().instructionGroupItems();
 
-    firestarter::optimizer::History::save(_optimizeOutfile, startTime,
-                                          payloadItems, _argc, _argv);
+      firestarter::optimizer::History::save(Cfg.OptimizeOutfile, StartTime, PayloadItems, Cfg.Argc, Cfg.Argv);
 
-    // print the best 20 according to each metric
-    firestarter::optimizer::History::printBest(_optimizationMetrics,
-                                               payloadItems);
+      // print the best 20 according to each metric
+      firestarter::optimizer::History::printBest(Cfg.OptimizationMetrics, PayloadItems);
 
-    // stop all the load threads
-    std::raise(SIGTERM);
+      // stop all the load threads
+      (void)std::raise(SIGTERM);
+    }
   }
-#endif
 
   // wait for watchdog to timeout or until user terminates
-  this->joinLoadWorkers();
-#ifdef FIRESTARTER_DEBUG_FEATURES
-  if (_dumpRegisters) {
-    this->joinDumpRegisterWorker();
+  joinLoadWorkers();
+  if constexpr (firestarter::OptionalFeatures.DumpRegisterEnabled) {
+    if (Cfg.DumpRegisters) {
+      joinDumpRegisterWorker();
+    }
   }
-#endif
 
-  if (!_optimize) {
-    this->printPerformanceReport();
+  if (!Cfg.Optimize) {
+    printPerformanceReport();
   }
 
-#if defined(linux) || defined(__linux__)
-  // if measurment is enabled, stop it here
-  if (_measurement) {
-    // TODO: clear this up
-    log::info() << "metric,num_timepoints,duration_ms,average,stddev";
-    for (auto const &[name, sum] :
-         _measurementWorker->getValues(_startDelta, _stopDelta)) {
-      log::info() << std::quoted(name) << "," << sum.num_timepoints << ","
-                  << sum.duration.count() << "," << sum.average << ","
-                  << sum.stddev;
+  if constexpr (firestarter::OptionalFeatures.OptimizationEnabled) {
+    // if measurement is enabled, stop it here
+    if (Cfg.Measurement) {
+      // TODO(Issue #77): clear this up
+      log::info() << "metric,num_timepoints,duration_ms,average,stddev";
+      for (auto const& [name, sum] : MeasurementWorker->getValues(Cfg.StartDelta, Cfg.StopDelta)) {
+        log::info() << std::quoted(name) << "," << sum.NumTimepoints << "," << sum.Duration.count() << ","
+                    << sum.Average << "," << sum.Stddev;
+      }
     }
   }
-#endif
 
-  if (_errorDetection) {
-    this->printThreadErrorReport();
+  if (Cfg.ErrorDetection) {
+    printThreadErrorReport();
   }
 }
 
-void Firestarter::setLoad(unsigned long long value) {
+void Firestarter::setLoad(LoadThreadWorkType Value) {
   // signal load change to workers
-  Firestarter::loadVar = value;
-#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) ||            \
-    defined(_M_X64)
-#ifndef _MSC_VER
-  __asm__ __volatile__("mfence;");
-#else
-  _mm_mfence();
-#endif
-#else
-#error "FIRESTARTER is not implemented for this ISA"
-#endif
+  Firestarter::LoadVar = Value;
+  if constexpr (firestarter::OptionalFeatures.IsX86) {
+    if constexpr (firestarter::OptionalFeatures.IsMsc) {
+      _mm_mfence();
+    } else {
+      __asm__ __volatile__("mfence;");
+    }
+  }
 }
 
-void Firestarter::sigalrmHandler(int signum) { (void)signum; }
+void Firestarter::sigalrmHandler(int Signum) { (void)Signum; }
 
-void Firestarter::sigtermHandler(int signum) {
-  (void)signum;
+void Firestarter::sigtermHandler(int Signum) {
+  (void)Signum;
 
-  Firestarter::setLoad(LOAD_STOP);
+  Firestarter::setLoad(LoadThreadWorkType::LoadStop);
   // exit loop
   // used in case of 0 < load < 100
   // or interrupt sleep for timeout
   {
-    std::lock_guard<std::mutex> lk(Firestarter::_watchdogTerminateMutex);
-    Firestarter::_watchdog_terminate = true;
+    const std::lock_guard<std::mutex> Lk(Firestarter::WatchdogTerminateMutex);
+    Firestarter::WatchdogTerminate = true;
   }
-  Firestarter::_watchdogTerminateAlert.notify_all();
+  Firestarter::WatchdogTerminateAlert.notify_all();
 
-#if defined(linux) || defined(__linux__)
-  // if we have optimization running stop it
-  if (Firestarter::_optimizer) {
-    Firestarter::_optimizer->kill();
+  if constexpr (firestarter::OptionalFeatures.OptimizationEnabled) {
+    // if we have optimization running stop it
+    if (Firestarter::Optimizer) {
+      Firestarter::Optimizer->kill();
+    }
   }
-#endif
 }
+
+} // namespace firestarter
\ No newline at end of file
diff --git a/src/firestarter/LoadWorker.cpp b/src/firestarter/LoadWorker.cpp
index 3c922cf6..4d473832 100644
--- a/src/firestarter/LoadWorker.cpp
+++ b/src/firestarter/LoadWorker.cpp
@@ -19,14 +19,15 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
-#include <firestarter/ErrorDetectionStruct.hpp>
-#include <firestarter/Firestarter.hpp>
-#include <firestarter/Logging/Log.hpp>
+#include "firestarter/AlignedAlloc.hpp"
+#include "firestarter/Constants.hpp"
+#include "firestarter/ErrorDetectionStruct.hpp"
+#include "firestarter/Firestarter.hpp"
+#include "firestarter/LoadWorkerData.hpp"
+#include "firestarter/Logging/Log.hpp"
 
 #if defined(linux) || defined(__linux__)
-extern "C" {
-#include <firestarter/Measurement/Metric/IPCEstimate.h>
-}
+#include "firestarter/Measurement/Metric/IPCEstimate.hpp"
 #endif
 
 #ifdef ENABLE_VTRACING
@@ -37,146 +38,135 @@ extern "C" {
 #endif
 
 #include <cmath>
+#include <cstdint>
 #include <cstdlib>
-#include <functional>
+#include <iomanip>
+#include <limits>
+#include <sstream>
 #include <thread>
 
-using namespace firestarter;
-
-auto aligned_free_deleter = [](void *p) { ALIGNED_FREE(p); };
+namespace firestarter {
 
-int Firestarter::initLoadWorkers(bool lowLoad, unsigned long long period) {
-  int returnCode;
-
-  if (EXIT_SUCCESS != (returnCode = this->environment().setCpuAffinity(0))) {
-    return EXIT_FAILURE;
-  }
+void Firestarter::initLoadWorkers() {
+  Environment->setCpuAffinity(0);
 
   // setup load variable to execute low or high load once the threads switch to
   // work.
-  this->loadVar = lowLoad ? LOAD_LOW : LOAD_HIGH;
+  LoadVar = Cfg.Load == std::chrono::microseconds::zero() ? LoadThreadWorkType::LoadLow : LoadThreadWorkType::LoadHigh;
 
-  auto numThreads = this->environment().requestedNumThreads();
+  auto NumThreads = Environment->requestedNumThreads();
 
   // create a std::vector<std::shared_ptr<>> of requestenNumThreads()
   // communication pointers and add these to the threaddata
-  if (_errorDetection) {
-    for (unsigned long long i = 0; i < numThreads; i++) {
-      auto commPtr = reinterpret_cast<unsigned long long *>(
-          ALIGNED_MALLOC(2 * sizeof(unsigned long long), 64));
-      assert(commPtr);
-      this->errorCommunication.push_back(
-          std::shared_ptr<unsigned long long>(commPtr, aligned_free_deleter));
-      log::debug() << "Threads " << (i + numThreads - 1) % numThreads << " and "
-                   << i << " commPtr = 0x" << std::setfill('0')
-                   << std::setw(sizeof(unsigned long long) * 2) << std::hex
-                   << (unsigned long long)commPtr;
+  if (Cfg.ErrorDetection) {
+    for (uint64_t I = 0; I < NumThreads; I++) {
+      auto* CommPtr = static_cast<uint64_t*>(AlignedAlloc::malloc(2 * sizeof(uint64_t)));
+      assert(CommPtr);
+      ErrorCommunication.emplace_back(std::shared_ptr<uint64_t>(CommPtr, AlignedAlloc::free));
+      log::debug() << "Threads " << (I + NumThreads - 1) % NumThreads << " and " << I << " commPtr = 0x"
+                   << std::setfill('0') << std::setw(sizeof(uint64_t) * 2)
+                   << std::hex
+                   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
+                   << reinterpret_cast<uint64_t>(CommPtr);
     }
   }
 
-  for (unsigned long long i = 0; i < numThreads; i++) {
-    auto td = std::make_shared<LoadWorkerData>(i, this->environment(),
-                                               &this->loadVar, period,
-                                               _dumpRegisters, _errorDetection);
+  for (uint64_t I = 0; I < NumThreads; I++) {
+    auto Td = std::make_shared<LoadWorkerData>(I, std::cref(*Environment), std::ref(LoadVar), Cfg.Period,
+                                               Cfg.DumpRegisters, Cfg.ErrorDetection);
 
-    if (_errorDetection) {
+    if (Cfg.ErrorDetection) {
       // distribute pointers for error deteciton. (set threads in a ring)
       // give this thread the left pointer i and right pointer (i+1) %
       // requestedNumThreads().
-      td->setErrorCommunication(this->errorCommunication[i],
-                                this->errorCommunication[(i + 1) % numThreads]);
+      Td->setErrorCommunication(ErrorCommunication[I], ErrorCommunication[(I + 1) % NumThreads]);
     }
 
-    auto dataCacheSizeIt =
-        td->config().platformConfig().dataCacheBufferSize().begin();
-    auto ramBufferSize = td->config().platformConfig().ramBufferSize();
-
-    td->buffersizeMem = (*dataCacheSizeIt + *std::next(dataCacheSizeIt, 1) +
-                         *std::next(dataCacheSizeIt, 2) + ramBufferSize) /
-                        td->config().thread() / sizeof(unsigned long long);
+    Td->BuffersizeMem = Td->config().settings().totalBufferSizePerThread() / sizeof(uint64_t);
 
     // create the thread
-    std::thread t(Firestarter::loadThreadWorker, td);
+    std::thread T(Firestarter::loadThreadWorker, Td);
 
-    log::trace() << "Created thread #" << i << " with ID: " << t.get_id();
+    log::trace() << "Created thread #" << I << " with ID: " << T.get_id();
 
-    if (i == 0) {
+    if (I == 0) {
       // only show error for all worker threads except first.
-      firestarter::logging::FirstWorkerThreadFilter<
-          firestarter::logging::record>::setFirstThread(t.get_id());
+      firestarter::logging::FirstWorkerThreadFilter<firestarter::logging::record>::setFirstThread(T.get_id());
     }
 
-    this->loadThreads.push_back(std::make_pair(std::move(t), td));
+    LoadThreads.emplace_back(std::move(T), Td);
   }
 
-  this->signalLoadWorkers(THREAD_INIT);
-
-  return EXIT_SUCCESS;
+  signalLoadWorkers(LoadThreadState::ThreadInit);
 }
 
-void Firestarter::signalLoadWorkers(int comm) {
-  bool ack;
-
-  // start the work
-  for (auto const &thread : this->loadThreads) {
-    auto td = thread.second;
+void Firestarter::signalLoadWorkers(const LoadThreadState State, void (*Function)()) {
+  // acquire the lock on all threads
+  for (auto const& Thread : LoadThreads) {
+    auto Td = Thread.second;
 
-    td->mutex.lock();
+    Td->Communication.Mutex.lock();
   }
 
-  for (auto const &thread : this->loadThreads) {
-    auto td = thread.second;
+  // switch the state on all threads
+  for (auto const& Thread : LoadThreads) {
+    auto Td = Thread.second;
 
-    td->comm = comm;
-    td->mutex.unlock();
+    Td->Communication.State = State;
+    Td->Communication.Mutex.unlock();
   }
 
-  for (auto const &thread : this->loadThreads) {
-    auto td = thread.second;
+  // Execute a function after the state in the threads has been updated. This may be required to terminate an inner
+  // loop.
+  if (Function) {
+    Function();
+  }
+
+  // wait for all threads to finish
+  for (auto const& Thread : LoadThreads) {
+    auto Td = Thread.second;
 
-    do {
-      td->mutex.lock();
-      ack = td->ack;
-      td->mutex.unlock();
-    } while (!ack);
+    // Wait until we receive the acknowledgement
+    for (bool Ack = false; !Ack;) {
+      Td->Communication.Mutex.lock();
+      Ack = Td->Communication.Ack;
+      Td->Communication.Mutex.unlock();
+    }
 
-    td->mutex.lock();
-    td->ack = false;
-    td->mutex.unlock();
+    Td->Communication.Mutex.lock();
+    Td->Communication.Ack = false;
+    Td->Communication.Mutex.unlock();
   }
 }
 
 void Firestarter::joinLoadWorkers() {
   // wait for threads after watchdog has requested termination
-  for (auto &thread : this->loadThreads) {
-    thread.first.join();
+  for (auto& Thread : LoadThreads) {
+    Thread.first.join();
   }
 }
 
 void Firestarter::printThreadErrorReport() {
-  if (_errorDetection) {
-    auto maxSize = this->loadThreads.size();
+  if (Cfg.ErrorDetection) {
+    auto MaxSize = LoadThreads.size();
 
-    std::vector<bool> errors(maxSize, false);
+    std::vector<bool> Errors(MaxSize, false);
 
-    for (decltype(maxSize) i = 0; i < maxSize; i++) {
-      auto errorDetectionStruct =
-          this->loadThreads[i].second->errorDetectionStruct();
+    for (decltype(MaxSize) I = 0; I < MaxSize; I++) {
+      const auto& ErrorDetectionStructPtr = LoadThreads[I].second->errorDetectionStruct();
 
-      if (errorDetectionStruct->errorLeft) {
-        errors[(i + maxSize - 1) % maxSize] = true;
+      if (ErrorDetectionStructPtr.Left.Error) {
+        Errors[(I + MaxSize - 1) % MaxSize] = true;
       }
-      if (errorDetectionStruct->errorRight) {
-        errors[i] = true;
+      if (ErrorDetectionStructPtr.Right.Error) {
+        Errors[I] = true;
       }
     }
 
-    for (decltype(maxSize) i = 0; i < maxSize; i++) {
-      if (errors[i]) {
-        log::fatal()
-            << "Data mismatch between Threads " << i << " and "
-            << (i + 1) % maxSize
-            << ".\n       This may be caused by bit-flips in the hardware.";
+    for (decltype(MaxSize) I = 0; I < MaxSize; I++) {
+      if (Errors[I]) {
+        log::fatal() << "Data mismatch between Threads " << I << " and " << (I + 1) % MaxSize
+                     << ".\n       This may be caused by bit-flips in the hardware.";
       }
     }
   }
@@ -184,168 +174,138 @@ void Firestarter::printThreadErrorReport() {
 
 void Firestarter::printPerformanceReport() {
   // performance report
-  unsigned long long startTimestamp = 0xffffffffffffffff;
-  unsigned long long stopTimestamp = 0;
+  uint64_t StartTimestamp = (std::numeric_limits<uint64_t>::max)();
+  uint64_t StopTimestamp = 0;
 
-  unsigned long long iterations = 0;
+  uint64_t Iterations = 0;
 
   log::debug() << "\nperformance report:\n";
 
-  for (auto const &thread : this->loadThreads) {
-    auto td = thread.second;
+  for (auto const& Thread : LoadThreads) {
+    auto Td = Thread.second;
 
-    log::debug() << "Thread " << td->id() << ": " << td->iterations
-                 << " iterations, tsc_delta: " << td->stopTsc - td->startTsc;
+    log::debug() << "Thread " << Td->id() << ": " << Td->LastRun.Iterations
+                 << " iterations, tsc_delta: " << Td->LastRun.StopTsc - Td->LastRun.StartTsc;
 
-    if (startTimestamp > td->startTsc) {
-      startTimestamp = td->startTsc;
-    }
-    if (stopTimestamp < td->stopTsc) {
-      stopTimestamp = td->stopTsc;
-    }
+    StartTimestamp = (std::min)(StartTimestamp, Td->LastRun.StartTsc.load());
+    StopTimestamp = (std::max)(StopTimestamp, Td->LastRun.StopTsc.load());
 
-    iterations += td->iterations;
+    Iterations += Td->LastRun.Iterations.load();
   }
 
-  double runtime = (double)(stopTimestamp - startTimestamp) /
-                   (double)this->environment().topology().clockrate();
-  double gFlops =
-      (double)this->loadThreads.front().second->config().payload().flops() *
-      0.000000001 * (double)iterations / runtime;
-  double bandwidth =
-      (double)this->loadThreads.front().second->config().payload().bytes() *
-      0.000000001 * (double)iterations / runtime;
+  double const Runtime =
+      static_cast<double>(StopTimestamp - StartTimestamp) / static_cast<double>(Environment->topology().clockrate());
+  double const GFlops = static_cast<double>(LoadThreads.front().second->CompiledPayloadPtr->stats().Flops) *
+                        0.000000001 * static_cast<double>(Iterations) / Runtime;
+  double const Bandwidth = static_cast<double>(LoadThreads.front().second->CompiledPayloadPtr->stats().Bytes) *
+                           0.000000001 * static_cast<double>(Iterations) / Runtime;
 
   // insert values for ipc-estimate metric
   // if we are on linux
 #if defined(linux) || defined(__linux__)
-  if (_measurement) {
-    for (auto const &thread : this->loadThreads) {
-      auto td = thread.second;
-      ipc_estimate_metric_insert((double)td->iterations *
-                                 (double)this->loadThreads.front()
-                                     .second->config()
-                                     .payload()
-                                     .instructions() /
-                                 (double)(stopTimestamp - startTimestamp));
+  if (Cfg.Measurement) {
+    for (auto const& Thread : LoadThreads) {
+      auto Td = Thread.second;
+      IpcEstimateMetricData::insertValue(
+          static_cast<double>(Td->LastRun.Iterations) *
+          static_cast<double>(LoadThreads.front().second->CompiledPayloadPtr->stats().Instructions) /
+          static_cast<double>(StopTimestamp - StartTimestamp));
     }
   }
 #endif
 
-  // format runtime, gflops and bandwidth %.2f
-  const char *fmt = "%.2f";
-  int size;
-
-#define FORMAT(input)                                                          \
-  size = std::snprintf(nullptr, 0, fmt, input);                                \
-  std::vector<char> input##Vector(size + 1);                                   \
-  std::snprintf(&input##Vector[0], input##Vector.size(), fmt, input);          \
-  auto input##String = std::string(&input##Vector[0])
-
-  FORMAT(runtime);
-  FORMAT(gFlops);
-  FORMAT(bandwidth);
-
-#undef FORMAT
-
-  log::debug()
-      << "\n"
-      << "total iterations: " << iterations << "\n"
-      << "runtime: " << runtimeString << " seconds ("
-      << stopTimestamp - startTimestamp << " cycles)\n"
-      << "\n"
-      << "estimated floating point performance: " << gFlopsString << " GFLOPS\n"
-      << "estimated memory bandwidth*: " << bandwidthString << " GB/s\n"
-      << "\n"
-      << "* this estimate is highly unreliable if --function is used in order "
-         "to "
-         "select\n"
-      << "  a function that is not optimized for your architecture, or if "
-         "FIRESTARTER is\n"
-      << "  executed on an unsupported architecture!";
+  // format runtime, gflops and bandwidth with two decimal places
+  const auto FormatString = [](double Value) -> std::string {
+    std::stringstream Ss;
+    Ss << std::fixed << std::setprecision(2) << Value;
+    return Ss.str();
+  };
+
+  log::debug() << "\n"
+               << "total iterations: " << Iterations << "\n"
+               << "runtime: " << FormatString(Runtime) << " seconds (" << StopTimestamp - StartTimestamp << " cycles)\n"
+               << "\n"
+               << "estimated floating point performance: " << FormatString(GFlops) << " GFLOPS\n"
+               << "estimated memory bandwidth*: " << FormatString(Bandwidth) << " GB/s\n"
+               << "\n"
+               << "* this estimate is highly unreliable if --function is used in order "
+                  "to "
+                  "select\n"
+               << "  a function that is not optimized for your architecture, or if "
+                  "FIRESTARTER is\n"
+               << "  executed on an unsupported architecture!";
 }
 
-void Firestarter::loadThreadWorker(std::shared_ptr<LoadWorkerData> td) {
+void Firestarter::loadThreadWorker(const std::shared_ptr<LoadWorkerData>& Td) {
 
-  int old = THREAD_WAIT;
+  auto OldState = LoadThreadState::ThreadWait;
 
 #if defined(linux) || defined(__linux__)
   pthread_setname_np(pthread_self(), "LoadWorker");
 #endif
 
   for (;;) {
-    td->mutex.lock();
-    int comm = td->comm;
-    td->mutex.unlock();
+    Td->Communication.Mutex.lock();
+    auto CurState = Td->Communication.State;
+    Td->Communication.Mutex.unlock();
 
-    if (comm != old) {
-      old = comm;
+    if (CurState != OldState) {
+      OldState = CurState;
 
-      td->mutex.lock();
-      td->ack = true;
-      td->mutex.unlock();
+      Td->Communication.Mutex.lock();
+      Td->Communication.Ack = true;
+      Td->Communication.Mutex.unlock();
     } else {
       std::this_thread::sleep_for(std::chrono::microseconds(1));
       continue;
     }
 
-    switch (comm) {
+    switch (CurState) {
     // allocate and initialize memory
-    case THREAD_INIT:
+    case LoadThreadState::ThreadInit:
       // set affinity
-      td->environment().setCpuAffinity(td->id());
+      Td->environment().setCpuAffinity(Td->id());
 
       // compile payload
-      td->config().payload().compilePayload(
-          td->config().payloadSettings(), td->config().instructionCacheSize(),
-          td->config().dataCacheBufferSize(), td->config().ramBufferSize(),
-          td->config().thread(), td->config().lines(), td->dumpRegisters,
-          td->errorDetection);
+      Td->CompiledPayloadPtr =
+          Td->config().payload()->compilePayload(Td->config().settings(), Td->DumpRegisters, Td->ErrorDetection);
 
       // allocate memory
       // if we should dump some registers, we use the first part of the memory
       // for them.
-      td->addrMem =
-          reinterpret_cast<unsigned long long *>(ALIGNED_MALLOC(
-              (td->buffersizeMem + td->addrOffset) * sizeof(unsigned long long),
-              64)) +
-          td->addrOffset;
+      Td->Memory = LoadWorkerMemory::allocate(Td->BuffersizeMem * sizeof(uint64_t));
 
       // exit application on error
-      if (td->addrMem - td->addrOffset == nullptr) {
-        workerLog::error() << "Could not allocate memory for CPU load thread "
-                           << td->id() << "\n";
-        exit(ENOMEM);
+      if (Td->Memory == nullptr) {
+        workerLog::error() << "Could not allocate memory for CPU load thread " << Td->id() << "\n";
       }
 
-      if (td->dumpRegisters) {
-        reinterpret_cast<DumpRegisterStruct *>(td->addrMem - td->addrOffset)
-            ->dumpVar = DumpVariable::Wait;
+      if (Td->DumpRegisters) {
+        Td->dumpRegisterStruct().DumpVar = DumpVariable::Wait;
       }
 
-      if (td->errorDetection) {
-        auto errorDetectionStruct = reinterpret_cast<ErrorDetectionStruct *>(
-            td->addrMem - td->addrOffset);
+      if (Td->ErrorDetection) {
+        auto& ErrorDetectionStructRef = Td->errorDetectionStruct();
 
-        std::memset(errorDetectionStruct, 0, sizeof(ErrorDetectionStruct));
+        std::memset(&ErrorDetectionStructRef, 0, sizeof(ErrorDetectionStruct));
 
         // distribute left and right communication pointers
-        errorDetectionStruct->communicationLeft = td->communicationLeft.get();
-        errorDetectionStruct->communicationRight = td->communicationRight.get();
+        ErrorDetectionStructRef.Left.Communication = Td->CommunicationLeft.get();
+        ErrorDetectionStructRef.Right.Communication = Td->CommunicationRight.get();
 
         // do first touch memset 0 for the communication pointers
-        std::memset((void *)errorDetectionStruct->communicationLeft, 0,
-                    sizeof(unsigned long long) * 2);
+        std::memset(static_cast<void*>(ErrorDetectionStructRef.Left.Communication), 0, sizeof(uint64_t) * 2);
       }
 
       // call init function
-      td->config().payload().init(td->addrMem, td->buffersizeMem);
+      Td->CompiledPayloadPtr->init(Td->Memory->getMemoryAddress(), Td->BuffersizeMem);
 
       break;
     // perform stress test
-    case THREAD_WORK:
+    case LoadThreadState::ThreadWork:
+      Td->CurrentRun.Iterations = 0;
       // record threads start timestamp
-      td->startTsc = td->environment().topology().timestamp();
+      Td->CurrentRun.StartTsc = Td->environment().topology().timestamp();
 
       // will be terminated by watchdog
       for (;;) {
@@ -354,11 +314,10 @@ void Firestarter::loadThreadWorker(std::shared_ptr<LoadWorkerData> td) {
         VT_USER_START("HIGH_LOAD_FUNC");
 #endif
 #ifdef ENABLE_SCOREP
-        SCOREP_USER_REGION_BY_NAME_BEGIN("HIGH",
-                                         SCOREP_USER_REGION_TYPE_COMMON);
+        SCOREP_USER_REGION_BY_NAME_BEGIN("HIGH", SCOREP_USER_REGION_TYPE_COMMON);
 #endif
-        td->iterations = td->config().payload().highLoadFunction(
-            td->addrMem, td->addrHigh, td->iterations);
+        Td->CurrentRun.Iterations = Td->CompiledPayloadPtr->highLoadFunction(Td->Memory->getMemoryAddress(),
+                                                                             Td->LoadVar, Td->CurrentRun.Iterations);
 
         // call low load function
 #ifdef ENABLE_VTRACING
@@ -369,7 +328,7 @@ void Firestarter::loadThreadWorker(std::shared_ptr<LoadWorkerData> td) {
         SCOREP_USER_REGION_BY_NAME_END("HIGH");
         SCOREP_USER_REGION_BY_NAME_BEGIN("LOW", SCOREP_USER_REGION_TYPE_COMMON);
 #endif
-        td->config().payload().lowLoadFunction(td->addrHigh, td->period);
+        Td->CompiledPayloadPtr->lowLoadFunction(Td->LoadVar, Td->Period);
 #ifdef ENABLE_VTRACING
         VT_USER_END("LOW_LOAD_FUNC");
 #endif
@@ -378,41 +337,33 @@ void Firestarter::loadThreadWorker(std::shared_ptr<LoadWorkerData> td) {
 #endif
 
         // terminate if master signals end of run and record stop timestamp
-        if (*td->addrHigh == LOAD_STOP) {
-          td->stopTsc = td->environment().topology().timestamp();
+        if (Td->LoadVar == LoadThreadWorkType::LoadStop) {
+          Td->CurrentRun.StopTsc = Td->environment().topology().timestamp();
+          Td->LastRun = Td->CurrentRun;
 
           return;
         }
 
-        if (*td->addrHigh == LOAD_SWITCH) {
-          td->stopTsc = td->environment().topology().timestamp();
+        if (Td->LoadVar == LoadThreadWorkType::LoadSwitch) {
+          Td->CurrentRun.StopTsc = Td->environment().topology().timestamp();
+          Td->LastRun = Td->CurrentRun;
 
           break;
         }
       }
       break;
-    case THREAD_SWITCH:
+    case LoadThreadState::ThreadSwitch:
       // compile payload
-      td->config().payload().compilePayload(
-          td->config().payloadSettings(), td->config().instructionCacheSize(),
-          td->config().dataCacheBufferSize(), td->config().ramBufferSize(),
-          td->config().thread(), td->config().lines(), td->dumpRegisters,
-          td->errorDetection);
+      Td->CompiledPayloadPtr =
+          Td->config().payload()->compilePayload(Td->config().settings(), Td->DumpRegisters, Td->ErrorDetection);
 
       // call init function
-      td->config().payload().init(td->addrMem, td->buffersizeMem);
-
-      // save old iteration count
-      td->lastIterations = td->iterations;
-      td->lastStartTsc = td->startTsc;
-      td->lastStopTsc = td->stopTsc;
-      td->iterations = 0;
+      Td->CompiledPayloadPtr->init(Td->Memory->getMemoryAddress(), Td->BuffersizeMem);
       break;
-    case THREAD_WAIT:
+    case LoadThreadState::ThreadWait:
       break;
-    case THREAD_STOP:
-    default:
-      return;
     }
   }
 }
+
+} // namespace firestarter
\ No newline at end of file
diff --git a/src/firestarter/Main.cpp b/src/firestarter/Main.cpp
index 844052d5..24269db3 100644
--- a/src/firestarter/Main.cpp
+++ b/src/firestarter/Main.cpp
@@ -19,484 +19,35 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
-#include <firestarter/Firestarter.hpp>
-#include <firestarter/Logging/Log.hpp>
-
-#include <cxxopts.hpp>
-
-#include <string>
-
-struct Config {
-  inline static const std::vector<std::pair<std::string, std::string>>
-      optionsMap = {{"information", "Information Options:\n"},
-                    {"general", "General Options:\n"},
-                    {"specialized-workloads", "Specialized workloads:\n"},
-#ifdef FIRESTARTER_DEBUG_FEATURES
-                    {"debug", "Debugging:\n"},
-#endif
-#if defined(linux) || defined(__linux__)
-                    {"measurement", "Measurement:\n"},
-                    {"optimization", "Optimization:\n"}
-#endif
-  };
-
-  // default parameters
-  std::chrono::seconds timeout;
-  unsigned loadPercent;
-  std::chrono::microseconds period;
-  unsigned requestedNumThreads;
-  std::string cpuBind = "";
-  bool printFunctionSummary;
-  unsigned functionId;
-  bool listInstructionGroups;
-  std::string instructionGroups;
-  unsigned lineCount = 0;
-  // debug features
-  bool allowUnavailablePayload = false;
-  bool dumpRegisters = false;
-  std::chrono::seconds dumpRegistersTimeDelta = std::chrono::seconds(0);
-  std::string dumpRegistersOutpath = "";
-  bool errorDetection = false;
-  // CUDA parameters
-  int gpus = 0;
-  unsigned gpuMatrixSize = 0;
-  bool gpuUseFloat = false;
-  bool gpuUseDouble = false;
-  // linux features
-  bool listMetrics = false;
-  bool measurement = false;
-  std::chrono::milliseconds startDelta = std::chrono::milliseconds(0);
-  std::chrono::milliseconds stopDelta = std::chrono::milliseconds(0);
-  std::chrono::milliseconds measurementInterval = std::chrono::milliseconds(0);
-  std::vector<std::string> stdinMetrics;
-  // linux and dynamic linked binary
-  std::vector<std::string> metricPaths;
-
-  // optimization
-  bool optimize = false;
-  std::chrono::seconds preheat;
-  std::string optimizationAlgorithm;
-  std::vector<std::string> optimizationMetrics;
-  std::chrono::seconds evaluationDuration;
-  unsigned individuals;
-  std::string optimizeOutfile = "";
-  unsigned generations;
-  double nsga2_cr;
-  double nsga2_m;
-
-  Config(int argc, const char **argv);
-};
-
-void print_copyright() {
-  firestarter::log::info()
-      << "This program is free software: you can redistribute it and/or "
-         "modify\n"
-      << "it under the terms of the GNU General Public License as published "
-         "by\n"
-      << "the Free Software Foundation, either version 3 of the License, or\n"
-      << "(at your option) any later version.\n"
-      << "\n"
-      << "You should have received a copy of the GNU General Public License\n"
-      << "along with this program.  If not, see "
-         "<http://www.gnu.org/licenses/>.\n";
-}
-
-void print_warranty() {
-  firestarter::log::info()
-      << "This program is distributed in the hope that it will be useful,\n"
-      << "but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
-      << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n"
-      << "GNU General Public License for more details.\n"
-      << "\n"
-      << "You should have received a copy of the GNU General Public License\n"
-      << "along with this program.  If not, see "
-         "<http://www.gnu.org/licenses/>.\n";
-}
-
-void print_help(cxxopts::Options const &parser, std::string const &section) {
-  std::vector<std::pair<std::string, std::string>> options(
-      Config::optionsMap.size());
-
-  if (section.size() == 0) {
-    std::copy(Config::optionsMap.begin(), Config::optionsMap.end(),
-              options.begin());
-  } else {
-    auto findSection = [&](std::pair<std::string, std::string> const &pair) {
-      return pair.first == section;
-    };
-    auto it = std::copy_if(Config::optionsMap.begin(), Config::optionsMap.end(),
-                           options.begin(), findSection);
-    options.resize(std::distance(options.begin(), it));
-  }
-
-  // clang-format off
-  firestarter::log::info()
-    << parser.help(options)
-    << "Examples:\n"
-    << "  ./FIRESTARTER                 starts FIRESTARTER without timeout\n"
-    << "  ./FIRESTARTER -t 300          starts a 5 minute run of FIRESTARTER\n"
-    << "  ./FIRESTARTER -l 50 -t 600    starts a 10 minute run of FIRESTARTER with\n"
-    << "                                50\% high load and 50\% idle time\n"
-#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_ONEAPI) || defined(FIRESTARTER_BUILD_HIP)
-    << "                                on CPUs and full load on GPUs\n"
-#endif
-    << "  ./FIRESTARTER -l 75 -p 20000000\n"
-    << "                                starts FIRESTARTER with an interval length\n"
-    << "                                of 2 sec, 1.5s high load"
-#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_ONEAPI) || defined(FIRESTARTER_BUILD_HIP)
-    << "                                on CPUs and full load on GPUs\n"
-#else
-    << "\n"
-#endif
-#if defined(linux) || defined(__linux__) 
-    << "  ./FIRESTARTER --measurement --start-delta=300000 -t 900\n"
-    << "                                starts FIRESTARTER measuring all available\n"
-    << "                                metrics for 15 minutes disregarding the first\n"
-    << "                                5 minutes and last two seconds (default to `--stop-delta`)\n"
-    << "  ./FIRESTARTER -t 20 --optimize=NSGA2 --optimization-metric sysfs-powercap-rapl,perf-ipc\n"
-    << "                                starts FIRESTARTER optimizing with the sysfs-powercap-rapl\n"
-    << "                                and perf-ipc metric. The duration is 20s long. The default\n"
-    << "                                instruction groups for the current platform will be used.\n"
-#endif
-    ;
-  // clang-format on
-}
-
-Config::Config(int argc, const char **argv) {
-
-  cxxopts::Options parser(argv[0]);
-
-  // clang-format off
-  parser.add_options("information")
-    ("h,help", "Display usage information. SECTION can be any of: information | general | specialized-workloads"
-#ifdef FIRESTARTER_DEBUG_FEATURES
-     " | debug"
-#endif
-#if defined(linux) || defined(__linux__)
-     "\n| measurement | optimization"
-#endif
-     ,
-      cxxopts::value<std::string>()->implicit_value(""), "SECTION")
-    ("v,version", "Display version information")
-    ("c,copyright", "Display copyright information")
-    ("w,warranty", "Display warranty information")
-    ("q,quiet", "Set log level to Warning")
-    ("r,report", "Display additional information (overridden by -q)")
-    ("debug", "Print debug output")
-    ("a,avail", "List available functions");
-
-  parser.add_options("general")
-    ("i,function", "Specify integer ID of the load-function to be\nused (as listed by --avail)",
-      cxxopts::value<unsigned>()->default_value("0"), "ID")
-#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_ONEAPI) || defined(FIRESTARTER_BUILD_HIP)
-    ("f,usegpufloat", "Use single precision matrix multiplications\ninstead of default")
-    ("d,usegpudouble", "Use double precision matrix multiplications\ninstead of default")
-    ("g,gpus", "Number of gpus to use, default: -1 (all)",
-      cxxopts::value<int>()->default_value("-1"))
-    ("m,matrixsize", "Size of the matrix to calculate, default: 0 (maximum)",
-      cxxopts::value<unsigned>()->default_value("0"))
-#endif
-    ("t,timeout", "Set the timeout (seconds) after which FIRESTARTER\nterminates itself, default: 0 (no timeout)",
-      cxxopts::value<unsigned>()->default_value("0"), "TIMEOUT")
-    ("l,load", "Set the percentage of high CPU load to LOAD\n(%) default: 100, valid values: 0 <= LOAD <=\n100, threads will be idle in the remaining time,\nfrequency of load changes is determined by -p."
-#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_ONEAPI) || defined(FIRESTARTER_BUILD_HIP)
-     " This option does NOT influence the GPU\nworkload!"
-#endif
-     , cxxopts::value<unsigned>()->default_value("100"), "LOAD")
-    ("p,period", "Set the interval length for CPUs to PERIOD\n(usec), default: 100000, each interval contains\na high load and an idle phase, the percentage\nof high load is defined by -l.",
-      cxxopts::value<unsigned>()->default_value("100000"), "PERIOD")
-    ("n,threads", "Specify the number of threads. Cannot be\ncombined with -b | --bind, which impicitly\nspecifies the number of threads.",
-      cxxopts::value<unsigned>()->default_value("0"), "COUNT")
-#if (defined(linux) || defined(__linux__)) && defined(FIRESTARTER_THREAD_AFFINITY)
-    ("b,bind", "Select certain CPUs. CPULIST format: \"x,y,z\",\n\"x-y\", \"x-y/step\", and any combination of the\nabove. Cannot be combined with -n | --threads.",
-      cxxopts::value<std::string>()->default_value(""), "CPULIST")
-#endif
-    ("error-detection", "Enable error detection. This aborts execution when the calculated data is corruped by errors. FIRESTARTER must run with 2 or more threads for this feature. Cannot be used with -l | --load and --optimize.");
-
-  parser.add_options("specialized-workloads")
-    ("list-instruction-groups", "List the available instruction groups for the\npayload of the current platform.")
-    ("run-instruction-groups", "Run the payload with the specified\ninstruction groups. GROUPS format: multiple INST:VAL\npairs comma-seperated.",
-      cxxopts::value<std::string>()->default_value(""), "GROUPS")
-    ("set-line-count", "Set the number of lines for a payload.",
-      cxxopts::value<unsigned>());
-
-#ifdef FIRESTARTER_DEBUG_FEATURES
-  parser.add_options("debug")
-    ("allow-unavailable-payload", "")
-    ("dump-registers", "Dump the working registers on the first\nthread. Depending on the payload these are mm, xmm,\nymm or zmm. Only use it without a timeout and\n100 percent load. DELAY between dumps in secs. Cannot be used with --error-detection.",
-      cxxopts::value<unsigned>()->implicit_value("10"), "DELAY")
-    ("dump-registers-outpath", "Path for the dump of the output files. If\nPATH is not given, current working directory will\nbe used.",
-      cxxopts::value<std::string>()->default_value(""), "PATH");
-#endif
-
-#if defined(linux) || defined(__linux__)
-  parser.add_options("measurement")
-    ("list-metrics", "List the available metrics.")
-#ifndef FIRESTARTER_LINK_STATIC
-    ("metric-path", "Add a path to a shared library representing an interface for a metric. This option can be specified multiple times.",
-      cxxopts::value<std::vector<std::string>>()->default_value(""))
-#endif
-    ("metric-from-stdin", "Add a metric NAME with values from stdin.\nFormat of input: \"NAME TIME_SINCE_EPOCH VALUE\\n\".\nTIME_SINCE_EPOCH is a int64 in nanoseconds. VALUE is a double. (Do not forget to flush\nlines!)",
-      cxxopts::value<std::vector<std::string>>(), "NAME")
-    ("measurement", "Start a measurement for the time specified by\n-t | --timeout. (The timeout must be greater\nthan the start and stop deltas.) Cannot be\ncombined with --optimize.")
-    ("measurement-interval", "Interval of measurements in milliseconds, default: 100",
-      cxxopts::value<unsigned>()->default_value("100"))
-    ("start-delta", "Cut of first N milliseconds of measurement, default: 5000",
-      cxxopts::value<unsigned>()->default_value("5000"), "N")
-    ("stop-delta", "Cut of last N milliseconds of measurement, default: 2000",
-      cxxopts::value<unsigned>()->default_value("2000"), "N")
-    ("preheat", "Preheat for N seconds, default: 240",
-      cxxopts::value<unsigned>()->default_value("240"), "N");
-
-  parser.add_options("optimization")
-    ("optimize", "Run the optimization with one of these algorithms: NSGA2.\nCannot be combined with --measurement.",
-      cxxopts::value<std::string>())
-    ("optimize-outfile", "Dump the output of the optimization into this\nfile, default: $PWD/$HOSTNAME_$DATE.json",
-      cxxopts::value<std::string>())
-    ("optimization-metric", "Use a metric for optimization. Metrics listed\nwith cli argument --list-metrics or specified\nwith --metric-from-stdin are valid.",
-      cxxopts::value<std::vector<std::string>>())
-    ("individuals", "Number of individuals for the population. For\nNSGA2 specify at least 5 and a multiple of 4,\ndefault: 20",
-      cxxopts::value<unsigned>()->default_value("20"))
-    ("generations", "Number of generations, default: 20",
-      cxxopts::value<unsigned>()->default_value("20"))
-    ("nsga2-cr", "Crossover probability. Must be in range [0,1[\ndefault: 0.6",
-      cxxopts::value<double>()->default_value("0.6"))
-    ("nsga2-m", "Mutation probability. Must be in range [0,1]\ndefault: 0.4",
-      cxxopts::value<double>()->default_value("0.4"));
-#endif
-  // clang-format on
-
-  try {
-    auto options = parser.parse(argc, argv);
-
-    if (options.count("quiet")) {
-      firestarter::logging::filter<firestarter::logging::record>::set_severity(
-          nitro::log::severity_level::warn);
-    } else if (options.count("report")) {
-      firestarter::logging::filter<firestarter::logging::record>::set_severity(
-          nitro::log::severity_level::debug);
-    } else if (options.count("debug")) {
-      firestarter::logging::filter<firestarter::logging::record>::set_severity(
-          nitro::log::severity_level::trace);
-    } else {
-      firestarter::logging::filter<firestarter::logging::record>::set_severity(
-          nitro::log::severity_level::info);
-    }
-
-    if (options.count("version")) {
-      std::exit(EXIT_SUCCESS);
-    }
-
-    if (options.count("copyright")) {
-      print_copyright();
-      std::exit(EXIT_SUCCESS);
-    }
-
-    if (options.count("warranty")) {
-      print_warranty();
-      std::exit(EXIT_SUCCESS);
-    }
-
-    firestarter::log::info()
-        << "This program comes with ABSOLUTELY NO WARRANTY; for details run `"
-        << argv[0] << " -w`.\n"
-        << "This is free software, and you are welcome to redistribute it\n"
-        << "under certain conditions; run `" << argv[0]
-        << " -c` for details.\n";
-
-    if (options.count("help")) {
-      auto section = options["help"].as<std::string>();
-
-      // section not found
-      auto findSection = [&](std::pair<std::string, std::string> const &pair) {
-        return pair.first == section;
-      };
-      if (std::find_if(optionsMap.begin(), optionsMap.end(), findSection) ==
-              optionsMap.end() &&
-          section.size() != 0) {
-        throw std::invalid_argument("Section \"" + section +
-                                    "\" not found in help.");
-      }
-
-      print_help(parser, section);
-      std::exit(EXIT_SUCCESS);
-    }
-
-    timeout = std::chrono::seconds(options["timeout"].as<unsigned>());
-    loadPercent = options["load"].as<unsigned>();
-    period = std::chrono::microseconds(options["period"].as<unsigned>());
-
-    if (loadPercent > 100) {
-      throw std::invalid_argument("Option -l/--load may not be above 100.");
-    }
-
-    errorDetection = options.count("error-detection");
-    if (errorDetection && loadPercent != 100) {
-      throw std::invalid_argument("Option --error-detection may only be used "
-                                  "with -l/--load equal 100.");
-    }
-
-#ifdef FIRESTARTER_DEBUG_FEATURES
-    allowUnavailablePayload = options.count("allow-unavailable-payload");
-    dumpRegisters = options.count("dump-registers");
-    if (dumpRegisters) {
-      dumpRegistersTimeDelta =
-          std::chrono::seconds(options["dump-registers"].as<unsigned>());
-      if (timeout != std::chrono::microseconds::zero() && loadPercent != 100) {
-        throw std::invalid_argument("Option --dump-registers may only be used "
-                                    "without a timeout and full load.");
-      }
-      if (errorDetection) {
-        throw std::invalid_argument(
-            "Options --dump-registers and --error-detection cannot be used "
-            "together.");
-      }
-    }
-#endif
-
-    requestedNumThreads = options["threads"].as<unsigned>();
-
-#if (defined(linux) || defined(__linux__)) &&                                  \
-    defined(FIRESTARTER_THREAD_AFFINITY)
-    cpuBind = options["bind"].as<std::string>();
-    if (!cpuBind.empty()) {
-      if (requestedNumThreads != 0) {
-        throw std::invalid_argument(
-            "Options -b/--bind and -n/--threads cannot be used together.");
-      }
-    }
-#endif
-
-#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_ONEAPI) || defined(FIRESTARTER_BUILD_HIP)
-    gpuUseFloat = options.count("usegpufloat");
-    gpuUseDouble = options.count("usegpudouble");
-
-    if (gpuUseFloat && gpuUseDouble) {
-      throw std::invalid_argument("Options -f/--usegpufloat and "
-                                  "-d/--usegpudouble cannot be used together.");
-    }
-
-    gpuMatrixSize = options["matrixsize"].as<unsigned>();
-    if (gpuMatrixSize > 0 && gpuMatrixSize < 64) {
-      throw std::invalid_argument(
-          "Option -m/--matrixsize may not be below 64.");
-    }
-
-    gpus = options["gpus"].as<int>();
-#endif
-
-    printFunctionSummary = options.count("avail");
-
-    functionId = options["function"].as<unsigned>();
-
-    listInstructionGroups = options.count("list-instruction-groups");
-    instructionGroups = options["run-instruction-groups"].as<std::string>();
-    if (options.count("set-line-count")) {
-      lineCount = options["set-line-count"].as<unsigned>();
-    }
-
-#if defined(linux) || defined(__linux__)
-    startDelta =
-        std::chrono::milliseconds(options["start-delta"].as<unsigned>());
-    stopDelta = std::chrono::milliseconds(options["stop-delta"].as<unsigned>());
-    measurementInterval = std::chrono::milliseconds(
-        options["measurement-interval"].as<unsigned>());
-#ifndef FIRESTARTER_LINK_STATIC
-    metricPaths = options["metric-path"].as<std::vector<std::string>>();
-#endif
-    if (options.count("metric-from-stdin")) {
-      stdinMetrics =
-          options["metric-from-stdin"].as<std::vector<std::string>>();
-    }
-    measurement = options.count("measurement");
-    listMetrics = options.count("list-metrics");
-
-    if ((optimize = options.count("optimize"))) {
-      if (errorDetection) {
-        throw std::invalid_argument("Options --error-detection and --optimize "
-                                    "cannot be used together.");
-      }
-      if (measurement) {
-        throw std::invalid_argument(
-            "Options --measurement and --optimize cannot be used together.");
-      }
-      preheat = std::chrono::seconds(options["preheat"].as<unsigned>());
-      optimizationAlgorithm = options["optimize"].as<std::string>();
-      if (options.count("optimization-metric")) {
-        optimizationMetrics =
-            options["optimization-metric"].as<std::vector<std::string>>();
-      }
-      if (loadPercent != 100) {
-        throw std::invalid_argument("Options -p | --period and -l | --load are "
-                                    "not compatible with --optimize.");
-      }
-      if (timeout == std::chrono::seconds::zero()) {
-        throw std::invalid_argument(
-            "Option -t | --timeout must be specified for optimization.");
-      }
-      evaluationDuration = timeout;
-      // this will deactivate the watchdog worker
-      timeout = std::chrono::seconds::zero();
-      individuals = options["individuals"].as<unsigned>();
-      if (options.count("optimize-outfile")) {
-        optimizeOutfile = options["optimize-outfile"].as<std::string>();
-      }
-      generations = options["generations"].as<unsigned>();
-      nsga2_cr = options["nsga2-cr"].as<double>();
-      nsga2_m = options["nsga2-m"].as<double>();
-
-      if (optimizationAlgorithm != "NSGA2") {
-        throw std::invalid_argument("Option --optimize must be any of: NSGA2");
-      }
-    }
-#endif
-
-  } catch (std::exception &e) {
-    firestarter::log::error() << e.what() << "\n";
-    print_help(parser, "");
-    std::exit(EXIT_FAILURE);
-  }
-}
-
-int main(int argc, const char **argv) {
-
-  firestarter::log::info()
-      << "FIRESTARTER - A Processor Stress Test Utility, Version "
-      << _FIRESTARTER_VERSION_STRING << "\n"
-      << "Copyright (C) " << _FIRESTARTER_BUILD_YEAR
-      << " TU Dresden, Center for Information Services and High Performance "
-         "Computing"
-      << "\n";
+#include "firestarter/Config.hpp"
+#include "firestarter/Firestarter.hpp"
+#include "firestarter/Logging/Log.hpp"
+
+auto main(int argc, const char** argv) -> int {
+  firestarter::log::info() << "FIRESTARTER - A Processor Stress Test Utility, Version " << _FIRESTARTER_VERSION_STRING
+                           << "\n"
+                           << "Copyright (C) " << _FIRESTARTER_BUILD_YEAR
+                           << " TU Dresden, Center for Information Services and High Performance "
+                              "Computing"
+                           << "\n";
 #ifdef _FIRESTARTER_VERSION_TEMPERED
-  firestarter::log::info() 
-      << "*The version and/or year was explicitely set during build and does not "
-      << "necessarily represent the actual version.\n"
-      << "This helps maintainers to keep track of versions, e.g., on a cluster."
-      << "\n";
+  firestarter::log::info() << "*The version and/or year was explicitely set during build and does not "
+                           << "necessarily represent the actual version.\n"
+                           << "This helps maintainers to keep track of versions, e.g., on a cluster."
+                           << "\n";
 #endif
 
-  Config cfg{argc, argv};
-
   try {
-    firestarter::Firestarter firestarter(
-        argc, argv, cfg.timeout, cfg.loadPercent, cfg.period,
-        cfg.requestedNumThreads, cfg.cpuBind, cfg.printFunctionSummary,
-        cfg.functionId, cfg.listInstructionGroups, cfg.instructionGroups,
-        cfg.lineCount, cfg.allowUnavailablePayload, cfg.dumpRegisters,
-        cfg.dumpRegistersTimeDelta, cfg.dumpRegistersOutpath,
-        cfg.errorDetection, cfg.gpus, cfg.gpuMatrixSize, cfg.gpuUseFloat,
-        cfg.gpuUseDouble, cfg.listMetrics, cfg.measurement, cfg.startDelta,
-        cfg.stopDelta, cfg.measurementInterval, cfg.metricPaths,
-        cfg.stdinMetrics, cfg.optimize, cfg.preheat, cfg.optimizationAlgorithm,
-        cfg.optimizationMetrics, cfg.evaluationDuration, cfg.individuals,
-        cfg.optimizeOutfile, cfg.generations, cfg.nsga2_cr, cfg.nsga2_m);
+    firestarter::Config Cfg{argc, argv};
+
+    firestarter::Firestarter Firestarter(std::move(Cfg));
 
-    firestarter.mainThread();
+    Firestarter.mainThread();
 
-  } catch (std::exception const &e) {
-    firestarter::log::error() << e.what();
+  } catch (std::exception const& E) {
+    firestarter::log::error() << E.what();
     return EXIT_FAILURE;
   }
 
   return EXIT_SUCCESS;
-}
+}
\ No newline at end of file
diff --git a/src/firestarter/Measurement/MeasurementWorker.cpp b/src/firestarter/Measurement/MeasurementWorker.cpp
index 498330ab..25294e04 100644
--- a/src/firestarter/Measurement/MeasurementWorker.cpp
+++ b/src/firestarter/Measurement/MeasurementWorker.cpp
@@ -19,10 +19,11 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
-#include <firestarter/Measurement/MeasurementWorker.hpp>
+#include "firestarter/Measurement/MeasurementWorker.hpp"
+#include "firestarter/Logging/Log.hpp"
 
+#include <cstdarg>
 #include <queue>
-#include <thread>
 
 #ifndef FIRESTARTER_LINK_STATIC
 extern "C" {
@@ -30,414 +31,389 @@ extern "C" {
 }
 #endif
 
-void insertCallback(void *cls, const char *metricName, int64_t timeSinceEpoch,
-                    double value) {
-  static_cast<firestarter::measurement::MeasurementWorker *>(cls)
-      ->insertCallback(metricName, timeSinceEpoch, value);
+void insertCallback(void* Cls, const char* MetricName, int64_t TimeSinceEpoch, double Value) {
+  static_cast<firestarter::measurement::MeasurementWorker*>(Cls)->insertCallback(MetricName, TimeSinceEpoch, Value);
 }
 
-using namespace firestarter::measurement;
+namespace {
 
-MeasurementWorker::MeasurementWorker(
-    std::chrono::milliseconds updateInterval, unsigned long long numThreads,
-    std::vector<std::string> const &metricDylibs,
-    std::vector<std::string> const &stdinMetrics)
-    : updateInterval(updateInterval), numThreads(numThreads) {
+// NOLINTBEGIN(cert-dcl50-cpp,cppcoreguidelines-pro-type-vararg,cppcoreguidelines-pro-bounds-array-to-pointer-decay,clang-analyzer-valist.Uninitialized)
+auto scanStdin(const char* Fmt, int Count, ...) -> bool {
+  va_list Args;
+  va_start(Args, Count);
+  auto ReturnCode = std::vscanf(Fmt, Args);
+  va_end(Args);
+  return ReturnCode == Count;
+}
+// NOLINTEND(cert-dcl50-cpp,cppcoreguidelines-pro-type-vararg,cppcoreguidelines-pro-bounds-array-to-pointer-decay,clang-analyzer-valist.Uninitialized)
+
+} // namespace
+
+namespace firestarter::measurement {
+
+MeasurementWorker::MeasurementWorker(std::chrono::milliseconds UpdateInterval, uint64_t NumThreads,
+                                     std::vector<std::string> const& MetricDylibsNames,
+                                     std::vector<std::string> const& StdinMetricsNames)
+    : UpdateInterval(UpdateInterval)
+    , NumThreads(NumThreads) {
 
 #ifndef FIRESTARTER_LINK_STATIC
   // open dylibs and find metric symbol.
-  // create an entry in _metricDylibs with handle from dlopen and
-  // metric_interface_t structure. add this structe as a pointer to metrics.
-  for (auto const &dylib : metricDylibs) {
-    void *handle;
-    const char *filename = dylib.c_str();
+  // create an entry in MetricDylibs with handle from dlopen and
+  // MetricInterface structure. add this structure as a pointer to metrics.
+  for (auto const& Dylib : MetricDylibsNames) {
+    void* Handle = nullptr;
+    const char* Filename = Dylib.c_str();
 
-    handle = dlopen(dylib.c_str(), RTLD_NOW | RTLD_LOCAL);
+    Handle = dlopen(Dylib.c_str(), RTLD_NOW | RTLD_LOCAL);
 
-    if (!handle) {
-      firestarter::log::error() << filename << ": " << dlerror();
+    if (!Handle) {
+      firestarter::log::error() << Filename << ": " << dlerror();
       continue;
     }
 
     // clear existing error
     dlerror();
 
-    metric_interface_t *metric = nullptr;
+    MetricInterface* Metric = nullptr;
 
-    metric = (metric_interface_t *)dlsym(handle, "metric");
+    Metric = static_cast<MetricInterface*>(dlsym(Handle, "metric"));
 
-    char *error;
-    if ((error = dlerror()) != NULL) {
-      firestarter::log::error() << filename << ": " << error;
-      dlclose(handle);
+    char* Error = nullptr;
+    if ((Error = dlerror()) != nullptr) {
+      firestarter::log::error() << Filename << ": " << Error;
+      dlclose(Handle);
       continue;
     }
 
-    if (this->findMetricByName(metric->name) != nullptr) {
-      firestarter::log::error()
-          << "A metric named \"" << metric->name << "\" is already loaded.";
-      dlclose(handle);
+    if (findMetricByName(Metric->Name) != nullptr) {
+      firestarter::log::error() << "A metric named \"" << Metric->Name << "\" is already loaded.";
+      dlclose(Handle);
       continue;
     }
 
     // lets push our metric object and the handle
-    this->_metricDylibs.push_back(handle);
-    this->metrics.push_back(metric);
+    MetricDylibs.push_back(Handle);
+    Metrics.push_back(Metric);
   }
 #else
-  (void)metricDylibs;
+  (void)MetricDylibsNames;
 #endif
 
   // setup metric objects for metric names passed from stdin.
-  for (auto const &name : stdinMetrics) {
-    if (this->findMetricByName(name) != nullptr) {
-      firestarter::log::error()
-          << "A metric named \"" << name << "\" is already loaded.";
+  for (auto const& Name : StdinMetricsNames) {
+    if (findMetricByName(Name) != nullptr) {
+      firestarter::log::error() << "A metric named \"" << Name << "\" is already loaded.";
       continue;
     }
 
-    this->_stdinMetrics.push_back(name);
+    StdinMetrics.push_back(Name);
   }
 
-  std::stringstream ss;
-  unsigned maxLength = 0;
-  std::map<std::string, bool> available;
+  std::stringstream Ss;
+  unsigned MaxLength = 0;
+  std::map<std::string, bool> Available;
 
-  for (auto const &metric : this->metrics) {
-    std::string name(metric->name);
-    maxLength = maxLength < name.size() ? name.size() : maxLength;
-    int returnCode = metric->init();
-    metric->fini();
-    available[name] = returnCode == EXIT_SUCCESS ? true : false;
+  for (auto const& Metric : Metrics) {
+    const std::string Name(Metric->Name);
+    MaxLength = MaxLength < Name.size() ? Name.size() : MaxLength;
+    auto ReturnCode = Metric->Init();
+    Metric->Fini();
+    Available[Name] = ReturnCode == EXIT_SUCCESS;
   }
 
-  unsigned padding = maxLength > 6 ? maxLength - 6 : 0;
-  ss << "  METRIC" << std::string(padding + 1, ' ') << "| available\n";
-  ss << "  " << std::string(padding + 7, '-') << "-----------\n";
-  for (auto const &[key, value] : available) {
-    ss << "  " << key << std::string(padding + 7 - key.size(), ' ') << "| ";
-    ss << (value ? "yes" : "no") << "\n";
+  const auto Padding = MaxLength > 6 ? MaxLength - 6 : 0;
+  Ss << "  METRIC" << std::string(Padding + 1, ' ') << "| available\n";
+  Ss << "  " << std::string(Padding + 7, '-') << "-----------\n";
+  for (auto const& [key, value] : Available) {
+    Ss << "  " << key << std::string(Padding + 7 - key.size(), ' ') << "| ";
+    Ss << (value ? "yes" : "no") << "\n";
   }
 
-  this->availableMetricsString = ss.str();
+  AvailableMetricsString = Ss.str();
 
-  pthread_create(&this->workerThread, NULL,
-                 reinterpret_cast<void *(*)(void *)>(
-                     MeasurementWorker::dataAcquisitionWorker),
-                 this);
+  pthread_create(&WorkerThread, nullptr, MeasurementWorker::dataAcquisitionWorker, this);
 
   // create a worker for getting metric values from stdin
-  if (this->_stdinMetrics.size() > 0) {
-    pthread_create(&this->stdinThread, NULL,
-                   reinterpret_cast<void *(*)(void *)>(
-                       MeasurementWorker::stdinDataAcquisitionWorker),
-                   this);
+  if (!StdinMetrics.empty()) {
+    pthread_create(&StdinThread, nullptr, MeasurementWorker::stdinDataAcquisitionWorker, this);
   }
 }
 
 MeasurementWorker::~MeasurementWorker() {
-  pthread_cancel(this->workerThread);
+  pthread_cancel(WorkerThread);
 
-  pthread_join(this->workerThread, NULL);
+  pthread_join(WorkerThread, nullptr);
 
-  if (this->_stdinMetrics.size() > 0) {
-    pthread_cancel(this->stdinThread);
+  if (!StdinMetrics.empty()) {
+    pthread_cancel(StdinThread);
 
-    pthread_join(this->stdinThread, NULL);
+    pthread_join(StdinThread, nullptr);
   }
 
-  for (auto const &[key, value] : this->values) {
-    auto metric = this->findMetricByName(key);
-    if (metric == nullptr) {
+  for (auto const& [key, value] : Values) {
+    const auto* Metric = findMetricByName(key);
+    if (Metric == nullptr) {
       continue;
     }
 
-    metric->fini();
+    Metric->Fini();
   }
 
 #ifndef FIRESTARTER_LINK_STATIC
-  for (auto handle : this->_metricDylibs) {
-    dlclose(handle);
+  for (auto* Handle : MetricDylibs) {
+    dlclose(Handle);
   }
 #endif
 }
 
-std::vector<std::string> MeasurementWorker::metricNames() {
-  std::vector<std::string> metrics;
-  std::transform(
-      this->metrics.begin(), this->metrics.end(), std::back_inserter(metrics),
-      [](auto &metric) -> std::string { return std::string(metric->name); });
-  for (auto const &name : this->_stdinMetrics) {
-    metrics.push_back(name);
+auto MeasurementWorker::metricNames() -> std::vector<std::string> {
+  std::vector<std::string> MetricNames;
+  std::transform(Metrics.begin(), Metrics.end(), std::back_inserter(MetricNames),
+                 [](auto& Metric) -> std::string { return std::string(Metric->Name); });
+  for (auto const& Name : StdinMetrics) {
+    MetricNames.push_back(Name);
   }
 
-  return metrics;
+  return MetricNames;
 }
 
-const metric_interface_t *
-MeasurementWorker::findMetricByName(std::string metricName) {
-  auto name_equal = [metricName](auto &metricInterface) {
-    return metricName.compare(metricInterface->name) == 0;
-  };
-  auto metric =
-      std::find_if(this->metrics.begin(), this->metrics.end(), name_equal);
+auto MeasurementWorker::findMetricByName(std::string MetricName) -> const MetricInterface* {
+  auto NameEqual = [&MetricName](auto& MetricInterface) { return MetricName == MetricInterface->Name; };
+  auto Metric = std::find_if(Metrics.begin(), Metrics.end(), NameEqual);
 
   // metric not found
-  if (metric == this->metrics.end()) {
+  if (Metric == Metrics.end()) {
     return nullptr;
   }
   // metric found
-  return const_cast<const metric_interface_t *>(*metric);
+  return *Metric;
 }
 
 // this must be called by the main thread.
 // if not done so things like perf_event_attr.inherit might not work as expected
-std::vector<std::string>
-MeasurementWorker::initMetrics(std::vector<std::string> const &metricNames) {
-  this->values_mutex.lock();
+auto MeasurementWorker::initMetrics(std::vector<std::string> const& MetricNames) -> std::vector<std::string> {
+  ValuesMutex.lock();
 
-  std::vector<std::string> initialized = {};
+  std::vector<std::string> Initialized = {};
 
   // try to find each metric and initialize it
-  for (auto const &metricName : metricNames) {
+  for (auto const& MetricName : MetricNames) {
     // init values map with empty vector
-    auto name_equal = [metricName](auto const &pair) {
-      return metricName.compare(pair.first) == 0;
-    };
-    auto pair =
-        std::find_if(this->values.begin(), this->values.end(), name_equal);
-    if (pair != this->values.end()) {
-      pair->second.clear();
+    auto NameEqual = [&MetricName](auto const& Pair) { return MetricName == Pair.first; };
+    auto Pair = std::find_if(Values.begin(), Values.end(), NameEqual);
+    if (Pair != Values.end()) {
+      Pair->second.clear();
     } else {
-      auto metric = this->findMetricByName(metricName);
-      if (metric != nullptr) {
-        int returnValue = metric->init();
-        if (returnValue != EXIT_SUCCESS) {
-          log::error() << "Metric " << metric->name << ": "
-                       << metric->get_error();
+      const auto* Metric = findMetricByName(MetricName);
+      if (Metric != nullptr) {
+        const auto ReturnValue = Metric->Init();
+        if (ReturnValue != EXIT_SUCCESS) {
+          log::warn() << "Metric " << Metric->Name << ": " << Metric->GetError();
           continue;
         }
       }
-      this->values[metricName] = std::vector<TimeValue>();
-      if (metric != nullptr) {
-        if (metric->type.insert_callback) {
-          metric->register_insert_callback(::insertCallback, this);
+      Values[MetricName] = std::vector<TimeValue>();
+      if (Metric != nullptr) {
+        if (Metric->Type.InsertCallback) {
+          Metric->RegisterInsertCallback(::insertCallback, this);
         }
       }
-      initialized.push_back(metricName);
+      Initialized.push_back(MetricName);
     }
   }
 
-  this->values_mutex.unlock();
+  ValuesMutex.unlock();
 
-  return initialized;
+  return Initialized;
 }
 
-void MeasurementWorker::insertCallback(const char *metricName,
-                                       int64_t timeSinceEpoch, double value) {
-  this->values_mutex.lock();
+void MeasurementWorker::insertCallback(const char* MetricName, int64_t TimeSinceEpoch, double Value) {
+  ValuesMutex.lock();
 
   using Duration = std::chrono::duration<int64_t, std::nano>;
-  auto time =
-      std::chrono::time_point<std::chrono::high_resolution_clock, Duration>(
-          Duration(timeSinceEpoch));
-  auto name_equal = [metricName](auto const &pair) {
-    return std::string(metricName).compare(pair.first) == 0;
-  };
-  auto pair =
-      std::find_if(this->values.begin(), this->values.end(), name_equal);
+  auto Time = std::chrono::time_point<std::chrono::high_resolution_clock, Duration>(Duration(TimeSinceEpoch));
+  auto NameEqual = [&MetricName](auto const& Pair) { return std::string(MetricName) == Pair.first; };
+  auto Pair = std::find_if(Values.begin(), Values.end(), NameEqual);
 
-  if (pair != this->values.end()) {
-    pair->second.push_back(TimeValue(time, value));
+  if (Pair != Values.end()) {
+    Pair->second.emplace_back(Time, Value);
   }
 
-  this->values_mutex.unlock();
+  ValuesMutex.unlock();
 }
 
-void MeasurementWorker::startMeasurement() {
-  this->startTime = std::chrono::high_resolution_clock::now();
-}
+void MeasurementWorker::startMeasurement() { StartTime = std::chrono::high_resolution_clock::now(); }
 
-std::map<std::string, Summary>
-MeasurementWorker::getValues(std::chrono::milliseconds startDelta,
-                             std::chrono::milliseconds stopDelta) {
-  std::map<std::string, Summary> measurment = {};
+auto MeasurementWorker::getValues(std::chrono::milliseconds StartDelta, std::chrono::milliseconds StopDelta)
+    -> std::map<std::string, Summary> {
+  std::map<std::string, Summary> Measurment = {};
 
-  this->values_mutex.lock();
+  ValuesMutex.lock();
 
-  for (auto &[key, values] : this->values) {
-    auto startTime = this->startTime;
-    auto endTime = std::chrono::high_resolution_clock::now();
-    auto metric = this->findMetricByName(key);
+  for (auto& [key, values] : Values) {
+    auto StartTime = this->StartTime;
+    auto EndTime = std::chrono::high_resolution_clock::now();
+    const auto* Metric = findMetricByName(key);
 
-    metric_type_t type;
-    std::memset(&type, 0, sizeof(type));
-    if (metric == nullptr) {
-      type.absolute = 1;
+    MetricType Type;
+    std::memset(&Type, 0, sizeof(Type));
+    if (Metric == nullptr) {
+      Type.Absolute = 1;
 
-      startTime += startDelta;
-      endTime -= stopDelta;
+      StartTime += StartDelta;
+      EndTime -= StopDelta;
     } else {
-      std::memcpy(&type, &metric->type, sizeof(type));
+      std::memcpy(&Type, &Metric->Type, sizeof(Type));
 
-      if (metric->type.ignore_start_stop_delta == 0) {
-        startTime += startDelta;
-        endTime -= stopDelta;
+      if (Metric->Type.IgnoreStartStopDelta == 0) {
+        StartTime += StartDelta;
+        EndTime -= StopDelta;
       }
     }
 
-    decltype(values) croppedValues(values.size());
+    decltype(values) CroppedValues(values.size());
 
-    auto findAll = [startTime, endTime](auto const &tv) {
-      return startTime <= tv.time && tv.time <= endTime;
-    };
-    auto it = std::copy_if(values.begin(), values.end(), croppedValues.begin(),
-                           findAll);
-    croppedValues.resize(std::distance(croppedValues.begin(), it));
+    auto FindAll = [&StartTime, &EndTime](auto const& Tv) { return StartTime <= Tv.Time && Tv.Time <= EndTime; };
+    auto It = std::copy_if(values.begin(), values.end(), CroppedValues.begin(), FindAll);
+    CroppedValues.resize(std::distance(CroppedValues.begin(), It));
 
-    Summary sum = Summary::calculate(croppedValues.begin(), croppedValues.end(),
-                                     type, this->numThreads);
+    const auto Sum = Summary::calculate(CroppedValues.begin(), CroppedValues.end(), Type, NumThreads);
 
-    measurment[key] = sum;
+    Measurment[key] = Sum;
   }
 
-  this->values_mutex.unlock();
+  ValuesMutex.unlock();
 
-  return measurment;
+  return Measurment;
 }
 
-int *MeasurementWorker::dataAcquisitionWorker(void *measurementWorker) {
+auto MeasurementWorker::dataAcquisitionWorker(void* MeasurementWorker) -> void* {
+  // NOLINTNEXTLINE(cert-pos47-c,concurrency-thread-canceltype-asynchronous)
+  pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, nullptr);
 
-  pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
-
-  auto _this = reinterpret_cast<MeasurementWorker *>(measurementWorker);
+  auto* This = static_cast<class MeasurementWorker*>(MeasurementWorker);
 
 #ifndef __APPLE__
   pthread_setname_np(pthread_self(), "DataAcquisition");
 #endif
 
-  using clock = std::chrono::high_resolution_clock;
+  using Clock = std::chrono::high_resolution_clock;
 
-  using callbackTuple =
-      std::tuple<void (*)(void), std::chrono::microseconds,
-                 std::chrono::high_resolution_clock::time_point>;
-  auto callbackTupleComparator = [](callbackTuple left, callbackTuple right) {
-    return std::get<2>(left) > std::get<2>(right);
+  using CallbackTuple =
+      std::tuple<void (*)(void), std::chrono::microseconds, std::chrono::high_resolution_clock::time_point>;
+  auto CallbackTupleComparator = [](CallbackTuple Left, CallbackTuple Right) {
+    return std::get<2>(Left) > std::get<2>(Right);
   };
 
   // this datastructure holds a tuple of our callback, the callback frequency
   // and the next timepoint. it will be sorted, so the pop function will give
   // back the next callback
-  std::priority_queue<callbackTuple, std::vector<callbackTuple>,
-                      decltype(callbackTupleComparator)>
-      callbackQueue(callbackTupleComparator);
+  std::priority_queue<CallbackTuple, std::vector<CallbackTuple>, decltype(CallbackTupleComparator)> CallbackQueue(
+      CallbackTupleComparator);
 
-  _this->values_mutex.lock();
+  This->ValuesMutex.lock();
 
-  for (auto const &[key, value] : _this->values) {
-    auto metric_interface = _this->findMetricByName(key);
+  for (auto const& [key, value] : This->Values) {
+    const auto* MetricInterface = This->findMetricByName(key);
 
-    if (metric_interface == nullptr) {
+    if (MetricInterface == nullptr) {
       continue;
     }
 
-    auto callbackTime =
-        std::chrono::microseconds(metric_interface->callback_time);
-    if (callbackTime.count() == 0) {
+    auto CallbackTime = std::chrono::microseconds(MetricInterface->CallbackTime);
+    if (CallbackTime.count() == 0) {
       continue;
     }
 
-    auto currentTime = clock::now();
+    auto CurrentTime = Clock::now();
 
-    callbackQueue.push(
-        std::make_tuple(metric_interface->callback, callbackTime, currentTime));
+    CallbackQueue.emplace(MetricInterface->Callback, CallbackTime, CurrentTime);
   }
 
-  _this->values_mutex.unlock();
+  This->ValuesMutex.unlock();
 
-  auto nextFetch = clock::now() + _this->updateInterval;
+  auto NextFetch = Clock::now() + This->UpdateInterval;
 
   for (;;) {
-    auto now = clock::now();
+    auto Now = Clock::now();
 
-    if (nextFetch <= now) {
-      _this->values_mutex.lock();
+    if (NextFetch <= Now) {
+      This->ValuesMutex.lock();
 
-      for (auto &[metricName, values] : _this->values) {
-        auto metric_interface = _this->findMetricByName(metricName);
+      for (auto& [metricName, values] : This->Values) {
+        const auto* MetricInterface = This->findMetricByName(metricName);
 
-        if (metric_interface == nullptr) {
+        if (MetricInterface == nullptr) {
           continue;
         }
 
-        double value;
+        double Value = NAN;
 
-        if (!metric_interface->type.insert_callback &&
-            metric_interface->get_reading != nullptr) {
-          if (EXIT_SUCCESS == metric_interface->get_reading(&value)) {
-            auto tv =
-                TimeValue(std::chrono::high_resolution_clock::now(), value);
-            values.push_back(tv);
+        if (!MetricInterface->Type.InsertCallback && MetricInterface->GetReading != nullptr) {
+          if (EXIT_SUCCESS == MetricInterface->GetReading(&Value)) {
+            auto Tv = TimeValue(std::chrono::high_resolution_clock::now(), Value);
+            values.push_back(Tv);
           }
         }
       }
 
-      _this->values_mutex.unlock();
+      This->ValuesMutex.unlock();
 
-      nextFetch = now + _this->updateInterval;
+      NextFetch = Now + This->UpdateInterval;
     }
 
-    auto nextWake = nextFetch;
+    auto NextWake = NextFetch;
 
-    if (!callbackQueue.empty()) {
-      auto [callbackFunction, callbackTime, nextCallback] = callbackQueue.top();
+    if (!CallbackQueue.empty()) {
+      auto [callbackFunction, callbackTime, nextCallback] = CallbackQueue.top();
 
-      if (nextCallback <= now) {
+      if (nextCallback <= Now) {
         // remove the elment from the queue
-        callbackQueue.pop();
+        CallbackQueue.pop();
 
         // call our callback
         callbackFunction();
 
         // add it with the updated callback time to the queue again
-        nextCallback = now + callbackTime;
-        callbackQueue.push(
-            std::make_tuple(callbackFunction, callbackTime, nextCallback));
+        nextCallback = Now + callbackTime;
+        CallbackQueue.emplace(callbackFunction, callbackTime, nextCallback);
       }
 
-      nextWake = nextCallback < nextWake ? nextCallback : nextWake;
+      NextWake = nextCallback < NextWake ? nextCallback : NextWake;
     }
 
-    std::this_thread::sleep_for(nextWake - clock::now());
+    std::this_thread::sleep_for(NextWake - Clock::now());
   }
 }
 
-int *MeasurementWorker::stdinDataAcquisitionWorker(void *measurementWorker) {
-
-  pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
+auto MeasurementWorker::stdinDataAcquisitionWorker(void* MeasurementWorker) -> void* {
+  // NOLINTNEXTLINE(cert-pos47-c,concurrency-thread-canceltype-asynchronous)
+  pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, nullptr);
 
-  auto _this = reinterpret_cast<MeasurementWorker *>(measurementWorker);
+  auto* This = static_cast<class MeasurementWorker*>(MeasurementWorker);
 
 #ifndef __APPLE__
   pthread_setname_np(pthread_self(), "StdinDataAcquis");
 #endif
 
-  for (std::string line; std::getline(std::cin, line);) {
-    int64_t time;
-    double value;
-    char name[128];
-    if (std::sscanf(line.c_str(), "%127s %ld %lf", name, &time, &value) == 3) {
-      auto name_equal = [name](auto const &allowedName) {
-        return allowedName.compare(std::string(name)) == 0;
-      };
-      auto item = std::find_if(_this->stdinMetrics().begin(),
-                               _this->stdinMetrics().end(), name_equal);
-      // metric name is allowed
-      if (item != _this->stdinMetrics().end()) {
-        _this->insertCallback(name, time, value);
-      }
+  for (;;) {
+    int64_t Time = 0;
+    double Value = NAN;
+    std::array<char, 128> Name = {0};
+
+    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
+    if (!scanStdin("%127s %ld %lf", 3, Name.data(), &Time, &Value)) {
+      continue;
     }
-  }
 
-  return NULL;
+    auto NameEqual = [&Name](auto const& AllowedName) { return AllowedName == std::string(Name.data()); };
+    auto Item = std::find_if(This->stdinMetrics().begin(), This->stdinMetrics().end(), NameEqual);
+    // metric name is allowed
+    if (Item != This->stdinMetrics().end()) {
+      This->insertCallback(Name.data(), Time, Value);
+    }
+  }
 }
+
+} // namespace firestarter::measurement
\ No newline at end of file
diff --git a/src/firestarter/Measurement/Metric/IPCEstimate.cpp b/src/firestarter/Measurement/Metric/IPCEstimate.cpp
index a58f91bb..5cd49b88 100644
--- a/src/firestarter/Measurement/Metric/IPCEstimate.cpp
+++ b/src/firestarter/Measurement/Metric/IPCEstimate.cpp
@@ -19,72 +19,51 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
+#include "firestarter/Measurement/Metric/IPCEstimate.hpp"
+
 #include <chrono>
 #include <cstdlib>
-#include <string>
-
-extern "C" {
-#include <firestarter/Measurement/Metric/IPCEstimate.h>
-#include <firestarter/Measurement/MetricInterface.h>
-}
 
-static std::string errorString = "";
+auto IpcEstimateMetricData::fini() -> int32_t {
+  auto& Instance = instance();
 
-static void (*callback)(void *, const char *, int64_t, double) = nullptr;
-static void *callback_arg = nullptr;
-
-static int32_t fini(void) {
-  callback = nullptr;
-  callback_arg = nullptr;
+  Instance.Callback = nullptr;
+  Instance.CallbackArg = nullptr;
 
   return EXIT_SUCCESS;
 }
 
-static int32_t init(void) {
-  errorString = "";
+auto IpcEstimateMetricData::init() -> int32_t {
+  instance().ErrorString = "";
 
   return EXIT_SUCCESS;
 }
 
-static const char *get_error(void) {
-  const char *errorCString = errorString.c_str();
-  return errorCString;
+auto IpcEstimateMetricData::getError() -> const char* {
+  const char* ErrorCString = instance().ErrorString.c_str();
+  return ErrorCString;
 }
 
-static int32_t register_insert_callback(void (*c)(void *, const char *, int64_t,
-                                                  double),
-                                        void *arg) {
-  callback = c;
-  callback_arg = arg;
+auto IpcEstimateMetricData::registerInsertCallback(void (*C)(void*, const char*, int64_t, double), void* Arg)
+    -> int32_t {
+  auto& Instance = instance();
+
+  Instance.Callback = C;
+  Instance.CallbackArg = Arg;
+
   return EXIT_SUCCESS;
 }
 
-void ipc_estimate_metric_insert(double value) {
-  if (callback == nullptr || callback_arg == nullptr) {
+void IpcEstimateMetricData::insertValue(double Value) {
+  auto& Instance = instance();
+
+  if (Instance.Callback == nullptr || Instance.CallbackArg == nullptr) {
     return;
   }
 
-  int64_t t = std::chrono::duration_cast<std::chrono::nanoseconds>(
-                  std::chrono::high_resolution_clock::now().time_since_epoch())
-                  .count();
-
-  callback(callback_arg, "ipc-estimate", t, value);
-}
+  const int64_t T =
+      std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::high_resolution_clock::now().time_since_epoch())
+          .count();
 
-metric_interface_t ipc_estimate_metric = {
-    .name = "ipc-estimate",
-    .type = {.absolute = 1,
-             .accumalative = 0,
-             .divide_by_thread_count = 0,
-             .insert_callback = 1,
-             .ignore_start_stop_delta = 1,
-             .__reserved = 0},
-    .unit = "IPC",
-    .callback_time = 0,
-    .callback = nullptr,
-    .init = init,
-    .fini = fini,
-    .get_reading = nullptr,
-    .get_error = get_error,
-    .register_insert_callback = register_insert_callback,
-};
+  Instance.Callback(Instance.CallbackArg, "ipc-estimate", T, Value);
+}
\ No newline at end of file
diff --git a/src/firestarter/Measurement/Metric/Perf.cpp b/src/firestarter/Measurement/Metric/Perf.cpp
index 48f3120b..92a09cf1 100644
--- a/src/firestarter/Measurement/Metric/Perf.cpp
+++ b/src/firestarter/Measurement/Metric/Perf.cpp
@@ -19,81 +19,67 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
+#include "firestarter/Measurement/Metric/Perf.hpp"
+
+#include <array>
+#include <cassert>
 #include <cstring>
 #include <string>
 
 extern "C" {
-#include <firestarter/Measurement/Metric/Perf.h>
-#include <firestarter/Measurement/MetricInterface.h>
-
 #include <linux/perf_event.h>
 #include <sys/ioctl.h>
 #include <sys/syscall.h>
 #include <unistd.h>
+}
 
-#define PERF_EVENT_PARANOID "/proc/sys/kernel/perf_event_paranoid"
-
-struct read_format {
-  uint64_t nr;
-  struct {
-    uint64_t value;
-    uint64_t id;
-  } values[2];
-};
-
-static std::string errorString = "";
-
-static int cpu_cycles_fd = -1;
-static int instructions_fd = -1;
-static uint64_t cpu_cycles_id;
-static uint64_t instructions_id;
-static bool init_done = false;
-static int32_t init_value;
-
-static struct read_format last;
-
-static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
-                            int cpu, int group_fd, unsigned long flags) {
-  return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
+namespace {
+auto perfEventOpen(struct perf_event_attr* HwEvent, pid_t Pid, int Cpu, int GroupFd, uint64_t Flags) -> int {
+  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
+  return static_cast<int>(syscall(__NR_perf_event_open, HwEvent, Pid, Cpu, GroupFd, Flags));
 }
+} // namespace
 
-static int32_t fini(void) {
-  if (!(cpu_cycles_fd < 0)) {
-    close(cpu_cycles_fd);
-    cpu_cycles_fd = -1;
+auto PerfMetricData::fini() -> int32_t {
+  auto& Instance = instance();
+
+  if (!(Instance.CpuCyclesFd < 0)) {
+    close(Instance.CpuCyclesFd);
+    Instance.CpuCyclesFd = -1;
   }
-  if (!(instructions_fd < 0)) {
-    close(instructions_fd);
-    instructions_fd = -1;
+  if (!(Instance.InstructionsFd < 0)) {
+    close(Instance.InstructionsFd);
+    Instance.InstructionsFd = -1;
   }
-  init_done = false;
+  Instance.InitDone = false;
   return EXIT_SUCCESS;
 }
 
-static int32_t init(void) {
-  if (init_done) {
-    return init_value;
+auto PerfMetricData::init() -> int32_t {
+  auto& Instance = instance();
+
+  if (Instance.InitDone) {
+    return Instance.InitValue;
   }
 
-  if (access(PERF_EVENT_PARANOID, F_OK) == -1) {
+  if (access(PerfEventParanoidFile, F_OK) == -1) {
     // https://man7.org/linux/man-pages/man2/perf_event_open.2.html
     // The official way of knowing if perf_event_open() support is enabled
     // is checking for the existence of the file
     // /proc/sys/kernel/perf_event_paranoid.
-    errorString =
-        "syscall perf_event_open not supported or file " PERF_EVENT_PARANOID
-        " does not exist";
-    init_value = EXIT_FAILURE;
-    init_done = true;
+    Instance.ErrorString =
+        "syscall perf_event_open not supported or file " + std::string(PerfEventParanoidFile) + " does not exist";
+    Instance.InitValue = EXIT_FAILURE;
+    Instance.InitDone = true;
     return EXIT_FAILURE;
   }
 
-  struct perf_event_attr cpu_cycles_attr;
-  std::memset(&cpu_cycles_attr, 0, sizeof(struct perf_event_attr));
-  cpu_cycles_attr.type = PERF_TYPE_HARDWARE;
-  cpu_cycles_attr.size = sizeof(struct perf_event_attr);
-  cpu_cycles_attr.config = PERF_COUNT_HW_CPU_CYCLES;
-  cpu_cycles_attr.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
+  struct perf_event_attr CpuCyclesAttr {};
+  std::memset(&CpuCyclesAttr, 0, sizeof(struct perf_event_attr));
+  CpuCyclesAttr.type = PERF_TYPE_HARDWARE;
+  CpuCyclesAttr.size = sizeof(struct perf_event_attr);
+  CpuCyclesAttr.config = PERF_COUNT_HW_CPU_CYCLES;
+  CpuCyclesAttr.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
   // https://man7.org/linux/man-pages/man2/perf_event_open.2.html
   //     inherit
   // The inherit bit specifies that this counter should count
@@ -113,166 +99,133 @@ static int32_t init(void) {
   // changed the check
   // - if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
   // + if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
-  cpu_cycles_attr.inherit = 1;
-  cpu_cycles_attr.exclude_kernel = 1;
-  cpu_cycles_attr.exclude_hv = 1;
-
-  if ((cpu_cycles_fd = perf_event_open(
-           &cpu_cycles_attr,
-           // pid == 0 and cpu == -1
-           // This measures the calling process/thread on any CPU.
-           0, -1,
-           // The group_fd argument allows event groups to be created.  An event
-           // group has one event which is the group leader.  The leader is
-           // created first, with group_fd = -1.  The rest of the group members
-           // are created with subsequent perf_event_open() calls with group_fd
-           // being set to the file descriptor of the group leader.
-           -1, 0)) < 0) {
+  CpuCyclesAttr.inherit = 1;
+  CpuCyclesAttr.exclude_kernel = 1;
+  CpuCyclesAttr.exclude_hv = 1;
+
+  Instance.CpuCyclesFd = perfEventOpen(&CpuCyclesAttr,
+                                       // pid == 0 and cpu == -1
+                                       // This measures the calling process/thread on any CPU.
+                                       0, -1,
+                                       // The group_fd argument allows event groups to be created.  An event
+                                       // group has one event which is the group leader.  The leader is
+                                       // created first, with group_fd = -1.  The rest of the group members
+                                       // are created with subsequent perf_event_open() calls with group_fd
+                                       // being set to the file descriptor of the group leader.
+                                       -1, 0);
+
+  if (Instance.CpuCyclesFd < 0) {
     fini();
-    errorString = "perf_event_open failed for PERF_COUNT_HW_CPU_CYCLES";
-    init_value = EXIT_FAILURE;
-    init_done = true;
+    Instance.ErrorString = "perf_event_open failed for PERF_COUNT_HW_CPU_CYCLES";
+    Instance.InitValue = EXIT_FAILURE;
+    Instance.InitDone = true;
     return EXIT_FAILURE;
   }
 
-  ioctl(cpu_cycles_fd, PERF_EVENT_IOC_ID, &cpu_cycles_id);
-
-  struct perf_event_attr instructions_attr;
-  std::memset(&instructions_attr, 0, sizeof(struct perf_event_attr));
-  instructions_attr.type = PERF_TYPE_HARDWARE;
-  instructions_attr.size = sizeof(struct perf_event_attr);
-  instructions_attr.config = PERF_COUNT_HW_INSTRUCTIONS;
-  instructions_attr.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
-  instructions_attr.inherit = 1;
-  instructions_attr.exclude_kernel = 1;
-  instructions_attr.exclude_hv = 1;
-
-  if ((instructions_fd = perf_event_open(
-           &instructions_attr,
-           // pid == 0 and cpu == -1
-           // This measures the calling process/thread on any CPU.
-           0, -1,
-           // The group_fd argument allows event groups to be created.  An event
-           // group has one event which is the group leader.  The leader is
-           // created first, with group_fd = -1.  The rest of the group members
-           // are created with subsequent perf_event_open() calls with group_fd
-           // being set to the file descriptor of the group leader.
-           cpu_cycles_fd, 0)) < 0) {
+  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
+  ioctl(Instance.CpuCyclesFd, PERF_EVENT_IOC_ID, &Instance.CpuCyclesId);
+
+  struct perf_event_attr InstructionsAttr {};
+  std::memset(&InstructionsAttr, 0, sizeof(struct perf_event_attr));
+  InstructionsAttr.type = PERF_TYPE_HARDWARE;
+  InstructionsAttr.size = sizeof(struct perf_event_attr);
+  InstructionsAttr.config = PERF_COUNT_HW_INSTRUCTIONS;
+  InstructionsAttr.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
+  InstructionsAttr.inherit = 1;
+  InstructionsAttr.exclude_kernel = 1;
+  InstructionsAttr.exclude_hv = 1;
+
+  Instance.InstructionsFd = perfEventOpen(&InstructionsAttr,
+                                          // pid == 0 and cpu == -1
+                                          // This measures the calling process/thread on any CPU.
+                                          0, -1,
+                                          // The group_fd argument allows event groups to be created.  An event
+                                          // group has one event which is the group leader.  The leader is
+                                          // created first, with group_fd = -1.  The rest of the group members
+                                          // are created with subsequent perf_event_open() calls with group_fd
+                                          // being set to the file descriptor of the group leader.
+                                          Instance.CpuCyclesFd, 0);
+
+  if (Instance.InstructionsFd < 0) {
     fini();
-    errorString = "perf_event_open failed for PERF_COUNT_HW_INSTRUCTIONS";
-    init_value = EXIT_FAILURE;
-    init_done = true;
+    Instance.ErrorString = "perf_event_open failed for PERF_COUNT_HW_INSTRUCTIONS";
+    Instance.InitValue = EXIT_FAILURE;
+    Instance.InitDone = true;
     return EXIT_FAILURE;
   }
 
-  ioctl(instructions_fd, PERF_EVENT_IOC_ID, &instructions_id);
+  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
+  ioctl(Instance.InstructionsFd, PERF_EVENT_IOC_ID, &Instance.InstructionsId);
 
-  ioctl(cpu_cycles_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
-  ioctl(cpu_cycles_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
+  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
+  ioctl(Instance.CpuCyclesFd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
+  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
+  ioctl(Instance.CpuCyclesFd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
 
-  if (0 == read(cpu_cycles_fd, &last, sizeof(last))) {
+  if (0 == read(Instance.CpuCyclesFd, &Instance.Last, sizeof(Last))) {
     fini();
-    errorString = "group read failed in init";
-    init_value = EXIT_FAILURE;
-    init_done = true;
+    Instance.ErrorString = "group read failed in init";
+    Instance.InitValue = EXIT_FAILURE;
+    Instance.InitDone = true;
     return EXIT_FAILURE;
   }
 
-  init_value = EXIT_SUCCESS;
-  init_done = true;
+  Instance.InitValue = EXIT_SUCCESS;
+  Instance.InitDone = true;
   return EXIT_SUCCESS;
 }
 
-static uint64_t value_from_id(struct read_format *values, uint64_t id) {
-  for (decltype(values->nr) i = 0; i < values->nr; ++i) {
-    if (id == values->values[i].id) {
-      return values->values[i].value;
+auto PerfMetricData::valueFromId(struct ReadFormat* Reader, uint64_t Id) -> uint64_t {
+  for (decltype(Reader->Nr) I = 0; I < Reader->Nr; ++I) {
+    assert(I < 2 && "Index is out of bounds");
+    // NOLINTBEGIN(cppcoreguidelines-pro-bounds-constant-array-index)
+    if (Id == Reader->Values[I].Id) {
+      return Reader->Values[I].Value;
     }
+    // NOLINTEND(cppcoreguidelines-pro-bounds-constant-array-index)
   }
 
   return 0;
 }
 
-static int32_t get_reading(double *ipc_value, double *freq_value) {
+auto PerfMetricData::getReading(double* IpcValue, double* FreqValue) -> int32_t { // either output may be nullptr
+  auto& Instance = instance();
 
-  if (cpu_cycles_fd < 0 || instructions_fd < 0) {
+  if (Instance.CpuCyclesFd < 0 || Instance.InstructionsFd < 0) {
     fini();
     return EXIT_FAILURE;
   }
 
-  struct read_format read_values;
+  struct ReadFormat ReadValues {};
 
-  if (0 == read(cpu_cycles_fd, &read_values, sizeof(read_values))) {
+  if (read(Instance.CpuCyclesFd, &ReadValues, sizeof(ReadValues)) <= 0) { // read(2): -1 on error, 0 on EOF
     fini();
-    errorString = "group read failed";
+    Instance.ErrorString = "group read failed";
     return EXIT_FAILURE;
   }
 
-  if (ipc_value != nullptr) {
-    uint64_t diff[2];
-    diff[0] = value_from_id(&read_values, instructions_id) -
-              value_from_id(&last, instructions_id);
-    diff[1] = value_from_id(&read_values, cpu_cycles_id) -
-              value_from_id(&last, cpu_cycles_id);
+  if (IpcValue != nullptr) {
+    std::array<uint64_t, 2> Diff = {
+        valueFromId(&ReadValues, Instance.InstructionsId) - valueFromId(&Instance.Last, Instance.InstructionsId),
+        valueFromId(&ReadValues, Instance.CpuCyclesId) - valueFromId(&Instance.Last, Instance.CpuCyclesId)};
 
-    std::memcpy(&last, &read_values, sizeof(last));
+    std::memcpy(&Instance.Last, &ReadValues, sizeof(Last));
 
-    *ipc_value = (double)diff[0] / (double)diff[1];
+    *IpcValue = static_cast<double>(Diff[0]) / static_cast<double>(Diff[1]);
   }
 
-  if (freq_value != nullptr) {
-    *freq_value = (double)value_from_id(&read_values, cpu_cycles_id) / 1e9;
+  if (FreqValue != nullptr) {
+    *FreqValue = static_cast<double>(valueFromId(&ReadValues, Instance.CpuCyclesId)) / 1e9;
   }
 
   return EXIT_SUCCESS;
 }
 
-static int32_t get_reading_ipc(double *value) {
-  return get_reading(value, nullptr);
-}
+auto PerfMetricData::getReadingIpc(double* Value) -> int32_t { return getReading(Value, nullptr); }
 
-static int32_t get_reading_freq(double *value) {
-  return get_reading(nullptr, value);
-}
-
-static const char *get_error(void) {
-  const char *errorCString = errorString.c_str();
-  return errorCString;
-}
-}
+auto PerfMetricData::getReadingFreq(double* Value) -> int32_t { return getReading(nullptr, Value); }
 
-metric_interface_t perf_ipc_metric = {
-    .name = "perf-ipc",
-    .type = {.absolute = 1,
-             .accumalative = 0,
-             .divide_by_thread_count = 0,
-             .insert_callback = 0,
-             .ignore_start_stop_delta = 0,
-             .__reserved = 0},
-    .unit = "IPC",
-    .callback_time = 0,
-    .callback = nullptr,
-    .init = init,
-    .fini = fini,
-    .get_reading = get_reading_ipc,
-    .get_error = get_error,
-    .register_insert_callback = nullptr,
-};
-
-metric_interface_t perf_freq_metric = {
-    .name = "perf-freq",
-    .type = {.absolute = 0,
-             .accumalative = 1,
-             .divide_by_thread_count = 1,
-             .insert_callback = 0,
-             .ignore_start_stop_delta = 0,
-             .__reserved = 0},
-    .unit = "GHz",
-    .callback_time = 0,
-    .callback = nullptr,
-    .init = init,
-    .fini = fini,
-    .get_reading = get_reading_freq,
-    .get_error = get_error,
-    .register_insert_callback = nullptr,
-};
+auto PerfMetricData::getError() -> const char* {
+  // The returned pointer aliases ErrorString's buffer; it stays valid only until that string changes.
+  return instance().ErrorString.c_str();
+}
\ No newline at end of file
diff --git a/src/firestarter/Measurement/Metric/RAPL.cpp b/src/firestarter/Measurement/Metric/RAPL.cpp
index 5f6b4bd7..458b2643 100644
--- a/src/firestarter/Measurement/Metric/RAPL.cpp
+++ b/src/firestarter/Measurement/Metric/RAPL.cpp
@@ -19,55 +19,34 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
+#include "firestarter/Measurement/Metric/RAPL.hpp"
+
 #include <cstdio>
 #include <cstring>
 #include <fstream>
 #include <memory>
 #include <sstream>
+#include <string>
 #include <vector>
 
 extern "C" {
-#include <firestarter/Measurement/Metric/RAPL.h>
-#include <firestarter/Measurement/MetricInterface.h>
-
 #include <dirent.h>
+}
 
-#define RAPL_PATH "/sys/class/powercap"
-
-static std::string errorString = "";
-
-struct reader_def {
-  char *path;
-  long long int last_reading;
-  long long int overflow;
-  long long int max;
-};
-
-struct reader_def_free {
-  void operator()(struct reader_def *def) {
-    if (def != nullptr) {
-      if (((void *)def->path) != nullptr) {
-        free((void *)def->path);
-      }
-      free((void *)def);
-    }
-  }
-};
-
-static std::vector<std::shared_ptr<struct reader_def>> readers = {};
-
-static int32_t fini(void) {
-  readers.clear();
+auto RaplMetricData::fini() -> int32_t {
+  instance().Readers.clear();
 
   return EXIT_SUCCESS;
 }
 
-static int32_t init(void) {
-  errorString = "";
+auto RaplMetricData::init() -> int32_t {
+  auto& Instance = instance();
 
-  DIR *raplDir = opendir(RAPL_PATH);
-  if (raplDir == NULL) {
-    errorString = "Could not open " RAPL_PATH;
+  Instance.ErrorString = "";
+
+  DIR* RaplDir = opendir(RaplPath);
+  if (RaplDir == nullptr) {
+    Instance.ErrorString = "Could not open " + std::string(RaplPath);
     return EXIT_FAILURE;
   }
 
@@ -76,111 +55,86 @@ static int32_t init(void) {
   // and finally package only.
 
   // contains an empty path if it is not found
-  std::string psysPath = "";
+  std::string PsysPath;
 
   // a vector of all paths to package and dram
-  std::vector<std::string> paths = {};
-
-  struct dirent *dir;
-  while ((dir = readdir(raplDir)) != NULL) {
-    std::stringstream path;
-    std::stringstream namePath;
-    path << RAPL_PATH << "/" << dir->d_name;
-    namePath << path.str() << "/name";
-
-    std::ifstream nameStream(namePath.str());
-    if (!nameStream.good()) {
+  std::vector<std::string> Paths = {};
+
+  struct dirent* Dir = nullptr;
+
+  // As long as the DIR object (named RaplDir here) is not shared between threads this call is thread-safe:
+  // https://www.gnu.org/software/libc/manual/html_node/Reading_002fClosing-Directory.html
+  // NOLINTNEXTLINE(concurrency-mt-unsafe)
+  while ((Dir = readdir(RaplDir)) != nullptr) {
+    std::stringstream Path;
+    std::stringstream NamePath;
+    // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-array-to-pointer-decay)
+    Path << RaplPath << "/" << Dir->d_name;
+    NamePath << Path.str() << "/name";
+
+    std::ifstream NameStream(NamePath.str());
+    if (!NameStream.good()) {
       // an error opening the file occured
       continue;
     }
 
-    std::string name;
-    std::getline(nameStream, name);
+    std::string Name;
+    std::getline(NameStream, Name);
 
-    if (name == "psys") {
+    if (Name == "psys") {
       // found psys
-      psysPath = path.str();
-    } else if (0 == name.rfind("package", 0) || name == "dram") {
+      PsysPath = Path.str();
+    } else if (0 == Name.rfind("package", 0) || Name == "dram") {
       // find all package and dram
-      paths.push_back(path.str());
+      Paths.push_back(Path.str());
     }
   }
-  closedir(raplDir);
+  closedir(RaplDir);
 
   // make psys the only value if available
-  if (!psysPath.empty()) {
-    paths.clear();
-    paths.push_back(psysPath);
+  if (!PsysPath.empty()) {
+    Paths.clear();
+    Paths.push_back(PsysPath);
   }
 
   // paths now contains all interesting nodes
 
-  if (paths.size() == 0) {
-    errorString = "No valid entries in " RAPL_PATH;
+  if (Paths.empty()) {
+    Instance.ErrorString = "No valid entries in " + std::string(RaplPath);
     return EXIT_FAILURE;
   }
 
-  for (auto const &path : paths) {
-    std::stringstream energyUjPath;
-    energyUjPath << path << "/energy_uj";
-    std::ifstream energyReadingStream(energyUjPath.str());
-    if (!energyReadingStream.good()) {
-      errorString = "Could not read energy_uj";
+  for (auto const& Path : Paths) {
+    std::stringstream EnergyUjPath;
+    EnergyUjPath << Path << "/energy_uj";
+    std::ifstream EnergyReadingStream(EnergyUjPath.str());
+    if (!EnergyReadingStream.good()) {
+      Instance.ErrorString = "Could not read energy_uj";
       break;
     }
 
-    std::stringstream maxEnergyUjRangePath;
-    maxEnergyUjRangePath << path << "/max_energy_range_uj";
-    std::ifstream maxEnergyReadingStream(maxEnergyUjRangePath.str());
-    if (!maxEnergyReadingStream.good()) {
-      errorString = "Could not read max_energy_range_uj";
+    std::stringstream MaxEnergyUjRangePath;
+    MaxEnergyUjRangePath << Path << "/max_energy_range_uj";
+    std::ifstream MaxEnergyReadingStream(MaxEnergyUjRangePath.str());
+    if (!MaxEnergyReadingStream.good()) {
+      Instance.ErrorString = "Could not read max_energy_range_uj";
       break;
     }
 
-    unsigned long long reading;
-    unsigned long long max;
-    std::string buffer;
-    int read;
+    std::string Buffer;
 
-    std::getline(energyReadingStream, buffer);
-    read = std::sscanf(buffer.c_str(), "%llu", &reading);
-
-    if (read == 0) {
-      std::stringstream ss;
-      ss << "Contents in file " << energyUjPath.str()
-         << " do not conform to mask (unsigned long long)";
-      errorString = ss.str();
-      break;
-    }
+    std::getline(EnergyReadingStream, Buffer);
+    const auto Reading = std::stoul(Buffer);
 
-    std::getline(maxEnergyReadingStream, buffer);
-    read = std::sscanf(buffer.c_str(), "%llu", &max);
+    std::getline(MaxEnergyReadingStream, Buffer);
+    const auto Max = std::stoul(Buffer);
 
-    if (read == 0) {
-      std::stringstream ss;
-      ss << "Contents in file " << maxEnergyUjRangePath.str()
-         << " do not conform to mask (unsigned long long)";
-      errorString = ss.str();
-      break;
-    }
+    auto Def = std::make_unique<ReaderDef>(/*Path=*/Path, /*LastReading=*/Reading, /*Overflow=*/0, /*Max=*/Max);
 
-    std::shared_ptr<struct reader_def> def(
-        reinterpret_cast<struct reader_def *>(
-            malloc(sizeof(struct reader_def))),
-        reader_def_free());
-    auto pathName = path.c_str();
-    size_t size = (strlen(pathName) + 1) * sizeof(char);
-    void *name = malloc(size);
-    memcpy(name, pathName, size);
-    def->path = (char *)name;
-    def->max = max;
-    def->last_reading = reading;
-    def->overflow = 0;
-
-    readers.push_back(def);
+    Instance.Readers.emplace_back(std::move(Def));
   }
 
-  if (errorString.size() != 0) {
+  if (!Instance.ErrorString.empty()) {
     fini();
     return EXIT_FAILURE;
   }
@@ -188,60 +142,39 @@ static int32_t init(void) {
   return EXIT_SUCCESS;
 }
 
-static int32_t get_reading(double *value) {
-  double finalReading = 0.0;
+auto RaplMetricData::getReading(double* Value) -> int32_t {
+  double FinalReading = 0.0;
 
-  for (auto &def : readers) {
-    long long int reading;
-    std::string buffer;
+  for (auto& Def : instance().Readers) {
+    std::string Buffer;
 
-    std::stringstream energyUjPath;
-    energyUjPath << def->path << "/energy_uj";
-    std::ifstream energyReadingStream(energyUjPath.str());
-    std::getline(energyReadingStream, buffer);
-    std::sscanf(buffer.c_str(), "%llu", &reading);
+    std::stringstream EnergyUjPath;
+    EnergyUjPath << Def->Path << "/energy_uj";
+    std::ifstream EnergyReadingStream(EnergyUjPath.str());
+    std::getline(EnergyReadingStream, Buffer);
+    const auto Reading = std::stoll(Buffer);
 
-    if (reading < def->last_reading) {
-      def->overflow += 1;
+    if (Reading < Def->LastReading) {
+      Def->Overflow += 1;
     }
 
-    def->last_reading = reading;
+    Def->LastReading = Reading;
 
-    finalReading +=
-        1.0E-6 * (double)(def->overflow * def->max + def->last_reading);
+    FinalReading += 1.0E-6 * static_cast<double>((Def->Overflow * Def->Max) + Def->LastReading);
   }
 
-  if (value != nullptr) {
-    *value = finalReading;
+  if (Value != nullptr) {
+    *Value = FinalReading;
   }
 
   return EXIT_SUCCESS;
 }
 
-static const char *get_error(void) {
-  const char *errorCString = errorString.c_str();
-  return errorCString;
+auto RaplMetricData::getError() -> const char* {
+  // The returned pointer aliases ErrorString's buffer; it stays valid only until that string changes.
+  return instance().ErrorString.c_str();
 }
 
 // this function will be called periodically to make sure we do not miss an
 // overflow of the counter
-static void callback(void) { get_reading(nullptr); }
-}
-
-metric_interface_t rapl_metric = {
-    .name = "sysfs-powercap-rapl",
-    .type = {.absolute = 0,
-             .accumalative = 1,
-             .divide_by_thread_count = 0,
-             .insert_callback = 0,
-             .ignore_start_stop_delta = 0,
-             .__reserved = 0},
-    .unit = "J",
-    .callback_time = 30000000,
-    .callback = callback,
-    .init = init,
-    .fini = fini,
-    .get_reading = get_reading,
-    .get_error = get_error,
-    .register_insert_callback = nullptr,
-};
+void RaplMetricData::callback() { getReading(nullptr); }
\ No newline at end of file
diff --git a/src/firestarter/Measurement/Summary.cpp b/src/firestarter/Measurement/Summary.cpp
index 590c4e01..1fecb99f 100644
--- a/src/firestarter/Measurement/Summary.cpp
+++ b/src/firestarter/Measurement/Summary.cpp
@@ -19,88 +19,82 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
-#include <firestarter/Measurement/Summary.hpp>
+#include "firestarter/Measurement/Summary.hpp"
 
 #include <cassert>
 #include <cmath>
 
-using namespace firestarter::measurement;
+namespace firestarter::measurement {
 
 // this functions borows a lot of code from
 // https://github.com/metricq/metricq-cpp/blob/master/tools/metricq-summary/src/summary.cpp
-Summary Summary::calculate(std::vector<TimeValue>::iterator begin,
-                           std::vector<TimeValue>::iterator end,
-                           metric_type_t metricType,
-                           unsigned long long numThreads) {
-  std::vector<TimeValue> values = {};
-
-  // TODO: i would really like to make this code a bit more readable, but i
-  // could not find a way yet.
-  if (metricType.accumalative) {
-    TimeValue prev;
-
-    if (begin != end) {
-      prev = *begin++;
-      for (auto it = begin; it != end; ++it) {
-        auto time_diff =
-            1e-6 *
-            (double)std::chrono::duration_cast<std::chrono::microseconds>(
-                it->time - prev.time)
-                .count();
-        auto value_diff = it->value - prev.value;
-
-        double value = value_diff / time_diff;
-
-        if (metricType.divide_by_thread_count) {
-          value /= numThreads;
+auto Summary::calculate(std::vector<TimeValue>::iterator Begin, std::vector<TimeValue>::iterator End,
+                        MetricType MetricType, uint64_t NumThreads) -> Summary {
+  std::vector<TimeValue> Values;
+
+  if (MetricType.Accumalative) {
+    TimeValue Prev;
+
+    if (Begin != End) {
+      Prev = *Begin++;
+      for (auto It = Begin; It != End; ++It) {
+        auto TimeDiff = 1e-6 * static_cast<double>(
+                                   std::chrono::duration_cast<std::chrono::microseconds>(It->Time - Prev.Time).count());
+        auto ValueDiff = It->Value - Prev.Value;
+
+        double Value = ValueDiff / TimeDiff;
+
+        if (MetricType.DivideByThreadCount) {
+          Value /= static_cast<double>(NumThreads);
         }
 
-        values.push_back(TimeValue(prev.time, value));
-        prev = *it;
+        Values.emplace_back(Prev.Time, Value);
+        Prev = *It;
       }
     }
-  } else if (metricType.absolute) {
-    for (auto it = begin; it != end; ++it) {
-      double value = it->value;
+  } else if (MetricType.Absolute) {
+    for (auto It = Begin; It != End; ++It) {
+      double Value = It->Value;
 
-      if (metricType.divide_by_thread_count) {
-        value /= numThreads;
+      if (MetricType.DivideByThreadCount) {
+        Value /= static_cast<double>(NumThreads);
       }
 
-      values.push_back(TimeValue(it->time, value));
+      Values.emplace_back(It->Time, Value);
     }
   } else {
     assert(false);
   }
 
-  begin = values.begin();
-  end = values.end();
+  Begin = Values.begin();
+  End = Values.end();
 
-  Summary summary{};
+  Summary SummaryVal{};
 
-  summary.num_timepoints = std::distance(begin, end);
+  SummaryVal.NumTimepoints = std::distance(Begin, End);
 
-  if (summary.num_timepoints > 0) {
+  if (SummaryVal.NumTimepoints > 0) {
 
-    auto last = begin;
-    std::advance(last, summary.num_timepoints - 1);
-    summary.duration = std::chrono::duration_cast<std::chrono::milliseconds>(
-        last->time - begin->time);
+    auto Last = Begin;
+    std::advance(Last, SummaryVal.NumTimepoints - 1);
+    SummaryVal.Duration = std::chrono::duration_cast<std::chrono::milliseconds>(Last->Time - Begin->Time);
 
-    auto sum_over_nths = [&begin, end, summary](auto fn) {
-      double acc = 0.0;
-      for (auto it = begin; it != end; ++it) {
-        acc += fn(it->value);
+    auto SumOverNths = [&Begin, End, SummaryVal](auto Fn) {
+      double Acc = 0.0;
+      for (auto It = Begin; It != End; ++It) {
+        Acc += Fn(It->Value);
       }
-      return acc / summary.num_timepoints;
+      return Acc / static_cast<double>(SummaryVal.NumTimepoints);
     };
 
-    summary.average = sum_over_nths([](double v) { return v; });
-    summary.stddev = std::sqrt(sum_over_nths([&summary](double v) {
-      double centered = v - summary.average;
-      return centered * centered;
+    SummaryVal.Average = SumOverNths([](double V) { return V; });
+    SummaryVal.Stddev = std::sqrt(SumOverNths([&SummaryVal](double V) {
+      const auto Centered = V - SummaryVal.Average;
+      return Centered * Centered;
     }));
   }
 
-  return summary;
+  return SummaryVal;
 }
+
+} // namespace firestarter::measurement
\ No newline at end of file
diff --git a/src/firestarter/OneAPI/OneAPI.cpp b/src/firestarter/OneAPI/OneAPI.cpp
index f09f79b0..3a5cfc4d 100644
--- a/src/firestarter/OneAPI/OneAPI.cpp
+++ b/src/firestarter/OneAPI/OneAPI.cpp
@@ -22,297 +22,275 @@
 /* OneAPI for GPUs, based on CUDA component
  *****************************************************************************/
 
-#include <firestarter/OneAPI/OneAPI.hpp>
-#include <firestarter/LoadWorkerData.hpp>
-#include <firestarter/Logging/Log.hpp>
+#include "firestarter/OneAPI/OneAPI.hpp"
+#include "firestarter/Logging/Log.hpp"
 
-
-#include <sycl/sycl.hpp>
 #include <oneapi/mkl.hpp>
-
+#include <sycl/sycl.hpp>
 
 #include <algorithm>
 #include <atomic>
+#include <cmath>
 #include <type_traits>
 
-using namespace firestarter::oneapi;
+namespace firestarter::oneapi {
 
+namespace {
 
-/* Random number generation helpers */
-template <typename T>
-void generate_random_data(size_t elems, T *v)
-{
-    for (size_t i = 0; i < elems; i++)
-        v[i] = double(std::rand()) / RAND_MAX;
-}
+/// Helper function to generate random floating point values between 0 and 1 in an array.
+/// \tparam FloatingPointType The type of floating point value of the array. Either float or double.
+/// \arg NumberOfElems The number of elements of the array.
+/// \arg Array The array of floating point values which should be initialized with random data between 0 and 1.
+template <typename FloatingPointType> void fillArrayWithRandomFloats(size_t NumberOfElems, FloatingPointType* Array) {
+  static_assert(std::is_same_v<FloatingPointType, float> || std::is_same_v<FloatingPointType, double>,
+                "fillArrayWithRandomFloats<FloatingPointType>: Template argument must be either float or double");
 
-template <typename T>
-void replicate_data(sycl::queue &Q, T *dst, size_t dst_elems, const T *src, size_t src_elems)
-{
-    firestarter::log::trace() << "replicate_data " << dst_elems << " elements from " <<
-                                 src << " to " << dst ;
-    while (dst_elems > 0) {
-        auto copy_elems = std::min(dst_elems, src_elems);
-        Q.copy(src,  dst, copy_elems);
-        dst += copy_elems;
-        dst_elems -= copy_elems;
-    }
-    Q.wait();
+  for (size_t i = 0; i < NumberOfElems; i++) {
+    Array[i] = static_cast<FloatingPointType>(std::rand()) / static_cast<FloatingPointType>(RAND_MAX);
+  }
 }
 
-static int get_precision(int device_index, int useDouble) {
+/// Fill DstElems elements at Dst by repeatedly copying up to SrcElems elements from Src via Q, then wait on Q.
+template <typename FloatingPointType>
+void replicateData(sycl::queue& Q, FloatingPointType* Dst, size_t DstElems, const FloatingPointType* Src,
+                   size_t SrcElems) {
+  static_assert(std::is_same_v<FloatingPointType, float> || std::is_same_v<FloatingPointType, double>,
+                "replicateData<FloatingPointType>: Template argument must be either float or double");
+  firestarter::log::trace() << "replicateData<FloatingPointType> " << DstElems << " elements from " << Src << " to "
+                            << Dst;
+  while (DstElems > 0) {
+    const auto CopyElems = std::min(DstElems, SrcElems);
+    Q.copy(Src, Dst, CopyElems);
+    Dst += CopyElems;
+    DstElems -= CopyElems;
+  }
+  Q.wait();
+}
 
-  firestarter::log::trace() << "Checking useDouble " << useDouble;
+int getPrecision(int DeviceIndex, int UseDouble) {
+  firestarter::log::trace() << "Checking UseDouble " << UseDouble;
 
-  if (!useDouble){
+  if (!UseDouble) {
     return 0;
   }
 
-  int supports_double = 0;
+  int SupportsDouble = 0;
 
-  auto platforms = sycl::platform::get_platforms();
+  auto Platforms = sycl::platform::get_platforms();
 
-  if (platforms.empty()) {
+  if (Platforms.empty()) {
     firestarter::log::warn() << "No SYCL platforms found.";
     return -1;
   }
   // Choose a platform based on specific criteria (e.g., device type)
-  sycl::platform chosenPlatform;
-  auto nr_gpus = 0;
-  for (const auto &platform : platforms) {
-    firestarter::log::trace() << "Checking SYCL platform " << platform.get_info<sycl::info::platform::name>();
-    auto devices = platform.get_devices();
-    nr_gpus = 0;
-    for (const auto &device : devices) {
-      firestarter::log::trace() << "Checking SYCL device " << device.get_info<sycl::info::device::name>();
-      if (device.is_gpu()) { // Choose GPU, you can use other criteria
+  // TODO(Issue #75): We may select the incorrect platform with gpu devices of the wrong vendor/type.
+  sycl::platform ChosenPlatform;
+  auto NbGpus = 0;
+  for (const auto& Platform : Platforms) {
+    firestarter::log::trace() << "Checking SYCL platform " << Platform.get_info<sycl::info::platform::name>();
+    auto Devices = Platform.get_devices();
+    NbGpus = 0;
+    for (const auto& Device : Devices) {
+      firestarter::log::trace() << "Checking SYCL device " << Device.get_info<sycl::info::device::name>();
+      if (Device.is_gpu()) { // Choose GPU, you can use other criteria
         firestarter::log::trace() << " ... is GPU";
-        chosenPlatform = platform;
-        nr_gpus++;
+        ChosenPlatform = Platform;
+        NbGpus++;
       }
     }
   }
 
-  if (!nr_gpus) {
+  if (!NbGpus) {
     firestarter::log::warn() << "No suitable platform with GPU found.";
     return -1;
   }
   // Get a list of devices for the chosen platform
 
-
   firestarter::log::trace() << "Get support for double"
-                     << " on device nr. "
-                     << device_index;
-  auto devices = chosenPlatform.get_devices();
-  if (devices[device_index].has(sycl::aspect::fp64))
-    supports_double=1;
+                            << " on device nr. " << DeviceIndex;
+  auto Devices = ChosenPlatform.get_devices();
+  if (Devices[DeviceIndex].has(sycl::aspect::fp64))
+    SupportsDouble = 1;
 
-  return supports_double;
+  return SupportsDouble;
 }
 
-static int round_up(int num_to_round, int multiple) {
-  if (multiple == 0) {
-    return num_to_round;
-  }
+template <std::size_t Multiple> auto roundUp(int NumToRound) -> int {
+  static_assert(Multiple != 0, "Multiple may not be zero.");
 
-  int remainder = num_to_round % multiple;
-  if (remainder == 0) {
-    return num_to_round;
+  const int Remainder = NumToRound % Multiple;
+  if (Remainder == 0) {
+    return NumToRound;
   }
 
-  return num_to_round + multiple - remainder;
+  return NumToRound + Multiple - Remainder;
 }
 
-
 // GPU index. Used to pin this thread to the GPU.
-template <typename T>
-static void create_load(std::condition_variable &waitForInitCv,
-                        std::mutex &waitForInitCvMutex, int device_index,
-                        std::atomic<int> &initCount,
-                        volatile unsigned long long *loadVar, int matrixSize) {
-  static_assert(
-      std::is_same<T, float>::value || std::is_same<T, double>::value,
-      "create_load<T>: Template argument T must be either float or double");
-
-  firestarter::log::trace() << "Starting OneAPI with given matrix size "
-                            << matrixSize;
-
-  size_t size_use = 0;
-  if (matrixSize > 0) {
-    size_use = matrixSize;
-  }
+// The main difference to the CUDA/HIP version is that we do not run multiple iterations of C=A*B, just one single
+// iteration.
+template <typename FloatingPointType>
+void createLoad(std::condition_variable& WaitForInitCv, std::mutex& WaitForInitCvMutex, int DeviceIndex,
+                std::atomic<int>& InitCount, const volatile firestarter::LoadThreadWorkType& LoadVar,
+                unsigned MatrixSize) {
+  static_assert(std::is_same<FloatingPointType, float>::value || std::is_same<FloatingPointType, double>::value,
+                "createLoad<T>: Template argument T must be either float or double");
 
-  size_t use_bytes;
+  firestarter::log::trace() << "Starting OneAPI with given matrix size " << MatrixSize;
 
   // reserving the GPU and initializing
 
-  firestarter::log::trace() << "Getting device nr. " << device_index;
+  firestarter::log::trace() << "Getting device nr. " << DeviceIndex;
 
-  auto platforms = sycl::platform::get_platforms();
+  auto Platforms = sycl::platform::get_platforms();
 
-  if (platforms.empty()) {
+  if (Platforms.empty()) {
     firestarter::log::warn() << "No SYCL platforms found.";
     return;
   }
 
   // Choose a platform based on specific criteria (e.g., device type)
-  sycl::platform chosenPlatform;
-  auto nr_gpus = 0;
-  for (const auto &platform : platforms) {
-    auto devices = platform.get_devices();
-    nr_gpus = 0;
-    for (const auto &device : devices) {
-        if (device.is_gpu()) { // Choose GPU, you can use other criteria
-            chosenPlatform = platform;
-            nr_gpus++;
-        }
+  sycl::platform ChosenPlatform;
+  auto NbGpus = 0;
+  for (const auto& Platform : Platforms) {
+    auto Devices = Platform.get_devices();
+    NbGpus = 0;
+    for (const auto& Device : Devices) {
+      if (Device.is_gpu()) { // Choose GPU, you can use other criteria
+        ChosenPlatform = Platform;
+        NbGpus++;
+      }
     }
   }
 
-  if (!nr_gpus) {
+  if (!NbGpus) {
     firestarter::log::warn() << "No suitable platform with GPU found.";
     return;
   }
 
-    // Get a list of devices for the chosen platform
-  auto devices = chosenPlatform.get_devices();
-  
+  // Get a list of devices for the chosen platform
+  auto Devices = ChosenPlatform.get_devices();
 
-  firestarter::log::trace() << "Creating SYCL queue for computation on device nr. "
-                     << device_index;
-  auto chosenDevice = devices[device_index];
-  sycl::queue device_queue(chosenDevice);
+  firestarter::log::trace() << "Creating SYCL queue for computation on device nr. " << DeviceIndex;
+  auto ChosenDevice = Devices[DeviceIndex];
+  auto DeviceQueue = sycl::queue(ChosenDevice);
 
-  firestarter::log::trace() << "Get memory size on device nr. " << device_index;
-  
+  firestarter::log::trace() << "Get memory size on device nr. " << DeviceIndex;
 
   // getting information about the GPU memory
-  size_t memory_total = devices[device_index].get_info<sycl::info::device::global_mem_size>();
-
-  firestarter::log::trace() << "Get Memory info on device nr. "
-                     << device_index
-                     <<": has " << memory_total << " B global memory";
-
-  // check if the user has not set a matrix OR has set a too big matrixsite and
-  // if this is true: set a good matrixsize
-  if (!size_use || ((size_use * size_use * sizeof(T) * 3 > memory_total))) {
-    size_use = round_up((int)(0.8 * sqrt(((memory_total) / (sizeof(T) * 3)))),
-                        1024); // a multiple of 1024 works always well
+  size_t MemoryTotal = Devices[DeviceIndex].get_info<sycl::info::device::global_mem_size>();
+
+  firestarter::log::trace() << "Get Memory info on device nr. " << DeviceIndex << ": has " << MemoryTotal
+                            << " B global memory";
+
+  // If the matrix size is not set or three square matrices with dim size of MatrixSize do not fit into the available
+  // memory, select the size so that 3 square matrices will fit into the available device memory where the dim size
+  // is a multiple of 1024.
+  std::size_t MemorySize = sizeof(FloatingPointType) * MatrixSize * MatrixSize;
+  if (!MatrixSize || (MemorySize * 3 > MemoryTotal)) {
+    // a multiple of 1024 always works well
+    MatrixSize = roundUp<1024>(0.8 * std::sqrt(MemoryTotal / sizeof(FloatingPointType) / 3));
+    MemorySize = sizeof(FloatingPointType) * MatrixSize * MatrixSize;
   }
 
-  firestarter::log::trace() << "Set OneAPI matrix size in B: " << size_use;
-  use_bytes =sizeof(T) * size_use * size_use * 3;
-
-
+  firestarter::log::trace() << "Set OneAPI matrix size in B: " << MatrixSize;
 
   /* Allocate A/B/C matrices */
 
-  firestarter::log::trace()
-      << "Allocating memory on device nr. "
-      << device_index;
-  auto A = malloc_device<T>(size_use * size_use, device_queue);
-  auto B = malloc_device<T>(size_use * size_use, device_queue);
-  auto C = malloc_device<T>(size_use * size_use, device_queue);
+  firestarter::log::trace() << "Allocating memory on device nr. " << DeviceIndex;
+  auto* A = sycl::malloc_device<FloatingPointType>(MatrixSize * MatrixSize, DeviceQueue);
+  auto* B = sycl::malloc_device<FloatingPointType>(MatrixSize * MatrixSize, DeviceQueue);
+  auto* C = sycl::malloc_device<FloatingPointType>(MatrixSize * MatrixSize, DeviceQueue);
 
   /* Create 64 MB random data on Host */
-  constexpr int rd_size = 1024*1024*64;
-  auto random_data = malloc_host<T>(rd_size, device_queue);
-  generate_random_data(rd_size, random_data);
+  constexpr int RandomSize = 1024 * 1024 * 64;
+  auto* RandomData = sycl::malloc_host<FloatingPointType>(RandomSize, DeviceQueue);
+  fillArrayWithRandomFloats<FloatingPointType>(RandomSize, RandomData);
 
-  firestarter::log::trace()
-      << "Copy memory to device nr. "
-      << device_index;
+  firestarter::log::trace() << "Copy memory to device nr. " << DeviceIndex;
   /* fill A and B with random data */
-  replicate_data(device_queue, A, size_use * size_use, random_data, rd_size);
-  replicate_data(device_queue, B, size_use * size_use, random_data, rd_size);
+  replicateData(DeviceQueue, A, MatrixSize * MatrixSize, RandomData, RandomSize);
+  replicateData(DeviceQueue, B, MatrixSize * MatrixSize, RandomData, RandomSize);
 
   {
-    std::lock_guard<std::mutex> lk(waitForInitCvMutex);
-
-#define TO_MB(x) (unsigned long)(x / 1024 / 1024)
-  firestarter::log::info()
-      << "   GPU " << device_index << "\n"
-      << "    name:           " << devices[device_index].get_info<sycl::info::device::name>() << "\n"
-      << "    memory:         " << TO_MB(memory_total) << " MiB total (using " << TO_MB(use_bytes)
-      << " MiB)\n"
-      << "    matrix size:    " << size_use << "\n"
-      << "    used precision: "
-      << ((sizeof(T) == sizeof(double)) ? "double" : "single");
-#undef TO_MB
-
-    initCount++;
+    std::lock_guard<std::mutex> lk(WaitForInitCvMutex);
+
+    auto ToMiB = [](const size_t Val) { return Val / 1024 / 1024; };
+    firestarter::log::info() << "   GPU " << DeviceIndex << "\n"
+                             << "    name:           " << Devices[DeviceIndex].get_info<sycl::info::device::name>()
+                             << "\n"
+                             << "    memory:         " << ToMiB(MemoryTotal) << " MiB total (using "
+                             << ToMiB(MemorySize) << " MiB)\n"
+                             << "    matrix size:    " << MatrixSize << "\n"
+                             << "    used precision: "
+                             << ((sizeof(FloatingPointType) == sizeof(double)) ? "double" : "single");
+
+    InitCount++;
   }
-  waitForInitCv.notify_all();
-
-  firestarter::log::trace() << "Run gemm on device nr. " << device_index;
-  /* With this, we could run multiple gemms ...*/
-/*  auto run_gemms = [=, &device_queue](int runs) -> double {
-      using namespace oneapi::mkl;
-      for (int i = 0; i < runs; i++)
-          
-      return runs;
-  };
-*/
-  while (*loadVar != LOAD_STOP) {
-  firestarter::log::trace() << "Run gemm on device nr. " << device_index;
-    oneapi::mkl::blas::gemm(device_queue, oneapi::mkl::transpose::N, oneapi::mkl::transpose::N, size_use, size_use, size_use, 1, A, size_use, B, size_use, 0, C, size_use);
-  firestarter::log::trace() << "wait gemm on device nr. " << device_index;
-    device_queue.wait_and_throw();
+  WaitForInitCv.notify_all();
+
+  firestarter::log::trace() << "Run gemm on device nr. " << DeviceIndex;
+  while (LoadVar != firestarter::LoadThreadWorkType::LoadStop) {
+    firestarter::log::trace() << "Run gemm on device nr. " << DeviceIndex;
+    ::oneapi::mkl::blas::gemm(DeviceQueue, ::oneapi::mkl::transpose::N, ::oneapi::mkl::transpose::N, MatrixSize,
+                              MatrixSize, MatrixSize, 1, A, MatrixSize, B, MatrixSize, 0, C, MatrixSize);
+    firestarter::log::trace() << "wait gemm on device nr. " << DeviceIndex;
+    DeviceQueue.wait_and_throw();
   }
-
 }
 
-OneAPI::OneAPI(volatile unsigned long long *loadVar, bool useFloat, bool useDouble,
-           unsigned matrixSize, int gpus) {
-  std::thread t(OneAPI::initGpus, std::ref(_waitForInitCv), loadVar, useFloat,
-                useDouble, matrixSize, gpus);
-  _initThread = std::move(t);
+} // namespace
+
+OneAPI::OneAPI(const volatile firestarter::LoadThreadWorkType& LoadVar, bool UseFloat, bool UseDouble,
+               unsigned MatrixSize, int Gpus) {
+  std::condition_variable WaitForInitCv;
+  std::mutex WaitForInitCvMutex;
+
+  std::thread T(OneAPI::initGpus, std::ref(WaitForInitCv), std::cref(LoadVar), UseFloat, UseDouble, MatrixSize, Gpus);
+  InitThread = std::move(T);
 
-  std::unique_lock<std::mutex> lk(_waitForInitCvMutex);
+  std::unique_lock<std::mutex> Lk(WaitForInitCvMutex);
   // wait for gpus to initialize
-  _waitForInitCv.wait(lk);
+  WaitForInitCv.wait(Lk);
 }
 
-void OneAPI::initGpus(std::condition_variable &cv,
-                    volatile unsigned long long *loadVar, bool useFloat,
-                    bool useDouble, unsigned matrixSize, int gpus) {
-  std::condition_variable waitForInitCv;
-  std::mutex waitForInitCvMutex;
+void OneAPI::initGpus(std::condition_variable& WaitForInitCv, const volatile firestarter::LoadThreadWorkType& LoadVar,
+                      bool UseFloat, bool UseDouble, unsigned MatrixSize, int Gpus) {
+  std::condition_variable GpuThreadsWaitForInitCv;
+  std::mutex GpuThreadsWaitForInitCvMutex;
+  std::vector<std::thread> GpuThreads;
 
-  if (gpus) {
+  if (Gpus != 0) {
+    auto Platforms = sycl::platform::get_platforms();
 
-    auto platforms = sycl::platform::get_platforms();
-
-    if (platforms.empty()) {
+    if (Platforms.empty()) {
       std::cerr << "No SYCL platforms found." << std::endl;
       return;
     }
 
     // Choose a platform based on specific criteria (e.g., device type)
-    sycl::platform chosenPlatform;
-    auto devCount = 0;
-    for (const auto &platform : platforms) {
-      auto devices = platform.get_devices();
-      devCount = 0;
-      for (const auto &device : devices) {
-          if (device.is_gpu()) { // Choose GPU, you can use other criteria
-              chosenPlatform = platform;
-              devCount++;
-          }
+    // TODO(Issue #75): We may select the incorrect platform with gpu devices of the wrong vendor/type.
+    auto DevCount = 0;
+    for (const auto& Platform : Platforms) {
+      auto Devices = Platform.get_devices();
+      DevCount = 0;
+      for (const auto& Device : Devices) {
+        if (Device.is_gpu()) { // Choose GPU, you can use other criteria
+          DevCount++;
+        }
       }
     }
 
-    if (devCount) {
-      std::vector<std::thread> gpuThreads;
-      std::atomic<int> initCount = 0;
-      int use_double;
+    if (DevCount) {
+      std::atomic<int> InitCount = 0;
+      int UseDoubleConverted;
 
-      if (useFloat) {
-        use_double = 0;
-      } else if (useDouble) {
-        use_double = 1;
+      if (UseFloat) {
+        UseDoubleConverted = 0;
+      } else if (UseDouble) {
+        UseDoubleConverted = 1;
       } else {
-        use_double = 2;
+        UseDoubleConverted = 2;
       }
 
       firestarter::log::info()
@@ -322,67 +300,58 @@ void OneAPI::initGpus(std::condition_variable &cv,
           << "\n  graphics processor characteristics:";
 
       // use all GPUs if the user gave no information about use_device
-      if (gpus < 0) {
-        gpus = devCount;
+      if (Gpus < 0) {
+        Gpus = DevCount;
       }
-      if (gpus > devCount) {
-        firestarter::log::warn() << "You requested more OneAPI devices than available.";
-        firestarter::log::warn()
-            << "FIRESTARTER will use " << devCount << " of the requested "
-            << gpus << " OneAPI device(s)";
-        gpus = devCount;
+
+      if (Gpus > DevCount) {
+        firestarter::log::warn() << "You requested more OneAPI devices than available. "
+                                    "Maybe you set OneAPI_VISIBLE_DEVICES?";
+        firestarter::log::warn() << "FIRESTARTER will use " << DevCount << " of the requested " << Gpus
+                                 << " OneAPI device(s)";
+        Gpus = DevCount;
       }
 
       {
-        std::lock_guard<std::mutex> lk(waitForInitCvMutex);
+        const std::lock_guard<std::mutex> Lk(GpuThreadsWaitForInitCvMutex);
 
-        for (int i = 0; i < gpus; ++i) {
-          // if there's a GPU in the system without Double Precision support, we
-          // have to correct this.
-          int precision = get_precision(i, use_double);
-          if (precision == -1){
+        for (int I = 0; I < Gpus; ++I) {
+          const auto Precision = getPrecision(I, UseDoubleConverted);
+          if (Precision == -1) {
             firestarter::log::warn() << "This should not have happened. Could not get precision via SYCL.";
           }
+          void (*LoadFunc)(std::condition_variable&, std::mutex&, int, std::atomic<int>&,
+                           const volatile firestarter::LoadThreadWorkType&, unsigned) =
+              Precision ? createLoad<double> : createLoad<float>;
 
-          if (precision) {
-            firestarter::log::trace() << "Starting OneAPI GPU double workload.";
-            std::thread t(create_load<double>, std::ref(waitForInitCv),
-                          std::ref(waitForInitCvMutex), i, std::ref(initCount),
-                          loadVar, (int)matrixSize);
-            gpuThreads.push_back(std::move(t));
-          } else {
-            firestarter::log::trace() << "Starting OneAPI GPU float workload.";
-            std::thread t(create_load<float>, std::ref(waitForInitCv),
-                          std::ref(waitForInitCvMutex), i, std::ref(initCount),
-                          loadVar, (int)matrixSize);
-            gpuThreads.push_back(std::move(t));
-          }
+          std::thread T(LoadFunc, std::ref(GpuThreadsWaitForInitCv), std::ref(GpuThreadsWaitForInitCvMutex), I,
+                        std::ref(InitCount), std::cref(LoadVar), MatrixSize);
+          GpuThreads.emplace_back(std::move(T));
         }
       }
 
       {
-        std::unique_lock<std::mutex> lk(waitForInitCvMutex);
+        std::unique_lock<std::mutex> Lk(GpuThreadsWaitForInitCvMutex);
         // wait for all threads to initialize
-        waitForInitCv.wait(lk, [&] { return initCount == gpus; });
-      }
-
-      // notify that init is done
-      cv.notify_all();
-
-      /* join computation threads */
-      for (auto &t : gpuThreads) {
-        t.join();
+        GpuThreadsWaitForInitCv.wait(Lk, [&] { return InitCount == Gpus; });
       }
     } else {
-      firestarter::log::info()
-          << "    - No OneAPI devices. Just stressing CPU(s). Maybe use "
-             "FIRESTARTER instead of FIRESTARTER_OneAPI?";
-      cv.notify_all();
+      firestarter::log::info() << "    - No OneAPI"
+                               << " devices. Just stressing CPU(s). Maybe use "
+                                  "FIRESTARTER instead of FIRESTARTER_OneAPI?";
     }
   } else {
-    firestarter::log::info()
-        << "    --gpus 0 is set. Just stressing CPU(s). Maybe use "
-           "FIRESTARTER instead of FIRESTARTER_OneAPI?";
-    cv.notify_all();
+    firestarter::log::info() << "    --gpus 0 is set. Just stressing CPU(s). Maybe use "
+                                "FIRESTARTER instead of FIRESTARTER_OneAPI?";
+  }
+
+  // notify that init is done
+  WaitForInitCv.notify_all();
+
+  /* join computation threads */
+  for (auto& Thread : GpuThreads) {
+    Thread.join();
   }
 }
+
+} // namespace firestarter::oneapi
\ No newline at end of file
diff --git a/src/firestarter/Optimizer/Algorithm/NSGA2.cpp b/src/firestarter/Optimizer/Algorithm/NSGA2.cpp
index 7c8a8146..c515e429 100644
--- a/src/firestarter/Optimizer/Algorithm/NSGA2.cpp
+++ b/src/firestarter/Optimizer/Algorithm/NSGA2.cpp
@@ -21,165 +21,161 @@
 
 // This file borrows a lot of code from https://github.com/esa/pagmo2
 
-#include <firestarter/Optimizer/Algorithm/NSGA2.hpp>
-#include <firestarter/Optimizer/Individual.hpp>
-#include <firestarter/Optimizer/Util/MultiObjective.hpp>
+#include "firestarter/Optimizer/Algorithm/NSGA2.hpp"
+#include "firestarter/Logging/Log.hpp"
+#include "firestarter/Optimizer/Individual.hpp"
+#include "firestarter/Optimizer/Util/MultiObjective.hpp"
 
 #include <algorithm>
+#include <iomanip>
+#include <random>
 #include <stdexcept>
 
-using namespace firestarter::optimizer::algorithm;
+namespace firestarter::optimizer::algorithm {
 
-NSGA2::NSGA2(unsigned gen, double cr, double m)
-    : Algorithm(), _gen(gen), _cr(cr), _m(m) {
-  if (cr >= 1. || cr < 0.) {
+NSGA2::NSGA2(unsigned Gen, double Cr, double M)
+    : Gen(Gen)
+    , Cr(Cr)
+    , M(M) {
+  if (Cr >= 1. || Cr < 0.) {
     throw std::invalid_argument("The crossover probability must be in the "
                                 "[0,1[ range, while a value of " +
-                                std::to_string(cr) + " was detected");
+                                std::to_string(Cr) + " was detected");
   }
-  if (m < 0. || m > 1.) {
+  if (M < 0. || M > 1.) {
     throw std::invalid_argument("The mutation probability must be in the [0,1] "
                                 "range, while a value of " +
-                                std::to_string(m) + " was detected");
+                                std::to_string(M) + " was detected");
   }
 }
 
-void NSGA2::checkPopulation(firestarter::optimizer::Population const &pop,
-                            std::size_t populationSize) {
-  const auto &prob = pop.problem();
-
-  if (!prob.isMO()) {
-    throw std::invalid_argument(
-        "NSGA2 is a multiobjective algorithms, while number of objectives is " +
-        std::to_string(prob.getNobjs()));
+void NSGA2::check(firestarter::optimizer::Problem const& Prob, std::size_t PopulationSize) {
+  if (!Prob.isMO()) {
+    throw std::invalid_argument("NSGA2 is a multiobjective algorithms, while number of objectives is " +
+                                std::to_string(Prob.getNobjs()));
   }
 
-  if (populationSize < 5u || (populationSize % 4 != 0u)) {
+  if (PopulationSize < 5U || (PopulationSize % 4 != 0U)) {
     throw std::invalid_argument("for NSGA-II at least 5 individuals in the "
                                 "population are needed and the "
                                 "population size must be a multiple of 4. "
                                 "Detected input population size is: " +
-                                std::to_string(populationSize));
+                                std::to_string(PopulationSize));
   }
 }
 
-firestarter::optimizer::Population
-NSGA2::evolve(firestarter::optimizer::Population &pop) {
-  const auto &prob = pop.problem();
-  const auto bounds = prob.getBounds();
-  auto NP = pop.size();
-  auto fevals0 = prob.getFevals();
+auto NSGA2::evolve(firestarter::optimizer::Population& Pop) -> firestarter::optimizer::Population {
+  const auto& Prob = Pop.problem();
+  const auto Bounds = Prob.getBounds();
+  auto NP = Pop.size();
+  auto Fevals0 = Prob.getFevals();
 
-  this->checkPopulation(
-      const_cast<firestarter::optimizer::Population const &>(pop), NP);
+  this->check(Prob, NP);
 
-  std::random_device rd;
-  std::mt19937 rng(rd());
+  std::random_device Rd;
+  std::mt19937 Rng(Rd());
 
-  std::vector<Individual::size_type> best_idx(NP), shuffle1(NP), shuffle2(NP);
-  Individual::size_type parent1_idx, parent2_idx;
-  std::pair<Individual, Individual> children;
+  std::vector<Individual::size_type> BestIdx(NP);
+  std::vector<Individual::size_type> Shuffle1(NP);
+  std::vector<Individual::size_type> Shuffle2(NP);
+  Individual::size_type Parent1Idx = 0;
+  Individual::size_type Parent2Idx = 0;
+  std::pair<Individual, Individual> Children;
 
-  std::iota(shuffle1.begin(), shuffle1.end(), Individual::size_type(0));
-  std::iota(shuffle2.begin(), shuffle2.end(), Individual::size_type(0));
+  std::iota(Shuffle1.begin(), Shuffle1.end(), static_cast<Individual::size_type>(0));
+  std::iota(Shuffle2.begin(), Shuffle2.end(), static_cast<Individual::size_type>(0));
 
   {
-    std::stringstream ss;
+    std::stringstream Ss;
 
-    ss << std::endl << std::setw(7) << "Gen:" << std::setw(15) << "Fevals:";
-    for (decltype(prob.getNobjs()) i = 0; i < prob.getNobjs(); ++i) {
-      ss << std::setw(15) << "ideal" << std::to_string(i + 1u) << ":";
+    Ss << '\n' << std::setw(7) << "Gen:" << std::setw(15) << "Fevals:";
+    for (decltype(Prob.getNobjs()) I = 0; I < Prob.getNobjs(); ++I) {
+      Ss << std::setw(15) << "ideal" << std::to_string(I + 1U) << ":";
     }
-    firestarter::log::info() << ss.str();
+    firestarter::log::info() << Ss.str();
   }
 
-  for (decltype(_gen) gen = 1u; gen <= _gen; ++gen) {
+  for (auto I = 1U; I <= Gen; ++I) {
     {
       // Print the logs
-      std::vector<double> idealPoint = util::ideal(pop.f());
-      std::stringstream ss;
+      const auto IdealPoint = util::ideal(Pop.f());
+      std::stringstream Ss;
 
-      ss << std::setw(7) << gen << std::setw(15) << prob.getFevals() - fevals0;
-      for (decltype(idealPoint.size()) i = 0; i < idealPoint.size(); ++i) {
-        ss << std::setw(15) << idealPoint[i];
+      Ss << std::setw(7) << I << std::setw(15) << Prob.getFevals() - Fevals0;
+      for (const auto I : IdealPoint) {
+        Ss << std::setw(15) << I;
       }
 
-      firestarter::log::info() << ss.str();
+      firestarter::log::info() << Ss.str();
     }
 
     // At each generation we make a copy of the population into popnew
-    firestarter::optimizer::Population popnew(pop);
+    firestarter::optimizer::Population Popnew(Pop);
 
     // We create some pseudo-random permutation of the poulation indexes
-    std::random_shuffle(shuffle1.begin(), shuffle1.end());
-    std::random_shuffle(shuffle2.begin(), shuffle2.end());
+    std::shuffle(Shuffle1.begin(), Shuffle1.end(), Rng);
+    std::shuffle(Shuffle2.begin(), Shuffle2.end(), Rng);
 
     // We compute crowding distance and non dominated rank for the current
     // population
-    auto fnds_res = util::fast_non_dominated_sorting(pop.f());
-    auto ndf =
-        std::get<0>(fnds_res); // non dominated fronts [[0,3,2],[1,5,6],[4],...]
-    std::vector<double> pop_cd(
-        NP); // crowding distances of the whole population
-    auto ndr =
-        std::get<3>(fnds_res); // non domination rank [0,1,0,0,2,1,1, ... ]
-    for (const auto &front_idxs : ndf) {
-      if (front_idxs.size() ==
-          1u) { // handles the case where the front has collapsed to one point
-        pop_cd[front_idxs[0]] = std::numeric_limits<double>::infinity();
-      } else if (front_idxs.size() == 2u) { // handles the case where the front
+    auto FndsRes = util::fastNonDominatedSorting(Pop.f());
+    auto Ndf = std::get<0>(FndsRes); // non dominated fronts [[0,3,2],[1,5,6],[4],...]
+    std::vector<double> PopCd(NP);   // crowding distances of the whole population
+    auto Ndr = std::get<3>(FndsRes); // non domination rank [0,1,0,0,2,1,1, ... ]
+    for (const auto& FrontIdxs : Ndf) {
+      if (FrontIdxs.size() == 1U) { // handles the case where the front has collapsed to one point
+        PopCd[FrontIdxs[0]] = std::numeric_limits<double>::infinity();
+      } else if (FrontIdxs.size() == 2U) { // handles the case where the front
         // has collapsed to one point
-        pop_cd[front_idxs[0]] = std::numeric_limits<double>::infinity();
-        pop_cd[front_idxs[1]] = std::numeric_limits<double>::infinity();
+        PopCd[FrontIdxs[0]] = std::numeric_limits<double>::infinity();
+        PopCd[FrontIdxs[1]] = std::numeric_limits<double>::infinity();
       } else {
-        std::vector<std::vector<double>> front;
-        for (auto idx : front_idxs) {
-          front.push_back(pop.f()[idx]);
+        std::vector<std::vector<double>> Front;
+        Front.reserve(FrontIdxs.size());
+        for (auto Idx : FrontIdxs) {
+          Front.push_back(Pop.f()[Idx]);
         }
-        auto cd = util::crowding_distance(front);
-        for (decltype(cd.size()) i = 0u; i < cd.size(); ++i) {
-          pop_cd[front_idxs[i]] = cd[i];
+        auto Cd = util::crowdingDistance(Front);
+        for (decltype(Cd.size()) I = 0U; I < Cd.size(); ++I) {
+          PopCd[FrontIdxs[I]] = Cd[I];
         }
       }
     }
 
     // We then loop thorugh all individuals with increment 4 to select two pairs
     // of parents that will each create 2 new offspring
-    for (decltype(NP) i = 0u; i < NP; i += 4) {
+    for (decltype(NP) I = 0U; I < NP; I += 4) {
       // We create two offsprings using the shuffled list 1
-      parent1_idx = util::mo_tournament_selection(shuffle1[i], shuffle1[i + 1],
-                                                  ndr, pop_cd, rng);
-      parent2_idx = util::mo_tournament_selection(
-          shuffle1[i + 2], shuffle1[i + 3], ndr, pop_cd, rng);
-      children = util::sbx_crossover(pop.x()[parent1_idx], pop.x()[parent2_idx],
-                                     _cr, rng);
-      util::polynomial_mutation(children.first, bounds, _m, rng);
-      util::polynomial_mutation(children.second, bounds, _m, rng);
-
-      popnew.append(children.first);
-      popnew.append(children.second);
+      Parent1Idx = util::moTournamentSelection(Shuffle1[I], Shuffle1[I + 1], Ndr, PopCd, Rng);
+      Parent2Idx = util::moTournamentSelection(Shuffle1[I + 2], Shuffle1[I + 3], Ndr, PopCd, Rng);
+      Children = util::sbxCrossover(Pop.x()[Parent1Idx], Pop.x()[Parent2Idx], Cr, Rng);
+      util::polynomialMutation(Children.first, Bounds, M, Rng);
+      util::polynomialMutation(Children.second, Bounds, M, Rng);
+
+      Popnew.append(Children.first);
+      Popnew.append(Children.second);
 
       // We repeat with the shuffled list 2
-      parent1_idx = util::mo_tournament_selection(shuffle2[i], shuffle2[i + 1],
-                                                  ndr, pop_cd, rng);
-      parent2_idx = util::mo_tournament_selection(
-          shuffle2[i + 2], shuffle2[i + 3], ndr, pop_cd, rng);
-      children = util::sbx_crossover(pop.x()[parent1_idx], pop.x()[parent2_idx],
-                                     _cr, rng);
-      util::polynomial_mutation(children.first, bounds, _m, rng);
-      util::polynomial_mutation(children.second, bounds, _m, rng);
-
-      popnew.append(children.first);
-      popnew.append(children.second);
-    } // popnew now contains 2NP individuals
-    // This method returns the sorted N best individuals in the population
+      Parent1Idx = util::moTournamentSelection(Shuffle2[I], Shuffle2[I + 1], Ndr, PopCd, Rng);
+      Parent2Idx = util::moTournamentSelection(Shuffle2[I + 2], Shuffle2[I + 3], Ndr, PopCd, Rng);
+      Children = util::sbxCrossover(Pop.x()[Parent1Idx], Pop.x()[Parent2Idx], Cr, Rng);
+      util::polynomialMutation(Children.first, Bounds, M, Rng);
+      util::polynomialMutation(Children.second, Bounds, M, Rng);
+
+      Popnew.append(Children.first);
+      Popnew.append(Children.second);
+    }
+    // Popnew now contains 2NP individuals
+
+    // Save the best NP individuals in the population
     // according to the crowded comparison operator
-    best_idx = util::select_best_N_mo(popnew.f(), NP);
-    // We insert into the population
-    for (decltype(NP) i = 0; i < NP; ++i) {
-      pop.insert(i, popnew.x()[best_idx[i]], popnew.f()[best_idx[i]]);
+    BestIdx = util::selectBestNMo(Popnew.f(), NP);
+    for (decltype(NP) I = 0; I < NP; ++I) {
+      Pop.insert(I, Popnew.x()[BestIdx[I]], Popnew.f()[BestIdx[I]]);
     }
   }
 
-  return pop;
+  return Pop;
 }
+
+} // namespace firestarter::optimizer::algorithm
\ No newline at end of file
diff --git a/src/firestarter/Optimizer/OptimizerWorker.cpp b/src/firestarter/Optimizer/OptimizerWorker.cpp
index 48819fd5..a82c1fa8 100644
--- a/src/firestarter/Optimizer/OptimizerWorker.cpp
+++ b/src/firestarter/Optimizer/OptimizerWorker.cpp
@@ -19,54 +19,56 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
-#include <firestarter/Optimizer/OptimizerWorker.hpp>
+#include "firestarter/Optimizer/OptimizerWorker.hpp"
+#include "firestarter/Optimizer/Algorithm/NSGA2.hpp"
 
 #include <thread>
+#include <utility>
 
-using namespace firestarter::optimizer;
+namespace firestarter::optimizer {
 
-OptimizerWorker::OptimizerWorker(
-    std::unique_ptr<firestarter::optimizer::Algorithm> &&algorithm,
-    firestarter::optimizer::Population &population,
-    std::string const &optimizationAlgorithm, unsigned individuals,
-    std::chrono::seconds const &preheat)
-    : _algorithm(std::move(algorithm)), _population(population),
-      _optimizationAlgorithm(optimizationAlgorithm), _individuals(individuals),
-      _preheat(preheat) {
-  pthread_create(
-      &this->workerThread, NULL,
-      reinterpret_cast<void *(*)(void *)>(OptimizerWorker::optimizerThread),
-      this);
+OptimizerWorker::OptimizerWorker(std::unique_ptr<firestarter::optimizer::Algorithm>&& Algorithm,
+                                 std::unique_ptr<firestarter::optimizer::Population>&& Population, unsigned Individuals,
+                                 std::chrono::seconds const& Preheat)
+    : Algorithm(std::move(Algorithm))
+    , Population(std::move(Population))
+    , Individuals(Individuals)
+    , Preheat(Preheat) {
+  pthread_create(&this->WorkerThread, nullptr, OptimizerWorker::optimizerThread, this);
 }
 
-void OptimizerWorker::kill() {
+void OptimizerWorker::kill() const {
   // we ignore ESRCH errno if thread already exited
-  pthread_cancel(this->workerThread);
+  pthread_cancel(WorkerThread);
 }
 
-void OptimizerWorker::join() {
+void OptimizerWorker::join() const {
   // we ignore ESRCH errno if thread already exited
-  pthread_join(this->workerThread, NULL);
+  pthread_join(WorkerThread, nullptr);
 }
 
-void *OptimizerWorker::optimizerThread(void *optimizerWorker) {
-  pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
+auto OptimizerWorker::optimizerThread(void* OptimizerWorker) -> void* {
+  // NOLINTBEGIN(cert-pos47-c,concurrency-thread-canceltype-asynchronous)
+  pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, nullptr);
+  // NOLINTEND(cert-pos47-c,concurrency-thread-canceltype-asynchronous)
 
-  auto _this = reinterpret_cast<OptimizerWorker *>(optimizerWorker);
+  auto* This = static_cast<class OptimizerWorker*>(OptimizerWorker);
 
 #ifndef __APPLE__
   pthread_setname_np(pthread_self(), "Optimizer");
 #endif
 
   // heat the cpu before attempting to optimize
-  std::this_thread::sleep_for(_this->_preheat);
+  std::this_thread::sleep_for(This->Preheat);
 
   // For NSGA2 we start with a initial population
-  if (_this->_optimizationAlgorithm == "NSGA2") {
-    _this->_population.generateInitialPopulation(_this->_individuals);
+  if (dynamic_cast<algorithm::NSGA2*>(This->Algorithm.get())) {
+    This->Population->generateInitialPopulation(This->Individuals);
   }
 
-  _this->_algorithm->evolve(_this->_population);
+  This->Algorithm->evolve(*This->Population);
 
-  return NULL;
+  return nullptr;
 }
+
+} // namespace firestarter::optimizer
\ No newline at end of file
diff --git a/src/firestarter/Optimizer/Population.cpp b/src/firestarter/Optimizer/Population.cpp
index 7d3a7e1a..a5a21527 100644
--- a/src/firestarter/Optimizer/Population.cpp
+++ b/src/firestarter/Optimizer/Population.cpp
@@ -19,125 +19,106 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
-#include <firestarter/Logging/Log.hpp>
-#include <firestarter/Optimizer/Population.hpp>
+#include "firestarter/Optimizer/Population.hpp"
+#include "firestarter/Logging/Log.hpp"
+#include "firestarter/Optimizer/History.hpp"
 
-#include <algorithm>
 #include <cassert>
-#include <stdexcept>
+#include <random>
 
-using namespace firestarter::optimizer;
+namespace firestarter::optimizer {
 
-void Population::generateInitialPopulation(std::size_t populationSize) {
-  firestarter::log::trace() << "Generating " << populationSize
-                            << " random individuals for initial population.";
+void Population::generateInitialPopulation(std::size_t PopulationSize) {
+  firestarter::log::trace() << "Generating " << PopulationSize << " random individuals for initial population.";
 
-  auto dims = this->problem().getDims();
-  auto remaining = populationSize;
+  auto Dims = this->problem().getDims();
+  auto Remaining = PopulationSize;
 
-  if (!(populationSize < dims)) {
-    for (decltype(dims) i = 0; i < dims; i++) {
-      Individual vec(dims, 0);
-      vec[i] = 1;
-      this->append(vec);
+  if (!(PopulationSize < Dims)) {
+    for (decltype(Dims) I = 0; I < Dims; I++) {
+      Individual Vec(Dims, 0);
+      Vec[I] = 1;
+      this->append(Vec);
+      Remaining--;
     }
-
-    remaining -= dims;
   } else {
-    firestarter::log::trace()
-        << "Population size (" << std::to_string(populationSize)
-        << ") is less than size of problem dimension (" << std::to_string(dims)
-        << ")";
+    firestarter::log::trace() << "Population size (" << std::to_string(PopulationSize)
+                              << ") is less than size of problem dimension (" << std::to_string(Dims) << ")";
   }
 
-  for (decltype(remaining) i = 0; i < remaining; i++) {
+  for (decltype(Remaining) I = 0; I < Remaining; I++) {
     this->append(this->getRandomIndividual());
   }
 }
 
-std::size_t Population::size() const { return _x.size(); }
+auto Population::size() const -> std::size_t { return X.size(); }
 
-void Population::append(Individual const &ind) {
-  assert(this->problem().getDims() == ind.size());
+void Population::append(Individual const& Ind) {
+  assert(this->problem().getDims() == Ind.size());
 
-  std::map<std::string, firestarter::measurement::Summary> metrics;
+  std::map<std::string, firestarter::measurement::Summary> Metrics;
 
   // check if we already evaluated this individual
-  auto optional_metric = History::find(ind);
-  if (optional_metric.has_value()) {
-    metrics = optional_metric.value();
+  const auto OptionalMetric = History::find(Ind);
+  if (OptionalMetric.has_value()) {
+    Metrics = OptionalMetric.value();
   } else {
-    metrics = this->_problem->metrics(ind);
+    Metrics = this->ProblemPtr->metrics(Ind);
   }
 
-  auto fitness = this->_problem->fitness(metrics);
+  auto Fitness = this->ProblemPtr->fitness(Metrics);
 
-  this->append(ind, fitness);
+  this->append(Ind, Fitness);
 
-  if (!optional_metric.has_value()) {
-    History::append(ind, metrics);
+  if (!OptionalMetric.has_value()) {
+    History::append(Ind, Metrics);
   }
 }
 
-void Population::append(Individual const &ind, std::vector<double> const &fit) {
-  std::stringstream ss;
-  ss << "  - Fitness: ";
-  for (auto const &v : fit) {
-    ss << v << " ";
+void Population::append(Individual const& Ind, std::vector<double> const& Fit) {
+  std::stringstream Ss;
+  Ss << "  - Fitness: ";
+  for (auto const& V : Fit) {
+    Ss << V << " ";
   }
-  firestarter::log::trace() << ss.str();
+  firestarter::log::trace() << Ss.str();
 
-  assert(this->problem().getNobjs() == fit.size());
-  assert(this->problem().getDims() == ind.size());
+  assert(this->problem().getNobjs() == Fit.size());
+  assert(this->problem().getDims() == Ind.size());
 
-  this->_x.push_back(ind);
-  this->_f.push_back(fit);
+  this->X.push_back(Ind);
+  this->F.push_back(Fit);
 }
 
-void Population::insert(std::size_t idx, Individual const &ind,
-                        std::vector<double> const &fit) {
+void Population::insert(std::size_t Idx, Individual const& Ind, std::vector<double> const& Fit) {
   // assert that population is big enough
-  assert(_x.size() > idx);
+  assert(X.size() > Idx);
 
-  _x[idx] = ind;
-  _f[idx] = fit;
+  X[Idx] = Ind;
+  F[Idx] = Fit;
 }
 
-Individual Population::getRandomIndividual() {
-  auto dims = this->problem().getDims();
-  auto const bounds = this->problem().getBounds();
-
-  firestarter::log::trace() << "Generating random individual of size: " << dims;
+auto Population::getRandomIndividual() const -> Individual {
+  auto Dims = this->problem().getDims();
+  auto const Bounds = this->problem().getBounds();
 
-  Individual out(dims);
+  std::random_device Rd;
+  std::mt19937 Rng(Rd());
 
-  for (decltype(dims) i = 0; i < dims; i++) {
-    auto const lb = std::get<0>(bounds[i]);
-    auto const ub = std::get<1>(bounds[i]);
+  firestarter::log::trace() << "Generating random individual of size: " << Dims;
 
-    out[i] = std::uniform_int_distribution<unsigned>(lb, ub)(this->gen);
+  Individual Out(Dims);
 
-    firestarter::log::trace()
-        << "  - " << i << ": [" << lb << "," << ub << "]: " << out[i];
-  }
+  for (decltype(Dims) I = 0; I < Dims; I++) {
+    auto const Lb = std::get<0>(Bounds[I]);
+    auto const Ub = std::get<1>(Bounds[I]);
 
-  return out;
-}
+    Out[I] = std::uniform_int_distribution<unsigned>(Lb, Ub)(Rng);
 
-std::optional<Individual> Population::bestIndividual() const {
-  // return an empty vector if the problem is multi objective, as there is no
-  // single best individual
-  if (this->problem().isMO()) {
-    return {};
+    firestarter::log::trace() << "  - " << I << ": [" << Lb << "," << Ub << "]: " << Out[I];
   }
 
-  // assert that we have individuals
-  assert(this->_x.size() > 0);
-
-  auto best = std::max_element(this->_x.begin(), this->_x.end(),
-                               [](auto a, auto b) { return a < b; });
-
-  assert(best != this->_x.end());
-
-  return *best;
+  return Out;
 }
+
+} // namespace firestarter::optimizer
\ No newline at end of file
diff --git a/src/firestarter/Optimizer/Util/MultiObjective.cpp b/src/firestarter/Optimizer/Util/MultiObjective.cpp
index 2c87ba2f..7cae260a 100644
--- a/src/firestarter/Optimizer/Util/MultiObjective.cpp
+++ b/src/firestarter/Optimizer/Util/MultiObjective.cpp
@@ -21,7 +21,7 @@
 
 // This file borrows a lot of code from https://github.com/esa/pagmo2
 
-#include <firestarter/Optimizer/Util/MultiObjective.hpp>
+#include "firestarter/Optimizer/Util/MultiObjective.hpp"
 
 #include <algorithm>
 #include <stdexcept>
@@ -32,35 +32,30 @@ namespace firestarter::optimizer::util {
 // Less than compares floating point types placing nans after inf or before -inf
 // It is a useful function when calling e.g. std::sort to guarantee a weak
 // strict ordering and avoid an undefined behaviour
-bool less_than_f(double a, double b) {
-  if (!std::isnan(a)) {
-    if (!std::isnan(b))
-      return a < b; // a < b
-    else
-      return true; // a < nan
-  } else {
-    if (!std::isnan(b))
-      return false; // nan < b
-    else
-      return false; // nan < nan
+auto lessThanF(double A, double B) -> bool {
+  if (!std::isnan(A)) {
+    if (!std::isnan(B)) {
+      return A < B; // a < b
+    }
+    return true; // a < nan
   }
+  // nan < b or nan < nan
+  return false;
 }
 
 // Greater than compares floating point types placing nans after inf or before
 // -inf It is a useful function when calling e.g. std::sort to guarantee a weak
 // strict ordering and avoid an undefined behaviour
-bool greater_than_f(double a, double b) {
-  if (!std::isnan(a)) {
-    if (!std::isnan(b))
-      return a > b; // a > b
-    else
-      return false; // a > nan
-  } else {
-    if (!std::isnan(b))
-      return true; // nan > b
-    else
-      return false; // nan > nan
+auto greaterThanF(double A, double B) -> bool {
+  if (!std::isnan(A)) {
+    if (!std::isnan(B)) {
+      return A > B; // a > b
+    }
+    return false; // a > nan
   }
+  // nan > b -> true
+  // nan > nan -> false
+  return !std::isnan(B);
 }
 
 /// Pareto-dominance
@@ -81,23 +76,22 @@ bool greater_than_f(double a, double b) {
  * @throws std::invalid_argument if the dimensions of the two objectives are
  * different
  */
-bool pareto_dominance(const std::vector<double> &obj1,
-                      const std::vector<double> &obj2) {
-  if (obj1.size() != obj2.size()) {
+auto paretoDominance(const std::vector<double>& Obj1, const std::vector<double>& Obj2) -> bool {
+  if (Obj1.size() != Obj2.size()) {
     throw std::invalid_argument(
-        "Different number of objectives found in input fitnesses: " +
-        std::to_string(obj1.size()) + " and " + std::to_string(obj2.size()) +
-        ". I cannot define dominance");
+        "Different number of objectives found in input fitnesses: " + std::to_string(Obj1.size()) + " and " +
+        std::to_string(Obj2.size()) + ". I cannot define dominance");
   }
-  bool found_strictly_dominating_dimension = false;
-  for (decltype(obj1.size()) i = 0u; i < obj1.size(); ++i) {
-    if (greater_than_f(obj2[i], obj1[i])) {
+  bool FoundStrictlyDominatingDimension = false;
+  for (decltype(Obj1.size()) I = 0U; I < Obj1.size(); ++I) {
+    if (greaterThanF(Obj2[I], Obj1[I])) {
       return false;
-    } else if (less_than_f(obj2[i], obj1[i])) {
-      found_strictly_dominating_dimension = true;
+    }
+    if (lessThanF(Obj2[I], Obj1[I])) {
+      FoundStrictlyDominatingDimension = true;
     }
   }
-  return found_strictly_dominating_dimension;
+  return FoundStrictlyDominatingDimension;
 }
 
 /// Fast non dominated sorting
@@ -130,67 +124,63 @@ bool pareto_dominance(const std::vector<double> &obj1,
  *
  * @throws std::invalid_argument If the size of \p points is not at least 2
  */
-std::tuple<std::vector<std::vector<std::size_t>>,
-           std::vector<std::vector<std::size_t>>, std::vector<std::size_t>,
-           std::vector<std::size_t>>
-fast_non_dominated_sorting(const std::vector<std::vector<double>> &points) {
-  auto N = points.size();
+auto fastNonDominatedSorting(const std::vector<std::vector<double>>& Points)
+    -> std::tuple<std::vector<std::vector<std::size_t>>, std::vector<std::vector<std::size_t>>,
+                  std::vector<std::size_t>, std::vector<std::size_t>> {
+  auto N = Points.size();
   // We make sure to have two points at least (one could also be allowed)
-  if (N < 2u) {
-    throw std::invalid_argument(
-        "At least two points are needed for fast_non_dominated_sorting: " +
-        std::to_string(N) + " detected.");
+  if (N < 2U) {
+    throw std::invalid_argument("At least two points are needed for fast_non_dominated_sorting: " + std::to_string(N) +
+                                " detected.");
   }
   // Initialize the return values
-  std::vector<std::vector<std::size_t>> non_dom_fronts(1u);
-  std::vector<std::vector<std::size_t>> dom_list(N);
-  std::vector<std::size_t> dom_count(N);
-  std::vector<std::size_t> non_dom_rank(N);
+  std::vector<std::vector<std::size_t>> NonDomFronts(1U);
+  std::vector<std::vector<std::size_t>> DomList(N);
+  std::vector<std::size_t> DomCount(N);
+  std::vector<std::size_t> NonDomRank(N);
 
   // Start the fast non dominated sort algorithm
-  for (decltype(N) i = 0u; i < N; ++i) {
-    dom_list[i].clear();
-    dom_count[i] = 0u;
-    for (decltype(N) j = 0u; j < i; ++j) {
-      if (pareto_dominance(points[i], points[j])) {
-        dom_list[i].push_back(j);
-        ++dom_count[j];
-      } else if (pareto_dominance(points[j], points[i])) {
-        dom_list[j].push_back(i);
-        ++dom_count[i];
+  for (decltype(N) I = 0U; I < N; ++I) {
+    DomList[I].clear();
+    DomCount[I] = 0U;
+    for (decltype(N) J = 0U; J < I; ++J) {
+      if (paretoDominance(Points[I], Points[J])) {
+        DomList[I].push_back(J);
+        ++DomCount[J];
+      } else if (paretoDominance(Points[J], Points[I])) {
+        DomList[J].push_back(I);
+        ++DomCount[I];
       }
     }
   }
-  for (decltype(N) i = 0u; i < N; ++i) {
-    if (dom_count[i] == 0u) {
-      non_dom_rank[i] = 0u;
-      non_dom_fronts[0].push_back(i);
+  for (decltype(N) I = 0U; I < N; ++I) {
+    if (DomCount[I] == 0U) {
+      NonDomRank[I] = 0U;
+      NonDomFronts[0].push_back(I);
     }
   }
   // we copy dom_count as we want to output its value at this point
-  auto dom_count_copy(dom_count);
-  auto current_front = non_dom_fronts[0];
-  std::vector<std::vector<std::size_t>>::size_type front_counter(0u);
-  while (current_front.size() != 0u) {
-    std::vector<std::size_t> next_front;
-    for (decltype(current_front.size()) p = 0u; p < current_front.size(); ++p) {
-      for (decltype(dom_list[current_front[p]].size()) q = 0u;
-           q < dom_list[current_front[p]].size(); ++q) {
-        --dom_count_copy[dom_list[current_front[p]][q]];
-        if (dom_count_copy[dom_list[current_front[p]][q]] == 0u) {
-          non_dom_rank[dom_list[current_front[p]][q]] = front_counter + 1u;
-          next_front.push_back(dom_list[current_front[p]][q]);
+  auto DomCountCopy(DomCount);
+  auto CurrentFront = NonDomFronts[0];
+  std::vector<std::vector<std::size_t>>::size_type FrontCounter(0U);
+  while (!CurrentFront.empty()) {
+    std::vector<std::size_t> NextFront;
+    for (const auto& P : CurrentFront) {
+      for (const auto& Q : DomList[P]) {
+        --DomCountCopy[Q];
+        if (DomCountCopy[Q] == 0U) {
+          NonDomRank[Q] = FrontCounter + 1U;
+          NextFront.push_back(Q);
         }
       }
     }
-    ++front_counter;
-    current_front = next_front;
-    if (current_front.size() != 0u) {
-      non_dom_fronts.push_back(current_front);
+    ++FrontCounter;
+    CurrentFront = NextFront;
+    if (!CurrentFront.empty()) {
+      NonDomFronts.push_back(CurrentFront);
     }
   }
-  return std::make_tuple(std::move(non_dom_fronts), std::move(dom_list),
-                         std::move(dom_count), std::move(non_dom_rank));
+  return std::make_tuple(std::move(NonDomFronts), std::move(DomList), std::move(DomCount), std::move(NonDomRank));
 }
 
 /// Crowding distance
@@ -218,69 +208,64 @@ fast_non_dominated_sorting(const std::vector<std::vector<double>> &points) {
  * @throws std::invalid_argument If points in \p non_dom_front do not all have
  * the same dimensionality
  */
-std::vector<double>
-crowding_distance(const std::vector<std::vector<double>> &non_dom_front) {
-  auto N = non_dom_front.size();
+auto crowdingDistance(const std::vector<std::vector<double>>& NonDomFront) -> std::vector<double> {
+  auto N = NonDomFront.size();
   // We make sure to have two points at least
-  if (N < 2u) {
-    throw std::invalid_argument(
-        "A non dominated front must contain at least two points: " +
-        std::to_string(N) + " detected.");
+  if (N < 2U) {
+    throw std::invalid_argument("A non dominated front must contain at least two points: " + std::to_string(N) +
+                                " detected.");
   }
-  auto M = non_dom_front[0].size();
+  auto M = NonDomFront[0].size();
   // We make sure the first point of the input non dominated front contains at
   // least two objectives
-  if (M < 2u) {
+  if (M < 2U) {
     throw std::invalid_argument("Points in the non dominated front must "
                                 "contain at least two objectives: " +
                                 std::to_string(M) + " detected.");
   }
   // We make sure all points contain the same number of objectives
-  if (!std::all_of(
-          non_dom_front.begin(), non_dom_front.end(),
-          [M](const std::vector<double> &item) { return item.size() == M; })) {
+  if (!std::all_of(NonDomFront.begin(), NonDomFront.end(),
+                   [M](const std::vector<double>& Item) { return Item.size() == M; })) {
     throw std::invalid_argument("A non dominated front must contain points of "
                                 "uniform dimensionality. Some "
                                 "different sizes were instead detected.");
   }
-  std::vector<std::size_t> indexes(N);
-  std::iota(indexes.begin(), indexes.end(), std::size_t(0u));
-  std::vector<double> retval(N, 0.);
-  for (decltype(M) i = 0u; i < M; ++i) {
-    std::sort(indexes.begin(), indexes.end(),
-              [i, &non_dom_front](std::size_t idx1, std::size_t idx2) {
-                return less_than_f(non_dom_front[idx1][i],
-                                   non_dom_front[idx2][i]);
-              });
-    retval[indexes[0]] = std::numeric_limits<double>::infinity();
-    retval[indexes[N - 1u]] = std::numeric_limits<double>::infinity();
-    double df =
-        non_dom_front[indexes[N - 1u]][i] - non_dom_front[indexes[0]][i];
-    for (decltype(N - 2u) j = 1u; j < N - 1u; ++j) {
-      retval[indexes[j]] += (non_dom_front[indexes[j + 1u]][i] -
-                             non_dom_front[indexes[j - 1u]][i]) /
-                            df;
+  std::vector<std::size_t> Indexes(N);
+  std::iota(Indexes.begin(), Indexes.end(), static_cast<std::size_t>(0U));
+  std::vector<double> Retval(N, 0.);
+  for (decltype(M) I = 0U; I < M; ++I) {
+    std::sort(Indexes.begin(), Indexes.end(), [I, &NonDomFront](std::size_t Idx1, std::size_t Idx2) {
+      return lessThanF(NonDomFront[Idx1][I], NonDomFront[Idx2][I]);
+    });
+    Retval[Indexes[0]] = std::numeric_limits<double>::infinity();
+    Retval[Indexes[N - 1U]] = std::numeric_limits<double>::infinity();
+    const double Df = NonDomFront[Indexes[N - 1U]][I] - NonDomFront[Indexes[0]][I];
+    for (decltype(N - 2U) J = 1U; J < N - 1U; ++J) {
+      Retval[Indexes[J]] += (NonDomFront[Indexes[J + 1U]][I] - NonDomFront[Indexes[J - 1U]][I]) / Df;
     }
   }
-  return retval;
+  return Retval;
 }
 
 // Multi-objective tournament selection. Requires all sizes to be consistent.
 // Does not check if input is well formed.
-std::vector<double>::size_type mo_tournament_selection(
-    std::vector<double>::size_type idx1, std::vector<double>::size_type idx2,
-    const std::vector<std::vector<double>::size_type> &non_domination_rank,
-    const std::vector<double> &crowding_d, std::mt19937 &mt) {
-  if (non_domination_rank[idx1] < non_domination_rank[idx2])
-    return idx1;
-  if (non_domination_rank[idx1] > non_domination_rank[idx2])
-    return idx2;
-  if (crowding_d[idx1] > crowding_d[idx2])
-    return idx1;
-  if (crowding_d[idx1] < crowding_d[idx2])
-    return idx2;
-  std::uniform_real_distribution<> drng(0., 1.);
-  return ((drng(mt) < 0.5) ? idx1 : idx2);
+auto moTournamentSelection(std::vector<double>::size_type Idx1, std::vector<double>::size_type Idx2,
+                           const std::vector<std::vector<double>::size_type>& NonDominationRank,
+                           const std::vector<double>& CrowdingD, std::mt19937& Mt) -> std::vector<double>::size_type {
+  if (NonDominationRank[Idx1] < NonDominationRank[Idx2]) {
+    return Idx1;
+  }
+  if (NonDominationRank[Idx1] > NonDominationRank[Idx2]) {
+    return Idx2;
+  }
+  if (CrowdingD[Idx1] > CrowdingD[Idx2]) {
+    return Idx1;
+  }
+  if (CrowdingD[Idx1] < CrowdingD[Idx2]) {
+    return Idx2;
+  }
+  std::uniform_real_distribution<> Drng(0., 1.);
+  return ((Drng(Mt) < 0.5) ? Idx1 : Idx2);
 }
 
 // Implementation of the binary crossover.
@@ -288,66 +273,56 @@ std::vector<double>::size_type mo_tournament_selection(
 // otherwise Requires dimensions of the parent and bounds to be equal -> out of
 // bound reads. nix is the integer dimension (integer alleles assumed at the end
 // of the chromosome)
-std::pair<firestarter::optimizer::Individual,
-          firestarter::optimizer::Individual>
-sbx_crossover(const firestarter::optimizer::Individual &parent1,
-              const firestarter::optimizer::Individual &parent2,
-              const double p_cr, std::mt19937 &mt) {
+auto sbxCrossover(const firestarter::optimizer::Individual& Parent1, const firestarter::optimizer::Individual& Parent2,
+                  const double PCr, std::mt19937& Mt)
+    -> std::pair<firestarter::optimizer::Individual, firestarter::optimizer::Individual> {
   // Decision vector dimensions
-  auto nix = parent1.size();
-  firestarter::optimizer::Individual::size_type site1, site2;
+  auto Nix = Parent1.size();
   // Initialize the child decision vectors
-  firestarter::optimizer::Individual child1 = parent1;
-  firestarter::optimizer::Individual child2 = parent2;
+  firestarter::optimizer::Individual Child1 = Parent1;
+  firestarter::optimizer::Individual Child2 = Parent2;
   // Random distributions
-  std::uniform_real_distribution<> drng(0.,
+  std::uniform_real_distribution<> Drng(0.,
                                         1.); // to generate a number in [0, 1)
 
   // This implements a Simulated Binary Crossover SBX
-  if (drng(mt) <
-      p_cr) { // No crossever at all will happen with probability p_cr
+  if (Drng(Mt) < PCr) { // Crossover is performed with probability PCr, otherwise the parents are copied
     // This implements two-points crossover and applies it to the integer part
     // of the chromosome.
-    if (nix > 0u) {
-      std::uniform_int_distribution<
-          firestarter::optimizer::Individual::size_type>
-          ra_num(0, nix - 1u);
-      site1 = ra_num(mt);
-      site2 = ra_num(mt);
-      if (site1 > site2) {
-        std::swap(site1, site2);
+    if (Nix > 0U) {
+      std::uniform_int_distribution<firestarter::optimizer::Individual::size_type> RaNum(0, Nix - 1U);
+      auto Site1 = RaNum(Mt);
+      auto Site2 = RaNum(Mt);
+      if (Site1 > Site2) {
+        std::swap(Site1, Site2);
       }
-      for (decltype(site2) j = site1; j <= site2; ++j) {
-        child1[j] = parent2[j];
-        child2[j] = parent1[j];
+      for (decltype(Site2) J = Site1; J <= Site2; ++J) {
+        Child1[J] = Parent2[J];
+        Child2[J] = Parent1[J];
       }
     }
   }
-  return std::make_pair(std::move(child1), std::move(child2));
+  return std::make_pair(std::move(Child1), std::move(Child2));
 }
 
 // Performs polynomial mutation. Requires all sizes to be consistent. Does not
 // check if input is well formed. p_m is the mutation probability
-void polynomial_mutation(
-    firestarter::optimizer::Individual &child,
-    const std::vector<std::tuple<unsigned, unsigned>> &bounds, const double p_m,
-    std::mt19937 &mt) {
+void polynomialMutation(firestarter::optimizer::Individual& Child,
+                        const std::vector<std::tuple<unsigned, unsigned>>& Bounds, const double PM, std::mt19937& Mt) {
   // Decision vector dimensions
-  auto nix = child.size();
+  auto Nix = Child.size();
   // Random distributions
-  std::uniform_real_distribution<> drng(0.,
+  std::uniform_real_distribution<> Drng(0.,
                                         1.); // to generate a number in [0, 1)
   // This implements the integer mutation for an individual
-  for (decltype(nix) j = 0; j < nix; ++j) {
-    if (drng(mt) < p_m) {
+  for (decltype(Nix) J = 0; J < Nix; ++J) {
+    if (Drng(Mt) < PM) {
       // We need to draw a random integer in [lb, ub].
-      auto lb = std::get<0>(bounds[j]);
-      auto ub = std::get<1>(bounds[j]);
-      std::uniform_int_distribution<
-          firestarter::optimizer::Individual::size_type>
-          dist(lb, ub);
-      auto mutated = dist(mt);
-      child[j] = mutated;
+      auto Lb = std::get<0>(Bounds[J]);
+      auto Ub = std::get<1>(Bounds[J]);
+      std::uniform_int_distribution<firestarter::optimizer::Individual::size_type> Dist(Lb, Ub);
+      auto Mutated = Dist(Mt);
+      Child[J] = Mutated;
     }
   }
 }
@@ -384,61 +359,58 @@ void polynomial_mutation(
  * @throws unspecified all exceptions thrown by
  * pagmo::fast_non_dominated_sorting and pagmo::crowding_distance
  */
-std::vector<std::size_t>
-select_best_N_mo(const std::vector<std::vector<double>> &input_f,
-                 std::size_t N) {
-  if (N == 0u) { // corner case
+auto selectBestNMo(const std::vector<std::vector<double>>& InputF, std::size_t N) -> std::vector<std::size_t> {
+  if (N == 0U) { // corner case
     return {};
   }
-  if (input_f.size() == 0u) { // corner case
+  if (InputF.empty()) { // corner case
     return {};
   }
-  if (input_f.size() == 1u) { // corner case
-    return {0u};
+  if (InputF.size() == 1U) { // corner case
+    return {0U};
   }
-  if (N >= input_f.size()) { // corner case
-    std::vector<std::size_t> retval(input_f.size());
-    std::iota(retval.begin(), retval.end(), std::size_t(0u));
-    return retval;
+  if (N >= InputF.size()) { // corner case
+    std::vector<std::size_t> Retval(InputF.size());
+    std::iota(Retval.begin(), Retval.end(), static_cast<std::size_t>(0U));
+    return Retval;
   }
-  std::vector<std::size_t> retval;
-  std::vector<std::size_t>::size_type front_id(0u);
+  std::vector<std::size_t> Retval;
+  std::vector<std::size_t>::size_type FrontId(0U);
   // Run fast-non-dominated sorting
-  auto tuple = fast_non_dominated_sorting(input_f);
+  auto Tuple = fastNonDominatedSorting(InputF);
   // Insert all non dominated fronts if not more than N
-  for (const auto &front : std::get<0>(tuple)) {
-    if (retval.size() + front.size() <= N) {
-      for (auto i : front) {
-        retval.push_back(i);
+  for (const auto& Front : std::get<0>(Tuple)) {
+    if (Retval.size() + Front.size() <= N) {
+      for (auto I : Front) {
+        Retval.push_back(I);
       }
-      if (retval.size() == N) {
-        return retval;
+      if (Retval.size() == N) {
+        return Retval;
       }
-      ++front_id;
+      ++FrontId;
     } else {
       break;
     }
   }
-  auto front = std::get<0>(tuple)[front_id];
-  std::vector<std::vector<double>> non_dom_fits(front.size());
+  auto Front = std::get<0>(Tuple)[FrontId];
+  std::vector<std::vector<double>> NonDomFits(Front.size());
   // Run crowding distance for the front
-  for (decltype(front.size()) i = 0u; i < front.size(); ++i) {
-    non_dom_fits[i] = input_f[front[i]];
+  for (decltype(Front.size()) I = 0U; I < Front.size(); ++I) {
+    NonDomFits[I] = InputF[Front[I]];
   }
-  std::vector<double> cds(crowding_distance(non_dom_fits));
+  std::vector<double> Cds(crowdingDistance(NonDomFits));
   // We now have front and crowding distance, we sort the front w.r.t. the
   // crowding
-  std::vector<std::size_t> idxs(front.size());
-  std::iota(idxs.begin(), idxs.end(), std::size_t(0u));
-  std::sort(idxs.begin(), idxs.end(),
-            [&cds](std::size_t idx1, std::size_t idx2) {
-              return greater_than_f(cds[idx1], cds[idx2]);
-            }); // Descending order1
-  auto remaining = N - retval.size();
-  for (decltype(remaining) i = 0u; i < remaining; ++i) {
-    retval.push_back(front[idxs[i]]);
+  std::vector<std::size_t> Idxs(Front.size());
+  std::iota(Idxs.begin(), Idxs.end(), static_cast<std::size_t>(0U));
+  std::sort(Idxs.begin(), Idxs.end(), [&Cds](std::size_t Idx1, std::size_t Idx2) {
+    return greaterThanF(Cds[Idx1], Cds[Idx2]);
+  }); // Descending order
+  auto Remaining = N - Retval.size();
+  for (decltype(Remaining) I = 0U; I < Remaining; ++I) {
+    Retval.push_back(Front[Idxs[I]]);
   }
-  return retval;
+  return Retval;
 }
 
 /// Ideal point
@@ -458,31 +430,30 @@ select_best_N_mo(const std::vector<std::vector<double>> &input_f,
  * @throws std::invalid_argument if the input objective vectors are not all of
  * the same size
  */
-std::vector<double> ideal(const std::vector<std::vector<double>> &points) {
+auto ideal(const std::vector<std::vector<double>>& Points) -> std::vector<double> {
   // Corner case
-  if (points.size() == 0u) {
+  if (Points.empty()) {
     return {};
   }
 
   // Sanity checks
-  auto M = points[0].size();
-  for (const auto &f : points) {
-    if (f.size() != M) {
+  auto M = Points[0].size();
+  for (const auto& F : Points) {
+    if (F.size() != M) {
       throw std::invalid_argument("Input vector of objectives must contain "
                                   "fitness vector of equal dimension " +
                                   std::to_string(M));
     }
   }
   // Actual algorithm
-  std::vector<double> retval(M);
-  for (decltype(M) i = 0u; i < M; ++i) {
-    retval[i] = (*std::min_element(
-        points.begin(), points.end(),
-        [i](const std::vector<double> &f1, const std::vector<double> &f2) {
-          return util::greater_than_f(f1[i], f2[i]);
-        }))[i];
+  std::vector<double> Retval(M);
+  for (decltype(M) I = 0U; I < M; ++I) {
+    Retval[I] = (*std::min_element(Points.begin(), Points.end(),
+                                   [I](const std::vector<double>& F1, const std::vector<double>& F2) {
+                                     return util::greaterThanF(F1[I], F2[I]);
+                                   }))[I];
   }
-  return retval;
+  return Retval;
 }
 
 } // namespace firestarter::optimizer::util
diff --git a/include/firestarter/Measurement/Metric/Perf.h b/src/firestarter/SafeExit.cpp
similarity index 75%
rename from include/firestarter/Measurement/Metric/Perf.h
rename to src/firestarter/SafeExit.cpp
index 72221cca..4aed7a50 100644
--- a/include/firestarter/Measurement/Metric/Perf.h
+++ b/src/firestarter/SafeExit.cpp
@@ -19,10 +19,16 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
-#pragma once
+#include "firestarter/SafeExit.hpp"
 
-#include <firestarter/Measurement/MetricInterface.h>
+#include <mutex>
 
-extern metric_interface_t perf_ipc_metric;
+[[noreturn]] void firestarter::safeExit(const int Status) {
+  // This mutex is static and therefore shared by all calls to safeExit, including calls from different threads
+  static std::mutex ExitMutex;
 
-extern metric_interface_t perf_freq_metric;
+  ExitMutex.lock();
+
+  // NOLINTNEXTLINE(concurrency-mt-unsafe)
+  std::exit(Status);
+}
\ No newline at end of file
diff --git a/src/firestarter/WatchdogWorker.cpp b/src/firestarter/WatchdogWorker.cpp
index 6a3f6b95..b5a73787 100644
--- a/src/firestarter/WatchdogWorker.cpp
+++ b/src/firestarter/WatchdogWorker.cpp
@@ -19,7 +19,7 @@
  * Contact: daniel.hackenberg@tu-dresden.de
  *****************************************************************************/
 
-#include <firestarter/Firestarter.hpp>
+#include "firestarter/Firestarter.hpp"
 
 #include <cerrno>
 #include <csignal>
@@ -28,11 +28,10 @@
 #include <SCOREP_User.h>
 #endif
 
-using namespace firestarter;
+namespace firestarter {
 
-int Firestarter::watchdogWorker(std::chrono::microseconds period,
-                                std::chrono::microseconds load,
-                                std::chrono::seconds timeout) {
+void Firestarter::watchdogWorker(std::chrono::microseconds Period, std::chrono::microseconds Load,
+                                 std::chrono::seconds Timeout) {
 
   using clock = std::chrono::high_resolution_clock;
   using nsec = std::chrono::nanoseconds;
@@ -40,56 +39,53 @@ int Firestarter::watchdogWorker(std::chrono::microseconds period,
   using sec = std::chrono::seconds;
 
   // calculate idle time to be the rest of the period
-  auto idle = period - load;
+  auto Idle = Period - Load;
 
   // elapsed time
-  nsec time(0);
+  nsec Time(0);
 
   // do no enter the loop if we do not have to set the load level periodically,
   // at 0 or 100 load.
-  if (period > usec::zero()) {
+  if (Period > usec::zero()) {
     // this first time is critical as the period will be alligend from this
     // point
-    std::chrono::time_point<clock> startTime = clock::now();
+    const auto StartTime = clock::now();
 
     // this loop will set the load level periodically.
     for (;;) {
-      std::chrono::time_point<clock> currentTime = clock::now();
+      const auto CurrentTime = clock::now();
 
       // get the time already advanced in the current timeslice
       // this can happen if a load function does not terminates just on time
-      nsec advance = std::chrono::duration_cast<nsec>(currentTime - startTime) %
-                     std::chrono::duration_cast<nsec>(period);
+      const auto Advance =
+          std::chrono::duration_cast<nsec>(CurrentTime - StartTime) % std::chrono::duration_cast<nsec>(Period);
 
       // subtract the advaned time from our timeslice by spilting it based on
       // the load level
-      nsec load_reduction =
-          (std::chrono::duration_cast<nsec>(load).count() * advance) /
-          std::chrono::duration_cast<nsec>(period).count();
-      nsec idle_reduction = advance - load_reduction;
+      const auto LoadReduction =
+          (std::chrono::duration_cast<nsec>(Load).count() * Advance) / std::chrono::duration_cast<nsec>(Period).count();
+      const auto IdleReduction = Advance - LoadReduction;
 
       // signal high load level
-      this->setLoad(LOAD_HIGH);
+      setLoad(LoadThreadWorkType::LoadHigh);
 
       // calculate values for nanosleep
-      nsec load_nsec = load - load_reduction;
+      const auto LoadNsec = Load - LoadReduction;
 
       // wait for time to be ellapsed with high load
 #ifdef ENABLE_VTRACING
       VT_USER_START("WD_HIGH");
 #endif
 #ifdef ENABLE_SCOREP
-      SCOREP_USER_REGION_BY_NAME_BEGIN("WD_HIGH",
-                                       SCOREP_USER_REGION_TYPE_COMMON);
+      SCOREP_USER_REGION_BY_NAME_BEGIN("WD_HIGH", SCOREP_USER_REGION_TYPE_COMMON);
 #endif
       {
-        std::unique_lock<std::mutex> lk(this->_watchdogTerminateMutex);
+        std::unique_lock<std::mutex> Lk(WatchdogTerminateMutex);
         // abort waiting if we get the interrupt signal
-        this->_watchdogTerminateAlert.wait_for(
-            lk, load_nsec, [this]() { return this->_watchdog_terminate; });
+        WatchdogTerminateAlert.wait_for(Lk, LoadNsec, []() { return WatchdogTerminate; });
         // terminate on interrupt
-        if (this->_watchdog_terminate) {
-          return EXIT_SUCCESS;
+        if (WatchdogTerminate) {
+          return;
         }
       }
 #ifdef ENABLE_VTRACING
@@ -100,27 +96,25 @@ int Firestarter::watchdogWorker(std::chrono::microseconds period,
 #endif
 
       // signal low load
-      this->setLoad(LOAD_LOW);
+      setLoad(LoadThreadWorkType::LoadLow);
 
       // calculate values for nanosleep
-      nsec idle_nsec = idle - idle_reduction;
+      const auto IdleNsec = Idle - IdleReduction;
 
       // wait for time to be ellapsed with low load
 #ifdef ENABLE_VTRACING
       VT_USER_START("WD_LOW");
 #endif
 #ifdef ENABLE_SCOREP
-      SCOREP_USER_REGION_BY_NAME_BEGIN("WD_LOW",
-                                       SCOREP_USER_REGION_TYPE_COMMON);
+      SCOREP_USER_REGION_BY_NAME_BEGIN("WD_LOW", SCOREP_USER_REGION_TYPE_COMMON);
 #endif
       {
-        std::unique_lock<std::mutex> lk(this->_watchdogTerminateMutex);
+        std::unique_lock<std::mutex> Lk(WatchdogTerminateMutex);
         // abort waiting if we get the interrupt signal
-        this->_watchdogTerminateAlert.wait_for(
-            lk, idle_nsec, [this]() { return this->_watchdog_terminate; });
+        WatchdogTerminateAlert.wait_for(Lk, IdleNsec, []() { return WatchdogTerminate; });
         // terminate on interrupt
-        if (this->_watchdog_terminate) {
-          return EXIT_SUCCESS;
+        if (WatchdogTerminate) {
+          return;
         }
       }
 #ifdef ENABLE_VTRACING
@@ -131,16 +125,15 @@ int Firestarter::watchdogWorker(std::chrono::microseconds period,
 #endif
 
       // increment elapsed time
-      time += period;
+      Time += Period;
 
       // exit when termination signal is received or timeout is reached
       {
-        std::lock_guard<std::mutex> lk(this->_watchdogTerminateMutex);
-        if (this->_watchdog_terminate ||
-            (timeout > sec::zero() && (time > timeout))) {
-          this->setLoad(LOAD_STOP);
+        const std::lock_guard<std::mutex> Lk(WatchdogTerminateMutex);
+        if (WatchdogTerminate || (Timeout > sec::zero() && (Time > Timeout))) {
+          setLoad(LoadThreadWorkType::LoadStop);
 
-          return EXIT_SUCCESS;
+          return;
         }
       }
     }
@@ -148,18 +141,15 @@ int Firestarter::watchdogWorker(std::chrono::microseconds period,
 
   // if timeout is set, sleep for this time and stop execution.
   // else return and wait for sigterm handler to request threads to stop.
-  if (timeout > sec::zero()) {
+  if (Timeout > sec::zero()) {
     {
-      std::unique_lock<std::mutex> lk(Firestarter::_watchdogTerminateMutex);
+      std::unique_lock<std::mutex> Lk(Firestarter::WatchdogTerminateMutex);
       // abort waiting if we get the interrupt signal
-      Firestarter::_watchdogTerminateAlert.wait_for(
-          lk, timeout, []() { return Firestarter::_watchdog_terminate; });
+      Firestarter::WatchdogTerminateAlert.wait_for(Lk, Timeout, []() { return WatchdogTerminate; });
     }
 
-    this->setLoad(LOAD_STOP);
-
-    return EXIT_SUCCESS;
+    setLoad(LoadThreadWorkType::LoadStop);
   }
-
-  return EXIT_SUCCESS;
 }
+
+} // namespace firestarter
\ No newline at end of file
diff --git a/tooling/clang-tidy.py b/tooling/clang-tidy.py
new file mode 100755
index 00000000..b8ce7b5c
--- /dev/null
+++ b/tooling/clang-tidy.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import json
+from pathlib import Path
+import subprocess
+import click
+import multiprocessing
+import sys
+import typing
+import random
+from functools import partial
+
+# Find all source files from the compile commands database that are located below a specific directory.
+# Paths are compared component-wise, so with sources_dir `.../src` a sibling like `.../src_old/a.cpp` is not matched.
+def find_source_files_from_compile_commands(compile_commands_path: Path, sources_dir: Path) -> typing.List[Path]:
+    with open(compile_commands_path, 'r') as fp:
+        compile_commands = json.load(fp)
+    # Matching entries are returned in database order, exactly as stored in the 'file' field (i.e. as strings).
+    return [entry['file'] for entry in compile_commands if Path(sources_dir) in Path(entry['file']).parents]
+
+# Split a list of paths into at most `chunk_size` consecutive lists of (nearly) equal length.
+# NOTE: despite its name, `chunk_size` is the number of chunks, not their length. An empty input yields [].
+def split_in_chunks(chunk_size: int, input: typing.List[Path]) -> typing.List[typing.List[Path]]:
+    if not input:
+        return []  # a chunk length of 0 would otherwise be passed to range() as step and raise ValueError
+    length = -(-len(input) // chunk_size)  # ceiling division: length of every chunk except possibly the last
+    return [input[i:i + length] for i in range(0, len(input), length)]
+
+# Run one clang-tidy process on a set of input files (cwd: project root) and return its stdout plus a newline.
+# Only stdout is captured; the exit status of clang-tidy is not checked.
+def run_clang_tidy(files: typing.List[Path], project_root_path: Path, build_root_path: Path, clang_tidy_file_path: Path) -> bytes:
+    command_args = ['clang-tidy', '-extra-arg=-std=c++17', f'-p={build_root_path}', f'--config-file={clang_tidy_file_path}', '--format-style=file']
+    command_args += files
+    print(f'Starting {command_args}')
+
+    # Let subprocess.run() collect the output while it waits for clang-tidy to terminate. Calling Popen.wait()
+    # with stdout=PIPE can deadlock as soon as the child fills the OS pipe buffer, and the output must also be
+    # returned when the process has already exited by the time it would be polled.
+    completed = subprocess.run(command_args, stdout=subprocess.PIPE, cwd=project_root_path, check=False)
+
+    # The trailing newline separates the outputs of the individual processes when they are concatenated.
+    return completed.stdout + b'\n'
+
+@click.group()
+def cli():  # Root click command group; `check` and `clang_tidy_report` are attached to it via @cli.command.
+    pass
+
+# Inspect <build-root>/clang-tidy-report.txt: return normally if it holds nothing but whitespace, otherwise
+# (report missing or non-empty) terminate through sys.exit() with a message, i.e. with a non-zero exit status.
+@cli.command(help='Exsits successfully if the report is empty')
+@click.option('--build-root', help='The folder where the clang-tidy-report.txt is located.', required=True)
+def check(build_root):
+    build_root_path = Path(build_root).absolute()
+
+    print(f'Looking for clang-tidy-report.txt in {build_root_path}')
+    clang_tidy_report_path = build_root_path / 'clang-tidy-report.txt'
+    if not clang_tidy_report_path.exists():
+        sys.exit("Dind't find clang-tidy-report.txt. Aborting.")
+    print(f'Found {clang_tidy_report_path}')
+
+    # The report is written from raw clang-tidy output bytes, so read it as bytes: decoding it as text with the
+    # locale's default encoding could raise UnicodeDecodeError although only emptiness matters here.
+    if clang_tidy_report_path.read_bytes().strip():
+        sys.exit('Found content in clang-tidy-report.txt')
+    print('No content in clang-tidy-report.txt')
+
+# Run clang-tidy on all database files below <project-root>/src; write the joined stdout to the report file.
+@cli.command(help='Create the clang-tidy report')
+@click.option('--project-root', default=Path(__file__).parent.parent.absolute(), help='The folder where the git repository is located.')
+@click.option('--build-root', help='The folder where the compile_commands.json is located.', required=True)
+@click.option('--cores', default=multiprocessing.cpu_count(), help='The number of clang-tidy processes to spawn.')
+def clang_tidy_report(project_root, build_root, cores):
+    project_root_path = Path(project_root).absolute()
+    build_root_path = Path(build_root).absolute()
+    src_path = project_root_path / 'src'
+
+    print(f'Looking for compile_commands.json in {build_root_path}')
+    compile_commands_path = build_root_path / 'compile_commands.json'
+    if not compile_commands_path.exists():
+        sys.exit("Dind't find compile_commands.json. Aborting.")
+    print(f'Found {compile_commands_path}')
+
+    print(f'Looking for .clang-tidy in {project_root_path}')
+    clang_tidy_file_path = project_root_path / '.clang-tidy'
+    if not clang_tidy_file_path.exists():
+        sys.exit("Dind't find .clang-tidy. Aborting.")
+    print(f'Found {clang_tidy_file_path}')
+
+    files = find_source_files_from_compile_commands(compile_commands_path, src_path)
+    print(f'Found {len(files)} source and header files.')
+    if not files:  # fail with a clear message instead of an opaque ValueError while splitting an empty list
+        sys.exit(f'No files to check below {src_path}. Aborting.')
+    print(f'Lanching {cores} instances of clang-tidy in project root: {project_root_path}')
+
+    # Shuffle files to improve runtime performance. Use seed 123 to keep it the same across runs.
+    files_shuffled = files.copy()
+    random.Random(123).shuffle(files_shuffled)
+
+    # Spawn multiple python worker processes that each start their own instance of clang-tidy. Opening all processes in the same python thread caused problems with github actions.
+    worker = partial(run_clang_tidy, project_root_path=project_root_path, build_root_path=build_root_path, clang_tidy_file_path=clang_tidy_file_path)
+    with multiprocessing.Pool(cores) as p:
+        stdout = p.map(worker, split_in_chunks(cores, files_shuffled))
+
+    clang_tidy_report_file = build_root_path / 'clang-tidy-report.txt'
+    print(f'Writing report to {clang_tidy_report_file}')
+    clang_tidy_report_file.write_bytes(b''.join(stdout))
+
+if __name__ == '__main__':
+    cli()
\ No newline at end of file