diff --git a/CMakeLists.txt b/CMakeLists.txt index c94e0144..eb982a5a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,7 +39,7 @@ git_submodule_update() if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") else() -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -O2 -fdata-sections -ffunction-sections") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mamx-tile -Wall -Wextra -O2 -fdata-sections -ffunction-sections") endif() if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") diff --git a/include/firestarter/Environment/X86/Payload/AVX512Payload.hpp b/include/firestarter/Environment/X86/Payload/AVX512Payload.hpp index b23f1b97..170874f4 100644 --- a/include/firestarter/Environment/X86/Payload/AVX512Payload.hpp +++ b/include/firestarter/Environment/X86/Payload/AVX512Payload.hpp @@ -24,6 +24,17 @@ #include namespace firestarter::environment::x86::payload { + +// Define struct that is used as config and loaded through ldtilecfg() +typedef struct __tile_config +{ + uint8_t palette_id; + uint8_t start_row; + uint8_t reserved_0[14]; + uint16_t colsb[16]; + uint8_t rows[16]; +} __tilecfg; + class AVX512Payload final : public X86Payload { public: AVX512Payload(asmjit::CpuFeatures const &supportedFeatures) @@ -44,12 +55,17 @@ class AVX512Payload final : public X86Payload { return new AVX512Payload(this->supportedFeatures()); }; + static void create_AMX_config(__tilecfg *tileinfo); + static void request_permission(); + static void init_buffer_rand(uintptr_t buf1, uintptr_t buf2); + private: const std::map instructionFlops = { {"REG", 32}, {"L1_L", 32}, {"L1_BROADCAST", 16}, {"L1_S", 16}, {"L1_LS", 16}, {"L2_L", 32}, {"L2_S", 16}, {"L2_LS", 16}, {"L3_L", 32}, {"L3_S", 16}, {"L3_LS", 16}, {"L3_P", 16}, - {"RAM_L", 32}, {"RAM_S", 16}, {"RAM_LS", 16}, {"RAM_P", 16}}; + {"RAM_L", 32}, {"RAM_S", 16}, {"RAM_LS", 16}, {"RAM_P", 16}, + {"AMX", 512}}; const std::map instructionMemory = { {"RAM_L", 64}, {"RAM_S", 128}, {"RAM_LS", 128}, {"RAM_P", 64}}; diff --git 
a/include/firestarter/Environment/X86/Platform/SapphireRapidsConfig.hpp b/include/firestarter/Environment/X86/Platform/SapphireRapidsConfig.hpp new file mode 100644 index 00000000..fe8f5b84 --- /dev/null +++ b/include/firestarter/Environment/X86/Platform/SapphireRapidsConfig.hpp @@ -0,0 +1,52 @@ +/****************************************************************************** + * FIRESTARTER - A Processor Stress Test Utility + * Copyright (C) 2020 TU Dresden, Center for Information Services and High + * Performance Computing + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * Contact: daniel.hackenberg@tu-dresden.de + *****************************************************************************/ + +#pragma once + +#include +#include + +namespace firestarter::environment::x86::platform { +class SapphireRapidsConfig final : public X86PlatformConfig { + +public: + SapphireRapidsConfig(asmjit::CpuFeatures const &supportedFeatures, + unsigned family, unsigned model, unsigned threads) + : X86PlatformConfig("SPR_XEONEP", 6, {143}, {1, 2}, 0, + {32768, 1048576, 1441792}, 1048576000, 1536, family, + model, threads, + new payload::AVX512Payload(supportedFeatures)) {} + + std::vector> + getDefaultPayloadSettings() const override { + return std::vector>({{"RAM_S", 3}, + {"RAM_P", 1}, + {"L3_S", 1}, + {"L3_P", 1}, + {"L2_S", 4}, + {"L2_L", 70}, + {"L1_S", 0}, + {"L1_L", 40}, + {"REG", 140}, + {"AMX", 1}}); + } +}; +} // namespace firestarter::environment::x86::platform diff --git a/include/firestarter/Environment/X86/X86Environment.hpp b/include/firestarter/Environment/X86/X86Environment.hpp index 11ad940e..05155611 100644 --- a/include/firestarter/Environment/X86/X86Environment.hpp +++ b/include/firestarter/Environment/X86/X86Environment.hpp @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -88,7 +89,8 @@ class X86Environment final : public Environment { REGISTER(HaswellEPConfig), REGISTER(SandyBridgeConfig), REGISTER(SandyBridgeEPConfig), REGISTER(NehalemConfig), REGISTER(NehalemEPConfig), REGISTER(BulldozerConfig), - REGISTER(NaplesConfig), REGISTER(RomeConfig)}; + REGISTER(NaplesConfig), REGISTER(RomeConfig), + REGISTER(SapphireRapidsConfig)}; std::list platformConfigs; @@ -96,6 +98,7 @@ class X86Environment final : public Environment { const std::list> fallbackPlatformConfigsCtor = { + REGISTER(SapphireRapidsConfig), // AMX + AVX512 REGISTER(SkylakeSPConfig), // AVX512 REGISTER(BulldozerConfig), // FMA4 REGISTER(HaswellConfig), // FMA diff --git a/src/firestarter/Environment/X86/Payload/AVX512Payload.cpp 
b/src/firestarter/Environment/X86/Payload/AVX512Payload.cpp index 9316ed39..5d1f1c04 100644 --- a/src/firestarter/Environment/X86/Payload/AVX512Payload.cpp +++ b/src/firestarter/Environment/X86/Payload/AVX512Payload.cpp @@ -1,6 +1,6 @@ /****************************************************************************** * FIRESTARTER - A Processor Stress Test Utility - * Copyright (C) 2020-2023 TU Dresden, Center for Information Services and High + * Copyright (C) 2020 TU Dresden, Center for Information Services and High * Performance Computing * * This program is free software: you can redistribute it and/or modify @@ -18,8 +18,24 @@ * * Contact: daniel.hackenberg@tu-dresden.de *****************************************************************************/ - #include +#include +#include +#include /* Definition of ARCH_* constants */ + +#define XFEATURE_XTILECFG 17 +#define XFEATURE_XTILEDATA 18 +#define XFEATURE_MASK_XTILECFG (1 << XFEATURE_XTILECFG) +#define XFEATURE_MASK_XTILEDATA (1 << XFEATURE_XTILEDATA) +#define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA) + +#define ARCH_GET_XCOMP_PERM 0x1022 +#define ARCH_REQ_XCOMP_PERM 0x1023 + +#define MAX 1024 +#define MAX_ROWS 16 +#define MAX_COLS 64 + using namespace firestarter::environment::x86::payload; using namespace asmjit; @@ -38,6 +54,17 @@ int AVX512Payload::compilePayload( auto repetitions = this->getNumberOfSequenceRepetitions(sequence, numberOfLines / thread); + // Check if AMX is in instruction mix and supported by CPU + if (std::find(sequence.begin(), sequence.end(), "AMX") != sequence.end()) { + if(this->supportedFeatures().x86().hasAMX_BF16()){ + workerLog::trace() << "AMX BF16 operations are supported by this processor."; + } + else{ + workerLog::error() << "[ERROR] AMX BF16 operations are not supported by this processor."; + return EXIT_FAILURE; + } + } + // compute count of flops and memory access for performance report unsigned flops = 0; unsigned bytes = 0; @@ -164,6 +191,42 @@ 
int AVX512Payload::compilePayload( for (auto const &reg : shift_reg32) { cb.mov(reg, Imm(0xAAAAAAAA)); } + + + // Init AMX registers and config + __tilecfg tile_data = {0}; + request_permission(); + create_AMX_config(&tile_data); // Create tilecfg and fill it + + static bool init = true; + uintptr_t src1, src2; + uint64_t src3; + unsigned int aligned_alloc_size = static_cast(MAX*sizeof(__bfloat16)); + if(aligned_alloc_size % 1024){ // aligned_alloc expects size to be multiple of alignment (aka 1024) + aligned_alloc_size = aligned_alloc_size + (1024 - (aligned_alloc_size % 1024)); + } + src1 = (uintptr_t) aligned_alloc(1024, aligned_alloc_size); + src2 = (uintptr_t) aligned_alloc(1024, aligned_alloc_size); + src3 = (uint64_t) aligned_alloc(1024, aligned_alloc_size); + if(((void*)src1 == nullptr) || (void*)src2 == nullptr || (void*)src3 == nullptr){ // uintptr_t guarantees we can cast it to void* and back + std::cout << "[ERROR]: Allocation of source and target buffer for AMX failed. Aborting...\n"; + exit(1); + } + + //Init buffers + init_buffer_rand(src1, src2); + memset((void*) src3, 0, aligned_alloc_size); + + cb.tileloaddt1(tmm6, zmmword_ptr(src1)); + cb.tileloaddt1(tmm7, zmmword_ptr(src2)); // Ensure no overflows through loading x and -x in src2 + + cb.tileloaddt1(tmm0, zmmword_ptr(src3)); // Preload with 0 + cb.tileloaddt1(tmm1, zmmword_ptr(src3)); + cb.tileloaddt1(tmm2, zmmword_ptr(src3)); + cb.tileloaddt1(tmm3, zmmword_ptr(src3)); + cb.tileloaddt1(tmm4, zmmword_ptr(src3)); + cb.tileloaddt1(tmm5, zmmword_ptr(src3)); + // Initialize AVX512-Registers for FMA Operations cb.vmovapd(zmm0, zmmword_ptr(pointer_reg)); cb.vmovapd(zmm1, zmmword_ptr(pointer_reg, 64)); @@ -212,6 +275,7 @@ int AVX512Payload::compilePayload( auto mov_dst = trans_start; auto mov_src = mov_dst + 1; unsigned l1_offset = 0; + int counter=0; #define L1_INCREMENT() \ l1_offset += 64; \ @@ -296,6 +360,9 @@ int AVX512Payload::compilePayload( cb.vfmadd231pd(Zmm(add_dest), zmm0, zmmword_ptr(l1_addr, 
64)); cb.prefetcht2(ptr(ram_addr)); RAM_INCREMENT(); + } else if (item == "AMX") { + cb.tdpbf16ps(Tmm(counter%6), tmm6, tmm7); + counter++; } else { workerLog::error() << "Instruction group " << item << " not found in " << this->name() << "."; @@ -448,3 +515,74 @@ void AVX512Payload::init(unsigned long long *memoryAddr, unsigned long long bufferSize) { X86Payload::init(memoryAddr, bufferSize, 0.27948995982e-4, 0.27948995982e-4); } + +void AVX512Payload::create_AMX_config(__tilecfg *tileinfo){ + // Fill the tile config and load it via _tile_loadconfig + int i; + tileinfo->palette_id = 1; + tileinfo->start_row = 0; + + + for (i = 0; i < 8; ++i) + { + tileinfo->colsb[i] = MAX_COLS; + tileinfo->rows[i] = MAX_ROWS; + } + + _tile_loadconfig(tileinfo); +} + + +void AVX512Payload::request_permission(){ + + long rc; + unsigned long bitmask; + rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA); + + if(rc){ + workerLog::error() << "XTILE_DATA request failed: " << rc; + } + + rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &bitmask); + if (rc){ + workerLog::error() << "prctl(ARCH_GET_XCOMP_PERM) error: " << rc; + } + if (bitmask & XFEATURE_MASK_XTILE){ + workerLog::trace() << "ARCH_REQ_XCOMP_PERM XTILE_DATA successful."; + } + else{ + workerLog::error() << "[ERROR] ARCH_REQ_XCOMP_PERM XTILE_DATA unsuccessful!"; + } + + +} + +void AVX512Payload::init_buffer_rand(uintptr_t src1, uintptr_t src2){ + + // Initialize buffer with random values + // Multiplication always produces either 1 or -1 + // Accumulation operation always on (1 + -1) = 0 ensures stable values + + __bfloat16 *buf1 = (__bfloat16*) src1; + __bfloat16 *buf2 = (__bfloat16*) src2; + + // TODO: Change MAX_ROWS/MAX_COLS from constant to maximum size check by asmJit + // Currently not supported by asmJit + // Alternative: Manually parse CPUID + + for(int i = 0; i