Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AMX extension #68

Closed
wants to merge 30 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
95f1c83
[ADD] AMX implementation and Sapphire rapids config
cyssi-cb Sep 20, 2024
1fdf36f
[FIX] update asmjit calls
cyssi-cb Sep 20, 2024
2b9adf9
[FIX] include Sapphire Rapids config
cyssi-cb Sep 20, 2024
a91ea44
[FIX] asmjit call in sapphire rapids config
cyssi-cb Sep 20, 2024
e2a8731
[FIX] add new files to cmake
cyssi-cb Sep 20, 2024
529bb7f
[FIX] adapted workload to new asmjit api
cyssi-cb Sep 20, 2024
b3b94f8
[FIX] adapted workload to new asmjit api
cyssi-cb Sep 20, 2024
8e8e82d
[FIX] add missing compiler flags
cyssi-cb Sep 20, 2024
892f234
[FIX] register SApphire Rapids config
cyssi-cb Sep 20, 2024
5c5f935
[REMOVED] unneded prints
cyssi-cb Sep 20, 2024
edf806c
[ADD] use bf16
cyssi-cb Sep 20, 2024
af302d6
[FIX] typo
cyssi-cb Sep 20, 2024
7caf6cb
[ADD] limit init value
cyssi-cb Sep 23, 2024
38fe61d
[REMOVED] unnecessary defines
cyssi-cb Sep 23, 2024
0427834
[ADD] use AVX512 config with AMX
cyssi-cb Sep 23, 2024
9fc69b6
[FIX] correct CPUID model, function as static, logging behavior
cyssi-cb Nov 18, 2024
b5fa958
[FIX] includes
cyssi-cb Nov 18, 2024
fb60acd
[FIX] includes
cyssi-cb Nov 18, 2024
5576797
[ADD] check for AMX during compilePayload
cyssi-cb Nov 18, 2024
ee707cf
[FIX] spelling
cyssi-cb Nov 18, 2024
6e66d26
[FIX] merge AMX into AVX512 workload with runtime check for AMX feature
cyssi-cb Nov 18, 2024
824520c
[FIX] CMakeLists
cyssi-cb Nov 18, 2024
87d6af9
[FIX] naming convention
cyssi-cb Nov 18, 2024
b775b8c
[FIX] spelling
cyssi-cb Nov 18, 2024
ce86848
[FIX] spelling
cyssi-cb Nov 18, 2024
3a789bf
[FIX] spelling
cyssi-cb Nov 18, 2024
83e91eb
[REMOVE] unnecesary payload definition, now merged into AVX512Payload…
cyssi-cb Nov 18, 2024
fdf01db
Merge branch 'master' into amx
cyssi-cb Nov 18, 2024
ce35b28
[FIX] Cmake
cyssi-cb Nov 18, 2024
b1577ce
[FIX] move __tilecfg definition into header
cyssi-cb Nov 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ git_submodule_update()

if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
else()
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -O2 -fdata-sections -ffunction-sections")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mamx-tile -Wall -Wextra -O2 -fdata-sections -ffunction-sections")
endif()

if(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
Expand Down
18 changes: 17 additions & 1 deletion include/firestarter/Environment/X86/Payload/AVX512Payload.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,17 @@
#include <firestarter/Environment/X86/Payload/X86Payload.hpp>

namespace firestarter::environment::x86::payload {

// Define struct that is used as config and loaded through ldtilecfg()
typedef struct __tile_config
{
uint8_t palette_id;
uint8_t start_row;
uint8_t reserved_0[14];
uint16_t colsb[16];
uint8_t rows[16];
} __tilecfg;

class AVX512Payload final : public X86Payload {
public:
AVX512Payload(asmjit::CpuFeatures const &supportedFeatures)
Expand All @@ -44,12 +55,17 @@ class AVX512Payload final : public X86Payload {
return new AVX512Payload(this->supportedFeatures());
};

static void create_AMX_config(__tilecfg *tileinfo);
static void request_permission();
static void init_buffer_rand(uintptr_t buf1, uintptr_t buf2);

private:
const std::map<std::string, unsigned> instructionFlops = {
{"REG", 32}, {"L1_L", 32}, {"L1_BROADCAST", 16}, {"L1_S", 16},
{"L1_LS", 16}, {"L2_L", 32}, {"L2_S", 16}, {"L2_LS", 16},
{"L3_L", 32}, {"L3_S", 16}, {"L3_LS", 16}, {"L3_P", 16},
{"RAM_L", 32}, {"RAM_S", 16}, {"RAM_LS", 16}, {"RAM_P", 16}};
{"RAM_L", 32}, {"RAM_S", 16}, {"RAM_LS", 16}, {"RAM_P", 16},
{"AMX", 512}};

const std::map<std::string, unsigned> instructionMemory = {
{"RAM_L", 64}, {"RAM_S", 128}, {"RAM_LS", 128}, {"RAM_P", 64}};
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/******************************************************************************
* FIRESTARTER - A Processor Stress Test Utility
* Copyright (C) 2020 TU Dresden, Center for Information Services and High
* Performance Computing
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/\>.
*
* Contact: [email protected]
*****************************************************************************/

#pragma once

#include <firestarter/Environment/X86/Payload/AVX512Payload.hpp>
#include <firestarter/Environment/X86/Platform/X86PlatformConfig.hpp>

namespace firestarter::environment::x86::platform {
class SapphireRapidsConfig final : public X86PlatformConfig {

public:
SapphireRapidsConfig(asmjit::CpuFeatures const &supportedFeatures,
unsigned family, unsigned model, unsigned threads)
: X86PlatformConfig("SPR_XEONEP", 6, {143}, {1, 2}, 0,
{32768, 1048576, 1441792}, 1048576000, 1536, family,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume these values also have not been updated

model, threads,
new payload::AVX512Payload(supportedFeatures)) {}

std::vector<std::pair<std::string, unsigned>>
getDefaultPayloadSettings() const override {
return std::vector<std::pair<std::string, unsigned>>({{"RAM_S", 3},
{"RAM_P", 1},
{"L3_S", 1},
{"L3_P", 1},
{"L2_S", 4},
{"L2_L", 70},
{"L1_S", 0},
{"L1_L", 40},
{"REG", 140},
{"AMX", 1}});
}
};
} // namespace firestarter::environment::x86::platform
5 changes: 4 additions & 1 deletion include/firestarter/Environment/X86/X86Environment.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <firestarter/Environment/Environment.hpp>
#include <firestarter/Environment/X86/X86CPUTopology.hpp>

#include <firestarter/Environment/X86/Platform/SapphireRapidsConfig.hpp>
#include <firestarter/Environment/X86/Platform/BulldozerConfig.hpp>
#include <firestarter/Environment/X86/Platform/HaswellConfig.hpp>
#include <firestarter/Environment/X86/Platform/HaswellEPConfig.hpp>
Expand Down Expand Up @@ -88,14 +89,16 @@ class X86Environment final : public Environment {
REGISTER(HaswellEPConfig), REGISTER(SandyBridgeConfig),
REGISTER(SandyBridgeEPConfig), REGISTER(NehalemConfig),
REGISTER(NehalemEPConfig), REGISTER(BulldozerConfig),
REGISTER(NaplesConfig), REGISTER(RomeConfig)};
REGISTER(NaplesConfig), REGISTER(RomeConfig),
REGISTER(SapphireRapidsConfig)};

std::list<platform::X86PlatformConfig *> platformConfigs;

// List of fallback PlatformConfig. Add one for each x86 extension.
const std::list<std::function<platform::X86PlatformConfig *(
asmjit::CpuFeatures const &, unsigned, unsigned, unsigned)>>
fallbackPlatformConfigsCtor = {
REGISTER(SapphireRapidsConfig), // AMX + AVX512
REGISTER(SkylakeSPConfig), // AVX512
REGISTER(BulldozerConfig), // FMA4
REGISTER(HaswellConfig), // FMA
Expand Down
142 changes: 140 additions & 2 deletions src/firestarter/Environment/X86/Payload/AVX512Payload.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/******************************************************************************
* FIRESTARTER - A Processor Stress Test Utility
* Copyright (C) 2020-2023 TU Dresden, Center for Information Services and High
* Copyright (C) 2020 TU Dresden, Center for Information Services and High
* Performance Computing
*
* This program is free software: you can redistribute it and/or modify
Expand All @@ -18,8 +18,24 @@
*
* Contact: [email protected]
*****************************************************************************/

#include <firestarter/Environment/X86/Payload/AVX512Payload.hpp>
#include <sys/syscall.h>
#include <immintrin.h>
#include <asm/prctl.h> /* Definition of ARCH_* constants */

#define XFEATURE_XTILECFG 17
#define XFEATURE_XTILEDATA 18
#define XFEATURE_MASK_XTILECFG (1 << XFEATURE_XTILECFG)
#define XFEATURE_MASK_XTILEDATA (1 << XFEATURE_XTILEDATA)
#define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA)

#define ARCH_GET_XCOMP_PERM 0x1022
#define ARCH_REQ_XCOMP_PERM 0x1023

#define MAX 1024
#define MAX_ROWS 16
#define MAX_COLS 64


using namespace firestarter::environment::x86::payload;
using namespace asmjit;
Expand All @@ -38,6 +54,17 @@ int AVX512Payload::compilePayload(
auto repetitions =
this->getNumberOfSequenceRepetitions(sequence, numberOfLines / thread);

// Check if AMX is in instruction mix and supported by CPU
if (std::find(sequence.begin(), sequence.end(), "AMX") != sequence.end()) {
if(this->supportedFeatures().x86().hasAMX_BF16()){
workerLog::trace() << "AMX BF16 operations are supported by this processor.";
}
else{
workerLog::error() << "[ERROR] AMX BF16 operations are not supported by this processor.";
return EXIT_FAILURE;
}
}

// compute count of flops and memory access for performance report
unsigned flops = 0;
unsigned bytes = 0;
Expand Down Expand Up @@ -164,6 +191,42 @@ int AVX512Payload::compilePayload(
for (auto const &reg : shift_reg32) {
cb.mov(reg, Imm(0xAAAAAAAA));
}


// Init AMX registers and config
__tilecfg tile_data = {0};
request_permission();
create_AMX_config(&tile_data); // Create tilecfg and fill it

static bool init = true;
uintptr_t src1, src2;
uint64_t src3;
unsigned int aligned_alloc_size = static_cast<unsigned int>(MAX*sizeof(__bfloat16));
if(aligned_alloc_size % 1024){ // aligned_alloc expects size to be multiple of alignment (aka 1024)
aligned_alloc_size = aligned_alloc_size + (1024 - (aligned_alloc_size % 1024));
}
src1 = (uintptr_t) aligned_alloc(1024, aligned_alloc_size);
src2 = (uintptr_t) aligned_alloc(1024, aligned_alloc_size);
src3 = (uint64_t) aligned_alloc(1024, aligned_alloc_size);
if(((void*)src1 == nullptr) || (void*)src2 == nullptr || (void*)src3 == nullptr){ // uintptr_t garantuees we can cast it to void* and back
std::cout << "[ERROR]: Allocation of source and target buffer for AMX failed. Aborting...\n";
exit(1);
}

//Init buffers
init_buffer_rand(src1, src2);
memset((void*) src3, 0, aligned_alloc_size);

cb.tileloaddt1(tmm6, zmmword_ptr(src1));
cb.tileloaddt1(tmm7, zmmword_ptr(src2)); // Ensure no overflows through loading x and -x in src2

cb.tileloaddt1(tmm0, zmmword_ptr(src3)); // Preload with 0
cb.tileloaddt1(tmm1, zmmword_ptr(src3));
cb.tileloaddt1(tmm2, zmmword_ptr(src3));
cb.tileloaddt1(tmm3, zmmword_ptr(src3));
cb.tileloaddt1(tmm4, zmmword_ptr(src3));
cb.tileloaddt1(tmm5, zmmword_ptr(src3));

// Initialize AVX512-Registers for FMA Operations
cb.vmovapd(zmm0, zmmword_ptr(pointer_reg));
cb.vmovapd(zmm1, zmmword_ptr(pointer_reg, 64));
Expand Down Expand Up @@ -212,6 +275,7 @@ int AVX512Payload::compilePayload(
auto mov_dst = trans_start;
auto mov_src = mov_dst + 1;
unsigned l1_offset = 0;
int counter=0;

#define L1_INCREMENT() \
l1_offset += 64; \
Expand Down Expand Up @@ -296,6 +360,9 @@ int AVX512Payload::compilePayload(
cb.vfmadd231pd(Zmm(add_dest), zmm0, zmmword_ptr(l1_addr, 64));
cb.prefetcht2(ptr(ram_addr));
RAM_INCREMENT();
} else if (item == "AMX") {
cb.tdpbf16ps(Tmm(counter%6), tmm6, tmm7);
counter++;
} else {
workerLog::error() << "Instruction group " << item << " not found in "
<< this->name() << ".";
Expand Down Expand Up @@ -448,3 +515,74 @@ void AVX512Payload::init(unsigned long long *memoryAddr,
unsigned long long bufferSize) {
X86Payload::init(memoryAddr, bufferSize, 0.27948995982e-4, 0.27948995982e-4);
}

void AVX512Payload::create_AMX_config(__tilecfg *tileinfo){
// Create tile_cfg, fill it and return
int i;
tileinfo->palette_id = 1;
tileinfo->start_row = 0;


for (i = 0; i < 8; ++i)
{
tileinfo->colsb[i] = MAX_COLS;
tileinfo->rows[i] = MAX_ROWS;
}

_tile_loadconfig(tileinfo);
}


void AVX512Payload::request_permission(){

long rc;
unsigned long bitmask;
rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA);

if(rc){
workerLog::error() << "XTILE_DATA request failed: " << rc;
}

rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &bitmask);
if (rc){
workerLog::error() << "prctl(ARCH_GET_XCOMP_PERM) error: " << rc;
}
if (bitmask & XFEATURE_MASK_XTILE){
workerLog::trace() << "ARCH_REQ_XCOMP_PERM XTILE_DATA successful.";
}
else{
workerLog::error() << "[ERROR] ARCH_REQ_XCOMP_PERM XTILE_DATA unsuccessful!";
}


}

void AVX512Payload::init_buffer_rand(uintptr_t src1, uintptr_t src2){

// Initialize buffer with random values
// Multiplication always produces either 1 or -1
// Accumulation operation always on (1 + -1) = 0 ensures stable values

__bfloat16 *buf1 = (__bfloat16*) src1;
__bfloat16 *buf2 = (__bfloat16*) src2;

// TODO: Change MAX_ROWS/MAXC_COLS from constant to maximum size check by asmJit
// Currently not supported by asmJit
// Alternative: Manually parse CPUID

for(int i = 0; i<MAX_ROWS; i++){
__bfloat16 random_init = (__bfloat16) (rand() % 65536); // Limit maximum size as 1/x needs to fit bfloat16
for(int j = 0; j<MAX_COLS; j++){
buf1[i*MAX_COLS+j] = (__bfloat16) (random_init);
if(!(j%2)){
buf2[i*MAX_COLS+j] = (__bfloat16) ((-1) / random_init);
}
else if(j%2){
buf2[i*MAX_COLS+j] = (__bfloat16) (1 / random_init);
}
}
}

}


Loading