Skip to content
This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Commit

Permalink
Release tag for BesTLA 1.0a (#190)
Browse files Browse the repository at this point in the history
  • Loading branch information
luoyu-intel authored Mar 22, 2024
1 parent 34e3740 commit 3bdc76d
Show file tree
Hide file tree
Showing 28 changed files with 2,746 additions and 1,742 deletions.
45 changes: 30 additions & 15 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ project(bestla LANGUAGES CXX VERSION 0.1.0)
file(GLOB headers ${PROJECT_NAME}/*.h ${PROJECT_NAME}/*.hpp)
file(GLOB xbyak_headers ${PROJECT_NAME}/xbyak/*.h ${PROJECT_NAME}/xbyak/*.hpp)

option(BTLA_USE_OPENMP "Enable OpenMP thread pool" OFF)
option(BTLA_ENABLE_OPENMP "Compile OpenMP thread pool if OMP can be found" OFF)

option(BTLA_UT_ALL "Enable all unit tests" OFF)
option(BTLA_UT_DEBUG "Enable debug unit tests" OFF)
Expand All @@ -19,7 +19,7 @@ option(BTLA_UT_KERNEL_INTRIN "Enable unit test for intrinsic kernels" OFF)
option(BTLA_UT_KERNEL_WRAPPER "Enable unit test for runtime ISA kernels" OFF)
option(BTLA_UT_NOASAN "Disable sanitize" OFF)
option(BTLA_UT_BENCHMARK "Benchmark ON may take a long time to finish all tests" OFF)
option(BTLA_UT_OPENMP "Use OpenMP" ON)
option(BTLA_UT_OPENMP "Use OpenMP for UT tests" OFF)

add_library(${PROJECT_NAME} INTERFACE)
add_library(neural_speed::${PROJECT_NAME} ALIAS ${PROJECT_NAME})
Expand All @@ -30,10 +30,10 @@ target_include_directories(
)


# Export the OpenMP thread-pool switch to consumers of the interface library.
# Only the BTLA_USE_OPENMP preprocessor macro is propagated here; this block
# does not locate or link an OpenMP runtime itself.
# NOTE(review): the option text says "if OMP can be found" - presumably the
# including project calls find_package(OpenMP) and links it; confirm.
if(BTLA_ENABLE_OPENMP)
  message(STATUS "BesTLA enable OpenMP ThreadPool")
  target_compile_definitions(${PROJECT_NAME} INTERFACE BTLA_USE_OPENMP)
endif()

if(WIN32)
target_compile_definitions(${PROJECT_NAME} INTERFACE _CRT_SECURE_NO_WARNINGS NOMINMAX)
Expand Down Expand Up @@ -64,20 +64,22 @@ endif()

# add_ut_flag(<option-name>)
#
# If the option whose *name* is passed in UT_OPTION is enabled, define a
# preprocessor macro with that same name.
#
# The definition is directory-scoped (add_compile_definitions) rather than
# attached to ${PROJECT_NAME}_ut, so it applies to every target created in
# this directory and does not require the unit-test target to exist.
#
# Example:
#   add_ut_flag(BTLA_UT_KERNEL_JIT)
function(add_ut_flag UT_OPTION)
  # ${UT_OPTION} expands to the option's name; if() then dereferences that
  # variable exactly once, so its value is never re-interpreted as another
  # variable name.
  if(${UT_OPTION})
    add_compile_definitions(${UT_OPTION})
  endif()
endfunction()

if(UT_BUILD)
file(GLOB srcs ${PROJECT_NAME}/ut/*.cc ${PROJECT_NAME}/ut/*.cpp) #compile everthing even run parts of UTs
file(GLOB srcs ${PROJECT_NAME}/ut/*.cc ${PROJECT_NAME}/ut/*.cpp) #compile everything even run parts of UTs
list(REMOVE_ITEM srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/bestla_benchmark.cpp)
file(GLOB ut_headers ${PROJECT_NAME}/ut/*.h)
include_directories(${PROJECT_NAME})
include_directories(${PROJECT_NAME})
add_executable(${PROJECT_NAME}_ut ${srcs} ${headers} ${ut_headers})
if(BTLA_UT_OPENMP)
include(FindOpenMP)
target_compile_definitions(${PROJECT_NAME} INTERFACE BTLA_USE_OPENMP)
target_link_libraries(${PROJECT_NAME}_ut PRIVATE OpenMP::OpenMP_CXX)
endif()
if(BTLA_UT_OPENMP)
include(FindOpenMP)
target_compile_definitions(${PROJECT_NAME} INTERFACE BTLA_USE_OPENMP)
target_link_libraries(${PROJECT_NAME}_ut PRIVATE OpenMP::OpenMP_CXX)
endif()
if(NOT WIN32)
if(NOT BTLA_UT_NOASAN)
target_compile_options(${PROJECT_NAME}_ut PRIVATE -fsanitize=address)
Expand All @@ -96,8 +98,21 @@ if(UT_BUILD)
add_ut_flag(BTLA_UT_KERNEL_INTRIN)
add_ut_flag(BTLA_UT_KERNEL_JIT)
add_ut_flag(BTLA_UT_KERNEL_WRAPPER)
add_ut_flag(BTLA_UT_BENCHMARK)

target_link_libraries(${PROJECT_NAME}_ut PRIVATE ${PROJECT_NAME})
endif(UT_BUILD)

# Stand-alone benchmark executable, built only when BTLA_UT_BENCHMARK is ON.
# It is compiled from the single benchmark source plus the library and unit-test
# headers, and links against the ${PROJECT_NAME} interface library.
if(BTLA_UT_BENCHMARK)
  file(GLOB ut_headers ${PROJECT_NAME}/ut/*.h)
  # The source is named explicitly so a missing file is reported as such at
  # configure time instead of silently producing an empty source list.
  add_executable(${PROJECT_NAME}_benchmark
    ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/bestla_benchmark.cpp
    ${headers}
    ${ut_headers}
  )
  target_include_directories(${PROJECT_NAME}_benchmark
    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}
  )
  if(BTLA_UT_OPENMP)
    # REQUIRED: fail here with a clear message rather than later on an
    # undefined OpenMP::OpenMP_CXX target.
    find_package(OpenMP REQUIRED)
    target_compile_definitions(${PROJECT_NAME} INTERFACE BTLA_USE_OPENMP)
    target_link_libraries(${PROJECT_NAME}_benchmark PRIVATE OpenMP::OpenMP_CXX)
  endif()
  if(NOT WIN32)
    # Link the platform thread library through its imported target instead of
    # passing a raw -lpthread link option.
    find_package(Threads REQUIRED)
    target_link_libraries(${PROJECT_NAME}_benchmark PRIVATE Threads::Threads)
  endif()
  target_link_libraries(${PROJECT_NAME}_benchmark PRIVATE ${PROJECT_NAME})
endif()
4 changes: 3 additions & 1 deletion bestla/bestla.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,13 @@ enum class BTLA_ISA : uint8_t {
AMX_INT8,
AVX512_FP16,
AVX512_BF16,
ISA_COUNT,
};
enum class BTLA_DTYPE : uint32_t {
EleBitsMask = 0xff,
EleBitsShift = 0,
EleBitsUndef = 0,
EleBits3 = 3,
EleBits4 = 4,
EleBits8 = 8,
EleBits16 = 16,
Expand Down Expand Up @@ -63,8 +65,8 @@ enum class BTLA_DTYPE : uint32_t {
DQ8_BNB = EleBits8 | TypeFloat | SubType4,
S8 = EleBits8 | TypeInt,
U8 = EleBits8 | TypeInt | SubType1,
S3_CLIP = EleBits3 | TypeInt,
S4_CLIP = EleBits4 | TypeInt,
S4_FULLRANGE = EleBits4 | TypeInt | SubType1,
F4_E2M1 = EleBits4 | TypeFloat,
F4_BNB = EleBits4 | TypeFloat | SubType1,
F4_NF4 = EleBits4 | TypeFloat | SubType2,
Expand Down
95 changes: 82 additions & 13 deletions bestla/bestla_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ class CpuDevice {
public:
inline int getThreads() { return numthreads; }
inline int getCores() { return numcores; }
inline uint32_t getL3CacheSize() { return L3Cache; }
inline uint32_t getL2CacheSize() { return L2Cache; }
inline uint32_t getL1CacheSize() { return L1Cache; }
inline uint32_t getL2CacheSize_E() { return E_L2Cache; }
Expand All @@ -228,7 +229,7 @@ class CpuDevice {
inline bool AMX_BF16() { return mHasAMX_BF16; }
inline bool AVX512_BF16() { return mHasAVX512_BF16; }
inline bool AVX512_FP16() { return mHasAVX512_FP16; }
inline float getPE() { return (P_core.size() * P_power) / (E_core.size() * E_power); }
inline float* const getPE() { return PE; }
inline size_t getPcoreNum() { return P_core.size(); }
inline size_t getEcoreNum() { return E_core.size(); }
inline size_t getSMTcoreNum() { return SMT_core.size(); }
Expand Down Expand Up @@ -327,13 +328,41 @@ class CpuDevice {
mHybrid = false;
}
}
numcores = P_core.size() + E_core.size();
numthreads = P_core.size() * 2 + E_core.size();
numcores = static_cast<int>(P_core.size() + E_core.size());
numthreads = static_cast<int>(P_core.size() + E_core.size() + SMT_core.size());

{
// set PE
uint32_t tmp[4];
_cpu.getCpuid(1, tmp);
if (p) printf("!!!\t%x\t%x\t%x\t%x!!!\n", tmp[0], tmp[1], tmp[2], tmp[3]);
const int famliy = (tmp[0] >> 8) & ((1u << 4) - 1); // cpu.extractBit(a[0], 8, 11);
const int extendedModel = (tmp[0] >> 16) & ((1u << 4) - 1); // cpu.extractBit(a[0], 16, 24);
{
for (int i = 0; i < int(BTLA_ISA::ISA_COUNT); i++) PE[i] = 1.0f;
// CPU identification refer to: https://en.wikichip.org/wiki/intel/cpuid
if (famliy == 6) switch (extendedModel) {
case 9: // ALD
PE[int(BTLA_ISA::AVX2)] = 3.0f;
PE[int(BTLA_ISA::AVX_VNNI)] = 5.0f;
break;
case 10: // MTL
PE[int(BTLA_ISA::AVX2)] = 2.2f;
PE[int(BTLA_ISA::AVX_VNNI)] = 3.0f;
break;
case 11: // RPL
PE[int(BTLA_ISA::AVX2)] = 1.8f;
PE[int(BTLA_ISA::AVX_VNNI)] = 2.6f;
break;
}
}
}
} else {
L1Cache = _cpu.getDataCacheSize(0);
L2Cache = _cpu.getDataCacheSize(1);
numthreads = numcores;
}
L3Cache = _cpu.getDataCacheSize(2);
#if FIXED_CACHE
L2Cache = L2Cache >= FIXED_CACHE_SIZE ? FIXED_CACHE_SIZE : L2Cache;
E_L2Cache = E_L2Cache >= FIXED_CACHE_SIZE ? FIXED_CACHE_SIZE : E_L2Cache;
Expand All @@ -357,7 +386,7 @@ class CpuDevice {
Xbyak::util::Cpu cpu;
uint32_t tmp[4];
cpu.getCpuid(0x1A, tmp);
int core_type = (tmp[0] >> 24) & ((1u << 7) - 1); // cpu.extractBit(a[0], 24, 31);
int core_type = (tmp[0] >> 24) & ((1u << 8) - 1); // cpu.extractBit(a[0], 24, 31);
switch (core_type) {
case 32:
// printf("Atom\n");
Expand Down Expand Up @@ -407,7 +436,7 @@ class CpuDevice {
}
static void core_bond(int core) {
#ifdef _WIN32
SetThreadAffinityMask(GetCurrentThread(), 1 << core);
SetThreadAffinityMask(GetCurrentThread(), 1LL << core);
#else
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
Expand All @@ -420,7 +449,7 @@ class CpuDevice {
static void core_bond(std::thread& thread, int core) {
#ifdef _WIN32
HANDLE handle = thread.native_handle();
SetThreadAffinityMask(handle, 1 << core);
SetThreadAffinityMask(handle, 1LL << core);
#else
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
Expand All @@ -434,29 +463,69 @@ class CpuDevice {
bool isHybrid() { return mHybrid; }

protected:
uint32_t L2Cache, L1Cache;
uint32_t L2Cache, L1Cache, L3Cache;
bool mHybrid = false;
bool mHasAVX2, mHasAVX_VNNI, mHasAVX, mHasAVX512_VNNI, mHasAMX_INT8, mHasAMX_BF16, mHasAVX512F, mHasAVX512_BF16,
mHasAVX512_FP16;
int numcores;
int numthreads;
std::vector<int> P_core, E_core, SMT_core;
uint32_t E_L2Cache, E_L1Cache;
float P_power = 4.8, E_power = 2.3;
float PE[int(BTLA_ISA::ISA_COUNT)];
};

#define GetCPUDevice() auto _cd = bestla::device::CpuDevice::getInstance();

class CpuBase {
class CpuRuntime {
public:
CpuBase() {
CpuRuntime() = default;
static CpuRuntime& getInstance(int thread) {
static std::map<int, CpuRuntime> instances;
if (instances.count(thread) == 0) instances[thread] = CpuRuntime(thread);
return instances[thread];
}

inline float getPE(const BTLA_ISA isa) {
// printf("GET:%d\t%f\n",int(isa), *cur_PE);
return PE[int(isa)] * P_core_num / E_core_num;
}

inline void adjustPE(const BTLA_ISA isa, const float PE_) {
// printf("Adjust:%d,%f\n",int(isa),PE_);
PE[int(isa)] *= PE_;
}

size_t mL2Cache, mL1Cache, mL2Cache_P = 0, mL1Cache_P = 0, mL2Cache_E = 0, mL1Cache_E = 0;
int P_core_num = 0, E_core_num = 0;
bool mHybrid = false;

private:
CpuRuntime(int thread) {
GetCPUDevice();
mL2Cache = _cd->getL2CacheSize();
mL1Cache = _cd->getL1CacheSize();
mNumThreads = _cd->getThreads();
maxThreads = _cd->getThreads();
mHybrid = false;
if (_cd->isHybrid() && thread > _cd->getPcoreNum()) {
if (thread > _cd->getPcoreNum() + _cd->getEcoreNum()) {
mL1Cache_P = mL1Cache / 2;
mL2Cache_P = mL2Cache / 2;
P_core_num = _cd->getPcoreNum();
E_core_num = _cd->getEcoreNum();
} else {
mL1Cache_P = mL1Cache;
mL2Cache_P = mL2Cache;
P_core_num = static_cast<int>(_cd->getPcoreNum());
E_core_num = thread - P_core_num;
}
mL1Cache_E = _cd->getL1CacheSize_E();
mL2Cache_E = _cd->getL2CacheSize_E();
mHybrid = true;
memcpy(PE, _cd->getPE(), int(BTLA_ISA::ISA_COUNT) * sizeof(float));
}
}
size_t mL2Cache, mL1Cache;
int mNumThreads;
float PE[int(BTLA_ISA::ISA_COUNT)];
int maxThreads;
};
} // namespace device
} // namespace bestla
2 changes: 1 addition & 1 deletion bestla/bestla_gemm.h
Original file line number Diff line number Diff line change
Expand Up @@ -2716,7 +2716,7 @@ class AvxvnniN8P4 : protected bestla::xbyak::JitAvxvnni {
vpbroadcastd(vreg_t(AReg), ptr[reg_tmp1]);
add(reg_tmp1, reg_astride);
for (int i = 0; i < NRegs; i++) {
vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]);
vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i));
}
}
}
Expand Down
Loading

0 comments on commit 3bdc76d

Please sign in to comment.