Skip to content
This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Commit

Permalink
syna main: cd6fc32 (#248)
Browse files Browse the repository at this point in the history
  • Loading branch information
luoyu-intel authored May 8, 2024
1 parent 3bdc76d commit 2f79436
Show file tree
Hide file tree
Showing 38 changed files with 13,177 additions and 1,780 deletions.
50 changes: 37 additions & 13 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ file(GLOB headers ${PROJECT_NAME}/*.h ${PROJECT_NAME}/*.hpp)
file(GLOB xbyak_headers ${PROJECT_NAME}/xbyak/*.h ${PROJECT_NAME}/xbyak/*.hpp)

option(BTLA_ENABLE_OPENMP "Compile OpenMP thread pool if OMP can be found" OFF)
# Opt-in SYCL support: when ON, cmake/sycl.cmake is included, BTLA_SYCL is
# defined for compilation, and IntelSYCL::SYCL_CXX is linked into the unit-test
# and benchmark executables.
option(BTLA_SYCL "Enable the SYCL backend and link IntelSYCL::SYCL_CXX" OFF)

option(BTLA_UT_ALL "Enable all unit tests" OFF)
option(BTLA_UT_DEBUG "Enable debug unit tests" OFF)
Expand All @@ -21,14 +22,27 @@ option(BTLA_UT_NOASAN "Disable sanitize" OFF)
option(BTLA_UT_BENCHMARK "Benchmark ON may take a long time to finish all tests" OFF)
option(BTLA_UT_OPENMP "Use OpenMP for UT tests" OFF)





# The library is declared INTERFACE: it compiles no sources of its own and only
# carries usage requirements (include paths, definitions) to targets that link it.
add_library(${PROJECT_NAME} INTERFACE)
# Namespaced alias so consumers can link neural_speed::<project name>.
add_library(neural_speed::${PROJECT_NAME} ALIAS ${PROJECT_NAME})
# Consumers building in-tree get this source directory on their include path;
# consumers of an installed package get CMAKE_INSTALL_INCLUDEDIR instead.
target_include_directories(
${PROJECT_NAME} INTERFACE
"$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>"
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>"
)

# SYCL support is opt-in. Both variables start out empty so that the unit-test
# and benchmark targets can reference them whether or not BTLA_SYCL is set.
set(sycl_headers)
set(sycl_libs)
if(BTLA_SYCL)
  # NOTE(review): cmake/sycl.cmake is not visible here; presumably it locates
  # the SYCL toolchain and provides the IntelSYCL::SYCL_CXX target - confirm.
  include(cmake/sycl.cmake)

  # Directory-scoped: these apply to every target created after this point.
  add_compile_definitions(BTLA_SYCL)
  add_compile_options(-march=native)

  # SYCL headers are listed on the test executable; the SYCL library is linked
  # into the test and benchmark executables.
  file(GLOB sycl_headers ${PROJECT_NAME}/sycl/*.h ${PROJECT_NAME}/sycl/*.hpp)
  list(APPEND sycl_libs IntelSYCL::SYCL_CXX)

  # Currently disabled link options, kept for reference:
  #add_link_options(-fsycl-targets=spir64 -Xsycl-target-backend "-options -ze-opt-large-register-file")
endif()

if(BTLA_ENABLE_OPENMP)
message(STATUS "BesTLA enable OpenMP ThreadPool")
Expand Down Expand Up @@ -69,17 +83,25 @@ function(add_ut_flag UT_OPTION)
endif()
endfunction()

set(benchmark_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/bestla_benchmark.cpp)
# list(APPEND benchmark_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/sycl_benchmark.cpp)


if(UT_BUILD)
file(GLOB srcs ${PROJECT_NAME}/ut/*.cc ${PROJECT_NAME}/ut/*.cpp) #compile everything even run parts of UTs
list(REMOVE_ITEM srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/bestla_benchmark.cpp)
file(GLOB sycl_srcs ${PROJECT_NAME}/ut/sycl*)
if(NOT BTLA_SYCL)
list(REMOVE_ITEM srcs ${sycl_srcs})
endif()
list(REMOVE_ITEM srcs ${benchmark_srcs})
file(GLOB ut_headers ${PROJECT_NAME}/ut/*.h)
include_directories(${PROJECT_NAME})
add_executable(${PROJECT_NAME}_ut ${srcs} ${headers} ${ut_headers})
if(BTLA_UT_OPENMP)
include(FindOpenMP)
target_compile_definitions(${PROJECT_NAME} INTERFACE BTLA_USE_OPENMP)
target_link_libraries(${PROJECT_NAME}_ut PRIVATE OpenMP::OpenMP_CXX)
endif()
include_directories(${PROJECT_NAME})
add_executable(${PROJECT_NAME}_ut ${srcs} ${headers} ${sycl_headers} ${ut_headers})
if(BTLA_UT_OPENMP)
include(FindOpenMP)
target_compile_definitions(${PROJECT_NAME} INTERFACE BTLA_USE_OPENMP)
target_link_libraries(${PROJECT_NAME}_ut PRIVATE OpenMP::OpenMP_CXX)
endif()
if(NOT WIN32)
if(NOT BTLA_UT_NOASAN)
target_compile_options(${PROJECT_NAME}_ut PRIVATE -fsanitize=address)
Expand All @@ -98,14 +120,16 @@ if(UT_BUILD)
add_ut_flag(BTLA_UT_KERNEL_INTRIN)
add_ut_flag(BTLA_UT_KERNEL_JIT)
add_ut_flag(BTLA_UT_KERNEL_WRAPPER)
target_link_libraries(${PROJECT_NAME}_ut PRIVATE ${PROJECT_NAME})
if(BTLA_SYCL)
add_compile_definitions(BTLA_UT_SYCL)
endif()
target_link_libraries(${PROJECT_NAME}_ut PRIVATE ${PROJECT_NAME} ${sycl_libs})
endif(UT_BUILD)

if(BTLA_UT_BENCHMARK)
file(GLOB srcs ${PROJECT_NAME}/ut/bestla_benchmark.cpp) #compile everything even run parts of UTs
file(GLOB ut_headers ${PROJECT_NAME}/ut/*.h)
include_directories(${PROJECT_NAME})
add_executable(${PROJECT_NAME}_benchmark ${srcs} ${headers} ${ut_headers})
add_executable(${PROJECT_NAME}_benchmark ${benchmark_srcs} ${headers} ${ut_headers})
if(BTLA_UT_OPENMP)
include(FindOpenMP)
target_compile_definitions(${PROJECT_NAME} INTERFACE BTLA_USE_OPENMP)
Expand All @@ -114,5 +138,5 @@ if(BTLA_UT_BENCHMARK)
if(NOT WIN32)
target_link_options(${PROJECT_NAME}_benchmark PRIVATE -lpthread)
endif()
target_link_libraries(${PROJECT_NAME}_benchmark PRIVATE ${PROJECT_NAME})
target_link_libraries(${PROJECT_NAME}_benchmark PRIVATE ${PROJECT_NAME} ${sycl_libs})
endif(BTLA_UT_BENCHMARK)
28 changes: 28 additions & 0 deletions CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,34 @@
"description": "Target Windows (64-bit) with the Visual Studio development environment. (RelWithDebInfo)",
"inherits": "x64-release",
"cacheVariables": { "BTLA_UT_ALL": "ON" }
},
{
"name": "x64-debug-sycl",
"displayName": "x64 Debug SYCL",
"description": "x64 Debug SYCL",
"inherits": "windows-base",
"architecture": {
"value": "x64",
"strategy": "external"
},
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Debug",
"BTLA_UT_DEBUG": "ON",
"BTLA_UT_ALL": "OFF",
"BTLA_SYCL": "ON",
"BTLA_UT_BENCHMARK": "ON",
"CMAKE_CXX_COMPILER": "icx",
"CMAKE_C_COMPILER": "icx"
}
},
{
"name": "x64-release-sycl",
"displayName": "x64 Release for SYCL",
"description": "x64 Release SYCL",
"inherits": "x64-debug-sycl",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Release"
}
}
]
}
2 changes: 2 additions & 0 deletions bestla/bestla.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ enum class BTLA_DTYPE : uint32_t {
EleBitsMask = 0xff,
EleBitsShift = 0,
EleBitsUndef = 0,
EleBits2 = 2,
EleBits3 = 3,
EleBits4 = 4,
EleBits8 = 8,
Expand Down Expand Up @@ -65,6 +66,7 @@ enum class BTLA_DTYPE : uint32_t {
DQ8_BNB = EleBits8 | TypeFloat | SubType4,
S8 = EleBits8 | TypeInt,
U8 = EleBits8 | TypeInt | SubType1,
S2_CLIP = EleBits2 | TypeInt,
S3_CLIP = EleBits3 | TypeInt,
S4_CLIP = EleBits4 | TypeInt,
F4_E2M1 = EleBits4 | TypeFloat,
Expand Down
47 changes: 24 additions & 23 deletions bestla/bestla_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -259,8 +259,10 @@ class CpuDevice {
if (tmp[3] & (1U << 15)) mHybrid = true;
if (p) printf("!!!Hybrid:%d\t%x\t%x\t%x\t%x!!!\n", mHybrid, tmp[0], tmp[1], tmp[2], tmp[3]);
}
int total_cores = numcores * _cpu.getNumCores(Xbyak::util::IntelCpuTopologyLevel::SmtLevel);
if (total_cores <= 16) mClient = true;
if (mHybrid) {
int total_cores = numcores * _cpu.getNumCores(Xbyak::util::IntelCpuTopologyLevel::SmtLevel);
mClient = true;
std::vector<int> core_type(total_cores), core_id(total_cores), L1(total_cores), L2(total_cores);
std::map<int, int> core_id_count;

Expand Down Expand Up @@ -311,21 +313,14 @@ class CpuDevice {
for (auto& i : SMT_core) printf("%d,", i);
printf("\n");
}
if (!E_core.empty() && !P_core.empty()) {
mHybrid = !(E_core.empty() || P_core.empty()); // in case of bond core by external
if (!E_core.empty()) {
E_L1Cache = L1[E_core[0]];
E_L2Cache = L2[E_core[0]] / 4;
uint32_t scale = SMT_core.empty() ? 1 : 2;
L1Cache = E_L1Cache > L1[P_core[0]] / scale ? L1[P_core[0]] / scale : E_L1Cache;
L2Cache = E_L2Cache > L2[P_core[0]] / scale ? L2[P_core[0]] / scale : E_L2Cache;
} else if (!P_core.empty()) {
uint32_t scale = SMT_core.empty() ? 1 : 2;
L1Cache = L1[P_core[0]] / scale;
L2Cache = L2[P_core[0]] / scale;
mHybrid = false;
} else {
L1Cache = L1[E_core[0]];
L2Cache = L2[E_core[0]] / 4;
mHybrid = false;
};
if (!P_core.empty()) {
L1Cache = L1[P_core[0]];
L2Cache = L2[P_core[0]];
}
}
numcores = static_cast<int>(P_core.size() + E_core.size());
Expand All @@ -345,14 +340,17 @@ class CpuDevice {
case 9: // ALD
PE[int(BTLA_ISA::AVX2)] = 3.0f;
PE[int(BTLA_ISA::AVX_VNNI)] = 5.0f;
PE[int(BTLA_ISA::NoSIMD)] = 3.5f;
break;
case 10: // MTL
PE[int(BTLA_ISA::AVX2)] = 2.2f;
PE[int(BTLA_ISA::AVX_VNNI)] = 3.0f;
PE[int(BTLA_ISA::NoSIMD)] = 3.0f;
break;
case 11: // RPL
PE[int(BTLA_ISA::AVX2)] = 1.8f;
PE[int(BTLA_ISA::AVX_VNNI)] = 2.6f;
PE[int(BTLA_ISA::NoSIMD)] = 3.0f;
break;
}
}
Expand Down Expand Up @@ -442,7 +440,7 @@ class CpuDevice {
CPU_ZERO(&cpuset);
CPU_SET(core, &cpuset);
int s = sched_setaffinity(0, sizeof(cpu_set_t), &cpuset);
if (s != 0) printf("ERROR\n");
if (s != 0) printf("Bond Core ERROR:%d\n", core);
#endif
}

Expand All @@ -456,15 +454,16 @@ class CpuDevice {
CPU_SET(core, &cpuset);
pthread_t pt = thread.native_handle();
int s = pthread_setaffinity_np(pt, sizeof(cpuset), &cpuset);
if (s != 0) printf("ERROR\n");
if (s != 0) printf("Bond Core ERROR:%d\n", core);
#endif
}

bool isHybrid() { return mHybrid; }
bool isClient() { return mClient; }

protected:
uint32_t L2Cache, L1Cache, L3Cache;
bool mHybrid = false;
bool mHybrid = false, mClient = false;
bool mHasAVX2, mHasAVX_VNNI, mHasAVX, mHasAVX512_VNNI, mHasAMX_INT8, mHasAMX_BF16, mHasAVX512F, mHasAVX512_BF16,
mHasAVX512_FP16;
int numcores;
Expand Down Expand Up @@ -492,7 +491,7 @@ class CpuRuntime {

inline void adjustPE(const BTLA_ISA isa, const float PE_) {
// printf("Adjust:%d,%f\n",int(isa),PE_);
PE[int(isa)] *= PE_;
PE[int(isa)] = PE[int(isa)] * PE_ * 0.7 + PE[int(isa)] * 0.3;
}

size_t mL2Cache, mL1Cache, mL2Cache_P = 0, mL1Cache_P = 0, mL2Cache_E = 0, mL1Cache_E = 0;
Expand All @@ -506,7 +505,7 @@ class CpuRuntime {
mL1Cache = _cd->getL1CacheSize();
maxThreads = _cd->getThreads();
mHybrid = false;
if (_cd->isHybrid() && thread > _cd->getPcoreNum()) {
if (_cd->isClient() && thread > _cd->getPcoreNum()) {
if (thread > _cd->getPcoreNum() + _cd->getEcoreNum()) {
mL1Cache_P = mL1Cache / 2;
mL2Cache_P = mL2Cache / 2;
Expand All @@ -518,10 +517,12 @@ class CpuRuntime {
P_core_num = static_cast<int>(_cd->getPcoreNum());
E_core_num = thread - P_core_num;
}
mL1Cache_E = _cd->getL1CacheSize_E();
mL2Cache_E = _cd->getL2CacheSize_E();
mHybrid = true;
memcpy(PE, _cd->getPE(), int(BTLA_ISA::ISA_COUNT) * sizeof(float));
if (_cd->isHybrid()) {
mL1Cache_E = _cd->getL1CacheSize_E();
mL2Cache_E = _cd->getL2CacheSize_E();
mHybrid = true;
memcpy(PE, _cd->getPE(), int(BTLA_ISA::ISA_COUNT) * sizeof(float));
}
}
}
float PE[int(BTLA_ISA::ISA_COUNT)];
Expand Down
42 changes: 24 additions & 18 deletions bestla/bestla_epilogue.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,16 @@ class AccumulatorWriteBack {
using DType = _DST_T;
using Param = ParamAccumulatorWriteBack<DType>;

BTLA_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
const int N, const Param& _param, void* tmpcache, size_t cachesize) {
static BTLA_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset,
const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
auto COffset = M_offset * _param.ldc + N_offset;
auto cptr = _param.C + COffset;
if constexpr (std::is_same_v<_SRC_T, DType>) {
if (cacheptr == cptr) {
return BTLA_CODE::Success;
}
}

return kernel::wrapper::Memcpy2D::template forward<ISA_T, SType, DType>(cacheptr, cptr, M, N, cachestep, _param.ldc,
_param.elt_const_v);
}
Expand All @@ -50,8 +56,8 @@ template <BTLA_ISA ISA_T, typename _SRC_T, typename _DST_T, BTLA_ELTWISEOP _OP>
class CustomAccumulatorWriteBackWithEltop {
public:
using Param = ParamAccumulatorWriteBack<_DST_T>;
BTLA_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
const int N, const Param& _param, void* tmpcache, size_t cachesize) {
static BTLA_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset,
const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
auto COffset = M_offset * _param.ldc + N_offset;
auto cptr = _param.C + COffset;
if constexpr (std::is_same<_SRC_T, float>::value && std::is_same<_DST_T, float>::value) {
Expand Down Expand Up @@ -95,8 +101,8 @@ class AlphaBetaProcessFp32 {
public:
using Param = ParamAlphaBetaProcess<float>;

BTLA_CODE forward(const float* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
const int N, const Param& _param, void* tmpcache, size_t cachesize) {
static BTLA_CODE forward(const float* cacheptr, const int cachestep, const int M_offset, const int N_offset,
const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
auto DOffset = M_offset * _param.ldd + N_offset;
auto COffset = M_offset * _param.ldc + N_offset;
auto cptr = _param.C + COffset;
Expand All @@ -118,9 +124,9 @@ template <BTLA_ISA ISA_T>
class CompFp32BlockEpilogue {
public:
using Param = ParamCompFp32BlockEpilogue;
BTLA_CODE forward(const float* srcptr, float* dstptr, const int cachestep, const int M_offset, const int N_offset,
const int K_offset, const int M, const int N, const Param& _param, void* tmpcache,
size_t cachesize) {
static BTLA_CODE forward(const float* srcptr, float* dstptr, const int cachestep, const int M_offset,
const int N_offset, const int K_offset, const int M, const int N, const Param& _param,
void* tmpcache, size_t cachesize) {
auto ret = BTLA_CODE::NotSupport;
if (_param.scaledtype == BTLA_DTYPE::F32) {
ret = kernel::wrapper::CompFp32BlockScale::template forward<ISA_T>(
Expand Down Expand Up @@ -169,8 +175,8 @@ template <BTLA_ISA ISA_T>
class DequantInt32ToFp32 {
public:
using Param = ParamDequantInt32ToFp32;
BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
const int N, const Param& _param, void* tmpcache, size_t cachesize) {
static BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset,
const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
auto COffset = M_offset * _param.ldc + N_offset;
auto cptr = _param.C + COffset;
return kernel::wrapper::DequanS32Fp32::template forward<ISA_T>(cacheptr, cachestep, cptr, _param.ldc, M, N,
Expand Down Expand Up @@ -198,9 +204,9 @@ template <BTLA_ISA ISA_T>
class CompInt8BlockEpilogue {
public:
using Param = ParamCompInt8BlockEpilogue;
BTLA_CODE forward(const int32_t* srcptr, float* dstptr, const int cachestep, const int M_offset, const int N_offset,
const int K_offset, const int M, const int N, const Param& _param, void* tmpcache,
size_t cachesize) {
static BTLA_CODE forward(const int32_t* srcptr, float* dstptr, const int cachestep, const int M_offset,
const int N_offset, const int K_offset, const int M, const int N, const Param& _param,
void* tmpcache, size_t cachesize) {
BTLA_CODE ret = BTLA_CODE::NotSupport;
float* scab = nullptr;
size_t ScaleBTmpSize = N * sizeof(float);
Expand Down Expand Up @@ -280,8 +286,8 @@ template <BTLA_ISA ISA_T>
class ZpDequantInt32ToFp32 {
public:
using Param = ParamZpDequantInt32ToFp32;
BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
const int N, const Param& _param, void* tmpcache, size_t cachesize) {
static BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset,
const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
auto COffset = M_offset * _param.ldc + N_offset;
auto cptr = _param.C + COffset;
auto ret = kernel::wrapper::DequanS32Fp32::template forward<ISA_T>(cacheptr, cachestep, cptr, _param.ldc, M, N,
Expand Down Expand Up @@ -321,8 +327,8 @@ template <BTLA_ISA ISA_T>
class AlphaBetaProcessS32U8 {
public:
using Param = ParamAlphaBetaProcessS32U8;
BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
const int N, const Param& _param, void* tmpcache, size_t cachesize) {
static BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset,
const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
auto COffset = M_offset * _param.ldc + N_offset;
auto cptr = _param.C + COffset;
return kernel::wrapper::QuanOutS32U32::template forward<ISA_T>(_param.alpha, cacheptr, cachestep, cptr, _param.ldc,
Expand Down
Loading

0 comments on commit 2f79436

Please sign in to comment.