syna main: cd6fc32 (#248)

intel · May 8, 2024 · 2f79436 · 2f79436
1 parent 3bdc76d
commit 2f79436
Show file tree

Hide file tree

Showing 38 changed files with 13,177 additions and 1,780 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -5,6 +5,7 @@ file(GLOB headers ${PROJECT_NAME}/*.h ${PROJECT_NAME}/*.hpp)
 file(GLOB xbyak_headers ${PROJECT_NAME}/xbyak/*.h ${PROJECT_NAME}/xbyak/*.hpp)
 
 option(BTLA_ENABLE_OPENMP "Compile OpenMP thread pool if OMP can be found" OFF)
+option(BTLA_SYCL "Enable the SYCL backend (includes cmake/sycl.cmake and links IntelSYCL::SYCL_CXX)" OFF)
 
 option(BTLA_UT_ALL "Enable all unit tests" OFF)
 option(BTLA_UT_DEBUG "Enable debug unit tests" OFF)
@@ -21,14 +22,27 @@ option(BTLA_UT_NOASAN "Disable sanitize" OFF)
 option(BTLA_UT_BENCHMARK "Benchmark ON may take a long time to finish all tests" OFF)
 option(BTLA_UT_OPENMP "Use OpenMP for UT tests" OFF)
 
+
+
+
+
 add_library(${PROJECT_NAME} INTERFACE)
 add_library(neural_speed::${PROJECT_NAME} ALIAS ${PROJECT_NAME})
 target_include_directories(
 	${PROJECT_NAME} INTERFACE
 	"$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>"
 	"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>"
 )
-
+set(sycl_headers)  # start empty so ${sycl_headers} expands to nothing when BTLA_SYCL is OFF
+set(sycl_libs)  # start empty so ${sycl_libs} expands to nothing when BTLA_SYCL is OFF
+if(BTLA_SYCL)
+  include(cmake/sycl.cmake)  # project-local module; presumably provides IntelSYCL::SYCL_CXX -- TODO confirm
+  file(GLOB sycl_headers ${PROJECT_NAME}/sycl/*.h ${PROJECT_NAME}/sycl/*.hpp)
+  add_compile_definitions(BTLA_SYCL)  # directory-scoped definition, not tied to a single target
+  list(APPEND sycl_libs IntelSYCL::SYCL_CXX)  # later linked into the UT and benchmark executables
+  add_compile_options(-march=native)  # directory-scoped; NOTE(review): presumably host-CPU specific -- confirm intended
+  #add_link_options(-fsycl-targets=spir64 -Xsycl-target-backend "-options -ze-opt-large-register-file")
+endif(BTLA_SYCL)
 
 if(BTLA_ENABLE_OPENMP)
   message(STATUS "BesTLA enable OpenMP ThreadPool")
@@ -69,17 +83,25 @@ function(add_ut_flag UT_OPTION)
 	endif()
 endfunction()
 
+set(benchmark_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/bestla_benchmark.cpp)
+# list(APPEND benchmark_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/sycl_benchmark.cpp)
+
+
 if(UT_BUILD)
 	file(GLOB srcs ${PROJECT_NAME}/ut/*.cc ${PROJECT_NAME}/ut/*.cpp) #compile everything even run parts of UTs
-    list(REMOVE_ITEM srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/bestla_benchmark.cpp)
+  file(GLOB sycl_srcs ${PROJECT_NAME}/ut/sycl*)
+  if(NOT BTLA_SYCL)
+    list(REMOVE_ITEM srcs ${sycl_srcs})
+  endif()
+  list(REMOVE_ITEM srcs ${benchmark_srcs})
 	file(GLOB ut_headers ${PROJECT_NAME}/ut/*.h)
-    include_directories(${PROJECT_NAME})
-	add_executable(${PROJECT_NAME}_ut ${srcs} ${headers} ${ut_headers})
-    if(BTLA_UT_OPENMP)
-      include(FindOpenMP)
-      target_compile_definitions(${PROJECT_NAME} INTERFACE BTLA_USE_OPENMP)
-      target_link_libraries(${PROJECT_NAME}_ut PRIVATE OpenMP::OpenMP_CXX)
-    endif()
+  include_directories(${PROJECT_NAME})
+	add_executable(${PROJECT_NAME}_ut ${srcs} ${headers} ${sycl_headers} ${ut_headers})
+  if(BTLA_UT_OPENMP)
+    include(FindOpenMP)
+    target_compile_definitions(${PROJECT_NAME} INTERFACE BTLA_USE_OPENMP)
+    target_link_libraries(${PROJECT_NAME}_ut PRIVATE OpenMP::OpenMP_CXX)
+  endif()
 	if(NOT WIN32)
 		if(NOT BTLA_UT_NOASAN)
 		  target_compile_options(${PROJECT_NAME}_ut PRIVATE -fsanitize=address)
@@ -98,14 +120,16 @@ if(UT_BUILD)
 	add_ut_flag(BTLA_UT_KERNEL_INTRIN)
 	add_ut_flag(BTLA_UT_KERNEL_JIT)
 	add_ut_flag(BTLA_UT_KERNEL_WRAPPER)
-	target_link_libraries(${PROJECT_NAME}_ut PRIVATE ${PROJECT_NAME})
+  if(BTLA_SYCL)
+    add_compile_definitions(BTLA_UT_SYCL)
+  endif()
+	target_link_libraries(${PROJECT_NAME}_ut PRIVATE ${PROJECT_NAME} ${sycl_libs})
 endif(UT_BUILD)
 
 if(BTLA_UT_BENCHMARK)
-  file(GLOB srcs ${PROJECT_NAME}/ut/bestla_benchmark.cpp) #compile everything even run parts of UTs
   file(GLOB ut_headers ${PROJECT_NAME}/ut/*.h)
   include_directories(${PROJECT_NAME})
-	add_executable(${PROJECT_NAME}_benchmark ${srcs} ${headers} ${ut_headers})
+	add_executable(${PROJECT_NAME}_benchmark ${benchmark_srcs} ${headers} ${ut_headers})
   if(BTLA_UT_OPENMP)
     include(FindOpenMP)
     target_compile_definitions(${PROJECT_NAME} INTERFACE BTLA_USE_OPENMP)
@@ -114,5 +138,5 @@ if(BTLA_UT_BENCHMARK)
   if(NOT WIN32)
 		target_link_options(${PROJECT_NAME}_benchmark PRIVATE -lpthread)
 	endif()
-  target_link_libraries(${PROJECT_NAME}_benchmark PRIVATE ${PROJECT_NAME})
+  target_link_libraries(${PROJECT_NAME}_benchmark PRIVATE ${PROJECT_NAME} ${sycl_libs})
 endif(BTLA_UT_BENCHMARK)
diff --git a/CMakePresets.json b/CMakePresets.json
@@ -83,6 +83,34 @@
       "description": "Target Windows (64-bit) with the Visual Studio development environment. (RelWithDebInfo)",
       "inherits": "x64-release",
       "cacheVariables": { "BTLA_UT_ALL": "ON" }
+    },
+    {
+      "name": "x64-debug-sycl",
+      "displayName": "x64 Debug SYCL",
+      "description": "x64 Debug SYCL",
+      "inherits": "windows-base",
+      "architecture": {
+        "value": "x64",
+        "strategy": "external"
+      },
+      "cacheVariables": {
+        "CMAKE_BUILD_TYPE": "Debug",
+        "BTLA_UT_DEBUG": "ON",
+        "BTLA_UT_ALL": "OFF",
+        "BTLA_SYCL": "ON",
+        "BTLA_UT_BENCHMARK": "ON",
+        "CMAKE_CXX_COMPILER": "icx",
+        "CMAKE_C_COMPILER": "icx"
+      }
+    },
+    {
+      "name": "x64-release-sycl",
+      "displayName": "x64 Release for SYCL",
+      "description": "x64 SYCL",
+      "inherits": "x64-debug-sycl",
+      "cacheVariables": {
+        "CMAKE_BUILD_TYPE": "Release"
+      }
     }
   ]
 }
diff --git a/bestla/bestla.h b/bestla/bestla.h
@@ -37,6 +37,7 @@ enum class BTLA_DTYPE : uint32_t {
   EleBitsMask = 0xff,
   EleBitsShift = 0,
   EleBitsUndef = 0,
+  EleBits2 = 2,
   EleBits3 = 3,
   EleBits4 = 4,
   EleBits8 = 8,
@@ -65,6 +66,7 @@ enum class BTLA_DTYPE : uint32_t {
   DQ8_BNB = EleBits8 | TypeFloat | SubType4,
   S8 = EleBits8 | TypeInt,
   U8 = EleBits8 | TypeInt | SubType1,
+  S2_CLIP = EleBits2 | TypeInt,
   S3_CLIP = EleBits3 | TypeInt,
   S4_CLIP = EleBits4 | TypeInt,
   F4_E2M1 = EleBits4 | TypeFloat,

diff --git a/bestla/bestla_device.h b/bestla/bestla_device.h
@@ -259,8 +259,10 @@ class CpuDevice {
       if (tmp[3] & (1U << 15)) mHybrid = true;
       if (p) printf("!!!Hybrid:%d\t%x\t%x\t%x\t%x!!!\n", mHybrid, tmp[0], tmp[1], tmp[2], tmp[3]);
     }
+    int total_cores = numcores * _cpu.getNumCores(Xbyak::util::IntelCpuTopologyLevel::SmtLevel);
+    if (total_cores <= 16) mClient = true;
     if (mHybrid) {
-      int total_cores = numcores * _cpu.getNumCores(Xbyak::util::IntelCpuTopologyLevel::SmtLevel);
+      mClient = true;
       std::vector<int> core_type(total_cores), core_id(total_cores), L1(total_cores), L2(total_cores);
       std::map<int, int> core_id_count;
 
@@ -311,21 +313,14 @@ class CpuDevice {
           for (auto& i : SMT_core) printf("%d,", i);
           printf("\n");
         }
-        if (!E_core.empty() && !P_core.empty()) {
+        mHybrid = !(E_core.empty() || P_core.empty());  // one core type may be missing if cores were bound externally
+        if (!E_core.empty()) {
           E_L1Cache = L1[E_core[0]];
           E_L2Cache = L2[E_core[0]] / 4;
-          uint32_t scale = SMT_core.empty() ? 1 : 2;
-          L1Cache = E_L1Cache > L1[P_core[0]] / scale ? L1[P_core[0]] / scale : E_L1Cache;
-          L2Cache = E_L2Cache > L2[P_core[0]] / scale ? L2[P_core[0]] / scale : E_L2Cache;
-        } else if (!P_core.empty()) {
-          uint32_t scale = SMT_core.empty() ? 1 : 2;
-          L1Cache = L1[P_core[0]] / scale;
-          L2Cache = L2[P_core[0]] / scale;
-          mHybrid = false;
-        } else {
-          L1Cache = L1[E_core[0]];
-          L2Cache = L2[E_core[0]] / 4;
-          mHybrid = false;
+        };
+        if (!P_core.empty()) {
+          L1Cache = L1[P_core[0]];
+          L2Cache = L2[P_core[0]];
         }
       }
       numcores = static_cast<int>(P_core.size() + E_core.size());
@@ -345,14 +340,17 @@ class CpuDevice {
               case 9:  // ALD
                 PE[int(BTLA_ISA::AVX2)] = 3.0f;
                 PE[int(BTLA_ISA::AVX_VNNI)] = 5.0f;
+                PE[int(BTLA_ISA::NoSIMD)] = 3.5f;
                 break;
               case 10:  // MTL
                 PE[int(BTLA_ISA::AVX2)] = 2.2f;
                 PE[int(BTLA_ISA::AVX_VNNI)] = 3.0f;
+                PE[int(BTLA_ISA::NoSIMD)] = 3.0f;
                 break;
               case 11:  // RPL
                 PE[int(BTLA_ISA::AVX2)] = 1.8f;
                 PE[int(BTLA_ISA::AVX_VNNI)] = 2.6f;
+                PE[int(BTLA_ISA::NoSIMD)] = 3.0f;
                 break;
             }
         }
@@ -442,7 +440,7 @@ class CpuDevice {
     CPU_ZERO(&cpuset);
     CPU_SET(core, &cpuset);
     int s = sched_setaffinity(0, sizeof(cpu_set_t), &cpuset);
-    if (s != 0) printf("ERROR\n");
+    if (s != 0) printf("Bond Core ERROR:%d\n", core);
 #endif
   }
 
@@ -456,15 +454,16 @@ class CpuDevice {
     CPU_SET(core, &cpuset);
     pthread_t pt = thread.native_handle();
     int s = pthread_setaffinity_np(pt, sizeof(cpuset), &cpuset);
-    if (s != 0) printf("ERROR\n");
+    if (s != 0) printf("Bond Core ERROR:%d\n", core);
 #endif
   }
 
   bool isHybrid() { return mHybrid; }  // reports the mHybrid flag computed during CPU detection
+  bool isClient() { return mClient; }  // mClient is set when total_cores <= 16 or the hybrid flag is detected
 
  protected:
   uint32_t L2Cache, L1Cache, L3Cache;
-  bool mHybrid = false;
+  bool mHybrid = false, mClient = false;
   bool mHasAVX2, mHasAVX_VNNI, mHasAVX, mHasAVX512_VNNI, mHasAMX_INT8, mHasAMX_BF16, mHasAVX512F, mHasAVX512_BF16,
       mHasAVX512_FP16;
   int numcores;
@@ -492,7 +491,7 @@ class CpuRuntime {
 
   inline void adjustPE(const BTLA_ISA isa, const float PE_) {
     // printf("Adjust:%d,%f\n",int(isa),PE_);
-    PE[int(isa)] *= PE_;
+    PE[int(isa)] = PE[int(isa)] * PE_ * 0.7 + PE[int(isa)] * 0.3;  // damped update: algebraically PE * (0.7 * PE_ + 0.3)
   }
 
   size_t mL2Cache, mL1Cache, mL2Cache_P = 0, mL1Cache_P = 0, mL2Cache_E = 0, mL1Cache_E = 0;
@@ -506,7 +505,7 @@ class CpuRuntime {
     mL1Cache = _cd->getL1CacheSize();
     maxThreads = _cd->getThreads();
     mHybrid = false;
-    if (_cd->isHybrid() && thread > _cd->getPcoreNum()) {
+    if (_cd->isClient() && thread > _cd->getPcoreNum()) {
       if (thread > _cd->getPcoreNum() + _cd->getEcoreNum()) {
         mL1Cache_P = mL1Cache / 2;
         mL2Cache_P = mL2Cache / 2;
@@ -518,10 +517,12 @@ class CpuRuntime {
         P_core_num = static_cast<int>(_cd->getPcoreNum());
         E_core_num = thread - P_core_num;
       }
-      mL1Cache_E = _cd->getL1CacheSize_E();
-      mL2Cache_E = _cd->getL2CacheSize_E();
-      mHybrid = true;
-      memcpy(PE, _cd->getPE(), int(BTLA_ISA::ISA_COUNT) * sizeof(float));
+      if (_cd->isHybrid()) {
+        mL1Cache_E = _cd->getL1CacheSize_E();
+        mL2Cache_E = _cd->getL2CacheSize_E();
+        mHybrid = true;
+        memcpy(PE, _cd->getPE(), int(BTLA_ISA::ISA_COUNT) * sizeof(float));
+      }
     }
   }
   float PE[int(BTLA_ISA::ISA_COUNT)];

diff --git a/bestla/bestla_epilogue.h b/bestla/bestla_epilogue.h
@@ -37,10 +37,16 @@ class AccumulatorWriteBack {
   using DType = _DST_T;
   using Param = ParamAccumulatorWriteBack<DType>;
 
-  BTLA_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
-                    const int N, const Param& _param, void* tmpcache, size_t cachesize) {
+  static BTLA_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset,
+                           const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
     auto COffset = M_offset * _param.ldc + N_offset;
     auto cptr = _param.C + COffset;
+    if constexpr (std::is_same_v<_SRC_T, DType>) {
+      if (cacheptr == cptr) {  // source pointer already equals the destination in C: skip the copy
+        return BTLA_CODE::Success;
+      }
+    }
+
     return kernel::wrapper::Memcpy2D::template forward<ISA_T, SType, DType>(cacheptr, cptr, M, N, cachestep, _param.ldc,
                                                                             _param.elt_const_v);
   }
@@ -50,8 +56,8 @@ template <BTLA_ISA ISA_T, typename _SRC_T, typename _DST_T, BTLA_ELTWISEOP _OP>
 class CustomAccumulatorWriteBackWithEltop {
  public:
   using Param = ParamAccumulatorWriteBack<_DST_T>;
-  BTLA_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
-                    const int N, const Param& _param, void* tmpcache, size_t cachesize) {
+  static BTLA_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset,
+                           const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
     auto COffset = M_offset * _param.ldc + N_offset;
     auto cptr = _param.C + COffset;
     if constexpr (std::is_same<_SRC_T, float>::value && std::is_same<_DST_T, float>::value) {
@@ -95,8 +101,8 @@ class AlphaBetaProcessFp32 {
  public:
   using Param = ParamAlphaBetaProcess<float>;
 
-  BTLA_CODE forward(const float* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
-                    const int N, const Param& _param, void* tmpcache, size_t cachesize) {
+  static BTLA_CODE forward(const float* cacheptr, const int cachestep, const int M_offset, const int N_offset,
+                           const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
     auto DOffset = M_offset * _param.ldd + N_offset;
     auto COffset = M_offset * _param.ldc + N_offset;
     auto cptr = _param.C + COffset;
@@ -118,9 +124,9 @@ template <BTLA_ISA ISA_T>
 class CompFp32BlockEpilogue {
  public:
   using Param = ParamCompFp32BlockEpilogue;
-  BTLA_CODE forward(const float* srcptr, float* dstptr, const int cachestep, const int M_offset, const int N_offset,
-                    const int K_offset, const int M, const int N, const Param& _param, void* tmpcache,
-                    size_t cachesize) {
+  static BTLA_CODE forward(const float* srcptr, float* dstptr, const int cachestep, const int M_offset,
+                           const int N_offset, const int K_offset, const int M, const int N, const Param& _param,
+                           void* tmpcache, size_t cachesize) {
     auto ret = BTLA_CODE::NotSupport;
     if (_param.scaledtype == BTLA_DTYPE::F32) {
       ret = kernel::wrapper::CompFp32BlockScale::template forward<ISA_T>(
@@ -169,8 +175,8 @@ template <BTLA_ISA ISA_T>
 class DequantInt32ToFp32 {
  public:
   using Param = ParamDequantInt32ToFp32;
-  BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
-                    const int N, const Param& _param, void* tmpcache, size_t cachesize) {
+  static BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset,
+                           const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
     auto COffset = M_offset * _param.ldc + N_offset;
     auto cptr = _param.C + COffset;
     return kernel::wrapper::DequanS32Fp32::template forward<ISA_T>(cacheptr, cachestep, cptr, _param.ldc, M, N,
@@ -198,9 +204,9 @@ template <BTLA_ISA ISA_T>
 class CompInt8BlockEpilogue {
  public:
   using Param = ParamCompInt8BlockEpilogue;
-  BTLA_CODE forward(const int32_t* srcptr, float* dstptr, const int cachestep, const int M_offset, const int N_offset,
-                    const int K_offset, const int M, const int N, const Param& _param, void* tmpcache,
-                    size_t cachesize) {
+  static BTLA_CODE forward(const int32_t* srcptr, float* dstptr, const int cachestep, const int M_offset,
+                           const int N_offset, const int K_offset, const int M, const int N, const Param& _param,
+                           void* tmpcache, size_t cachesize) {
     BTLA_CODE ret = BTLA_CODE::NotSupport;
     float* scab = nullptr;
     size_t ScaleBTmpSize = N * sizeof(float);
@@ -280,8 +286,8 @@ template <BTLA_ISA ISA_T>
 class ZpDequantInt32ToFp32 {
  public:
   using Param = ParamZpDequantInt32ToFp32;
-  BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
-                    const int N, const Param& _param, void* tmpcache, size_t cachesize) {
+  static BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset,
+                           const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
     auto COffset = M_offset * _param.ldc + N_offset;
     auto cptr = _param.C + COffset;
     auto ret = kernel::wrapper::DequanS32Fp32::template forward<ISA_T>(cacheptr, cachestep, cptr, _param.ldc, M, N,
@@ -321,8 +327,8 @@ template <BTLA_ISA ISA_T>
 class AlphaBetaProcessS32U8 {
  public:
   using Param = ParamAlphaBetaProcessS32U8;
-  BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
-                    const int N, const Param& _param, void* tmpcache, size_t cachesize) {
+  static BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset,
+                           const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
     auto COffset = M_offset * _param.ldc + N_offset;
     auto cptr = _param.C + COffset;
     return kernel::wrapper::QuanOutS32U32::template forward<ISA_T>(_param.alpha, cacheptr, cachestep, cptr, _param.ldc,