This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

[BesTLA] Refactor quantization-related kernels #209

Merged
merged 111 commits on May 7, 2024
Changes from all commits
111 commits
f99d29c
add intrinsic for s4_clip
luoyu-intel Apr 8, 2024
be59a64
remove unstable time data
luoyu-intel Apr 8, 2024
7b11d14
add q4fp32 kernel
luoyu-intel Apr 9, 2024
f878c86
add avx2 version of u8s8
luoyu-intel Apr 9, 2024
65e3e96
add hybrid support
luoyu-intel Apr 9, 2024
30a2286
use hybrid scheduler for ggml and fp32 kernels.
luoyu-intel Apr 9, 2024
16563a9
fix typo
luoyu-intel Apr 9, 2024
86101a5
fix err
luoyu-intel Apr 9, 2024
1deff17
debug overflow of non-vnni instruction
luoyu-intel Apr 11, 2024
7d4ab6d
add ref for gemv
luoyu-intel Apr 12, 2024
6f341a3
pass UT
luoyu-intel Apr 15, 2024
1d1b97e
add S8S8S32 and S8S8Fp32 for AVX_VNNI
luoyu-intel Apr 15, 2024
32731bd
add benchmark S8S8 is 50% of U8S8
luoyu-intel Apr 15, 2024
3096f72
add benchmark and model test
luoyu-intel Apr 15, 2024
6ad88ed
model check
luoyu-intel Apr 15, 2024
a9b7fcf
disable dynamic PE ratio
luoyu-intel Apr 15, 2024
bcbda2b
add s8s8 code and benchmark case
luoyu-intel Apr 16, 2024
7e24a80
add s3 weight ref
luoyu-intel Apr 16, 2024
d1a87f8
add avx2 s3 gemv
luoyu-intel Apr 16, 2024
1c3bfde
add s3 benchmark
luoyu-intel Apr 16, 2024
f382578
use JIT kernel
luoyu-intel Apr 16, 2024
38c2c38
compile on gcc
luoyu-intel Apr 16, 2024
0014866
add blend filter for dynamic PE
luoyu-intel Apr 17, 2024
4410425
for more stable result
luoyu-intel Apr 17, 2024
0e62b4a
use optimized threading as default
luoyu-intel Apr 17, 2024
88715bf
speed up int3 gemv
luoyu-intel Apr 17, 2024
2933321
clang-format
luoyu-intel Apr 17, 2024
3a05e0d
remove alignas
luoyu-intel Apr 17, 2024
d471b71
use multi-threading for ROPE
luoyu-intel Apr 18, 2024
386aa89
add int2 gemv ref; add avx2 unpack 2bit
luoyu-intel Apr 18, 2024
dd82a24
add avx2 gemv for int2
luoyu-intel Apr 18, 2024
acbde0d
add blocksize=16 case
luoyu-intel Apr 18, 2024
2e3812c
use vec register instead of general register
luoyu-intel Apr 18, 2024
436d436
protect asym weight
luoyu-intel Apr 18, 2024
96fe7d1
fix s8s8 avx_vnni code
luoyu-intel Apr 19, 2024
ab76539
add zero point support for MatB
luoyu-intel Apr 19, 2024
a47db04
add manual unroll for sgemv
luoyu-intel Apr 19, 2024
a3aa32a
nbits quantization from high bits to low bits. support asym quant of s2
luoyu-intel Apr 23, 2024
36a282d
add gemv ut for all 4bit and 2bit functions
luoyu-intel Apr 23, 2024
4f22289
complete avx2: s4->s8 packrow=1,2,4; s4->fp(f32,bf16) packrow=1,2,4
luoyu-intel Apr 24, 2024
77695be
add bf16 UTs
luoyu-intel Apr 24, 2024
9ac8aa1
test all AVX2 int4: sym&asym, comp_fp32&comp_int8, packrow=1,2,4
luoyu-intel Apr 25, 2024
1a62444
split s4_fp code
luoyu-intel Apr 25, 2024
739a101
add AVX2 sgemv
luoyu-intel Apr 25, 2024
20c7e5a
test all 4bit AVX2 combinations
luoyu-intel Apr 25, 2024
8ac5da8
support MTILE for int4 gemv
luoyu-intel Apr 25, 2024
a22cd2a
support MTILE for int2 gemv
luoyu-intel Apr 25, 2024
4ccc3ea
fix perf of comp_fp32
luoyu-intel Apr 25, 2024
74f7c82
add ref of gemv_2bit_fp32_fp32, add new 3bit kernel
luoyu-intel Apr 26, 2024
b9a2137
complete int2 decompress kernels
luoyu-intel Apr 26, 2024
71c24b0
sync 2bit and 4bit unpack functions
luoyu-intel Apr 26, 2024
feea801
finish all int3 gemv kernels
luoyu-intel Apr 26, 2024
628b983
add s8s8 3bit gemv UT
luoyu-intel Apr 26, 2024
2478de4
fully test int3 weight: group=32,128, comp=fp32,int8 wrapper:gemm and…
luoyu-intel Apr 26, 2024
c85b0e3
add DecompressKBlockS3Fp avx2
luoyu-intel Apr 28, 2024
a427bef
test int2 and int3 with comp_fp32 and comp_int8
luoyu-intel Apr 28, 2024
4182499
speedup int3 int2 with comp_fp32
luoyu-intel Apr 28, 2024
8af7a4a
fix debug code
luoyu-intel Apr 28, 2024
67bb716
prevent compiling from unsupported template
luoyu-intel Apr 28, 2024
f87dc92
enable asym quantization. test llama2 model for group=32, weight_dtyp…
luoyu-intel Apr 28, 2024
112eaa5
remove LauncherKBlock
luoyu-intel Apr 28, 2024
13de7ed
remove all LauncherKBlock
luoyu-intel Apr 28, 2024
7588eec
sync s8 weight's compression and decompression functions
luoyu-intel Apr 28, 2024
ae2f6ac
remove gemv_s* functions
luoyu-intel Apr 28, 2024
5daaf4b
add AVX2_VNNI
luoyu-intel Apr 28, 2024
4be8411
add kblock avx2vnni
luoyu-intel Apr 28, 2024
39e3c76
test all gemm cases with AVX2_VNNI
luoyu-intel Apr 28, 2024
6d381eb
use the correct ISA
luoyu-intel Apr 28, 2024
afa9e05
add AVX2_VNNI to gemv dispatcher
luoyu-intel Apr 28, 2024
5e109fa
remove code
luoyu-intel Apr 28, 2024
58eb244
benchmark int4 with AVX2_VNNI
luoyu-intel Apr 28, 2024
a3943e9
support AVX2_VNNI for model quantization and inference
luoyu-intel Apr 28, 2024
8e46f80
add avx512 s4s8
luoyu-intel Apr 28, 2024
e35d446
add s8fp
luoyu-intel Apr 28, 2024
067cb13
pass avx512+int4 UTs
luoyu-intel Apr 28, 2024
733dcec
fix compile errors with GCC
luoyu-intel Apr 29, 2024
f3a067c
remove deprecated functions and wrappers
luoyu-intel Apr 29, 2024
91f0e68
remove unused UT case
luoyu-intel Apr 29, 2024
c0ae10b
enable gemv for amx_int8
luoyu-intel Apr 29, 2024
22182d2
remove static_assert
luoyu-intel Apr 29, 2024
4a60f29
disable blocksize=32 for amx_int8
luoyu-intel Apr 29, 2024
c02ef5c
add avx512: s2s8, s2fp
luoyu-intel Apr 29, 2024
b9a55f0
full test of all avx512 int2
luoyu-intel Apr 29, 2024
c7b81c0
add avx512 s2 benchmark
luoyu-intel Apr 29, 2024
58284ce
support avx512 int2
luoyu-intel Apr 29, 2024
58ee0e3
fix compile error
luoyu-intel Apr 29, 2024
17e913d
add 3bit sgemv avx512
luoyu-intel Apr 29, 2024
77d7cf5
enable all UTs
luoyu-intel Apr 29, 2024
a6053ed
fix UT error
luoyu-intel Apr 29, 2024
4677e7a
pop vnni flags
luoyu-intel Apr 29, 2024
14a397e
use correct avx2 epi32_epi16
luoyu-intel Apr 29, 2024
7504550
use omp and std at the same time
luoyu-intel Apr 29, 2024
c11c16a
clang-format
luoyu-intel Apr 29, 2024
6b9fa7c
check compiler before enabling gemv
luoyu-intel Apr 30, 2024
10a1e57
correct assert condition
luoyu-intel Apr 30, 2024
ce15390
help compiler a little
luoyu-intel Apr 30, 2024
859bcb9
set VS2022
luoyu-intel Apr 30, 2024
5f375fa
fix condition
luoyu-intel Apr 30, 2024
b1557e8
clang-format
luoyu-intel Apr 30, 2024
842babe
compiled with dpcpp
luoyu-intel Apr 30, 2024
17590eb
optimize on 1185g7
luoyu-intel Apr 30, 2024
177488c
add avx2_vnni for int3&int2
luoyu-intel Apr 30, 2024
3096862
clang-format
luoyu-intel Apr 30, 2024
a2de1e7
fix code errors
luoyu-intel Apr 30, 2024
a861d4e
compile with gcc9
luoyu-intel Apr 30, 2024
7dc4dd8
revert rope parallel
luoyu-intel Apr 30, 2024
23f3b0a
refactor quantization data process in python
luoyu-intel Apr 30, 2024
7bb4912
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 30, 2024
ea84cdb
add mtile for epilogue in gemv
luoyu-intel Apr 30, 2024
32a116b
clang-format
luoyu-intel Apr 30, 2024
00be66e
Revert "revert rope parallel"
luoyu-intel Apr 30, 2024
36 changes: 29 additions & 7 deletions CMakePresets.json
@@ -8,7 +8,13 @@
"generator": "Ninja",
"binaryDir": "${sourceDir}/out/build/${presetName}",
"installDir": "${sourceDir}/out/install/${presetName}",
"cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" },
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Debug",
"NS_PROFILING": "ON",
"NS_USE_OMP": "ON",
"BTLA_UT_DEBUG": "ON",
"BTLA_UT_BENCHMARK": "ON"
},
"condition": {
"type": "equals",
"lhs": "${hostSystemName}",
@@ -107,16 +113,32 @@
"BTLA_UT_OPENMP": "OFF"
}
},
{
"name": "x64-debug-sycl",
"displayName": "x64 Debug SYCL",
"description": "x64 Debug SYCL",
"inherits": "windows-base",
"architecture": {
"value": "x64",
"strategy": "external"
},
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Debug",
"BTLA_UT_DEBUG": "ON",
"BTLA_UT_ALL": "OFF",
"BTLA_SYCL": "ON",
"BTLA_UT_BENCHMARK": "ON",
"CMAKE_CXX_COMPILER": "icx",
"CMAKE_C_COMPILER": "icx"
}
},
{
"name": "x64-release-sycl",
"displayName": "x64 Release SYCL",
"displayName": "x64 Release for SYCL",
"description": "x64 SYCL",
"inherits": "x64-debug",
"inherits": "x64-debug-sycl",
"cacheVariables": {
"CMAKE_CXX_COMPILER": "icx-cl",
"CMAKE_C_COMPILER": "icx-cl",
"CMAKE_BUILD_TYPE": "Release",
"BTLA_UT_ALL": "ON"
"CMAKE_BUILD_TYPE": "Release"
}
}
]
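The updated presets wire the debug configuration to the flags the unit tests and benchmarks expect (NS_PROFILING, NS_USE_OMP, BTLA_UT_DEBUG, BTLA_UT_BENCHMARK) and split SYCL into its own x64-debug-sycl / x64-release-sycl pair built with the icx compilers. Assuming CMake 3.20+ with presets support and the Intel oneAPI compilers on PATH, a build can be configured with "cmake --preset x64-debug-sycl" (or x64-release-sycl) from the repository root, which applies the cache variables above automatically; this is the standard CMake presets workflow, not something specific to this PR.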
1 change: 1 addition & 0 deletions bestla/CMakeLists.txt
@@ -40,6 +40,7 @@ if(BTLA_SYCL)
file(GLOB sycl_headers ${PROJECT_NAME}/sycl/*.h ${PROJECT_NAME}/sycl/*.hpp)
add_compile_definitions(BTLA_SYCL)
list(APPEND sycl_libs IntelSYCL::SYCL_CXX)
add_compile_options(-march=native)
zhewang1-intc marked this conversation as resolved.
#add_link_options(-fsycl-targets=spir64 -Xsycl-target-backend "-options -ze-opt-large-register-file")
endif(BTLA_SYCL)

7 changes: 5 additions & 2 deletions bestla/bestla/bestla_device.h
@@ -340,14 +340,17 @@ class CpuDevice {
case 9: // ALD
PE[int(BTLA_ISA::AVX2)] = 3.0f;
PE[int(BTLA_ISA::AVX_VNNI)] = 5.0f;
PE[int(BTLA_ISA::NoSIMD)] = 3.5f;
break;
case 10: // MTL
PE[int(BTLA_ISA::AVX2)] = 2.2f;
PE[int(BTLA_ISA::AVX_VNNI)] = 3.0f;
PE[int(BTLA_ISA::NoSIMD)] = 3.0f;
break;
case 11: // RPL
PE[int(BTLA_ISA::AVX2)] = 1.8f;
PE[int(BTLA_ISA::AVX_VNNI)] = 2.6f;
PE[int(BTLA_ISA::NoSIMD)] = 3.0f;
break;
}
}
@@ -488,7 +491,7 @@ class CpuRuntime {

inline void adjustPE(const BTLA_ISA isa, const float PE_) {
// printf("Adjust:%d,%f\n",int(isa),PE_);
PE[int(isa)] *= PE_;
PE[int(isa)] = PE[int(isa)] * PE_ * 0.7 + PE[int(isa)] * 0.3;
}

size_t mL2Cache, mL1Cache, mL2Cache_P = 0, mL1Cache_P = 0, mL2Cache_E = 0, mL1Cache_E = 0;
@@ -514,7 +517,7 @@
P_core_num = static_cast<int>(_cd->getPcoreNum());
E_core_num = thread - P_core_num;
}
if (mHybrid) {
if (_cd->isHybrid()) {
mL1Cache_E = _cd->getL1CacheSize_E();
mL2Cache_E = _cd->getL2CacheSize_E();
mHybrid = true;
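The bestla_device.h changes add NoSIMD entries to the per-model PE (P-core/E-core performance) ratio table, key the hybrid cache sizing off _cd->isHybrid() instead of the not-yet-set mHybrid flag, and replace the plain multiplicative adjustPE update with a blended one that mixes the corrected value with the previous estimate. Below is a minimal standalone sketch of that blended update; the function and variable names are illustrative only, not BesTLA's API.

#include <cstdio>
#include <initializer_list>

// Blended update mirroring the diff above: 70% of the corrected value plus 30%
// of the previous estimate, which damps oscillation across repeated measurements.
static float adjust_pe_blended(float pe, float measured_correction) {
  return pe * measured_correction * 0.7f + pe * 0.3f;
}

int main() {
  float pe = 3.0f;  // e.g. the initial AVX2 ratio for the ADL entry in the table
  for (float correction : {1.2f, 0.8f, 1.1f}) {
    pe = adjust_pe_blended(pe, correction);
    std::printf("correction %.2f -> PE %.3f\n", correction, pe);
  }
  return 0;
}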
42 changes: 24 additions & 18 deletions bestla/bestla/bestla_epilogue.h
@@ -37,10 +37,16 @@ class AccumulatorWriteBack {
using DType = _DST_T;
using Param = ParamAccumulatorWriteBack<DType>;

BTLA_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
const int N, const Param& _param, void* tmpcache, size_t cachesize) {
static BTLA_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset,
const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
auto COffset = M_offset * _param.ldc + N_offset;
auto cptr = _param.C + COffset;
if constexpr (std::is_same_v<_SRC_T, DType>) {
if (cacheptr == cptr) {
return BTLA_CODE::Success;
}
}

return kernel::wrapper::Memcpy2D::template forward<ISA_T, SType, DType>(cacheptr, cptr, M, N, cachestep, _param.ldc,
_param.elt_const_v);
}
@@ -50,8 +56,8 @@ template <BTLA_ISA ISA_T, typename _SRC_T, typename _DST_T, BTLA_ELTWISEOP _OP>
class CustomAccumulatorWriteBackWithEltop {
public:
using Param = ParamAccumulatorWriteBack<_DST_T>;
BTLA_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
const int N, const Param& _param, void* tmpcache, size_t cachesize) {
static BTLA_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset,
const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
auto COffset = M_offset * _param.ldc + N_offset;
auto cptr = _param.C + COffset;
if constexpr (std::is_same<_SRC_T, float>::value && std::is_same<_DST_T, float>::value) {
@@ -95,8 +101,8 @@ class AlphaBetaProcessFp32 {
public:
using Param = ParamAlphaBetaProcess<float>;

BTLA_CODE forward(const float* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
const int N, const Param& _param, void* tmpcache, size_t cachesize) {
static BTLA_CODE forward(const float* cacheptr, const int cachestep, const int M_offset, const int N_offset,
const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
auto DOffset = M_offset * _param.ldd + N_offset;
auto COffset = M_offset * _param.ldc + N_offset;
auto cptr = _param.C + COffset;
@@ -118,9 +124,9 @@ template <BTLA_ISA ISA_T>
class CompFp32BlockEpilogue {
public:
using Param = ParamCompFp32BlockEpilogue;
BTLA_CODE forward(const float* srcptr, float* dstptr, const int cachestep, const int M_offset, const int N_offset,
const int K_offset, const int M, const int N, const Param& _param, void* tmpcache,
size_t cachesize) {
static BTLA_CODE forward(const float* srcptr, float* dstptr, const int cachestep, const int M_offset,
const int N_offset, const int K_offset, const int M, const int N, const Param& _param,
void* tmpcache, size_t cachesize) {
auto ret = BTLA_CODE::NotSupport;
if (_param.scaledtype == BTLA_DTYPE::F32) {
ret = kernel::wrapper::CompFp32BlockScale::template forward<ISA_T>(
@@ -169,8 +175,8 @@ template <BTLA_ISA ISA_T>
class DequantInt32ToFp32 {
public:
using Param = ParamDequantInt32ToFp32;
BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
const int N, const Param& _param, void* tmpcache, size_t cachesize) {
static BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset,
const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
auto COffset = M_offset * _param.ldc + N_offset;
auto cptr = _param.C + COffset;
return kernel::wrapper::DequanS32Fp32::template forward<ISA_T>(cacheptr, cachestep, cptr, _param.ldc, M, N,
@@ -198,9 +204,9 @@ template <BTLA_ISA ISA_T>
class CompInt8BlockEpilogue {
public:
using Param = ParamCompInt8BlockEpilogue;
BTLA_CODE forward(const int32_t* srcptr, float* dstptr, const int cachestep, const int M_offset, const int N_offset,
const int K_offset, const int M, const int N, const Param& _param, void* tmpcache,
size_t cachesize) {
static BTLA_CODE forward(const int32_t* srcptr, float* dstptr, const int cachestep, const int M_offset,
const int N_offset, const int K_offset, const int M, const int N, const Param& _param,
void* tmpcache, size_t cachesize) {
BTLA_CODE ret = BTLA_CODE::NotSupport;
float* scab = nullptr;
size_t ScaleBTmpSize = N * sizeof(float);
@@ -280,8 +286,8 @@ template <BTLA_ISA ISA_T>
class ZpDequantInt32ToFp32 {
public:
using Param = ParamZpDequantInt32ToFp32;
BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
const int N, const Param& _param, void* tmpcache, size_t cachesize) {
static BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset,
const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
auto COffset = M_offset * _param.ldc + N_offset;
auto cptr = _param.C + COffset;
auto ret = kernel::wrapper::DequanS32Fp32::template forward<ISA_T>(cacheptr, cachestep, cptr, _param.ldc, M, N,
@@ -321,8 +327,8 @@ template <BTLA_ISA ISA_T>
class AlphaBetaProcessS32U8 {
public:
using Param = ParamAlphaBetaProcessS32U8;
BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
const int N, const Param& _param, void* tmpcache, size_t cachesize) {
static BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset,
const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
auto COffset = M_offset * _param.ldc + N_offset;
auto cptr = _param.C + COffset;
return kernel::wrapper::QuanOutS32U32::template forward<ISA_T>(_param.alpha, cacheptr, cachestep, cptr, _param.ldc,
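Throughout bestla_epilogue.h the forward() entry points become static, so launchers can call Epilogue::forward(...) without constructing an epilogue object, and AccumulatorWriteBack now returns early when the source and destination types match and the cache pointer already aliases the output, skipping the Memcpy2D. The sketch below illustrates both ideas under assumed, simplified names; it is not the BesTLA interface.

#include <type_traits>

// Simplified write-back epilogue: static forward() plus the aliasing early-out.
template <typename SrcT, typename DstT>
struct WriteBackSketch {
  static int forward(const SrcT* cache, int cachestep, DstT* C, int ldc, int M, int N) {
    if constexpr (std::is_same_v<SrcT, DstT>) {
      if (static_cast<const void*>(cache) == static_cast<const void*>(C)) {
        return 0;  // results were produced in place; nothing to copy back
      }
    }
    for (int i = 0; i < M; ++i) {
      for (int j = 0; j < N; ++j) {
        C[i * ldc + j] = static_cast<DstT>(cache[i * cachestep + j]);
      }
    }
    return 0;
  }
};

// Usage mirrors the static call style in the diff: no object construction needed.
// float cache[4 * 8]; float C[4 * 16];
// WriteBackSketch<float, float>::forward(cache, 8, C, 16, 4, 8);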