[BesTLA] Refactor quantization-related kernels (#209)
* add intrinsic for s4_clip (see the quantization sketch after this list)
* remove unstable time data
* add q4fp32 kernel
* add avx2 version of u8s8
* add hybrid support
* use hybrid scheduler for ggml and fp32 kernels
* fix typo
* fix err
* debug overflow of non-vnni instruction
* add ref for gemv (see the GEMV reference sketch after this list)
* pass UT
* add S8S8S32 and S8S8Fp32 for AVX_VNNI
* add benchmark; S8S8 is 50% of U8S8
* add benchmark and model test
* model check
* disable dynamic PE ratio
* add s8s8 code and benchmark case
* add s3 weight ref
* add avx2 s3 gemv
* add s3 benchmark
* use JIT kernel
* compile on gcc
* add blend filter for dynamic PE
* for more stable result
* use optimized threading as default
* speed up int3 gemv
* clang-format
* remove alignas
* use multi-threading for ROPE
* add int2 gemv ref; add avx2 unpack 2bit
* add avx2 gemv for int2
* add blocksize=16 case
* use vec register instead of general register
* protect asym weight
* fix s8s8 avx_vnni code
* add zero point support for MatB
* add manual unroll for sgemv
* n-bit quantization from high bits to low bits; support asym quant of s2
* add gemv UT for all 4bit and 2bit functions
* complete avx2: s4->s8 packrow=1,2,4; s4->fp(f32,bf16) packrow=1,2,4
* add bf16 UTs
* test all AVX2 int4: sym&asym, comp_fp32&comp_int8, packrow=1,2,4
* split s4_fp code
* add AVX2 sgemv
* test all 4bit AVX2 combinations
* support MTILE for int4 gemv
* support MTILE for int2 gemv
* fix perf of comp_fp32
* add ref of gemv_2bit_fp32_fp32; add new 3bit kernel
* complete int2 decompress kernels
* sync 2bit and 4bit unpack functions
* finish all int3 gemv kernels
* add s8s8 3bit gemv UT
* fully test int3 weight: group=32,128; comp=fp32,int8; wrapper: gemm and gemv
* add DecompressKBlockS3Fp avx2
* test int2 and int3 with comp_fp32 and comp_int8
* speed up int3 and int2 with comp_fp32
* fix debug code
* prevent compilation of unsupported templates
* enable asym quantization; test llama2 model for group=32, weight_dtype=int2, int3, int4, alg=asym, compute_dtype=int8
* remove LauncherKBlock
* remove all LauncherKBlock
* sync s8 weight's compression and decompression functions
* remove gemv_s* functions
* add AVX2_VNNI
* add kblock avx2vnni
* test all gemm cases with AVX2_VNNI
* use the correct ISA
* add AVX2_VNNI to gemv dispatcher
* remove code
* benchmark int4 with AVX2_VNNI
* support AVX2_VNNI for model quantization and inference
* add avx512 s4s8
* add s8fp
* pass avx512+int4 UTs
* fix compile errors with GCC
* remove deprecated functions and wrappers
* remove unused UT case
* enable gemv for amx_int8
* remove static_assert
* disable blocksize=32 for amx_int8
* add avx512: s2s8, s2fp
* full test of all avx512 int2
* add avx512 s2 benchmark
* support avx512 int2
* fix compile error
* add 3bit sgemv avx512
* enable all UTs
* fix UT error
* pop vnni flags
* use correct avx2 epi32_epi16
* use omp and std at the same time
* clang-format
* check compiler before enabling gemv
* correct assert condition
* help compiler a little
* set VS2022
* fix condition
* clang-format
* compile with dpcpp
* optimize on 1185g7
* add avx2_vnni for int3 & int2
* clang-format
* fix code errors
* compile with gcc9
* revert rope parallel
* refactor quantization data processing in python
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* add mtile for epilogue in gemv
* clang-format
* Revert "revert rope parallel"

This reverts commit 7dc4dd8.

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
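For context on the k-block quantization scheme the s4_clip and asym bullets refer to, the following is a minimal scalar sketch of group-wise 4-bit quantization and dequantization. It is an illustration under assumed conventions only, not BesTLA's actual API; the names (`QuantGroup4`, `quantize_group_s4`) are hypothetical.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Minimal scalar sketch of group-wise (k-block) 4-bit quantization.
// Symmetric ("clip"-style): q = clamp(round(w / scale), -8, 7).
// Asymmetric: a per-group zero point shifts the code range.
// All names here are illustrative, not BesTLA's real interfaces.
struct QuantGroup4 {
  std::vector<int8_t> q;  // one int8 code per weight (packing two per byte comes later)
  float scale;
  int8_t zero_point;      // 0 for symmetric quantization
};

QuantGroup4 quantize_group_s4(const float* w, int group_size, bool asym) {
  QuantGroup4 g;
  g.q.resize(group_size);
  const float wmin = *std::min_element(w, w + group_size);
  const float wmax = *std::max_element(w, w + group_size);
  if (asym) {
    // Map [wmin, wmax] onto the 16 signed 4-bit codes [-8, 7].
    g.scale = (wmax - wmin) / 15.f;
    if (g.scale == 0.f) g.scale = 1.f;  // guard against a constant block
    g.zero_point = static_cast<int8_t>(-std::lround(wmin / g.scale) - 8);
  } else {
    // Symmetric "clip" variant: the largest magnitude maps to +/-7.
    const float amax = std::max(std::fabs(wmin), std::fabs(wmax));
    g.scale = amax > 0.f ? amax / 7.f : 1.f;
    g.zero_point = 0;
  }
  for (int i = 0; i < group_size; ++i) {
    const int v = static_cast<int>(std::lround(w[i] / g.scale)) + g.zero_point;
    g.q[i] = static_cast<int8_t>(std::min(7, std::max(-8, v)));
  }
  return g;
}

// Dequantize back to fp32: w ~= (q - zero_point) * scale.
void dequantize_group_s4(const QuantGroup4& g, float* out) {
  for (size_t i = 0; i < g.q.size(); ++i)
    out[i] = static_cast<float>(g.q[i] - g.zero_point) * g.scale;
}
```

The int3 and int2 variants mentioned in the list follow the same block layout with narrower code ranges and denser bit packing.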
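The gemv reference entries decompress the low-bit weights on the fly and accumulate against the fp32 activation row (the comp_fp32 path). A scalar sketch of that reference loop, assuming a plain row-major layout, unpacked int8 codes, and a hypothetical `gemv_q4_ref` name, might look like this:

```cpp
#include <cstdint>

// Scalar reference GEMV for comp_fp32: y[n] = sum_k x[k] * dequant(wq[k][n]).
// Weights carry one scale (and optional zero point) per k-block of `group`
// rows; K is assumed to be a multiple of `group`. The vectorized kernels in
// this commit (AVX2/AVX512/AMX) implement this same loop; names and layout
// here are illustrative only.
void gemv_q4_ref(const float* x,            // activation row, length K
                 const int8_t* wq,          // unpacked 4-bit weight codes, K x N, row-major
                 const float* scales,       // (K/group) x N
                 const int8_t* zero_points, // (K/group) x N, or nullptr for symmetric
                 float* y,                  // output, length N
                 int K, int N, int group) {
  for (int n = 0; n < N; ++n) {
    float acc = 0.f;
    for (int kb = 0; kb < K; kb += group) {
      const float s = scales[(kb / group) * N + n];
      const int zp = zero_points ? zero_points[(kb / group) * N + n] : 0;
      for (int k = kb; k < kb + group; ++k) {
        acc += x[k] * static_cast<float>(wq[k * N + n] - zp) * s;
      }
    }
    y[n] = acc;
  }
}
```

For the comp_int8 path, the activation block would itself be quantized to int8 (u8 or s8 depending on the ISA's VNNI support, matching the U8S8/S8S8 entries above) and accumulated in int32 before being rescaled by the product of the activation and weight scales.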