Skip to content
This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Commit

Permalink
[BesTLA] Refactor quantization-related kernels (#209)
Browse files Browse the repository at this point in the history
* add intrinsic for s4_clip

* remove unstable time data

* add q4fp32 kernel

* add avx2 version of u8s8

* add hybrid support

* use hybrid scheduler for ggml and fp32 kernels.

* fix typo

* fix err

* debug overflow of non-vnni instruction

* add ref for gemv

* pass UT

* add S8S8S32 and S8S8Fp32 for AVX_VNNI

* add benchmark S8S8 is 50% of U8S8

* add benchmark and model test

* model check

* disable dynamic PE ratio

* add s8s8 code and benchmark case

* add s3 weight ref

* add avx2 s3 gemv

* add s3 benchmark

* use JIT kernel

* compile on gcc

* add blend filter for dynamic PE

* for more stable result

* use optimized threading as default

* speed up int3 gemv

* clang-format

* remove alignas

* use multi-threading for ROPE

* add int2 gemv ref; add avx2 unpack 2bit

* add avx2 gemv for int2

* add blocksize=16 case

* use vec register instead of general register

* protect asym weight

* fix s8s8 avx_vnni code

* add zero point support for MatB

* add manual unroll for sgemv

* nbits quantization from high bits to low bits. support asym quant of s2

* add gemv ut for all 4bit and 2bit functions

* complete avx2: s4->s8 packrow=1,2,4;  s4->fp(f32,bf16) packrow=1,2,4

* add bf16 UTs

* test all AVX2 int4: sym&asym, comp_fp32&comp_int8, packrow=1,2,4

* split s4_fp code

* add AVX2 sgemv

* test all 4bit AVX2 combinations

* support MTILE for int4 gemv

* support MTILE for int2 gemv

* fix perf of comp_fp32

* add ref of gemv_2bit_fp32_fp32, add new 3bit kernel

* complete int2 decompress kernels

* sync 2bit and 4bit unpack functions

* finish all int3 gemv kernels

* add s8s8 3bit gemv UT

* fully test int3 weight: group=32,128, comp=fp32,int8 wrapper:gemm and gemv

* add DecompressKBlockS3Fp avx2

* test int2 and int3 with comp_fp32 and comp_int8

* speedup int3 int2 with comp_fp32

* fix debug code

* prevent compiling from unsupported template

* enable asym quantization. test llama2 model for group=32, weight_dtype=int2, int3, int4, alg=asym, compute_dtype=int8

* remove LauncherKBlock

* remove all LauncherKBlock

* sync s8 weight's compression and decompression functions

* remove gemv_s* functions

* add AVX2_VNNI

* add kblock avx2vnni

* test all gemm cases with AVX2_VNNI

* fix the correct ISA

* add AVX2_VNNI to gemv dispatcher

* remove code

* benchmark int4 with AVX2_VNNI

* support AVX2_VNNI for model quantization and inference

* add avx512 s4s8

* add s8fp

* pass avx512+int4 UTs

* fix compile errors with GCC

* remove deprecated functions and wrappers

* remove unused UT case

* enable gemv for amx_int8

* remove static_assert

* disable blocksize=32 for amx_int8

* add avx512: s2s8, s2fp

* full test of all avx512 int2

* add avx512 s2 benchmark

* support avx512 int2

* fix compile error

* add 3bit sgemv avx512

* enable all UTs

* fix UT error

* pop vnni flags

* use correct avx2 epi32_epi16

* use omp and std at the same time

* clang-format

* check compiler before enable gemv

* correct assert condition

* help compiler a little

* set VS2022

* fix condition

* clang-format

* compiled with dpcpp

* optimize on 1185g7

* add avx2_vnni for int3&int2

* clang-format

* fix code errors

* compile with gcc9

* revert rope parallel

* refactor quantization data process in python

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add mtile for epilogue in gemv

* clang-format

* Revert "revert rope parallel"

This reverts commit 7dc4dd8.

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
luoyu-intel and pre-commit-ci[bot] authored May 7, 2024
1 parent f72fdba commit 7d49516
Show file tree
Hide file tree
Showing 40 changed files with 9,982 additions and 2,205 deletions.
36 changes: 29 additions & 7 deletions CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,13 @@
"generator": "Ninja",
"binaryDir": "${sourceDir}/out/build/${presetName}",
"installDir": "${sourceDir}/out/install/${presetName}",
"cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" },
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Debug",
"NS_PROFILING": "ON",
"NS_USE_OMP": "ON",
"BTLA_UT_DEBUG": "ON",
"BTLA_UT_BENCHMARK": "ON"
},
"condition": {
"type": "equals",
"lhs": "${hostSystemName}",
Expand Down Expand Up @@ -107,16 +113,32 @@
"BTLA_UT_OPENMP": "OFF"
}
},
{
"name": "x64-debug-sycl",
"displayName": "x64 Debug SYCL",
"description": "x64 Debug SYCL",
"inherits": "windows-base",
"architecture": {
"value": "x64",
"strategy": "external"
},
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Debug",
"BTLA_UT_DEBUG": "ON",
"BTLA_UT_ALL": "OFF",
"BTLA_SYCL": "ON",
"BTLA_UT_BENCHMARK": "ON",
"CMAKE_CXX_COMPILER": "icx",
"CMAKE_C_COMPILER": "icx"
}
},
{
"name": "x64-release-sycl",
"displayName": "x64 Release SYCL",
"displayName": "x64 Release for SYCL",
"description": "x64 SYCL",
"inherits": "x64-debug",
"inherits": "x64-debug-sycl",
"cacheVariables": {
"CMAKE_CXX_COMPILER": "icx-cl",
"CMAKE_C_COMPILER": "icx-cl",
"CMAKE_BUILD_TYPE": "Release",
"BTLA_UT_ALL": "ON"
"CMAKE_BUILD_TYPE": "Release"
}
}
]
Expand Down
1 change: 1 addition & 0 deletions bestla/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ if(BTLA_SYCL)
file(GLOB sycl_headers ${PROJECT_NAME}/sycl/*.h ${PROJECT_NAME}/sycl/*.hpp)
add_compile_definitions(BTLA_SYCL)
list(APPEND sycl_libs IntelSYCL::SYCL_CXX)
add_compile_options(-march=native)
#add_link_options(-fsycl-targets=spir64 -Xsycl-target-backend "-options -ze-opt-large-register-file")
endif(BTLA_SYCL)

Expand Down
7 changes: 5 additions & 2 deletions bestla/bestla/bestla_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -340,14 +340,17 @@ class CpuDevice {
case 9: // ALD
PE[int(BTLA_ISA::AVX2)] = 3.0f;
PE[int(BTLA_ISA::AVX_VNNI)] = 5.0f;
PE[int(BTLA_ISA::NoSIMD)] = 3.5f;
break;
case 10: // MTL
PE[int(BTLA_ISA::AVX2)] = 2.2f;
PE[int(BTLA_ISA::AVX_VNNI)] = 3.0f;
PE[int(BTLA_ISA::NoSIMD)] = 3.0f;
break;
case 11: // RPL
PE[int(BTLA_ISA::AVX2)] = 1.8f;
PE[int(BTLA_ISA::AVX_VNNI)] = 2.6f;
PE[int(BTLA_ISA::NoSIMD)] = 3.0f;
break;
}
}
Expand Down Expand Up @@ -488,7 +491,7 @@ class CpuRuntime {

inline void adjustPE(const BTLA_ISA isa, const float PE_) {
// printf("Adjust:%d,%f\n",int(isa),PE_);
PE[int(isa)] *= PE_;
PE[int(isa)] = PE[int(isa)] * PE_ * 0.7 + PE[int(isa)] * 0.3;
}

size_t mL2Cache, mL1Cache, mL2Cache_P = 0, mL1Cache_P = 0, mL2Cache_E = 0, mL1Cache_E = 0;
Expand All @@ -514,7 +517,7 @@ class CpuRuntime {
P_core_num = static_cast<int>(_cd->getPcoreNum());
E_core_num = thread - P_core_num;
}
if (mHybrid) {
if (_cd->isHybrid()) {
mL1Cache_E = _cd->getL1CacheSize_E();
mL2Cache_E = _cd->getL2CacheSize_E();
mHybrid = true;
Expand Down
42 changes: 24 additions & 18 deletions bestla/bestla/bestla_epilogue.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,16 @@ class AccumulatorWriteBack {
using DType = _DST_T;
using Param = ParamAccumulatorWriteBack<DType>;

BTLA_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
const int N, const Param& _param, void* tmpcache, size_t cachesize) {
static BTLA_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset,
const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
auto COffset = M_offset * _param.ldc + N_offset;
auto cptr = _param.C + COffset;
if constexpr (std::is_same_v<_SRC_T, DType>) {
if (cacheptr == cptr) {
return BTLA_CODE::Success;
}
}

return kernel::wrapper::Memcpy2D::template forward<ISA_T, SType, DType>(cacheptr, cptr, M, N, cachestep, _param.ldc,
_param.elt_const_v);
}
Expand All @@ -50,8 +56,8 @@ template <BTLA_ISA ISA_T, typename _SRC_T, typename _DST_T, BTLA_ELTWISEOP _OP>
class CustomAccumulatorWriteBackWithEltop {
public:
using Param = ParamAccumulatorWriteBack<_DST_T>;
BTLA_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
const int N, const Param& _param, void* tmpcache, size_t cachesize) {
static BTLA_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset,
const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
auto COffset = M_offset * _param.ldc + N_offset;
auto cptr = _param.C + COffset;
if constexpr (std::is_same<_SRC_T, float>::value && std::is_same<_DST_T, float>::value) {
Expand Down Expand Up @@ -95,8 +101,8 @@ class AlphaBetaProcessFp32 {
public:
using Param = ParamAlphaBetaProcess<float>;

BTLA_CODE forward(const float* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
const int N, const Param& _param, void* tmpcache, size_t cachesize) {
static BTLA_CODE forward(const float* cacheptr, const int cachestep, const int M_offset, const int N_offset,
const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
auto DOffset = M_offset * _param.ldd + N_offset;
auto COffset = M_offset * _param.ldc + N_offset;
auto cptr = _param.C + COffset;
Expand All @@ -118,9 +124,9 @@ template <BTLA_ISA ISA_T>
class CompFp32BlockEpilogue {
public:
using Param = ParamCompFp32BlockEpilogue;
BTLA_CODE forward(const float* srcptr, float* dstptr, const int cachestep, const int M_offset, const int N_offset,
const int K_offset, const int M, const int N, const Param& _param, void* tmpcache,
size_t cachesize) {
static BTLA_CODE forward(const float* srcptr, float* dstptr, const int cachestep, const int M_offset,
const int N_offset, const int K_offset, const int M, const int N, const Param& _param,
void* tmpcache, size_t cachesize) {
auto ret = BTLA_CODE::NotSupport;
if (_param.scaledtype == BTLA_DTYPE::F32) {
ret = kernel::wrapper::CompFp32BlockScale::template forward<ISA_T>(
Expand Down Expand Up @@ -169,8 +175,8 @@ template <BTLA_ISA ISA_T>
class DequantInt32ToFp32 {
public:
using Param = ParamDequantInt32ToFp32;
BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
const int N, const Param& _param, void* tmpcache, size_t cachesize) {
static BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset,
const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
auto COffset = M_offset * _param.ldc + N_offset;
auto cptr = _param.C + COffset;
return kernel::wrapper::DequanS32Fp32::template forward<ISA_T>(cacheptr, cachestep, cptr, _param.ldc, M, N,
Expand Down Expand Up @@ -198,9 +204,9 @@ template <BTLA_ISA ISA_T>
class CompInt8BlockEpilogue {
public:
using Param = ParamCompInt8BlockEpilogue;
BTLA_CODE forward(const int32_t* srcptr, float* dstptr, const int cachestep, const int M_offset, const int N_offset,
const int K_offset, const int M, const int N, const Param& _param, void* tmpcache,
size_t cachesize) {
static BTLA_CODE forward(const int32_t* srcptr, float* dstptr, const int cachestep, const int M_offset,
const int N_offset, const int K_offset, const int M, const int N, const Param& _param,
void* tmpcache, size_t cachesize) {
BTLA_CODE ret = BTLA_CODE::NotSupport;
float* scab = nullptr;
size_t ScaleBTmpSize = N * sizeof(float);
Expand Down Expand Up @@ -280,8 +286,8 @@ template <BTLA_ISA ISA_T>
class ZpDequantInt32ToFp32 {
public:
using Param = ParamZpDequantInt32ToFp32;
BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
const int N, const Param& _param, void* tmpcache, size_t cachesize) {
static BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset,
const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
auto COffset = M_offset * _param.ldc + N_offset;
auto cptr = _param.C + COffset;
auto ret = kernel::wrapper::DequanS32Fp32::template forward<ISA_T>(cacheptr, cachestep, cptr, _param.ldc, M, N,
Expand Down Expand Up @@ -321,8 +327,8 @@ template <BTLA_ISA ISA_T>
class AlphaBetaProcessS32U8 {
public:
using Param = ParamAlphaBetaProcessS32U8;
BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
const int N, const Param& _param, void* tmpcache, size_t cachesize) {
static BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset,
const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
auto COffset = M_offset * _param.ldc + N_offset;
auto cptr = _param.C + COffset;
return kernel::wrapper::QuanOutS32U32::template forward<ISA_T>(_param.alpha, cacheptr, cachestep, cptr, _param.ldc,
Expand Down
Loading

0 comments on commit 7d49516

Please sign in to comment.