This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

[BesTLA] Refactor quantization-related kernels #209

Merged
merged 111 commits on May 7, 2024
Changes from all commits
111 commits
f99d29c
add intrinsic for s4_clip
luoyu-intel Apr 8, 2024
be59a64
remove unstable time data
luoyu-intel Apr 8, 2024
7b11d14
add q4fp32 kernel
luoyu-intel Apr 9, 2024
f878c86
add avx2 version of u8s8
luoyu-intel Apr 9, 2024
65e3e96
add hybrid support
luoyu-intel Apr 9, 2024
30a2286
use hybrid scheduler for ggml and fp32 kernels.
luoyu-intel Apr 9, 2024
16563a9
fix typo
luoyu-intel Apr 9, 2024
86101a5
fix err
luoyu-intel Apr 9, 2024
1deff17
debug overflow of non-vnni instruction
luoyu-intel Apr 11, 2024
7d4ab6d
add ref for gemv
luoyu-intel Apr 12, 2024
6f341a3
pass UT
luoyu-intel Apr 15, 2024
1d1b97e
add S8S8S32 and S8S8Fp32 for AVX_VNNI
luoyu-intel Apr 15, 2024
32731bd
add benchmark S8S8 is 50% of U8S8
luoyu-intel Apr 15, 2024
3096f72
add benchmark and model test
luoyu-intel Apr 15, 2024
6ad88ed
model check
luoyu-intel Apr 15, 2024
a9b7fcf
disable dynamic PE ratio
luoyu-intel Apr 15, 2024
bcbda2b
add s8s8 code and benchmark case
luoyu-intel Apr 16, 2024
7e24a80
add s3 weight ref
luoyu-intel Apr 16, 2024
d1a87f8
add avx2 s3 gemv
luoyu-intel Apr 16, 2024
1c3bfde
add s3 benchmark
luoyu-intel Apr 16, 2024
f382578
use JIT kernel
luoyu-intel Apr 16, 2024
38c2c38
compile on gcc
luoyu-intel Apr 16, 2024
0014866
add blend filter for dynamic PE
luoyu-intel Apr 17, 2024
4410425
for more stable result
luoyu-intel Apr 17, 2024
0e62b4a
use optimized threading as default
luoyu-intel Apr 17, 2024
88715bf
speed up int3 gemv
luoyu-intel Apr 17, 2024
2933321
clang-format
luoyu-intel Apr 17, 2024
3a05e0d
remove alignas
luoyu-intel Apr 17, 2024
d471b71
use multi-threading for ROPE
luoyu-intel Apr 18, 2024
386aa89
add int2 gemv ref; add avx2 unpack 2bit
luoyu-intel Apr 18, 2024
dd82a24
add avx2 gemv for int2
luoyu-intel Apr 18, 2024
acbde0d
add blocksize=16 case
luoyu-intel Apr 18, 2024
2e3812c
use vec register instead of general register
luoyu-intel Apr 18, 2024
436d436
protect asym weight
luoyu-intel Apr 18, 2024
96fe7d1
fix s8s8 avx_vnni code
luoyu-intel Apr 19, 2024
ab76539
add zero point support for MatB
luoyu-intel Apr 19, 2024
a47db04
add manual unroll for sgemv
luoyu-intel Apr 19, 2024
a3aa32a
nbits quantization from high bits to low bits. support asym quant of s2
luoyu-intel Apr 23, 2024
36a282d
add gemv ut for all 4bit and 2bit functions
luoyu-intel Apr 23, 2024
4f22289
complete avx2: s4->s8 packrow=1,2,4; s4->fp(f32,bf16) packrow=1,2,4
luoyu-intel Apr 24, 2024
77695be
add bf16 UTs
luoyu-intel Apr 24, 2024
9ac8aa1
test all AVX2 int4: sym&asym, comp_fp32&comp_int8, packrow=1,2,4
luoyu-intel Apr 25, 2024
1a62444
split s4_fp code
luoyu-intel Apr 25, 2024
739a101
add AVX2 sgemv
luoyu-intel Apr 25, 2024
20c7e5a
test all 4bit AVX2 combinations
luoyu-intel Apr 25, 2024
8ac5da8
support MTILE for int4 gemv
luoyu-intel Apr 25, 2024
a22cd2a
support MTILE for int2 gemv
luoyu-intel Apr 25, 2024
4ccc3ea
fix perf of comp_fp32
luoyu-intel Apr 25, 2024
74f7c82
add ref of gemv_2bit_fp32_fp32, add new 3bit kernel
luoyu-intel Apr 26, 2024
b9a2137
complete int2 decompress kernels
luoyu-intel Apr 26, 2024
71c24b0
sync 2bit and 4bit unpack functions
luoyu-intel Apr 26, 2024
feea801
finish all int3 gemv kernels
luoyu-intel Apr 26, 2024
628b983
add s8s8 3bit gemv UT
luoyu-intel Apr 26, 2024
2478de4
fully test int3 weight: group=32,128, comp=fp32,int8 wrapper:gemm and…
luoyu-intel Apr 26, 2024
c85b0e3
add DecompressKBlockS3Fp avx2
luoyu-intel Apr 28, 2024
a427bef
test int2 and int3 with comp_fp32 and comp_int8
luoyu-intel Apr 28, 2024
4182499
speedup int3 int2 with comp_fp32
luoyu-intel Apr 28, 2024
8af7a4a
fix debug code
luoyu-intel Apr 28, 2024
67bb716
prevent compiling from unsupported template
luoyu-intel Apr 28, 2024
f87dc92
enable asym quantization. test llama2 model for group=32, weight_dtyp…
luoyu-intel Apr 28, 2024
112eaa5
remove LauncherKBlock
luoyu-intel Apr 28, 2024
13de7ed
remove all LauncherKBlock
luoyu-intel Apr 28, 2024
7588eec
sync s8 weight's compression and decompression functions
luoyu-intel Apr 28, 2024
ae2f6ac
remove gemv_s* functions
luoyu-intel Apr 28, 2024
5daaf4b
add AVX2_VNNI
luoyu-intel Apr 28, 2024
4be8411
add kblock avx2vnni
luoyu-intel Apr 28, 2024
39e3c76
test all gemm cases with AVX2_VNNI
luoyu-intel Apr 28, 2024
6d381eb
use the correct ISA
luoyu-intel Apr 28, 2024
afa9e05
add AVX2_VNNI to gemv dispatcher
luoyu-intel Apr 28, 2024
5e109fa
remove code
luoyu-intel Apr 28, 2024
58eb244
benchmark int4 with AVX2_VNNI
luoyu-intel Apr 28, 2024
a3943e9
support AVX2_VNNI for model quantization and inference
luoyu-intel Apr 28, 2024
8e46f80
add avx512 s4s8
luoyu-intel Apr 28, 2024
e35d446
add s8fp
luoyu-intel Apr 28, 2024
067cb13
pass avx512+int4 UTs
luoyu-intel Apr 28, 2024
733dcec
fix compile errors with GCC
luoyu-intel Apr 29, 2024
f3a067c
remove deprecated functions and wrappers
luoyu-intel Apr 29, 2024
91f0e68
remove unused UT case
luoyu-intel Apr 29, 2024
c0ae10b
enable gemv for amx_int8
luoyu-intel Apr 29, 2024
22182d2
remove static_assert
luoyu-intel Apr 29, 2024
4a60f29
disable blocksize=32 for amx_int8
luoyu-intel Apr 29, 2024
c02ef5c
add avx512: s2s8, s2fp
luoyu-intel Apr 29, 2024
b9a55f0
full test of all avx512 int2
luoyu-intel Apr 29, 2024
c7b81c0
add avx512 s2 benchmark
luoyu-intel Apr 29, 2024
58284ce
support avx512 int2
luoyu-intel Apr 29, 2024
58ee0e3
fix compile error
luoyu-intel Apr 29, 2024
17e913d
add 3bit sgemv avx512
luoyu-intel Apr 29, 2024
77d7cf5
enable all UTs
luoyu-intel Apr 29, 2024
a6053ed
fix UT error
luoyu-intel Apr 29, 2024
4677e7a
pop vnni flags
luoyu-intel Apr 29, 2024
14a397e
use correct avx2 epi32_epi16
luoyu-intel Apr 29, 2024
7504550
use omp and std at the same time
luoyu-intel Apr 29, 2024
c11c16a
clang-format
luoyu-intel Apr 29, 2024
6b9fa7c
check compiler before enabling gemv
luoyu-intel Apr 30, 2024
10a1e57
correct assert condition
luoyu-intel Apr 30, 2024
ce15390
help compiler a little
luoyu-intel Apr 30, 2024
859bcb9
set VS2022
luoyu-intel Apr 30, 2024
5f375fa
fix condition
luoyu-intel Apr 30, 2024
b1557e8
clang-format
luoyu-intel Apr 30, 2024
842babe
compiled with dpcpp
luoyu-intel Apr 30, 2024
17590eb
optimize on 1185g7
luoyu-intel Apr 30, 2024
177488c
add avx2_vnni for int3&int2
luoyu-intel Apr 30, 2024
3096862
clang-format
luoyu-intel Apr 30, 2024
a2de1e7
fix code errors
luoyu-intel Apr 30, 2024
a861d4e
compile with gcc9
luoyu-intel Apr 30, 2024
7dc4dd8
revert rope parallel
luoyu-intel Apr 30, 2024
23f3b0a
refactor quantization data process in python
luoyu-intel Apr 30, 2024
7bb4912
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 30, 2024
ea84cdb
add mtile for epilogue in gemv
luoyu-intel Apr 30, 2024
32a116b
clang-format
luoyu-intel Apr 30, 2024
00be66e
Revert "revert rope parallel"
luoyu-intel Apr 30, 2024
36 changes: 29 additions & 7 deletions CMakePresets.json
@@ -8,7 +8,13 @@
"generator": "Ninja",
"binaryDir": "${sourceDir}/out/build/${presetName}",
"installDir": "${sourceDir}/out/install/${presetName}",
"cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" },
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Debug",
"NS_PROFILING": "ON",
"NS_USE_OMP": "ON",
"BTLA_UT_DEBUG": "ON",
"BTLA_UT_BENCHMARK": "ON"
},
"condition": {
"type": "equals",
"lhs": "${hostSystemName}",
@@ -107,16 +113,32 @@
"BTLA_UT_OPENMP": "OFF"
}
},
{
"name": "x64-debug-sycl",
"displayName": "x64 Debug SYCL",
"description": "x64 Debug SYCL",
"inherits": "windows-base",
"architecture": {
"value": "x64",
"strategy": "external"
},
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Debug",
"BTLA_UT_DEBUG": "ON",
"BTLA_UT_ALL": "OFF",
"BTLA_SYCL": "ON",
"BTLA_UT_BENCHMARK": "ON",
"CMAKE_CXX_COMPILER": "icx",
"CMAKE_C_COMPILER": "icx"
}
},
{
"name": "x64-release-sycl",
"displayName": "x64 Release SYCL",
"displayName": "x64 Release for SYCL",
"description": "x64 SYCL",
"inherits": "x64-debug",
"inherits": "x64-debug-sycl",
"cacheVariables": {
"CMAKE_CXX_COMPILER": "icx-cl",
"CMAKE_C_COMPILER": "icx-cl",
"CMAKE_BUILD_TYPE": "Release",
"BTLA_UT_ALL": "ON"
"CMAKE_BUILD_TYPE": "Release"
}
}
]
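The updated presets wire the debug configuration to the flags the unit tests and benchmarks expect (NS_PROFILING, NS_USE_OMP, BTLA_UT_DEBUG, BTLA_UT_BENCHMARK) and split SYCL into its own x64-debug-sycl / x64-release-sycl pair built with the icx compilers. Assuming CMake 3.20+ with presets support and the Intel oneAPI compilers on PATH, a build can be configured with "cmake --preset x64-debug-sycl" (or x64-release-sycl) from the repository root, which applies the cache variables above automatically; this is the standard CMake presets workflow, not something specific to this PR.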
1 change: 1 addition & 0 deletions bestla/CMakeLists.txt
@@ -40,6 +40,7 @@ if(BTLA_SYCL)
file(GLOB sycl_headers ${PROJECT_NAME}/sycl/*.h ${PROJECT_NAME}/sycl/*.hpp)
add_compile_definitions(BTLA_SYCL)
list(APPEND sycl_libs IntelSYCL::SYCL_CXX)
add_compile_options(-march=native)
zhewang1-intc marked this conversation as resolved.
#add_link_options(-fsycl-targets=spir64 -Xsycl-target-backend "-options -ze-opt-large-register-file")
endif(BTLA_SYCL)

7 changes: 5 additions & 2 deletions bestla/bestla/bestla_device.h
@@ -340,14 +340,17 @@ class CpuDevice {
case 9: // ALD
PE[int(BTLA_ISA::AVX2)] = 3.0f;
PE[int(BTLA_ISA::AVX_VNNI)] = 5.0f;
PE[int(BTLA_ISA::NoSIMD)] = 3.5f;
break;
case 10: // MTL
PE[int(BTLA_ISA::AVX2)] = 2.2f;
PE[int(BTLA_ISA::AVX_VNNI)] = 3.0f;
PE[int(BTLA_ISA::NoSIMD)] = 3.0f;
break;
case 11: // RPL
PE[int(BTLA_ISA::AVX2)] = 1.8f;
PE[int(BTLA_ISA::AVX_VNNI)] = 2.6f;
PE[int(BTLA_ISA::NoSIMD)] = 3.0f;
break;
}
}
@@ -488,7 +491,7 @@ class CpuRuntime {

inline void adjustPE(const BTLA_ISA isa, const float PE_) {
// printf("Adjust:%d,%f\n",int(isa),PE_);
PE[int(isa)] *= PE_;
PE[int(isa)] = PE[int(isa)] * PE_ * 0.7 + PE[int(isa)] * 0.3;
}

size_t mL2Cache, mL1Cache, mL2Cache_P = 0, mL1Cache_P = 0, mL2Cache_E = 0, mL1Cache_E = 0;
@@ -514,7 +517,7 @@
P_core_num = static_cast<int>(_cd->getPcoreNum());
E_core_num = thread - P_core_num;
}
if (mHybrid) {
if (_cd->isHybrid()) {
mL1Cache_E = _cd->getL1CacheSize_E();
mL2Cache_E = _cd->getL2CacheSize_E();
mHybrid = true;
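The bestla_device.h changes add NoSIMD entries to the per-model PE (P-core/E-core performance) ratio table, key the hybrid cache sizing off _cd->isHybrid() instead of the not-yet-set mHybrid flag, and replace the plain multiplicative adjustPE update with a blended one that mixes the corrected value with the previous estimate. Below is a minimal standalone sketch of that blended update; the function and variable names are illustrative only, not BesTLA's API.

#include <cstdio>
#include <initializer_list>

// Blended update mirroring the diff above: 70% of the corrected value plus 30%
// of the previous estimate, which damps oscillation across repeated measurements.
static float adjust_pe_blended(float pe, float measured_correction) {
  return pe * measured_correction * 0.7f + pe * 0.3f;
}

int main() {
  float pe = 3.0f;  // e.g. the initial AVX2 ratio for the ADL entry in the table
  for (float correction : {1.2f, 0.8f, 1.1f}) {
    pe = adjust_pe_blended(pe, correction);
    std::printf("correction %.2f -> PE %.3f\n", correction, pe);
  }
  return 0;
}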
42 changes: 24 additions & 18 deletions bestla/bestla/bestla_epilogue.h
@@ -37,10 +37,16 @@ class AccumulatorWriteBack {
using DType = _DST_T;
using Param = ParamAccumulatorWriteBack<DType>;

BTLA_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
const int N, const Param& _param, void* tmpcache, size_t cachesize) {
static BTLA_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset,
const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
auto COffset = M_offset * _param.ldc + N_offset;
auto cptr = _param.C + COffset;
if constexpr (std::is_same_v<_SRC_T, DType>) {
if (cacheptr == cptr) {
return BTLA_CODE::Success;
}
}

return kernel::wrapper::Memcpy2D::template forward<ISA_T, SType, DType>(cacheptr, cptr, M, N, cachestep, _param.ldc,
_param.elt_const_v);
}
@@ -50,8 +56,8 @@ template <BTLA_ISA ISA_T, typename _SRC_T, typename _DST_T, BTLA_ELTWISEOP _OP>
class CustomAccumulatorWriteBackWithEltop {
public:
using Param = ParamAccumulatorWriteBack<_DST_T>;
BTLA_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
const int N, const Param& _param, void* tmpcache, size_t cachesize) {
static BTLA_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset,
const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
auto COffset = M_offset * _param.ldc + N_offset;
auto cptr = _param.C + COffset;
if constexpr (std::is_same<_SRC_T, float>::value && std::is_same<_DST_T, float>::value) {
@@ -95,8 +101,8 @@ class AlphaBetaProcessFp32 {
public:
using Param = ParamAlphaBetaProcess<float>;

BTLA_CODE forward(const float* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
const int N, const Param& _param, void* tmpcache, size_t cachesize) {
static BTLA_CODE forward(const float* cacheptr, const int cachestep, const int M_offset, const int N_offset,
const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
auto DOffset = M_offset * _param.ldd + N_offset;
auto COffset = M_offset * _param.ldc + N_offset;
auto cptr = _param.C + COffset;
@@ -118,9 +124,9 @@ template <BTLA_ISA ISA_T>
class CompFp32BlockEpilogue {
public:
using Param = ParamCompFp32BlockEpilogue;
BTLA_CODE forward(const float* srcptr, float* dstptr, const int cachestep, const int M_offset, const int N_offset,
const int K_offset, const int M, const int N, const Param& _param, void* tmpcache,
size_t cachesize) {
static BTLA_CODE forward(const float* srcptr, float* dstptr, const int cachestep, const int M_offset,
const int N_offset, const int K_offset, const int M, const int N, const Param& _param,
void* tmpcache, size_t cachesize) {
auto ret = BTLA_CODE::NotSupport;
if (_param.scaledtype == BTLA_DTYPE::F32) {
ret = kernel::wrapper::CompFp32BlockScale::template forward<ISA_T>(
@@ -169,8 +175,8 @@ template <BTLA_ISA ISA_T>
class DequantInt32ToFp32 {
public:
using Param = ParamDequantInt32ToFp32;
BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
const int N, const Param& _param, void* tmpcache, size_t cachesize) {
static BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset,
const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
auto COffset = M_offset * _param.ldc + N_offset;
auto cptr = _param.C + COffset;
return kernel::wrapper::DequanS32Fp32::template forward<ISA_T>(cacheptr, cachestep, cptr, _param.ldc, M, N,
@@ -198,9 +204,9 @@ template <BTLA_ISA ISA_T>
class CompInt8BlockEpilogue {
public:
using Param = ParamCompInt8BlockEpilogue;
BTLA_CODE forward(const int32_t* srcptr, float* dstptr, const int cachestep, const int M_offset, const int N_offset,
const int K_offset, const int M, const int N, const Param& _param, void* tmpcache,
size_t cachesize) {
static BTLA_CODE forward(const int32_t* srcptr, float* dstptr, const int cachestep, const int M_offset,
const int N_offset, const int K_offset, const int M, const int N, const Param& _param,
void* tmpcache, size_t cachesize) {
BTLA_CODE ret = BTLA_CODE::NotSupport;
float* scab = nullptr;
size_t ScaleBTmpSize = N * sizeof(float);
@@ -280,8 +286,8 @@ template <BTLA_ISA ISA_T>
class ZpDequantInt32ToFp32 {
public:
using Param = ParamZpDequantInt32ToFp32;
BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
const int N, const Param& _param, void* tmpcache, size_t cachesize) {
static BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset,
const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
auto COffset = M_offset * _param.ldc + N_offset;
auto cptr = _param.C + COffset;
auto ret = kernel::wrapper::DequanS32Fp32::template forward<ISA_T>(cacheptr, cachestep, cptr, _param.ldc, M, N,
@@ -321,8 +327,8 @@ template <BTLA_ISA ISA_T>
class AlphaBetaProcessS32U8 {
public:
using Param = ParamAlphaBetaProcessS32U8;
BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M,
const int N, const Param& _param, void* tmpcache, size_t cachesize) {
static BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset,
const int M, const int N, const Param& _param, void* tmpcache, size_t cachesize) {
auto COffset = M_offset * _param.ldc + N_offset;
auto cptr = _param.C + COffset;
return kernel::wrapper::QuanOutS32U32::template forward<ISA_T>(_param.alpha, cacheptr, cachestep, cptr, _param.ldc,
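Throughout bestla_epilogue.h the forward() entry points become static, so launchers can call Epilogue::forward(...) without constructing an epilogue object, and AccumulatorWriteBack now returns early when the source and destination types match and the cache pointer already aliases the output, skipping the Memcpy2D. The sketch below illustrates both ideas under assumed, simplified names; it is not the BesTLA interface.

#include <type_traits>

// Simplified write-back epilogue: static forward() plus the aliasing early-out.
template <typename SrcT, typename DstT>
struct WriteBackSketch {
  static int forward(const SrcT* cache, int cachestep, DstT* C, int ldc, int M, int N) {
    if constexpr (std::is_same_v<SrcT, DstT>) {
      if (static_cast<const void*>(cache) == static_cast<const void*>(C)) {
        return 0;  // results were produced in place; nothing to copy back
      }
    }
    for (int i = 0; i < M; ++i) {
      for (int j = 0; j < N; ++j) {
        C[i * ldc + j] = static_cast<DstT>(cache[i * cachestep + j]);
      }
    }
    return 0;
  }
};

// Usage mirrors the static call style in the diff: no object construction needed.
// float cache[4 * 8]; float C[4 * 16];
// WriteBackSketch<float, float>::forward(cache, 8, C, 16, 4, 8);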