[MLAS AArch64] SQNBitGemm CompInt8 kernel #18953

Merged Jan 13, 2024 · 36 commits

Changes from 23 commits

Commits
8940c0a
only register q4gemm benchmarks if q4gemm is available
edgchen1 Dec 4, 2023
a6a8ce6
some mlas cmake updates
edgchen1 Dec 5, 2023
53a46ca
change BlkLen from template param to function param
edgchen1 Dec 12, 2023
e2a9eee
Save work
edgchen1 Dec 14, 2023
966a915
only enable benchmark if available
edgchen1 Dec 14, 2023
b59e7e1
handle workspace in benchmark
edgchen1 Dec 15, 2023
585103b
QuantizeARow neon impl1
edgchen1 Dec 15, 2023
c26cef4
dot compint8 neon impl
edgchen1 Dec 15, 2023
1b7d81b
use single workspace pointer in interface, get matmul_nbits working
edgchen1 Dec 16, 2023
f7e3db5
Merge remote-tracking branch 'origin/main' into edgchen1/sqnbitgemm_q…
edgchen1 Dec 27, 2023
71bd3a9
renaming and cleanup
edgchen1 Dec 27, 2023
f7127f9
try different comp types in matmulnbits
edgchen1 Dec 28, 2023
0060f55
Merge remote-tracking branch 'origin/main' into edgchen1/sqnbitgemm_q…
edgchen1 Dec 28, 2023
b3147c6
rename enum, add doc
edgchen1 Dec 28, 2023
789bcdc
change quant b params from uint8_t* to std::byte*
edgchen1 Dec 28, 2023
039dd92
handle CompUndef
edgchen1 Dec 28, 2023
cb9f428
check if dot product instructions are available before setting SQNBit…
edgchen1 Dec 29, 2023
437ad52
try to fix compile issue
edgchen1 Dec 29, 2023
241ca27
move zero initialize out of unrolled loop
edgchen1 Dec 29, 2023
53e2ae2
update comment
edgchen1 Jan 2, 2024
d5b26b4
split out float conversion
edgchen1 Jan 2, 2024
02cf7b3
remove impl0_reference
edgchen1 Jan 2, 2024
5b4a86c
use thread per gemm in prepare workspace fn, reorder include
edgchen1 Jan 2, 2024
61998ea
make pointer const
edgchen1 Jan 3, 2024
fe7f0e7
Merge remote-tracking branch 'origin/main' into edgchen1/sqnbitgemm_q…
edgchen1 Jan 3, 2024
d54cbd9
remove unneeded and
edgchen1 Jan 10, 2024
7d8753c
Merge remote-tracking branch 'origin/main' into edgchen1/sqnbitgemm_q…
edgchen1 Jan 10, 2024
6d88a0b
move code from merge conflict
edgchen1 Jan 10, 2024
ccaa994
pack quant b data
edgchen1 Jan 11, 2024
cff3cb4
get matmulnbits working, add docs
edgchen1 Jan 11, 2024
f8aba0c
Merge remote-tracking branch 'origin/main' into edgchen1/sqnbitgemm_q…
edgchen1 Jan 11, 2024
33e6dd9
use threadpool to pack b data
edgchen1 Jan 11, 2024
4cd2474
shorten names, update docs
edgchen1 Jan 11, 2024
9244a3f
rename another function, add check for implementation in MlasSQNBitGe…
edgchen1 Jan 11, 2024
86f84ea
move b_data_block_offset out of unrolled loop body
edgchen1 Jan 12, 2024
2337375
move b data offset out of unrolled loop in compfp32 kernel
edgchen1 Jan 12, 2024
49 changes: 38 additions & 11 deletions cmake/onnxruntime_mlas.cmake
@@ -1,14 +1,17 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

set(MLAS_SRC_DIR ${ONNXRUNTIME_ROOT}/core/mlas/lib)
set(MLAS_ROOT ${ONNXRUNTIME_ROOT}/core/mlas)
set(MLAS_SRC_DIR ${MLAS_ROOT}/lib)
set(MLAS_INC_DIR ${MLAS_ROOT}/inc)

#
# All hardware agnostic source files here
# hardware specific files would cause trouble in
# multi-target build
#
onnxruntime_add_static_library(onnxruntime_mlas
${MLAS_SRC_DIR}/mlasi.h
${MLAS_SRC_DIR}/platform.cpp
${MLAS_SRC_DIR}/threading.cpp
${MLAS_SRC_DIR}/sgemm.cpp
@@ -33,9 +36,18 @@ onnxruntime_add_static_library(onnxruntime_mlas
${MLAS_SRC_DIR}/qpostprocessor.cpp
${MLAS_SRC_DIR}/qlgavgpool.cpp
${MLAS_SRC_DIR}/qdwconv_kernelsize.cpp
${MLAS_SRC_DIR}/sqnbitgemm.h
${MLAS_SRC_DIR}/sqnbitgemm.cpp
)

target_sources(onnxruntime_mlas PRIVATE
${MLAS_INC_DIR}/mlas_float16.h
${MLAS_INC_DIR}/mlas_gemm_postprocessor.h
${MLAS_INC_DIR}/mlas_q4.h
${MLAS_INC_DIR}/mlas_qnbit.h
${MLAS_INC_DIR}/mlas.h
)

if (NOT onnxruntime_ORT_MINIMAL_BUILD)
target_sources(onnxruntime_mlas PRIVATE
${MLAS_SRC_DIR}/q4_dq.cpp
@@ -46,7 +58,7 @@ endif()
set(ONNXRUNTIME_MLAS_LIBS onnxruntime_mlas)

function(add_jblas)
add_subdirectory(${MLAS_SRC_DIR}/x86_64/jblas jblas)
add_subdirectory(${MLAS_SRC_DIR}/x86_64/jblas jblas)
target_link_libraries(onnxruntime_mlas PRIVATE jblas::jblas)
target_sources(onnxruntime_mlas PRIVATE
${MLAS_SRC_DIR}/jblas_gemm.cpp
@@ -143,10 +155,6 @@ function(setup_mlas_source_for_windows)
target_sources(onnxruntime_mlas PRIVATE
${MLAS_SRC_DIR}/arm/sgemmc.cpp
)
# it should be removed after Visual Studio is upgraded to 17.7
if (MSVC)
add_compile_options("-d2SSAOptimizer-")
endif()
elseif(onnxruntime_target_platform STREQUAL "x64")

file(GLOB_RECURSE mlas_platform_srcs_avx CONFIGURE_DEPENDS
@@ -300,8 +308,8 @@ else()
if(APPLE)
get_target_property(ONNXRUNTIME_MLAS_MACOSX_ARCH onnxruntime_mlas OSX_ARCHITECTURES)
endif()
list(LENGTH ONNXRUNTIME_MLAS_MACOSX_ARCH ONNXRUNTIME_MLAS_MACOSX_ARCH_LENGH)
if(ONNXRUNTIME_MLAS_MACOSX_ARCH_LENGH GREATER 1)
list(LENGTH ONNXRUNTIME_MLAS_MACOSX_ARCH ONNXRUNTIME_MLAS_MACOSX_ARCH_LENGTH)
if(ONNXRUNTIME_MLAS_MACOSX_ARCH_LENGTH GREATER 1)
set(ONNXRUNTIME_MLAS_MULTI_ARCH TRUE)
endif()
#If ONNXRUNTIME_MLAS_MULTI_ARCH is true, we need to go through every if branch below
@@ -348,6 +356,8 @@ else()
${MLAS_SRC_DIR}/qgemm_kernel_sdot.cpp
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.cpp
)
set_source_files_properties(${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.cpp
PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+dotprod")
if (NOT APPLE)
set(mlas_platform_srcs
${mlas_platform_srcs}
@@ -617,10 +627,12 @@ if(USE_JBLAS)
endif()

foreach(mlas_target ${ONNXRUNTIME_MLAS_LIBS})
target_include_directories(${mlas_target} PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc ${MLAS_SRC_DIR})
target_include_directories(${mlas_target} PRIVATE ${MLAS_INC_DIR} ${MLAS_SRC_DIR})
onnxruntime_add_include_to_target(${mlas_target} ${GSL_TARGET})

set_target_properties(${mlas_target} PROPERTIES FOLDER "ONNXRuntime")
endforeach()
set_target_properties(onnxruntime_mlas PROPERTIES FOLDER "ONNXRuntime")

if (WIN32)
target_compile_options(onnxruntime_mlas PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:/wd6385>" "$<$<COMPILE_LANGUAGE:CXX>:/wd4127>")
if (onnxruntime_ENABLE_STATIC_ANALYSIS)
@@ -636,6 +648,21 @@ if (NOT onnxruntime_BUILD_SHARED_LIB)
FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()

# set up source group for MLAS source files
block()
set(source_group_srcs)
foreach(mlas_target ${ONNXRUNTIME_MLAS_LIBS})
get_target_property(mlas_target_srcs ${mlas_target} SOURCES)
foreach(mlas_target_src ${mlas_target_srcs})
cmake_path(IS_PREFIX MLAS_ROOT ${mlas_target_src} in_mlas_root)
if(in_mlas_root)
list(APPEND source_group_srcs ${mlas_target_src})
endif()
endforeach()
endforeach()
source_group(TREE ${MLAS_ROOT} FILES ${source_group_srcs})
endblock()


if (NOT onnxruntime_ORT_MINIMAL_BUILD)

@@ -647,7 +674,7 @@ if (NOT onnxruntime_ORT_MINIMAL_BUILD)
onnxruntime_add_executable(onnxruntime_mlas_q4dq
${MLAS_SRC_DIR}/q4_dq_cli.cpp
)
target_include_directories(onnxruntime_mlas_q4dq PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc ${MLAS_SRC_DIR})
target_include_directories(onnxruntime_mlas_q4dq PRIVATE ${MLAS_INC_DIR} ${MLAS_SRC_DIR})
set_target_properties(onnxruntime_mlas_q4dq PROPERTIES FOLDER "ONNXRuntimeTest")

target_link_libraries(onnxruntime_mlas_q4dq PRIVATE ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common)
71 changes: 43 additions & 28 deletions onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
@@ -161,7 +161,7 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const {
gemm_params[i].C = y_data + helper.OutputOffsets()[i];
gemm_params[i].ldc = N;
}
auto ws_size = MlasSQNBitsGemmBatchWorkspaceSize(M, N, K, max_len, gemm_params.data());
auto ws_size = MlasSQNBitsGemmBatchPackedBWorkspaceSize(M, N, K, max_len, gemm_params.data());
// workspace for activation process(dynamic quantization and others)
auto ws_ptr = IAllocator::MakeUniquePtr<int8_t>(allocator, ws_size);
MlasSQNBitsGemmBatchPackedB(M, N, K, max_len, gemm_params.data(), ws_ptr.get(),
@@ -195,34 +195,49 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const {
const size_t K = static_cast<size_t>(helper.K());
const size_t lda = helper.Lda(false);

if (MlasIsSQNBitGemmAvailable(nbits_, block_size_)) {
// number of bytes or elements between adjacent matrices
size_t b_data_matrix_stride_in_bytes, b_scale_matrix_stride, b_zero_point_matrix_stride_in_bytes;
MlasBlockwiseQuantizedBufferSizes(static_cast<int>(nbits_), static_cast<int>(block_size_), /* columnwise */ true,
static_cast<int>(K), static_cast<int>(N),
b_data_matrix_stride_in_bytes, b_scale_matrix_stride,
&b_zero_point_matrix_stride_in_bytes);

const size_t b_matrix_size = K * N;

InlinedVector<MLAS_SQNBIT_GEMM_DATA_PARAMS> data(batch_count);
for (size_t i = 0; i < batch_count; ++i) {
const size_t b_matrix_offset = helper.RightOffsets()[i] / b_matrix_size;

data[i].A = a_data + helper.LeftOffsets()[i];
data[i].lda = lda;
data[i].QuantBData = b_data + b_matrix_offset * b_data_matrix_stride_in_bytes;
data[i].QuantBScale = scales_data + b_matrix_offset * b_scale_matrix_stride;
data[i].QuantBZeroPoint = zero_points_data != nullptr
? zero_points_data + b_matrix_offset * b_zero_point_matrix_stride_in_bytes
: nullptr;
data[i].C = y_data + helper.OutputOffsets()[i];
data[i].ldc = N;
}

MlasSQNBitGemmBatch(M, N, K, batch_count, nbits_, block_size_, data.data(), thread_pool);
for (int64_t accuracy_level = accuracy_level_;
accuracy_level >= static_cast<int64_t>(CompMostAccurate);
--accuracy_level) {
const auto compute_type = static_cast<MLAS_SQNBIT_GEMM_COMPUTE_TYPE>(accuracy_level);
if (MlasIsSQNBitGemmAvailable(M, N, K, nbits_, block_size_, compute_type)) {
// number of bytes or elements between adjacent matrices
size_t b_data_matrix_stride_in_bytes, b_scale_matrix_stride, b_zero_point_matrix_stride_in_bytes;
MlasBlockwiseQuantizedBufferSizes(static_cast<int>(nbits_), static_cast<int>(block_size_), /* columnwise */ true,
static_cast<int>(K), static_cast<int>(N),
b_data_matrix_stride_in_bytes, b_scale_matrix_stride,
&b_zero_point_matrix_stride_in_bytes);

const size_t b_matrix_size = K * N;

IAllocatorUniquePtr<std::byte> workspace{};
if (const size_t workspace_size = MlasSQNBitGemmBatchWorkspaceSize(M, N, K, batch_count,
nbits_, block_size_, compute_type);
workspace_size > 0) {
AllocatorPtr allocator;
ORT_RETURN_IF_ERROR(ctx->GetTempSpaceAllocator(&allocator));
workspace = IAllocator::MakeUniquePtr<std::byte>(allocator, workspace_size);
}

InlinedVector<MLAS_SQNBIT_GEMM_DATA_PARAMS> data(batch_count);
for (size_t i = 0; i < batch_count; ++i) {
const size_t b_matrix_offset = helper.RightOffsets()[i] / b_matrix_size;

data[i].A = a_data + helper.LeftOffsets()[i];
data[i].lda = lda;
data[i].QuantBData = b_data + b_matrix_offset * b_data_matrix_stride_in_bytes;
data[i].QuantBScale = scales_data + b_matrix_offset * b_scale_matrix_stride;
data[i].QuantBZeroPoint = zero_points_data != nullptr
? zero_points_data + b_matrix_offset * b_zero_point_matrix_stride_in_bytes
: nullptr;
data[i].C = y_data + helper.OutputOffsets()[i];
data[i].ldc = N;
}

MlasSQNBitGemmBatch(M, N, K, batch_count, nbits_, block_size_, compute_type, data.data(), workspace.get(),
thread_pool);

return Status::OK();
return Status::OK();
}
}

const size_t ldb = helper.Ldb(true);
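
The net effect of the rewritten dispatch above: MatMulNBits starts from the node's requested accuracy level (the fastest compute type the model allows) and steps toward CompMostAccurate until it finds a level with an available kernel, only falling back to the plain dequantize-plus-fp32-GEMM path when no level matches. Below is a distilled, self-contained sketch of that selection step. The enum mirrors mlas_qnbit.h, and MlasIsSQNBitGemmAvailable is stubbed so the sketch compiles on its own; in ONNX Runtime the real MLAS query is used.

```cpp
#include <cstddef>
#include <cstdint>
#include <optional>

enum MLAS_SQNBIT_GEMM_COMPUTE_TYPE {
  CompUndef = 0,
  CompFp32,
  CompFp16,
  CompBf16,
  CompInt8,
  CompMostAccurate = CompUndef,
  CompLeastAccurate = CompInt8,
};

// Stub: pretend only the fp32 kernel is implemented on this platform.
bool MlasIsSQNBitGemmAvailable(std::size_t /*M*/, std::size_t /*N*/, std::size_t /*K*/,
                               std::size_t /*BlkBitWidth*/, std::size_t /*BlkLen*/,
                               MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType) {
  return ComputeType == CompFp32;
}

std::optional<MLAS_SQNBIT_GEMM_COMPUTE_TYPE> SelectComputeType(
    std::int64_t requested_accuracy_level,
    std::size_t M, std::size_t N, std::size_t K,
    std::size_t BlkBitWidth, std::size_t BlkLen) {
  // Walk from the requested (fastest allowed) level toward the most accurate
  // one; the first level with an available kernel wins.
  for (std::int64_t level = requested_accuracy_level;
       level >= static_cast<std::int64_t>(CompMostAccurate); --level) {
    const auto type = static_cast<MLAS_SQNBIT_GEMM_COMPUTE_TYPE>(level);
    if (MlasIsSQNBitGemmAvailable(M, N, K, BlkBitWidth, BlkLen, type)) {
      return type;
    }
  }
  return std::nullopt;  // no MLAS kernel; caller uses the fp32 fallback path
}
```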
86 changes: 66 additions & 20 deletions onnxruntime/core/mlas/inc/mlas_qnbit.h
@@ -23,19 +23,36 @@
#include "mlas.h"
#include "mlas_gemm_postprocessor.h"

/**
* @brief Define compute types of block quantization, in order of decreasing accuracy.
*/
typedef enum {
CompUndef = 0, /*!< undef */
CompFp32, /*!< input fp32, accumulator fp32 */
CompFp16, /*!< input fp16, accumulator fp16 */
CompBf16, /*!< input bf16, accumulator fp32 */
CompInt8, /*!< input int8, accumulator int32 */

// special values that should be the first and last actual values

CompMostAccurate = CompUndef,
CompLeastAccurate = CompInt8,
} MLAS_SQNBIT_COMPUTE_TYPE;

using MLAS_SQNBIT_GEMM_COMPUTE_TYPE = MLAS_SQNBIT_COMPUTE_TYPE; // TODO consolidate these

[GitHub Actions / cpplint warning, mlas_qnbit.h line 42] Missing username in TODO; it should look like "// TODO(my_username): Stuff." [readability/todo] [2]

/**
* @brief Data parameters for float/n-bit quantized int GEMM routine.
*/
struct MLAS_SQNBIT_GEMM_DATA_PARAMS {
const float* A = nullptr; ///< address of A (float32 matrix)
size_t lda = 0; ///< leading dimension of A
const void* QuantBData = nullptr; ///< address of quantized B (quantized n-bit int values)
const float* QuantBScale = nullptr; ///< address of scale values of quantized B, one per block
const void* QuantBZeroPoint = nullptr; ///< optional address of zero point values of quantized B, one per block
bool IsBPacked = false; ///< whether B values are packed in an optimized format for the computation
const float* Bias = nullptr; ///< optional address of Bias, vector size N
float* C = nullptr; ///< address of result matrix
size_t ldc = 0; ///< leading dimension of C
const float* A = nullptr; ///< address of A (float32 matrix)
size_t lda = 0; ///< leading dimension of A
const void* QuantBData = nullptr; ///< address of quantized B (quantized n-bit int values)
const float* QuantBScale = nullptr; ///< address of scale values of quantized B, one per block
const void* QuantBZeroPoint = nullptr; ///< optional address of zero point values of quantized B, one per block
const float* Bias = nullptr; ///< optional address of Bias, vector size N
float* C = nullptr; ///< address of result matrix
size_t ldc = 0; ///< leading dimension of C

///< optional post processing to apply to result matrix
MLAS_GEMM_POSTPROCESSOR<float>* PostProcessor = nullptr;
@@ -46,13 +63,19 @@
* A must be a float32 matrix
* B must be a quantized and packed n-bit int matrix
*
* Call MlasIsSQNBitGemmAvailable() with the same parameters to determine whether this function may be called.
*
* @param[in] M row size of matrix A and C
* @param[in] N column size of matrix B and C
* @param[in] K column size of matrix A and row size of matrix B
* @param[in] BatchN number of batches
* @param[in] BlkBitWidth quantized value bit width (e.g., 4 means 4 bit ints)
* @param[in] BlkLen number of quantized values per block
* @param[in] ComputeType GEMM compute type (e.g., multiplying float or int8 values)
* @param[inout] DataParams An array (size BatchN) of parameter blocks
* @param[in] Workspace Address of intermediate workspace buffer.
If MlasSQNBitGemmBatchWorkspaceSize() returns a non-zero value, this must be a
buffer with at least that many bytes. Otherwise, it may be nullptr.
* @param[in] ThreadPool optional thread pool to use
*/
void MLASCALL
@@ -63,31 +86,54 @@
size_t BatchN,
size_t BlkBitWidth,
size_t BlkLen,
MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType,
const MLAS_SQNBIT_GEMM_DATA_PARAMS* DataParams,
void* Workspace,
MLAS_THREADPOOL* ThreadPool = nullptr
);

/**
* @brief Determines whether a float32/quantized n-bit int GEMM implementation is available on the current platform.
*
* @param[in] M row size of matrix A and C
* @param[in] N column size of matrix B and C
* @param[in] K column size of matrix A and row size of matrix B
* @param[in] BlkBitWidth quantized value bit width (e.g., 4 means 4 bit ints)
* @param[in] BlkLen number of quantized values per block
* @param[in] ComputeType GEMM compute type (e.g., multiplying float or int8 values)
*/
bool MLASCALL
MlasIsSQNBitGemmAvailable(
size_t M,
size_t N,
size_t K,
size_t BlkBitWidth,
size_t BlkLen
size_t BlkLen,
MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType
);

/**
* @brief Define compute types of block quantization
* @brief Gets the size in bytes of the intermediate workspace buffer required by the float32/quantized n-bit int GEMM
* implementation. If zero, no intermediate workspace is required.
*
* @param[in] M row size of matrix A and C
* @param[in] N column size of matrix B and C
* @param[in] K column size of matrix A and row size of matrix B
* @param[in] BatchN number of batches
* @param[in] BlkBitWidth quantized value bit width (e.g., 4 means 4 bit ints)
* @param[in] BlkLen number of quantized values per block
* @param[in] ComputeType GEMM compute type (e.g., multiplying float or int8 values)
*/
typedef enum {
CompUndef = 0, /*!< undef */
CompFp32 = 1, /*!< input fp32, accumulator fp32 */
CompFp16 = 2, /*!< input fp16, accumulator fp16 */
CompBf16 = 3, /*!< input bf16, accumulator fp32 */
CompInt8 = 4 /*!< input int8, accumulator int32 */
} MLAS_SQNBIT_COMPUTE_TYPE;
size_t MLASCALL
MlasSQNBitGemmBatchWorkspaceSize(
size_t M,
size_t N,
size_t K,
size_t BatchN,
size_t BlkBitWidth,
size_t BlkLen,
MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType
);

/**
* @brief Data parameters for NBits GEMM routine
@@ -139,7 +185,7 @@
* @param last_call flag to activate the epilogue process of packB. OpKernel::PrePack will query input tensor
* one by one: QData, Scale, Zp (if is_asym is true). But kernel prefers to pack all tensors into one blob data where
* they can share the common attributes like: block_size. Meanwhile, kernel has some pre-computations to speed up
* inference which require that all blob data are ready. So, you need to set this flag to true when passing Scale
* inference which require that all blob data are ready. So, you need to set this flag to true when passing Scale
* (is_asym is false) and Zp(is_asym is true).
* @param thread_pool
*/
@@ -186,7 +232,7 @@
* @return Workspace size in bytes
*/
size_t MLASCALL
MlasSQNBitsGemmBatchWorkspaceSize(
MlasSQNBitsGemmBatchPackedBWorkspaceSize(
const size_t M,
const size_t N,
const size_t K,
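
Taken together, the reworked header defines a three-step calling sequence: query MlasIsSQNBitGemmAvailable, size the scratch buffer with MlasSQNBitGemmBatchWorkspaceSize, then run MlasSQNBitGemmBatch. A minimal usage sketch follows, under assumptions the header does not fix: BlkBitWidth = 4 and BlkLen = 32 are illustrative choices, RunSQNBitGemm is a hypothetical wrapper, and B is assumed symmetrically quantized (no zero points), with its data and scales already laid out blockwise as the API expects.

```cpp
#include <cstddef>
#include <vector>

#include "mlas_qnbit.h"  // onnxruntime/core/mlas/inc, per this diff

bool RunSQNBitGemm(std::size_t M, std::size_t N, std::size_t K,
                   const float* A, std::size_t lda,
                   const void* QuantBData, const float* QuantBScale,
                   float* C, std::size_t ldc,
                   MLAS_THREADPOOL* thread_pool) {
  constexpr std::size_t BlkBitWidth = 4;  // 4-bit quantized B (illustrative)
  constexpr std::size_t BlkLen = 32;      // 32 quantized values per block (illustrative)
  constexpr std::size_t BatchN = 1;
  const auto ComputeType = CompInt8;      // request the new int8 path

  if (!MlasIsSQNBitGemmAvailable(M, N, K, BlkBitWidth, BlkLen, ComputeType)) {
    return false;  // caller should fall back, e.g. to CompFp32 or dequantization
  }

  // Per the docs above: a non-zero size means the workspace buffer is required
  // (for CompInt8 this covers the dynamically quantized activations).
  std::vector<std::byte> workspace(MlasSQNBitGemmBatchWorkspaceSize(
      M, N, K, BatchN, BlkBitWidth, BlkLen, ComputeType));

  MLAS_SQNBIT_GEMM_DATA_PARAMS params{};
  params.A = A;
  params.lda = lda;
  params.QuantBData = QuantBData;
  params.QuantBScale = QuantBScale;
  params.QuantBZeroPoint = nullptr;  // symmetric quantization in this sketch
  params.C = C;
  params.ldc = ldc;

  MlasSQNBitGemmBatch(M, N, K, BatchN, BlkBitWidth, BlkLen, ComputeType,
                      &params, workspace.empty() ? nullptr : workspace.data(),
                      thread_pool);
  return true;
}
```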
4 changes: 3 additions & 1 deletion onnxruntime/core/mlas/lib/platform.cpp
@@ -482,7 +482,6 @@ Return Value:
this->SymmQgemmDispatch = &MlasSymmQgemmS8DispatchNeon;
this->ConvSymU8S8Dispatch = &MlasConvSymU8DispatchNeon;
this->ConvSymS8S8Dispatch = &MlasConvSymS8DispatchNeon;
this->SQNBitGemmDispatch = &MlasSQNBitGemmDispatchNeon;

//
// Check if the processor supports ASIMD dot product instructions.
@@ -512,6 +511,9 @@ Return Value:
this->SymmQgemmDispatch = &MlasSymmQgemmS8DispatchSdot;
this->ConvSymU8S8Dispatch = &MlasConvSymU8DispatchDot;
this->ConvSymS8S8Dispatch = &MlasConvSymS8DispatchDot;

// MlasSQNBitGemmDispatchNeon has a dependency on dot product instructions
this->SQNBitGemmDispatch = &MlasSQNBitGemmDispatchNeon;
}

#if defined(__linux__)