[MLAS AArch64] SQNBitGemm CompInt8 kernel #18953

Merged Jan 13, 2024 · 36 commits

Changes from 23 commits

Commits
8940c0a
only register q4gemm benchmarks if q4gemm is available
edgchen1 Dec 4, 2023
a6a8ce6
some mlas cmake updates
edgchen1 Dec 5, 2023
53a46ca
change BlkLen from template param to function param
edgchen1 Dec 12, 2023
e2a9eee
Save work
edgchen1 Dec 14, 2023
966a915
only enable benchmark if available
edgchen1 Dec 14, 2023
b59e7e1
handle workspace in benchmark
edgchen1 Dec 15, 2023
585103b
QuantizeARow neon impl1
edgchen1 Dec 15, 2023
c26cef4
dot compint8 neon impl
edgchen1 Dec 15, 2023
1b7d81b
use single workspace pointer in interface, get matmul_nbits working
edgchen1 Dec 16, 2023
f7e3db5
Merge remote-tracking branch 'origin/main' into edgchen1/sqnbitgemm_q…
edgchen1 Dec 27, 2023
71bd3a9
renaming and cleanup
edgchen1 Dec 27, 2023
f7127f9
try different comp types in matmulnbits
edgchen1 Dec 28, 2023
0060f55
Merge remote-tracking branch 'origin/main' into edgchen1/sqnbitgemm_q…
edgchen1 Dec 28, 2023
b3147c6
rename enum, add doc
edgchen1 Dec 28, 2023
789bcdc
change quant b params from uint8_t* to std::byte*
edgchen1 Dec 28, 2023
039dd92
handle CompUndef
edgchen1 Dec 28, 2023
cb9f428
check if dot product instructions are available before setting SQNBit…
edgchen1 Dec 29, 2023
437ad52
try to fix compile issue
edgchen1 Dec 29, 2023
241ca27
move zero initialize out of unrolled loop
edgchen1 Dec 29, 2023
53e2ae2
update comment
edgchen1 Jan 2, 2024
d5b26b4
split out float conversion
edgchen1 Jan 2, 2024
02cf7b3
remove impl0_reference
edgchen1 Jan 2, 2024
5b4a86c
use thread per gemm in prepare workspace fn, reorder include
edgchen1 Jan 2, 2024
61998ea
make pointer const
edgchen1 Jan 3, 2024
fe7f0e7
Merge remote-tracking branch 'origin/main' into edgchen1/sqnbitgemm_q…
edgchen1 Jan 3, 2024
d54cbd9
remove unneeded and
edgchen1 Jan 10, 2024
7d8753c
Merge remote-tracking branch 'origin/main' into edgchen1/sqnbitgemm_q…
edgchen1 Jan 10, 2024
6d88a0b
move code from merge conflict
edgchen1 Jan 10, 2024
ccaa994
pack quant b data
edgchen1 Jan 11, 2024
cff3cb4
get matmulnbits working, add docs
edgchen1 Jan 11, 2024
f8aba0c
Merge remote-tracking branch 'origin/main' into edgchen1/sqnbitgemm_q…
edgchen1 Jan 11, 2024
33e6dd9
use threadpool to pack b data
edgchen1 Jan 11, 2024
4cd2474
shorten names, update docs
edgchen1 Jan 11, 2024
9244a3f
rename another function, add check for implementation in MlasSQNBitGe…
edgchen1 Jan 11, 2024
86f84ea
move b_data_block_offset out of unrolled loop body
edgchen1 Jan 12, 2024
2337375
move b data offset out of unrolled loop in compfp32 kernel
edgchen1 Jan 12, 2024
49 changes: 38 additions & 11 deletions cmake/onnxruntime_mlas.cmake
@@ -1,14 +1,17 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

set(MLAS_SRC_DIR ${ONNXRUNTIME_ROOT}/core/mlas/lib)
set(MLAS_ROOT ${ONNXRUNTIME_ROOT}/core/mlas)
set(MLAS_SRC_DIR ${MLAS_ROOT}/lib)
set(MLAS_INC_DIR ${MLAS_ROOT}/inc)

#
# All hardware agnostic source files here
# hardware specific files would cause trouble in
# multi-target build
#
onnxruntime_add_static_library(onnxruntime_mlas
${MLAS_SRC_DIR}/mlasi.h
${MLAS_SRC_DIR}/platform.cpp
${MLAS_SRC_DIR}/threading.cpp
${MLAS_SRC_DIR}/sgemm.cpp
@@ -33,9 +36,18 @@ onnxruntime_add_static_library(onnxruntime_mlas
${MLAS_SRC_DIR}/qpostprocessor.cpp
${MLAS_SRC_DIR}/qlgavgpool.cpp
${MLAS_SRC_DIR}/qdwconv_kernelsize.cpp
${MLAS_SRC_DIR}/sqnbitgemm.h
${MLAS_SRC_DIR}/sqnbitgemm.cpp
)

target_sources(onnxruntime_mlas PRIVATE
${MLAS_INC_DIR}/mlas_float16.h
${MLAS_INC_DIR}/mlas_gemm_postprocessor.h
${MLAS_INC_DIR}/mlas_q4.h
${MLAS_INC_DIR}/mlas_qnbit.h
${MLAS_INC_DIR}/mlas.h
)

if (NOT onnxruntime_ORT_MINIMAL_BUILD)
target_sources(onnxruntime_mlas PRIVATE
${MLAS_SRC_DIR}/q4_dq.cpp
@@ -46,7 +58,7 @@ endif()
set(ONNXRUNTIME_MLAS_LIBS onnxruntime_mlas)

function(add_jblas)
add_subdirectory(${MLAS_SRC_DIR}/x86_64/jblas jblas)
add_subdirectory(${MLAS_SRC_DIR}/x86_64/jblas jblas)
target_link_libraries(onnxruntime_mlas PRIVATE jblas::jblas)
target_sources(onnxruntime_mlas PRIVATE
${MLAS_SRC_DIR}/jblas_gemm.cpp
@@ -143,10 +155,6 @@ function(setup_mlas_source_for_windows)
target_sources(onnxruntime_mlas PRIVATE
${MLAS_SRC_DIR}/arm/sgemmc.cpp
)
# it should be removed after Visual Studio is upgraded to 17.7
if (MSVC)
add_compile_options("-d2SSAOptimizer-")
endif()
elseif(onnxruntime_target_platform STREQUAL "x64")

file(GLOB_RECURSE mlas_platform_srcs_avx CONFIGURE_DEPENDS
@@ -300,8 +308,8 @@ else()
if(APPLE)
get_target_property(ONNXRUNTIME_MLAS_MACOSX_ARCH onnxruntime_mlas OSX_ARCHITECTURES)
endif()
list(LENGTH ONNXRUNTIME_MLAS_MACOSX_ARCH ONNXRUNTIME_MLAS_MACOSX_ARCH_LENGH)
if(ONNXRUNTIME_MLAS_MACOSX_ARCH_LENGH GREATER 1)
list(LENGTH ONNXRUNTIME_MLAS_MACOSX_ARCH ONNXRUNTIME_MLAS_MACOSX_ARCH_LENGTH)
if(ONNXRUNTIME_MLAS_MACOSX_ARCH_LENGTH GREATER 1)
set(ONNXRUNTIME_MLAS_MULTI_ARCH TRUE)
endif()
#If ONNXRUNTIME_MLAS_MULTI_ARCH is true, we need to go through every if branch below
@@ -348,6 +356,8 @@ else()
${MLAS_SRC_DIR}/qgemm_kernel_sdot.cpp
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.cpp
)
set_source_files_properties(${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.cpp
PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+dotprod")
if (NOT APPLE)
set(mlas_platform_srcs
${mlas_platform_srcs}
@@ -617,10 +627,12 @@ if(USE_JBLAS)
endif()

foreach(mlas_target ${ONNXRUNTIME_MLAS_LIBS})
target_include_directories(${mlas_target} PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc ${MLAS_SRC_DIR})
target_include_directories(${mlas_target} PRIVATE ${MLAS_INC_DIR} ${MLAS_SRC_DIR})
onnxruntime_add_include_to_target(${mlas_target} ${GSL_TARGET})

set_target_properties(${mlas_target} PROPERTIES FOLDER "ONNXRuntime")
endforeach()
set_target_properties(onnxruntime_mlas PROPERTIES FOLDER "ONNXRuntime")

if (WIN32)
target_compile_options(onnxruntime_mlas PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:/wd6385>" "$<$<COMPILE_LANGUAGE:CXX>:/wd4127>")
if (onnxruntime_ENABLE_STATIC_ANALYSIS)
@@ -636,6 +648,21 @@ if (NOT onnxruntime_BUILD_SHARED_LIB)
FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()

# set up source group for MLAS source files
block()
set(source_group_srcs)
foreach(mlas_target ${ONNXRUNTIME_MLAS_LIBS})
get_target_property(mlas_target_srcs ${mlas_target} SOURCES)
foreach(mlas_target_src ${mlas_target_srcs})
cmake_path(IS_PREFIX MLAS_ROOT ${mlas_target_src} in_mlas_root)
if(in_mlas_root)
list(APPEND source_group_srcs ${mlas_target_src})
endif()
endforeach()
endforeach()
source_group(TREE ${MLAS_ROOT} FILES ${source_group_srcs})
endblock()


if (NOT onnxruntime_ORT_MINIMAL_BUILD)

@@ -647,7 +674,7 @@ if (NOT onnxruntime_ORT_MINIMAL_BUILD)
onnxruntime_add_executable(onnxruntime_mlas_q4dq
${MLAS_SRC_DIR}/q4_dq_cli.cpp
)
target_include_directories(onnxruntime_mlas_q4dq PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc ${MLAS_SRC_DIR})
target_include_directories(onnxruntime_mlas_q4dq PRIVATE ${MLAS_INC_DIR} ${MLAS_SRC_DIR})
set_target_properties(onnxruntime_mlas_q4dq PROPERTIES FOLDER "ONNXRuntimeTest")

target_link_libraries(onnxruntime_mlas_q4dq PRIVATE ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common)
71 changes: 43 additions & 28 deletions onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
@@ -161,7 +161,7 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const {
gemm_params[i].C = y_data + helper.OutputOffsets()[i];
gemm_params[i].ldc = N;
}
auto ws_size = MlasSQNBitsGemmBatchWorkspaceSize(M, N, K, max_len, gemm_params.data());
auto ws_size = MlasSQNBitsGemmBatchPackedBWorkspaceSize(M, N, K, max_len, gemm_params.data());
// workspace for activation process(dynamic quantization and others)
auto ws_ptr = IAllocator::MakeUniquePtr<int8_t>(allocator, ws_size);
MlasSQNBitsGemmBatchPackedB(M, N, K, max_len, gemm_params.data(), ws_ptr.get(),
@@ -195,34 +195,49 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const {
const size_t K = static_cast<size_t>(helper.K());
const size_t lda = helper.Lda(false);

if (MlasIsSQNBitGemmAvailable(nbits_, block_size_)) {
// number of bytes or elements between adjacent matrices
size_t b_data_matrix_stride_in_bytes, b_scale_matrix_stride, b_zero_point_matrix_stride_in_bytes;
MlasBlockwiseQuantizedBufferSizes(static_cast<int>(nbits_), static_cast<int>(block_size_), /* columnwise */ true,
static_cast<int>(K), static_cast<int>(N),
b_data_matrix_stride_in_bytes, b_scale_matrix_stride,
&b_zero_point_matrix_stride_in_bytes);

const size_t b_matrix_size = K * N;

InlinedVector<MLAS_SQNBIT_GEMM_DATA_PARAMS> data(batch_count);
for (size_t i = 0; i < batch_count; ++i) {
const size_t b_matrix_offset = helper.RightOffsets()[i] / b_matrix_size;

data[i].A = a_data + helper.LeftOffsets()[i];
data[i].lda = lda;
data[i].QuantBData = b_data + b_matrix_offset * b_data_matrix_stride_in_bytes;
data[i].QuantBScale = scales_data + b_matrix_offset * b_scale_matrix_stride;
data[i].QuantBZeroPoint = zero_points_data != nullptr
? zero_points_data + b_matrix_offset * b_zero_point_matrix_stride_in_bytes
: nullptr;
data[i].C = y_data + helper.OutputOffsets()[i];
data[i].ldc = N;
}

MlasSQNBitGemmBatch(M, N, K, batch_count, nbits_, block_size_, data.data(), thread_pool);
for (int64_t accuracy_level = accuracy_level_;
accuracy_level >= static_cast<int64_t>(CompMostAccurate);
--accuracy_level) {
const auto compute_type = static_cast<MLAS_SQNBIT_GEMM_COMPUTE_TYPE>(accuracy_level);
if (MlasIsSQNBitGemmAvailable(M, N, K, nbits_, block_size_, compute_type)) {
// number of bytes or elements between adjacent matrices
size_t b_data_matrix_stride_in_bytes, b_scale_matrix_stride, b_zero_point_matrix_stride_in_bytes;
MlasBlockwiseQuantizedBufferSizes(static_cast<int>(nbits_), static_cast<int>(block_size_), /* columnwise */ true,
static_cast<int>(K), static_cast<int>(N),
b_data_matrix_stride_in_bytes, b_scale_matrix_stride,
&b_zero_point_matrix_stride_in_bytes);

const size_t b_matrix_size = K * N;

IAllocatorUniquePtr<std::byte> workspace{};
if (const size_t workspace_size = MlasSQNBitGemmBatchWorkspaceSize(M, N, K, batch_count,
nbits_, block_size_, compute_type);
workspace_size > 0) {
AllocatorPtr allocator;
ORT_RETURN_IF_ERROR(ctx->GetTempSpaceAllocator(&allocator));
workspace = IAllocator::MakeUniquePtr<std::byte>(allocator, workspace_size);
}

InlinedVector<MLAS_SQNBIT_GEMM_DATA_PARAMS> data(batch_count);
for (size_t i = 0; i < batch_count; ++i) {
const size_t b_matrix_offset = helper.RightOffsets()[i] / b_matrix_size;

data[i].A = a_data + helper.LeftOffsets()[i];
data[i].lda = lda;
data[i].QuantBData = b_data + b_matrix_offset * b_data_matrix_stride_in_bytes;
data[i].QuantBScale = scales_data + b_matrix_offset * b_scale_matrix_stride;
data[i].QuantBZeroPoint = zero_points_data != nullptr
? zero_points_data + b_matrix_offset * b_zero_point_matrix_stride_in_bytes
: nullptr;
data[i].C = y_data + helper.OutputOffsets()[i];
data[i].ldc = N;
}

MlasSQNBitGemmBatch(M, N, K, batch_count, nbits_, block_size_, compute_type, data.data(), workspace.get(),
thread_pool);

return Status::OK();
return Status::OK();
}
}

const size_t ldb = helper.Ldb(true);
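
The net effect of the rewritten dispatch above: MatMulNBits starts from the node's requested accuracy level (the fastest compute type the model allows) and steps toward CompMostAccurate until it finds a level with an available kernel, only falling back to the plain dequantize-plus-fp32-GEMM path when no level matches. Below is a distilled, self-contained sketch of that selection step. The enum mirrors mlas_qnbit.h, and MlasIsSQNBitGemmAvailable is stubbed so the sketch compiles on its own; in ONNX Runtime the real MLAS query is used.

```cpp
#include <cstddef>
#include <cstdint>
#include <optional>

enum MLAS_SQNBIT_GEMM_COMPUTE_TYPE {
  CompUndef = 0,
  CompFp32,
  CompFp16,
  CompBf16,
  CompInt8,
  CompMostAccurate = CompUndef,
  CompLeastAccurate = CompInt8,
};

// Stub: pretend only the fp32 kernel is implemented on this platform.
bool MlasIsSQNBitGemmAvailable(std::size_t /*M*/, std::size_t /*N*/, std::size_t /*K*/,
                               std::size_t /*BlkBitWidth*/, std::size_t /*BlkLen*/,
                               MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType) {
  return ComputeType == CompFp32;
}

std::optional<MLAS_SQNBIT_GEMM_COMPUTE_TYPE> SelectComputeType(
    std::int64_t requested_accuracy_level,
    std::size_t M, std::size_t N, std::size_t K,
    std::size_t BlkBitWidth, std::size_t BlkLen) {
  // Walk from the requested (fastest allowed) level toward the most accurate
  // one; the first level with an available kernel wins.
  for (std::int64_t level = requested_accuracy_level;
       level >= static_cast<std::int64_t>(CompMostAccurate); --level) {
    const auto type = static_cast<MLAS_SQNBIT_GEMM_COMPUTE_TYPE>(level);
    if (MlasIsSQNBitGemmAvailable(M, N, K, BlkBitWidth, BlkLen, type)) {
      return type;
    }
  }
  return std::nullopt;  // no MLAS kernel; caller uses the fp32 fallback path
}
```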
86 changes: 66 additions & 20 deletions onnxruntime/core/mlas/inc/mlas_qnbit.h
@@ -23,19 +23,36 @@
#include "mlas.h"
#include "mlas_gemm_postprocessor.h"

/**
* @brief Define compute types of block quantization, in order of decreasing accuracy.
*/
typedef enum {
CompUndef = 0, /*!< undef */
CompFp32, /*!< input fp32, accumulator fp32 */
CompFp16, /*!< input fp16, accumulator fp16 */
CompBf16, /*!< input bf16, accumulator fp32 */
CompInt8, /*!< input int8, accumulator int32 */

// special values that should be the first and last actual values

CompMostAccurate = CompUndef,
CompLeastAccurate = CompInt8,
} MLAS_SQNBIT_COMPUTE_TYPE;

using MLAS_SQNBIT_GEMM_COMPUTE_TYPE = MLAS_SQNBIT_COMPUTE_TYPE; // TODO consolidate these

[GitHub Actions / cpplint warning, mlas_qnbit.h line 42] Missing username in TODO; it should look like "// TODO(my_username): Stuff." [readability/todo] [2]

/**
* @brief Data parameters for float/n-bit quantized int GEMM routine.
*/
struct MLAS_SQNBIT_GEMM_DATA_PARAMS {
const float* A = nullptr; ///< address of A (float32 matrix)
size_t lda = 0; ///< leading dimension of A
const void* QuantBData = nullptr; ///< address of quantized B (quantized n-bit int values)
const float* QuantBScale = nullptr; ///< address of scale values of quantized B, one per block
const void* QuantBZeroPoint = nullptr; ///< optional address of zero point values of quantized B, one per block
bool IsBPacked = false; ///< whether B values are packed in an optimized format for the computation
const float* Bias = nullptr; ///< optional address of Bias, vector size N
float* C = nullptr; ///< address of result matrix
size_t ldc = 0; ///< leading dimension of C
const float* A = nullptr; ///< address of A (float32 matrix)
size_t lda = 0; ///< leading dimension of A
const void* QuantBData = nullptr; ///< address of quantized B (quantized n-bit int values)
const float* QuantBScale = nullptr; ///< address of scale values of quantized B, one per block
const void* QuantBZeroPoint = nullptr; ///< optional address of zero point values of quantized B, one per block
const float* Bias = nullptr; ///< optional address of Bias, vector size N
float* C = nullptr; ///< address of result matrix
size_t ldc = 0; ///< leading dimension of C

///< optional post processing to apply to result matrix
MLAS_GEMM_POSTPROCESSOR<float>* PostProcessor = nullptr;
@@ -46,13 +63,19 @@
* A must be a float32 matrix
* B must be a quantized and packed n-bit int matrix
*
* Call MlasIsSQNBitGemmAvailable() with the same parameters to determine whether this function may be called.
*
* @param[in] M row size of matrix A and C
* @param[in] N column size of matrix B and C
* @param[in] K column size of matrix A and row size of matrix B
* @param[in] BatchN number of batches
* @param[in] BlkBitWidth quantized value bit width (e.g., 4 means 4 bit ints)
* @param[in] BlkLen number of quantized values per block
* @param[in] ComputeType GEMM compute type (e.g., multiplying float or int8 values)
* @param[inout] DataParams An array (size BatchN) of parameter blocks
* @param[in] Workspace Address of intermediate workspace buffer.
If MlasSQNBitGemmBatchWorkspaceSize() returns a non-zero value, this must be a
buffer with at least that many bytes. Otherwise, it may be nullptr.
* @param[in] ThreadPool optional thread pool to use
*/
void MLASCALL
@@ -63,31 +86,54 @@
size_t BatchN,
size_t BlkBitWidth,
size_t BlkLen,
MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType,
const MLAS_SQNBIT_GEMM_DATA_PARAMS* DataParams,
void* Workspace,
MLAS_THREADPOOL* ThreadPool = nullptr
);

/**
* @brief Determines whether a float32/quantized n-bit int GEMM implementation is available on the current platform.
*
* @param[in] M row size of matrix A and C
* @param[in] N column size of matrix B and C
* @param[in] K column size of matrix A and row size of matrix B
* @param[in] BlkBitWidth quantized value bit width (e.g., 4 means 4 bit ints)
* @param[in] BlkLen number of quantized values per block
* @param[in] ComputeType GEMM compute type (e.g., multiplying float or int8 values)
*/
bool MLASCALL
MlasIsSQNBitGemmAvailable(
size_t M,
size_t N,
size_t K,
size_t BlkBitWidth,
size_t BlkLen
size_t BlkLen,
MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType
);

/**
* @brief Define compute types of block quantization
* @brief Gets the size in bytes of the intermediate workspace buffer required by the float32/quantized n-bit int GEMM
* implementation. If zero, no intermediate workspace is required.
*
* @param[in] M row size of matrix A and C
* @param[in] N column size of matrix B and C
* @param[in] K column size of matrix A and row size of matrix B
* @param[in] BatchN number of batches
* @param[in] BlkBitWidth quantized value bit width (e.g., 4 means 4 bit ints)
* @param[in] BlkLen number of quantized values per block
* @param[in] ComputeType GEMM compute type (e.g., multiplying float or int8 values)
*/
typedef enum {
CompUndef = 0, /*!< undef */
CompFp32 = 1, /*!< input fp32, accumulator fp32 */
CompFp16 = 2, /*!< input fp16, accumulator fp16 */
CompBf16 = 3, /*!< input bf16, accumulator fp32 */
CompInt8 = 4 /*!< input int8, accumulator int32 */
} MLAS_SQNBIT_COMPUTE_TYPE;
size_t MLASCALL
MlasSQNBitGemmBatchWorkspaceSize(
size_t M,
size_t N,
size_t K,
size_t BatchN,
size_t BlkBitWidth,
size_t BlkLen,
MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType
);

/**
* @brief Data parameters for NBits GEMM routine
@@ -139,7 +185,7 @@
* @param last_call flag to activate the epilogue process of packB. OpKernel::PrePack will query input tensor
* one by one: QData, Scale, Zp (if is_asym is true). But kernel prefers to pack all tensors into one blob data where
* they can share the common attributes like: block_size. Meanwhile, kernel has some pre-computations to speed up
* inference which require that all blob data are ready. So, you need to set this flag to true when passing Scale
* inference which require that all blob data are ready. So, you need to set this flag to true when passing Scale
* (is_asym is false) and Zp(is_asym is true).
* @param thread_pool
*/
@@ -186,7 +232,7 @@
* @return Workspace size in bytes
*/
size_t MLASCALL
MlasSQNBitsGemmBatchWorkspaceSize(
MlasSQNBitsGemmBatchPackedBWorkspaceSize(
const size_t M,
const size_t N,
const size_t K,
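
Taken together, the reworked header defines a three-step calling sequence: query MlasIsSQNBitGemmAvailable, size the scratch buffer with MlasSQNBitGemmBatchWorkspaceSize, then run MlasSQNBitGemmBatch. A minimal usage sketch follows, under assumptions the header does not fix: BlkBitWidth = 4 and BlkLen = 32 are illustrative choices, RunSQNBitGemm is a hypothetical wrapper, and B is assumed symmetrically quantized (no zero points), with its data and scales already laid out blockwise as the API expects.

```cpp
#include <cstddef>
#include <vector>

#include "mlas_qnbit.h"  // onnxruntime/core/mlas/inc, per this diff

bool RunSQNBitGemm(std::size_t M, std::size_t N, std::size_t K,
                   const float* A, std::size_t lda,
                   const void* QuantBData, const float* QuantBScale,
                   float* C, std::size_t ldc,
                   MLAS_THREADPOOL* thread_pool) {
  constexpr std::size_t BlkBitWidth = 4;  // 4-bit quantized B (illustrative)
  constexpr std::size_t BlkLen = 32;      // 32 quantized values per block (illustrative)
  constexpr std::size_t BatchN = 1;
  const auto ComputeType = CompInt8;      // request the new int8 path

  if (!MlasIsSQNBitGemmAvailable(M, N, K, BlkBitWidth, BlkLen, ComputeType)) {
    return false;  // caller should fall back, e.g. to CompFp32 or dequantization
  }

  // Per the docs above: a non-zero size means the workspace buffer is required
  // (for CompInt8 this covers the dynamically quantized activations).
  std::vector<std::byte> workspace(MlasSQNBitGemmBatchWorkspaceSize(
      M, N, K, BatchN, BlkBitWidth, BlkLen, ComputeType));

  MLAS_SQNBIT_GEMM_DATA_PARAMS params{};
  params.A = A;
  params.lda = lda;
  params.QuantBData = QuantBData;
  params.QuantBScale = QuantBScale;
  params.QuantBZeroPoint = nullptr;  // symmetric quantization in this sketch
  params.C = C;
  params.ldc = ldc;

  MlasSQNBitGemmBatch(M, N, K, BatchN, BlkBitWidth, BlkLen, ComputeType,
                      &params, workspace.empty() ? nullptr : workspace.data(),
                      thread_pool);
  return true;
}
```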
4 changes: 3 additions & 1 deletion onnxruntime/core/mlas/lib/platform.cpp
@@ -482,7 +482,6 @@ Return Value:
this->SymmQgemmDispatch = &MlasSymmQgemmS8DispatchNeon;
this->ConvSymU8S8Dispatch = &MlasConvSymU8DispatchNeon;
this->ConvSymS8S8Dispatch = &MlasConvSymS8DispatchNeon;
this->SQNBitGemmDispatch = &MlasSQNBitGemmDispatchNeon;

//
// Check if the processor supports ASIMD dot product instructions.
@@ -512,6 +511,9 @@ Return Value:
this->SymmQgemmDispatch = &MlasSymmQgemmS8DispatchSdot;
this->ConvSymU8S8Dispatch = &MlasConvSymU8DispatchDot;
this->ConvSymS8S8Dispatch = &MlasConvSymS8DispatchDot;

// MlasSQNBitGemmDispatchNeon has a dependency on dot product instructions
this->SQNBitGemmDispatch = &MlasSQNBitGemmDispatchNeon;
}

#if defined(__linux__)