microsoft · liqunfu · Aug 2, 2024 · May 3, 2024 · May 6, 2024 · May 7, 2024
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
@@ -555,8 +555,17 @@ else()
           ${MLAS_SRC_DIR}/intrinsics/avx2/qdwconv_avx2.cpp
           ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx2.cpp
         )
-        set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")
 
+message(STATUS "CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
+message(STATUS "CMAKE_CXX_COMPILER_VERSION: ${CMAKE_CXX_COMPILER_VERSION}")
+
+if(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "10")
+          message(STATUS "Using -mavx2 -mfma -mavxvnni flags")
+          set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma -mavxvnni")
+else()
+          message(STATUS "Using -mavx2 -mfma flags")
+          set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")
+endif()
         set(mlas_platform_srcs_avx512f
           ${MLAS_SRC_DIR}/x86_64/DgemmKernelAvx512F.S
           ${MLAS_SRC_DIR}/x86_64/SgemmKernelAvx512F.S
@@ -575,7 +584,7 @@ else()
           ${MLAS_SRC_DIR}/x86_64/ConvSymKernelAvx512Core.S
           ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512.cpp
         )
-        set_source_files_properties(${mlas_platform_srcs_avx512core} PROPERTIES COMPILE_FLAGS "-mavx512bw -mavx512dq -mavx512vl")
+        set_source_files_properties(${mlas_platform_srcs_avx512core} PROPERTIES COMPILE_FLAGS "-mfma -mavx512vnni -mavx512bw -mavx512dq -mavx512vl")
 
         set(mlas_platform_srcs_avx512vnni
           ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512vnni.cpp

diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
@@ -104,6 +104,8 @@
 
     ORT_ENFORCE(nbits_ == 4,
                 "Only 4b quantization is supported for MatMulNBits op, additional bits support is planned.");
+    const Tensor* tensor_zero_point = nullptr;
+    has_zp_input_ = info.TryGetConstantInput(3, &tensor_zero_point);
 #ifdef ORT_NEURAL_SPEED
     const Tensor* tensor_B = nullptr;
     const Tensor* tensor_scale = nullptr;
@@ -139,6 +141,7 @@
   IAllocatorUniquePtr<void> packed_b_{};
   size_t packed_b_size_{0};
 
+  bool has_zp_input_{false};
 #if defined(ORT_NEURAL_SPEED)
 
   bool is_asym_{false};
@@ -207,10 +210,10 @@
     is_packed = true;
   }
 
-#else   // defined(ORT_NEURAL_SPEED)
-
+#else  // defined(ORT_NEURAL_SPEED)
+  ORT_UNUSED_PARAMETER(prepacked_weights);
+  const auto compute_type = static_cast<MLAS_SQNBIT_GEMM_COMPUTE_TYPE>(accuracy_level_);
   if (input_idx == InputIndex::B) {
-    const auto compute_type = static_cast<MLAS_SQNBIT_GEMM_COMPUTE_TYPE>(accuracy_level_);
     if (!MlasIsSQNBitGemmAvailable(nbits_, block_size_, compute_type)) {
       return Status::OK();
     }
@@ -220,12 +223,20 @@
     }
     auto qptr = tensor.DataRaw();
     packed_b_ = IAllocator::MakeUniquePtr<void>(alloc, packed_b_size_, true);
-    MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type, qptr, packed_b_.get());
-    if (prepacked_weights) {
-      prepacked_weights->buffers_.push_back(std::move(packed_b_));
-      prepacked_weights->buffer_sizes_.push_back(packed_b_size_);
-    }
+    MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type, qptr, packed_b_.get(), nullptr, has_zp_input_, nullptr, nullptr);
     is_packed = true;
+  } else if (compute_type == CompInt8) {
+#ifdef MLAS_TARGET_AMD64_IX86
+    if (input_idx == InputIndex::scales && packed_b_ != nullptr) {
+      auto sptr = tensor.Data<float>();
+      MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type, nullptr, packed_b_.get(), sptr, has_zp_input_, nullptr, nullptr);
+      is_packed = false;
+    } else if (input_idx == InputIndex::zero_points && packed_b_ != nullptr) {
+      auto zptr = tensor.Data<uint8_t>();
+      MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type, nullptr, packed_b_.get(), nullptr, has_zp_input_, zptr, nullptr);
+      is_packed = false;
+    }
+#endif
   }
 #endif  // defined(ORT_NEURAL_SPEED)
 
@@ -332,9 +343,9 @@
       const auto* bias_data = bias == nullptr ? nullptr : bias->Data<float>();
 
       IAllocatorUniquePtr<std::byte> workspace{};
-      if (const size_t workspace_size = MlasSQNBitGemmBatchWorkspaceSize(M, N, K, batch_count,
-                                                                         nbits_, block_size_, compute_type);
-          workspace_size > 0) {
+      const size_t workspace_size = MlasSQNBitGemmBatchWorkspaceSize(
+          M, N, K, batch_count, nbits_, block_size_, compute_type);
+      if (workspace_size > 0) {
         AllocatorPtr allocator;
         ORT_RETURN_IF_ERROR(ctx->GetTempSpaceAllocator(&allocator));
         workspace = IAllocator::MakeUniquePtr<std::byte>(allocator, workspace_size);
@@ -344,14 +355,18 @@
       for (size_t i = 0; i < batch_count; ++i) {
         data[i].A = a_data + helper.LeftOffsets()[i];
         data[i].lda = lda;
-        data[i].QuantBData = packed_b_.get();
+#ifdef MLAS_TARGET_AMD64_IX86
+        if (compute_type == CompInt8) {
+          data[i].QuantBDataWorkspace = packed_b_.get();
+        }
+#endif
+        data[i].PackedQuantBData = static_cast<std::byte*>(packed_b_.get());
         data[i].QuantBScale = scales_data;
         data[i].QuantBZeroPoint = zero_points_data;
         data[i].Bias = bias_data;
         data[i].C = y_data + helper.OutputOffsets()[i];
         data[i].ldc = N;
       }
-
       MlasSQNBitGemmBatch(M, N, K, batch_count, nbits_, block_size_, compute_type, data.data(), workspace.get(),
                           thread_pool);
 

@@ -43,14 +43,16 @@ typedef enum {
  * @brief Data parameters for float/n-bit quantized int GEMM routine.
  */
 struct MLAS_SQNBIT_GEMM_DATA_PARAMS {
-    const float* A = nullptr;               ///< address of A (float32 matrix)
-    size_t lda = 0;                         ///< leading dimension of A
-    const void* QuantBData = nullptr;       ///< address of quantized B (quantized n-bit int values)
-    const float* QuantBScale = nullptr;     ///< address of scale values of quantized B, one per block
-    const void* QuantBZeroPoint = nullptr;  ///< optional address of zero point values of quantized B, one per block
-    const float* Bias = nullptr;            ///< optional address of Bias, vector size N
-    float* C = nullptr;                     ///< address of result matrix
-    size_t ldc = 0;                         ///< leading dimension of C
+    const float* A = nullptr;                       ///< address of A (float32 matrix)
+    size_t lda = 0;                                 ///< leading dimension of A
+    const void* QuantBDataWorkspace;                ///< address of quantized B (quantized n-bit int values)
+    const std::byte* PackedQuantBData = nullptr;    /// address of packed quantized B data
+    const float* QuantBScale = nullptr;             ///< address of scale values of quantized B, one per block
+    const void* QuantBZeroPoint = nullptr;          ///< optional address of zero point values of quantized B, one per block
+    const float* QuantBBlkSum = nullptr;            ///< optional address of scale * zp, one per block
+    const float* Bias = nullptr;                    ///< optional address of Bias, vector size N
+    float* C = nullptr;                             ///< address of result matrix
+    size_t ldc = 0;                                 ///< leading dimension of C
 
     ///< optional post processing to apply to result matrix
     MLAS_GEMM_POSTPROCESSOR<float>* PostProcessor = nullptr;
@@ -159,14 +161,29 @@ MlasSQNBitGemmPackQuantBDataSize(
 /**
  * @brief Packs the quantized B data in a format that the kernel expects.
  *
- * @param[in]   N                   column size of matrix B and C
- * @param[in]   K                   column size of matrix A and row size of matrix B
- * @param[in]   BlkBitWidth         quantized value bit width (e.g., 4 means 4 bit ints)
- * @param[in]   BlkLen              number of quantized values per block
- * @param[in]   ComputeType         GEMM compute type (e.g., multiplying float or int8 values)
- * @param[in]   QuantBData          quantized B data
- * @param[out]  PackedQuantBData    packed quantized B data
- * @param[in]   ThreadPool          optional thread pool to use
+ * If the function is called without QuantBScale and QuantBZeroPoint,
+ * it just packs QuantBData into PackedQuantBDataAndOrBlkSum.
+ *
+ * If the function is called with QuantBData, QuantBScale, and QuantBZeroPoint
+ * additional BlkSum (Scale * zeropoint) is computed and stored at the second part of PackedQuantBDataAndOrBlkSum.
+ *
+ * Because ORT OpKernel::PrePack is called for each input (in this case, QuantBData,
+ * QuantBScale, and QuantBZeroPoint) separately, this function may be called 3 times, first with QuantBData,
+ * and then QuantBScale and QuantBZeroPoint. When the function is called with QuantBScale without QuantBZeroPoint,
+ * BlkSum is computed with default zero point 8 and stored at the second part of PackedQuantBDataAndOrBlkSum.
+ * If there is a third call with QuantBZeroPoint, BlkSum is recomputed/adjusted with provided zeropoint.
+ *
+ * @param[in]   N                               column size of matrix B and C
+ * @param[in]   K                               column size of matrix A and row size of matrix B
+ * @param[in]   BlkBitWidth                     quantized value bit width (e.g., 4 means 4 bit ints)
+ * @param[in]   BlkLen                          number of quantized values per block
+ * @param[in]   ComputeType                     GEMM compute type (e.g., multiplying float or int8 values)
+ * @param[in]   QuantBData                      quantized B data
+ * @param[in]   PackedQuantBDataAndOrBlkSum     buffer to store packed quantized B data and/or BlkSum
+ * @param[in]   QuantBScale                     quantized B scale
+ * @param[in]   has_zp_input                    whether QuantBZeroPoint is provided
+ * @param[in]   QuantBZeroPoint                 quantized B zero point
+ * @param[in]   ThreadPool          thread pool to use (no parallel if nullptr)
  */
 void MLASCALL
 MlasSQNBitGemmPackQuantBData(
@@ -176,6 +193,9 @@ MlasSQNBitGemmPackQuantBData(
     size_t BlkLen,
     MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType,
     const void* QuantBData,
-    void* PackedQuantBData,
-    MLAS_THREADPOOL* ThreadPool = nullptr
+    void* PackedQuantBDataAndOrBlkSum,
+    const void* QuantBScale,
+    bool has_zp_input,
+    const void* QuantBZeroPoint,
+    MLAS_THREADPOOL* ThreadPool
 );
@@ -993,6 +993,8 @@ extern const MLAS_SQNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchNeon;
 
 extern const MLAS_SQNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx2;
 
+extern const MLAS_SQNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx2vnni;
+
 extern const MLAS_SQNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx512;
 
 extern const MLAS_SQNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx512vnni;

@@ -409,6 +409,7 @@ Return Value:
                     this->GemmU8S8Kernel = MlasGemmU8S8KernelAvxVnni;
                     this->GemvU8S8Kernel = MlasGemvU8S8KernelAvxVnni;
                     this->ConvSymU8S8Dispatch = &MlasConvSymDispatchAvxVnni;
+                    this->SQNBitGemmDispatch = &MlasSQNBitGemmDispatchAvx2vnni;
                 }
 
 #if !defined(ORT_MINIMAL_BUILD)