From fa73d7cbf9ed7d1705c993874e4e05eae1c2f4a7 Mon Sep 17 00:00:00 2001 From: raoanag <127366241+raoanag@users.noreply.github.com> Date: Fri, 8 Mar 2024 15:35:10 -0800 Subject: [PATCH] [DML] DynamicQuantizeMatMul (#19763) ### Description DML Implementation for [com.microsoft.DynamicQuantizeMatMul ](https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#com.microsoft.DynamicQuantizeMatMul) ``` .\onnxruntime_test_all.exe --gtest_filter="*DynamicQuantizeMatMul.*" Note: Google Test filter = *DynamicQuantizeMatMul.* [==========] Running 10 tests from 1 test suite. [----------] Global test environment set-up. [----------] 10 tests from DynamicQuantizeMatMul [ RUN ] DynamicQuantizeMatMul.HasZeroPoint_NoBias_test_S8 [ OK ] DynamicQuantizeMatMul.HasZeroPoint_NoBias_test_S8 (635 ms) [ RUN ] DynamicQuantizeMatMul.HasZeroPoint_NoBias_test_U8 [ OK ] DynamicQuantizeMatMul.HasZeroPoint_NoBias_test_U8 (514 ms) [ RUN ] DynamicQuantizeMatMul.NoZeroPoint_HasBias_test_S8 [ OK ] DynamicQuantizeMatMul.NoZeroPoint_HasBias_test_S8 (512 ms) [ RUN ] DynamicQuantizeMatMul.NoZeroPoint_HasBias_test_U8 [ OK ] DynamicQuantizeMatMul.NoZeroPoint_HasBias_test_U8 (505 ms) [ RUN ] DynamicQuantizeMatMul.NoZeroPoint_NoBias_test_S8 [ OK ] DynamicQuantizeMatMul.NoZeroPoint_NoBias_test_S8 (526 ms) [ RUN ] DynamicQuantizeMatMul.NoZeroPoint_NoBias_test_U8 [ OK ] DynamicQuantizeMatMul.NoZeroPoint_NoBias_test_U8 (504 ms) [ RUN ] DynamicQuantizeMatMul.HasZeroPoint_HasBias_test_S8 [ OK ] DynamicQuantizeMatMul.HasZeroPoint_HasBias_test_S8 (512 ms) [ RUN ] DynamicQuantizeMatMul.HasZeroPoint_HasBias_test_U8 [ OK ] DynamicQuantizeMatMul.HasZeroPoint_HasBias_test_U8 (512 ms) [ RUN ] DynamicQuantizeMatMul.UInt8_test_with_empty_input [ OK ] DynamicQuantizeMatMul.UInt8_test_with_empty_input (112 ms) [ RUN ] DynamicQuantizeMatMul.B_PerColumn_ND [ OK ] DynamicQuantizeMatMul.B_PerColumn_ND (348 ms) [----------] 10 tests from DynamicQuantizeMatMul (4685 ms total) [----------] Global test environment 
tear-down [==========] 10 tests from 1 test suite ran. (4686 ms total) [ PASSED ] 10 tests. memleakdbg: ----- No memory leaks detected ----- ``` ### Motivation and Context - Added CalculateDynamicQuantizeMatMul to compute the expected test outputs, replacing the CPU EP reference run - Added more FP32 test cases to isolate all input datatype combinations --------- Co-authored-by: Xiang Zhang --- docs/OperatorKernels.md | 1 + .../External/DirectMLHelpers/DirectMLSchema.h | 30 +-- .../DmlOperatorDynamicQuantizeMatMul.cpp | 173 +++++++++++++++ .../src/Operators/OperatorRegistration.cpp | 2 + .../dml/OperatorAuthorHelper/OperatorHelper.h | 1 + .../OperatorAuthorHelper/OperatorVersions.h | 1 + .../dynamic_quantize_matmul_test.cc | 199 ++++++++++++------ 7 files changed, 330 insertions(+), 77 deletions(-) create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorDynamicQuantizeMatMul.cpp diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 9f5cd4cc842dc..955957f2957dc 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -1268,6 +1268,7 @@ Do not modify directly.* |BiasSplitGelu|*in* X:**T**
*in* bias:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| |ConvTransposeWithDynamicPads|*in* X:**T**
*in* W:**T**
*in* Pads:**tensor(int64)**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| |DequantizeLinear|*in* x:**T1**
*in* x_scale:**T2**
*in* x_zero_point:**T1**
*out* y:**T2**|1+|**T1** = tensor(int32), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| +|DynamicQuantizeMatMul|*in* A:**T1**
*in* B:**T2**
*in* b_scale:**T1**
*in* b_zero_point:**T2**
*in* bias:**T1**
*out* Y:**T1**|1+|**T1** = tensor(float)
**T2** = tensor(int8), tensor(uint8)| |EmbedLayerNormalization|*in* input_ids:**T1**
*in* segment_ids:**T1**
*in* word_embedding:**T**
*in* position_embedding:**T**
*in* segment_embedding:**T**
*in* gamma:**T**
*in* beta:**T**
*in* mask:**T1**
*in* position_ids:**T1**
*out* output:**T**
*out* mask_index:**T1**
*out* embedding_sum:**T**|1+|**T** = tensor(float), tensor(float16)| |FusedMatMul|*in* A:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| |FusedMatMulActivation|*in* A:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h index da57c2aa235fd..64ea5b7801a84 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h @@ -1865,43 +1865,43 @@ constexpr DML_OPERATOR_SCHEMA DML_MATRIX_MULTIPLY_INTEGER_OPERATOR_SCHEMA { DML_MATRIX_MULTIPLY_INTEGER_OPERATOR_SCHEMA_FIELDS, }; -constexpr DML_SCHEMA_FIELD DML_QUANTIZED_LINEAR_MATRIX_MULTIPLY_OPERATOR_SCHEMA_FIELDS[9] { +constexpr DML_SCHEMA_FIELD DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS[8] { DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ATensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "AScaleTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "AZeroPointTensor", true }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BScaleTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BZeroPointTensor", true }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputScaleTensor", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputZeroPointTensor", true }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BiasTensor", true }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, 
"OutputTensor", false }, }; -constexpr DML_OPERATOR_SCHEMA DML_QUANTIZED_LINEAR_MATRIX_MULTIPLY_OPERATOR_SCHEMA { - "DML_OPERATOR_QUANTIZED_LINEAR_MATRIX_MULTIPLY", - DML_OPERATOR_QUANTIZED_LINEAR_MATRIX_MULTIPLY, +constexpr DML_OPERATOR_SCHEMA DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA { + "DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT", + static_cast(DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT), DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, - 9, - DML_QUANTIZED_LINEAR_MATRIX_MULTIPLY_OPERATOR_SCHEMA_FIELDS, + 8, + DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS, }; -constexpr DML_SCHEMA_FIELD DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS[8] { +constexpr DML_SCHEMA_FIELD DML_QUANTIZED_LINEAR_MATRIX_MULTIPLY_OPERATOR_SCHEMA_FIELDS[9] { DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ATensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "AScaleTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "AZeroPointTensor", true }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BScaleTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BZeroPointTensor", true }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BiasTensor", true }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputScaleTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputZeroPointTensor", true }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, }; -constexpr DML_OPERATOR_SCHEMA 
DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA { - "DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT", - static_cast(DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT), +constexpr DML_OPERATOR_SCHEMA DML_QUANTIZED_LINEAR_MATRIX_MULTIPLY_OPERATOR_SCHEMA { + "DML_OPERATOR_QUANTIZED_LINEAR_MATRIX_MULTIPLY", + DML_OPERATOR_QUANTIZED_LINEAR_MATRIX_MULTIPLY, DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, - 8, - DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS, + 9, + DML_QUANTIZED_LINEAR_MATRIX_MULTIPLY_OPERATOR_SCHEMA_FIELDS, }; constexpr DML_SCHEMA_FIELD DML_CONVOLUTION_INTEGER_OPERATOR_SCHEMA_FIELDS[11] { diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorDynamicQuantizeMatMul.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorDynamicQuantizeMatMul.cpp new file mode 100644 index 0000000000000..c6a87da705a99 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorDynamicQuantizeMatMul.cpp @@ -0,0 +1,173 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "precomp.h" + +namespace Dml +{ +// DynamicQuantizeMatMul = MatrixMultiplyIntegerToFloat(DynamicQuantizeLinear(A), B) +class DmlOperatorDynamicQuantizeMatMul : public DmlOperator +{ + // This order matches the ONNX schema. + enum OnnxInputIndex + { + A, // Input + B, + B_scale, + B_zero_point, + Bias, + Count, + }; + +public: + DmlOperatorDynamicQuantizeMatMul(const MLOperatorKernelCreationContext& kernelCreationContext) + : DmlOperator(kernelCreationContext) + { + DmlOperator::Initialize(kernelCreationContext); + + const bool hasBias = kernelCreationContext.IsInputValid(OnnxInputIndex::Bias); + const bool hasBZP = kernelCreationContext.IsInputValid(OnnxInputIndex::B_zero_point); + + // Broadcast Bias tensor to the shape of the output tensor. 
+ if (hasBias) + { + m_inputTensorDescs[OnnxInputIndex::Bias] = CreateTensorDescFromInput( + kernelCreationContext, + OnnxInputIndex::Bias, + TensorAxis::DoNotCoerce, + TensorAxis::W, + TensorAxis::RightAligned, + kernelCreationContext.GetTensorShapeDescription().GetOutputTensorShape(0) + ); + } + MLOperatorTensorDataType BDatatype = kernelCreationContext.GetInputEdgeDescription(OnnxInputIndex::B).tensorDataType; + + std::vector ATensorShape = kernelCreationContext.GetTensorShapeDescription().GetInputTensorShape(OnnxInputIndex::A); + std::vector ExpectedAScaleTensorShape = {1, 1, 1, 1}; + std::vector ExpectedAZeroPointTensorShape = {1, 1, 1, 1}; + + // output edges between DynQL and MMItoFloat node + TensorDesc intermediateQuantizedATensorDesc = TensorDesc( + BDatatype, + gsl::make_span(ATensorShape), + gsl::make_span(ATensorShape), + TensorAxis::DoNotCoerce, + TensorAxis::W, + TensorAxis::RightAligned, + NchwDimensionCount, // minDimensionCount + 0 // guaranteedBaseOffsetAlignment + ); + + TensorDesc intermediateQuantizedAScaleTensorDesc = TensorDesc( + MLOperatorTensorDataType::Float, + gsl::make_span(ExpectedAScaleTensorShape), + gsl::make_span(ExpectedAScaleTensorShape), + TensorAxis::DoNotCoerce, + TensorAxis::W, + TensorAxis::RightAligned, + NchwDimensionCount, // minDimensionCount + 0 // guaranteedBaseOffsetAlignment + ); + + TensorDesc intermediateQuantizedAZeroPointTensorDesc = TensorDesc( + BDatatype, + gsl::make_span(ExpectedAZeroPointTensorShape), + gsl::make_span(ExpectedAZeroPointTensorShape), + TensorAxis::DoNotCoerce, + TensorAxis::W, + TensorAxis::RightAligned, + NchwDimensionCount, // minDimensionCount + 0 // guaranteedBaseOffsetAlignment + ); + + DML_TENSOR_DESC namedIntermediateQuantizedATensorDesc = intermediateQuantizedATensorDesc.GetDmlDesc(); + DML_TENSOR_DESC namedIntermediateQuantizedAScaleTensorDesc = intermediateQuantizedAScaleTensorDesc.GetDmlDesc(); + DML_TENSOR_DESC namedIntermediateQuantizedAZeroPointTensorDesc = 
intermediateQuantizedAZeroPointTensorDesc.GetDmlDesc(); + + std::vector inputDescs = GetDmlInputDescs(); + std::vector outputDescs = GetDmlOutputDescs(); + + DML_DYNAMIC_QUANTIZE_LINEAR_OPERATOR_DESC dynamicQuantizeLinearOperatorDesc = {}; + dynamicQuantizeLinearOperatorDesc.InputTensor = &inputDescs[OnnxInputIndex::A]; + dynamicQuantizeLinearOperatorDesc.OutputTensor = &namedIntermediateQuantizedATensorDesc; + dynamicQuantizeLinearOperatorDesc.OutputScaleTensor = &namedIntermediateQuantizedAScaleTensorDesc; + dynamicQuantizeLinearOperatorDesc.OutputZeroPointTensor = &namedIntermediateQuantizedAZeroPointTensorDesc; + + const DML_OPERATOR_DESC opDesc1{DML_OPERATOR_DYNAMIC_QUANTIZE_LINEAR, &dynamicQuantizeLinearOperatorDesc}; + + DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC matrixMultiplyIntergerToFloatOperatorDesc = {}; + matrixMultiplyIntergerToFloatOperatorDesc.ATensor = dynamicQuantizeLinearOperatorDesc.OutputTensor; + matrixMultiplyIntergerToFloatOperatorDesc.AScaleTensor = dynamicQuantizeLinearOperatorDesc.OutputScaleTensor; + matrixMultiplyIntergerToFloatOperatorDesc.AZeroPointTensor = dynamicQuantizeLinearOperatorDesc.OutputZeroPointTensor; + matrixMultiplyIntergerToFloatOperatorDesc.BTensor = &inputDescs[OnnxInputIndex::B]; + matrixMultiplyIntergerToFloatOperatorDesc.BScaleTensor = &inputDescs[OnnxInputIndex::B_scale]; + matrixMultiplyIntergerToFloatOperatorDesc.BZeroPointTensor = hasBZP? &inputDescs[OnnxInputIndex::B_zero_point] : nullptr; + matrixMultiplyIntergerToFloatOperatorDesc.BiasTensor = hasBias? 
&inputDescs[OnnxInputIndex::Bias] : nullptr; + matrixMultiplyIntergerToFloatOperatorDesc.OutputTensor = &outputDescs[0]; + + const DML_OPERATOR_DESC opDesc2{ DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT, &matrixMultiplyIntergerToFloatOperatorDesc}; + + MLOperatorGraphDesc operatorGraphDesc = {}; + std::vector opDescs{&opDesc1, &opDesc2}; + operatorGraphDesc.nodeCount = static_cast(opDescs.size()); + operatorGraphDesc.nodes = opDescs.data(); + + // set input edges + std::pair nodeToNodeInputIndex[OnnxInputIndex::Count] {{0, 0}, {1, 3}, {1, 4}, {1, 5}, {1, 6}}; + std::vector inputEdges; + for (uint32_t inputIndex = 0; inputIndex < OnnxInputIndex::Count; inputIndex++) + { + if (inputIndex == OnnxInputIndex::B_zero_point && !hasBZP) continue; + if (inputIndex == OnnxInputIndex::Bias && !hasBias) continue; + DML_INPUT_GRAPH_EDGE_DESC inputEdge = {}; + inputEdge.GraphInputIndex = inputIndex; // Graph input index is identical to the ONNX input index for DynamicQuantizeMatMul + inputEdge.ToNodeIndex = nodeToNodeInputIndex[inputIndex].first; + inputEdge.ToNodeInputIndex = nodeToNodeInputIndex[inputIndex].second; + inputEdges.push_back(inputEdge); + } + operatorGraphDesc.inputEdgeCount = gsl::narrow_cast(inputEdges.size()); + operatorGraphDesc.inputEdges = inputEdges.data(); + + // set intermediate edges + std::vector intermediateEdges; + + DML_INTERMEDIATE_GRAPH_EDGE_DESC dynQLToMMItofloatEdge1 = {}; + dynQLToMMItofloatEdge1.FromNodeIndex = 0; + dynQLToMMItofloatEdge1.FromNodeOutputIndex = 0; + dynQLToMMItofloatEdge1.ToNodeIndex = 1; + dynQLToMMItofloatEdge1.ToNodeInputIndex = 0; + intermediateEdges.push_back(dynQLToMMItofloatEdge1); + + DML_INTERMEDIATE_GRAPH_EDGE_DESC dynQLToMMItofloatEdge2 = {}; + dynQLToMMItofloatEdge2.FromNodeIndex = 0; + dynQLToMMItofloatEdge2.FromNodeOutputIndex = 1; + dynQLToMMItofloatEdge2.ToNodeIndex = 1; + dynQLToMMItofloatEdge2.ToNodeInputIndex = 1; + intermediateEdges.push_back(dynQLToMMItofloatEdge2); + + DML_INTERMEDIATE_GRAPH_EDGE_DESC
dynQLToMMItofloatEdge3 = {}; + dynQLToMMItofloatEdge3.FromNodeIndex = 0; + dynQLToMMItofloatEdge3.FromNodeOutputIndex = 2; + dynQLToMMItofloatEdge3.ToNodeIndex = 1; + dynQLToMMItofloatEdge3.ToNodeInputIndex = 2; + intermediateEdges.push_back(dynQLToMMItofloatEdge3); + + operatorGraphDesc.intermediateEdgeCount = gsl::narrow_cast(intermediateEdges.size()); + operatorGraphDesc.intermediateEdges = intermediateEdges.data(); + + // set the output edges + std::vector outputEdges; + DML_OUTPUT_GRAPH_EDGE_DESC outputEdge = {}; + outputEdge.FromNodeIndex = 1; + outputEdge.FromNodeOutputIndex = 0; + outputEdge.GraphOutputIndex = 0; + outputEdges.push_back(outputEdge); + operatorGraphDesc.outputEdgeCount = gsl::narrow_cast(outputEdges.size()); + operatorGraphDesc.outputEdges = outputEdges.data(); + + SetDmlOperatorGraphDesc(std::move(operatorGraphDesc), kernelCreationContext); + } +}; + +DML_OP_DEFINE_CREATION_FUNCTION(DynamicQuantizeMatMul, DmlOperatorDynamicQuantizeMatMul); +} // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp index f08151b61197a..38cf80b381762 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp @@ -435,6 +435,7 @@ DML_OP_EXTERN_CREATION_FUNCTION(Dropout); DML_OP_EXTERN_CREATION_FUNCTION(MatMul); DML_OP_EXTERN_CREATION_FUNCTION(FusedMatMul); DML_OP_EXTERN_CREATION_FUNCTION(FusedMatMulActivation); +DML_OP_EXTERN_CREATION_FUNCTION(DynamicQuantizeMatMul); DML_OP_EXTERN_CREATION_FUNCTION(Cast); DML_OP_EXTERN_CREATION_FUNCTION(CastLike15); DML_OP_EXTERN_CREATION_FUNCTION(CastLike19); @@ -1065,6 +1066,7 @@ constexpr static OperatorRegistrationInformation operatorRegistrationInformation {REG_INFO_MS( 1, Gelu, typeNameListDefault, supportedTypeListFloat16to32, 
DmlGraphSupport::Supported)}, {REG_INFO_MS( 1, BiasGelu, typeNameListDefault, supportedTypeListFloat16to32, DmlGraphSupport::Supported)}, {REG_INFO_MS( 1, FusedMatMul, typeNameListDefault, supportedTypeListFloat16to32, DmlGraphSupport::Supported)}, + {REG_INFO_MS( 1, DynamicQuantizeMatMul, typeNameListTwo, supportedTypeListDynamicQuantizeLinear, DmlGraphSupport::Supported)}, {REG_INFO_MS( 1, FusedMatMulActivation, typeNameListDefault, supportedTypeListFloat16to32, DmlGraphSupport::Supported)}, {REG_INFO_MS( 1, QLinearSigmoid, typeNameListDefault, supportedTypeListQLinearSigmoid, DmlGraphSupport::Supported, requiredConstantCpuInputs(), std::nullopt, QueryQLinearSigmoid)}, {REG_INFO_MS( 1, Attention, typeNameListAttention, supportedTypeListAttention, DmlGraphSupport::Supported, requiredConstantCpuInputs(), std::nullopt, QueryAttention)}, diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h index 06bacc1b28c99..1f5daed6ea0db 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h @@ -1776,6 +1776,7 @@ using ShapeInferenceHelper_Identity19 = GetOutputShapeAsInputShapeHelper; using ShapeInferenceHelper_MatMul = MatMulHelper; using ShapeInferenceHelper_MatMulInteger = MatMulHelper; using ShapeInferenceHelper_MatMulIntegerToFloat = MatMulHelper; +using ShapeInferenceHelper_DynamicQuantizeMatMul = MatMulHelper; using ShapeInferenceHelper_QLinearMatMul = QLinearMatMulHelper; using ShapeInferenceHelper_QLinearAdd = GetBroadcastedOutputShapeHelper; using ShapeInferenceHelper_DynamicQuantizeLinear = GetOutputShapeAsInputShapeHelper; diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h index d081aa2e29148..8de43f270598d 100644 --- 
a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h @@ -462,6 +462,7 @@ namespace OperatorHelper static const int sc_sinceVer_RotaryEmbedding = 1; static const int sc_sinceVer_QLinearAveragePool = 1; static const int sc_sinceVer_QLinearGlobalAveragePool = 1; + static const int sc_sinceVer_DynamicQuantizeMatMul = 1; } // namespace MsftOperatorSet1 } // namespace OperatorHelper diff --git a/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc b/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc index c70f659f1b645..88bee5fe1b125 100644 --- a/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc +++ b/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc @@ -23,20 +23,85 @@ namespace onnxruntime { namespace test { template -void TestDynamicQuantizeMatMul(const std::vector& A_dims, - std::vector B_dims, - const std::string& reference_model, - bool is_matrix_b_constant, +static void CalculateDynamicQuantizeMatMul(const int64_t M, const int64_t N, const int64_t K, + const std::vector& A_data, const std::vector& B_data, + std::vector& B_scale, std::vector& B_zero_point, + const std::vector& Bias, std::vector& Y_data, + bool per_column, bool has_zp, bool has_bias) { + // DynamicQuantize Matrix A + const uint32_t num_elements = static_cast(M * K); + std::vector QuantA_data(num_elements); + std::vector A_scale; + std::vector A_zero_point; + + // Get max and min + float min = std::numeric_limits::max(); + float max = std::numeric_limits::lowest(); + float qmax = static_cast(std::numeric_limits::max()); + float qmin = static_cast(std::numeric_limits::lowest()); + + for (uint32_t i = 0; i < num_elements; ++i) { + max = std::max(A_data[i], max); + min = std::min(A_data[i], min); + } + + // Adjust the maximum and minimum to include zero + max = std::max(max, 0.0f); + min = std::min(min, 0.0f); + + float scale = static_cast(max - min) / (qmax - qmin); + 
T zeroPoint = std::round(std::clamp(qmin - min / scale, qmin, qmax)); + + A_scale.push_back(scale); + A_zero_point.push_back(zeroPoint); + + // Quantize A, then compute the reference matrix multiplication + for (uint32_t i = 0; i < num_elements; ++i) { + QuantA_data[i] = static_cast(std::round((A_data[i] / scale) + zeroPoint)); + } + if (!per_column) { + B_zero_point.resize(N, B_zero_point[0]); + B_scale.resize(N, B_scale[0]); + } + + for (int64_t m = 0; m < M; m++) { + for (int64_t n = 0; n < N; n++) { + float sum = 0.0f; + for (int64_t k = 0; k < K; k++) { + float A_dequantized = (static_cast(QuantA_data[m * K + k]) - static_cast(A_zero_point[0])) * A_scale[0]; + + float B_dequantized = has_zp ? (static_cast(B_data[k * N + n]) - static_cast(B_zero_point[n])) * B_scale[n] : B_data[k * N + n] * B_scale[n]; + + sum += A_dequantized * B_dequantized; + } + if (has_bias) { + sum += Bias[n]; + } + Y_data[m * N + n] = sum; + } + } +} + +template +void TestDynamicQuantizeMatMul(bool is_matrix_b_constant, bool per_column = false, bool has_zp = true, - bool has_bias = false) { + bool has_bias = false, + bool empty_input = false) { // create rand inputs RandomValueGenerator random{}; + int64_t M = empty_input ? 1 : 4; + int64_t N = 128; + int64_t K = 128; + std::vector A_dims{empty_input ? 0 : M, K}; + std::vector B_dims{K, N}; + std::vector Y_dims{empty_input ? 0 : M, K}; std::vector A_data = random.Uniform(A_dims, -1.0f, 1.0f); - std::vector B_data; - std::vector tmp_B_data = random.Uniform(B_dims, std::numeric_limits::min(), std::numeric_limits::max()); + std::vector tmp_B_data = random.Uniform(B_dims, + (std::is_same_v) ?
std::numeric_limits::lowest() / 2 : std::numeric_limits::lowest(), + std::numeric_limits::max() / 2); std::transform(tmp_B_data.begin(), tmp_B_data.end(), std::back_inserter(B_data), [](int32_t v) -> T { return static_cast(v); }); @@ -47,7 +112,9 @@ void TestDynamicQuantizeMatMul(const std::vector& A_dims, std::for_each(B_zero_point.begin(), B_zero_point.end(), [&random](T& zp) { - zp = static_cast(random.Uniform(std::array{1}, std::numeric_limits::min(), std::numeric_limits::max())[0]); + zp = static_cast(random.Uniform(std::array{1}, + std::numeric_limits::min(), + std::numeric_limits::max())[0]); }); std::vector Bias = random.Uniform(AsSpan({B_dims.back()}), -0.1f, 0.1f); @@ -69,77 +136,85 @@ void TestDynamicQuantizeMatMul(const std::vector& A_dims, test.AddOptionalInputEdge(); } - test.AddReferenceOutputs(reference_model); + std::vector Y_data(M * N); + CalculateDynamicQuantizeMatMul(M, N, K, A_data, B_data, B_scale, B_zero_point, Bias, Y_data, + per_column, has_zp, has_bias); + test.AddOutput("Y", Y_dims, Y_data); + test.SetOutputRelErr("Y", 0.02f); test.Run(); } -template -void RunDynamicQuantizeMatMulTest(const string& model_path) { - std::vector A_dims{4, 128}; - std::vector B_dims{128, 128}; - std::vector Y_dims{4, 128}; - - TestDynamicQuantizeMatMul(A_dims, - B_dims, - model_path, - false, /*is_matrix_b_constant*/ - false, /*per_column*/ - HasZeroPoint, /*has_zp*/ - HasBias /*has_bias*/ +template +void RunDynamicQuantizeMatMulTest() { + TestDynamicQuantizeMatMul(false, /*is_matrix_b_constant*/ + false, /*per_column*/ + HasZeroPoint, /*has_zp*/ + HasBias /*has_bias*/ ); - TestDynamicQuantizeMatMul(A_dims, - B_dims, - model_path, - true, /*is_matrix_b_constant*/ - false, /*per_column*/ - HasZeroPoint, /*has_zp*/ - HasBias /*has_bias*/ + TestDynamicQuantizeMatMul(true, /*is_matrix_b_constant*/ + false, /*per_column*/ + HasZeroPoint, /*has_zp*/ + HasBias /*has_bias*/ ); - TestDynamicQuantizeMatMul(A_dims, - B_dims, - model_path, - false, 
/*is_matrix_b_constant*/ - true, /*per_column*/ - HasZeroPoint, /*has_zp*/ - HasBias /*has_bias*/ + TestDynamicQuantizeMatMul(false, /*is_matrix_b_constant*/ + true, /*per_column*/ + HasZeroPoint, /*has_zp*/ + HasBias /*has_bias*/ ); - TestDynamicQuantizeMatMul(A_dims, - B_dims, - model_path, - true, /*is_matrix_b_constant*/ - true, /*per_column*/ - HasZeroPoint, /*has_zp*/ - HasBias /*has_bias*/ + TestDynamicQuantizeMatMul(true, /*is_matrix_b_constant*/ + true, /*per_column*/ + HasZeroPoint, /*has_zp*/ + HasBias /*has_bias*/ ); } -TEST(DynamicQuantizeMatMul, HasZeroPoint_NoBias_test) { - RunDynamicQuantizeMatMulTest("testdata/dynamic_quantize_matmul_int8.onnx"); - RunDynamicQuantizeMatMulTest("testdata/dynamic_quantize_matmul_uint8.onnx"); +TEST(DynamicQuantizeMatMul, HasZeroPoint_NoBias_test_S8) { + RunDynamicQuantizeMatMulTest(); } -TEST(DynamicQuantizeMatMul, NoZeroPoint_HasBias_test) { - RunDynamicQuantizeMatMulTest("testdata/dynamic_quantize_matmul_int8_bias.onnx"); - RunDynamicQuantizeMatMulTest("testdata/dynamic_quantize_matmul_uint8_bias.onnx"); +TEST(DynamicQuantizeMatMul, HasZeroPoint_NoBias_test_U8) { + RunDynamicQuantizeMatMulTest(); +} + +TEST(DynamicQuantizeMatMul, NoZeroPoint_HasBias_test_S8) { + RunDynamicQuantizeMatMulTest(); +} + +TEST(DynamicQuantizeMatMul, NoZeroPoint_HasBias_test_U8) { + RunDynamicQuantizeMatMulTest(); +} + +TEST(DynamicQuantizeMatMul, NoZeroPoint_NoBias_test_S8) { + RunDynamicQuantizeMatMulTest(); +} + +TEST(DynamicQuantizeMatMul, NoZeroPoint_NoBias_test_U8) { + RunDynamicQuantizeMatMulTest(); +} + +TEST(DynamicQuantizeMatMul, HasZeroPoint_HasBias_test_S8) { + RunDynamicQuantizeMatMulTest(); +} + +TEST(DynamicQuantizeMatMul, HasZeroPoint_HasBias_test_U8) { + RunDynamicQuantizeMatMulTest(); } TEST(DynamicQuantizeMatMul, UInt8_test_with_empty_input) { - std::vector A_dims{0, 128}; - std::vector B_dims{128, 128}; - std::vector Y_dims{0, 128}; - - TestDynamicQuantizeMatMul(A_dims, - B_dims, - 
"testdata/dynamic_quantize_matmul_uint8.onnx", - false /*is_matrix_b_constant*/); - - TestDynamicQuantizeMatMul(A_dims, - B_dims, - "testdata/dynamic_quantize_matmul_uint8.onnx", - true /*is_matrix_b_constant*/); + std::vector A_dims{0, 2}; + std::vector B_dims{2, 2}; + std::vector Y_dims{0, 2}; + OpTester test("DynamicQuantizeMatMul", 1, onnxruntime::kMSDomain); + test.AddInput("T1", A_dims, {}); + test.AddInput("T2", B_dims, {1, 6, 0, 8}); + test.AddInput("b_scale", {1}, {1.0f}); + test.AddInput("b_zero_point", {1}, {0}); + test.AddOptionalInputEdge(); + test.AddOutput("Y", {0, 2}, {}); + test.Run(); } TEST(DynamicQuantizeMatMul, B_PerColumn_ND) {