From f4c9bf626663571b0155fffa2cbbe0e2e4c43a06 Mon Sep 17 00:00:00 2001 From: raoanag <127366241+raoanag@users.noreply.github.com> Date: Tue, 7 Nov 2023 00:10:36 -0800 Subject: [PATCH 01/10] Enable MatrixMultiplyIntegerToFloat on DML (#18275) [Cherry Pick Reviewed] Commit all MatrixMultiplyIntegerToFloat PRs [MatrixMultiplyIntegerToFloat (](https://github.com/microsoft/onnxruntime/pull/18275/commits/bf642a4d35691a13ff0ecef11cb8a9571c5a5610)https://github.com/microsoft/onnxruntime/pull/16804[)] [MatMulIntToFloat Enable FP16 and update tensor ORT-DML indexing (](https://github.com/microsoft/onnxruntime/pull/18275/commits/8237548d14f11a165a9b82bf181f8762e65f6142)https://github.com/microsoft/onnxruntime/pull/16871[)] [Disable MatMulIntegerToFloat transformation for FP16 on CPU EP (](https://github.com/microsoft/onnxruntime/pull/18275/commits/b16bf809dea31872ccb664f2622711966078e3f5)https://github.com/microsoft/onnxruntime/pull/18239[)] --- .../graph/contrib_ops/quantization_defs.cc | 2 +- .../core/optimizer/graph_transformer_utils.cc | 17 +-- .../core/optimizer/matmul_integer_to_float.cc | 23 +++- .../src/External/DirectMLHelpers/ApiTraits.h | 6 + .../External/DirectMLHelpers/DirectMLSchema.h | 19 +++ .../DirectMLHelpers/GeneratedSchemaHelpers.h | 18 +++ .../DmlOperatorMatMulIntegerToFloat.cpp | 113 ++++++++++++++++++ .../src/Operators/OperatorRegistration.cpp | 9 ++ .../dml/OperatorAuthorHelper/OperatorHelper.h | 8 ++ .../OperatorAuthorHelper/OperatorVersions.h | 1 + .../matmul_integer_to_float_test.cc | 78 ++++++++---- .../test/optimizer/graph_transform_test.cc | 18 +++ .../test/testdata/matmul_integer_to_float.py | 32 +++-- .../matmul_integer_to_float_int8.onnx | 4 +- .../matmul_integer_to_float_int8_bias.onnx | 4 +- .../matmul_integer_to_float_int8_int8.onnx | 4 +- ...atmul_integer_to_float_int8_int8_bias.onnx | 4 +- .../matmul_integer_to_float_uint8.onnx | 4 +- .../matmul_integer_to_float_uint8_bias.onnx | 4 +- .../fusion/matmul_integer_to_float.onnx | Bin 1520 -> 1520 bytes .../fusion/matmul_integer_to_float.py | 2 +- .../matmul_integer_to_float16_int8.onnx | 51 ++++++++ 22 files changed, 365 insertions(+), 56 deletions(-) create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorMatMulIntegerToFloat.cpp create mode 100644 onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx diff --git a/onnxruntime/core/graph/contrib_ops/quantization_defs.cc b/onnxruntime/core/graph/contrib_ops/quantization_defs.cc index 4313fae767fe5..22a79ef652515 100644 --- a/onnxruntime/core/graph/contrib_ops/quantization_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/quantization_defs.cc @@ -434,7 +434,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA( .Output(0, "Y", "Matrix multiply results from A * B", "T3") .TypeConstraint("T1", {"tensor(int8)", "tensor(uint8)"}, "Constrain input A data type to 8-bit integer tensor.") .TypeConstraint("T2", {"tensor(int8)", "tensor(uint8)"}, "Constrain input B data type to 8-bit integer tensor.") - .TypeConstraint("T3", {"tensor(float)"}, + .TypeConstraint("T3", {"tensor(float)", "tensor(float16)"}, "Constrain input a_scale, b_scale and output Y data type as float tensor.") .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { propagateElemTypeFromInputToOutput(ctx, 2, 0); diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc index 4e939fe3c7b6b..bcaf61e3cef90 100644 --- a/onnxruntime/core/optimizer/graph_transformer_utils.cc +++ 
b/onnxruntime/core/optimizer/graph_transformer_utils.cc @@ -273,13 +273,14 @@ InlinedVector> GenerateTransformers( onnxruntime::kCudaExecutionProvider, onnxruntime::kRocmExecutionProvider, onnxruntime::kDmlExecutionProvider}; - const InlinedHashSet cpu_cuda_rocm_acl_armnn_js_eps = {onnxruntime::kCpuExecutionProvider, - onnxruntime::kCudaExecutionProvider, - onnxruntime::kRocmExecutionProvider, - onnxruntime::kAclExecutionProvider, - onnxruntime::kArmNNExecutionProvider, - onnxruntime::kJsExecutionProvider}; - + const InlinedHashSet cpu_cuda_rocm_acl_armnn_eps = {onnxruntime::kCpuExecutionProvider, + onnxruntime::kCudaExecutionProvider, + onnxruntime::kRocmExecutionProvider, + onnxruntime::kAclExecutionProvider, + onnxruntime::kArmNNExecutionProvider, + onnxruntime::kJsExecutionProvider }; + const InlinedHashSet cpu_dml_eps = {onnxruntime::kCpuExecutionProvider, + onnxruntime::kDmlExecutionProvider}; #ifdef MLAS_TARGET_AMD64_IX86 const bool avx2_precision_mode = session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsAvx2PrecisionMode, "0") == "1" && MlasPlatformU8S8Overflow(); @@ -297,7 +298,7 @@ InlinedVector> GenerateTransformers( } transformers.emplace_back(std::make_unique(cpu_ep)); - transformers.emplace_back(std::make_unique(cpu_ep)); + transformers.emplace_back(std::make_unique(cpu_dml_eps)); transformers.emplace_back(std::make_unique(cpu_ep)); transformers.emplace_back(std::make_unique(cpu_cuda_rocm_acl_armnn_js_eps)); diff --git a/onnxruntime/core/optimizer/matmul_integer_to_float.cc b/onnxruntime/core/optimizer/matmul_integer_to_float.cc index 56e51cb787931..4fee1a6ce224e 100644 --- a/onnxruntime/core/optimizer/matmul_integer_to_float.cc +++ b/onnxruntime/core/optimizer/matmul_integer_to_float.cc @@ -31,6 +31,24 @@ static bool CheckBiasShape(const TensorShapeProto* bias_shape) { return bias_last_dim > 1; } +bool HasElementDataType(const NodeArg& node_arg, int32_t data_type) { + if (!node_arg.Exists()) { + return false; + } + + const auto* type_proto = node_arg.TypeAsProto(); + if (!type_proto) { + return false; + } + + int32_t actual_data_type; + if (!utils::TryGetElementDataType(*type_proto, actual_data_type)) { + return false; + } + + return data_type == actual_data_type; +} + /** MatMulIntegerToFloatFusion will fuse subgraph like below into MatMulIntegerToFloat: @@ -63,9 +81,10 @@ Status MatMulIntegerToFloatFusion::ApplyImpl(Graph& graph, bool& modified, int g auto& mul_node = *node_ptr; ORT_RETURN_IF_ERROR(Recurse(mul_node, modified, graph_level, logger)); - + const bool is_dml_ep = node_ptr->GetExecutionProviderType() == kDmlExecutionProvider; if (!graph_utils::IsSupportedOptypeVersionAndDomain(mul_node, "Mul", {7, 13, 14}) || - !graph_utils::IsSupportedProvider(mul_node, GetCompatibleExecutionProviders())) { + !graph_utils::IsSupportedProvider(mul_node, GetCompatibleExecutionProviders()) || + (!is_dml_ep && HasElementDataType(*mul_node.InputDefs()[0], ONNX_NAMESPACE::TensorProto_DataType_FLOAT16))) { continue; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h index e1e7eacfbd85d..7aad587304bb6 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h @@ -879,6 +879,12 @@ struct OperatorDescTraits static constexpr DML_OPERATOR_TYPE Type = 
DML_OPERATOR_QUANTIZED_LINEAR_MATRIX_MULTIPLY; }; +template <> +struct OperatorDescTraits +{ + static constexpr DML_OPERATOR_TYPE Type = (DML_OPERATOR_TYPE) DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT; +}; + template <> struct OperatorDescTraits { diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h index 5fe6603c2a0bf..ae4a02469e68e 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h @@ -1885,6 +1885,25 @@ constexpr DML_OPERATOR_SCHEMA DML_QUANTIZED_LINEAR_MATRIX_MULTIPLY_OPERATOR_SCHE DML_QUANTIZED_LINEAR_MATRIX_MULTIPLY_OPERATOR_SCHEMA_FIELDS, }; +constexpr DML_SCHEMA_FIELD DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS[8] { + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ATensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "AScaleTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "AZeroPointTensor", true }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BScaleTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BZeroPointTensor", true }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BiasTensor", true }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, +}; + +constexpr DML_OPERATOR_SCHEMA DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA { + "DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT", + static_cast(DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT), + DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, + 8, + DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS, +}; + constexpr DML_SCHEMA_FIELD DML_CONVOLUTION_INTEGER_OPERATOR_SCHEMA_FIELDS[11] { DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputZeroPointTensor", true }, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h index 4be41ad3924a2..3dee8fe5649ea 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h @@ -1139,6 +1139,19 @@ inline std::vector GetFields(const DML_QUANTIZED_LINEAR_MATRIX_MU OperatorField(&DML_QUANTIZED_LINEAR_MATRIX_MULTIPLY_OPERATOR_SCHEMA.Fields[8], ToOperatorFieldType(static_cast(desc.OutputTensor))), }; } +inline std::vector GetFields(const DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC& desc) +{ + return { + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[0], ToOperatorFieldType(static_cast(desc.ATensor))), + 
OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[1], ToOperatorFieldType(static_cast(desc.AScaleTensor))), + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[2], ToOperatorFieldType(static_cast(desc.AZeroPointTensor))), + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[3], ToOperatorFieldType(static_cast(desc.BTensor))), + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[4], ToOperatorFieldType(static_cast(desc.BScaleTensor))), + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[5], ToOperatorFieldType(static_cast(desc.BZeroPointTensor))), + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[6], ToOperatorFieldType(static_cast(desc.BiasTensor))), + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[7], ToOperatorFieldType(static_cast(desc.OutputTensor))), + }; +} inline std::vector GetFields(const DML_CONVOLUTION_INTEGER_OPERATOR_DESC& desc) { return { @@ -1829,6 +1842,7 @@ inline const DML_OPERATOR_SCHEMA& GetSchema(DML_OPERATOR_TYPE operatorType) case DML_OPERATOR_RESAMPLE1: return DML_RESAMPLE1_OPERATOR_SCHEMA; case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER: return DML_MATRIX_MULTIPLY_INTEGER_OPERATOR_SCHEMA; case DML_OPERATOR_QUANTIZED_LINEAR_MATRIX_MULTIPLY: return DML_QUANTIZED_LINEAR_MATRIX_MULTIPLY_OPERATOR_SCHEMA; + case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT: return DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA; case DML_OPERATOR_CONVOLUTION_INTEGER: return DML_CONVOLUTION_INTEGER_OPERATOR_SCHEMA; case DML_OPERATOR_QUANTIZED_LINEAR_CONVOLUTION: return DML_QUANTIZED_LINEAR_CONVOLUTION_OPERATOR_SCHEMA; case DML_OPERATOR_ELEMENT_WISE_BIT_AND: return DML_ELEMENT_WISE_BIT_AND_OPERATOR_SCHEMA; @@ -2360,6 +2374,10 @@ inline AbstractOperatorDesc ConvertOperatorDesc(const DML_OPERATOR_DESC& opDesc) return AbstractOperatorDesc( &DML_QUANTIZED_LINEAR_MATRIX_MULTIPLY_OPERATOR_SCHEMA, GetFields(*static_cast(opDesc.Desc))); + case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT: + return AbstractOperatorDesc( + &DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA, + GetFields(*static_cast(opDesc.Desc))); case DML_OPERATOR_CONVOLUTION_INTEGER: return AbstractOperatorDesc( &DML_CONVOLUTION_INTEGER_OPERATOR_SCHEMA, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorMatMulIntegerToFloat.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorMatMulIntegerToFloat.cpp new file mode 100644 index 0000000000000..ba0ecb9d7af69 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorMatMulIntegerToFloat.cpp @@ -0,0 +1,113 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
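+//
+// DmlOperatorMatMulIntegerToFloat maps the contrib op's ONNX input order
+// (A, B, a_scale, b_scale, a_zero_point, b_zero_point, bias) onto the operand order expected by
+// DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC; the OrtInputTensors and DmlInputIndex
+// enums below capture the two orderings.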
+ +#include "precomp.h" + +namespace Dml +{ + +class DmlOperatorMatMulIntegerToFloat : public DmlOperator +{ + enum OrtInputTensors : uint32_t + { + ortA, + ortB, + ortAScale, + ortBScale, + ortAZeroPoint, + ortBZeroPoint, + ortBias, + ortInputCount + }; + + enum DmlInputIndex : uint32_t + { + dmlA, + dmlAScale, + dmlAZeroPoint, + dmlB, + dmlBScale, + dmlBZeroPoint, + dmlBias, + dmlInputCount, + }; + +public: + DmlOperatorMatMulIntegerToFloat(const MLOperatorKernelCreationContext& kernelInfo) + : DmlOperator(kernelInfo) + { + std::vector> inputIndices = { OrtInputTensors::ortA, OrtInputTensors::ortAScale, OrtInputTensors::ortAZeroPoint, OrtInputTensors::ortB, OrtInputTensors::ortBScale, OrtInputTensors::ortBZeroPoint, OrtInputTensors::ortBias }; + DmlOperator::Initialize(kernelInfo, inputIndices); + + std::vector inputShape0 = kernelInfo.GetTensorShapeDescription().GetInputTensorShape(OrtInputTensors::ortA); + std::vector inputShape1 = kernelInfo.GetTensorShapeDescription().GetInputTensorShape(OrtInputTensors::ortB); + std::vector outputShape = kernelInfo.GetTensorShapeDescription().GetOutputTensorShape(0); + + OperatorHelper::MatMulShapeMapping(inputShape0, inputShape1, outputShape); + + // Initialize the input descriptions with broadcasting + m_inputTensorDescs[DmlInputIndex::dmlA] = CreateTensorDescFromInput(kernelInfo, OrtInputTensors::ortA, TensorAxis::DoNotCoerce, TensorAxis::W, TensorAxis::RightAligned, inputShape0); + m_inputTensorDescs[DmlInputIndex::dmlB] = CreateTensorDescFromInput(kernelInfo, OrtInputTensors::ortB, TensorAxis::DoNotCoerce, TensorAxis::W, TensorAxis::RightAligned, inputShape1); + + // Broadcast Bias tensor to the shape of the output tensor. + if(kernelInfo.IsInputValid(OrtInputTensors::ortBias)) { + + m_inputTensorDescs[DmlInputIndex::dmlBias] = CreateTensorDescFromInput(kernelInfo, OrtInputTensors::ortBias, TensorAxis::DoNotCoerce, + TensorAxis::W, TensorAxis::RightAligned, outputShape); + } + + uint32_t dmlDimSize = m_inputTensorDescs[DmlInputIndex::dmlA].GetDimensionCount(); + // Resize the A Scale to be the same dimension as the input tensor. + // The 1D tensor needs to be moved to the H channel. + m_inputTensorDescs[DmlInputIndex::dmlAScale] = CreateTensorDescFromInput( + kernelInfo, + OrtInputTensors::ortAScale, + TensorAxis::DoNotCoerce, + TensorAxis::H, + TensorAxis::LeftAligned, + std::nullopt, + dmlDimSize + ); + + // Resize the A ZeroPoint to be the same dimension as the input tensor. + // The 1D tensor needs to be moved to the H channel. + if (kernelInfo.IsInputValid(OrtInputTensors::ortAZeroPoint)) + { + + m_inputTensorDescs[DmlInputIndex::dmlAZeroPoint] = CreateTensorDescFromInput( + kernelInfo, + OrtInputTensors::ortAZeroPoint, + TensorAxis::DoNotCoerce, + TensorAxis::H, + TensorAxis::LeftAligned, + std::nullopt, + dmlDimSize + ); + } + + // B Zeropoint and BScale are already aligned in the W dimension so no need to align them + + // Initialize the output description while overriding the shape + m_outputTensorDescs[0] = CreateTensorDescFromOutput(kernelInfo, 0, TensorAxis::DoNotCoerce, TensorAxis::W, TensorAxis::RightAligned, outputShape); + + std::vector inputDescs = GetDmlInputDescs(); + std::vector outputDescs = GetDmlOutputDescs(); + + DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC matMulDesc = {}; + matMulDesc.ATensor = &inputDescs[DmlInputIndex::dmlA]; + matMulDesc.AScaleTensor = &inputDescs[DmlInputIndex::dmlAScale]; + matMulDesc.AZeroPointTensor = inputDescs[DmlInputIndex::dmlAZeroPoint].Desc != nullptr ? 
&inputDescs[DmlInputIndex::dmlAZeroPoint] : nullptr; + matMulDesc.BTensor = &inputDescs[DmlInputIndex::dmlB]; + matMulDesc.BScaleTensor = &inputDescs[DmlInputIndex::dmlBScale]; + matMulDesc.BZeroPointTensor = inputDescs[DmlInputIndex::dmlBZeroPoint].Desc != nullptr ? &inputDescs[DmlInputIndex::dmlBZeroPoint] : nullptr; + matMulDesc.BiasTensor = inputDescs[DmlInputIndex::dmlBias].Desc != nullptr ? &inputDescs[DmlInputIndex::dmlBias] : nullptr; + matMulDesc.OutputTensor = &outputDescs[0]; + + DML_OPERATOR_DESC opDesc = { (DML_OPERATOR_TYPE) DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT, &matMulDesc }; + SetDmlOperatorDesc(opDesc, kernelInfo); + } +}; + +DML_OP_DEFINE_CREATION_FUNCTION(MatMulIntegerToFloat, DmlOperatorMatMulIntegerToFloat); + +} // namespace Dml \ No newline at end of file diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp index 9c136ed8c9484..f08151b61197a 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp @@ -503,6 +503,7 @@ DML_OP_EXTERN_CREATION_FUNCTION(QLinearMatMul); DML_OP_EXTERN_CREATION_FUNCTION(QLinearConcat); DML_OP_EXTERN_CREATION_FUNCTION(DynamicQuantizeLinear); DML_OP_EXTERN_CREATION_FUNCTION(MatMulInteger); +DML_OP_EXTERN_CREATION_FUNCTION(MatMulIntegerToFloat); DML_OP_EXTERN_CREATION_FUNCTION(ConvInteger); DML_OP_EXTERN_CREATION_FUNCTION(Trilu); @@ -622,6 +623,13 @@ constexpr static std::array supportedTypeListQLinea SupportedTensorDataTypes::Int8|SupportedTensorDataTypes::UInt8, SupportedTensorDataTypes::Int8|SupportedTensorDataTypes::UInt8 }; + +constexpr static std::array supportedTypeListMatMulIntegerToFloat = { + SupportedTensorDataTypes::Ints8Bit, + SupportedTensorDataTypes::Ints8Bit, + SupportedTensorDataTypes::Float16to32 +}; + constexpr static std::array supportedTypeListQLinearConv = { SupportedTensorDataTypes::Int8|SupportedTensorDataTypes::UInt8, SupportedTensorDataTypes::Int8|SupportedTensorDataTypes::UInt8, @@ -1083,6 +1091,7 @@ constexpr static OperatorRegistrationInformation operatorRegistrationInformation {REG_INFO( 10, QLinearConv, typeNameListFour, supportedTypeListQLinearConv, DmlGraphSupport::Supported)}, {REG_INFO( 10, QLinearMatMul, typeNameListThree, supportedTypeListQLinearMatMul, DmlGraphSupport::Supported)}, {REG_INFO( 10, MatMulInteger, typeNameListThree, supportedTypeListInteger, DmlGraphSupport::Supported)}, + {REG_INFO_MS( 1, MatMulIntegerToFloat, typeNameListThree, supportedTypeListMatMulIntegerToFloat, DmlGraphSupport::Supported)}, {REG_INFO( 10, ConvInteger, typeNameListThree, supportedTypeListInteger, DmlGraphSupport::Supported)}, {REG_INFO( 11, DynamicQuantizeLinear, typeNameListTwo, supportedTypeListDynamicQuantizeLinear, DmlGraphSupport::Supported)}, {REG_INFO( 7, LayerNormalization, typeNameListLayerNormContrib, supportedTypeListLayerNormalizationContrib, DmlGraphSupport::Supported, requiredConstantCpuInputs(), std::nullopt, QueryLayerNormalization)}, diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h index 1b2521a86613f..1ba528d0b2da0 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h @@ -870,6 +870,13 @@ class 
QLinearMatMulHelper : public MatMulHelperBase QLinearMatMulHelper(const Info_t& info, const Shape_t& shape) : MatMulHelperBase(info, shape, 0, 3) {} }; +class MatMulIntegerToFloatHelper : public MatMulHelperBase +{ +public: + template + MatMulIntegerToFloatHelper(const Info_t& info, const Shape_t& shape) : MatMulHelperBase(info, shape, 0, 1) {} +}; + class TopKHelper { @@ -1776,6 +1783,7 @@ using ShapeInferenceHelper_Identity16 = GetOutputShapeAsInputShapeHelper; using ShapeInferenceHelper_Identity19 = GetOutputShapeAsInputShapeHelper; using ShapeInferenceHelper_MatMul = MatMulHelper; using ShapeInferenceHelper_MatMulInteger = MatMulHelper; +using ShapeInferenceHelper_MatMulIntegerToFloat = MatMulIntegerToFloatHelper; using ShapeInferenceHelper_QLinearMatMul = QLinearMatMulHelper; using ShapeInferenceHelper_QLinearAdd = GetBroadcastedOutputShapeHelper; using ShapeInferenceHelper_DynamicQuantizeLinear = GetOutputShapeAsInputShapeHelper; diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h index e725ba085113d..d081aa2e29148 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h @@ -449,6 +449,7 @@ namespace OperatorHelper static const int sc_sinceVer_FusedMatMulActivation = 1; static const int sc_sinceVer_QLinearSigmoid = 1; static const int sc_sinceVer_Attention = 1; + static const int sc_sinceVer_MatMulIntegerToFloat = 1; static const int sc_sinceVer_MultiHeadAttention = 1; static const int sc_sinceVer_SkipLayerNormalization = 1; static const int sc_sinceVer_EmbedLayerNormalization = 1; diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc index 26ce5272d25ee..51d9a57b5e447 100644 --- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc @@ -23,7 +23,7 @@ using namespace std; namespace onnxruntime { namespace test { -template +template void TestMatMulIntegerToFloat(const std::vector& A_dims, std::vector B_dims, const std::string& reference_model, @@ -50,11 +50,11 @@ void TestMatMulIntegerToFloat(const std::vector& A_dims, return static_cast(v); }); - std::vector A_scale = random.Uniform(AsSpan({1}), -0.1f, 0.1f); + std::vector A_scale = random.Uniform(AsSpan({1}), -0.1f, 0.1f); std::vector A_zero_point{(std::numeric_limits::lowest() + std::numeric_limits::max() + IType(2)) / 2}; int64_t b_scale_zp_size = per_column ? 
B_dims.back() : 1; - std::vector B_scale = random.Uniform(AsSpan({b_scale_zp_size}), -0.1f, 0.1f); + std::vector B_scale = random.Uniform(AsSpan({b_scale_zp_size}), -0.1f, 0.1f); std::vector B_zero_point(b_scale_zp_size); std::for_each(B_zero_point.begin(), @@ -65,13 +65,13 @@ void TestMatMulIntegerToFloat(const std::vector& A_dims, std::numeric_limits::max())[0]); }); - std::vector Bias = random.Uniform(AsSpan({B_dims.back()}), -0.1f, 0.1f); + std::vector Bias = random.Uniform(AsSpan({B_dims.back()}), -0.1f, 0.1f); OpTester test("MatMulIntegerToFloat", 1, onnxruntime::kMSDomain); test.AddInput("A", A_dims, A_data); test.AddInput("B", B_dims, B_data, is_matrix_b_constant); - test.AddInput("a_scale", {1}, A_scale); - test.AddInput("b_scale", {b_scale_zp_size}, B_scale); + test.AddInput("a_scale", {1}, A_scale); + test.AddInput("b_scale", {b_scale_zp_size}, B_scale); if (has_zp) { test.AddInput("a_zero_point", {1}, A_zero_point); @@ -82,23 +82,38 @@ void TestMatMulIntegerToFloat(const std::vector& A_dims, } if (has_bias) { - test.AddInput("bias", {B_dims.back()}, Bias); + test.AddInput("bias", {B_dims.back()}, Bias); } else { - test.AddOptionalInputEdge(); + test.AddOptionalInputEdge(); } test.AddReferenceOutputs(reference_model); +#if defined(USE_DML) + if constexpr (std::is_same_v) { + test.SetOutputRelErr("Y", 2e-2f); + } else { + test.SetOutputRelErr("Y", 2.0f); + } +#else test.SetOutputRelErr("Y", 1e-4f); - test.Run(); +#endif + + if constexpr (std::is_same_v) { + test.Run(); + } else { + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCpuExecutionProvider}); + } + } -template +template void RunMatMulIntegerToFloatTest(const string& model_path) { std::vector A_dims{4, 128}; std::vector B_dims{128, 128}; std::vector Y_dims{4, 128}; - TestMatMulIntegerToFloat(A_dims, + TestMatMulIntegerToFloat( + A_dims, B_dims, model_path, false, /*is_matrix_b_constant*/ @@ -107,7 +122,8 @@ void RunMatMulIntegerToFloatTest(const string& model_path) { HasBias /*has_bias*/ ); - TestMatMulIntegerToFloat(A_dims, + TestMatMulIntegerToFloat( + A_dims, B_dims, model_path, true, /*is_matrix_b_constant*/ @@ -116,7 +132,8 @@ void RunMatMulIntegerToFloatTest(const string& model_path) { HasBias /*has_bias*/ ); - TestMatMulIntegerToFloat(A_dims, + TestMatMulIntegerToFloat( + A_dims, B_dims, model_path, false, /*is_matrix_b_constant*/ @@ -125,7 +142,8 @@ void RunMatMulIntegerToFloatTest(const string& model_path) { HasBias /*has_bias*/ ); - TestMatMulIntegerToFloat(A_dims, + TestMatMulIntegerToFloat( + A_dims, B_dims, model_path, true, /*is_matrix_b_constant*/ @@ -135,22 +153,42 @@ void RunMatMulIntegerToFloatTest(const string& model_path) { ); } +#if USE_DML +TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8X8_FP16) { + RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float16_int8.onnx"); + RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float16_uint8.onnx"); +} + +TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8X8_FP16) { + RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float16_int8_bias.onnx"); + RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float16_uint8_bias.onnx"); +} + +TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8S8_FP16) { + RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float16_int8_int8.onnx"); +} + +TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8S8_FP16) { + RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float16_int8_int8_bias.onnx"); +} +#endif // USE_DML + TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8X8) { - 
RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_int8.onnx"); - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_uint8.onnx"); + RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_int8.onnx"); + RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_uint8.onnx"); } TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8X8) { - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_int8_bias.onnx"); - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_uint8_bias.onnx"); + RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_int8_bias.onnx"); + RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_uint8_bias.onnx"); } TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8S8) { - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_int8_int8.onnx"); + RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_int8_int8.onnx"); } TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8S8) { - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_int8_int8_bias.onnx"); + RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_int8_int8_bias.onnx"); } TEST(MatMulIntegerToFloat, MatMulInteger_With_ZeroPoint) { diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index e1fcf835c6043..5c885cf31fe31 100755 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -5680,6 +5680,24 @@ TEST_F(GraphTransformationTests, MatMulIntegerToFloatTest) { EXPECT_EQ(op_to_count["Add"], 1); } +#ifdef USE_DML + TEST_F(GraphTransformationTests, MatMulIntegerToFloat16Test) { + constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/matmul_integer_to_float16_int8.onnx"; + std::shared_ptr p_model; + ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); + Graph& graph = p_model->MainGraph(); + + for (auto& node : graph.Nodes()) { + node.SetExecutionProviderType(kDmlExecutionProvider); + } + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::make_unique(), TransformerLevel::Level2)); + ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, *logger_)); + std::map op_to_count = CountOpsInGraph(graph); + EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 1); +} +#endif // USE_DML + #endif #ifndef DISABLE_CONTRIB_OPS diff --git a/onnxruntime/test/testdata/matmul_integer_to_float.py b/onnxruntime/test/testdata/matmul_integer_to_float.py index b898390044cf4..206a8514253c5 100644 --- a/onnxruntime/test/testdata/matmul_integer_to_float.py +++ b/onnxruntime/test/testdata/matmul_integer_to_float.py @@ -4,7 +4,7 @@ from onnx import TensorProto, helper -def GenerateModel(model_name, sign_i, sign_w, has_zp=True, bias=False): # noqa: N802 +def GenerateModel(model_name, sign_i, sign_w, output_type_fp16, has_zp=True, bias=False): # noqa: N802 nodes = [ # subgraph helper.make_node( "MatMulInteger", @@ -13,7 +13,7 @@ def GenerateModel(model_name, sign_i, sign_w, has_zp=True, bias=False): # noqa: "MatMulInteger", ), helper.make_node("Mul", ["a_scale", "b_scale"], ["multiplier"], "mul_right"), - helper.make_node("Cast", ["matmul_output_int32"], ["matmul_output_float"], "cast", to=1), + helper.make_node("Cast", ["matmul_output_int32"], ["matmul_output_float"], "cast", to=TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT), helper.make_node( "Mul", ["matmul_output_float", 
"multiplier"], @@ -25,8 +25,8 @@ def GenerateModel(model_name, sign_i, sign_w, has_zp=True, bias=False): # noqa: inputs = [ # inputs helper.make_tensor_value_info("A", TensorProto.INT8 if sign_i else TensorProto.UINT8, ["M", "K"]), helper.make_tensor_value_info("B", TensorProto.INT8 if sign_w else TensorProto.UINT8, ["K", "N"]), - helper.make_tensor_value_info("a_scale", TensorProto.FLOAT, [1]), - helper.make_tensor_value_info("b_scale", TensorProto.FLOAT, ["C"]), + helper.make_tensor_value_info("a_scale", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [1]), + helper.make_tensor_value_info("b_scale", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, ["C"]), ] if has_zp: @@ -48,14 +48,14 @@ def GenerateModel(model_name, sign_i, sign_w, has_zp=True, bias=False): # noqa: if bias: nodes.extend([helper.make_node("Add", ["mul_bottom_output", "bias"], ["Y"], "add")]) - inputs.extend([helper.make_tensor_value_info("bias", TensorProto.FLOAT, ["N"])]) + inputs.extend([helper.make_tensor_value_info("bias", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, ["N"])]) graph = helper.make_graph( nodes, "DynamicQuantizeMatMul_fusion", # name inputs, [ # outputs - helper.make_tensor_value_info("Y", TensorProto.FLOAT, ["M", "N"]), + helper.make_tensor_value_info("Y", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, ["M", "N"]), ], ) @@ -64,10 +64,18 @@ def GenerateModel(model_name, sign_i, sign_w, has_zp=True, bias=False): # noqa: if __name__ == "__main__": - GenerateModel("matmul_integer_to_float_int8.onnx", False, True) - GenerateModel("matmul_integer_to_float_uint8.onnx", False, False) - GenerateModel("matmul_integer_to_float_int8_bias.onnx", False, True, False, True) - GenerateModel("matmul_integer_to_float_uint8_bias.onnx", False, False, False, True) + GenerateModel("matmul_integer_to_float16_int8.onnx", sign_i=False, sign_w=True, output_type_fp16=True) + GenerateModel("matmul_integer_to_float16_uint8.onnx", sign_i=False, sign_w=False, output_type_fp16=True) + GenerateModel("matmul_integer_to_float16_int8_bias.onnx", sign_i=False, sign_w=True, output_type_fp16=True, has_zp=False, bias=True) + GenerateModel("matmul_integer_to_float16_uint8_bias.onnx", sign_i=False, sign_w=False, output_type_fp16=True, has_zp=False, bias=True) - GenerateModel("matmul_integer_to_float_int8_int8.onnx", True, True) - GenerateModel("matmul_integer_to_float_int8_int8_bias.onnx", True, True, False, True) + GenerateModel("matmul_integer_to_float16_int8_int8.onnx", sign_i=True, sign_w=True, output_type_fp16=True) + GenerateModel("matmul_integer_to_float16_int8_int8_bias.onnx", sign_i=True, sign_w=True, output_type_fp16=True, has_zp=False, bias=True) + + GenerateModel("matmul_integer_to_float_int8.onnx", sign_i=False, sign_w=True, output_type_fp16=False) + GenerateModel("matmul_integer_to_float_uint8.onnx", sign_i=False, sign_w=False, output_type_fp16=False) + GenerateModel("matmul_integer_to_float_int8_bias.onnx", sign_i=False, sign_w=True, output_type_fp16=False, has_zp=False, bias=True) + GenerateModel("matmul_integer_to_float_uint8_bias.onnx", sign_i=False, sign_w=False, output_type_fp16=False, has_zp=False, bias=True) + + GenerateModel("matmul_integer_to_float_int8_int8.onnx", sign_i=True, sign_w=True, output_type_fp16=False) + GenerateModel("matmul_integer_to_float_int8_int8_bias.onnx", sign_i=True, sign_w=True, output_type_fp16=False, has_zp=False, bias=True) diff --git a/onnxruntime/test/testdata/matmul_integer_to_float_int8.onnx 
b/onnxruntime/test/testdata/matmul_integer_to_float_int8.onnx index 9f4465a914963..906dec542a4fa 100644 --- a/onnxruntime/test/testdata/matmul_integer_to_float_int8.onnx +++ b/onnxruntime/test/testdata/matmul_integer_to_float_int8.onnx @@ -1,4 +1,4 @@ -:Ì + :Ì U A B @@ -44,4 +44,4 @@ mul_bottom"MulDynamicQuantizeMatMul_fusionZ  M -NB \ No newline at end of file +NB \ No newline at end of file diff --git a/onnxruntime/test/testdata/matmul_integer_to_float_int8_bias.onnx b/onnxruntime/test/testdata/matmul_integer_to_float_int8_bias.onnx index 01b7e15aa4a1f..16cdf03c7ae59 100644 --- a/onnxruntime/test/testdata/matmul_integer_to_float_int8_bias.onnx +++ b/onnxruntime/test/testdata/matmul_integer_to_float_int8_bias.onnx @@ -1,4 +1,4 @@ -:Ä + :Ä 9 A Bmatmul_output_int32 MatMulInteger" MatMulInteger @@ -41,4 +41,4 @@ mul_bottom"Mul  M -NB \ No newline at end of file +NB \ No newline at end of file diff --git a/onnxruntime/test/testdata/matmul_integer_to_float_int8_int8.onnx b/onnxruntime/test/testdata/matmul_integer_to_float_int8_int8.onnx index 9d38828e25d6a..55102757a0b57 100644 --- a/onnxruntime/test/testdata/matmul_integer_to_float_int8_int8.onnx +++ b/onnxruntime/test/testdata/matmul_integer_to_float_int8_int8.onnx @@ -1,4 +1,4 @@ -:Ì + :Ì U A B @@ -44,4 +44,4 @@ mul_bottom"MulDynamicQuantizeMatMul_fusionZ  M -NB \ No newline at end of file +NB \ No newline at end of file diff --git a/onnxruntime/test/testdata/matmul_integer_to_float_int8_int8_bias.onnx b/onnxruntime/test/testdata/matmul_integer_to_float_int8_int8_bias.onnx index 4d9a55af50a87..d9d7222a1acaa 100644 --- a/onnxruntime/test/testdata/matmul_integer_to_float_int8_int8_bias.onnx +++ b/onnxruntime/test/testdata/matmul_integer_to_float_int8_int8_bias.onnx @@ -1,4 +1,4 @@ -:Ä + :Ä 9 A Bmatmul_output_int32 MatMulInteger" MatMulInteger @@ -41,4 +41,4 @@ mul_bottom"Mul  M -NB \ No newline at end of file +NB \ No newline at end of file diff --git a/onnxruntime/test/testdata/matmul_integer_to_float_uint8.onnx b/onnxruntime/test/testdata/matmul_integer_to_float_uint8.onnx index a4c6d20d59be8..5373ce145688e 100644 --- a/onnxruntime/test/testdata/matmul_integer_to_float_uint8.onnx +++ b/onnxruntime/test/testdata/matmul_integer_to_float_uint8.onnx @@ -1,4 +1,4 @@ -:Ì + :Ì U A B @@ -44,4 +44,4 @@ mul_bottom"MulDynamicQuantizeMatMul_fusionZ  M -NB \ No newline at end of file +NB \ No newline at end of file diff --git a/onnxruntime/test/testdata/matmul_integer_to_float_uint8_bias.onnx b/onnxruntime/test/testdata/matmul_integer_to_float_uint8_bias.onnx index a5be0c63f4dcb..e407414b23b24 100644 --- a/onnxruntime/test/testdata/matmul_integer_to_float_uint8_bias.onnx +++ b/onnxruntime/test/testdata/matmul_integer_to_float_uint8_bias.onnx @@ -1,4 +1,4 @@ -:Ä + :Ä 9 A Bmatmul_output_int32 MatMulInteger" MatMulInteger @@ -41,4 +41,4 @@ mul_bottom"Mul  M -NB \ No newline at end of file +NB \ No newline at end of file diff --git a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.onnx b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.onnx index 7ea69c580ee435be09f12b949f14fdb2efe3d403..aa8e67bcbc59e53d3418000c23ef35c75dfd76c6 100644 GIT binary patch delta 13 Ucmeys{ehc_gL5O(TUJJ403a9x!vFvP delta 13 Ucmeys{ehc_gMA~@TUJIM03ZVcx&QzG diff --git a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py index 018e5fb332dd0..60bdd92dc9c93 100644 --- a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py +++ 
b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py @@ -104,4 +104,4 @@ def GenerateModel(model_name): # noqa: N802 if __name__ == "__main__": - GenerateModel("matmul_integer_to_float.onnx") + GenerateModel("matmul_integer_to_float.onnx") \ No newline at end of file diff --git a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx new file mode 100644 index 0000000000000..22293b0d10756 --- /dev/null +++ b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx @@ -0,0 +1,51 @@ + :Ì +U +A +B + a_zero_point + b_zero_pointmatmul_output_int32 MatMulInteger" MatMulInteger +. +a_scale +b_scale +multiplier mul_right"Mul +A +matmul_output_int32matmul_output_floatcast"Cast* +to +  +5 +matmul_output_float + +multiplierY +mul_bottom"MulDynamicQuantizeMatMul_fusionZ +A + + +M +KZ +B + + +K +NZ +a_scale + + + +Z +b_scale +  + +CZ + a_zero_point + + +Z + b_zero_point +  +Cb +Y + + + +M +NB \ No newline at end of file From 9cceffa4c93e4a2ff5fe484763fe594d2e1e366f Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Wed, 21 Feb 2024 22:39:28 -0800 Subject: [PATCH 02/10] Doc updates --- docs/ContribOperators.md | 2 +- docs/OperatorKernels.md | 1 + .../src/External/DirectMLHelpers/ApiTraits.h | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md index f523e97293427..e295dfa203ae5 100644 --- a/docs/ContribOperators.md +++ b/docs/ContribOperators.md @@ -2795,7 +2795,7 @@ This version of the operator has been available since version 1 of the 'com.micr
Constrain input A data type to 8-bit integer tensor.
T2 : tensor(int8), tensor(uint8)
Constrain input B data type to 8-bit integer tensor.
-T3 : tensor(float)
+T3 : tensor(float), tensor(float16)
Constrain input a_scale, b_scale and output Y data type as float tensor.
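For reference, a minimal sketch (not part of this patch) of a model that exercises the relaxed `T3 = tensor(float16)` constraint documented above; the graph name, file name, and symbolic dimensions are illustrative, and the layout mirrors what the updated `matmul_integer_to_float.py` later in this series generates:

```python
# Illustrative sketch only: build a com.microsoft.MatMulIntegerToFloat model whose
# scales and output use float16 (the newly allowed T3 type). Names are hypothetical.
from onnx import TensorProto, helper
import onnx

node = helper.make_node(
    "MatMulIntegerToFloat",
    inputs=["A", "B", "a_scale", "b_scale", "a_zero_point", "b_zero_point"],
    outputs=["Y"],
    domain="com.microsoft",
)
graph = helper.make_graph(
    [node],
    "matmul_integer_to_float16_example",
    [
        helper.make_tensor_value_info("A", TensorProto.UINT8, ["M", "K"]),
        helper.make_tensor_value_info("B", TensorProto.INT8, ["K", "N"]),
        helper.make_tensor_value_info("a_scale", TensorProto.FLOAT16, [1]),
        helper.make_tensor_value_info("b_scale", TensorProto.FLOAT16, ["C"]),
        helper.make_tensor_value_info("a_zero_point", TensorProto.UINT8, [1]),
        helper.make_tensor_value_info("b_zero_point", TensorProto.INT8, ["C"]),
    ],
    [helper.make_tensor_value_info("Y", TensorProto.FLOAT16, ["M", "N"])],
)
model = helper.make_model(
    graph, opset_imports=[helper.make_opsetid("", 16), helper.make_opsetid("com.microsoft", 1)]
)
onnx.save(model, "matmul_integer_to_float16_example.onnx")
```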
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index b0ed68d595c42..11e8bcd684c25 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -1267,6 +1267,7 @@ Do not modify directly.*
 |FusedMatMulActivation|*in* A:**T**<br> *in* B:**T**<br> *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
 |Gelu|*in* X:**T**<br> *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
 |GroupNorm|*in* X:**T**<br> *in* gamma:**M**<br> *in* beta:**M**<br> *out* Y:**T**|1+|**M** = tensor(float), tensor(float16)<br> **T** = tensor(float), tensor(float16)|
+|MatMulIntegerToFloat|*in* A:**T1**<br> *in* B:**T2**<br> *in* a_scale:**T3**<br> *in* b_scale:**T3**<br> *in* a_zero_point:**T1**<br> *in* b_zero_point:**T2**<br> *in* bias:**T3**<br> *out* Y:**T3**|1+|**T1** = tensor(int8), tensor(uint8)<br> **T2** = tensor(int8), tensor(uint8)<br> **T3** = tensor(float), tensor(float16)|
 |MultiHeadAttention|*in* query:**T**<br> *in* key:**T**<br> *in* value:**T**<br> *in* bias:**T**<br> *in* key_padding_mask:**M**<br> *in* relative_position_bias:**T**<br> *in* past_key:**T**<br> *in* past_value:**T**<br> *out* output:**T**<br> *out* present_key:**T**<br> *out* present_value:**T**|1+|**M** = tensor(int32)<br> **T** = tensor(float), tensor(float16)|
 |NhwcConv|*in* X:**T**<br> *in* W:**T**<br> *in* B:**T**<br> *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
 |QLinearAdd|*in* A:**T**<br> *in* A_scale:**tensor(float)**<br> *in* A_zero_point:**T**<br> *in* B:**T**<br> *in* B_scale:**tensor(float)**<br> *in* B_zero_point:**T**<br> *in* C_scale:**tensor(float)**<br> *in* C_zero_point:**T**<br>
*out* C:**T**|1+|**T** = tensor(int8), tensor(uint8)| diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h index 7aad587304bb6..1cb0b3f8e65d0 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h @@ -882,7 +882,7 @@ struct OperatorDescTraits template <> struct OperatorDescTraits { - static constexpr DML_OPERATOR_TYPE Type = (DML_OPERATOR_TYPE) DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT; + static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT; }; template <> From 7d0437e026d2bc8f54f0ebba842f34daa12bfffd Mon Sep 17 00:00:00 2001 From: raoanag <127366241+raoanag@users.noreply.github.com> Date: Thu, 15 Feb 2024 12:51:14 -0800 Subject: [PATCH 03/10] MatMulIntegerToFloat reference update for tests (#19333) ### Description MatMulIntegerToFloat tests were noticed to be failing for DMLEP the root cause being inaccuracies in CPUEP implementation to some data type combinations. ``` .\onnxruntime_test_all.exe --gtest_filter="*MatMulIntegerToFloat.*" Note: Google Test filter = *MatMulIntegerToFloat.* [==========] Running 22 tests from 1 test suite. [----------] Global test environment set-up. [----------] 22 tests from MatMulIntegerToFloat [ RUN ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_S8S8 [ OK ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_S8S8 (620 ms) [ RUN ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_S8S8 [ OK ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_S8S8 (497 ms) [ RUN ] MatMulIntegerToFloat.NoZeroPoint_NoBias_test_S8S8 [ OK ] MatMulIntegerToFloat.NoZeroPoint_NoBias_test_S8S8 (488 ms) [ RUN ] MatMulIntegerToFloat.HasZeroPoint_HasBias_test_S8S8 [ OK ] MatMulIntegerToFloat.HasZeroPoint_HasBias_test_S8S8 (503 ms) [ RUN ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_U8U8 [ OK ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_U8U8 (495 ms) [ RUN ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_U8U8 [ OK ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_U8U8 (488 ms) [ RUN ] MatMulIntegerToFloat.NoZeroPoint_NoBias_test_U8U8 [ OK ] MatMulIntegerToFloat.NoZeroPoint_NoBias_test_U8U8 (492 ms) [ RUN ] MatMulIntegerToFloat.HasZeroPoint_HasBias_test_U8X8 [ OK ] MatMulIntegerToFloat.HasZeroPoint_HasBias_test_U8X8 (502 ms) [ RUN ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_S8U8 [ OK ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_S8U8 (452 ms) [ RUN ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_S8U8 [ OK ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_S8U8 (454 ms) [ RUN ] MatMulIntegerToFloat.NoZeroPoint_NoBias_test_S8U8 [ OK ] MatMulIntegerToFloat.NoZeroPoint_NoBias_test_S8U8 (446 ms) [ RUN ] MatMulIntegerToFloat.HasZeroPoint_HasBias_test_S8U8 [ OK ] MatMulIntegerToFloat.HasZeroPoint_HasBias_test_S8U8 (508 ms) [ RUN ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_U8S8 [ OK ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_U8S8 (456 ms) [ RUN ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_U8S8 [ OK ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_U8S8 (455 ms) [ RUN ] MatMulIntegerToFloat.NoZeroPoint_NoBias_test_U8S8 [ OK ] MatMulIntegerToFloat.NoZeroPoint_NoBias_test_U8S8 (447 ms) [ RUN ] MatMulIntegerToFloat.HasZeroPoint_HasBias_test_U8S8 [ OK ] MatMulIntegerToFloat.HasZeroPoint_HasBias_test_U8S8 (465 ms) [ RUN ] 
MatMulIntegerToFloat.MatMulIntegerToFloat_FP16_U8U8 [ OK ] MatMulIntegerToFloat.MatMulIntegerToFloat_FP16_U8U8 (111 ms) [ RUN ] MatMulIntegerToFloat.MatMulIntegerToFloat_FP16_U8S8 [ OK ] MatMulIntegerToFloat.MatMulIntegerToFloat_FP16_U8S8 (115 ms) [ RUN ] MatMulIntegerToFloat.MatMulIntegerToFloat_FP16_S8S8 [ OK ] MatMulIntegerToFloat.MatMulIntegerToFloat_FP16_S8S8 (114 ms) [ RUN ] MatMulIntegerToFloat.MatMulIntegerToFloat_FP16_S8U8 [ OK ] MatMulIntegerToFloat.MatMulIntegerToFloat_FP16_S8U8 (110 ms) [ RUN ] MatMulIntegerToFloat.MatMulIntegerToFloat_FP16 [ OK ] MatMulIntegerToFloat.MatMulIntegerToFloat_FP16 (112 ms) [ RUN ] MatMulIntegerToFloat.MatMulInteger_With_ZeroPoint [ OK ] MatMulIntegerToFloat.MatMulInteger_With_ZeroPoint (337 ms) [----------] 22 tests from MatMulIntegerToFloat (8679 ms total) [----------] Global test environment tear-down [==========] 22 tests from 1 test suite ran. (8680 ms total) [ PASSED ] 22 tests. memleakdbg: ----- No memory leaks detected ----- ``` ### Motivation and Context * `CalculateMatMulIntegerToFloat` to replace CPU EP run reference * Added more FP32 testcases to isolate all input datatype combinations * Added fixed input to `MatMulIntegerToFloat_FP16*` test cases as for FP16 test cases. There is no support for direct onnxruntime::MLFloat16 datatype comparison with gtest framework. This leads to FP32 reference -> FP16 tensor -> FP32 reference conversion which is adding inaccuracies. ![image](https://github.com/microsoft/onnxruntime/assets/127366241/c6aaf68e-44df-42be-9860-df2cb0dd7a56) * Removing `MatMulIntegerToFloatHelper` as its same as `MatMulHelper` * onnxruntime/test/testdata/matmul_integer_to_float.py` is still capable of generating FP16 models, but we do not produce any for now --- .../dml/OperatorAuthorHelper/OperatorHelper.h | 8 - .../matmul_integer_to_float_test.cc | 396 ++++++++++++++---- .../test/testdata/matmul_integer_to_float.py | 7 - 3 files changed, 321 insertions(+), 90 deletions(-) diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h index 1ba528d0b2da0..8b0d643b0709c 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h @@ -870,14 +870,6 @@ class QLinearMatMulHelper : public MatMulHelperBase QLinearMatMulHelper(const Info_t& info, const Shape_t& shape) : MatMulHelperBase(info, shape, 0, 3) {} }; -class MatMulIntegerToFloatHelper : public MatMulHelperBase -{ -public: - template - MatMulIntegerToFloatHelper(const Info_t& info, const Shape_t& shape) : MatMulHelperBase(info, shape, 0, 1) {} -}; - - class TopKHelper { void Initialize( diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc index 51d9a57b5e447..0d5dab35826c1 100644 --- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc @@ -24,28 +24,66 @@ namespace onnxruntime { namespace test { template -void TestMatMulIntegerToFloat(const std::vector& A_dims, - std::vector B_dims, - const std::string& reference_model, - bool is_matrix_b_constant, +static void CalculateMatMulIntegerToFloat(const int64_t M, const int64_t N, const int64_t K, + const std::vector& A_data, const std::vector& A_scale, + const std::vector& A_zero_point, const std::vector& B_data, + std::vector& B_scale, std::vector& B_zero_point, + const std::vector& Bias, 
std::vector& Y_data, + bool per_column, bool has_zp, bool has_bias) { + if (!per_column) { + B_zero_point.resize(N, B_zero_point[0]); + B_scale.resize(N, B_scale[0]); + } + + for (int64_t m = 0; m < M; m++) { + for (int64_t n = 0; n < N; n++) { + float sum = 0.0f; + for (int64_t k = 0; k < K; k++) { + float A_dequantized = has_zp ? + (static_cast(A_data[m * K + k]) - static_cast(A_zero_point[0])) * A_scale[0] : + A_data[m * K + k] * A_scale[0]; + float B_dequantized = has_zp ? + (static_cast(B_data[k * N + n]) - static_cast(B_zero_point[n])) * B_scale[n] : + B_data[k * N + n] * B_scale[n]; + + sum += A_dequantized * B_dequantized; + } + if (has_bias) { + sum += Bias[n]; + } + Y_data[m * N + n] = static_cast(sum); + } + } +} + +template +void TestMatMulIntegerToFloat(bool is_matrix_b_constant, bool per_column = false, bool has_zp = true, bool has_bias = false) { // create rand inputs RandomValueGenerator random{}; - + int64_t M = 4; + int64_t N = 128; + int64_t K = 128; + std::vector A_dims{M, K}; + std::vector B_dims{K, N}; + std::vector Y_dims{M, K}; std::vector A_data; - std::vector tmp_A_data = random.Uniform(A_dims, - std::numeric_limits::lowest(), - std::numeric_limits::max()); - std::transform(tmp_A_data.begin(), tmp_A_data.end(), std::back_inserter(A_data), [](int32_t v) -> WType { + std::vector tmp_A_data = random.Uniform(A_dims, + std::numeric_limits::lowest(), + std::numeric_limits::max()); + std::transform(tmp_A_data.begin(), tmp_A_data.end(), std::back_inserter(A_data), [](int32_t v) -> IType { return static_cast(v); }); std::vector B_data; - std::vector tmp_B_data = random.Uniform(B_dims, - std::numeric_limits::lowest(), - std::numeric_limits::max()); + + std::vector tmp_B_data; + tmp_B_data = random.Uniform(B_dims, + (constexpr(std::is_same_v)) ? 
+ std::numeric_limits::lowest()/2 : std::numeric_limits::lowest(), + std::numeric_limits::max() / 2); std::transform(tmp_B_data.begin(), tmp_B_data.end(), std::back_inserter(B_data), [](int32_t v) -> WType { return static_cast(v); }); @@ -60,9 +98,9 @@ void TestMatMulIntegerToFloat(const std::vector& A_dims, std::for_each(B_zero_point.begin(), B_zero_point.end(), [&random](WType& zp) { - zp = static_cast(random.Uniform(std::array{1}, - std::numeric_limits::lowest(), - std::numeric_limits::max())[0]); + zp = static_cast(random.Uniform(std::array{1}, + std::numeric_limits::lowest(), + std::numeric_limits::max())[0]); }); std::vector Bias = random.Uniform(AsSpan({B_dims.back()}), -0.1f, 0.1f); @@ -77,7 +115,7 @@ void TestMatMulIntegerToFloat(const std::vector& A_dims, test.AddInput("a_zero_point", {1}, A_zero_point); test.AddInput("b_zero_point", {b_scale_zp_size}, B_zero_point); } else { - test.AddOptionalInputEdge(); + test.AddOptionalInputEdge(); test.AddOptionalInputEdge(); } @@ -87,39 +125,39 @@ void TestMatMulIntegerToFloat(const std::vector& A_dims, test.AddOptionalInputEdge(); } - test.AddReferenceOutputs(reference_model); -#if defined(USE_DML) - if constexpr (std::is_same_v) { - test.SetOutputRelErr("Y", 2e-2f); + std::vector Y_data(M * N); + CalculateMatMulIntegerToFloat(M, N, K, A_data, A_scale, A_zero_point, + B_data, B_scale, B_zero_point, Bias, Y_data, + per_column, has_zp, has_bias); + + if (constexpr(std::is_same_v)) { + test.AddOutput("Y", {M, N}, Y_data); } else { - test.SetOutputRelErr("Y", 2.0f); + test.AddOutput("Y", {M, N}, ToFloat16(Y_data)); + test.SetOutputAbsErr("Y", 0.5f); } -#else - test.SetOutputRelErr("Y", 1e-4f); -#endif - if constexpr (std::is_same_v) { - test.Run(); + // Only DML EP supports these data type combinations for now + if ((constexpr(std::is_same_v)) || + (constexpr(std::is_same_v) && + constexpr(std::is_same_v) && + constexpr(std::is_same_v))) { + std::vector> execution_providers; + execution_providers.push_back(DefaultDmlExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } else { - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCpuExecutionProvider}); + test.Run(); } } template -void RunMatMulIntegerToFloatTest(const string& model_path) { - std::vector A_dims{4, 128}; - std::vector B_dims{128, 128}; - std::vector Y_dims{4, 128}; - +void RunMatMulIntegerToFloatTest() { TestMatMulIntegerToFloat( - A_dims, - B_dims, - model_path, - false, /*is_matrix_b_constant*/ - false, /*per_column*/ - HasZeroPoint, /*has_zp*/ - HasBias /*has_bias*/ + false, /*is_matrix_b_constant*/ + false, /*per_column*/ + HasZeroPoint, /*has_zp*/ + HasBias /*has_bias*/ ); TestMatMulIntegerToFloat( @@ -130,66 +168,274 @@ void RunMatMulIntegerToFloatTest(const string& model_path) { false, /*per_column*/ HasZeroPoint, /*has_zp*/ HasBias /*has_bias*/ + true, /*is_matrix_b_constant*/ + false, /*per_column*/ + HasZeroPoint, /*has_zp*/ + HasBias /*has_bias*/ ); TestMatMulIntegerToFloat( - A_dims, - B_dims, - model_path, - false, /*is_matrix_b_constant*/ - true, /*per_column*/ - HasZeroPoint, /*has_zp*/ - HasBias /*has_bias*/ + false, /*is_matrix_b_constant*/ + true, /*per_column*/ + HasZeroPoint, /*has_zp*/ + HasBias /*has_bias*/ ); TestMatMulIntegerToFloat( - A_dims, - B_dims, - model_path, - true, /*is_matrix_b_constant*/ - true, /*per_column*/ - HasZeroPoint, /*has_zp*/ - HasBias /*has_bias*/ + true, /*is_matrix_b_constant*/ + true, /*per_column*/ + HasZeroPoint, /*has_zp*/ + HasBias /*has_bias*/ ); } -#if 
USE_DML -TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8X8_FP16) { - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float16_int8.onnx"); - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float16_uint8.onnx"); +TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8S8) { + RunMatMulIntegerToFloatTest(); } -TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8X8_FP16) { - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float16_int8_bias.onnx"); - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float16_uint8_bias.onnx"); +TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8S8) { + RunMatMulIntegerToFloatTest(); } -TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8S8_FP16) { - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float16_int8_int8.onnx"); +TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_S8S8) { + RunMatMulIntegerToFloatTest(); } -TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8S8_FP16) { - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float16_int8_int8_bias.onnx"); +TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_S8S8) { + RunMatMulIntegerToFloatTest(); } -#endif // USE_DML -TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8X8) { - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_int8.onnx"); - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_uint8.onnx"); +TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8U8) { + RunMatMulIntegerToFloatTest(); } -TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8X8) { - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_int8_bias.onnx"); - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_uint8_bias.onnx"); +TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8U8) { + RunMatMulIntegerToFloatTest(); } -TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8S8) { - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_int8_int8.onnx"); +TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_U8U8) { + RunMatMulIntegerToFloatTest(); } -TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8S8) { - RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_int8_int8_bias.onnx"); +TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8X8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8S8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_U8S8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_U8S8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8S8) { + RunMatMulIntegerToFloatTest(); +} + +// DML EP supports Float16 output type and Signed A Matrix and Unsigned B Matric for Float32 output +#if defined(USE_DML) + +TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8U8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8U8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, NoZeroPoint_NoBias_test_S8U8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_S8U8) { + RunMatMulIntegerToFloatTest(); +} + +TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_U8U8) { + OpTester test("MatMulIntegerToFloat", 1, kMSDomain); + int64_t M = 5; + int64_t N = 5; + int64_t K = 2; + + std::vector A_data = {1, 5, 2, 1, 9, + 1, 1, 3, 7, 2}; + std::vector B_data = {3, 7, 2, 1, 1, + 2, 1, 9, 1, 1}; + std::vector A_scale = ToFloat16({3.0f}); + std::vector 
B_scale = ToFloat16({2.0f}); + test.AddInput("A", {M, K}, A_data); + test.AddInput("B", {K, N}, B_data); + std::vector A_zero_point = {1}; + std::vector B_zero_point = {1}; + + test.AddInput("a_scale", {1}, A_scale); + test.AddInput("b_scale", {1}, B_scale); + test.AddInput("a_zero_point", {1}, A_zero_point); + test.AddInput("b_zero_point", {1}, B_zero_point); + + std::vector Y_data(M * N); + CalculateMatMulIntegerToFloat(M, N, K, A_data, A_scale, A_zero_point, + B_data, B_scale, B_zero_point, {}, Y_data, + false, true, false); + + test.AddOutput("Y", {M, N}, ToFloat16(Y_data)); + std::vector> execution_providers; + execution_providers.push_back(DefaultDmlExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_U8S8) { + OpTester test("MatMulIntegerToFloat", 1, kMSDomain); + int64_t M = 5; + int64_t N = 5; + int64_t K = 2; + + std::vector A_data = {3, 7, 2, 1, 1, + 2, 1, 9, 1, 1}; + std::vector B_data = {2, -1, -9, 1, 1, + -1, 0, -3, 1, -4}; + std::vector A_scale = ToFloat16({-4.0f}); + std::vector B_scale = ToFloat16({2.0f}); + test.AddInput("A", {M, K}, A_data); + test.AddInput("B", {K, N}, B_data); + std::vector A_zero_point = {1}; + std::vector B_zero_point = {3}; + std::vector Bias = ToFloat16({11.0f, -17.0f, 1.0f, -3.0f, 12.0f}); + + test.AddInput("a_scale", {1}, A_scale); + test.AddInput("b_scale", {1}, B_scale); + test.AddInput("a_zero_point", {1}, A_zero_point); + test.AddInput("b_zero_point", {1}, B_zero_point); + + std::vector Y_data(M * N); + CalculateMatMulIntegerToFloat(M, N, K, A_data, A_scale, A_zero_point, + B_data, B_scale, B_zero_point, {}, Y_data, + false, true, false); + + test.AddOutput("Y", {M, N}, ToFloat16(Y_data)); + + std::vector> execution_providers; + execution_providers.push_back(DefaultDmlExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_S8S8) { + OpTester test("MatMulIntegerToFloat", 1, kMSDomain); + int64_t M = 5; + int64_t N = 5; + int64_t K = 2; + + std::vector A_data = {3, 7, -2, 1, 1, + 2, -1, -9, 1, 1}; + std::vector B_data = {2, -1, -9, 1, 1, + -1, 0, -3, 1, -4}; + std::vector A_scale = ToFloat16({-4.0f}); + std::vector B_scale = ToFloat16({2.0f}); + test.AddInput("A", {M, K}, A_data); + test.AddInput("B", {K, N}, B_data); + std::vector A_zero_point = {-1}; + std::vector B_zero_point = {3}; + std::vector Bias = ToFloat16({11.0f, -17.0f, 1.0f, -3.0f, 12.0f}); + + test.AddInput("a_scale", {1}, A_scale); + test.AddInput("b_scale", {1}, B_scale); + test.AddInput("a_zero_point", {1}, A_zero_point); + test.AddInput("b_zero_point", {1}, B_zero_point); + test.AddInput("bias", {N}, Bias); + + std::vector Y_data(M * N); + CalculateMatMulIntegerToFloat(M, N, K, A_data, A_scale, A_zero_point, + B_data, B_scale, B_zero_point, Bias, Y_data, + false, true, true); + + test.AddOutput("Y", {M, N}, ToFloat16(Y_data)); + + std::vector> execution_providers; + execution_providers.push_back(DefaultDmlExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16_S8U8) { + OpTester test("MatMulIntegerToFloat", 1, kMSDomain); + int64_t M = 5; + int64_t N = 5; + int64_t K = 2; + + std::vector A_data = {3, 7, -2, 1, 1, + 2, -1, -9, 1, 1}; + std::vector B_data = {3, 7, 2, 1, 1, + 2, 1, 9, 1, 1}; + std::vector A_scale = ToFloat16({-4.0f}); 
+ std::vector B_scale = ToFloat16({2.0f}); + test.AddInput("A", {M, K}, A_data); + test.AddInput("B", {K, N}, B_data); + std::vector A_zero_point = {-1}; + std::vector B_zero_point = {1}; + std::vector Bias = ToFloat16({11.0f, -17.0f, 1.0f, -3.0f, 12.0f}); + + test.AddInput("a_scale", {1}, A_scale); + test.AddInput("b_scale", {1}, B_scale); + test.AddInput("a_zero_point", {1}, A_zero_point); + test.AddInput("b_zero_point", {1}, B_zero_point); + test.AddInput("bias", {N}, Bias); + + std::vector Y_data(M * N); + CalculateMatMulIntegerToFloat(M, N, K, A_data, A_scale, A_zero_point, + B_data, B_scale, B_zero_point, Bias, Y_data, + false, true, true); + + test.AddOutput("Y", {M, N}, ToFloat16(Y_data)); + + std::vector> execution_providers; + execution_providers.push_back(DefaultDmlExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +TEST(MatMulIntegerToFloat, MatMulIntegerToFloat_FP16) { + OpTester test("MatMulIntegerToFloat", 1, kMSDomain); + int64_t M = 2; + int64_t N = 2; + int64_t K = 3; + + std::vector A_data = {11, -2, 5, + -1, 3, 10}; + std::vector B_data = {-13, -2, + 9, 55, + -1, 23}; + std::vector A_scale = ToFloat16({0.910f}); + std::vector B_scale = ToFloat16({1.10f, 1.123f}); + + std::vector A_zero_point = {113}; + std::vector B_zero_point = {98, 71}; + + std::vector Bias = ToFloat16({0.10f, 1.123f}); + + test.AddInput("A", {M, K}, A_data); + test.AddInput("B", {K, N}, B_data); + + test.AddInput("a_scale", {}, {A_scale}); + test.AddInput("b_scale", {N}, B_scale); + test.AddInput("a_zero_point", {}, {A_zero_point}); + test.AddInput("b_zero_point", {N}, B_zero_point); + test.AddInput("bias", {N}, Bias); + + std::vector Y_data(M * N); + CalculateMatMulIntegerToFloat(M, N, K, A_data, A_scale, A_zero_point, + B_data, B_scale, B_zero_point, Bias, Y_data, + true, true, true); + + test.AddOutput("Y", {M, N}, ToFloat16(Y_data)); + test.SetOutputRelErr("Y", 2e-2f); + std::vector> execution_providers; + execution_providers.push_back(DefaultDmlExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } +#endif TEST(MatMulIntegerToFloat, MatMulInteger_With_ZeroPoint) { auto test_case = [&](const std::vector& input_shape, diff --git a/onnxruntime/test/testdata/matmul_integer_to_float.py b/onnxruntime/test/testdata/matmul_integer_to_float.py index 206a8514253c5..37db93a288b08 100644 --- a/onnxruntime/test/testdata/matmul_integer_to_float.py +++ b/onnxruntime/test/testdata/matmul_integer_to_float.py @@ -65,13 +65,6 @@ def GenerateModel(model_name, sign_i, sign_w, output_type_fp16, has_zp=True, bia if __name__ == "__main__": GenerateModel("matmul_integer_to_float16_int8.onnx", sign_i=False, sign_w=True, output_type_fp16=True) - GenerateModel("matmul_integer_to_float16_uint8.onnx", sign_i=False, sign_w=False, output_type_fp16=True) - GenerateModel("matmul_integer_to_float16_int8_bias.onnx", sign_i=False, sign_w=True, output_type_fp16=True, has_zp=False, bias=True) - GenerateModel("matmul_integer_to_float16_uint8_bias.onnx", sign_i=False, sign_w=False, output_type_fp16=True, has_zp=False, bias=True) - - GenerateModel("matmul_integer_to_float16_int8_int8.onnx", sign_i=True, sign_w=True, output_type_fp16=True) - GenerateModel("matmul_integer_to_float16_int8_int8_bias.onnx", sign_i=True, sign_w=True, output_type_fp16=True, has_zp=False, bias=True) - GenerateModel("matmul_integer_to_float_int8.onnx", sign_i=False, sign_w=True, output_type_fp16=False) 
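    # Flag meanings for the GenerateModel calls in this block, read off the function
    # signature earlier in this file: sign_i / sign_w pick int8 (True) vs. uint8 (False)
    # for the input and weight tensors, output_type_fp16 switches the scales/bias/output
    # between FLOAT16 and FLOAT, and has_zp / bias toggle the optional zero-point inputs
    # and the trailing Add node.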
GenerateModel("matmul_integer_to_float_uint8.onnx", sign_i=False, sign_w=False, output_type_fp16=False) GenerateModel("matmul_integer_to_float_int8_bias.onnx", sign_i=False, sign_w=True, output_type_fp16=False, has_zp=False, bias=True) From 1c74a29eb66428a4feca62b1bcdc280e76170b73 Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Thu, 22 Feb 2024 09:19:41 -0800 Subject: [PATCH 04/10] Resolve conflicts --- .../core/optimizer/graph_transformer_utils.cc | 12 ++++++------ .../src/External/DirectMLHelpers/ApiTraits.h | 7 +++++++ .../dml/OperatorAuthorHelper/OperatorHelper.h | 2 +- .../matmul_integer_to_float_test.cc | 19 +++---------------- 4 files changed, 17 insertions(+), 23 deletions(-) diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc index bcaf61e3cef90..0015ac1e5aff4 100644 --- a/onnxruntime/core/optimizer/graph_transformer_utils.cc +++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc @@ -273,12 +273,12 @@ InlinedVector> GenerateTransformers( onnxruntime::kCudaExecutionProvider, onnxruntime::kRocmExecutionProvider, onnxruntime::kDmlExecutionProvider}; - const InlinedHashSet cpu_cuda_rocm_acl_armnn_eps = {onnxruntime::kCpuExecutionProvider, - onnxruntime::kCudaExecutionProvider, - onnxruntime::kRocmExecutionProvider, - onnxruntime::kAclExecutionProvider, - onnxruntime::kArmNNExecutionProvider, - onnxruntime::kJsExecutionProvider }; + const InlinedHashSet cpu_cuda_rocm_acl_armnn_js_eps = {onnxruntime::kCpuExecutionProvider, + onnxruntime::kCudaExecutionProvider, + onnxruntime::kRocmExecutionProvider, + onnxruntime::kAclExecutionProvider, + onnxruntime::kArmNNExecutionProvider, + onnxruntime::kJsExecutionProvider}; const InlinedHashSet cpu_dml_eps = {onnxruntime::kCpuExecutionProvider, onnxruntime::kDmlExecutionProvider}; #ifdef MLAS_TARGET_AMD64_IX86 diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h index 1cb0b3f8e65d0..176fb2dfaa1e8 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h @@ -2227,6 +2227,11 @@ struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_ACTIVATION_SWISH> { using DescType = DML_ACTIVATION_SWISH_OPERATOR_DESC; }; +template <> +struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT> +{ + using DescType = DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC; +}; template <> struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_ACTIVATION_HARD_SWISH> @@ -2589,6 +2594,8 @@ auto OperatorTypeVisitor(DML_OPERATOR_TYPE type, Visitor&& visitor, Ts&&... 
args return std::invoke(std::forward(visitor), DML_ACTIVATION_SWISH_OPERATOR_DESC{}, std::forward(args)...); case DML_OPERATOR_ACTIVATION_HARD_SWISH: return std::invoke(std::forward(visitor), DML_ACTIVATION_HARD_SWISH_OPERATOR_DESC{}, std::forward(args)...); + case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT: + return std::invoke(std::forward(visitor), DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC{}, std::forward(args)...); default: ORT_THROW_HR(E_INVALIDARG); return std::invoke(std::forward(visitor), DML_ACTIVATION_RELU_OPERATOR_DESC{}, std::forward(args)...); diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h index 8b0d643b0709c..06bacc1b28c99 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h @@ -1775,7 +1775,7 @@ using ShapeInferenceHelper_Identity16 = GetOutputShapeAsInputShapeHelper; using ShapeInferenceHelper_Identity19 = GetOutputShapeAsInputShapeHelper; using ShapeInferenceHelper_MatMul = MatMulHelper; using ShapeInferenceHelper_MatMulInteger = MatMulHelper; -using ShapeInferenceHelper_MatMulIntegerToFloat = MatMulIntegerToFloatHelper; +using ShapeInferenceHelper_MatMulIntegerToFloat = MatMulHelper; using ShapeInferenceHelper_QLinearMatMul = QLinearMatMulHelper; using ShapeInferenceHelper_QLinearAdd = GetBroadcastedOutputShapeHelper; using ShapeInferenceHelper_DynamicQuantizeLinear = GetOutputShapeAsInputShapeHelper; diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc index 0d5dab35826c1..c7f2ec89fb817 100644 --- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc @@ -39,12 +39,8 @@ static void CalculateMatMulIntegerToFloat(const int64_t M, const int64_t N, cons for (int64_t n = 0; n < N; n++) { float sum = 0.0f; for (int64_t k = 0; k < K; k++) { - float A_dequantized = has_zp ? - (static_cast(A_data[m * K + k]) - static_cast(A_zero_point[0])) * A_scale[0] : - A_data[m * K + k] * A_scale[0]; - float B_dequantized = has_zp ? - (static_cast(B_data[k * N + n]) - static_cast(B_zero_point[n])) * B_scale[n] : - B_data[k * N + n] * B_scale[n]; + float A_dequantized = has_zp ? (static_cast(A_data[m * K + k]) - static_cast(A_zero_point[0])) * A_scale[0] : A_data[m * K + k] * A_scale[0]; + float B_dequantized = has_zp ? (static_cast(B_data[k * N + n]) - static_cast(B_zero_point[n])) * B_scale[n] : B_data[k * N + n] * B_scale[n]; sum += A_dequantized * B_dequantized; } @@ -81,8 +77,7 @@ void TestMatMulIntegerToFloat(bool is_matrix_b_constant, std::vector tmp_B_data; tmp_B_data = random.Uniform(B_dims, - (constexpr(std::is_same_v)) ? - std::numeric_limits::lowest()/2 : std::numeric_limits::lowest(), + (constexpr(std::is_same_v)) ? 
std::numeric_limits::lowest() / 2 : std::numeric_limits::lowest(), std::numeric_limits::max() / 2); std::transform(tmp_B_data.begin(), tmp_B_data.end(), std::back_inserter(B_data), [](int32_t v) -> WType { return static_cast(v); @@ -148,7 +143,6 @@ void TestMatMulIntegerToFloat(bool is_matrix_b_constant, } else { test.Run(); } - } template @@ -161,13 +155,6 @@ void RunMatMulIntegerToFloatTest() { ); TestMatMulIntegerToFloat( - A_dims, - B_dims, - model_path, - true, /*is_matrix_b_constant*/ - false, /*per_column*/ - HasZeroPoint, /*has_zp*/ - HasBias /*has_bias*/ true, /*is_matrix_b_constant*/ false, /*per_column*/ HasZeroPoint, /*has_zp*/ From 88f988e1134c34f8a8a439ce45ad72cb2d9eb4d3 Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Thu, 22 Feb 2024 10:25:37 -0800 Subject: [PATCH 05/10] Lint runner --- .../DmlOperatorMatMulIntegerToFloat.cpp | 14 +++--- .../matmul_integer_to_float_test.cc | 11 +++-- .../test/optimizer/graph_transform_test.cc | 4 +- .../test/testdata/matmul_integer_to_float.py | 47 ++++++++++++++++--- .../fusion/matmul_integer_to_float.py | 2 +- 5 files changed, 58 insertions(+), 20 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorMatMulIntegerToFloat.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorMatMulIntegerToFloat.cpp index ba0ecb9d7af69..b5a3dd0960b86 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorMatMulIntegerToFloat.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorMatMulIntegerToFloat.cpp @@ -19,7 +19,7 @@ class DmlOperatorMatMulIntegerToFloat : public DmlOperator ortBias, ortInputCount }; - + enum DmlInputIndex : uint32_t { dmlA, @@ -51,7 +51,6 @@ class DmlOperatorMatMulIntegerToFloat : public DmlOperator // Broadcast Bias tensor to the shape of the output tensor. if(kernelInfo.IsInputValid(OrtInputTensors::ortBias)) { - m_inputTensorDescs[DmlInputIndex::dmlBias] = CreateTensorDescFromInput(kernelInfo, OrtInputTensors::ortBias, TensorAxis::DoNotCoerce, TensorAxis::W, TensorAxis::RightAligned, outputShape); } @@ -60,9 +59,9 @@ class DmlOperatorMatMulIntegerToFloat : public DmlOperator // Resize the A Scale to be the same dimension as the input tensor. // The 1D tensor needs to be moved to the H channel. m_inputTensorDescs[DmlInputIndex::dmlAScale] = CreateTensorDescFromInput( - kernelInfo, + kernelInfo, OrtInputTensors::ortAScale, - TensorAxis::DoNotCoerce, + TensorAxis::DoNotCoerce, TensorAxis::H, TensorAxis::LeftAligned, std::nullopt, @@ -73,11 +72,10 @@ class DmlOperatorMatMulIntegerToFloat : public DmlOperator // The 1D tensor needs to be moved to the H channel. 
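        // (These 1-D scale/zero-point inputs are re-described against the higher-rank DML
        //  tensor descs this operator builds; placing their single dimension on the H axis
        //  is what lets per-row A values broadcast across A's M dimension during
        //  dequantization, mirroring the A-scale handling above. Rationale inferred from
        //  the surrounding code, not stated in the patch.)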
if (kernelInfo.IsInputValid(OrtInputTensors::ortAZeroPoint)) { - m_inputTensorDescs[DmlInputIndex::dmlAZeroPoint] = CreateTensorDescFromInput( - kernelInfo, + kernelInfo, OrtInputTensors::ortAZeroPoint, - TensorAxis::DoNotCoerce, + TensorAxis::DoNotCoerce, TensorAxis::H, TensorAxis::LeftAligned, std::nullopt, @@ -110,4 +108,4 @@ class DmlOperatorMatMulIntegerToFloat : public DmlOperator DML_OP_DEFINE_CREATION_FUNCTION(MatMulIntegerToFloat, DmlOperatorMatMulIntegerToFloat); -} // namespace Dml \ No newline at end of file +} // namespace Dml diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc index c7f2ec89fb817..ed1911be4cf77 100644 --- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc @@ -39,8 +39,12 @@ static void CalculateMatMulIntegerToFloat(const int64_t M, const int64_t N, cons for (int64_t n = 0; n < N; n++) { float sum = 0.0f; for (int64_t k = 0; k < K; k++) { - float A_dequantized = has_zp ? (static_cast(A_data[m * K + k]) - static_cast(A_zero_point[0])) * A_scale[0] : A_data[m * K + k] * A_scale[0]; - float B_dequantized = has_zp ? (static_cast(B_data[k * N + n]) - static_cast(B_zero_point[n])) * B_scale[n] : B_data[k * N + n] * B_scale[n]; + float A_dequantized = has_zp ? + (static_cast(A_data[m * K + k]) - static_cast(A_zero_point[0])) * A_scale[0] : + A_data[m * K + k] * A_scale[0]; + float B_dequantized = has_zp ? + (static_cast(B_data[k * N + n]) - static_cast(B_zero_point[n])) * B_scale[n] : + B_data[k * N + n] * B_scale[n]; sum += A_dequantized * B_dequantized; } @@ -77,7 +81,8 @@ void TestMatMulIntegerToFloat(bool is_matrix_b_constant, std::vector tmp_B_data; tmp_B_data = random.Uniform(B_dims, - (constexpr(std::is_same_v)) ? std::numeric_limits::lowest() / 2 : std::numeric_limits::lowest(), + (constexpr(std::is_same_v)) ? 
+ std::numeric_limits::lowest() / 2 :std::numeric_limits::lowest(), std::numeric_limits::max() / 2); std::transform(tmp_B_data.begin(), tmp_B_data.end(), std::back_inserter(B_data), [](int32_t v) -> WType { return static_cast(v); diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index 5c885cf31fe31..0e58c26ff05df 100755 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -5681,7 +5681,7 @@ TEST_F(GraphTransformationTests, MatMulIntegerToFloatTest) { } #ifdef USE_DML - TEST_F(GraphTransformationTests, MatMulIntegerToFloat16Test) { +TEST_F(GraphTransformationTests, MatMulIntegerToFloat16Test) { constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/matmul_integer_to_float16_int8.onnx"; std::shared_ptr p_model; ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); @@ -5689,7 +5689,7 @@ TEST_F(GraphTransformationTests, MatMulIntegerToFloatTest) { for (auto& node : graph.Nodes()) { node.SetExecutionProviderType(kDmlExecutionProvider); - } + } onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::make_unique(), TransformerLevel::Level2)); ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, *logger_)); diff --git a/onnxruntime/test/testdata/matmul_integer_to_float.py b/onnxruntime/test/testdata/matmul_integer_to_float.py index 37db93a288b08..e6c51009018f9 100644 --- a/onnxruntime/test/testdata/matmul_integer_to_float.py +++ b/onnxruntime/test/testdata/matmul_integer_to_float.py @@ -13,7 +13,13 @@ def GenerateModel(model_name, sign_i, sign_w, output_type_fp16, has_zp=True, bia "MatMulInteger", ), helper.make_node("Mul", ["a_scale", "b_scale"], ["multiplier"], "mul_right"), - helper.make_node("Cast", ["matmul_output_int32"], ["matmul_output_float"], "cast", to=TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT), + helper.make_node( + "Cast", + ["matmul_output_int32"], + ["matmul_output_float"], + "cast", + to=TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, + ), helper.make_node( "Mul", ["matmul_output_float", "multiplier"], @@ -48,14 +54,22 @@ def GenerateModel(model_name, sign_i, sign_w, output_type_fp16, has_zp=True, bia if bias: nodes.extend([helper.make_node("Add", ["mul_bottom_output", "bias"], ["Y"], "add")]) - inputs.extend([helper.make_tensor_value_info("bias", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, ["N"])]) + inputs.extend( + [ + helper.make_tensor_value_info( + "bias", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, ["N"] + ) + ] + ) graph = helper.make_graph( nodes, "DynamicQuantizeMatMul_fusion", # name inputs, [ # outputs - helper.make_tensor_value_info("Y", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, ["M", "N"]), + helper.make_tensor_value_info( + "Y", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, ["M", "N"] + ), ], ) @@ -67,8 +81,29 @@ def GenerateModel(model_name, sign_i, sign_w, output_type_fp16, has_zp=True, bia GenerateModel("matmul_integer_to_float16_int8.onnx", sign_i=False, sign_w=True, output_type_fp16=True) GenerateModel("matmul_integer_to_float_int8.onnx", sign_i=False, sign_w=True, output_type_fp16=False) GenerateModel("matmul_integer_to_float_uint8.onnx", sign_i=False, sign_w=False, output_type_fp16=False) - GenerateModel("matmul_integer_to_float_int8_bias.onnx", sign_i=False, sign_w=True, 
output_type_fp16=False, has_zp=False, bias=True) - GenerateModel("matmul_integer_to_float_uint8_bias.onnx", sign_i=False, sign_w=False, output_type_fp16=False, has_zp=False, bias=True) + GenerateModel( + "matmul_integer_to_float_int8_bias.onnx", + sign_i=False, + sign_w=True, + output_type_fp16=False, + has_zp=False, + bias=True, + ) + GenerateModel( + "matmul_integer_to_float_uint8_bias.onnx", + sign_i=False, + sign_w=False, + output_type_fp16=False, + has_zp=False, + bias=True, + ) GenerateModel("matmul_integer_to_float_int8_int8.onnx", sign_i=True, sign_w=True, output_type_fp16=False) - GenerateModel("matmul_integer_to_float_int8_int8_bias.onnx", sign_i=True, sign_w=True, output_type_fp16=False, has_zp=False, bias=True) + GenerateModel( + "matmul_integer_to_float_int8_int8_bias.onnx", + sign_i=True, + sign_w=True, + output_type_fp16=False, + has_zp=False, + bias=True, + ) diff --git a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py index 60bdd92dc9c93..018e5fb332dd0 100644 --- a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py +++ b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py @@ -104,4 +104,4 @@ def GenerateModel(model_name): # noqa: N802 if __name__ == "__main__": - GenerateModel("matmul_integer_to_float.onnx") \ No newline at end of file + GenerateModel("matmul_integer_to_float.onnx") From 795241ceb9146f1e7f303c19de3d82516593b295 Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Fri, 23 Feb 2024 12:08:03 -0800 Subject: [PATCH 06/10] adding back 120 character --- .../test/contrib_ops/matmul_integer_to_float_test.cc | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc index ed1911be4cf77..c7f2ec89fb817 100644 --- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc @@ -39,12 +39,8 @@ static void CalculateMatMulIntegerToFloat(const int64_t M, const int64_t N, cons for (int64_t n = 0; n < N; n++) { float sum = 0.0f; for (int64_t k = 0; k < K; k++) { - float A_dequantized = has_zp ? - (static_cast(A_data[m * K + k]) - static_cast(A_zero_point[0])) * A_scale[0] : - A_data[m * K + k] * A_scale[0]; - float B_dequantized = has_zp ? - (static_cast(B_data[k * N + n]) - static_cast(B_zero_point[n])) * B_scale[n] : - B_data[k * N + n] * B_scale[n]; + float A_dequantized = has_zp ? (static_cast(A_data[m * K + k]) - static_cast(A_zero_point[0])) * A_scale[0] : A_data[m * K + k] * A_scale[0]; + float B_dequantized = has_zp ? (static_cast(B_data[k * N + n]) - static_cast(B_zero_point[n])) * B_scale[n] : B_data[k * N + n] * B_scale[n]; sum += A_dequantized * B_dequantized; } @@ -81,8 +77,7 @@ void TestMatMulIntegerToFloat(bool is_matrix_b_constant, std::vector tmp_B_data; tmp_B_data = random.Uniform(B_dims, - (constexpr(std::is_same_v)) ? - std::numeric_limits::lowest() / 2 :std::numeric_limits::lowest(), + (constexpr(std::is_same_v)) ? 
std::numeric_limits::lowest() / 2 : std::numeric_limits::lowest(), std::numeric_limits::max() / 2); std::transform(tmp_B_data.begin(), tmp_B_data.end(), std::back_inserter(B_data), [](int32_t v) -> WType { return static_cast(v); From 6fe223c94fef839521fdcece51c137e6074d992e Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Tue, 27 Feb 2024 12:10:47 -0800 Subject: [PATCH 07/10] Linx Build fix --- .../src/External/DirectMLHelpers/ApiTraits.h | 13 ------------- .../External/DirectMLHelpers/DirectMLSchema.h | 18 ------------------ .../DirectMLHelpers/GeneratedSchemaHelpers.h | 18 ------------------ .../matmul_integer_to_float_test.cc | 10 +++++----- 4 files changed, 5 insertions(+), 54 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h index 176fb2dfaa1e8..7c25755a7d09e 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h @@ -1047,12 +1047,6 @@ struct OperatorDescTraits static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING; }; -template <> -struct OperatorDescTraits -{ - static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT; -}; - template <> struct OperatorDescTraits { @@ -2227,11 +2221,6 @@ struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_ACTIVATION_SWISH> { using DescType = DML_ACTIVATION_SWISH_OPERATOR_DESC; }; -template <> -struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT> -{ - using DescType = DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC; -}; template <> struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_ACTIVATION_HARD_SWISH> @@ -2594,8 +2583,6 @@ auto OperatorTypeVisitor(DML_OPERATOR_TYPE type, Visitor&& visitor, Ts&&... 
args return std::invoke(std::forward(visitor), DML_ACTIVATION_SWISH_OPERATOR_DESC{}, std::forward(args)...); case DML_OPERATOR_ACTIVATION_HARD_SWISH: return std::invoke(std::forward(visitor), DML_ACTIVATION_HARD_SWISH_OPERATOR_DESC{}, std::forward(args)...); - case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT: - return std::invoke(std::forward(visitor), DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC{}, std::forward(args)...); default: ORT_THROW_HR(E_INVALIDARG); return std::invoke(std::forward(visitor), DML_ACTIVATION_RELU_OPERATOR_DESC{}, std::forward(args)...); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h index ae4a02469e68e..da57c2aa235fd 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h @@ -2414,24 +2414,6 @@ constexpr DML_OPERATOR_SCHEMA DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHE DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA_FIELDS, }; -constexpr DML_SCHEMA_FIELD DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS[8] { - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ATensor", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "AScaleTensor", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "AZeroPointTensor", true }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BTensor", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BScaleTensor", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BZeroPointTensor", true }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BiasTensor", true }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, -}; - -constexpr DML_OPERATOR_SCHEMA DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA { - "DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT", - DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT, - DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, - 8, - DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS, -}; constexpr DML_SCHEMA_FIELD DML_ACTIVATION_ELU_OPERATOR_SCHEMA_FIELDS[3] { DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h index 3dee8fe5649ea..86c66d8cca26c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h @@ -1500,19 +1500,6 @@ inline std::vector GetFields(const DML_QUANTIZED_LINEAR_AVERAGE_P OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[12], ToOperatorFieldType(static_cast(desc.IncludePadding))), 
}; } -inline std::vector GetFields(const DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC& desc) -{ - return { - OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[0], ToOperatorFieldType(static_cast(desc.ATensor))), - OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[1], ToOperatorFieldType(static_cast(desc.AScaleTensor))), - OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[2], ToOperatorFieldType(static_cast(desc.AZeroPointTensor))), - OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[3], ToOperatorFieldType(static_cast(desc.BTensor))), - OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[4], ToOperatorFieldType(static_cast(desc.BScaleTensor))), - OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[5], ToOperatorFieldType(static_cast(desc.BZeroPointTensor))), - OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[6], ToOperatorFieldType(static_cast(desc.BiasTensor))), - OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[7], ToOperatorFieldType(static_cast(desc.OutputTensor))), - }; -} inline std::vector GetFields(const DML_ACTIVATION_ELU_OPERATOR_DESC& desc) { return { @@ -1870,7 +1857,6 @@ inline const DML_OPERATOR_SCHEMA& GetSchema(DML_OPERATOR_TYPE operatorType) case DML_OPERATOR_DIAGONAL_MATRIX1: return DML_DIAGONAL_MATRIX1_OPERATOR_SCHEMA; case DML_OPERATOR_MULTIHEAD_ATTENTION: return DML_MULTIHEAD_ATTENTION_OPERATOR_SCHEMA; case DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING: return DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA; - case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT: return DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA; case DML_OPERATOR_ACTIVATION_ELU: return DML_ACTIVATION_ELU_OPERATOR_SCHEMA; case DML_OPERATOR_ACTIVATION_CELU: return DML_ACTIVATION_CELU_OPERATOR_SCHEMA; case DML_OPERATOR_ACTIVATION_HARDMAX: return DML_ACTIVATION_HARDMAX_OPERATOR_SCHEMA; @@ -2486,10 +2472,6 @@ inline AbstractOperatorDesc ConvertOperatorDesc(const DML_OPERATOR_DESC& opDesc) return AbstractOperatorDesc( &DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA, GetFields(*static_cast(opDesc.Desc))); - case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT: - return AbstractOperatorDesc( - &DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA, - GetFields(*static_cast(opDesc.Desc))); case DML_OPERATOR_ACTIVATION_ELU: return AbstractOperatorDesc( &DML_ACTIVATION_ELU_OPERATOR_SCHEMA, diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc index c7f2ec89fb817..eaa3b718cd180 100644 --- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc @@ -77,7 +77,7 @@ void TestMatMulIntegerToFloat(bool is_matrix_b_constant, std::vector tmp_B_data; tmp_B_data = random.Uniform(B_dims, - (constexpr(std::is_same_v)) ? std::numeric_limits::lowest() / 2 : std::numeric_limits::lowest(), + std::is_signed::value ? 
std::numeric_limits::lowest() / 2 : std::numeric_limits::lowest(), std::numeric_limits::max() / 2); std::transform(tmp_B_data.begin(), tmp_B_data.end(), std::back_inserter(B_data), [](int32_t v) -> WType { return static_cast(v); @@ -133,10 +133,10 @@ void TestMatMulIntegerToFloat(bool is_matrix_b_constant, } // Only DML EP supports these data type combinations for now - if ((constexpr(std::is_same_v)) || - (constexpr(std::is_same_v) && - constexpr(std::is_same_v) && - constexpr(std::is_same_v))) { + if (std::is_same_v || + (std::is_same_v && + std::is_same_v && + std::is_same_v)) { std::vector> execution_providers; execution_providers.push_back(DefaultDmlExecutionProvider()); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); From a4c81589994674783f4cc4f20b6722f5ea8c345e Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Fri, 1 Mar 2024 12:15:59 -0800 Subject: [PATCH 08/10] update constexpr Linix build error --- onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc index eaa3b718cd180..899ffa6bd5859 100644 --- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc @@ -125,7 +125,7 @@ void TestMatMulIntegerToFloat(bool is_matrix_b_constant, B_data, B_scale, B_zero_point, Bias, Y_data, per_column, has_zp, has_bias); - if (constexpr(std::is_same_v)) { + if (std::is_same_v) { test.AddOutput("Y", {M, N}, Y_data); } else { test.AddOutput("Y", {M, N}, ToFloat16(Y_data)); From 577706ffc42dcc432de5086751a1c7537d0ac86d Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Fri, 1 Mar 2024 14:46:13 -0800 Subject: [PATCH 09/10] Update tolerance --- onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc index 899ffa6bd5859..0183887adf104 100644 --- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc @@ -126,7 +126,7 @@ void TestMatMulIntegerToFloat(bool is_matrix_b_constant, per_column, has_zp, has_bias); if (std::is_same_v) { - test.AddOutput("Y", {M, N}, Y_data); + test.AddOutput("Y", {M, N}, Y_data, 0.02f); } else { test.AddOutput("Y", {M, N}, ToFloat16(Y_data)); test.SetOutputAbsErr("Y", 0.5f); From 66c21b2e984791bf67daf112eba5f9ae8aefbe7d Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Mon, 4 Mar 2024 09:04:25 -0800 Subject: [PATCH 10/10] Increase tolerance for CPU --- onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc index 0183887adf104..6f3ca7e239671 100644 --- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc @@ -126,7 +126,8 @@ void TestMatMulIntegerToFloat(bool is_matrix_b_constant, per_column, has_zp, has_bias); if (std::is_same_v) { - test.AddOutput("Y", {M, N}, Y_data, 0.02f); + test.AddOutput("Y", {M, N}, Y_data); + test.SetOutputRelErr("Y", 0.02f); } else { test.AddOutput("Y", {M, N}, ToFloat16(Y_data)); test.SetOutputAbsErr("Y", 0.5f);
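For readers tracking the tolerance changes in the last two commits, the reference the tests compare against is essentially a dequantize-then-matmul carried out in float. The sketch below is a standalone, non-templated rendering of that math; the name MatMulIntegerToFloatRef, the fixed uint8/int8 input types, and the per-column B quantization are illustrative choices here, whereas the actual test helper, CalculateMatMulIntegerToFloat, is templated over the input, weight, and output types.

// Simplified reference for the fused MatMulIntegerToFloat computation exercised above:
// dequantize A and B, accumulate in float, then add the optional bias.
#include <cstdint>
#include <vector>

// A: uint8 [M,K] with per-tensor scale/zero-point; B: int8 [K,N] with per-column
// scale/zero-point; bias: float [N] or empty.
std::vector<float> MatMulIntegerToFloatRef(int64_t M, int64_t N, int64_t K,
                                           const std::vector<uint8_t>& A, float a_scale, uint8_t a_zp,
                                           const std::vector<int8_t>& B, const std::vector<float>& b_scale,
                                           const std::vector<int8_t>& b_zp, const std::vector<float>& bias) {
  std::vector<float> Y(static_cast<size_t>(M * N), 0.0f);
  for (int64_t m = 0; m < M; ++m) {
    for (int64_t n = 0; n < N; ++n) {
      float sum = 0.0f;
      for (int64_t k = 0; k < K; ++k) {
        // Dequantize one element of A (per-tensor) and of B (per-column), then accumulate.
        const float a = (static_cast<int32_t>(A[m * K + k]) - static_cast<int32_t>(a_zp)) * a_scale;
        const float b = (static_cast<int32_t>(B[k * N + n]) - static_cast<int32_t>(b_zp[n])) * b_scale[n];
        sum += a * b;
      }
      Y[m * N + n] = sum + (bias.empty() ? 0.0f : bias[n]);
    }
  }
  return Y;
}

Seen this way, the tolerance settings are unsurprising: the EP kernels presumably accumulate along integer or FP16 paths rather than in this float-by-float order, so float32 outputs are checked with a 2% relative error while FP16 outputs use a 0.5 absolute error.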