diff --git a/onnxruntime/core/optimizer/matmul_integer_to_float.cc b/onnxruntime/core/optimizer/matmul_integer_to_float.cc
index 4fee1a6ce224e..b619efb2f751e 100644
--- a/onnxruntime/core/optimizer/matmul_integer_to_float.cc
+++ b/onnxruntime/core/optimizer/matmul_integer_to_float.cc
@@ -49,6 +49,49 @@ bool HasElementDataType(const NodeArg& node_arg, int32_t data_type) {
   return data_type == actual_data_type;
 }
 
+// Returns the total number of elements described by the given shape.
+static uint64_t NumElements(const TensorShapeProto* tensor_shape) {
+  if (nullptr == tensor_shape || tensor_shape->dim_size() < 1) {
+    return 0;
+  }
+  uint64_t num_elements = 1;
+
+  for (int i = 0; i < tensor_shape->dim_size(); i++) {
+    num_elements *= tensor_shape->dim(i).dim_value();
+  }
+  return num_elements;
+}
+
+bool CheckMatMulLargeTensors(const Node& matmulinteger_node, const Node& cast_node) {
+  const auto a_def = matmulinteger_node.InputDefs()[0];
+  const auto b_def = matmulinteger_node.InputDefs()[1];
+  const int a_dim_size = a_def->Shape()->dim_size();
+  const int b_dim_size = b_def->Shape()->dim_size();
+  uint64_t a_num_elements = NumElements(a_def->Shape());
+  uint64_t b_num_elements = NumElements(b_def->Shape());
+
+  if (a_dim_size != b_dim_size) {
+    bool a_is_broadcasted = a_dim_size < b_dim_size;
+    if (a_is_broadcasted) {
+      for (int i = 0; i < b_dim_size - a_dim_size; i++) {
+        a_num_elements *= b_def->Shape()->dim(i).dim_value();
+      }
+    } else {
+      for (int i = 0; i < a_dim_size - b_dim_size; i++) {
+        b_num_elements *= a_def->Shape()->dim(i).dim_value();
+      }
+    }
+  }
+
+  int output_data_type = HasElementDataType(*cast_node.OutputDefs()[0], ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) ? 2 : 4;  // bytes per output element
+  uint64_t total_bytes = (a_num_elements + b_num_elements) * output_data_type;
+
+  if (total_bytes > UINT32_MAX) {
+    return true;
+  }
+  return false;
+}
+
 /**
 MatMulIntegerToFloatFusion will fuse subgraph like below into MatMulIntegerToFloat:
 
@@ -114,6 +157,17 @@ Status MatMulIntegerToFloatFusion::ApplyImpl(Graph& graph, bool& modified, int g
       continue;
     }
 
+    const Node* p_dynamicquantize_node = graph_utils::FirstParentByType(*p_matmulinteger_node, "DynamicQuantizeLinear");
+
+    // Check whether the MatMulInteger node's input comes from DynamicQuantizeLinear.
+    // For large tensors, keeping DynamicQuantizeLinear -> MatMulInteger is more resource
+    // efficient, and DML has better MatMulInteger metacommand coverage.
+    if (is_dml_ep && p_dynamicquantize_node) {
+      if (CheckMatMulLargeTensors(matmulinteger_node, cast_node)) {
+        continue;
+      }
+    }
+
     // Find bias node
     Node* p_add_node = nullptr;
     if (optimizer_utils::CheckOutputEdges(graph, mul_node, 1)) {
diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc
index 3aec0d5a67e94..ee3a1baade005 100755
--- a/onnxruntime/test/optimizer/graph_transform_test.cc
+++ b/onnxruntime/test/optimizer/graph_transform_test.cc
@@ -5859,6 +5859,22 @@ TEST_F(GraphTransformationTests, MatMulIntegerToFloat16Test) {
   std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
   EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 1);
 }
+
+TEST_F(GraphTransformationTests, MatMulIntegerToFloatLargeTensorTest) {
+  constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/matmul_integer_to_float_large_tensor.onnx";
+  std::shared_ptr<Model> p_model;
+  ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_));
+  Graph& graph = p_model->MainGraph();
+
+  for (auto& node : graph.Nodes()) {
+    node.SetExecutionProviderType(kDmlExecutionProvider);
+  }
+  onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
+  ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::make_unique<MatMulIntegerToFloatFusion>(), TransformerLevel::Level2));
+  ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, *logger_));
+  std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
+  EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 0);
+}
 #endif  // USE_DML
 #endif
diff --git a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float_large_tensor.onnx b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float_large_tensor.onnx
new file mode 100644
index 0000000000000..2521a89b7bb56
Binary files /dev/null and b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float_large_tensor.onnx differ (serialized ONNX model produced by matmul_integer_to_float_large_tensor.py below)
diff --git a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float_large_tensor.py b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float_large_tensor.py
new file mode 100644
index 0000000000000..543517cc015ef
--- /dev/null
+++ b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float_large_tensor.py
@@ -0,0 +1,49 @@
+from enum import Enum  # noqa: F401
+
+import onnx
+from onnx import TensorProto, helper
+
+
+def GenerateModel(model_name):  # noqa: N802
+    inputs = []
+    outputs = []
+    initializers = []
+    nodes = []
+
+    inputs.append(helper.make_tensor_value_info("inputA", TensorProto.FLOAT, [16, 32, 1280, 1280]))
+    inputs.append(helper.make_tensor_value_info("inputB", TensorProto.INT8, [1280, 1280]))
+    inputs.append(helper.make_tensor_value_info("inputBZP", TensorProto.INT8, [1]))
+    inputs.append(helper.make_tensor_value_info("inputBScale", TensorProto.FLOAT, [1]))
+
+    nodes = [  # construct graph
+        helper.make_node(
+            "DynamicQuantizeLinear",
+            ["inputA"],
+            ["a_quantized", "a_scale", "a_zp"],
+            "DynamicQuantizeLinear",
+        ),
+        helper.make_node(
+            "MatMulInteger",
+            ["a_quantized", "inputB", "a_zp", "inputBZP"],
+            ["matmulinteger_output"],
+            "MatMulInteger",
+        ),
+        helper.make_node("Mul", ["a_scale", "inputBScale"], ["mul_1"], "mul_right"),
+        helper.make_node("Cast", ["matmulinteger_output"], ["cast_output"], "cast", to=1),
+        helper.make_node("Mul", ["mul_1", "cast_output"], ["output"], "mul_bottom"),
+    ]
+
+    graph = helper.make_graph(
+        nodes,
+        "matmul_integer_to_float_large_tensor_fusion",  # name
+        inputs,
+        outputs,
+        initializers,
+    )
+
+    model = helper.make_model(graph)
+    onnx.save(model, model_name)
+
+
+if __name__ == "__main__":
+    GenerateModel("matmul_integer_to_float_large_tensor.onnx")
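
For context on why the new test expects zero com.microsoft.MatMulIntegerToFloat nodes: with the shapes used by matmul_integer_to_float_large_tensor.py, the byte estimate computed by CheckMatMulLargeTensors exceeds UINT32_MAX, so the DML path keeps DynamicQuantizeLinear -> MatMulInteger instead of fusing. The sketch below reproduces that arithmetic in Python; it is a minimal, illustrative re-implementation, and the helper names (num_elements, is_large_matmul) are invented for this example rather than taken from ONNX Runtime.

# Sketch of the CheckMatMulLargeTensors size estimate, applied to the test model's shapes.
# Helper names are illustrative only and do not exist in ONNX Runtime.
UINT32_MAX = 0xFFFFFFFF


def num_elements(shape):
    n = 1
    for dim in shape:
        n *= dim
    return n


def is_large_matmul(a_shape, b_shape, bytes_per_output_element):
    a_elems = num_elements(a_shape)
    b_elems = num_elements(b_shape)
    # Mirror the broadcast handling above: the lower-rank input is repeated
    # across the leading dimensions of the higher-rank input.
    if len(a_shape) < len(b_shape):
        a_elems *= num_elements(b_shape[: len(b_shape) - len(a_shape)])
    elif len(b_shape) < len(a_shape):
        b_elems *= num_elements(a_shape[: len(a_shape) - len(b_shape)])
    return (a_elems + b_elems) * bytes_per_output_element > UINT32_MAX


# inputA is [16, 32, 1280, 1280], inputB is [1280, 1280], and the Cast output is float32 (4 bytes):
# (838_860_800 + 838_860_800) * 4 = 6_710_886_400 bytes > UINT32_MAX, so the fusion is skipped.
print(is_large_matmul([16, 32, 1280, 1280], [1280, 1280], 4))  # True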
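
The generator script can be run directly to (re)create the test model. As a quick sanity check, the serialized graph can be loaded back and its node types listed; this is a minimal sketch assuming the onnx Python package is installed and the generator has already been run in the current directory:

import onnx

model = onnx.load("matmul_integer_to_float_large_tensor.onnx")
print([node.op_type for node in model.graph.node])
# Expected: ['DynamicQuantizeLinear', 'MatMulInteger', 'Mul', 'Cast', 'Mul']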