diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index a02aeb5236881..d7f4a0675e118 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -892,6 +892,8 @@ if (MSVC) set_property(SOURCE "${TEST_SRC_DIR}/optimizer/graph_transform_test.cc" "${TEST_SRC_DIR}/optimizer/qdq_transformer_test.cc" APPEND PROPERTY COMPILE_OPTIONS "/bigobj") + set_property(SOURCE "${TEST_SRC_DIR}/optimizer/qdq_transformer_test.cc" + APPEND PROPERTY COMPILE_OPTIONS "/bigobj") else() target_compile_options(onnxruntime_test_all PRIVATE "-Wno-parentheses") endif() diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc index 379d271fbdca7..f1b30da01f907 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc @@ -72,16 +72,25 @@ void DropQDQNodesRules(SelectorActionRegistry& qdq_selector_action_registry) { std::unique_ptr selector_no_16bit_and_positive_scale = std::make_unique(false, true, false); qdq_selector_action_registry.RegisterSelectorAndAction(drop_action_no_int16_and_positive_scale_name, - {{"MaxPool", {12}}}, + {{"MaxPool", {12}}, + {"ReduceMax", {}}, + {"ReduceMin", {}}}, std::move(selector_no_16bit_and_positive_scale), std::move(drop_action_no_int16_and_positive_scale)); std::unique_ptr selector = std::make_unique(true); + // DepthToSpace and SpaceToDepth not included because there are no integer implementations. 
+ // https://github.com/microsoft/onnxruntime/issues/21287 qdq_selector_action_registry.RegisterSelectorAndAction(drop_action_name, - {{"Gather", {}}, + {{"Expand", {}}, + {"Flatten", {}}, + {"Gather", {}}, + {"GatherElements", {}}, {"Reshape", {}}, - {"Transpose", {}}, + {"Slice", {}}, {"Squeeze", {}}, + {"Tile", {}}, + {"Transpose", {}}, {"Unsqueeze", {}}}, std::move(selector), std::move(drop_action)); diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc index a043d6553bdfd..d07977d4b97b8 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc @@ -1087,6 +1087,297 @@ TEST(QDQTransformerTests, UnsqueezeDropQDQ) { RunSqueezeUnsqueezeDropQDQTestCase("Unsqueeze", {1, 3, 2, 2}, {0}, false, 21); } +// Runs a test case that checks if Q/DQ nodes are dropped from DQ -> Flatten -> Q. +template +static void RunFlattenDropQDQTestCase(const std::vector& input_shape, + int64_t axis = 1, + bool use_contrib_qdq = false, + int opset = 21) { + auto build_test_case = [input_shape, axis, use_contrib_qdq](ModelTestBuilder& builder) { + constexpr QuantType qmin = std::numeric_limits::min(); + constexpr QuantType qmax = std::numeric_limits::max(); + + auto* input_arg = builder.MakeInput(input_shape, qmin, qmax); + auto* output_arg = builder.MakeOutput(); + QuantType zero_point = 1 + (qmax + qmin) / 2; + + auto* input_arg_dq = builder.MakeIntermediate(); + auto* flatten_output = builder.MakeIntermediate(); + builder.AddDequantizeLinearNode(input_arg, .003f, zero_point, input_arg_dq, use_contrib_qdq); + Node& flatten_node = builder.AddNode("Flatten", {input_arg_dq}, {flatten_output}); + flatten_node.AddAttribute("axis", axis); + + // add Q + builder.AddQuantizeLinearNode(flatten_output, .003f, zero_point, output_arg, use_contrib_qdq); + }; + + auto check_graph = [use_contrib_qdq](InferenceSessionWrapper& session) { + auto op_to_count = 
CountOpsInGraph(session.GetGraph()); + const QDQOpKeys qdq_keys = GetQDQOpKeys(use_contrib_qdq); + EXPECT_EQ(op_to_count["Flatten"], 1); + EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 0); + EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 0); + }; + + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2, opset); +} + +// Checks that Q/DQ nodes are dropped from DQ -> Flatten -> Q. Uses 8-bit and 16-bit Q/DQ ops. +TEST(QDQTransformerTests, FlattenDropQDQ) { + for (int64_t axis : {0, 1, 3}) { + RunFlattenDropQDQTestCase({1, 3, 2, 2}, axis); + RunFlattenDropQDQTestCase({1, 3, 2, 2}, axis, true, 13); // Use com.microsoft QDQ ops + RunFlattenDropQDQTestCase({1, 3, 2, 2}, axis, true, 13); // Use int16 com.microsoft QDQ ops + RunFlattenDropQDQTestCase({1, 3, 2, 2}, axis, true, 13); // Use int16 com.microsoft QDQ ops + RunFlattenDropQDQTestCase({1, 3, 2, 2}, axis, false); // Use int16 ONNX QDQ ops + RunFlattenDropQDQTestCase({1, 3, 2, 2}, axis, false); // Use int16 ONNX QDQ ops + } +} + +// Runs a test case that checks if Q/DQ nodes are dropped from DQ -> Expand -> Q.
+template +static void RunExpandDropQDQTestCase(const std::vector& input_shape, + const std::vector& expanded_shape, + bool use_contrib_qdq = false, + int opset = 21) { + auto build_test_case = [input_shape, expanded_shape, use_contrib_qdq](ModelTestBuilder& builder) { + constexpr QuantType qmin = std::numeric_limits::min(); + constexpr QuantType qmax = std::numeric_limits::max(); + + auto* input_arg = builder.MakeInput(input_shape, qmin, qmax); + auto* output_arg = builder.MakeOutput(); + QuantType zero_point = 1 + (qmax + qmin) / 2; + + auto* input_arg_dq = builder.MakeIntermediate(); + auto* expanded_shape_arg = builder.Make1DInitializer(expanded_shape); + auto* expand_output = builder.MakeIntermediate(); + builder.AddDequantizeLinearNode(input_arg, .003f, zero_point, input_arg_dq, use_contrib_qdq); + builder.AddNode("Expand", {input_arg_dq, expanded_shape_arg}, {expand_output}); + + // add Q + builder.AddQuantizeLinearNode(expand_output, .003f, zero_point, output_arg, use_contrib_qdq); + }; + + auto check_graph = [use_contrib_qdq](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + const QDQOpKeys qdq_keys = GetQDQOpKeys(use_contrib_qdq); + EXPECT_EQ(op_to_count["Expand"], 1); + EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 0); + EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 0); + }; + + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2, opset); +} + +// Checks that Q/DQ nodes are dropped from DQ -> Expand -> Q. Uses 8-bit and 16-bit Q/DQ ops. 
+TEST(QDQTransformerTests, ExpandDropQDQ) { + RunExpandDropQDQTestCase({1, 3, 1, 1}, {1, 3, 7, 13}); + RunExpandDropQDQTestCase({1, 3, 1, 1}, {1, 3, 7, 13}, true, 13); // Use com.microsoft QDQ ops + RunExpandDropQDQTestCase({1, 3, 1, 1}, {1, 3, 7, 13}, true, 13); // Use int16 com.microsoft QDQ ops + RunExpandDropQDQTestCase({1, 3, 1, 1}, {1, 3, 7, 13}, true, 13); // Use int16 com.microsoft QDQ ops + RunExpandDropQDQTestCase({1, 3, 1, 1}, {1, 3, 7, 13}, false); // Use int16 ONNX QDQ ops + RunExpandDropQDQTestCase({1, 3, 1, 1}, {1, 3, 7, 13}, false); // Use int16 ONNX QDQ ops +} + +// Runs a test case that checks if Q/DQ nodes are dropped from DQ -> Tile -> Q. +template +static void RunTileDropQDQTestCase(const std::vector& input_shape, + const std::vector& repeats, + bool use_contrib_qdq = false, + int opset = 21) { + auto build_test_case = [input_shape, repeats, use_contrib_qdq](ModelTestBuilder& builder) { + constexpr QuantType qmin = std::numeric_limits::min(); + constexpr QuantType qmax = std::numeric_limits::max(); + + auto* input_arg = builder.MakeInput(input_shape, qmin, qmax); + auto* output_arg = builder.MakeOutput(); + QuantType zero_point = 1 + (qmax + qmin) / 2; + + auto* input_arg_dq = builder.MakeIntermediate(); + auto* repeats_arg = builder.Make1DInitializer(repeats); + auto* tile_output = builder.MakeIntermediate(); + builder.AddDequantizeLinearNode(input_arg, .003f, zero_point, input_arg_dq, use_contrib_qdq); + builder.AddNode("Tile", {input_arg_dq, repeats_arg}, {tile_output}); + + // add Q + builder.AddQuantizeLinearNode(tile_output, .003f, zero_point, output_arg, use_contrib_qdq); + }; + + auto check_graph = [use_contrib_qdq](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + const QDQOpKeys qdq_keys = GetQDQOpKeys(use_contrib_qdq); + EXPECT_EQ(op_to_count["Tile"], 1); + EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 0); + EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 0); + }; + + 
TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2, opset); +} + +// Checks that Q/DQ nodes are dropped from DQ -> Tile -> Q. Uses 8-bit and 16-bit Q/DQ ops. +TEST(QDQTransformerTests, TileDropQDQ) { + RunTileDropQDQTestCase({1, 3, 2, 2}, {1, 1, 3, 3}); + RunTileDropQDQTestCase({1, 3, 2, 2}, {1, 1, 3, 3}, true, 13); // Use com.microsoft QDQ ops + RunTileDropQDQTestCase({1, 3, 2, 2}, {1, 1, 3, 3}, true, 13); // Use int16 com.microsoft QDQ ops + RunTileDropQDQTestCase({1, 3, 2, 2}, {1, 1, 3, 3}, true, 13); // Use int16 com.microsoft QDQ ops + RunTileDropQDQTestCase({1, 3, 2, 2}, {1, 1, 3, 3}, false); // Use int16 ONNX QDQ ops + RunTileDropQDQTestCase({1, 3, 2, 2}, {1, 1, 3, 3}, false); // Use int16 ONNX QDQ ops +} + +// Runs a test case that checks if Q/DQ nodes are dropped from DQ -> Slice -> Q. +template +static void RunSliceDropQDQTestCase(const std::vector& input_shape, + const std::vector& starts, + const std::vector& ends, + bool use_contrib_qdq = false, + int opset = 21) { + auto build_test_case = [input_shape, starts, ends, use_contrib_qdq](ModelTestBuilder& builder) { + constexpr QuantType qmin = std::numeric_limits::min(); + constexpr QuantType qmax = std::numeric_limits::max(); + + auto* input_arg = builder.MakeInput(input_shape, qmin, qmax); + auto* output_arg = builder.MakeOutput(); + QuantType zero_point = 1 + (qmax + qmin) / 2; + + auto* input_arg_dq = builder.MakeIntermediate(); + auto* starts_arg = builder.Make1DInitializer(starts); + auto* ends_arg = builder.Make1DInitializer(ends); + auto* slice_output = builder.MakeIntermediate(); + builder.AddDequantizeLinearNode(input_arg, .003f, zero_point, input_arg_dq, use_contrib_qdq); + builder.AddNode("Slice", {input_arg_dq, starts_arg, ends_arg}, {slice_output}); + + // add Q + builder.AddQuantizeLinearNode(slice_output, .003f, zero_point, output_arg, use_contrib_qdq); + }; + + auto check_graph = [use_contrib_qdq](InferenceSessionWrapper& session) { + auto 
op_to_count = CountOpsInGraph(session.GetGraph()); + const QDQOpKeys qdq_keys = GetQDQOpKeys(use_contrib_qdq); + EXPECT_EQ(op_to_count["Slice"], 1); + EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 0); + EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 0); + }; + + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2, opset); +} + +// Checks that Q/DQ nodes are dropped from DQ -> Slice -> Q. Uses 8-bit and 16-bit Q/DQ ops. +TEST(QDQTransformerTests, SliceDropQDQ) { + RunSliceDropQDQTestCase({1, 3, 5, 5}, {0, 1, 1, 1}, {1, 3, 4, 4}); + RunSliceDropQDQTestCase({1, 3, 5, 5}, {0, 1, 1, 1}, {1, 3, 4, 4}, true, 13); // Use com.microsoft QDQ ops + // Use int16 com.microsoft QDQ ops + RunSliceDropQDQTestCase({1, 3, 5, 5}, {0, 1, 1, 1}, {1, 3, 4, 4}, true, 13); + // Use int16 com.microsoft QDQ ops + RunSliceDropQDQTestCase({1, 3, 5, 5}, {0, 1, 1, 1}, {1, 3, 4, 4}, true, 13); + RunSliceDropQDQTestCase({1, 3, 5, 5}, {0, 1, 1, 1}, {1, 3, 4, 4}, false); // Use int16 ONNX QDQ ops + RunSliceDropQDQTestCase({1, 3, 5, 5}, {0, 1, 1, 1}, {1, 3, 4, 4}, false); // Use int16 ONNX QDQ ops +} + +// Runs a test case that checks if Q/DQ nodes are dropped from DQ -> GatherElements -> Q. 
+template +static void RunGatherElementsDropQDQTestCase(const std::vector& input_shape, + const std::vector& indices_shape, + const std::vector& indices_data, + bool use_contrib_qdq = false, + int opset = 21) { + auto build_test_case = [input_shape, indices_shape, indices_data, use_contrib_qdq](ModelTestBuilder& builder) { + constexpr QuantType qmin = std::numeric_limits::min(); + constexpr QuantType qmax = std::numeric_limits::max(); + + auto* input_arg = builder.MakeInput(input_shape, qmin, qmax); + auto* indices_arg = builder.MakeInitializer(indices_shape, indices_data); + auto* output_arg = builder.MakeOutput(); + QuantType zero_point = 1 + (qmax + qmin) / 2; + + auto* input_arg_dq = builder.MakeIntermediate(); + auto* gather_elements_output = builder.MakeIntermediate(); + builder.AddDequantizeLinearNode(input_arg, .003f, zero_point, input_arg_dq, use_contrib_qdq); + builder.AddNode("GatherElements", {input_arg_dq, indices_arg}, {gather_elements_output}); + + // add Q + builder.AddQuantizeLinearNode(gather_elements_output, .003f, zero_point, output_arg, use_contrib_qdq); + }; + + auto check_graph = [use_contrib_qdq](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + const QDQOpKeys qdq_keys = GetQDQOpKeys(use_contrib_qdq); + EXPECT_EQ(op_to_count["GatherElements"], 1); + EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 0); + EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 0); + }; + + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2, opset); +} + +// Checks that Q/DQ nodes are dropped from DQ -> GatherElements -> Q. Uses 8-bit and 16-bit Q/DQ ops. 
+TEST(QDQTransformerTests, GatherElementsDropQDQ) { + RunGatherElementsDropQDQTestCase({3, 3}, {2, 3}, {1, 2, 0, 2, 0, 0}); + // Use com.microsoft QDQ ops + RunGatherElementsDropQDQTestCase({3, 3}, {2, 3}, {1, 2, 0, 2, 0, 0}, true, 13); + // Use int16 com.microsoft QDQ ops + RunGatherElementsDropQDQTestCase({3, 3}, {2, 3}, {1, 2, 0, 2, 0, 0}, true, 13); + // Use int16 com.microsoft QDQ ops + RunGatherElementsDropQDQTestCase({3, 3}, {2, 3}, {1, 2, 0, 2, 0, 0}, true, 13); + RunGatherElementsDropQDQTestCase({3, 3}, {2, 3}, {1, 2, 0, 2, 0, 0}, false); // Use int16 ONNX QDQ ops + RunGatherElementsDropQDQTestCase({3, 3}, {2, 3}, {1, 2, 0, 2, 0, 0}, false); // Use int16 ONNX QDQ ops +} + +// Runs a test case that checks whether Q/DQ nodes are dropped from DQ -> Reduce(Min|Max) -> Q. +template +static void RunReduceExtremumDropQDQTestCase(const std::string& op_type, + const std::vector& input_shape, + float qscale, + bool expect_drop_qdq, + bool use_contrib_qdq = false, + int opset = 21) { + auto build_test_case = [op_type, input_shape, qscale, use_contrib_qdq](ModelTestBuilder& builder) { + constexpr QuantType qmin = std::numeric_limits::min(); + constexpr QuantType qmax = std::numeric_limits::max(); + + auto* input_arg = builder.MakeInput(input_shape, qmin, qmax); + auto* output_arg = builder.MakeOutput(); + QuantType zero_point = 1 + (qmax + qmin) / 2; + + auto* input_arg_dq = builder.MakeIntermediate(); + auto* reduce_output = builder.MakeIntermediate(); + builder.AddDequantizeLinearNode(input_arg, qscale, zero_point, input_arg_dq, use_contrib_qdq); + builder.AddNode(op_type, {input_arg_dq}, {reduce_output}); + + // add Q + builder.AddQuantizeLinearNode(reduce_output, qscale, zero_point, output_arg, use_contrib_qdq); + }; + + auto check_graph = [op_type, expect_drop_qdq, use_contrib_qdq](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + const QDQOpKeys qdq_keys = GetQDQOpKeys(use_contrib_qdq); + EXPECT_EQ(op_to_count[op_type], 1); + if
(expect_drop_qdq) { + EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 0); + EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 0); + } else { + EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 1); + EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 1); + } + }; + + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2, opset); +} + +// Checks whether Q/DQ nodes are dropped from DQ -> Reduce(Min|Max) -> Q. Uses 8-bit and 16-bit Q/DQ ops. +TEST(QDQTransformerTests, ReduceExtremumDropQDQ) { + // Check that Q/DQ nodes are dropped for positive scale + RunReduceExtremumDropQDQTestCase("ReduceMin", {3, 3}, 0.003f, true); + RunReduceExtremumDropQDQTestCase("ReduceMin", {3, 3}, 0.003f, true, true, 13); // Use com.microsoft QDQ ops + RunReduceExtremumDropQDQTestCase("ReduceMax", {3, 3}, 0.003f, true); + RunReduceExtremumDropQDQTestCase("ReduceMax", {3, 3}, 0.003f, true, true, 13); // Use com.microsoft QDQ ops + + // Check that Q/DQ nodes are *not* dropped for negative scale + RunReduceExtremumDropQDQTestCase("ReduceMin", {3, 3}, -0.003f, false); + RunReduceExtremumDropQDQTestCase("ReduceMin", {3, 3}, -0.003f, false, true, 13); // Use com.microsoft QDQ ops + RunReduceExtremumDropQDQTestCase("ReduceMax", {3, 3}, -0.003f, false); + RunReduceExtremumDropQDQTestCase("ReduceMax", {3, 3}, -0.003f, false, true, 13); // Use com.microsoft QDQ ops +} + TEST(QDQTransformerTests, DoubleQDQ) { constexpr uint8_t good_u8_1 = 80; constexpr uint8_t good_u8_2 = 40;