From 5d54dc1462f0c6cc94cecbe86a45962771ac5b86 Mon Sep 17 00:00:00 2001 From: mcollinswisc Date: Mon, 26 Aug 2024 23:54:37 -0700 Subject: [PATCH] Drop QDQ around more nodes (#21376) ### Description Extends the Drop QDQ optimization to remove DequantizeLinear and QuantizeLinear nodes from around operators: - Flatten - Expand - Tile - Slice - GatherElements - ReduceMin - ReduceMax ### Motivation and Context To reduce floating-point conversions in quantized inference. Mainly motivated by the Flatten case, since that will show up in graphs exported from PyTorch to ONNX. To make the change complete, it also extends the optimization to the larger set of ops for which it is valid. https://github.com/microsoft/onnxruntime/issues/21375 --------- Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> --- cmake/onnxruntime_unittests.cmake | 2 + .../qdq_selector_action_transformer.cc | 15 +- .../test/optimizer/qdq_transformer_test.cc | 291 ++++++++++++++++++ 3 files changed, 305 insertions(+), 3 deletions(-) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index a02aeb5236881..d7f4a0675e118 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -892,6 +892,8 @@ if (MSVC) set_property(SOURCE "${TEST_SRC_DIR}/optimizer/graph_transform_test.cc" "${TEST_SRC_DIR}/optimizer/qdq_transformer_test.cc" APPEND PROPERTY COMPILE_OPTIONS "/bigobj") + set_property(SOURCE "${TEST_SRC_DIR}/optimizer/qdq_transformer_test.cc" + APPEND PROPERTY COMPILE_OPTIONS "/bigobj") else() target_compile_options(onnxruntime_test_all PRIVATE "-Wno-parentheses") endif() diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc index 379d271fbdca7..f1b30da01f907 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc +++ 
b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc @@ -72,16 +72,25 @@ void DropQDQNodesRules(SelectorActionRegistry& qdq_selector_action_registry) { std::unique_ptr selector_no_16bit_and_positive_scale = std::make_unique(false, true, false); qdq_selector_action_registry.RegisterSelectorAndAction(drop_action_no_int16_and_positive_scale_name, - {{"MaxPool", {12}}}, + {{"MaxPool", {12}}, + {"ReduceMax", {}}, + {"ReduceMin", {}}}, std::move(selector_no_16bit_and_positive_scale), std::move(drop_action_no_int16_and_positive_scale)); std::unique_ptr selector = std::make_unique(true); + // DepthToSpace and SpaceToDepth not included because there are no integer implementations. + // https://github.com/microsoft/onnxruntime/issues/21287 qdq_selector_action_registry.RegisterSelectorAndAction(drop_action_name, - {{"Gather", {}}, + {{"Expand", {}}, + {"Flatten", {}}, + {"Gather", {}}, + {"GatherElements", {}}, {"Reshape", {}}, - {"Transpose", {}}, + {"Slice", {}}, {"Squeeze", {}}, + {"Tile", {}}, + {"Transpose", {}}, {"Unsqueeze", {}}}, std::move(selector), std::move(drop_action)); diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc index a043d6553bdfd..d07977d4b97b8 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc @@ -1087,6 +1087,297 @@ TEST(QDQTransformerTests, UnsqueezeDropQDQ) { RunSqueezeUnsqueezeDropQDQTestCase("Unsqueeze", {1, 3, 2, 2}, {0}, false, 21); } +// Runs a test case that checks if Q/DQ nodes are dropped from DQ -> Flatten -> Q. 
+template +static void RunFlattenDropQDQTestCase(const std::vector& input_shape, + int64_t axis = 1, + bool use_contrib_qdq = false, + int opset = 21) { + auto build_test_case = [input_shape, axis, use_contrib_qdq](ModelTestBuilder& builder) { + constexpr QuantType qmin = std::numeric_limits::min(); + constexpr QuantType qmax = std::numeric_limits::max(); + + auto* input_arg = builder.MakeInput(input_shape, qmin, qmax); + auto* output_arg = builder.MakeOutput(); + QuantType zero_point = 1 + (qmax + qmin) / 2; + + auto* input_arg_dq = builder.MakeIntermediate(); + auto* flatten_output = builder.MakeIntermediate(); + builder.AddDequantizeLinearNode(input_arg, .003f, zero_point, input_arg_dq, use_contrib_qdq); + Node& flatten_node = builder.AddNode("Flatten", {input_arg_dq}, {flatten_output}); + flatten_node.AddAttribute("axis", axis); + + // add Q + builder.AddQuantizeLinearNode(flatten_output, .003f, zero_point, output_arg, use_contrib_qdq); + }; + + auto check_graph = [use_contrib_qdq](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + const QDQOpKeys qdq_keys = GetQDQOpKeys(use_contrib_qdq); + EXPECT_EQ(op_to_count["Flatten"], 1); + EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 0); + EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 0); + }; + + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2, opset); +} + +// Checks that Q/DQ nodes are dropped from DQ -> Flatten -> Q. Uses 8-bit and 16-bit Q/DQ ops. 
+TEST(QDQTransformerTests, FlattenDropQDQ) { + for (int64_t axis : {0, 1, 3}) { + RunFlattenDropQDQTestCase({1, 3, 2, 2}, axis); + RunFlattenDropQDQTestCase({1, 3, 2, 2}, axis, true, 13); // Use com.microsoft QDQ ops + RunFlattenDropQDQTestCase({1, 3, 2, 2}, axis, true, 13); // Use int16 com.microsoft QDQ ops + RunFlattenDropQDQTestCase({1, 3, 2, 2}, axis, true, 13); // Use int16 com.microsoft QDQ ops + RunFlattenDropQDQTestCase({1, 3, 2, 2}, axis, false); // Use int16 ONNX QDQ ops + RunFlattenDropQDQTestCase({1, 3, 2, 2}, axis, false); // Use int16 ONNX QDQ ops + } +} + +// Runs a test case that checks if Q/DQ nodes are dropped from DQ -> Expand -> Q. +template +static void RunExpandDropQDQTestCase(const std::vector& input_shape, + const std::vector& expanded_shape, + bool use_contrib_qdq = false, + int opset = 21) { + auto build_test_case = [input_shape, expanded_shape, use_contrib_qdq](ModelTestBuilder& builder) { + constexpr QuantType qmin = std::numeric_limits::min(); + constexpr QuantType qmax = std::numeric_limits::max(); + + auto* input_arg = builder.MakeInput(input_shape, qmin, qmax); + auto* output_arg = builder.MakeOutput(); + QuantType zero_point = 1 + (qmax + qmin) / 2; + + auto* input_arg_dq = builder.MakeIntermediate(); + auto* expanded_shape_arg = builder.Make1DInitializer(expanded_shape); + auto* expand_output = builder.MakeIntermediate(); + builder.AddDequantizeLinearNode(input_arg, .003f, zero_point, input_arg_dq, use_contrib_qdq); + builder.AddNode("Expand", {input_arg_dq, expanded_shape_arg}, {expand_output}); + + // add Q + builder.AddQuantizeLinearNode(expand_output, .003f, zero_point, output_arg, use_contrib_qdq); + }; + + auto check_graph = [use_contrib_qdq](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + const QDQOpKeys qdq_keys = GetQDQOpKeys(use_contrib_qdq); + EXPECT_EQ(op_to_count["Expand"], 1); + EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 0); + 
EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 0); + }; + + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2, opset); +} + +// Checks that Q/DQ nodes are dropped from DQ -> Expand -> Q. Uses 8-bit and 16-bit Q/DQ ops. +TEST(QDQTransformerTests, ExpandDropQDQ) { + RunExpandDropQDQTestCase({1, 3, 1, 1}, {1, 3, 7, 13}); + RunExpandDropQDQTestCase({1, 3, 1, 1}, {1, 3, 7, 13}, true, 13); // Use com.microsoft QDQ ops + RunExpandDropQDQTestCase({1, 3, 1, 1}, {1, 3, 7, 13}, true, 13); // Use int16 com.microsoft QDQ ops + RunExpandDropQDQTestCase({1, 3, 1, 1}, {1, 3, 7, 13}, true, 13); // Use int16 com.microsoft QDQ ops + RunExpandDropQDQTestCase({1, 3, 1, 1}, {1, 3, 7, 13}, false); // Use int16 ONNX QDQ ops + RunExpandDropQDQTestCase({1, 3, 1, 1}, {1, 3, 7, 13}, false); // Use int16 ONNX QDQ ops +} + +// Runs a test case that checks if Q/DQ nodes are dropped from DQ -> Tile -> Q. +template +static void RunTileDropQDQTestCase(const std::vector& input_shape, + const std::vector& repeats, + bool use_contrib_qdq = false, + int opset = 21) { + auto build_test_case = [input_shape, repeats, use_contrib_qdq](ModelTestBuilder& builder) { + constexpr QuantType qmin = std::numeric_limits::min(); + constexpr QuantType qmax = std::numeric_limits::max(); + + auto* input_arg = builder.MakeInput(input_shape, qmin, qmax); + auto* output_arg = builder.MakeOutput(); + QuantType zero_point = 1 + (qmax + qmin) / 2; + + auto* input_arg_dq = builder.MakeIntermediate(); + auto* repeats_arg = builder.Make1DInitializer(repeats); + auto* tile_output = builder.MakeIntermediate(); + builder.AddDequantizeLinearNode(input_arg, .003f, zero_point, input_arg_dq, use_contrib_qdq); + builder.AddNode("Tile", {input_arg_dq, repeats_arg}, {tile_output}); + + // add Q + builder.AddQuantizeLinearNode(tile_output, .003f, zero_point, output_arg, use_contrib_qdq); + }; + + auto check_graph = [use_contrib_qdq](InferenceSessionWrapper& session) { + auto op_to_count 
= CountOpsInGraph(session.GetGraph()); + const QDQOpKeys qdq_keys = GetQDQOpKeys(use_contrib_qdq); + EXPECT_EQ(op_to_count["Tile"], 1); + EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 0); + EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 0); + }; + + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2, opset); +} + +// Checks that Q/DQ nodes are dropped from DQ -> Tile -> Q. Uses 8-bit and 16-bit Q/DQ ops. +TEST(QDQTransformerTests, TileDropQDQ) { + RunTileDropQDQTestCase({1, 3, 2, 2}, {1, 1, 3, 3}); + RunTileDropQDQTestCase({1, 3, 2, 2}, {1, 1, 3, 3}, true, 13); // Use com.microsoft QDQ ops + RunTileDropQDQTestCase({1, 3, 2, 2}, {1, 1, 3, 3}, true, 13); // Use int16 com.microsoft QDQ ops + RunTileDropQDQTestCase({1, 3, 2, 2}, {1, 1, 3, 3}, true, 13); // Use int16 com.microsoft QDQ ops + RunTileDropQDQTestCase({1, 3, 2, 2}, {1, 1, 3, 3}, false); // Use int16 ONNX QDQ ops + RunTileDropQDQTestCase({1, 3, 2, 2}, {1, 1, 3, 3}, false); // Use int16 ONNX QDQ ops +} + +// Runs a test case that checks if Q/DQ nodes are dropped from DQ -> Slice -> Q. 
+template +static void RunSliceDropQDQTestCase(const std::vector& input_shape, + const std::vector& starts, + const std::vector& ends, + bool use_contrib_qdq = false, + int opset = 21) { + auto build_test_case = [input_shape, starts, ends, use_contrib_qdq](ModelTestBuilder& builder) { + constexpr QuantType qmin = std::numeric_limits::min(); + constexpr QuantType qmax = std::numeric_limits::max(); + + auto* input_arg = builder.MakeInput(input_shape, qmin, qmax); + auto* output_arg = builder.MakeOutput(); + QuantType zero_point = 1 + (qmax + qmin) / 2; + + auto* input_arg_dq = builder.MakeIntermediate(); + auto* starts_arg = builder.Make1DInitializer(starts); + auto* ends_arg = builder.Make1DInitializer(ends); + auto* slice_output = builder.MakeIntermediate(); + builder.AddDequantizeLinearNode(input_arg, .003f, zero_point, input_arg_dq, use_contrib_qdq); + builder.AddNode("Slice", {input_arg_dq, starts_arg, ends_arg}, {slice_output}); + + // add Q + builder.AddQuantizeLinearNode(slice_output, .003f, zero_point, output_arg, use_contrib_qdq); + }; + + auto check_graph = [use_contrib_qdq](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + const QDQOpKeys qdq_keys = GetQDQOpKeys(use_contrib_qdq); + EXPECT_EQ(op_to_count["Slice"], 1); + EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 0); + EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 0); + }; + + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2, opset); +} + +// Checks that Q/DQ nodes are dropped from DQ -> Slice -> Q. Uses 8-bit and 16-bit Q/DQ ops. 
+TEST(QDQTransformerTests, SliceDropQDQ) { + RunSliceDropQDQTestCase({1, 3, 5, 5}, {0, 1, 1, 1}, {1, 3, 4, 4}); + RunSliceDropQDQTestCase({1, 3, 5, 5}, {0, 1, 1, 1}, {1, 3, 4, 4}, true, 13); // Use com.microsoft QDQ ops + // Use int16 com.microsoft QDQ ops + RunSliceDropQDQTestCase({1, 3, 5, 5}, {0, 1, 1, 1}, {1, 3, 4, 4}, true, 13); + // Use int16 com.microsoft QDQ ops + RunSliceDropQDQTestCase({1, 3, 5, 5}, {0, 1, 1, 1}, {1, 3, 4, 4}, true, 13); + RunSliceDropQDQTestCase({1, 3, 5, 5}, {0, 1, 1, 1}, {1, 3, 4, 4}, false); // Use int16 ONNX QDQ ops + RunSliceDropQDQTestCase({1, 3, 5, 5}, {0, 1, 1, 1}, {1, 3, 4, 4}, false); // Use int16 ONNX QDQ ops +} + +// Runs a test case that checks if Q/DQ nodes are dropped from DQ -> GatherElements -> Q. +template +static void RunGatherElementsDropQDQTestCase(const std::vector& input_shape, + const std::vector& indices_shape, + const std::vector& indices_data, + bool use_contrib_qdq = false, + int opset = 21) { + auto build_test_case = [input_shape, indices_shape, indices_data, use_contrib_qdq](ModelTestBuilder& builder) { + constexpr QuantType qmin = std::numeric_limits::min(); + constexpr QuantType qmax = std::numeric_limits::max(); + + auto* input_arg = builder.MakeInput(input_shape, qmin, qmax); + auto* indices_arg = builder.MakeInitializer(indices_shape, indices_data); + auto* output_arg = builder.MakeOutput(); + QuantType zero_point = 1 + (qmax + qmin) / 2; + + auto* input_arg_dq = builder.MakeIntermediate(); + auto* gather_elements_output = builder.MakeIntermediate(); + builder.AddDequantizeLinearNode(input_arg, .003f, zero_point, input_arg_dq, use_contrib_qdq); + builder.AddNode("GatherElements", {input_arg_dq, indices_arg}, {gather_elements_output}); + + // add Q + builder.AddQuantizeLinearNode(gather_elements_output, .003f, zero_point, output_arg, use_contrib_qdq); + }; + + auto check_graph = [use_contrib_qdq](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + const 
QDQOpKeys qdq_keys = GetQDQOpKeys(use_contrib_qdq); + EXPECT_EQ(op_to_count["GatherElements"], 1); + EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 0); + EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 0); + }; + + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2, opset); +} + +// Checks that Q/DQ nodes are dropped from DQ -> GatherElements -> Q. Uses 8-bit and 16-bit Q/DQ ops. +TEST(QDQTransformerTests, GatherElementsDropQDQ) { + RunGatherElementsDropQDQTestCase({3, 3}, {2, 3}, {1, 2, 0, 2, 0, 0}); + // Use com.microsoft QDQ ops + RunGatherElementsDropQDQTestCase({3, 3}, {2, 3}, {1, 2, 0, 2, 0, 0}, true, 13); + // Use int16 com.microsoft QDQ ops + RunGatherElementsDropQDQTestCase({3, 3}, {2, 3}, {1, 2, 0, 2, 0, 0}, true, 13); + // Use int16 com.microsoft QDQ ops + RunGatherElementsDropQDQTestCase({3, 3}, {2, 3}, {1, 2, 0, 2, 0, 0}, true, 13); + RunGatherElementsDropQDQTestCase({3, 3}, {2, 3}, {1, 2, 0, 2, 0, 0}, false); // Use int16 ONNX QDQ ops + RunGatherElementsDropQDQTestCase({3, 3}, {2, 3}, {1, 2, 0, 2, 0, 0}, false); // Use int16 ONNX QDQ ops +} + +// Runs a test case that checks whether Q/DQ nodes are dropped from DQ -> Reduce(Min|Max) -> Q. 
+template +static void RunReduceExtremumDropQDQTestCase(const std::string& op_type, + const std::vector& input_shape, + float qscale, + bool expect_drop_qdq, + bool use_contrib_qdq = false, + int opset = 21) { + auto build_test_case = [op_type, input_shape, qscale, use_contrib_qdq](ModelTestBuilder& builder) { + constexpr QuantType qmin = std::numeric_limits::min(); + constexpr QuantType qmax = std::numeric_limits::max(); + + auto* input_arg = builder.MakeInput(input_shape, qmin, qmax); + auto* output_arg = builder.MakeOutput(); + QuantType zero_point = 1 + (qmax + qmin) / 2; + + auto* input_arg_dq = builder.MakeIntermediate(); + auto* reduce_output = builder.MakeIntermediate(); + builder.AddDequantizeLinearNode(input_arg, qscale, zero_point, input_arg_dq, use_contrib_qdq); + builder.AddNode(op_type, {input_arg_dq}, {reduce_output}); + + // add Q + builder.AddQuantizeLinearNode(reduce_output, qscale, zero_point, output_arg, use_contrib_qdq); + }; + + auto check_graph = [op_type, expect_drop_qdq, use_contrib_qdq](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + const QDQOpKeys qdq_keys = GetQDQOpKeys(use_contrib_qdq); + EXPECT_EQ(op_to_count[op_type], 1); + if (expect_drop_qdq) { + EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 0); + EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 0); + } else { + EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 1); + EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 1); + } + }; + + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2, opset); +} + +// Checks whether Q/DQ nodes are dropped from DQ -> Reduce(Min|Max) -> Q. Uses 8-bit and 16-bit Q/DQ ops. 
+TEST(QDQTransformerTests, ReduceExtremumDropQDQ) { + // Check that Q/DQ nodes are dropped for positive scale + RunReduceExtremumDropQDQTestCase("ReduceMin", {3, 3}, 0.003f, true); + RunReduceExtremumDropQDQTestCase("ReduceMin", {3, 3}, 0.003f, true, true, 13); // Use com.microsoft QDQ ops + RunReduceExtremumDropQDQTestCase("ReduceMax", {3, 3}, 0.003f, true); + RunReduceExtremumDropQDQTestCase("ReduceMax", {3, 3}, 0.003f, true, true, 13); // Use com.microsoft QDQ ops + + // Check that Q/DQ nodes are *not* dropped for negative scale + RunReduceExtremumDropQDQTestCase("ReduceMin", {3, 3}, -0.003f, false); + RunReduceExtremumDropQDQTestCase("ReduceMin", {3, 3}, -0.003f, false, true, 13); // Use com.microsoft QDQ ops + RunReduceExtremumDropQDQTestCase("ReduceMax", {3, 3}, -0.003f, false); + RunReduceExtremumDropQDQTestCase("ReduceMax", {3, 3}, -0.003f, false, true, 13); // Use com.microsoft QDQ ops +} + TEST(QDQTransformerTests, DoubleQDQ) { constexpr uint8_t good_u8_1 = 80; constexpr uint8_t good_u8_2 = 40;