diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc
index 2fbe59bf0d578..ed70111087e19 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc
@@ -80,6 +80,64 @@ Status BaseOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
   return Status::OK();
 }
 
+Status BaseOpBuilder::AddZeroBiasInput(QnnModelWrapper& qnn_model_wrapper,
+                                       const QnnQuantParamsWrapper& input0_qparams,
+                                       const QnnQuantParamsWrapper& input1_qparams,
+                                       std::vector<uint32_t>&& bias_shape,
+                                       const std::string& bias_name,
+                                       const logging::Logger& logger,
+                                       std::vector<std::string>& input_names) const {
+  ORT_UNUSED_PARAMETER(logger);
+  // For now, only handle the case where input0 is per-tensor quantized and input1 is either per-tensor
+  // or per-channel quantized.
+  ORT_RETURN_IF_NOT(input0_qparams.IsPerTensor(/*include_bw*/ true) && input1_qparams.IsQuantized(),
+                    "QNN EP currently only supports adding a dummy zero bias input for per-tensor ",
+                    "input[0] and per-tensor/per-channel input[1]");
+
+  size_t num_bias_elems = 1;
+  for (size_t i = 0; i < bias_shape.size(); i++) {
+    num_bias_elems *= static_cast<size_t>(bias_shape[i]);
+  }
+
+  // Bias static input should be all zeros.
+  std::vector<uint8_t> bias_bytes(num_bias_elems * sizeof(int32_t), 0);
+
+  // Bias's quantization scale(s) should be the product of the other inputs' quantization scales.
+  // Input[0] is expected to have one scale (per-tensor).
+  // If input[1] is per-channel (many scales), then the dummy bias also needs to be per-channel.
+  std::vector<float> input0_quant_scales;
+  std::vector<float> input1_quant_scales;
+  ORT_RETURN_IF_ERROR(input0_qparams.GetScales(input0_quant_scales));
+  ORT_RETURN_IF_ERROR(input1_qparams.GetScales(input1_quant_scales));
+
+  const size_t num_bias_scales_offsets = input1_quant_scales.size();
+  assert(input0_quant_scales.size() == 1);  // Expected for per-tensor.
+  ORT_RETURN_IF_NOT(num_bias_scales_offsets >= input0_quant_scales.size(),
+                    "Input[1] should have >= 1 quantization scale values");
+
+  std::vector<float> bias_scales(num_bias_scales_offsets);
+  for (size_t i = 0; i < num_bias_scales_offsets; i++) {
+    bias_scales[i] = input0_quant_scales[0] * input1_quant_scales[i];
+  }
+
+  std::vector<int32_t> bias_offsets(num_bias_scales_offsets, 0);  // Bias's zero-points should be all zeros.
+  QnnQuantParamsWrapper bias_qparams;
+
+  if (input1_qparams.IsPerChannel()) {
+    bias_qparams = QnnQuantParamsWrapper(bias_scales, bias_offsets, /*axis*/ 0, /*is_int4*/ false);
+  } else {
+    bias_qparams = QnnQuantParamsWrapper(bias_scales[0], bias_offsets[0]);
+  }
+
+  auto tensor_wrapper = QnnTensorWrapper(bias_name, QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_SFIXED_POINT_32,
+                                         std::move(bias_qparams), std::move(bias_shape), std::move(bias_bytes));
+
+  qnn_model_wrapper.AddTensorWrapper(std::move(tensor_wrapper));
+  input_names.push_back(bias_name);
+
+  return Status::OK();
+}
+
 Status BaseOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper,
                                                   const NodeUnit& node_unit,
                                                   std::vector<std::string>&& input_names,
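Note on the scale math above: the dummy bias follows the standard QDQ convention for a quantized conv/matmul bias, int32 values with zero_point = 0 and bias_scale[i] = input0_scale * input1_scale[i], so the all-zero bias dequantizes to exactly 0.0f for every channel and leaves the op's output unchanged. A minimal standalone sketch of that computation (ComputeDummyBiasScales is an illustrative name, not part of the patch):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // bias_scale[i] = act_scale * weight_scale[i]. An all-zero int32 bias with
    // zero_point == 0 dequantizes to 0.0f regardless of scale, so the op is unchanged.
    std::vector<float> ComputeDummyBiasScales(float act_scale, const std::vector<float>& weight_scales) {
      assert(!weight_scales.empty());
      std::vector<float> bias_scales(weight_scales.size());
      for (std::size_t i = 0; i < weight_scales.size(); ++i) {
        bias_scales[i] = act_scale * weight_scales[i];
      }
      return bias_scales;
    }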
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h
index 6886845ff3de1..9d0772d1daac7 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h
@@ -95,6 +95,14 @@ class BaseOpBuilder : public IOpBuilder {
                        const logging::Logger& logger,
                        std::vector<std::string>& input_names) const ORT_MUST_USE_RESULT;
 
+  Status AddZeroBiasInput(QnnModelWrapper& qnn_model_wrapper,
+                          const QnnQuantParamsWrapper& input0_qparams,
+                          const QnnQuantParamsWrapper& input1_qparams,
+                          std::vector<uint32_t>&& bias_shape,
+                          const std::string& bias_name,
+                          const logging::Logger& logger,
+                          std::vector<std::string>& input_names) const ORT_MUST_USE_RESULT;
+
   Status SetOutputQParamEqualToInputIfNearlyEqual(QnnModelWrapper& qnn_model_wrapper,
                                                   const NodeUnit& node_unit,
                                                   const logging::Logger& logger,
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc
index 5283e9a559cd4..12887f0fb72d6 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc
@@ -289,10 +289,30 @@ Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrapper,
   //
   // Input 2: bias
   //
-  if (num_inputs == 3) {
+  const bool has_bias_input = num_inputs == 3;
+  if (has_bias_input) {
     ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[2], logger, input_names));
   }
 
+#if QNN_API_VERSION_MAJOR == 2 && (QNN_API_VERSION_MINOR >= 16 && QNN_API_VERSION_MINOR <= 18)
+  if (!has_bias_input && IsNpuBackend(qnn_model_wrapper.GetQnnBackendType())) {
+    // Bias is implicit. QNN SDK 2.23/2.24/2.25 (QNN API version 2.16/2.17/2.18) has a validation bug for
+    // implicit bias inputs, so provide an explicit bias of all zeros (quantized int32).
+    TensorInfo input0_info = {};
+    ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[0], input0_info));
+
+    TensorInfo input1_info = {};
+    ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[1], input1_info));
+
+    if (input0_info.quant_param.IsPerTensor(/*include_bw*/ true) && input1_info.quant_param.IsQuantized()) {
+      const std::string bias_name = qnn::utils::GetNodeName(node_unit) + "_implicit_bias_ort_qnn_ep";
+      std::vector<uint32_t> bias_shape = {input1_info.shape[0]};
+      ORT_RETURN_IF_ERROR(AddZeroBiasInput(qnn_model_wrapper, input0_info.quant_param, input1_info.quant_param,
+                                           std::move(bias_shape), bias_name, logger, input_names));
+    }
+  }
+#endif
+
   return Status::OK();
 }
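A worked example of the shapes involved (hypothetical values, not taken from the patch): for a Conv weight tensor of shape {3, 2, 2, 2} (OIHW) quantized per-channel on axis 0, there are 3 weight scales and input1_info.shape[0] == 3, so the dummy bias gets shape {3}: three zero int32 elements, with bias_scale[c] = input0_scale * weight_scale[c] for each output channel c.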
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc
index a31b15948cb7f..35a6b7bf40637 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc
@@ -99,47 +99,9 @@ Status LayerNormOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
 
     if (x_input_info.quant_param.IsPerTensor(/*include_bw*/ true) && scale_input_info.quant_param.IsQuantized()) {
       const std::string bias_name = qnn::utils::GetNodeName(node_unit) + "_implicit_bias_ort_qnn_ep";
-
-      // Make dummy bias input have the same shape as the scale input.
       std::vector<uint32_t> bias_shape = scale_input_info.shape;
-      size_t num_bias_elems = 1;
-      for (size_t i = 0; i < bias_shape.size(); i++) {
-        num_bias_elems *= static_cast<size_t>(bias_shape[i]);
-      }
-
-      // Bias static input should be all zeros.
-      std::vector<uint8_t> bias_bytes(num_bias_elems * sizeof(int32_t), 0);
-
-      // Bias's quantization scale should be the product of the other inputs' quantization scales.
-      std::vector<float> input0_quant_scales;
-      std::vector<float> input1_quant_scales;
-      ORT_RETURN_IF_ERROR(x_input_info.quant_param.GetScales(input0_quant_scales));
-      ORT_RETURN_IF_ERROR(scale_input_info.quant_param.GetScales(input1_quant_scales));
-
-      const size_t num_bias_scales_offsets = input1_quant_scales.size();
-      assert(input0_quant_scales.size() == 1);  // Expected for per-tensor.
-      ORT_RETURN_IF_NOT(num_bias_scales_offsets >= input0_quant_scales.size(),
-                        "Input[1] should have >= 1 quantization scale values");
-
-      std::vector<float> bias_scales(num_bias_scales_offsets);
-      for (size_t i = 0; i < num_bias_scales_offsets; i++) {
-        bias_scales[i] = input0_quant_scales[0] * input1_quant_scales[i];
-      }
-
-      std::vector<int32_t> bias_offsets(num_bias_scales_offsets, 0);  // Bias's zero-points should be all zeros.
-      QnnQuantParamsWrapper bias_qparams;
-
-      if (scale_input_info.quant_param.IsPerChannel()) {
-        bias_qparams = QnnQuantParamsWrapper(bias_scales, bias_offsets, /*axis*/ 0, /*is_int4*/ false);
-      } else {
-        bias_qparams = QnnQuantParamsWrapper(bias_scales[0], bias_offsets[0]);
-      }
-
-      auto tensor_wrapper = QnnTensorWrapper(bias_name, QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_SFIXED_POINT_32,
-                                             std::move(bias_qparams), std::move(bias_shape), std::move(bias_bytes));
-
-      qnn_model_wrapper.AddTensorWrapper(std::move(tensor_wrapper));
-      input_names.push_back(bias_name);
+      ORT_RETURN_IF_ERROR(AddZeroBiasInput(qnn_model_wrapper, x_input_info.quant_param, scale_input_info.quant_param,
+                                           std::move(bias_shape), bias_name, logger, input_names));
     }
   }
 #endif
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc
index 657224f68f71b..3c029fda9cd52 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc
@@ -335,7 +335,18 @@ Status QnnModelWrapper::UnpackZeroPoints(const std::string& initializer_name,
 
   switch (onnx_data_type) {
     // QNN uses -offset for some reason
-    case ONNX_NAMESPACE::TensorProto_DataType_INT4:  // INT4 zero-points are unpacked as 8-bit values for QNN
+    case ONNX_NAMESPACE::TensorProto_DataType_INT4: {  // INT4 zero-points are unpacked as 8-bit values for QNN
+      auto int8_span = ReinterpretAsSpan<const int8_t>(gsl::make_span(initializer_bytes));
+      std::transform(int8_span.begin(), int8_span.end(), std::back_inserter(zero_points),
+                     [](int8_t masked_zp) -> int32_t {
+                       // We currently unpack int4 as int8 but with the top 4 bits masked off due to a QNN bug.
+                       // Need to undo the masking so that the zero-point value is correct.
+                       // (Not really a problem yet because QNN only supports symmetric INT4 quantization with zp == 0.)
+                       int8_t zp = Int4x2::SignExtendLower4Bits(std::byte(masked_zp));
+                       return -static_cast<int32_t>(zp);
+                     });
+      break;
+    }
     case ONNX_NAMESPACE::TensorProto_DataType_INT8: {
       auto int8_span = ReinterpretAsSpan<const int8_t>(gsl::make_span(initializer_bytes));
       std::transform(int8_span.begin(), int8_span.end(), std::back_inserter(zero_points),
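The un-masking in the lambda above is a plain 4-bit sign extension followed by a negation, since QNN stores offset = -zero_point. A standalone sketch of the same arithmetic (QnnOffsetFromMaskedInt4 is an illustrative name; it assumes the int4 value sits in the low 4 bits of the carrier byte, as the comment above describes):

    #include <cstdint>

    // Sign-extend the low nibble (0x0..0x7 -> 0..7, 0x8..0xF -> -8..-1),
    // then negate to match QNN's offset == -zero_point convention.
    int32_t QnnOffsetFromMaskedInt4(uint8_t masked_zp) {
      int8_t zp = static_cast<int8_t>(masked_zp & 0x0F);
      if (zp & 0x08) {
        zp = static_cast<int8_t>(zp | 0xF0);  // replicate the sign bit into the high bits
      }
      return -static_cast<int32_t>(zp);
    }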
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc
index d6c93a8f226e8..8d2cb5bdb6da0 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc
@@ -231,6 +231,8 @@ std::ostream& operator<<(std::ostream& out, const Qnn_QuantizeParams_t& quantize_params) {
   } else if (quantize_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) {
     out << " axis=" << quantize_params.axisScaleOffsetEncoding.axis;
     size_t num_elems = quantize_params.axisScaleOffsetEncoding.numScaleOffsets;
+    bool truncate = num_elems > 20;
+    num_elems = truncate ? 20 : num_elems;
     out << " scales=(";
     for (size_t i = 0; i < num_elems; i++) {
       out << quantize_params.axisScaleOffsetEncoding.scaleOffset[i].scale << (i == num_elems - 1 ? "" : " ");
@@ -239,11 +241,13 @@ std::ostream& operator<<(std::ostream& out, const Qnn_QuantizeParams_t& quantize_params) {
     for (size_t i = 0; i < num_elems; i++) {
       out << quantize_params.axisScaleOffsetEncoding.scaleOffset[i].offset << (i == num_elems - 1 ? "" : " ");
     }
-    out << ")";
+    out << (truncate ? "...)" : ")");
   } else if (quantize_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) {
     out << " axis=" << quantize_params.bwAxisScaleOffsetEncoding.axis;
     out << " bw=" << quantize_params.bwAxisScaleOffsetEncoding.bitwidth;
     size_t num_elems = quantize_params.bwAxisScaleOffsetEncoding.numElements;
+    bool truncate = num_elems > 20;
+    num_elems = truncate ? 20 : num_elems;
     out << " scales=(";
     for (size_t i = 0; i < num_elems; i++) {
       out << quantize_params.bwAxisScaleOffsetEncoding.scales[i] << (i == num_elems - 1 ? "" : " ");
@@ -252,7 +256,7 @@ std::ostream& operator<<(std::ostream& out, const Qnn_QuantizeParams_t& quantize_params) {
     for (size_t i = 0; i < num_elems; i++) {
       out << quantize_params.bwAxisScaleOffsetEncoding.offsets[i] << (i == num_elems - 1 ? "" : " ");
     }
-    out << ")";
+    out << (truncate ? "...)" : ")");
   } else {
     out << " encoding not supported.";
   }
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
index fc64d63ede338..b7408940ff48a 100644
--- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -423,14 +423,14 @@ static void LogNodeSupport(const logging::Logger& logger,
     return;
   }
 
+  size_t num_nodes = 0;
  std::ostringstream oss;
-  oss << (support_status.IsOK() ? "Validation PASSED " : "Validation FAILED ") << "for nodes ("
-      << qnn_node_group.Type() << "):" << std::endl;
   for (const NodeUnit* node_unit : qnn_node_group.GetNodeUnits()) {
     for (const Node* node : node_unit->GetAllNodesInGroup()) {
       oss << "\tOperator type: " << node->OpType()
           << " Node name: " << node->Name()
           << " Node index: " << node->Index() << std::endl;
+      num_nodes += 1;
     }
   }
   if (!support_status.IsOK()) {
@@ -440,6 +440,9 @@ static void LogNodeSupport(const logging::Logger& logger,
 
   logging::Capture(logger, log_severity, logging::Category::onnxruntime, log_data_type, call_site)
       .Stream()
+      << (support_status.IsOK() ? "Validation PASSED " : "Validation FAILED ") << "for " << num_nodes
+      << " nodes in " << qnn_node_group.Type() << " (" << qnn_node_group.GetTargetNodeUnit()->OpType() << ") :"
+      << std::endl
       << oss.str();
 }
 
diff --git a/onnxruntime/test/providers/qnn/conv_test.cc b/onnxruntime/test/providers/qnn/conv_test.cc
index 95673586677ef..cf37fc00335d3 100644
--- a/onnxruntime/test/providers/qnn/conv_test.cc
+++ b/onnxruntime/test/providers/qnn/conv_test.cc
@@ -793,19 +793,101 @@ TEST_F(QnnHTPBackendTests, ConvU16S4S32_PerChannel) {
   TestInputDef<float> bias_def(bias_shape, true,
                                GetFloatDataInRange(-1.0f, 1.0f, TensorShape(bias_shape).Size()));
 
-  RunHTPConvOpPerChannelTest<uint16_t, Int4x2>("Conv",
-                                               input_def,
-                                               weight_def,
-                                               bias_def,
-                                               0,             // weight quant axis
-                                               {1, 1},        // Strides
-                                               {0, 0, 0, 0},  // Pads
-                                               {1, 1},        // Dilations
-                                               1,             // default group
-                                               "NOTSET",
-                                               ExpectedEPNodeAssignment::All,
-                                               false,  // use_qdq_contrib_ops
-                                               21);    // opset
+  RunHTPConvOpPerChannelTest<uint16_t, Int4x2>("Conv",
+                                               input_def,
+                                               weight_def,
+                                               bias_def,
+                                               0,             // weight quant axis
+                                               {1, 1},        // Strides
+                                               {0, 0, 0, 0},  // Pads
+                                               {1, 1},        // Dilations
+                                               1,             // default group
+                                               "NOTSET",
+                                               ExpectedEPNodeAssignment::All,
+                                               false,  // use_qdq_contrib_ops
+                                               21);    // opset
+}
+
+// Test per-channel QDQ Conv with INT4 weights and no bias.
+// in0: u16, in1 (weight): s4, out: u16
+// Tests bug in QNN SDK 2.25 when validating Conv without a bias (QNN EP adds a dummy bias).
+TEST_F(QnnHTPBackendTests, ConvU16S4_PerChannel_NoBias) {
+  std::vector<int64_t> input_shape = {1, 2, 4, 4};
+  std::vector<int64_t> weight_shape = {3, 2, 2, 2};
+
+  TestInputDef<float> input_def(input_shape, false,
+                                GetFloatDataInRange(0.0f, 1.0f, TensorShape(input_shape).Size()));
+  TestInputDef<float> weight_def(weight_shape, true,
+                                 GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size()));
+
+  RunHTPConvOpPerChannelTest<uint16_t, Int4x2>("Conv",
+                                               input_def,
+                                               weight_def,
+                                               TestInputDef<float>(),
+                                               0,             // weight quant axis
+                                               {1, 1},        // Strides
+                                               {0, 0, 0, 0},  // Pads
+                                               {1, 1},        // Dilations
+                                               1,             // default group
+                                               "NOTSET",
+                                               ExpectedEPNodeAssignment::All,
+                                               false,  // use_qdq_contrib_ops
+                                               21);    // opset
+}
+
+// Test per-tensor QDQ Conv with uint16 input[0], uint8 weights, and no bias.
+// in0: u16, in1 (weight): u8, out: u16
+// Tests bug in QNN SDK 2.25 when validating Conv without a bias (QNN EP adds a dummy bias).
+TEST_F(QnnHTPBackendTests, ConvU16U8_PerTensor_NoBias) {
+  std::vector<int64_t> input_shape = {1, 2, 4, 4};
+  std::vector<int64_t> weight_shape = {3, 2, 2, 2};
+
+  TestInputDef<float> input_def(input_shape, false,
+                                GetFloatDataInRange(0.0f, 1.0f, TensorShape(input_shape).Size()));
+  TestInputDef<float> weight_def(weight_shape, true,
+                                 GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size()));
+
+  RunHTPConvOpTest<uint16_t, uint8_t>("Conv",
+                                      input_def,
+                                      weight_def,
+                                      TestInputDef<float>(),
+                                      {1, 1},        // Strides
+                                      {0, 0, 0, 0},  // Pads
+                                      {1, 1},        // Dilations
+                                      1,             // default group
+                                      "NOTSET",
+                                      ExpectedEPNodeAssignment::All,
+                                      false,  // use_qdq_contrib_ops
+                                      21);    // opset
+}
+
+// Test per-channel QDQ Conv with a large INT4 weight and no bias (QNN EP adds a dummy bias).
+TEST_F(QnnHTPBackendTests, ConvU16S4_PerChannel_NoBias_LargeINT4Weight) {
+  std::vector<int64_t> input_shape = {1, 3072, 1, 512};
+  std::vector<int64_t> weight_shape = {9216, 3072, 1, 1};
+  std::vector<float> input_data(TensorShape(input_shape).Size(), 0.1f);
+  input_data[0] = 0.2f;
+  std::vector<float> weight_data(TensorShape(weight_shape).Size(), -0.1f);
+  for (size_t c = 0; c < static_cast<size_t>(weight_shape[0]); c++) {
+    size_t i = c * 3072;
+    weight_data[i] = 0.1f;
+  }
+
+  TestInputDef<float> input_def(input_shape, false, input_data);
+  TestInputDef<float> weight_def(weight_shape, true, weight_data);
+
+  RunHTPConvOpPerChannelTest<uint16_t, Int4x2>("Conv",
+                                               input_def,
+                                               weight_def,
+                                               TestInputDef<float>(),
+                                               0,             // weight quant axis
+                                               {1, 1},        // Strides
+                                               {0, 0, 0, 0},  // Pads
+                                               {1, 1},        // Dilations
+                                               1,             // default group
+                                               "NOTSET",
+                                               ExpectedEPNodeAssignment::All,
+                                               false,  // use_qdq_contrib_ops
+                                               21);    // opset
 }
 
 // Test fusion of DQs -> Conv -> Relu/Clip -> Q.