[QNN EP] Apply workaround for Conv validation bug when bias input is implicit #21764

Merged · 5 commits · Aug 22, 2024
@@ -80,6 +80,64 @@ Status BaseOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
return Status::OK();
}

Status BaseOpBuilder::AddZeroBiasInput(QnnModelWrapper& qnn_model_wrapper,
const QnnQuantParamsWrapper& input0_qparams,
const QnnQuantParamsWrapper& input1_qparams,
std::vector<uint32_t>&& bias_shape,
const std::string& bias_name,
const logging::Logger& logger,
std::vector<std::string>& input_names) const {
ORT_UNUSED_PARAMETER(logger);
// For now, only handle case where input0 is per-tensor quantized and input1 is either per-tensor
// or per-channel quantized.
ORT_RETURN_IF_NOT(input0_qparams.IsPerTensor(/*include_bw*/ true) && input1_qparams.IsQuantized(),
"QNN EP currently only supports adding a dummy zero bias input for per-tensor ",
"input[0] and per-tensor/per-channel input[1]");

size_t num_bias_elems = 1;
for (size_t i = 0; i < bias_shape.size(); i++) {
num_bias_elems *= static_cast<size_t>(bias_shape[i]);
}

// Bias static input should be all zeros.
std::vector<uint8_t> bias_bytes(num_bias_elems * sizeof(int32_t), 0);

// Bias's quantization scale(s) should be the product of the other inputs' quantization scales.
// Input[0] is expected to have one scale (per-tensor).
// If input[1] is per-channel (many scales), then the dummy bias also needs to be per-channel.
std::vector<float> input0_quant_scales;
std::vector<float> input1_quant_scales;
ORT_RETURN_IF_ERROR(input0_qparams.GetScales(input0_quant_scales));
ORT_RETURN_IF_ERROR(input1_qparams.GetScales(input1_quant_scales));

const size_t num_bias_scales_offsets = input1_quant_scales.size();
assert(input0_quant_scales.size() == 1); // Expected for per-tensor.
ORT_RETURN_IF_NOT(num_bias_scales_offsets >= input0_quant_scales.size(),
"Input[1] should have >= 1 quantization scale values");

std::vector<float> bias_scales(num_bias_scales_offsets);
for (size_t i = 0; i < num_bias_scales_offsets; i++) {
bias_scales[i] = input0_quant_scales[0] * input1_quant_scales[i];
}

std::vector<int32_t> bias_offsets(num_bias_scales_offsets, 0); // Bias's zero-points should be all zeros.
QnnQuantParamsWrapper bias_qparams;

if (input1_qparams.IsPerChannel()) {
bias_qparams = QnnQuantParamsWrapper(bias_scales, bias_offsets, /*axis*/ 0, /*is_int4*/ false);
} else {
bias_qparams = QnnQuantParamsWrapper(bias_scales[0], bias_offsets[0]);
}

auto tensor_wrapper = QnnTensorWrapper(bias_name, QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_SFIXED_POINT_32,
std::move(bias_qparams), std::move(bias_shape), std::move(bias_bytes));

qnn_model_wrapper.AddTensorWrapper(std::move(tensor_wrapper));
input_names.push_back(bias_name);

return Status::OK();
}
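For reference, the rule implemented above is the usual convention for a quantized int32 bias: each bias scale is the product of the per-tensor activation scale and the corresponding weight scale, and all zero points are 0. A minimal, self-contained sketch of that computation, using illustrative names that are not ORT APIs:

#include <cstddef>
#include <vector>

// Derive per-channel scales for an all-zero int32 dummy bias.
// Assumes one per-tensor activation scale and one weight scale per output channel.
std::vector<float> MakeDummyBiasScales(float activation_scale,
                                       const std::vector<float>& weight_scales) {
  std::vector<float> bias_scales(weight_scales.size());
  for (std::size_t i = 0; i < weight_scales.size(); ++i) {
    bias_scales[i] = activation_scale * weight_scales[i];  // bias_scale[c] = s_act * s_w[c]
  }
  return bias_scales;  // zero points are all 0; the bias data itself is all-zero int32
}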

Status BaseOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
std::vector<std::string>&& input_names,
@@ -95,6 +95,14 @@ class BaseOpBuilder : public IOpBuilder {
const logging::Logger& logger,
std::vector<std::string>& input_names) const ORT_MUST_USE_RESULT;

Status AddZeroBiasInput(QnnModelWrapper& qnn_model_wrapper,
const QnnQuantParamsWrapper& input0_qparams,
const QnnQuantParamsWrapper& input1_qparams,
std::vector<uint32_t>&& bias_shape,
const std::string& bias_name,
const logging::Logger& logger,
std::vector<std::string>& input_names) const ORT_MUST_USE_RESULT;

Status SetOutputQParamEqualToInputIfNearlyEqual(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
const logging::Logger& logger,
@@ -289,10 +289,30 @@ Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrapper,
//
// Input 2: bias
//
if (num_inputs == 3) {
const bool has_bias_input = num_inputs == 3;
if (has_bias_input) {
ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[2], logger, input_names));
}

#if QNN_API_VERSION_MAJOR == 2 && (QNN_API_VERSION_MINOR >= 16 && QNN_API_VERSION_MINOR <= 18)
if (!has_bias_input && IsNpuBackend(qnn_model_wrapper.GetQnnBackendType())) {
// Bias is implicit. QNN SDK 2.23/2.24/2.25 (QNN API version 2.16/2.17/2.18) has a validation bug for
// implicit bias inputs, so provide an explicit bias of all 0 (quantized int32).
TensorInfo input0_info = {};
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[0], input0_info));

TensorInfo input1_info = {};
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[1], input1_info));

if (input0_info.quant_param.IsPerTensor(/*include_bw*/ true) && input1_info.quant_param.IsQuantized()) {
const std::string bias_name = qnn::utils::GetNodeName(node_unit) + "_implicit_bias_ort_qnn_ep";
std::vector<uint32_t> bias_shape = {input1_info.shape[0]};
ORT_RETURN_IF_ERROR(AddZeroBiasInput(qnn_model_wrapper, input0_info.quant_param, input1_info.quant_param,
std::move(bias_shape), bias_name, logger, input_names));
}
}
#endif

return Status::OK();
}

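As context for the bias shape chosen above and in the LayerNorm builder below: the dummy bias needs one element per quantization channel, so Conv takes dimension 0 of the weight tensor (the output-channel count) while LayerNorm reuses the scale input's shape. A minimal compilable sketch of the Conv case, with an illustrative helper name rather than an ORT API:

#include <cstdint>
#include <vector>

// Conv weights are laid out {out_channels, in_channels/group, kH, kW},
// so the dummy bias needs exactly out_channels elements.
std::vector<uint32_t> ConvDummyBiasShape(const std::vector<uint32_t>& weight_shape) {
  return {weight_shape[0]};
}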
@@ -99,47 +99,9 @@ Status LayerNormOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,

if (x_input_info.quant_param.IsPerTensor(/*include_bw*/ true) && scale_input_info.quant_param.IsQuantized()) {
const std::string bias_name = qnn::utils::GetNodeName(node_unit) + "_implicit_bias_ort_qnn_ep";

// Make dummy bias input have the same shape as the scale input.
std::vector<uint32_t> bias_shape = scale_input_info.shape;
size_t num_bias_elems = 1;
for (size_t i = 0; i < bias_shape.size(); i++) {
num_bias_elems *= static_cast<size_t>(bias_shape[i]);
}

// Bias static input should be all zeros.
std::vector<uint8_t> bias_bytes(num_bias_elems * sizeof(int32_t), 0);

// Bias's quantization scale should be the product of the other inputs' quantization scales.
std::vector<float> input0_quant_scales;
std::vector<float> input1_quant_scales;
ORT_RETURN_IF_ERROR(x_input_info.quant_param.GetScales(input0_quant_scales));
ORT_RETURN_IF_ERROR(scale_input_info.quant_param.GetScales(input1_quant_scales));

const size_t num_bias_scales_offsets = input1_quant_scales.size();
assert(input0_quant_scales.size() == 1); // Expected for per-tensor.
ORT_RETURN_IF_NOT(num_bias_scales_offsets >= input0_quant_scales.size(),
"Input[1] should have >= 1 quantization scale values");

std::vector<float> bias_scales(num_bias_scales_offsets);
for (size_t i = 0; i < num_bias_scales_offsets; i++) {
bias_scales[i] = input0_quant_scales[0] * input1_quant_scales[i];
}

std::vector<int32_t> bias_offsets(num_bias_scales_offsets, 0); // Bias's zero-points should be all zeros.
QnnQuantParamsWrapper bias_qparams;

if (scale_input_info.quant_param.IsPerChannel()) {
bias_qparams = QnnQuantParamsWrapper(bias_scales, bias_offsets, /*axis*/ 0, /*is_int4*/ false);
} else {
bias_qparams = QnnQuantParamsWrapper(bias_scales[0], bias_offsets[0]);
}

auto tensor_wrapper = QnnTensorWrapper(bias_name, QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_SFIXED_POINT_32,
std::move(bias_qparams), std::move(bias_shape), std::move(bias_bytes));

qnn_model_wrapper.AddTensorWrapper(std::move(tensor_wrapper));
input_names.push_back(bias_name);
ORT_RETURN_IF_ERROR(AddZeroBiasInput(qnn_model_wrapper, x_input_info.quant_param, scale_input_info.quant_param,
std::move(bias_shape), bias_name, logger, input_names));
}
}
#endif
13 changes: 12 additions & 1 deletion onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc
@@ -335,7 +335,18 @@ Status QnnModelWrapper::UnpackZeroPoints(const std::string& initializer_name,

switch (onnx_data_type) {
// QNN uses -offset for some reason
case ONNX_NAMESPACE::TensorProto_DataType_INT4: // INT4 zero-points are unpacked as 8-bit values for QNN
case ONNX_NAMESPACE::TensorProto_DataType_INT4: { // INT4 zero-points are unpacked as 8-bit values for QNN
auto int8_span = ReinterpretAsSpan<const int8_t>(gsl::make_span(initializer_bytes));
std::transform(int8_span.begin(), int8_span.end(), std::back_inserter(zero_points),
[](int8_t masked_zp) -> int32_t {
// We currently unpack int4 as int8 but with the top 4-bits masked off due to QNN bug.
// Need to undo the masking so that the zero-point value is correct.
// (Not really a problem yet because QNN only supports symmetric INT4 quantization with zp == 0).
int8_t zp = Int4x2::SignExtendLower4Bits(std::byte(masked_zp));
return -static_cast<int32_t>(zp);
});
break;
}
case ONNX_NAMESPACE::TensorProto_DataType_INT8: {
auto int8_span = ReinterpretAsSpan<const int8_t>(gsl::make_span(initializer_bytes));
std::transform(int8_span.begin(), int8_span.end(), std::back_inserter(zero_points),
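As a worked illustration of the masking comment in the INT4 case above: the zero point arrives as an int8 with its upper four bits cleared, so it must be sign-extended from bit 3 before negation (QNN stores offsets as -zero_point). A minimal sketch that avoids ORT's Int4x2 helper; the function name is illustrative:

#include <cstdint>

// Sign-extend the lower 4 bits of a masked int8 value, then negate for QNN.
// Example: masked 0x0F -> zero point -1 -> QNN offset +1.
int32_t QnnOffsetFromMaskedInt4(int8_t masked_zp) {
  const int8_t low_nibble = static_cast<int8_t>(masked_zp & 0x0F);
  const int8_t zp = (low_nibble & 0x08) ? static_cast<int8_t>(low_nibble | 0xF0) : low_nibble;
  return -static_cast<int32_t>(zp);
}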
8 changes: 6 additions & 2 deletions onnxruntime/core/providers/qnn/builder/qnn_utils.cc
@@ -231,6 +231,8 @@ std::ostream& operator<<(std::ostream& out, const Qnn_QuantizeParams_t& quantize
} else if (quantize_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) {
out << " axis=" << quantize_params.axisScaleOffsetEncoding.axis;
size_t num_elems = quantize_params.axisScaleOffsetEncoding.numScaleOffsets;
bool truncate = num_elems > 20;
num_elems = truncate ? 20 : num_elems;
out << " scales=(";
for (size_t i = 0; i < num_elems; i++) {
out << quantize_params.axisScaleOffsetEncoding.scaleOffset[i].scale << (i == num_elems - 1 ? "" : " ");
@@ -239,11 +241,13 @@ std::ostream& operator<<(std::ostream& out, const Qnn_QuantizeParams_t& quantize
for (size_t i = 0; i < num_elems; i++) {
out << quantize_params.axisScaleOffsetEncoding.scaleOffset[i].offset << (i == num_elems - 1 ? "" : " ");
}
out << ")";
out << (truncate ? "...)" : ")");
} else if (quantize_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) {
out << " axis=" << quantize_params.bwAxisScaleOffsetEncoding.axis;
out << " bw=" << quantize_params.bwAxisScaleOffsetEncoding.bitwidth;
size_t num_elems = quantize_params.bwAxisScaleOffsetEncoding.numElements;
bool truncate = num_elems > 20;
num_elems = truncate ? 20 : num_elems;
out << " scales=(";
for (size_t i = 0; i < num_elems; i++) {
out << quantize_params.bwAxisScaleOffsetEncoding.scales[i] << (i == num_elems - 1 ? "" : " ");
@@ -252,7 +256,7 @@ std::ostream& operator<<(std::ostream& out, const Qnn_QuantizeParams_t& quantize
for (size_t i = 0; i < num_elems; i++) {
out << quantize_params.bwAxisScaleOffsetEncoding.offsets[i] << (i == num_elems - 1 ? "" : " ");
}
out << ")";
out << (truncate ? "...)" : ")");
} else {
out << " encoding not supported.";
}
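The two operator<< changes above cap the printed per-axis scales and offsets at 20 entries and close with "...)" when anything is dropped. The same truncation pattern in standalone form, with an illustrative helper that is not part of the ORT codebase:

#include <cstddef>
#include <ostream>
#include <vector>

// Print at most max_elems values; end with "...)" if the list was truncated.
void PrintTruncated(std::ostream& out, const std::vector<float>& values,
                    std::size_t max_elems = 20) {
  const bool truncate = values.size() > max_elems;
  const std::size_t n = truncate ? max_elems : values.size();
  out << "(";
  for (std::size_t i = 0; i < n; ++i) {
    out << values[i] << (i == n - 1 ? "" : " ");
  }
  out << (truncate ? "...)" : ")");
}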
7 changes: 5 additions & 2 deletions onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -423,14 +423,14 @@ static void LogNodeSupport(const logging::Logger& logger,
return;
}

size_t num_nodes = 0;
std::ostringstream oss;
oss << (support_status.IsOK() ? "Validation PASSED " : "Validation FAILED ") << "for nodes ("
<< qnn_node_group.Type() << "):" << std::endl;
for (const NodeUnit* node_unit : qnn_node_group.GetNodeUnits()) {
for (const Node* node : node_unit->GetAllNodesInGroup()) {
oss << "\tOperator type: " << node->OpType()
<< " Node name: " << node->Name()
<< " Node index: " << node->Index() << std::endl;
num_nodes += 1;
}
}
if (!support_status.IsOK()) {
@@ -440,6 +440,9 @@
logging::Capture(logger, log_severity, logging::Category::onnxruntime,
log_data_type, call_site)
.Stream()
<< (support_status.IsOK() ? "Validation PASSED " : "Validation FAILED ") << "for " << num_nodes
<< " nodes in " << qnn_node_group.Type() << " (" << qnn_node_group.GetTargetNodeUnit()->OpType() << ") :"
<< std::endl
<< oss.str();
}

108 changes: 95 additions & 13 deletions onnxruntime/test/providers/qnn/conv_test.cc
@@ -793,19 +793,101 @@ TEST_F(QnnHTPBackendTests, ConvU16S4S32_PerChannel) {
TestInputDef<float> bias_def(bias_shape, true,
GetFloatDataInRange(-1.0f, 1.0f, TensorShape(bias_shape).Size()));

RunHTPConvOpPerChannelTest<uint8_t, Int4x2>("Conv",
input_def,
weight_def,
bias_def,
0, // weight quant axis
{1, 1}, // Strides
{0, 0, 0, 0}, // Pads
{1, 1}, // Dilations
1, // default group
"NOTSET",
ExpectedEPNodeAssignment::All,
false, // use_qdq_contrib_ops
21); // opset
RunHTPConvOpPerChannelTest<uint16_t, Int4x2>("Conv",
input_def,
weight_def,
bias_def,
0, // weight quant axis
{1, 1}, // Strides
{0, 0, 0, 0}, // Pads
{1, 1}, // Dilations
1, // default group
"NOTSET",
ExpectedEPNodeAssignment::All,
false, // use_qdq_contrib_ops
21); // opset
}

// Test per-channel QDQ Conv with INT4 weights and no bias.
// in0: u16, in1 (weight): s4, out: u16
// Tests a bug in QNN SDK 2.25 when validating Conv without a bias (QNN EP adds a dummy bias).
TEST_F(QnnHTPBackendTests, ConvU16S4_PerChannel_NoBias) {
std::vector<int64_t> input_shape = {1, 2, 4, 4};
std::vector<int64_t> weight_shape = {3, 2, 2, 2};

TestInputDef<float> input_def(input_shape, false,
GetFloatDataInRange(0.0f, 1.0f, TensorShape(input_shape).Size()));
TestInputDef<float> weight_def(weight_shape, true,
GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size()));

RunHTPConvOpPerChannelTest<uint16_t, Int4x2>("Conv",
input_def,
weight_def,
TestInputDef<float>(),
0, // weight quant axis
{1, 1}, // Strides
{0, 0, 0, 0}, // Pads
{1, 1}, // Dilations
1, // default group
"NOTSET",
ExpectedEPNodeAssignment::All,
false, // use_qdq_contrib_ops
21); // opset
}

// Test per-tensor QDQ Conv with uint16 input[0], uint8 weights, and no bias.
// in0: u16, in1 (weight): u8, out: u16
// Tests a bug in QNN SDK 2.25 when validating Conv without a bias (QNN EP adds a dummy bias).
TEST_F(QnnHTPBackendTests, ConvU16U8_PerTensor_NoBias) {
std::vector<int64_t> input_shape = {1, 2, 4, 4};
std::vector<int64_t> weight_shape = {3, 2, 2, 2};

TestInputDef<float> input_def(input_shape, false,
GetFloatDataInRange(0.0f, 1.0f, TensorShape(input_shape).Size()));
TestInputDef<float> weight_def(weight_shape, true,
GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size()));

RunHTPConvOpTest<uint16_t, uint8_t>("Conv",
input_def,
weight_def,
TestInputDef<float>(),
{1, 1}, // Strides
{0, 0, 0, 0}, // Pads
{1, 1}, // Dilations
1, // default group
"NOTSET",
ExpectedEPNodeAssignment::All,
false, // use_qdq_contrib_ops
21); // opset
}

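// Test per-channel QDQ Conv with a large INT4 weight tensor and no bias (exercises the dummy-bias workaround at scale).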
TEST_F(QnnHTPBackendTests, ConvU16S4_PerChannel_NoBias_LargeINT4Weight) {
std::vector<int64_t> input_shape = {1, 3072, 1, 512};
std::vector<int64_t> weight_shape = {9216, 3072, 1, 1};
std::vector<float> input_data(TensorShape(input_shape).Size(), 0.1f);
input_data[0] = 0.2f;
std::vector<float> weight_data(TensorShape(weight_shape).Size(), -0.1f);
for (size_t c = 0; c < static_cast<size_t>(weight_shape[0]); c++) {
size_t i = c * 3072;
weight_data[i] = 0.1f;
}

TestInputDef<float> input_def(input_shape, false, input_data);
TestInputDef<float> weight_def(weight_shape, true, weight_data);

RunHTPConvOpPerChannelTest<uint16_t, Int4x2>("Conv",
input_def,
weight_def,
TestInputDef<float>(),
0, // weight quant axis
{1, 1}, // Strides
{0, 0, 0, 0}, // Pads
{1, 1}, // Dilations
1, // default group
"NOTSET",
ExpectedEPNodeAssignment::All,
false, // use_qdq_contrib_ops
21); // opset
}

// Test fusion of DQs -> Conv -> Relu/Clip -> Q.