[QNN EP] Support Qnn MatMul with 2 dynamic inputs which are uint16 quantized (microsoft#18469)

### Description
QNN cannot run MatMul on v68 when both inputs are dynamic and uint16 quantized. Make it run by inserting a Convert op that converts one input to uint8.
HectorSVC authored and kleiti committed Mar 22, 2024
1 parent 0907185 commit ec96215
Showing 3 changed files with 125 additions and 8 deletions.
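
Not part of the commit: a minimal standalone sketch of the re-quantization math that the new InsertConvertOp helper below performs when deriving quant params for the Convert output. It assumes the QNN convention that the stored offset is the negated zero point (real = (q + offset) * scale); the uint16 scale/offset values are hypothetical, and only standard C++ is used, not ORT/QNN APIs.

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical quant params of the uint16 (UFIXED_POINT_16) MatMul input.
  const float in_scale = 0.0015259022f;
  const int32_t in_offset = 0;  // assumed convention: offset = -zero_point

  // Real-valued range representable by the uint16 input (q in [0, 65535]).
  const double value_min = (0.0 + in_offset) * in_scale;
  const double value_max = (65535.0 + in_offset) * in_scale;

  // Re-quantize the same range for the uint8 (UFIXED_POINT_8) Convert output (q in [0, 255]).
  const double out_scale = (value_max - value_min) / 255.0;
  const int32_t out_offset = static_cast<int32_t>(std::round(value_min / out_scale));

  std::printf("Convert output quant params: scale=%g offset=%d\n", out_scale, out_offset);
  return 0;
}
```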
@@ -443,7 +443,6 @@ bool InstanceAndLayerNormalizationNodeGroupSelector::Check(const GraphViewer& gr
}

int32_t dt_input = dq_nodes[0]->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type();
int32_t dt_scale = dq_nodes[1]->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type();
int32_t dt_bias = 0;
bool has_bias = false;
// bias is optional for LayerNorm
@@ -453,9 +452,9 @@ bool InstanceAndLayerNormalizationNodeGroupSelector::Check(const GraphViewer& gr
}
int32_t dt_output = q_nodes[0]->OutputDefs()[0]->TypeAsProto()->tensor_type().elem_type();

// Input, output, and scale need to be the same type. The bias is int32.
// Input and output need to be the same type. The bias is int32.
// Scale can be different with input for a16w8 case
return (dt_input == dt_output) &&
(dt_input == dt_scale) &&
(has_bias ? dt_bias == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32 : true);
}

@@ -22,6 +22,11 @@ class SimpleOpBuilder : public BaseOpBuilder {
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(SimpleOpBuilder);

protected:
Status ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
const logging::Logger& logger,
std::vector<std::string>& input_names,
bool do_op_validation) const override ORT_MUST_USE_RESULT;
Status ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
std::vector<std::string>&& input_names,
@@ -48,6 +53,90 @@ class SimpleOpBuilder : public BaseOpBuilder {
static constexpr std::array<std::string_view, 3> gridsample_supported_padding_modes = {"zeros", "border", "reflection"};
};

// Move to qnn_utils if it's re-usable
Status InsertConvertOp(QnnModelWrapper& qnn_model_wrapper,
const std::string& convert_input_name,
const std::string& convert_output_name,
Qnn_DataType_t input_qnn_data_type,
Qnn_DataType_t output_qnn_data_type,
int32_t input_offset,
float input_scale,
const std::vector<uint32_t>& output_shape,
bool do_op_validation) {
// Assume input is already handled.
float qmin = 0.0f;
float qmax = 255.0f;
ORT_RETURN_IF_ERROR(qnn::utils::GetQminQmax(input_qnn_data_type, qmin, qmax));
double value_min = qnn::utils::Dequantize(input_offset, input_scale, qmin);
double value_max = qnn::utils::Dequantize(input_offset, input_scale, qmax);

Qnn_QuantizeParams_t convert_output_quant_param = QNN_QUANTIZE_PARAMS_INIT;
convert_output_quant_param.encodingDefinition = QNN_DEFINITION_DEFINED;
convert_output_quant_param.quantizationEncoding = QNN_QUANTIZATION_ENCODING_SCALE_OFFSET;
ORT_RETURN_IF_ERROR(qnn::utils::GetQuantParams(static_cast<float>(value_min),
static_cast<float>(value_max),
output_qnn_data_type,
convert_output_quant_param.scaleOffsetEncoding.scale,
convert_output_quant_param.scaleOffsetEncoding.offset));

std::vector<uint32_t> output_shape_copy = output_shape;
QnnTensorWrapper convert_output_tensorwrapper(convert_output_name,
QNN_TENSOR_TYPE_NATIVE,
output_qnn_data_type,
convert_output_quant_param,
std::move(output_shape_copy));
ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(convert_output_tensorwrapper)), "Failed to add tensor.");

ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(convert_output_name,
QNN_OP_PACKAGE_NAME_QTI_AISW,
"Convert",
{convert_input_name},
{convert_output_name},
{},
do_op_validation),
"Failed to add node.");
return Status::OK();
}

Status SimpleOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
const logging::Logger& logger,
std::vector<std::string>& input_names,
bool do_op_validation) const {
const std::string& op_type = node_unit.OpType();
ORT_RETURN_IF_ERROR(BaseOpBuilder::ProcessInputs(qnn_model_wrapper, node_unit, logger, input_names, do_op_validation));

if (op_type == "MatMul") {
const auto& inputs = node_unit.Inputs();
TensorInfo input0_info = {};
TensorInfo input1_info = {};
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[0], input0_info));
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[1], input1_info));
// Need to insert Convert op if both inputs are dynamic inputs and are ufixed_16
if (!input0_info.is_initializer && !input1_info.is_initializer &&
input0_info.qnn_data_type == input1_info.qnn_data_type &&
input0_info.qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16) {
// insert Convert op after input1
std::string convert_input_name = input_names.back();
input_names.pop_back();
const std::string& matmul_output_name = node_unit.Outputs()[0].node_arg.Name();
std::string convert_output_name = convert_input_name + "_convert_" + matmul_output_name;
ORT_RETURN_IF_ERROR(InsertConvertOp(qnn_model_wrapper,
convert_input_name,
convert_output_name,
input1_info.qnn_data_type,
QNN_DATATYPE_UFIXED_POINT_8,
input1_info.quant_param.scaleOffsetEncoding.offset,
input1_info.quant_param.scaleOffsetEncoding.scale,
input1_info.shape,
do_op_validation));
input_names.push_back(convert_output_name);
}
}

return Status::OK();
}
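
Standalone illustration (hypothetical tensor names, not ORT code) of what the MatMul branch above does to input_names: the second entry is replaced by the Convert node's output name, so the MatMul built later in ProcessAttributesAndOutputs consumes the uint8 tensor instead of the original uint16 one.

```cpp
#include <iostream>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> input_names = {"input_A_u16", "input_B_u16"};  // hypothetical names
  const std::string matmul_output_name = "matmul_out";

  // Same renaming as the MatMul branch: swap the last input for the
  // inserted Convert node's output tensor name.
  std::string convert_input_name = input_names.back();
  input_names.pop_back();
  input_names.push_back(convert_input_name + "_convert_" + matmul_output_name);

  for (const auto& name : input_names) {
    std::cout << name << "\n";  // prints: input_A_u16, input_B_u16_convert_matmul_out
  }
  return 0;
}
```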

Status SimpleOpBuilder::ExplicitOpCheck(const NodeUnit& node_unit) const {
const std::string& op_type = node_unit.OpType();

39 changes: 34 additions & 5 deletions onnxruntime/test/providers/qnn/matmul_test.cpp
@@ -142,11 +142,6 @@ TEST_F(QnnHTPBackendTests, MatMulOp_HTP_u8) {
}

// Test QDQ MatMul with 16-bit act, 8-bit weights (static)
// TODO: (SLIGHT) Inaccuracy detected for output 'output', element 0.
// Output quant params: scale=0.0015259021893143654, zero_point=0.
// Expected val: 98
// QNN QDQ val: 97.720298767089844 (err 0.27970123291015625)
// CPU QDQ val: 97.726402282714844 (err 0.27359771728515625)
TEST_F(QnnHTPBackendTests, MatMulOp_HTP_A16_W8Static) {
std::vector<float> input0_data = {-10.0f, -4.0f, -2.0f, 0.0f, 5.0f, 10.0f};
std::vector<float> input1_data = {-10.0f, -6.0f, -1.0f, 0.0f, 3.0f, 10.0f};
@@ -158,6 +153,40 @@ TEST_F(QnnHTPBackendTests, MatMulOp_HTP_A16_W8Static) {
7e-3f);
}

// Test QDQ MatMul with uint16 activation uint16 weights, both dynamic
// Inaccuracy detected for output 'output_0', element 1.
// Output quant params: scale=0.0015259021893143654, zero_point=0.
// Expected val: 40
// QNN QDQ val: 39.681087493896484 (err 0.31891250610351562)
// CPU QDQ val: 39.99847412109375 (err 0.00152587890625)
TEST_F(QnnHTPBackendTests, DISABLED_MatMulOp_HTP_A16_W16Dynamic) {
std::vector<float> input0_data = {-10.0f, -4.0f, -2.0f, 0.0f, 5.0f, 10.0f};
std::vector<float> input1_data = {-10.0f, -6.0f, -1.0f, 0.0f, 3.0f, 10.0f};
RunQDQMatMulOpOpTest<uint16_t, uint16_t, uint16_t>(TestInputDef<float>({2, 3}, false, input0_data),
TestInputDef<float>({3, 2}, false, input1_data),
ExpectedEPNodeAssignment::All,
18,
true, // Use com.microsoft Q/DQ ops
7e-3f);
}

// Test QDQ MatMul with uint16 activation uint16 weights, both dynamic
// Inaccuracy detected for output 'output_0', element 1.
// Output quant params: scale=0.71908456087112427, zero_point=1.
// Expected val: 46848.41015625
// QNN QDQ val: 46844.04296875 (err 4.3671875)
// CPU QDQ val: 46848.359375 (err 0.05078125)
TEST_F(QnnHTPBackendTests, DISABLED_MatMulOp_HTP_A16_W16DynamicLarge) {
std::vector<float> input0_data = GetFloatDataInRange(-10.0f, 10.0f, 12 * 96 * 512);
std::vector<float> input1_data = GetFloatDataInRange(-10.0f, 10.0f, 12 * 96 * 512);
RunQDQMatMulOpOpTest<uint16_t, uint16_t, uint16_t>(TestInputDef<float>({1, 12, 96, 512}, false, input0_data),
TestInputDef<float>({1, 12, 512, 96}, false, input1_data),
ExpectedEPNodeAssignment::All,
18,
true, // Use com.microsoft Q/DQ ops
7e-3f);
}

// Test 16-bit QDQ MatMul with static weights
// TODO: Inaccuracy detected for output 'output', element 0.
// Output quant params: scale=0.0015259021893143654, zero_point=0.
