Set enable_htp_fp16_precision default to true (#22186)
### Description
Set enable_htp_fp16_precision default to true for HTP backend.
HectorSVC authored Sep 24, 2024
1 parent 209ff86 commit 5fa4505
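Since this commit flips the default, a float32 model now runs with fp16 precision on the HTP backend unless the option is set explicitly. A minimal sketch of opting back into fp32, assuming the ONNX Runtime C++ API and a QNN-enabled build (the backend library name and model path are placeholders, not part of this change):

```cpp
// Illustrative only (not part of this commit): pin enable_htp_fp16_precision explicitly
// so a float32 model keeps fp32 precision after the default change.
#include <onnxruntime_cxx_api.h>

#include <string>
#include <unordered_map>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "qnn_fp16_default");
  Ort::SessionOptions session_options;

  // QNN EP options are passed as string key/value pairs.
  std::unordered_map<std::string, std::string> qnn_options{
      {"backend_path", "QnnHtp.dll"},        // HTP backend library (platform-specific placeholder)
      {"enable_htp_fp16_precision", "0"}};   // "0" = keep fp32; omitting the key now means "1" (fp16)

  session_options.AppendExecutionProvider("QNN", qnn_options);

  Ort::Session session(env, ORT_TSTR("model.onnx"), session_options);  // placeholder model path
  return 0;
}
```

For the test tools touched below (onnx_test_runner, onnxruntime_perf_test, qnn_ctx_gen), the same opt-out can be passed through their option strings, e.g. `enable_htp_fp16_precision|0` alongside the existing `backend_path` entry.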
Showing 10 changed files with 50 additions and 17 deletions.
6 changes: 3 additions & 3 deletions include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -3650,10 +3650,10 @@ struct OrtApi {
* - "73"
* - "75"
* "device_id": The ID of the device to use when setting 'htp_arch'. Defaults to "0" (for single device).
"enable_htp_fp16_precision": Only used for float32 model.
"enable_htp_fp16_precision": Used for float32 model for HTP backend.
Enable the float32 model to be inferenced with fp16 precision. Otherwise, it will be fp32 precision.
- "0": Default. With fp32 precision.
- "1": With fp16 precision.
- "0": With fp32 precision.
- "1": Default. With fp16 precision.
"enable_htp_weight_sharing": Enable QNN weight sharing feature while compiling multiple graphs into one QNN context.
- "0": Default. Disabled.
- "1": Enabled.
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/qnn/qnn_execution_provider.h
@@ -142,7 +142,7 @@ class QNNExecutionProvider : public IExecutionProvider {
uint32_t device_id_ = 0;
qnn::HtpPerformanceMode default_htp_performance_mode_ = qnn::HtpPerformanceMode::kHtpDefault;
uint32_t default_rpc_control_latency_ = 0;
- bool enable_HTP_FP16_precision_ = false;
+ bool enable_HTP_FP16_precision_ = true;
bool share_ep_contexts_ = false;
#ifdef _WIN32
onnxruntime::logging::EtwRegistrationManager::EtwInternalCallback callback_ETWSink_provider_;
2 changes: 1 addition & 1 deletion onnxruntime/test/onnx/main.cc
@@ -73,7 +73,7 @@ void usage() {
"\t Options are '0', '68', '69', '73', '75'. Defaults to '0' (none). \n"
"\t [QNN only] [device_id]: The ID of the device to use when setting 'htp_arch'. Defaults to '0' (for single device). \n"
"\t [QNN only] [enable_htp_fp16_precision]: Enable the HTP_FP16 precision so that the float32 model will be inferenced with fp16 precision. \n"
"\t Otherwise, it will be fp32 precision. Only works for float32 model. Defaults to '0' (with FP32 precision.). \n"
"\t Otherwise, it will be fp32 precision. Works for float32 model for HTP backend. Defaults to '1' (with FP16 precision.). \n"
"\t [Usage]: -e <provider_name> -i '<key1>|<value1> <key2>|<value2>' \n\n"
"\t [Example] [For QNN EP] -e qnn -i \"profiling_level|detailed backend_path|/folderpath/libQnnCpu.so\" \n\n"
"\t [SNPE only] [runtime]: SNPE runtime, options: 'CPU', 'GPU', 'GPU_FLOAT16', 'DSP', 'AIP_FIXED_TF'. \n"
2 changes: 1 addition & 1 deletion onnxruntime/test/perftest/command_args_parser.cc
@@ -98,7 +98,7 @@ namespace perftest {
"\t Options are '0', '68', '69', '73', '75'. Defaults to '0' (none). \n"
"\t [QNN only] [device_id]: The ID of the device to use when setting 'htp_arch'. Defaults to '0' (for single device). \n"
"\t [QNN only] [enable_htp_fp16_precision]: Enable the HTP_FP16 precision so that the float32 model will be inferenced with fp16 precision. \n"
"\t Otherwise, it will be fp32 precision. Only works for float32 model. Defaults to '0' (with FP32 precision.). \n"
"\t Otherwise, it will be fp32 precision. Works for float32 model for HTP backend. Defaults to '1' (with FP16 precision.). \n"
"\t [Example] [For QNN EP] -e qnn -i \"backend_path|/folderpath/libQnnCpu.so\" \n"
"\n"
"\t [TensorRT only] [trt_max_partition_iterations]: Maximum iterations for TensorRT parser to get capability.\n"
15 changes: 11 additions & 4 deletions onnxruntime/test/providers/qnn/cast_test.cc
@@ -49,14 +49,21 @@ static GetTestModelFn BuildCastTestCase(const std::vector<int64_t>& shape,
template <typename InputType>
static void RunCastOpTest(const std::vector<int64_t>& shape, ONNX_NAMESPACE::TensorProto_DataType dst_type,
ExpectedEPNodeAssignment expected_ep_assignment,
- bool use_htp) {
+ bool use_htp,
+ bool enable_fp16_precision = true) {
ProviderOptions provider_options;
#if defined(_WIN32)
provider_options["backend_path"] = use_htp ? "QnnHtp.dll" : "QnnCpu.dll";
#else
provider_options["backend_path"] = use_htp ? "libQnnHtp.so" : "libQnnCpu.so";
#endif

+ if (use_htp && enable_fp16_precision) {
+   provider_options["enable_htp_fp16_precision"] = "1";
+ } else {
+   provider_options["enable_htp_fp16_precision"] = "0";
+ }

RunQnnModelTest(BuildCastTestCase<InputType>(shape, dst_type),
provider_options,
13, // opset
@@ -93,19 +100,19 @@ TEST_F(QnnCPUBackendTests, TestCastFloatToInt32) {
// Cast int32_t to float on HTP
TEST_F(QnnHTPBackendTests, TestCastInt32ToFloatHTP) {
RunCastOpTest<int32_t>({3, 3}, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT, ExpectedEPNodeAssignment::All,
- true);
+ true, false);
}

// Cast uint8_t to float on HTP
TEST_F(QnnHTPBackendTests, TestCastUInt8ToFloatHTP) {
RunCastOpTest<uint8_t>({3, 3}, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT, ExpectedEPNodeAssignment::All,
- true);
+ true, false);
}

// Cast float to int32_t on HTP
TEST_F(QnnHTPBackendTests, TestCastFloatToInt32HTP) {
RunCastOpTest<float>({3, 3}, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32, ExpectedEPNodeAssignment::All,
- true);
+ true, false);
}

// Cast int64_t to int32_t on HTP
13 changes: 11 additions & 2 deletions onnxruntime/test/providers/qnn/clip_op_test.cc
@@ -21,7 +21,8 @@ static void RunClipTest(const TestInputDef<DataType>& input_def,
const std::vector<TestInputDef<DataType>>& min_max_defs,
ExpectedEPNodeAssignment expected_ep_assignment,
bool on_cpu_backend = true,
- int opset = 13) {
+ int opset = 13,
+ bool enable_fp16_precision = true) {
ProviderOptions provider_options;

#if defined(_WIN32)
@@ -30,6 +31,12 @@ static void RunClipTest(const TestInputDef<DataType>& input_def,
provider_options["backend_path"] = on_cpu_backend ? "libQnnCpu.so" : "libQnnHtp.so";
#endif

+ if (!on_cpu_backend && enable_fp16_precision) {
+   provider_options["enable_htp_fp16_precision"] = "1";
+ } else {
+   provider_options["enable_htp_fp16_precision"] = "0";
+ }

RunQnnModelTest(BuildOpTestCase<DataType, DataType>("Clip", {input_def}, min_max_defs, {}),
provider_options,
opset,
@@ -80,7 +87,9 @@ TEST_F(QnnHTPBackendTests, Clip_f32) {
{TestInputDef<float>({}, true, {-5.0f}),
TestInputDef<float>({}, true, {5.0f})},
ExpectedEPNodeAssignment::All,
- on_cpu_backend);
+ on_cpu_backend,
+ 13,
+ false);
}

// Test Clip with int32 on HTP
12 changes: 10 additions & 2 deletions onnxruntime/test/providers/qnn/matmul_test.cpp
@@ -117,14 +117,21 @@ static void RunQDQPerChannelMatMulOpOpTest(const TestInputDef<float>& input_def,
ExpectedEPNodeAssignment expected_ep_assignment,
int opset = 21,
bool use_contrib_qdq = false,
- QDQTolerance tolerance = QDQTolerance()) {
+ QDQTolerance tolerance = QDQTolerance(),
+ bool enable_fp16_precision = true) {
ProviderOptions provider_options;
#if defined(_WIN32)
provider_options["backend_path"] = "QnnHtp.dll";
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif

+ if (enable_fp16_precision) {
+   provider_options["enable_htp_fp16_precision"] = "1";
+ } else {
+   provider_options["enable_htp_fp16_precision"] = "0";
+ }

TestQDQModelAccuracy(BuildMatMulOpTestCase(input_def, weights_def),
BuildQDQPerChannelMatMulTestCase<Input0QType, WeightQType, OutputQType>(input_def,
weights_def,
@@ -275,7 +282,8 @@ TEST_F(QnnHTPBackendTests, MatMulOp_PerChannel_AS8_WeightInt4) {
ExpectedEPNodeAssignment::All,
21,
false,
- QDQTolerance(0.007f));
+ QDQTolerance(0.007f),
+ false);
}

// Test QDQ per-channel MatMul with 16-bit act, int8 weights (static)
2 changes: 2 additions & 0 deletions onnxruntime/test/providers/qnn/simple_op_htp_test.cc
@@ -157,6 +157,8 @@ static void RunOpTest(const std::string& op_type,

if (enable_htp_fp16_precision) {
provider_options["enable_htp_fp16_precision"] = "1";
+ } else {
+   provider_options["enable_htp_fp16_precision"] = "0";  // enabled in QNN EP by default
}

// Runs model with a Q/DQ binary op and compares the outputs of the CPU and QNN EPs.
11 changes: 9 additions & 2 deletions onnxruntime/test/providers/qnn/transpose_htp_test.cc
@@ -90,14 +90,21 @@ static void RunTransposeQDQTest(const TestInputDef<float>& input_def,
template <typename DataType>
static void RunTransposeNonQDQOnHTP(const TestInputDef<DataType>& input_def,
const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
- ExpectedEPNodeAssignment expected_ep_assignment) {
+ ExpectedEPNodeAssignment expected_ep_assignment,
+ bool enable_fp16_precision = true) {
ProviderOptions provider_options;
#if defined(_WIN32)
provider_options["backend_path"] = "QnnHtp.dll";
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif

+ if (enable_fp16_precision) {
+   provider_options["enable_htp_fp16_precision"] = "1";
+ } else {
+   provider_options["enable_htp_fp16_precision"] = "0";
+ }

RunQnnModelTest(BuildTransposeTestCase<DataType>(input_def, attrs),
provider_options,
13,
@@ -123,7 +130,7 @@ TEST_F(QnnHTPBackendTests, TransposeInt32OnHTP) {
TEST_F(QnnHTPBackendTests, TransposeFloatOnHTP) {
RunTransposeNonQDQOnHTP<float>(TestInputDef<float>({1, 3, 224, 128}, false, 0, 10.0f),
{utils::MakeAttribute("perm", std::vector<int64_t>{0, 2, 3, 1})},
- ExpectedEPNodeAssignment::All);
+ ExpectedEPNodeAssignment::All, false);
}

#endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
2 changes: 1 addition & 1 deletion onnxruntime/test/qnn_ctx_gen/command_args_parser.cc
@@ -46,7 +46,7 @@ namespace qnnctxgen {
"\t [soc_model]: The SoC Model number. Refer to QNN SDK documentation for specific values. Defaults to '0' (unknown). \n"
"\t [htp_arch]: The minimum HTP architecture. The driver will use ops compatible with this architecture. eg: '0', '68', '69', '73', '75'. Defaults to '0' (none). \n"
"\t [enable_htp_fp16_precision]: Enable the HTP_FP16 precision so that the float32 model will be inferenced with fp16 precision. \n"
"\t Otherwise, it will be fp32 precision. Only works for float32 model. Defaults to '0' (with FP32 precision.). \n"
"\t Otherwise, it will be fp32 precision. Works for float32 model for HTP backend. Defaults to '1' (with FP16 precision.). \n"
"\t [enable_htp_weight_sharing]: Allows common weights across graphs to be shared and stored in a single context binary. Defaults to '1' (enabled).\n"
"\t [Example] -i \"vtcm_mb|8 htp_arch|73\" \n"
"\n"
