diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index a4ec66761c4ba..3aa98bb020452 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -3650,10 +3650,10 @@ struct OrtApi { * - "73" * - "75" * "device_id": The ID of the device to use when setting 'htp_arch'. Defaults to "0" (for single device). - "enable_htp_fp16_precision": Only used for float32 model. + "enable_htp_fp16_precision": Used for float32 model for HTP backend. Enable the float32 model to be inferenced with fp16 precision. Otherwise, it will be fp32 precision. - - "0": Default. With fp32 precision. - - "1": With fp16 precision. + - "0": With fp32 precision. + - "1": Default. With fp16 precision. "enable_htp_weight_sharing": Enable QNN weight sharing feature while compiling multiple graphs into one QNN context. - "0": Default. Disabled. - "1": Enabled. diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h index 9cd73edbff0e0..ac9098f907975 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h @@ -142,7 +142,7 @@ class QNNExecutionProvider : public IExecutionProvider { uint32_t device_id_ = 0; qnn::HtpPerformanceMode default_htp_performance_mode_ = qnn::HtpPerformanceMode::kHtpDefault; uint32_t default_rpc_control_latency_ = 0; - bool enable_HTP_FP16_precision_ = false; + bool enable_HTP_FP16_precision_ = true; bool share_ep_contexts_ = false; #ifdef _WIN32 onnxruntime::logging::EtwRegistrationManager::EtwInternalCallback callback_ETWSink_provider_; diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index 924616f49ab25..e8c948ade1068 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -73,7 +73,7 @@ void usage() { "\t Options are '0', '68', '69', '73', '75'. Defaults to '0' (none). \n" "\t [QNN only] [device_id]: The ID of the device to use when setting 'htp_arch'. Defaults to '0' (for single device). \n" "\t [QNN only] [enable_htp_fp16_precision]: Enable the HTP_FP16 precision so that the float32 model will be inferenced with fp16 precision. \n" - "\t Otherwise, it will be fp32 precision. Only works for float32 model. Defaults to '0' (with FP32 precision.). \n" + "\t Otherwise, it will be fp32 precision. Works for float32 model for HTP backend. Defaults to '1' (with FP16 precision.). \n" "\t [Usage]: -e -i '| |' \n\n" "\t [Example] [For QNN EP] -e qnn -i \"profiling_level|detailed backend_path|/folderpath/libQnnCpu.so\" \n\n" "\t [SNPE only] [runtime]: SNPE runtime, options: 'CPU', 'GPU', 'GPU_FLOAT16', 'DSP', 'AIP_FIXED_TF'. \n" diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index c1c48d4945a4d..6e811f4596eab 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -98,7 +98,7 @@ namespace perftest { "\t Options are '0', '68', '69', '73', '75'. Defaults to '0' (none). \n" "\t [QNN only] [device_id]: The ID of the device to use when setting 'htp_arch'. Defaults to '0' (for single device). \n" "\t [QNN only] [enable_htp_fp16_precision]: Enable the HTP_FP16 precision so that the float32 model will be inferenced with fp16 precision. \n" - "\t Otherwise, it will be fp32 precision. Only works for float32 model. Defaults to '0' (with FP32 precision.). \n" + "\t Otherwise, it will be fp32 precision. Works for float32 model for HTP backend. Defaults to '1' (with FP16 precision.). \n" "\t [Example] [For QNN EP] -e qnn -i \"backend_path|/folderpath/libQnnCpu.so\" \n" "\n" "\t [TensorRT only] [trt_max_partition_iterations]: Maximum iterations for TensorRT parser to get capability.\n" diff --git a/onnxruntime/test/providers/qnn/cast_test.cc b/onnxruntime/test/providers/qnn/cast_test.cc index f03782c33c30a..9b83dd281a56d 100644 --- a/onnxruntime/test/providers/qnn/cast_test.cc +++ b/onnxruntime/test/providers/qnn/cast_test.cc @@ -49,7 +49,8 @@ static GetTestModelFn BuildCastTestCase(const std::vector& shape, template static void RunCastOpTest(const std::vector& shape, ONNX_NAMESPACE::TensorProto_DataType dst_type, ExpectedEPNodeAssignment expected_ep_assignment, - bool use_htp) { + bool use_htp, + bool enable_fp16_precision = true) { ProviderOptions provider_options; #if defined(_WIN32) provider_options["backend_path"] = use_htp ? "QnnHtp.dll" : "QnnCpu.dll"; @@ -57,6 +58,12 @@ static void RunCastOpTest(const std::vector& shape, ONNX_NAMESPACE::Ten provider_options["backend_path"] = use_htp ? "libQnnHtp.so" : "libQnnCpu.so"; #endif + if (use_htp && enable_fp16_precision) { + provider_options["enable_htp_fp16_precision"] = "1"; + } else { + provider_options["enable_htp_fp16_precision"] = "0"; + } + RunQnnModelTest(BuildCastTestCase(shape, dst_type), provider_options, 13, // opset @@ -93,19 +100,19 @@ TEST_F(QnnCPUBackendTests, TestCastFloatToInt32) { // Cast int32_t to float on HTP TEST_F(QnnHTPBackendTests, TestCastInt32ToFloatHTP) { RunCastOpTest({3, 3}, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT, ExpectedEPNodeAssignment::All, - true); + true, false); } // Cast uint8_t to float on HTP TEST_F(QnnHTPBackendTests, TestCastUInt8ToFloatHTP) { RunCastOpTest({3, 3}, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT, ExpectedEPNodeAssignment::All, - true); + true, false); } // Cast float to int32_t on HTP TEST_F(QnnHTPBackendTests, TestCastFloatToInt32HTP) { RunCastOpTest({3, 3}, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32, ExpectedEPNodeAssignment::All, - true); + true, false); } // Cast int64_t to int32_t on HTP diff --git a/onnxruntime/test/providers/qnn/clip_op_test.cc b/onnxruntime/test/providers/qnn/clip_op_test.cc index c3a75fd7446e2..cfa77a46210b3 100644 --- a/onnxruntime/test/providers/qnn/clip_op_test.cc +++ b/onnxruntime/test/providers/qnn/clip_op_test.cc @@ -21,7 +21,8 @@ static void RunClipTest(const TestInputDef& input_def, const std::vector>& min_max_defs, ExpectedEPNodeAssignment expected_ep_assignment, bool on_cpu_backend = true, - int opset = 13) { + int opset = 13, + bool enable_fp16_precision = true) { ProviderOptions provider_options; #if defined(_WIN32) @@ -30,6 +31,12 @@ static void RunClipTest(const TestInputDef& input_def, provider_options["backend_path"] = on_cpu_backend ? "libQnnCpu.so" : "libQnnHtp.so"; #endif + if (!on_cpu_backend && enable_fp16_precision) { + provider_options["enable_htp_fp16_precision"] = "1"; + } else { + provider_options["enable_htp_fp16_precision"] = "0"; + } + RunQnnModelTest(BuildOpTestCase("Clip", {input_def}, min_max_defs, {}), provider_options, opset, @@ -80,7 +87,9 @@ TEST_F(QnnHTPBackendTests, Clip_f32) { {TestInputDef({}, true, {-5.0f}), TestInputDef({}, true, {5.0f})}, ExpectedEPNodeAssignment::All, - on_cpu_backend); + on_cpu_backend, + 13, + false); } // Test Clip with int32 on HTP diff --git a/onnxruntime/test/providers/qnn/matmul_test.cpp b/onnxruntime/test/providers/qnn/matmul_test.cpp index d8c34d6a6c6ed..708aac03ceb2e 100644 --- a/onnxruntime/test/providers/qnn/matmul_test.cpp +++ b/onnxruntime/test/providers/qnn/matmul_test.cpp @@ -117,7 +117,8 @@ static void RunQDQPerChannelMatMulOpOpTest(const TestInputDef& input_def, ExpectedEPNodeAssignment expected_ep_assignment, int opset = 21, bool use_contrib_qdq = false, - QDQTolerance tolerance = QDQTolerance()) { + QDQTolerance tolerance = QDQTolerance(), + bool enable_fp16_precision = true) { ProviderOptions provider_options; #if defined(_WIN32) provider_options["backend_path"] = "QnnHtp.dll"; @@ -125,6 +126,12 @@ static void RunQDQPerChannelMatMulOpOpTest(const TestInputDef& input_def, provider_options["backend_path"] = "libQnnHtp.so"; #endif + if (enable_fp16_precision) { + provider_options["enable_htp_fp16_precision"] = "1"; + } else { + provider_options["enable_htp_fp16_precision"] = "0"; + } + TestQDQModelAccuracy(BuildMatMulOpTestCase(input_def, weights_def), BuildQDQPerChannelMatMulTestCase(input_def, weights_def, @@ -275,7 +282,8 @@ TEST_F(QnnHTPBackendTests, MatMulOp_PerChannel_AS8_WeightInt4) { ExpectedEPNodeAssignment::All, 21, false, - QDQTolerance(0.007f)); + QDQTolerance(0.007f), + false); } // Test QDQ per-channel MatMul with 16-bit act, int8 weights (static) diff --git a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc index 2ebc2c6251b44..83899ec6ef17b 100644 --- a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc +++ b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc @@ -157,6 +157,8 @@ static void RunOpTest(const std::string& op_type, if (enable_htp_fp16_precision) { provider_options["enable_htp_fp16_precision"] = "1"; + } else { + provider_options["enable_htp_fp16_precision"] = "0"; // enabled in QNN EP by default } // Runs model with a Q/DQ binary op and compares the outputs of the CPU and QNN EPs. diff --git a/onnxruntime/test/providers/qnn/transpose_htp_test.cc b/onnxruntime/test/providers/qnn/transpose_htp_test.cc index 119b8301f36ed..63746e22d214d 100644 --- a/onnxruntime/test/providers/qnn/transpose_htp_test.cc +++ b/onnxruntime/test/providers/qnn/transpose_htp_test.cc @@ -90,7 +90,8 @@ static void RunTransposeQDQTest(const TestInputDef& input_def, template static void RunTransposeNonQDQOnHTP(const TestInputDef& input_def, const std::vector& attrs, - ExpectedEPNodeAssignment expected_ep_assignment) { + ExpectedEPNodeAssignment expected_ep_assignment, + bool enable_fp16_precision = true) { ProviderOptions provider_options; #if defined(_WIN32) provider_options["backend_path"] = "QnnHtp.dll"; @@ -98,6 +99,12 @@ static void RunTransposeNonQDQOnHTP(const TestInputDef& input_def, provider_options["backend_path"] = "libQnnHtp.so"; #endif + if (enable_fp16_precision) { + provider_options["enable_htp_fp16_precision"] = "1"; + } else { + provider_options["enable_htp_fp16_precision"] = "0"; + } + RunQnnModelTest(BuildTransposeTestCase(input_def, attrs), provider_options, 13, @@ -123,7 +130,7 @@ TEST_F(QnnHTPBackendTests, TransposeInt32OnHTP) { TEST_F(QnnHTPBackendTests, TransposeFloatOnHTP) { RunTransposeNonQDQOnHTP(TestInputDef({1, 3, 224, 128}, false, 0, 10.0f), {utils::MakeAttribute("perm", std::vector{0, 2, 3, 1})}, - ExpectedEPNodeAssignment::All); + ExpectedEPNodeAssignment::All, false); } #endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) diff --git a/onnxruntime/test/qnn_ctx_gen/command_args_parser.cc b/onnxruntime/test/qnn_ctx_gen/command_args_parser.cc index 509f56664e572..102846e08ac5f 100644 --- a/onnxruntime/test/qnn_ctx_gen/command_args_parser.cc +++ b/onnxruntime/test/qnn_ctx_gen/command_args_parser.cc @@ -46,7 +46,7 @@ namespace qnnctxgen { "\t [soc_model]: The SoC Model number. Refer to QNN SDK documentation for specific values. Defaults to '0' (unknown). \n" "\t [htp_arch]: The minimum HTP architecture. The driver will use ops compatible with this architecture. eg: '0', '68', '69', '73', '75'. Defaults to '0' (none). \n" "\t [enable_htp_fp16_precision]: Enable the HTP_FP16 precision so that the float32 model will be inferenced with fp16 precision. \n" - "\t Otherwise, it will be fp32 precision. Only works for float32 model. Defaults to '0' (with FP32 precision.). \n" + "\t Otherwise, it will be fp32 precision. Works for float32 model for HTP backend. Defaults to '1' (with FP16 precision.). \n" "\t [enable_htp_weight_sharing]: Allows common weights across graphs to be shared and stored in a single context binary. Defaults to '1' (enabled).\n" "\t [Example] -i \"vtcm_mb|8 htp_arch|73\" \n" "\n"