Move some QNN EP provider options to session options #18877

Merged
9 commits merged on Dec 20, 2023
6 changes: 0 additions & 6 deletions include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -3593,17 +3593,11 @@ struct OrtApi {
*
* QNN supported keys:
* "backend_path": file path to QNN backend library.
* "qnn_context_cache_enable": 1 to enable QNN graph creation from cached QNN context file. If it's enabled: QNN EP will
* load from cached QNN context binary if it exist. It will generate a context binary file if it's not exist
* "qnn_context_cache_path": explicitly provide the QNN context cache file. Default to model_file.onnx.bin if not provided.
* "profiling_level": QNN profiling level, options: "off", "basic", "detailed". Default to off.
* "rpc_control_latency": QNN RPC control latency.
* "vtcm_mb": QNN VTCM size in MB. default to 0(not set).
* "htp_performance_mode": QNN performance mode, options: "burst", "balanced", "default", "high_performance",
* "high_power_saver", "low_balanced", "low_power_saver", "power_saver", "sustained_high_performance". Default to "default".
* "qnn_context_embed_mode", 1 means dump the QNN context binary into node attribute EPContext->ep_cache_context in the ONNX skeleton model.
* 0 means dump the QNN context binary into separate bin file and set the path to EPContext->ep_cache_context.
* The path is relative path to the ONNX skeleton model file.
* "qnn_saver_path": File path to the QNN Saver backend library. If specified, QNN Saver will be enabled and will
* dump QNN API calls to disk for replay/debugging. QNN Saver produces incorrect model inference results and
* may alter model/EP partitioning. Use only for debugging.
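A minimal sketch (not part of this diff) of how these QNN provider options are typically passed through the ONNX Runtime C++ API; the backend library path, option values, and model path below are illustrative placeholders:

#include <string>
#include <unordered_map>
#include "onnxruntime_cxx_api.h"

// Append the QNN EP with provider-level options, then create a session.
void CreateQnnSessionExample(Ort::Env& env) {
  std::unordered_map<std::string, std::string> qnn_options;
  qnn_options["backend_path"] = "/path/to/libQnnHtp.so";  // placeholder backend library path
  qnn_options["profiling_level"] = "off";                 // optional, defaults to "off"
  qnn_options["htp_performance_mode"] = "burst";          // optional, defaults to "default"

  Ort::SessionOptions so;
  so.AppendExecutionProvider("QNN", qnn_options);
  Ort::Session session(env, ORT_TSTR("model.onnx"), so);  // "model.onnx" is a placeholder
}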
include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -235,3 +235,18 @@
// Use this config to control the minimum size of the initializer when externalizing it during serialization
static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes =
"session.optimized_model_external_initializers_min_size_in_bytes";

// Enable the EP context feature to dump the partitioned graph, which includes the EP context, into an Onnx file.
// The dumped Onnx model with EP context can be used for future inference to avoid the EP graph partitioning/compilation overhead.

[cpplint warning] include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h:240: Lines should be <= 120 characters long [whitespace/line_length] [2]
// "0": disable. (default)
// "1": enable.
static const char* const kOrtSessionOptionEpContextEnable = "ep.context_enable";

// Specify the file path for the Onnx model which has EP context.
// Default to original_file_name_ctx.onnx if not specified
static const char* const kOrtSessionOptionEpContextFilePath = "ep.context_file_path";

// Flag to specify whether to dump the EP context into the Onnx model.
// "0": dump the EP context into separate file, keep the file name in the Onnx model.
// "1": dump the EP context into the Onnx model. (default).
static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.context_embed_mode";

[cpplint warning] include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h:252: Could not find a newline character at the end of the file. [whitespace/ending_newline] [5]
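A minimal usage sketch (not part of this diff) of the new session option keys defined above, set through the C++ API before appending the QNN EP; the file paths below are illustrative placeholders:

#include <string>
#include <unordered_map>
#include "onnxruntime_cxx_api.h"
#include "onnxruntime_session_options_config_keys.h"

// Ask ORT to dump an EP-context model that can be reloaded later without recompiling the QNN graph.
void DumpEpContextModelExample(Ort::Env& env) {
  Ort::SessionOptions so;
  so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");                   // enable EP context dump
  so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, "./model_ctx.onnx");  // optional output path (placeholder)
  so.AddConfigEntry(kOrtSessionOptionEpContextEmbedMode, "0");                // keep the context binary in a separate file

  std::unordered_map<std::string, std::string> qnn_options{{"backend_path", "/path/to/libQnnHtp.so"}};  // placeholder
  so.AppendExecutionProvider("QNN", qnn_options);
  Ort::Session session(env, ORT_TSTR("model.onnx"), so);  // session creation writes ./model_ctx.onnx
}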
@@ -160,7 +160,7 @@ bool IsContextCacheFileExists(const std::string& customer_context_cache_path,
if (!customer_context_cache_path.empty()) {
context_cache_path = ToPathString(customer_context_cache_path);
} else if (!model_pathstring.empty()) {
context_cache_path = model_pathstring + ToPathString("_qnn_ctx.onnx");
context_cache_path = model_pathstring + ToPathString("_ctx.onnx");
}

return std::filesystem::is_regular_file(context_cache_path) && std::filesystem::exists(context_cache_path);
34 changes: 14 additions & 20 deletions onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -114,29 +114,23 @@
if (session_options) {
disable_cpu_ep_fallback_ = session_options->config_options.GetConfigOrDefault(
kOrtSessionOptionsDisableCPUEPFallback, "0") == "1";
}

static const std::string CONTEXT_CACHE_ENABLED = "qnn_context_cache_enable";
auto context_cache_enabled_pos = provider_options_map.find(CONTEXT_CACHE_ENABLED);
if (context_cache_enabled_pos != provider_options_map.end()) {
if (context_cache_enabled_pos->second == "1") {
context_cache_enabled_ = true;
LOGS_DEFAULT(VERBOSE) << "Context cache enabled.";
}
}

static const std::string CONTEXT_CACHE_PATH = "qnn_context_cache_path";
auto context_cache_path_pos = provider_options_map.find(CONTEXT_CACHE_PATH);
if (context_cache_path_pos != provider_options_map.end()) {
context_cache_path_cfg_ = context_cache_path_pos->second;
LOGS_DEFAULT(VERBOSE) << "User specified context cache path: " << context_cache_path_cfg_;
}
context_cache_enabled_ = session_options->config_options.GetConfigOrDefault(
kOrtSessionOptionEpContextEnable, "0") == "1";
LOGS_DEFAULT(VERBOSE) << "Context cache enable: " << context_cache_enabled_;

static const std::string CONTEXT_CACHE_EMBED_MODE = "qnn_context_embed_mode";
auto context_cache_embed_mode_pos = provider_options_map.find(CONTEXT_CACHE_EMBED_MODE);
if (context_cache_embed_mode_pos != provider_options_map.end()) {
qnn_context_embed_mode_ = context_cache_embed_mode_pos->second == "1";
std::string embed_mode = session_options->config_options.GetConfigOrDefault(
kOrtSessionOptionEpContextEmbedMode, "1");
if ("1" == embed_mode) {
qnn_context_embed_mode_ = true;
} else if ("0" == embed_mode) {
qnn_context_embed_mode_ = false;
} else {
LOGS_DEFAULT(VERBOSE) << "Invalid ep.context_embed_mode: " << embed_mode << " only 0 or 1 allowed. Set to 1.";
}
LOGS_DEFAULT(VERBOSE) << "User specified context cache embed mode: " << qnn_context_embed_mode_;

context_cache_path_cfg_ = session_options->config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");

[cpplint warning] onnxruntime/core/providers/qnn/qnn_execution_provider.cc:133: Lines should be <= 120 characters long [whitespace/line_length] [2]
}

static const std::string BACKEND_PATH = "backend_path";
30 changes: 18 additions & 12 deletions onnxruntime/test/onnx/main.cc
@@ -50,15 +50,12 @@ void usage() {
"\t-a: Specify custom absolute tolerance values for output value comparison. default: 1e-5\n"
"\t-i: Specify EP specific runtime options as key value pairs. Different runtime options available are: \n"
"\t [QNN only] [backend_path]: QNN backend path. e.g '/folderpath/libQnnHtp.so', '/folderpath/libQnnCpu.so'.\n"
"\t [QNN only] [qnn_context_cache_enable]: 1 to enable cache QNN context. Default to false.\n"
"\t [QNN only] [qnn_context_cache_path]: File path to the qnn context cache. Default to model_file.onnx.bin if not set.\n"
"\t [QNN only] [profiling_level]: QNN profiling level, options: 'basic', 'detailed', default 'off'.\n"
"\t [QNN only] [rpc_control_latency]: QNN rpc control latency. default to 10.\n"
"\t [QNN only] [vtcm_mb]: QNN VTCM size in MB. default to 0(not set).\n"
"\t [QNN only] [htp_performance_mode]: QNN performance mode, options: 'burst', 'balanced', 'default', 'high_performance', \n"
"\t 'high_power_saver', 'low_balanced', 'low_power_saver', 'power_saver', 'sustained_high_performance'. Default to 'default'. \n"
"\t [QNN only] [qnn_context_priority]: QNN context priority, options: 'low', 'normal', 'normal_high', 'high'. Default to 'normal'. \n"
"\t [QNN only] [qnn_context_embed_mode]: 1 means dump the QNN context binary into the Onnx skeleton model.\n"
"\t 0 means dump the QNN context binary into separate bin file and set the path in the Onnx skeleton model.\n"
"\t [QNN only] [qnn_saver_path]: QNN Saver backend path. e.g '/folderpath/libQnnSaver.so'.\n"
"\t [QNN only] [htp_graph_finalization_optimization_mode]: QNN graph finalization optimization mode, options: \n"
@@ -73,6 +70,8 @@ void usage() {
"\t [Example] [For SNPE EP] -e snpe -i \"runtime|CPU priority|low\" \n\n"
"\t-o [optimization level]: Default is 99. Valid values are 0 (disable), 1 (basic), 2 (extended), 99 (all).\n"
"\t\tPlease see onnxruntime_c_api.h (enum GraphOptimizationLevel) for the full list of all optimization levels. "
"\t-f: Enable EP context cache generation.\n"
"\t-b: Disable EP context embed mode.\n"
"\n"
"\t-h: help\n"
"\n"
@@ -179,11 +178,13 @@ int real_main(int argc, char* argv[], Ort::Env& env) {

OrtLoggingLevel logging_level = ORT_LOGGING_LEVEL_ERROR;
bool verbose_logging_required = false;
bool ep_context_enable = false;
bool disable_ep_context_embed_mode = false;

bool pause = false;
{
int ch;
while ((ch = getopt(argc, argv, ORT_TSTR("Ac:hj:Mn:r:e:t:a:xvo:d:i:pz"))) != -1) {
while ((ch = getopt(argc, argv, ORT_TSTR("Ac:hj:Mn:r:e:t:a:xvo:d:i:pzfb"))) != -1) {
switch (ch) {
case 'A':
enable_cpu_mem_arena = false;
@@ -312,6 +313,12 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
case 'z':
set_denormal_as_zero = true;
break;
case 'b':
disable_ep_context_embed_mode = true;
break;
case 'f':
ep_context_enable = true;
break;
case '?':
case 'h':
default:
@@ -386,6 +393,11 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
if (set_denormal_as_zero)
sf.AddConfigEntry(kOrtSessionOptionsConfigSetDenormalAsZero, "1");

if (ep_context_enable)
sf.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");
if (disable_ep_context_embed_mode)
sf.AddConfigEntry(kOrtSessionOptionEpContextEmbedMode, "0");
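// Usage sketch (not part of this diff; paths are illustrative): with the new flags wired to these
// session options, an invocation such as
//   onnx_test_runner -e qnn -i "backend_path|/path/to/libQnnHtp.so" -f -b <test_data_dir>
// would generate the EP context model while keeping the QNN context binary in a separate file.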

if (enable_tensorrt) {
#ifdef USE_TENSORRT
OrtCUDAProviderOptions cuda_options;
@@ -466,12 +478,6 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
if (value != "0") {
ORT_THROW("Set to 0 to disable qnn_context_embed_mode.");
}
} else if (key == "qnn_context_cache_enable") {
if (value != "1") {
ORT_THROW("Set to 1 to enable qnn_context_cache_enable.");
}
} else if (key == "qnn_context_cache_path") {
// no validation
} else if (key == "profiling_level") {
std::set<std::string> supported_profiling_level = {"off", "basic", "detailed"};
if (supported_profiling_level.find(value) == supported_profiling_level.end()) {
@@ -507,8 +513,8 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
ORT_THROW("Wrong value for htp_graph_finalization_optimization_mode. select from: " + str);
}
} else {
ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path', 'qnn_context_cache_enable',
'qnn_context_cache_path', 'profiling_level', 'rpc_control_latency', 'vtcm_mb', 'htp_performance_mode',
ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path',
'profiling_level', 'rpc_control_latency', 'vtcm_mb', 'htp_performance_mode',
'qnn_saver_path', 'htp_graph_finalization_optimization_mode', 'qnn_context_priority'])");
}

2 changes: 0 additions & 2 deletions onnxruntime/test/perftest/command_args_parser.cc
@@ -65,8 +65,6 @@ namespace perftest {
"\t [OpenVINO only] [cache_dir]: Explicitly specify the path to dump and load the blobs(Model caching) or cl_cache (Kernel Caching) files feature. If blob files are already present, it will be directly loaded.\n"
"\t [OpenVINO only] [enable_opencl_throttling]: Enables OpenCL queue throttling for GPU device(Reduces the CPU Utilization while using GPU) \n"
"\t [QNN only] [backend_path]: QNN backend path. e.g '/folderpath/libQnnHtp.so', '/folderpath/libQnnCpu.so'.\n"
"\t [QNN only] [qnn_context_cache_enable]: 1 to enable cache QNN context. Default to false.\n"
"\t [QNN only] [qnn_context_cache_path]: File path to the qnn context cache. Default to model_file.onnx.bin if not set.\n"
"\t [QNN only] [profiling_level]: QNN profiling level, options: 'basic', 'detailed', default 'off'.\n"
"\t [QNN only] [rpc_control_latency]: QNN rpc control latency. default to 10.\n"
"\t [QNN only] [vtcm_mb]: QNN VTCM size in MB. default to 0(not set).\n"
10 changes: 2 additions & 8 deletions onnxruntime/test/perftest/ort_test_session.cc
@@ -332,12 +332,6 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
if (value.empty()) {
ORT_THROW("Please provide the QNN backend path.");
}
} else if (key == "qnn_context_cache_enable") {
if (value != "1") {
ORT_THROW("Set to 1 to enable qnn_context_cache_enable.");
}
} else if (key == "qnn_context_cache_path") {
// no validation
} else if (key == "profiling_level") {
std::set<std::string> supported_profiling_level = {"off", "basic", "detailed"};
if (supported_profiling_level.find(value) == supported_profiling_level.end()) {
@@ -373,8 +367,8 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
ORT_THROW("Supported qnn_context_priority: low, normal, normal_high, high");
}
} else {
ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path', 'qnn_context_cache_enable',
'qnn_context_cache_path', 'profiling_level', 'rpc_control_latency', 'vtcm_mb', 'htp_performance_mode',
ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path',
'profiling_level', 'rpc_control_latency', 'vtcm_mb', 'htp_performance_mode',
'qnn_saver_path', 'htp_graph_finalization_optimization_mode', 'qnn_context_priority'])");
}

37 changes: 28 additions & 9 deletions onnxruntime/test/providers/qnn/qnn_basic_test.cc
@@ -375,17 +375,36 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryGeneration2InputTypes) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["qnn_context_cache_enable"] = "1";

// Add kMSDomain to cover contrib op like Gelu
const std::unordered_map<std::string, int> domain_to_version = {{"", 13}, {kMSDomain, 1}};

auto& logging_manager = DefaultLoggingManager();
logging_manager.SetDefaultLoggerSeverity(logging::Severity::kERROR);

onnxruntime::Model model("QNN_EP_TestModel", false, ModelMetaData(), PathString(),
IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
logging_manager.DefaultLogger());
Graph& graph = model.MainGraph();
ModelTestBuilder helper(graph);
BuildCastAddTestCase()(helper);
helper.SetGraphOutputs();
ASSERT_STATUS_OK(model.MainGraph().Resolve());

// Serialize the model to a string.
std::string model_data;
model.ToProto().SerializeToString(&model_data);

const auto model_data_span = AsByteSpan(model_data.data(), model_data.size());

const std::string context_binary_file = "./qnn_context_binary_int32_fp32_inputs_test.onnx";
provider_options["qnn_context_cache_path"] = context_binary_file;
Ort::SessionOptions so;
so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");
so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, context_binary_file.c_str());

RunQnnModelTest(BuildCastAddTestCase(),
provider_options,
13, // opset
ExpectedEPNodeAssignment::All,
1e-5f,
logging::Severity::kERROR,
false);
so.AppendExecutionProvider("QNN", provider_options);

Ort::Session session(*ort_env, model_data_span.data(), model_data_span.size(), so);

// Make sure the Qnn context cache binary file is generated
EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str()));
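To illustrate the purpose of the generated file (not part of this diff), a later session could load the dumped EP-context model directly and skip QNN graph partitioning/compilation; the backend path below is a placeholder, and ort_env is the test's global Ort::Env used above:

std::unordered_map<std::string, std::string> qnn_options{{"backend_path", "libQnnHtp.so"}};  // placeholder
Ort::SessionOptions so2;
so2.AppendExecutionProvider("QNN", qnn_options);
// Loads the cached QNN context carried by the generated model instead of recompiling it.
Ort::Session session_from_ctx(*ort_env, ORT_TSTR("./qnn_context_binary_int32_fp32_inputs_test.onnx"), so2);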
20 changes: 14 additions & 6 deletions onnxruntime/test/providers/qnn/qnn_test_utils.cc
@@ -13,6 +13,7 @@
#include "core/common/span_utils.h"
#include "core/framework/compute_capability.h"
#include "core/graph/graph.h"
#include "core/session/onnxruntime_session_options_config_keys.h"

namespace onnxruntime {
namespace test {
@@ -106,24 +107,31 @@
TryEnableQNNSaver(provider_options);
RunAndVerifyOutputsWithEP(AsByteSpan(model_data.data(), model_data.size()), "QNN_EP_TestLogID",
QnnExecutionProviderWithOptions(provider_options),
helper.feeds_, verification_params, {}, verify_outputs);
helper.feeds_, verification_params,
{}, verify_outputs);
}

void InferenceModel(const std::string& model_data, const char* log_id,
std::unique_ptr<IExecutionProvider> execution_provider,
const ProviderOptions& provider_options,
ExpectedEPNodeAssignment expected_ep_assignment, const NameMLValMap& feeds,
std::vector<OrtValue>& output_vals) {
std::vector<OrtValue>& output_vals,
bool is_qnn_ep,
const std::unordered_map<std::string, std::string>& session_option_pairs) {
SessionOptions so;
so.session_logid = log_id;
for (auto key_value : session_option_pairs) {
ASSERT_STATUS_OK(so.config_options.AddConfigEntry(key_value.first.c_str(), key_value.second.c_str()));
}
RunOptions run_options;
run_options.run_tag = so.session_logid;

InferenceSessionWrapper session_object{so, GetEnvironment()};

std::string provider_type = kCpuExecutionProvider;
if (execution_provider) {
provider_type = execution_provider->Type();
ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(std::move(execution_provider)));
if (is_qnn_ep) {
auto qnn_ep = QnnExecutionProviderWithOptions(provider_options, &so);
provider_type = qnn_ep->Type();
ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(std::move(qnn_ep)));

[cpplint warning] onnxruntime/test/providers/qnn/qnn_test_utils.cc:134: Add #include <utility> for move [build/include_what_you_use] [4]
}
ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast<int>(model_data.size())));
ASSERT_STATUS_OK(session_object.Initialize());
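An illustrative sketch (not from this diff) of the kind of map a test caller might pass through the new session_option_pairs parameter to request EP-context generation; the output path is a placeholder:

std::unordered_map<std::string, std::string> session_option_pairs;
session_option_pairs.emplace(kOrtSessionOptionEpContextEnable, "1");                   // dump the EP-context model
session_option_pairs.emplace(kOrtSessionOptionEpContextFilePath, "./model_ctx.onnx");  // placeholder output path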