Move provider options to session options for QNN context cache feature related configuration
HectorSVC committed Dec 19, 2023
1 parent 6d7519e commit 8c4dc75
Showing 12 changed files with 159 additions and 96 deletions.
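
In practice, the net effect of this change: the QNN context-cache switches move from EP provider options to ONNX Runtime session options, while QNN-specific options such as backend_path stay where they were. A minimal C++ sketch of the migration, based on the keys added and removed in this diff (the file path value is illustrative):

    #include "onnxruntime_cxx_api.h"
    #include "onnxruntime_session_options_config_keys.h"

    // Before this commit (provider options, now removed):
    //   provider_options["qnn_context_cache_enable"] = "1";
    //   provider_options["qnn_context_cache_path"]   = "./model.onnx.bin";
    //   provider_options["qnn_context_embed_mode"]   = "0";
    // After this commit (session options):
    Ort::SessionOptions so;
    so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");                  // replaces qnn_context_cache_enable
    so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, "./model_ctx.onnx"); // replaces qnn_context_cache_path
    so.AddConfigEntry(kOrtSessionOptionEpContextEmbedMode, "0");               // replaces qnn_context_embed_mode

    std::unordered_map<std::string, std::string> provider_options;
    provider_options["backend_path"] = "libQnnHtp.so";  // QNN-specific options are unchanged
    so.AppendExecutionProvider("QNN", provider_options);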
6 changes: 0 additions & 6 deletions include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -3593,17 +3593,11 @@ struct OrtApi {
*
* QNN supported keys:
* "backend_path": file path to QNN backend library.
* "qnn_context_cache_enable": 1 to enable QNN graph creation from cached QNN context file. If it's enabled: QNN EP will
* load from cached QNN context binary if it exist. It will generate a context binary file if it's not exist
* "qnn_context_cache_path": explicitly provide the QNN context cache file. Default to model_file.onnx.bin if not provided.
* "profiling_level": QNN profiling level, options: "off", "basic", "detailed". Default to off.
* "rpc_control_latency": QNN RPC control latency.
* "vtcm_mb": QNN VTCM size in MB. default to 0(not set).
* "htp_performance_mode": QNN performance mode, options: "burst", "balanced", "default", "high_performance",
* "high_power_saver", "low_balanced", "low_power_saver", "power_saver", "sustained_high_performance". Default to "default".
* "qnn_context_embed_mode", 1 means dump the QNN context binary into node attribute EPContext->ep_cache_context in the ONNX skeleton model.
* 0 means dump the QNN context binary into separate bin file and set the path to EPContext->ep_cache_context.
* The path is relative path to the ONNX skeleton model file.
* "qnn_saver_path": File path to the QNN Saver backend library. If specified, QNN Saver will be enabled and will
* dump QNN API calls to disk for replay/debugging. QNN Saver produces incorrect model inference results and
* may alter model/EP partitioning. Use only for debugging.
include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -235,3 +235,18 @@ static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFil
// Use this config to control the minimum size of the initializer when externalizing it during serialization
static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes =
"session.optimized_model_external_initializers_min_size_in_bytes";

// Enable EP context feature to dump the partitioned graph which include the EP context into Onnx file.
// The dumped Onnx model with EP context can be used for future inference to avoid the EP graph partitioning/compile overhead.

[cpplint] include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h:240: Lines should be <= 120 characters long [whitespace/line_length] [2]
// "0": disable. (default)
// "1": enable.
static const char* const kOrtSessionOptionEpContextEnable = "ep.context_enable";

// Specify the file path for the Onnx model which has EP context.
// Default to original_file_name_ctx.onnx if not specified
static const char* const kOrtSessionOptionEpContextFilePath = "ep.context_file_path";

// Flag to specify whether to dump the EP context into the Onnx model.
// "0": dump the EP context into separate file, keep the file name in the Onnx model.
// "1": dump the EP context into the Onnx model. (default).
static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.context_embed_mode";

[cpplint] include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h:252: Could not find a newline character at the end of the file. [whitespace/ending_newline] [5]
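
The same three keys are also reachable from the C API; a short sketch, assuming the standard AddSessionConfigEntry entry point (error checks omitted, values illustrative):

    const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION);
    OrtSessionOptions* session_options = nullptr;
    api->CreateSessionOptions(&session_options);
    api->AddSessionConfigEntry(session_options, "ep.context_enable", "1");      // default "0"
    api->AddSessionConfigEntry(session_options, "ep.context_file_path",
                               "./model_ctx.onnx");                             // default: original_file_name_ctx.onnx
    api->AddSessionConfigEntry(session_options, "ep.context_embed_mode", "0");  // default "1" (embed into the model)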
27 changes: 7 additions & 20 deletions onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -114,29 +114,16 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
if (session_options) {
disable_cpu_ep_fallback_ = session_options->config_options.GetConfigOrDefault(
kOrtSessionOptionsDisableCPUEPFallback, "0") == "1";
}

static const std::string CONTEXT_CACHE_ENABLED = "qnn_context_cache_enable";
auto context_cache_enabled_pos = provider_options_map.find(CONTEXT_CACHE_ENABLED);
if (context_cache_enabled_pos != provider_options_map.end()) {
if (context_cache_enabled_pos->second == "1") {
context_cache_enabled_ = true;
LOGS_DEFAULT(VERBOSE) << "Context cache enabled.";
}
}

static const std::string CONTEXT_CACHE_PATH = "qnn_context_cache_path";
auto context_cache_path_pos = provider_options_map.find(CONTEXT_CACHE_PATH);
if (context_cache_path_pos != provider_options_map.end()) {
context_cache_path_cfg_ = context_cache_path_pos->second;
LOGS_DEFAULT(VERBOSE) << "User specified context cache path: " << context_cache_path_cfg_;
}
context_cache_enabled_ = session_options->config_options.GetConfigOrDefault(
kOrtSessionOptionEpContextEnable, "0") == "1";
LOGS_DEFAULT(VERBOSE) << "Context cache enable: " << context_cache_enabled_;

static const std::string CONTEXT_CACHE_EMBED_MODE = "qnn_context_embed_mode";
auto context_cache_embed_mode_pos = provider_options_map.find(CONTEXT_CACHE_EMBED_MODE);
if (context_cache_embed_mode_pos != provider_options_map.end()) {
qnn_context_embed_mode_ = context_cache_embed_mode_pos->second == "1";
qnn_context_embed_mode_ = session_options->config_options.GetConfigOrDefault(
kOrtSessionOptionEpContextEmbedMode, "1") == "1";
LOGS_DEFAULT(VERBOSE) << "User specified context cache embed mode: " << qnn_context_embed_mode_;

context_cache_path_cfg_ = session_options->config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");

[cpplint] onnxruntime/core/providers/qnn/qnn_execution_provider.cc:126: Lines should be <= 120 characters long [whitespace/line_length] [2]
}

static const std::string BACKEND_PATH = "backend_path";
30 changes: 18 additions & 12 deletions onnxruntime/test/onnx/main.cc
@@ -50,15 +50,12 @@ void usage() {
"\t-a: Specify custom absolute tolerance values for output value comparison. default: 1e-5\n"
"\t-i: Specify EP specific runtime options as key value pairs. Different runtime options available are: \n"
"\t [QNN only] [backend_path]: QNN backend path. e.g '/folderpath/libQnnHtp.so', '/folderpath/libQnnCpu.so'.\n"
"\t [QNN only] [qnn_context_cache_enable]: 1 to enable cache QNN context. Default to false.\n"
"\t [QNN only] [qnn_context_cache_path]: File path to the qnn context cache. Default to model_file.onnx.bin if not set.\n"
"\t [QNN only] [profiling_level]: QNN profiling level, options: 'basic', 'detailed', default 'off'.\n"
"\t [QNN only] [rpc_control_latency]: QNN rpc control latency. default to 10.\n"
"\t [QNN only] [vtcm_mb]: QNN VTCM size in MB. default to 0(not set).\n"
"\t [QNN only] [htp_performance_mode]: QNN performance mode, options: 'burst', 'balanced', 'default', 'high_performance', \n"
"\t 'high_power_saver', 'low_balanced', 'low_power_saver', 'power_saver', 'sustained_high_performance'. Default to 'default'. \n"
"\t [QNN only] [qnn_context_priority]: QNN context priority, options: 'low', 'normal', 'normal_high', 'high'. Default to 'normal'. \n"
"\t [QNN only] [qnn_context_embed_mode]: 1 means dump the QNN context binary into the Onnx skeleton model.\n"
"\t 0 means dump the QNN context binary into separate bin file and set the path in the Onnx skeleton model.\n"
"\t [QNN only] [qnn_saver_path]: QNN Saver backend path. e.g '/folderpath/libQnnSaver.so'.\n"
"\t [QNN only] [htp_graph_finalization_optimization_mode]: QNN graph finalization optimization mode, options: \n"
@@ -73,6 +70,8 @@ void usage() {
"\t [Example] [For SNPE EP] -e snpe -i \"runtime|CPU priority|low\" \n\n"
"\t-o [optimization level]: Default is 99. Valid values are 0 (disable), 1 (basic), 2 (extended), 99 (all).\n"
"\t\tPlease see onnxruntime_c_api.h (enum GraphOptimizationLevel) for the full list of all optimization levels. "
"\t-f: Enable EP context cache generation.\n"
"\t-b: Disable EP context embed mode.\n"
"\n"
"\t-h: help\n"
"\n"
@@ -179,11 +178,13 @@ int real_main(int argc, char* argv[], Ort::Env& env) {

OrtLoggingLevel logging_level = ORT_LOGGING_LEVEL_ERROR;
bool verbose_logging_required = false;
bool ep_context_enable = false;
bool disable_ep_context_embed_mode = false;

bool pause = false;
{
int ch;
while ((ch = getopt(argc, argv, ORT_TSTR("Ac:hj:Mn:r:e:t:a:xvo:d:i:pz"))) != -1) {
while ((ch = getopt(argc, argv, ORT_TSTR("Ac:hj:Mn:r:e:t:a:xvo:d:i:pzfb"))) != -1) {
switch (ch) {
case 'A':
enable_cpu_mem_arena = false;
@@ -312,6 +313,12 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
case 'z':
set_denormal_as_zero = true;
break;
case 'b':
disable_ep_context_embed_mode = true;
break;
case 'f':
ep_context_enable = true;
break;
case '?':
case 'h':
default:
@@ -386,6 +393,11 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
if (set_denormal_as_zero)
sf.AddConfigEntry(kOrtSessionOptionsConfigSetDenormalAsZero, "1");

if (ep_context_enable)
sf.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");
if (disable_ep_context_embed_mode)
sf.AddConfigEntry(kOrtSessionOptionEpContextEmbedMode, "0");

if (enable_tensorrt) {
#ifdef USE_TENSORRT
OrtCUDAProviderOptions cuda_options;
@@ -466,12 +478,6 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
if (value != "0") {
ORT_THROW("Set to 0 to disable qnn_context_embed_mode.");
}
} else if (key == "qnn_context_cache_enable") {
if (value != "1") {
ORT_THROW("Set to 1 to enable qnn_context_cache_enable.");
}
} else if (key == "qnn_context_cache_path") {
// no validation
} else if (key == "profiling_level") {
std::set<std::string> supported_profiling_level = {"off", "basic", "detailed"};
if (supported_profiling_level.find(value) == supported_profiling_level.end()) {
Expand Down Expand Up @@ -507,8 +513,8 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
ORT_THROW("Wrong value for htp_graph_finalization_optimization_mode. select from: " + str);
}
} else {
ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path', 'qnn_context_cache_enable',
'qnn_context_cache_path', 'profiling_level', 'rpc_control_latency', 'vtcm_mb', 'htp_performance_mode',
ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path',
'profiling_level', 'rpc_control_latency', 'vtcm_mb', 'htp_performance_mode',
'qnn_saver_path', 'htp_graph_finalization_optimization_mode', 'qnn_context_priority'])");
}

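
Putting the new flags together: a hypothetical invocation of this test driver (commonly built as onnx_test_runner; paths and test directory are illustrative) that generates an EP context model with the context binary dumped to a separate file:

    onnx_test_runner -e qnn -i "backend_path|/folderpath/libQnnHtp.so" -f -b /path/to/test_models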
2 changes: 0 additions & 2 deletions onnxruntime/test/perftest/command_args_parser.cc
@@ -65,8 +65,6 @@ namespace perftest {
"\t [OpenVINO only] [cache_dir]: Explicitly specify the path to dump and load the blobs(Model caching) or cl_cache (Kernel Caching) files feature. If blob files are already present, it will be directly loaded.\n"
"\t [OpenVINO only] [enable_opencl_throttling]: Enables OpenCL queue throttling for GPU device(Reduces the CPU Utilization while using GPU) \n"
"\t [QNN only] [backend_path]: QNN backend path. e.g '/folderpath/libQnnHtp.so', '/folderpath/libQnnCpu.so'.\n"
"\t [QNN only] [qnn_context_cache_enable]: 1 to enable cache QNN context. Default to false.\n"
"\t [QNN only] [qnn_context_cache_path]: File path to the qnn context cache. Default to model_file.onnx.bin if not set.\n"
"\t [QNN only] [profiling_level]: QNN profiling level, options: 'basic', 'detailed', default 'off'.\n"
"\t [QNN only] [rpc_control_latency]: QNN rpc control latency. default to 10.\n"
"\t [QNN only] [vtcm_mb]: QNN VTCM size in MB. default to 0(not set).\n"
14 changes: 4 additions & 10 deletions onnxruntime/test/perftest/ort_test_session.cc
@@ -272,7 +272,7 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
} else {
ORT_THROW("[ERROR] [OpenVINO] The value for the key 'enable_opencl_throttling' should be a boolean i.e. true or false. Default value is false.\n");
}
} else if (key == "disable_dynamic_shapes") {
} else if (key == "enable_dynamic_shapes") {
if (value == "true" || value == "True" ||
value == "false" || value == "False") {
ov_options[key] = value;
@@ -298,7 +298,7 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
ov_options[key] = value;
}
} else {
ORT_THROW("[ERROR] [OpenVINO] wrong key type entered. Choose from the following runtime key options that are available for OpenVINO. ['device_type', 'device_id', 'enable_npu_fast_compile', 'num_of_threads', 'cache_dir', 'num_streams', 'enable_opencl_throttling', 'disable_dynamic_shapes'] \n");
ORT_THROW("[ERROR] [OpenVINO] wrong key type entered. Choose from the following runtime key options that are available for OpenVINO. ['device_type', 'device_id', 'enable_npu_fast_compile', 'num_of_threads', 'cache_dir', 'num_streams', 'enable_opencl_throttling|true'] \n");

[cpplint] onnxruntime/test/perftest/ort_test_session.cc:301: Lines should be <= 120 characters long [whitespace/line_length] [2]
}
}
session_options.AppendExecutionProvider("OpenVINO", ov_options);
@@ -332,12 +332,6 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
if (value.empty()) {
ORT_THROW("Please provide the QNN backend path.");
}
} else if (key == "qnn_context_cache_enable") {
if (value != "1") {
ORT_THROW("Set to 1 to enable qnn_context_cache_enable.");
}
} else if (key == "qnn_context_cache_path") {
// no validation
} else if (key == "profiling_level") {
std::set<std::string> supported_profiling_level = {"off", "basic", "detailed"};
if (supported_profiling_level.find(value) == supported_profiling_level.end()) {
@@ -373,8 +367,8 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
ORT_THROW("Supported qnn_context_priority: low, normal, normal_high, high");
}
} else {
ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path', 'qnn_context_cache_enable',
'qnn_context_cache_path', 'profiling_level', 'rpc_control_latency', 'vtcm_mb', 'htp_performance_mode',
ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path',
'profiling_level', 'rpc_control_latency', 'vtcm_mb', 'htp_performance_mode',
'qnn_saver_path', 'htp_graph_finalization_optimization_mode', 'qnn_context_priority'])");
}

37 changes: 28 additions & 9 deletions onnxruntime/test/providers/qnn/qnn_basic_test.cc
@@ -375,17 +375,36 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryGeneration2InputTypes) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["qnn_context_cache_enable"] = "1";

// Add kMSDomain to cover contrib op like Gelu
const std::unordered_map<std::string, int> domain_to_version = {{"", 13}, {kMSDomain, 1}};

auto& logging_manager = DefaultLoggingManager();
logging_manager.SetDefaultLoggerSeverity(logging::Severity::kERROR);

onnxruntime::Model model("QNN_EP_TestModel", false, ModelMetaData(), PathString(),
IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
logging_manager.DefaultLogger());
Graph& graph = model.MainGraph();
ModelTestBuilder helper(graph);
BuildCastAddTestCase()(helper);
helper.SetGraphOutputs();
ASSERT_STATUS_OK(model.MainGraph().Resolve());

// Serialize the model to a string.
std::string model_data;
model.ToProto().SerializeToString(&model_data);

const auto model_data_span = AsByteSpan(model_data.data(), model_data.size());

const std::string context_binary_file = "./qnn_context_binary_int32_fp32_inputs_test.onnx";
provider_options["qnn_context_cache_path"] = context_binary_file;
Ort::SessionOptions so;
so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");
so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, context_binary_file.c_str());

RunQnnModelTest(BuildCastAddTestCase(),
provider_options,
13, // opset
ExpectedEPNodeAssignment::All,
1e-5f,
logging::Severity::kERROR,
false);
so.AppendExecutionProvider("QNN", provider_options);

Ort::Session session(*ort_env, model_data_span.data(), model_data_span.size(), so);

// Make sure the Qnn context cache binary file is generated
EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str()));
20 changes: 14 additions & 6 deletions onnxruntime/test/providers/qnn/qnn_test_utils.cc
@@ -13,6 +13,7 @@
#include "core/common/span_utils.h"
#include "core/framework/compute_capability.h"
#include "core/graph/graph.h"
#include "core/session/onnxruntime_session_options_config_keys.h"

namespace onnxruntime {
namespace test {
@@ -106,24 +107,31 @@ void RunQnnModelTest(const GetTestModelFn& build_test_case, ProviderOptions prov
TryEnableQNNSaver(provider_options);
RunAndVerifyOutputsWithEP(AsByteSpan(model_data.data(), model_data.size()), "QNN_EP_TestLogID",
QnnExecutionProviderWithOptions(provider_options),
helper.feeds_, verification_params, {}, verify_outputs);
helper.feeds_, verification_params,
{}, verify_outputs);
}

void InferenceModel(const std::string& model_data, const char* log_id,
std::unique_ptr<IExecutionProvider> execution_provider,
const ProviderOptions& provider_options,
ExpectedEPNodeAssignment expected_ep_assignment, const NameMLValMap& feeds,
std::vector<OrtValue>& output_vals) {
std::vector<OrtValue>& output_vals,
bool is_qnn_ep,
const std::unordered_map<std::string, std::string>& session_option_pairs) {
SessionOptions so;
so.session_logid = log_id;
for (auto key_value : session_option_pairs) {
ASSERT_STATUS_OK(so.config_options.AddConfigEntry(key_value.first.c_str(), key_value.second.c_str()));
}
RunOptions run_options;
run_options.run_tag = so.session_logid;

InferenceSessionWrapper session_object{so, GetEnvironment()};

std::string provider_type = kCpuExecutionProvider;
if (execution_provider) {
provider_type = execution_provider->Type();
ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(std::move(execution_provider)));
if (is_qnn_ep) {
auto qnn_ep = QnnExecutionProviderWithOptions(provider_options, &so);
provider_type = qnn_ep->Type();
ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(std::move(qnn_ep)));

[cpplint] onnxruntime/test/providers/qnn/qnn_test_utils.cc:134: Add #include <utility> for move [build/include_what_you_use] [4]
}
ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast<int>(model_data.size())));
ASSERT_STATUS_OK(session_object.Initialize());
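
A usage sketch for the updated helper, assuming the pre-change std::unique_ptr<IExecutionProvider> parameter is superseded by provider_options plus the new is_qnn_ep flag, and that session_option_pairs is forwarded into the session config as shown above (values illustrative):

    ProviderOptions qnn_options;
    qnn_options["backend_path"] = "libQnnHtp.so";

    std::unordered_map<std::string, std::string> session_option_pairs;
    session_option_pairs.emplace(kOrtSessionOptionEpContextEnable, "1");
    session_option_pairs.emplace(kOrtSessionOptionEpContextFilePath, "./model_ctx.onnx");

    std::vector<OrtValue> output_vals;
    InferenceModel(model_data, "QNN_EP_TestLogID", qnn_options,
                   ExpectedEPNodeAssignment::All, helper.feeds_, output_vals,
                   /*is_qnn_ep*/ true, session_option_pairs);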
(4 more changed files not shown.)