Commit 8a00784
Remove QNN EP options: qnn_context_cache_enable, qnn_context_cache_path, qnn_context_embed_mode. Add session options accordingly.
HectorSVC committed Dec 18, 2023
1 parent 8117368 commit 8a00784
Showing 10 changed files with 135 additions and 79 deletions.
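In application code, the migration looks roughly like the following minimal sketch, assuming the public C++ API; the backend library QnnHtp.dll and all model paths are illustrative placeholders. The first block uses the provider options this commit removes; the second uses the new session config entries:

```cpp
#include <string>
#include <unordered_map>
#include "onnxruntime_cxx_api.h"

void configure_sessions() {
  // Before this commit: QNN context caching via QNN-specific provider options.
  Ort::SessionOptions so_old;
  so_old.AppendExecutionProvider("QNN", {
      {"backend_path", "QnnHtp.dll"},
      {"qnn_context_cache_enable", "1"},             // removed by this commit
      {"qnn_context_cache_path", "model.onnx.bin"},  // removed by this commit
      {"qnn_context_embed_mode", "1"}});             // removed by this commit

  // After this commit: generic EP context session options.
  Ort::SessionOptions so_new;
  so_new.AddConfigEntry("ep.ep_context_enable", "1");                  // kOrtSessionOptionEpContextEnable
  so_new.AddConfigEntry("ep.ep_context_file_path", "model_ctx.onnx");  // kOrtSessionOptionEpContextFilePath
  so_new.AddConfigEntry("ep.ep_context_embed_mode", "1");              // kOrtSessionOptionEpContextEmbedMode
  so_new.AppendExecutionProvider("QNN", {{"backend_path", "QnnHtp.dll"}});
}
```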
6 changes: 0 additions & 6 deletions include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -3593,17 +3593,11 @@ struct OrtApi {
*
* QNN supported keys:
* "backend_path": file path to QNN backend library.
* "qnn_context_cache_enable": 1 to enable QNN graph creation from cached QNN context file. If it's enabled: QNN EP will
* load from cached QNN context binary if it exist. It will generate a context binary file if it's not exist
* "qnn_context_cache_path": explicitly provide the QNN context cache file. Default to model_file.onnx.bin if not provided.
* "profiling_level": QNN profiling level, options: "off", "basic", "detailed". Default to off.
* "rpc_control_latency": QNN RPC control latency.
* "vtcm_mb": QNN VTCM size in MB. default to 0(not set).
* "htp_performance_mode": QNN performance mode, options: "burst", "balanced", "default", "high_performance",
* "high_power_saver", "low_balanced", "low_power_saver", "power_saver", "sustained_high_performance". Default to "default".
* "qnn_context_embed_mode", 1 means dump the QNN context binary into node attribute EPContext->ep_cache_context in the ONNX skeleton model.
* 0 means dump the QNN context binary into separate bin file and set the path to EPContext->ep_cache_context.
* The path is relative path to the ONNX skeleton model file.
* "qnn_saver_path": File path to the QNN Saver backend library. If specified, QNN Saver will be enabled and will
* dump QNN API calls to disk for replay/debugging. QNN Saver produces incorrect model inference results and
* may alter model/EP partitioning. Use only for debugging.
include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -235,3 +235,21 @@ static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFil
// Use this config to control the minimum size of the initializer when externalizing it during serialization
static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes =
"session.optimized_model_external_initializers_min_size_in_bytes";

// Enable the EP context feature to dump the partitioned graph, which includes the EP context, into an ONNX file.
// The dumped ONNX model with EP context can be used for future inference to avoid the EP graph partitioning/compile overhead.
// "0": disable. (default)
// "1": enable.
static const char* const kOrtSessionOptionEpContextEnable = "ep.ep_context_enable";

// Specify the file path for the ONNX model which has the EP context.
// Defaults to original_file_name_ctx.onnx if not specified.
static const char* const kOrtSessionOptionEpContextFilePath = "ep.ep_context_file_path";

// Flag to specify whether to dump the EP context into the ONNX model.
// "0": dump the EP context into a separate file, and keep the file name in the ONNX model.
// "1": dump the EP context into the ONNX model (default).
static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.ep_context_embed_mode";

// Dump the model after graph partitioning to file "partitioned_graph.onnx".
static const char* const kDumpPartitionedGraph = "session.dump_partitioned_graph";
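A minimal usage sketch for the keys declared above, assuming a C++ embedder (QnnHtp.dll and model.onnx are placeholders); with embed mode "0" the context binary is written to a separate file and only its relative path is stored in the EPContext node:

```cpp
#include <string>
#include <unordered_map>
#include "onnxruntime_cxx_api.h"
#include "onnxruntime_session_options_config_keys.h"

int main() {
  Ort::Env env;
  Ort::SessionOptions so;
  so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");     // dump an EP context model
  so.AddConfigEntry(kOrtSessionOptionEpContextEmbedMode, "0");  // context binary in a separate file
  // Without kOrtSessionOptionEpContextFilePath, the output defaults to
  // <input model path> + "_ctx.onnx" next to the original model.
  so.AppendExecutionProvider("QNN", {{"backend_path", "QnnHtp.dll"}});
  Ort::Session session(env, ORT_TSTR("model.onnx"), so);  // writes model.onnx_ctx.onnx
  return 0;
}
```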
103 changes: 72 additions & 31 deletions onnxruntime/core/framework/graph_partitioner.cc
Original file line number Diff line number Diff line change
@@ -16,6 +16,7 @@
#include "core/graph/function_utils.h"
#include "core/graph/graph_viewer.h"
#include "core/graph/model.h"
#include "core/session/onnxruntime_session_options_config_keys.h"

// uncomment this line to count non-CUDA ops in ONNX domain
// #define COUNT_NON_CUDA_OPS
@@ -510,34 +511,6 @@ static Status PartitionOnnxFormatModelImpl(Graph& graph, FuncManager& func_mgr,
ORT_RETURN_IF_ERROR(graph.Resolve());
}

const std::vector<const Node*> ep_context_nodes = current_ep.GetEpContextNodes();
auto get_ep_context_node = [&ep_context_nodes](const std::string& node_name) -> std::pair<bool, const Node*> {
for (auto& node : ep_context_nodes) {
if (node_name == node->Name()) {
return std::make_pair(true, node);
}
}
return std::make_pair(false, static_cast<const Node*>(nullptr));
};

if (ep_context_nodes.size() > 0) {
Model ep_model(graph.Name(), false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(),
graph.DomainToVersionMap(), {}, *current_ep.GetLogger());
auto& ep_graph = ep_model.MainGraph();
ep_graph.SetDescription(graph.Description());
for (const auto& node : graph.Nodes()) {
// the fused node and EPContext node have the same node name
auto ep_context_node = get_ep_context_node(node.Name());
// Use the EPContext node created by the current EP if the name matches, otherwise use the original node
if (ep_context_node.first) {
ep_graph.AddNode(*ep_context_node.second);
} else {
ep_graph.AddNode(node);
}
}
ORT_RETURN_IF_ERROR(Model::Save(ep_model, "ep_partition.onnx"));
}

// For some cases, like fp16 on cpu, right now we don't have any kernel that supports it.
// But we will insert a cast op to run the model, so skip the error checking here.
// If the node is still not assigned after the graph transform phase, we will report an error
@@ -662,9 +635,68 @@ static Status InlineFunctionsAOTImpl(const ExecutionProviders& execution_provide
return Status::OK();
}

static Status CreateEpContextModel(const ExecutionProviders& execution_providers,
const Graph& graph,
const std::string& ep_context_path,
const logging::Logger& logger) {
std::vector<const Node*> all_ep_context_nodes;
for (const auto& ep : execution_providers) {
const std::vector<const Node*> ep_context_nodes = ep->GetEpContextNodes();
all_ep_context_nodes.insert(all_ep_context_nodes.begin(), ep_context_nodes.begin(), ep_context_nodes.end());
}

auto get_ep_context_node = [&all_ep_context_nodes](const std::string& node_name) -> std::pair<bool, const Node*> {
for (auto& node : all_ep_context_nodes) {
if (node_name == node->Name()) {
return std::make_pair(true, node);
}
}
return std::make_pair(false, static_cast<const Node*>(nullptr));
};

onnxruntime::PathString context_cache_path;
PathString model_pathstring = graph.ModelPath().ToPathString();
if (all_ep_context_nodes.size() > 0) {
if (!ep_context_path.empty()) {
context_cache_path = ToPathString(ep_context_path);
} else if (!model_pathstring.empty()) {
context_cache_path = model_pathstring + ToPathString("_ctx.onnx");
}

bool file_exist = std::filesystem::is_regular_file(context_cache_path) && std::filesystem::exists(context_cache_path);

if (file_exist) {
// The user needs to remove the existing file to re-generate it
LOGS(logger, INFO) << "EP context file already exists.";
return Status::OK();
}

Model ep_context_model(graph.Name(), false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(),
graph.DomainToVersionMap(), {}, logger);
auto& ep_graph = ep_context_model.MainGraph();
ep_graph.SetDescription(graph.Description());
for (const auto& node : graph.Nodes()) {
// the fused node and the EPContext node have the same node name
auto ep_context_node = get_ep_context_node(node.Name());
// Use the EPContext node created by the EPs if the name matches, otherwise use the node from the original model
if (ep_context_node.first) {
ep_graph.AddNode(*ep_context_node.second);
} else {
ep_graph.AddNode(node);
}
}
ORT_RETURN_IF_ERROR(Model::Save(ep_context_model, context_cache_path));
}

return Status::OK();
}

static Status PartitionOnnxFormatModel(const PartitionParams& partition_params, GraphPartitioner::Mode mode,
const ExecutionProviders& execution_providers,
KernelRegistryManager& kernel_registry_manager) {
KernelRegistryManager& kernel_registry_manager,
bool ep_context_enabled,
std::string ep_context_path,
const logging::Logger& logger) {
bool modified_graph = false;

auto& graph = partition_params.graph.get();
@@ -682,6 +714,10 @@ static Status PartitionOnnxFormatModel(const PartitionParams& partition_params,
partition_params.debug_graph_fn));
}

if (ep_context_enabled) {
ORT_RETURN_IF_ERROR(CreateEpContextModel(execution_providers, graph, ep_context_path, logger));
}

// expand any nodes that have an ONNX function definition but no matching ORT kernel.
modified_graph = false;
ORT_RETURN_IF_ERROR(InlineNodes(graph, modified_graph));
@@ -868,6 +904,8 @@ Status GraphPartitioner::InlineFunctionsAOT(Model& model,

Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr,
const layout_transformation::TransformLayoutFunction& transform_layout_function,
const ConfigOptions& config_options,
const logging::Logger& logger,
Mode mode,
const layout_transformation::DebugGraphFn& debug_graph_fn) const {
// It is currently a greedy partitioning algorithm based on the provider preferences the user provided when calling ONNX Runtime.
@@ -912,8 +950,11 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr,

if (mode == Mode::kNormal || mode == Mode::kAssignOnly) {
#if !defined(ORT_MINIMAL_BUILD)
ORT_RETURN_IF_ERROR(PartitionOnnxFormatModel(partition_params, mode,
providers_, kernel_registry_mgr_));
bool ep_context_enabled = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1";
std::string ep_context_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
ORT_RETURN_IF_ERROR(PartitionOnnxFormatModel(partition_params, mode, providers_,
kernel_registry_mgr_, ep_context_enabled,
ep_context_path, logger));
#else
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "ONNX models are not supported in this build.");
#endif //! defined(ORT_MINIMAL_BUILD)
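The CreateEpContextModel function added above enables a two-session workflow, sketched here under the same assumptions (paths illustrative): the first session compiles with QNN and dumps the EP context model during partitioning; a later session loads that generated model directly and reuses the compiled context. Note the guard in the function: an existing context file is never overwritten, so it must be deleted to force regeneration.

```cpp
#include <string>
#include <unordered_map>
#include "onnxruntime_cxx_api.h"
#include "onnxruntime_session_options_config_keys.h"

int main() {
  Ort::Env env;
  const std::unordered_map<std::string, std::string> qnn_options{{"backend_path", "QnnHtp.dll"}};

  {  // Run 1: compile with QNN and dump the EP context model.
    Ort::SessionOptions so;
    so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");
    so.AppendExecutionProvider("QNN", qnn_options);
    Ort::Session compile_session(env, ORT_TSTR("model.onnx"), so);
    // model.onnx_ctx.onnx is written during session initialization.
  }

  // Run 2 (e.g., on device): load the dumped model; the cached context is used directly.
  Ort::SessionOptions so;
  so.AppendExecutionProvider("QNN", qnn_options);
  Ort::Session cached_session(env, ORT_TSTR("model.onnx_ctx.onnx"), so);
  return 0;
}
```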
3 changes: 3 additions & 0 deletions onnxruntime/core/framework/graph_partitioner.h
@@ -13,6 +13,7 @@ namespace onnxruntime {
class ExecutionProviders;
class KernelRegistryManager;
class Model;
struct ConfigOptions;

class GraphPartitioner {
public:
@@ -31,6 +32,8 @@ class GraphPartitioner {
// Run partitioning.
Status Partition(Graph& graph, FuncManager& func_mgr,
const layout_transformation::TransformLayoutFunction& transform_layout_function,
const ConfigOptions& config_options,
const logging::Logger& logger,
Mode mode = Mode::kNormal,
const layout_transformation::DebugGraphFn& debug_graph_fn = {}) const;

12 changes: 6 additions & 6 deletions onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc
@@ -57,10 +57,10 @@ Status GetMainContextNode(const std::vector<IExecutionProvider::FusedNodeAndGrap
}

Status GetContextFromOnnxModel(const std::vector<IExecutionProvider::FusedNodeAndGraph>& fused_nodes_and_graphs,
const onnxruntime::PathString& ctx_onnx_model_path,
QnnBackendManager* qnn_backend_manager,
const logging::Logger& logger,
std::unordered_map<std::string, std::unique_ptr<qnn::QnnModel>>& qnn_models) {
const onnxruntime::PathString& ctx_onnx_model_path,
QnnBackendManager* qnn_backend_manager,
const logging::Logger& logger,
std::unordered_map<std::string, std::unique_ptr<qnn::QnnModel>>& qnn_models) {
for (const auto& fused_node_and_graph : fused_nodes_and_graphs) {
const Node& fused_node = fused_node_and_graph.fused_node;
qnn_models.emplace(fused_node.Name(),
@@ -204,7 +204,7 @@ bool IsContextCacheFileExists(const std::string& customer_context_cache_path,
if (!customer_context_cache_path.empty()) {
context_cache_path = ToPathString(customer_context_cache_path);
} else if (!model_pathstring.empty()) {
context_cache_path = model_pathstring + ToPathString("_qnn_ctx.onnx");
context_cache_path = model_pathstring + ToPathString("_ctx.onnx");
}

return std::filesystem::is_regular_file(context_cache_path) && std::filesystem::exists(context_cache_path);
@@ -305,7 +305,7 @@ Status GenerateCtxCacheOnnxModel(Model* model,
nullptr,
kMSDomain);

// Only dump the context buffer once since all QNN graph are in one single context
// Only dump the context buffer once since all QNN graphs are in one single context
if (0 == index) {
if (qnn_context_embed_mode) {
std::string cache_payload(buffer, buffer + buffer_size);
30 changes: 9 additions & 21 deletions onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -114,29 +114,17 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
if (session_options) {
disable_cpu_ep_fallback_ = session_options->config_options.GetConfigOrDefault(
kOrtSessionOptionsDisableCPUEPFallback, "0") == "1";
}

static const std::string CONTEXT_CACHE_ENABLED = "qnn_context_cache_enable";
auto context_cache_enabled_pos = provider_options_map.find(CONTEXT_CACHE_ENABLED);
if (context_cache_enabled_pos != provider_options_map.end()) {
if (context_cache_enabled_pos->second == "1") {
context_cache_enabled_ = true;
LOGS_DEFAULT(VERBOSE) << "Context cache enabled.";
}
}

static const std::string CONTEXT_CACHE_PATH = "qnn_context_cache_path";
auto context_cache_path_pos = provider_options_map.find(CONTEXT_CACHE_PATH);
if (context_cache_path_pos != provider_options_map.end()) {
context_cache_path_cfg_ = context_cache_path_pos->second;
LOGS_DEFAULT(VERBOSE) << "User specified context cache path: " << context_cache_path_cfg_;
}
context_cache_enabled_ = session_options->config_options.GetConfigOrDefault(
kOrtSessionOptionEpContextEnable, "0") == "1";
LOGS_DEFAULT(VERBOSE) << "Context cache enable: " << context_cache_enabled_;

static const std::string CONTEXT_CACHE_EMBED_MODE = "qnn_context_embed_mode";
auto context_cache_embed_mode_pos = provider_options_map.find(CONTEXT_CACHE_EMBED_MODE);
if (context_cache_embed_mode_pos != provider_options_map.end()) {
qnn_context_embed_mode_ = context_cache_embed_mode_pos->second == "1";
qnn_context_embed_mode_ = session_options->config_options.GetConfigOrDefault(
kOrtSessionOptionEpContextEmbedMode, "1") == "1";
LOGS_DEFAULT(VERBOSE) << "User specified context cache embed mode: " << qnn_context_embed_mode_;

context_cache_path_cfg_ = session_options->config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
LOGS_DEFAULT(VERBOSE) << "User specified context cache path: " << context_cache_path_cfg_;
}

static const std::string BACKEND_PATH = "backend_path";
@@ -557,7 +545,7 @@ Status QNNExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused
bool is_qnn_ctx_model = qnn::IsFusedGraphHasCtxNode(fused_nodes_and_graphs);

onnxruntime::PathString context_cache_path;
bool is_ctx_file_exist = false;
bool is_ctx_file_exist = false;
if (!is_qnn_ctx_model) {
const onnxruntime::GraphViewer& graph_viewer_0(fused_nodes_and_graphs[0].filtered_graph);
is_ctx_file_exist = qnn::IsContextCacheFileExists(context_cache_path_cfg_,
14 changes: 11 additions & 3 deletions onnxruntime/core/session/inference_session.cc
@@ -1166,6 +1166,7 @@ common::Status InferenceSession::TransformGraph(onnxruntime::Graph& graph, bool

// Do partitioning based on execution providers' capabilities.
ORT_RETURN_IF_ERROR_SESSIONID_(partitioner.Partition(graph, session_state_->GetMutableFuncMgr(), transform_layout_fn,
session_options_.config_options, *session_logger_,
mode, debug_graph_fn));

// apply Level2 and higher transformers.
@@ -1198,7 +1199,10 @@ common::Status InferenceSession::TransformGraph(onnxruntime::Graph& graph, bool
ORT_RETURN_IF_ERROR_SESSIONID_(apply_transformer_once(copy_transformer, *session_logger_, graph));
}

ORT_RETURN_IF_ERROR_SESSIONID_(Model::Save(*model_, "partitioned_graph.onnx"));
bool dump_partitioned_graph = session_options_.config_options.GetConfigOrDefault(kDumpPartitionedGraph, "0") == "1";
if (dump_partitioned_graph) {
ORT_RETURN_IF_ERROR_SESSIONID_(Model::Save(*model_, "partitioned_graph.onnx"));
}

#ifdef ENABLE_TRAINING
// Enable memory optimizations (mainly insert recomputation nodes with priority).
@@ -1462,7 +1466,9 @@ namespace {
Status PartitionOrtFormatModel(onnxruntime::Graph& graph,
const ExecutionProviders& providers,
KernelRegistryManager& kernel_registry_manager,
SessionState& session_state) {
SessionState& session_state,
const ConfigOptions& config_options,
const logging::Logger& logger) {
layout_transformation::TransformLayoutFunction transform_layout_fn = nullptr;

#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
@@ -1483,6 +1489,8 @@ Status PartitionOrtFormatModel(onnxruntime::Graph& graph,
ORT_RETURN_IF_ERROR(partitioner.Partition(graph,
session_state.GetMutableFuncMgr(),
transform_layout_fn,
config_options,
logger,
GraphPartitioner::Mode::kOrtFormatLoad));

return Status::OK();
@@ -1836,7 +1844,7 @@ common::Status InferenceSession::Initialize() {
#endif // !defined(ORT_MINIMAL_BUILD)
} else {
ORT_RETURN_IF_ERROR_SESSIONID_(PartitionOrtFormatModel(graph, execution_providers_, kernel_registry_manager_,
*session_state_));
*session_state_, session_options_.config_options, *session_logger_));

#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
const auto& cpu_ep = *execution_providers_.Get(onnxruntime::kCpuExecutionProvider);