Commit 8a00784
Remove QNN EP options: qnn_context_cache_enable, qnn_context_cache_path, qnn_context_embed_mode. Add session options accordingly.
HectorSVC committed Dec 18, 2023
1 parent 8117368 commit 8a00784
Showing 10 changed files with 135 additions and 79 deletions.
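In application code, the migration looks roughly like the following minimal sketch, assuming the public C++ API; the backend library QnnHtp.dll and all model paths are illustrative placeholders. The first block uses the provider options this commit removes; the second uses the new session config entries:

```cpp
#include <string>
#include <unordered_map>
#include "onnxruntime_cxx_api.h"

void configure_sessions() {
  // Before this commit: QNN context caching via QNN-specific provider options.
  Ort::SessionOptions so_old;
  so_old.AppendExecutionProvider("QNN", {
      {"backend_path", "QnnHtp.dll"},
      {"qnn_context_cache_enable", "1"},             // removed by this commit
      {"qnn_context_cache_path", "model.onnx.bin"},  // removed by this commit
      {"qnn_context_embed_mode", "1"}});             // removed by this commit

  // After this commit: generic EP context session options.
  Ort::SessionOptions so_new;
  so_new.AddConfigEntry("ep.ep_context_enable", "1");                  // kOrtSessionOptionEpContextEnable
  so_new.AddConfigEntry("ep.ep_context_file_path", "model_ctx.onnx");  // kOrtSessionOptionEpContextFilePath
  so_new.AddConfigEntry("ep.ep_context_embed_mode", "1");              // kOrtSessionOptionEpContextEmbedMode
  so_new.AppendExecutionProvider("QNN", {{"backend_path", "QnnHtp.dll"}});
}
```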
6 changes: 0 additions & 6 deletions include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -3593,17 +3593,11 @@ struct OrtApi {
*
* QNN supported keys:
* "backend_path": file path to QNN backend library.
* "qnn_context_cache_enable": 1 to enable QNN graph creation from cached QNN context file. If it's enabled: QNN EP will
* load from cached QNN context binary if it exist. It will generate a context binary file if it's not exist
* "qnn_context_cache_path": explicitly provide the QNN context cache file. Default to model_file.onnx.bin if not provided.
* "profiling_level": QNN profiling level, options: "off", "basic", "detailed". Default to off.
* "rpc_control_latency": QNN RPC control latency.
* "vtcm_mb": QNN VTCM size in MB. default to 0(not set).
* "htp_performance_mode": QNN performance mode, options: "burst", "balanced", "default", "high_performance",
* "high_power_saver", "low_balanced", "low_power_saver", "power_saver", "sustained_high_performance". Default to "default".
* "qnn_context_embed_mode", 1 means dump the QNN context binary into node attribute EPContext->ep_cache_context in the ONNX skeleton model.
* 0 means dump the QNN context binary into separate bin file and set the path to EPContext->ep_cache_context.
* The path is relative path to the ONNX skeleton model file.
* "qnn_saver_path": File path to the QNN Saver backend library. If specified, QNN Saver will be enabled and will
* dump QNN API calls to disk for replay/debugging. QNN Saver produces incorrect model inference results and
* may alter model/EP partitioning. Use only for debugging.
include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -235,3 +235,21 @@ static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFil
// Use this config to control the minimum size of the initializer when externalizing it during serialization
static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes =
"session.optimized_model_external_initializers_min_size_in_bytes";

// Enable the EP context feature to dump the partitioned graph, which includes the EP context, into an ONNX file.
// The dumped ONNX model with EP context can be used for future inference to avoid the EP graph partitioning/compile overhead.
// "0": disable. (default)
// "1": enable.
static const char* const kOrtSessionOptionEpContextEnable = "ep.ep_context_enable";

// Specify the file path for the ONNX model which has the EP context.
// Defaults to original_file_name_ctx.onnx if not specified.
static const char* const kOrtSessionOptionEpContextFilePath = "ep.ep_context_file_path";

// Flag to specify whether to dump the EP context into the ONNX model.
// "0": dump the EP context into a separate file, and keep the file name in the ONNX model.
// "1": dump the EP context into the ONNX model (default).
static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.ep_context_embed_mode";

// Dump the model after graph partitioning to file "partitioned_graph.onnx".
static const char* const kDumpPartitionedGraph = "session.dump_partitioned_graph";
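A minimal usage sketch for the keys declared above, assuming a C++ embedder (QnnHtp.dll and model.onnx are placeholders); with embed mode "0" the context binary is written to a separate file and only its relative path is stored in the EPContext node:

```cpp
#include <string>
#include <unordered_map>
#include "onnxruntime_cxx_api.h"
#include "onnxruntime_session_options_config_keys.h"

int main() {
  Ort::Env env;
  Ort::SessionOptions so;
  so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");     // dump an EP context model
  so.AddConfigEntry(kOrtSessionOptionEpContextEmbedMode, "0");  // context binary in a separate file
  // Without kOrtSessionOptionEpContextFilePath, the output defaults to
  // <input model path> + "_ctx.onnx" next to the original model.
  so.AppendExecutionProvider("QNN", {{"backend_path", "QnnHtp.dll"}});
  Ort::Session session(env, ORT_TSTR("model.onnx"), so);  // writes model.onnx_ctx.onnx
  return 0;
}
```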
103 changes: 72 additions & 31 deletions onnxruntime/core/framework/graph_partitioner.cc
Original file line number Diff line number Diff line change
@@ -16,6 +16,7 @@
#include "core/graph/function_utils.h"
#include "core/graph/graph_viewer.h"
#include "core/graph/model.h"
#include "core/session/onnxruntime_session_options_config_keys.h"

// uncomment this line to count non-CUDA ops in ONNX domain
// #define COUNT_NON_CUDA_OPS
@@ -510,34 +511,6 @@ static Status PartitionOnnxFormatModelImpl(Graph& graph, FuncManager& func_mgr,
ORT_RETURN_IF_ERROR(graph.Resolve());
}

const std::vector<const Node*> ep_context_nodes = current_ep.GetEpContextNodes();
auto get_ep_context_node = [&ep_context_nodes](const std::string& node_name) -> std::pair<bool, const Node*> {
for (auto& node : ep_context_nodes) {
if (node_name == node->Name()) {
return std::make_pair(true, node);
}
}
return std::make_pair(false, static_cast<const Node*>(nullptr));
};

if (ep_context_nodes.size() > 0) {
Model ep_model(graph.Name(), false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(),
graph.DomainToVersionMap(), {}, *current_ep.GetLogger());
auto& ep_graph = ep_model.MainGraph();
ep_graph.SetDescription(graph.Description());
for (const auto& node : graph.Nodes()) {
// the fused node and EPContext node have the same node name
auto ep_context_node = get_ep_context_node(node.Name());
// Use the EPContext node created by the current EP if the name matches, otherwise use the original node
if (ep_context_node.first) {
ep_graph.AddNode(*ep_context_node.second);
} else {
ep_graph.AddNode(node);
}
}
ORT_RETURN_IF_ERROR(Model::Save(ep_model, "ep_partition.onnx"));
}

// For some cases, like fp16 on cpu, right now we don't have any kernel that supports it.
// But we will insert a cast op to run the model, so skip the error checking here.
// If the node is still not assigned after the graph transform phase, we will report an error
@@ -662,9 +635,68 @@ static Status InlineFunctionsAOTImpl(const ExecutionProviders& execution_provide
return Status::OK();
}

static Status CreateEpContextModel(const ExecutionProviders& execution_providers,
const Graph& graph,
const std::string& ep_context_path,
const logging::Logger& logger) {
std::vector<const Node*> all_ep_context_nodes;
for (const auto& ep : execution_providers) {
const std::vector<const Node*> ep_context_nodes = ep->GetEpContextNodes();
all_ep_context_nodes.insert(all_ep_context_nodes.begin(), ep_context_nodes.begin(), ep_context_nodes.end());
}

auto get_ep_context_node = [&all_ep_context_nodes](const std::string& node_name) -> std::pair<bool, const Node*> {
for (auto& node : all_ep_context_nodes) {
if (node_name == node->Name()) {
return std::make_pair(true, node);
}
}
return std::make_pair(false, static_cast<const Node*>(nullptr));
};

onnxruntime::PathString context_cache_path;
PathString model_pathstring = graph.ModelPath().ToPathString();
if (all_ep_context_nodes.size() > 0) {
if (!ep_context_path.empty()) {
context_cache_path = ToPathString(ep_context_path);
} else if (!model_pathstring.empty()) {
context_cache_path = model_pathstring + ToPathString("_ctx.onnx");
}

bool file_exist = std::filesystem::is_regular_file(context_cache_path) && std::filesystem::exists(context_cache_path);

if (file_exist) {
// The user needs to remove the existing file to re-generate it
LOGS(logger, INFO) << "EP context file already exists.";
return Status::OK();
}

Model ep_context_model(graph.Name(), false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(),
graph.DomainToVersionMap(), {}, logger);
auto& ep_graph = ep_context_model.MainGraph();
ep_graph.SetDescription(graph.Description());
for (const auto& node : graph.Nodes()) {
// the fused node and the EPContext node have the same node name
auto ep_context_node = get_ep_context_node(node.Name());
// Use the EPContext node created by the EPs if the name matches, otherwise use the node from the original model
if (ep_context_node.first) {
ep_graph.AddNode(*ep_context_node.second);
} else {
ep_graph.AddNode(node);
}
}
ORT_RETURN_IF_ERROR(Model::Save(ep_context_model, context_cache_path));
}

return Status::OK();
}

static Status PartitionOnnxFormatModel(const PartitionParams& partition_params, GraphPartitioner::Mode mode,
const ExecutionProviders& execution_providers,
KernelRegistryManager& kernel_registry_manager) {
KernelRegistryManager& kernel_registry_manager,
bool ep_context_enabled,
std::string ep_context_path,
const logging::Logger& logger) {
bool modified_graph = false;

auto& graph = partition_params.graph.get();
@@ -682,6 +714,10 @@ static Status PartitionOnnxFormatModel(const PartitionParams& partition_params,
partition_params.debug_graph_fn));
}

if (ep_context_enabled) {
ORT_RETURN_IF_ERROR(CreateEpContextModel(execution_providers, graph, ep_context_path, logger));
}

// expand any nodes that have an ONNX function definition but no matching ORT kernel.
modified_graph = false;
ORT_RETURN_IF_ERROR(InlineNodes(graph, modified_graph));
@@ -868,6 +904,8 @@ Status GraphPartitioner::InlineFunctionsAOT(Model& model,

Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr,
const layout_transformation::TransformLayoutFunction& transform_layout_function,
const ConfigOptions& config_options,
const logging::Logger& logger,
Mode mode,
const layout_transformation::DebugGraphFn& debug_graph_fn) const {
// It is currently a greedy partitioning algorithm based on the provider preferences the user provided when calling ONNX Runtime.
@@ -912,8 +950,11 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr,

if (mode == Mode::kNormal || mode == Mode::kAssignOnly) {
#if !defined(ORT_MINIMAL_BUILD)
ORT_RETURN_IF_ERROR(PartitionOnnxFormatModel(partition_params, mode,
providers_, kernel_registry_mgr_));
bool ep_context_enabled = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1";
std::string ep_context_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
ORT_RETURN_IF_ERROR(PartitionOnnxFormatModel(partition_params, mode, providers_,
kernel_registry_mgr_, ep_context_enabled,
ep_context_path, logger));
#else
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "ONNX models are not supported in this build.");
#endif //! defined(ORT_MINIMAL_BUILD)
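The CreateEpContextModel function added above enables a two-session workflow, sketched here under the same assumptions (paths illustrative): the first session compiles with QNN and dumps the EP context model during partitioning; a later session loads that generated model directly and reuses the compiled context. Note the guard in the function: an existing context file is never overwritten, so it must be deleted to force regeneration.

```cpp
#include <string>
#include <unordered_map>
#include "onnxruntime_cxx_api.h"
#include "onnxruntime_session_options_config_keys.h"

int main() {
  Ort::Env env;
  const std::unordered_map<std::string, std::string> qnn_options{{"backend_path", "QnnHtp.dll"}};

  {  // Run 1: compile with QNN and dump the EP context model.
    Ort::SessionOptions so;
    so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");
    so.AppendExecutionProvider("QNN", qnn_options);
    Ort::Session compile_session(env, ORT_TSTR("model.onnx"), so);
    // model.onnx_ctx.onnx is written during session initialization.
  }

  // Run 2 (e.g., on device): load the dumped model; the cached context is used directly.
  Ort::SessionOptions so;
  so.AppendExecutionProvider("QNN", qnn_options);
  Ort::Session cached_session(env, ORT_TSTR("model.onnx_ctx.onnx"), so);
  return 0;
}
```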
3 changes: 3 additions & 0 deletions onnxruntime/core/framework/graph_partitioner.h
@@ -13,6 +13,7 @@ namespace onnxruntime {
class ExecutionProviders;
class KernelRegistryManager;
class Model;
struct ConfigOptions;

class GraphPartitioner {
public:
@@ -31,6 +32,8 @@ class GraphPartitioner {
// Run partitioning.
Status Partition(Graph& graph, FuncManager& func_mgr,
const layout_transformation::TransformLayoutFunction& transform_layout_function,
const ConfigOptions& config_options,
const logging::Logger& logger,
Mode mode = Mode::kNormal,
const layout_transformation::DebugGraphFn& debug_graph_fn = {}) const;

12 changes: 6 additions & 6 deletions onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc
@@ -57,10 +57,10 @@ Status GetMainContextNode(const std::vector<IExecutionProvider::FusedNodeAndGrap
}

Status GetContextFromOnnxModel(const std::vector<IExecutionProvider::FusedNodeAndGraph>& fused_nodes_and_graphs,
const onnxruntime::PathString& ctx_onnx_model_path,
QnnBackendManager* qnn_backend_manager,
const logging::Logger& logger,
std::unordered_map<std::string, std::unique_ptr<qnn::QnnModel>>& qnn_models) {
const onnxruntime::PathString& ctx_onnx_model_path,
QnnBackendManager* qnn_backend_manager,
const logging::Logger& logger,
std::unordered_map<std::string, std::unique_ptr<qnn::QnnModel>>& qnn_models) {
for (const auto& fused_node_and_graph : fused_nodes_and_graphs) {
const Node& fused_node = fused_node_and_graph.fused_node;
qnn_models.emplace(fused_node.Name(),
@@ -204,7 +204,7 @@ bool IsContextCacheFileExists(const std::string& customer_context_cache_path,
if (!customer_context_cache_path.empty()) {
context_cache_path = ToPathString(customer_context_cache_path);
} else if (!model_pathstring.empty()) {
context_cache_path = model_pathstring + ToPathString("_qnn_ctx.onnx");
context_cache_path = model_pathstring + ToPathString("_ctx.onnx");
}

return std::filesystem::is_regular_file(context_cache_path) && std::filesystem::exists(context_cache_path);
@@ -305,7 +305,7 @@ Status GenerateCtxCacheOnnxModel(Model* model,
nullptr,
kMSDomain);

// Only dump the context buffer once since all QNN graph are in one single context
// Only dump the context buffer once since all QNN graphs are in one single context
if (0 == index) {
if (qnn_context_embed_mode) {
std::string cache_payload(buffer, buffer + buffer_size);
30 changes: 9 additions & 21 deletions onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -114,29 +114,17 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
if (session_options) {
disable_cpu_ep_fallback_ = session_options->config_options.GetConfigOrDefault(
kOrtSessionOptionsDisableCPUEPFallback, "0") == "1";
}

static const std::string CONTEXT_CACHE_ENABLED = "qnn_context_cache_enable";
auto context_cache_enabled_pos = provider_options_map.find(CONTEXT_CACHE_ENABLED);
if (context_cache_enabled_pos != provider_options_map.end()) {
if (context_cache_enabled_pos->second == "1") {
context_cache_enabled_ = true;
LOGS_DEFAULT(VERBOSE) << "Context cache enabled.";
}
}

static const std::string CONTEXT_CACHE_PATH = "qnn_context_cache_path";
auto context_cache_path_pos = provider_options_map.find(CONTEXT_CACHE_PATH);
if (context_cache_path_pos != provider_options_map.end()) {
context_cache_path_cfg_ = context_cache_path_pos->second;
LOGS_DEFAULT(VERBOSE) << "User specified context cache path: " << context_cache_path_cfg_;
}
context_cache_enabled_ = session_options->config_options.GetConfigOrDefault(
kOrtSessionOptionEpContextEnable, "0") == "1";
LOGS_DEFAULT(VERBOSE) << "Context cache enable: " << context_cache_enabled_;

static const std::string CONTEXT_CACHE_EMBED_MODE = "qnn_context_embed_mode";
auto context_cache_embed_mode_pos = provider_options_map.find(CONTEXT_CACHE_EMBED_MODE);
if (context_cache_embed_mode_pos != provider_options_map.end()) {
qnn_context_embed_mode_ = context_cache_embed_mode_pos->second == "1";
qnn_context_embed_mode_ = session_options->config_options.GetConfigOrDefault(
kOrtSessionOptionEpContextEmbedMode, "1") == "1";
LOGS_DEFAULT(VERBOSE) << "User specified context cache embed mode: " << qnn_context_embed_mode_;

context_cache_path_cfg_ = session_options->config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
LOGS_DEFAULT(VERBOSE) << "User specified context cache path: " << context_cache_path_cfg_;
}

static const std::string BACKEND_PATH = "backend_path";
@@ -557,7 +545,7 @@ Status QNNExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused
bool is_qnn_ctx_model = qnn::IsFusedGraphHasCtxNode(fused_nodes_and_graphs);

onnxruntime::PathString context_cache_path;
bool is_ctx_file_exist = false;
bool is_ctx_file_exist = false;
if (!is_qnn_ctx_model) {
const onnxruntime::GraphViewer& graph_viewer_0(fused_nodes_and_graphs[0].filtered_graph);
is_ctx_file_exist = qnn::IsContextCacheFileExists(context_cache_path_cfg_,
14 changes: 11 additions & 3 deletions onnxruntime/core/session/inference_session.cc
@@ -1166,6 +1166,7 @@ common::Status InferenceSession::TransformGraph(onnxruntime::Graph& graph, bool

// Do partitioning based on execution providers' capabilities.
ORT_RETURN_IF_ERROR_SESSIONID_(partitioner.Partition(graph, session_state_->GetMutableFuncMgr(), transform_layout_fn,
session_options_.config_options, *session_logger_,
mode, debug_graph_fn));

// apply Level2 and higher transformers.
@@ -1198,7 +1199,10 @@ common::Status InferenceSession::TransformGraph(onnxruntime::Graph& graph, bool
ORT_RETURN_IF_ERROR_SESSIONID_(apply_transformer_once(copy_transformer, *session_logger_, graph));
}

ORT_RETURN_IF_ERROR_SESSIONID_(Model::Save(*model_, "partitioned_graph.onnx"));
bool dump_partitioned_graph = session_options_.config_options.GetConfigOrDefault(kDumpPartitionedGraph, "0") == "1";
if (dump_partitioned_graph) {
ORT_RETURN_IF_ERROR_SESSIONID_(Model::Save(*model_, "partitioned_graph.onnx"));
}

#ifdef ENABLE_TRAINING
// Enable memory optimizations (mainly insert recomputation nodes with priority).
@@ -1462,7 +1466,9 @@ namespace {
Status PartitionOrtFormatModel(onnxruntime::Graph& graph,
const ExecutionProviders& providers,
KernelRegistryManager& kernel_registry_manager,
SessionState& session_state) {
SessionState& session_state,
const ConfigOptions& config_options,
const logging::Logger& logger) {
layout_transformation::TransformLayoutFunction transform_layout_fn = nullptr;

#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
@@ -1483,6 +1489,8 @@ Status PartitionOrtFormatModel(onnxruntime::Graph& graph,
ORT_RETURN_IF_ERROR(partitioner.Partition(graph,
session_state.GetMutableFuncMgr(),
transform_layout_fn,
config_options,
logger,
GraphPartitioner::Mode::kOrtFormatLoad));

return Status::OK();
@@ -1836,7 +1844,7 @@ common::Status InferenceSession::Initialize() {
#endif // !defined(ORT_MINIMAL_BUILD)
} else {
ORT_RETURN_IF_ERROR_SESSIONID_(PartitionOrtFormatModel(graph, execution_providers_, kernel_registry_manager_,
*session_state_));
*session_state_, session_options_.config_options, *session_logger_));

#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
const auto& cpu_ep = *execution_providers_.Get(onnxruntime::kCpuExecutionProvider);