From 8a00784019a07a3755e11731274133cb62ef12d2 Mon Sep 17 00:00:00 2001
From: Hector Li
Date: Mon, 18 Dec 2023 01:33:05 -0800
Subject: [PATCH] Remove QNN EP options: qnn_context_cache_enable,
 qnn_context_cache_path, qnn_context_embed_mode. Add session options
 accordingly.

---
 .../core/session/onnxruntime_c_api.h          |   6 -
 .../onnxruntime_session_options_config_keys.h |  18 +++
 .../core/framework/graph_partitioner.cc       | 103 ++++++++++++------
 .../core/framework/graph_partitioner.h        |   3 +
 .../qnn/builder/onnx_ctx_model_helper.cc      |  12 +-
 .../providers/qnn/qnn_execution_provider.cc   |  30 ++---
 onnxruntime/core/session/inference_session.cc |  14 ++-
 onnxruntime/test/onnx/main.cc                 |  20 +++-
 .../test/perftest/command_args_parser.cc      |   2 -
 onnxruntime/test/perftest/ort_test_session.cc |   6 -
 10 files changed, 135 insertions(+), 79 deletions(-)

diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
index c41700453a73b..dbd5ad41255fa 100644
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -3593,17 +3593,11 @@ struct OrtApi {
    *
    * QNN supported keys:
    *   "backend_path": file path to QNN backend library.
-   *   "qnn_context_cache_enable": 1 to enable QNN graph creation from cached QNN context file. If it's enabled: QNN EP will
-   *   load from cached QNN context binary if it exist. It will generate a context binary file if it's not exist
-   *   "qnn_context_cache_path": explicitly provide the QNN context cache file. Default to model_file.onnx.bin if not provided.
    *   "profiling_level": QNN profiling level, options: "off", "basic", "detailed". Default to off.
    *   "rpc_control_latency": QNN RPC control latency.
    *   "vtcm_mb": QNN VTCM size in MB. default to 0(not set).
    *   "htp_performance_mode": QNN performance mode, options: "burst", "balanced", "default", "high_performance",
    *   "high_power_saver", "low_balanced", "low_power_saver", "power_saver", "sustained_high_performance". Default to "default".
-   *   "qnn_context_embed_mode", 1 means dump the QNN context binary into node attribute EPContext->ep_cache_context in the ONNX skeleton model.
-   *   0 means dump the QNN context binary into separate bin file and set the path to EPContext->ep_cache_context.
-   *   The path is relative path to the ONNX skeleton model file.
    *   "qnn_saver_path": File path to the QNN Saver backend library. If specified, QNN Saver will be enabled and will
    *   dump QNN API calls to disk for replay/debugging. QNN Saver produces incorrect model inference results and
    *   may alter model/EP partitioning. Use only for debugging.
diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
index a94973b2cc5d7..c0f503ea02821 100644
--- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -235,3 +235,21 @@ static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFil
 // Use this config to control the minimum size of the initializer when externalizing it during serialization
 static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes =
     "session.optimized_model_external_initializers_min_size_in_bytes";
+
+// Enable the EP context feature to dump the partitioned graph, which includes the EP context, into an ONNX file.
+// The dumped ONNX model with the EP context can be used for later inference, avoiding the EP graph partitioning/compilation overhead.
+// "0": disable. (default)
+// "1": enable.
+static const char* const kOrtSessionOptionEpContextEnable = "ep.ep_context_enable";
+
+// Specify the file path for the ONNX model that contains the EP context.
+// Defaults to <original file name>_ctx.onnx if not specified.
+static const char* const kOrtSessionOptionEpContextFilePath = "ep.ep_context_file_path";
+
+// Flag to specify whether to dump the EP context into the ONNX model.
+// "0": dump the EP context into a separate file and store the file path in the ONNX model.
+// "1": dump the EP context into the ONNX model. (default)
+static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.ep_context_embed_mode";
+
+// Dump the model after graph partitioning to the file "partitioned_graph.onnx".
+static const char* const kDumpPartitionedGraph = "session.dump_partitioned_graph";
\ No newline at end of file
diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc
index 55e3bf6e05a7c..f6f1a1e6aba93 100644
--- a/onnxruntime/core/framework/graph_partitioner.cc
+++ b/onnxruntime/core/framework/graph_partitioner.cc
@@ -16,6 +16,7 @@
 #include "core/graph/function_utils.h"
 #include "core/graph/graph_viewer.h"
 #include "core/graph/model.h"
+#include "core/session/onnxruntime_session_options_config_keys.h"
 
 // uncomment this line to count non-CUDA ops in ONNX domain
 // #define COUNT_NON_CUDA_OPS
@@ -510,34 +511,6 @@ static Status PartitionOnnxFormatModelImpl(Graph& graph, FuncManager& func_mgr,
     ORT_RETURN_IF_ERROR(graph.Resolve());
   }
 
-  const std::vector<const Node*> ep_context_nodes = current_ep.GetEpContextNodes();
-  auto get_ep_context_node = [&ep_context_nodes](const std::string& node_name) -> std::pair<bool, const Node*> {
-    for (auto& node : ep_context_nodes) {
-      if (node_name == node->Name()) {
-        return std::make_pair(true, node);
-      }
-    }
-    return std::make_pair(false, static_cast<const Node*>(nullptr));
-  };
-
-  if (ep_context_nodes.size() > 0) {
-    Model ep_model(graph.Name(), false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(),
-                   graph.DomainToVersionMap(), {}, *current_ep.GetLogger());
-    auto& ep_graph = ep_model.MainGraph();
-    ep_graph.SetDescription(graph.Description());
-    for (const auto& node : graph.Nodes()) {
-      // the fused node and EPContext node has same node name
-      auto ep_context_node = get_ep_context_node(node.Name());
-      // Use EpContext node created by current EP if name matched, otherwise use original node
-      if (ep_context_node.first) {
-        ep_graph.AddNode(*ep_context_node.second);
-      } else {
-        ep_graph.AddNode(node);
-      }
-    }
-    ORT_RETURN_IF_ERROR(Model::Save(ep_model, "ep_partition.onnx"));
-  }
-
   // For some cases, like fp16 on cpu, right now we don't have any kernel support that.
   // But we will insert cast op to run the model, so skip the error checking here.
   // If after graph transform phase, the node still not assigned, we will report error
@@ -662,9 +635,68 @@ static Status InlineFunctionsAOTImpl(const ExecutionProviders& execution_provide
   return Status::OK();
 }
 
+static Status CreateEpContextModel(const ExecutionProviders& execution_providers,
+                                   const Graph& graph,
+                                   const std::string& ep_context_path,
+                                   const logging::Logger& logger) {
+  std::vector<const Node*> all_ep_context_nodes;
+  for (const auto& ep : execution_providers) {
+    const std::vector<const Node*> ep_context_nodes = ep->GetEpContextNodes();
+    all_ep_context_nodes.insert(all_ep_context_nodes.begin(), ep_context_nodes.begin(), ep_context_nodes.end());
+  }
+
+  auto get_ep_context_node = [&all_ep_context_nodes](const std::string& node_name) -> std::pair<bool, const Node*> {
+    for (auto& node : all_ep_context_nodes) {
+      if (node_name == node->Name()) {
+        return std::make_pair(true, node);
+      }
+    }
+    return std::make_pair(false, static_cast<const Node*>(nullptr));
+  };
+
+  onnxruntime::PathString context_cache_path;
+  PathString model_pathstring = graph.ModelPath().ToPathString();
+  if (all_ep_context_nodes.size() > 0) {
+    if (!ep_context_path.empty()) {
+      context_cache_path = ToPathString(ep_context_path);
+    } else if (!model_pathstring.empty()) {
+      context_cache_path = model_pathstring + ToPathString("_ctx.onnx");
+    }
+
+    bool file_exist = std::filesystem::is_regular_file(context_cache_path) && std::filesystem::exists(context_cache_path);
+
+    if (file_exist) {
+      // The user needs to remove the existing file to re-generate it
+      LOGS(logger, INFO) << "EP context file already exists.";
+      return Status::OK();
+    }
+
+    Model ep_context_model(graph.Name(), false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(),
+                           graph.DomainToVersionMap(), {}, logger);
+    auto& ep_graph = ep_context_model.MainGraph();
+    ep_graph.SetDescription(graph.Description());
+    for (const auto& node : graph.Nodes()) {
+      // The fused node and the EPContext node have the same node name
+      auto ep_context_node = get_ep_context_node(node.Name());
+      // Use the EPContext node created by the EPs if the name matches; otherwise use the node from the original model
+      if (ep_context_node.first) {
+        ep_graph.AddNode(*ep_context_node.second);
+      } else {
+        ep_graph.AddNode(node);
+      }
+    }
+    ORT_RETURN_IF_ERROR(Model::Save(ep_context_model, context_cache_path));
+  }
+
+  return Status::OK();
+}
+
 static Status PartitionOnnxFormatModel(const PartitionParams& partition_params, GraphPartitioner::Mode mode,
                                        const ExecutionProviders& execution_providers,
-                                       KernelRegistryManager& kernel_registry_manager) {
+                                       KernelRegistryManager& kernel_registry_manager,
+                                       bool ep_context_enabled,
+                                       std::string ep_context_path,
+                                       const logging::Logger& logger) {
   bool modified_graph = false;
 
   auto& graph = partition_params.graph.get();
@@ -682,6 +714,10 @@ static Status PartitionOnnxFormatModel(const PartitionParams& partition_params,
                                                  partition_params.debug_graph_fn));
   }
 
+  if (ep_context_enabled) {
+    ORT_RETURN_IF_ERROR(CreateEpContextModel(execution_providers, graph, ep_context_path, logger));
+  }
+
   // expand any nodes that have an ONNX function definition but no matching ORT kernel.
   modified_graph = false;
   ORT_RETURN_IF_ERROR(InlineNodes(graph, modified_graph));
@@ -868,6 +904,8 @@ Status GraphPartitioner::InlineFunctionsAOT(Model& model,
 
 Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr,
                                    const layout_transformation::TransformLayoutFunction& transform_layout_function,
+                                   const ConfigOptions& config_options,
+                                   const logging::Logger& logger,
                                    Mode mode,
                                    const layout_transformation::DebugGraphFn& debug_graph_fn) const {
   // It is a greedy partitioning algorithm per provider preferences user provided when calling ONNX RUNTIME right now.
@@ -912,8 +950,11 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr,
 
   if (mode == Mode::kNormal || mode == Mode::kAssignOnly) {
 #if !defined(ORT_MINIMAL_BUILD)
-    ORT_RETURN_IF_ERROR(PartitionOnnxFormatModel(partition_params, mode,
-                                                 providers_, kernel_registry_mgr_));
+    bool ep_context_enabled = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1";
+    std::string ep_context_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
+    ORT_RETURN_IF_ERROR(PartitionOnnxFormatModel(partition_params, mode, providers_,
+                                                 kernel_registry_mgr_, ep_context_enabled,
+                                                 ep_context_path, logger));
 #else
     return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "ONNX models are not supported in this build.");
 #endif  //! defined(ORT_MINIMAL_BUILD)
diff --git a/onnxruntime/core/framework/graph_partitioner.h b/onnxruntime/core/framework/graph_partitioner.h
index 4fc85c2588260..d1ef193cf1520 100644
--- a/onnxruntime/core/framework/graph_partitioner.h
+++ b/onnxruntime/core/framework/graph_partitioner.h
@@ -13,6 +13,7 @@ namespace onnxruntime {
 class ExecutionProviders;
 class KernelRegistryManager;
 class Model;
+struct ConfigOptions;
 
 class GraphPartitioner {
  public:
@@ -31,6 +32,8 @@ class GraphPartitioner {
   // Run partitioning.
   Status Partition(Graph& graph, FuncManager& func_mgr,
                    const layout_transformation::TransformLayoutFunction& transform_layout_function,
+                   const ConfigOptions& config_options,
+                   const logging::Logger& logger,
                    Mode mode = Mode::kNormal,
                    const layout_transformation::DebugGraphFn& debug_graph_fn = {}) const;
 
diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc
index a9075cb984734..08f862d2b4dcb 100644
--- a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc
+++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc
@@ -57,10 +57,10 @@ Status GetMainContextNode(const std::vector<IExecutionProvider::FusedNodeAndGraph>& fused_nodes_and_graphs,
-                           const onnxruntime::PathString& ctx_onnx_model_path,
-                           QnnBackendManager* qnn_backend_manager,
-                           const logging::Logger& logger,
-                           std::unordered_map<std::string, std::unique_ptr<qnn::QnnModel>>& qnn_models) {
+                          const onnxruntime::PathString& ctx_onnx_model_path,
+                          QnnBackendManager* qnn_backend_manager,
+                          const logging::Logger& logger,
+                          std::unordered_map<std::string, std::unique_ptr<qnn::QnnModel>>& qnn_models) {
   for (const auto& fused_node_and_graph : fused_nodes_and_graphs) {
     const Node& fused_node = fused_node_and_graph.fused_node;
     qnn_models.emplace(fused_node.Name(),
@@ -204,7 +204,7 @@ bool IsContextCacheFileExists(const std::string& customer_context_cache_path,
   if (!customer_context_cache_path.empty()) {
     context_cache_path = ToPathString(customer_context_cache_path);
   } else if (!model_pathstring.empty()) {
-    context_cache_path = model_pathstring + ToPathString("_qnn_ctx.onnx");
+    context_cache_path = model_pathstring + ToPathString("_ctx.onnx");
   }
 
   return std::filesystem::is_regular_file(context_cache_path) && std::filesystem::exists(context_cache_path);
@@ -305,7 +305,7 @@ Status GenerateCtxCacheOnnxModel(Model* model,
                                  nullptr,
                                  kMSDomain);
 
-  // Only dump the context buffer once since all QNN graph are in one single context
+  // Only dump the context buffer once since all QNN graphs are in one single context
   if (0 == index) {
     if (qnn_context_embed_mode) {
       std::string cache_payload(buffer, buffer + buffer_size);
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
index 30f3fddf59019..7acec57f51db5 100644
--- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -114,29 +114,17 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
   if (session_options) {
     disable_cpu_ep_fallback_ = session_options->config_options.GetConfigOrDefault(
         kOrtSessionOptionsDisableCPUEPFallback, "0") == "1";
-  }
-
-  static const std::string CONTEXT_CACHE_ENABLED = "qnn_context_cache_enable";
-  auto context_cache_enabled_pos = provider_options_map.find(CONTEXT_CACHE_ENABLED);
-  if (context_cache_enabled_pos != provider_options_map.end()) {
-    if (context_cache_enabled_pos->second == "1") {
-      context_cache_enabled_ = true;
-      LOGS_DEFAULT(VERBOSE) << "Context cache enabled.";
-    }
-  }
 
-  static const std::string CONTEXT_CACHE_PATH = "qnn_context_cache_path";
-  auto context_cache_path_pos = provider_options_map.find(CONTEXT_CACHE_PATH);
-  if (context_cache_path_pos != provider_options_map.end()) {
-    context_cache_path_cfg_ = context_cache_path_pos->second;
-    LOGS_DEFAULT(VERBOSE) << "User specified context cache path: " << context_cache_path_cfg_;
-  }
+    context_cache_enabled_ = session_options->config_options.GetConfigOrDefault(
+        kOrtSessionOptionEpContextEnable, "0") == "1";
+    LOGS_DEFAULT(VERBOSE) << "Context cache enabled: " << context_cache_enabled_;
 
-  static const std::string CONTEXT_CACHE_EMBED_MODE = "qnn_context_embed_mode";
-  auto context_cache_embed_mode_pos = provider_options_map.find(CONTEXT_CACHE_EMBED_MODE);
-  if (context_cache_embed_mode_pos != provider_options_map.end()) {
-    qnn_context_embed_mode_ = context_cache_embed_mode_pos->second == "1";
+    qnn_context_embed_mode_ = session_options->config_options.GetConfigOrDefault(
+        kOrtSessionOptionEpContextEmbedMode, "1") == "1";
     LOGS_DEFAULT(VERBOSE) << "User specified context cache embed mode: " << qnn_context_embed_mode_;
+
+    context_cache_path_cfg_ = session_options->config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
+    LOGS_DEFAULT(VERBOSE) << "User specified context cache path: " << context_cache_path_cfg_;
   }
 
   static const std::string BACKEND_PATH = "backend_path";
@@ -557,7 +545,7 @@ Status QNNExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused
   bool is_qnn_ctx_model = qnn::IsFusedGraphHasCtxNode(fused_nodes_and_graphs);
 
   onnxruntime::PathString context_cache_path;
-  bool is_ctx_file_exist = false; 
+  bool is_ctx_file_exist = false;
   if (!is_qnn_ctx_model) {
     const onnxruntime::GraphViewer& graph_viewer_0(fused_nodes_and_graphs[0].filtered_graph);
     is_ctx_file_exist = qnn::IsContextCacheFileExists(context_cache_path_cfg_,
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index fcc33a75ce9a0..fda67b3685bd9 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -1166,6 +1166,7 @@ common::Status InferenceSession::TransformGraph(onnxruntime::Graph& graph, bool
   // Do partitioning based on execution providers' capabilities.
   ORT_RETURN_IF_ERROR_SESSIONID_(partitioner.Partition(graph, session_state_->GetMutableFuncMgr(), transform_layout_fn,
+                                                       session_options_.config_options, *session_logger_,
                                                        mode, debug_graph_fn));
 
   // apply Level2 and higher transformers.
@@ -1198,7 +1199,10 @@ common::Status InferenceSession::TransformGraph(onnxruntime::Graph& graph, bool
     ORT_RETURN_IF_ERROR_SESSIONID_(apply_transformer_once(copy_transformer, *session_logger_, graph));
   }
 
-  ORT_RETURN_IF_ERROR_SESSIONID_(Model::Save(*model_, "partitioned_graph.onnx"));
+  bool dump_partitioned_graph = session_options_.config_options.GetConfigOrDefault(kDumpPartitionedGraph, "0") == "1";
+  if (dump_partitioned_graph) {
+    ORT_RETURN_IF_ERROR_SESSIONID_(Model::Save(*model_, "partitioned_graph.onnx"));
+  }
 
 #ifdef ENABLE_TRAINING
   // Enable memory optimizations (mainly insert recomputation nodes with priority).
@@ -1462,7 +1466,9 @@ namespace {
 Status PartitionOrtFormatModel(onnxruntime::Graph& graph,
                                const ExecutionProviders& providers,
                                KernelRegistryManager& kernel_registry_manager,
-                               SessionState& session_state) {
+                               SessionState& session_state,
+                               const ConfigOptions& config_options,
+                               const logging::Logger& logger) {
   layout_transformation::TransformLayoutFunction transform_layout_fn = nullptr;
 
 #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
@@ -1483,6 +1489,8 @@ Status PartitionOrtFormatModel(onnxruntime::Graph& graph,
   ORT_RETURN_IF_ERROR(partitioner.Partition(graph,
                                             session_state.GetMutableFuncMgr(),
                                             transform_layout_fn,
+                                            config_options,
+                                            logger,
                                             GraphPartitioner::Mode::kOrtFormatLoad));
 
   return Status::OK();
@@ -1836,7 +1844,7 @@ common::Status InferenceSession::Initialize() {
 #endif  // !defined(ORT_MINIMAL_BUILD)
     } else {
       ORT_RETURN_IF_ERROR_SESSIONID_(PartitionOrtFormatModel(graph, execution_providers_, kernel_registry_manager_,
-                                                             *session_state_));
+                                                             *session_state_, session_options_.config_options, *session_logger_));
 
 #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
       const auto& cpu_ep = *execution_providers_.Get(onnxruntime::kCpuExecutionProvider);
diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc
index 646ff7c95b229..cc0c9c69754fe 100644
--- a/onnxruntime/test/onnx/main.cc
+++ b/onnxruntime/test/onnx/main.cc
@@ -50,15 +50,12 @@ void usage() {
       "\t-a: Specify custom absolute tolerance values for output value comparison. default: 1e-5\n"
      "\t-i: Specify EP specific runtime options as key value pairs. Different runtime options available are: \n"
      "\t    [QNN only] [backend_path]: QNN backend path. e.g '/folderpath/libQnnHtp.so', '/folderpath/libQnnCpu.so'.\n"
-      "\t    [QNN only] [qnn_context_cache_enable]: 1 to enable cache QNN context. Default to false.\n"
-      "\t    [QNN only] [qnn_context_cache_path]: File path to the qnn context cache. Default to model_file.onnx.bin if not set.\n"
      "\t    [QNN only] [profiling_level]: QNN profiling level, options: 'basic', 'detailed', default 'off'.\n"
      "\t    [QNN only] [rpc_control_latency]: QNN rpc control latency. default to 10.\n"
      "\t    [QNN only] [vtcm_mb]: QNN VTCM size in MB. default to 0(not set).\n"
      "\t    [QNN only] [htp_performance_mode]: QNN performance mode, options: 'burst', 'balanced', 'default', 'high_performance', \n"
      "\t    'high_power_saver', 'low_balanced', 'low_power_saver', 'power_saver', 'sustained_high_performance'. Default to 'default'. \n"
      "\t    [QNN only] [qnn_context_priority]: QNN context priority, options: 'low', 'normal', 'normal_high', 'high'. Default to 'normal'. \n"
-      "\t    [QNN only] [qnn_context_embed_mode]: 1 means dump the QNN context binary into the Onnx skeleton model.\n"
      "\t    0 means dump the QNN context binary into separate bin file and set the path in the Onnx skeleton model.\n"
      "\t    [QNN only] [qnn_saver_path]: QNN Saver backend path. e.g '/folderpath/libQnnSaver.so'.\n"
      "\t    [QNN only] [htp_graph_finalization_optimization_mode]: QNN graph finalization optimization mode, options: \n"
@@ -73,6 +70,8 @@ void usage() {
      "\t    [Example] [For SNPE EP] -e snpe -i \"runtime|CPU priority|low\" \n\n"
      "\t-o [optimization level]: Default is 99. Valid values are 0 (disable), 1 (basic), 2 (extended), 99 (all).\n"
      "\t\tPlease see onnxruntime_c_api.h (enum GraphOptimizationLevel) for the full list of all optimization levels.\n"
+      "\t-f: Enable EP context cache generation.\n"
+      "\t-b: Disable EP context embed mode.\n"
      "\n"
      "\t-h: help\n"
      "\n"
@@ -179,11 +178,13 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
   OrtLoggingLevel logging_level = ORT_LOGGING_LEVEL_ERROR;
   bool verbose_logging_required = false;
 
+  bool ep_context_enable = false;
+  bool disable_ep_context_embed_mode = false;
+
   bool pause = false;
   {
     int ch;
-    while ((ch = getopt(argc, argv, ORT_TSTR("Ac:hj:Mn:r:e:t:a:xvo:d:i:pz"))) != -1) {
+    while ((ch = getopt(argc, argv, ORT_TSTR("Ac:hj:Mn:r:e:t:a:xvo:d:i:pzfb"))) != -1) {
       switch (ch) {
         case 'A':
           enable_cpu_mem_arena = false;
@@ -312,6 +313,12 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
         case 'z':
           set_denormal_as_zero = true;
           break;
+        case 'b':
+          disable_ep_context_embed_mode = true;
+          break;
+        case 'f':
+          ep_context_enable = true;
+          break;
         case '?':
         case 'h':
         default:
@@ -386,6 +393,11 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
   if (set_denormal_as_zero)
     sf.AddConfigEntry(kOrtSessionOptionsConfigSetDenormalAsZero, "1");
 
+  if (ep_context_enable)
+    sf.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");
+  if (disable_ep_context_embed_mode)
+    sf.AddConfigEntry(kOrtSessionOptionEpContextEmbedMode, "0");
+
   if (enable_tensorrt) {
 #ifdef USE_TENSORRT
     OrtCUDAProviderOptions cuda_options;
diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc
index 27e26fe0b3c45..6e3252aaeb4b8 100644
--- a/onnxruntime/test/perftest/command_args_parser.cc
+++ b/onnxruntime/test/perftest/command_args_parser.cc
@@ -65,8 +65,6 @@ namespace perftest {
      "\t    [OpenVINO only] [cache_dir]: Explicitly specify the path to dump and load the blobs(Model caching) or cl_cache (Kernel Caching) files feature. If blob files are already present, it will be directly loaded.\n"
      "\t    [OpenVINO only] [enable_opencl_throttling]: Enables OpenCL queue throttling for GPU device(Reduces the CPU Utilization while using GPU) \n"
      "\t    [QNN only] [backend_path]: QNN backend path. e.g '/folderpath/libQnnHtp.so', '/folderpath/libQnnCpu.so'.\n"
-      "\t    [QNN only] [qnn_context_cache_enable]: 1 to enable cache QNN context. Default to false.\n"
-      "\t    [QNN only] [qnn_context_cache_path]: File path to the qnn context cache. Default to model_file.onnx.bin if not set.\n"
      "\t    [QNN only] [profiling_level]: QNN profiling level, options: 'basic', 'detailed', default 'off'.\n"
      "\t    [QNN only] [rpc_control_latency]: QNN rpc control latency. default to 10.\n"
      "\t    [QNN only] [vtcm_mb]: QNN VTCM size in MB. default to 0(not set).\n"
diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc
index eb2a77c07f803..abec16f787895 100644
--- a/onnxruntime/test/perftest/ort_test_session.cc
+++ b/onnxruntime/test/perftest/ort_test_session.cc
@@ -332,12 +332,6 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
       if (value.empty()) {
        ORT_THROW("Please provide the QNN backend path.");
      }
-    } else if (key == "qnn_context_cache_enable") {
-      if (value != "1") {
-        ORT_THROW("Set to 1 to enable qnn_context_cache_enable.");
-      }
-    } else if (key == "qnn_context_cache_path") {
-      // no validation
    } else if (key == "profiling_level") {
      std::set<std::string> supported_profiling_level = {"off", "basic", "detailed"};
      if (supported_profiling_level.find(value) == supported_profiling_level.end()) {