
Commit

Merge remote-tracking branch 'origin/main' into skottmckay/AddConfigOptionsToOpKernelInfo
skottmckay committed Dec 20, 2023
2 parents 6ead3c2 + c0142c9 commit 9ba9e63
Showing 24 changed files with 305 additions and 158 deletions.
6 changes: 0 additions & 6 deletions include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -3593,17 +3593,11 @@ struct OrtApi {
*
* QNN supported keys:
* "backend_path": file path to QNN backend library.
* "qnn_context_cache_enable": 1 to enable QNN graph creation from cached QNN context file. If it's enabled: QNN EP will
* load from cached QNN context binary if it exist. It will generate a context binary file if it's not exist
* "qnn_context_cache_path": explicitly provide the QNN context cache file. Default to model_file.onnx.bin if not provided.
* "profiling_level": QNN profiling level, options: "off", "basic", "detailed". Default to off.
* "rpc_control_latency": QNN RPC control latency.
* "vtcm_mb": QNN VTCM size in MB. default to 0(not set).
* "htp_performance_mode": QNN performance mode, options: "burst", "balanced", "default", "high_performance",
* "high_power_saver", "low_balanced", "low_power_saver", "power_saver", "sustained_high_performance". Default to "default".
* "qnn_context_embed_mode", 1 means dump the QNN context binary into node attribute EPContext->ep_cache_context in the ONNX skeleton model.
* 0 means dump the QNN context binary into separate bin file and set the path to EPContext->ep_cache_context.
* The path is relative path to the ONNX skeleton model file.
* "qnn_saver_path": File path to the QNN Saver backend library. If specified, QNN Saver will be enabled and will
* dump QNN API calls to disk for replay/debugging. QNN Saver produces incorrect model inference results and
* may alter model/EP partitioning. Use only for debugging.
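For context (not part of this diff), here is a minimal sketch of how such QNN provider options are typically passed through the C++ API's key/value overload of AppendExecutionProvider; the backend path and option values below are illustrative placeholders.

```cpp
// Sketch: appending the QNN EP with a few of the provider options documented above.
// All paths and values are placeholders; adjust for your environment.
#include <string>
#include <unordered_map>
#include "onnxruntime_cxx_api.h"

Ort::SessionOptions MakeQnnSessionOptions() {
  Ort::SessionOptions so;
  std::unordered_map<std::string, std::string> qnn_options{
      {"backend_path", "/path/to/libQnnHtp.so"},  // QNN backend library
      {"profiling_level", "basic"},               // "off" (default), "basic", "detailed"
      {"htp_performance_mode", "burst"},          // one of the modes listed above
      {"vtcm_mb", "8"}};                          // VTCM size in MB; 0 means not set
  so.AppendExecutionProvider("QNN", qnn_options); // provider name plus key/value options
  return so;
}
```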
@@ -235,3 +235,18 @@ static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFil
// Use this config to control the minimum size of the initializer when externalizing it during serialization
static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes =
"session.optimized_model_external_initializers_min_size_in_bytes";

// Enable the EP context feature to dump the partitioned graph, which includes the EP context, into an ONNX file.
// The dumped ONNX model with EP context can be reused for later inference to avoid the EP graph partitioning/compilation overhead.
// "0": disable. (default)
// "1": enable.
static const char* const kOrtSessionOptionEpContextEnable = "ep.context_enable";

// Specify the file path for the ONNX model that has the EP context.
// Defaults to original_file_name_ctx.onnx if not specified.
static const char* const kOrtSessionOptionEpContextFilePath = "ep.context_file_path";

// Flag to specify whether to dump the EP context into the ONNX model.
// "0": dump the EP context into a separate file and keep the file name in the ONNX model.
// "1": dump the EP context into the ONNX model. (default)
static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.context_embed_mode";
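As a rough usage sketch (assumed, not part of this commit), these configuration entries can be set through the C++ API's AddConfigEntry before session creation; the output path below is a placeholder.

```cpp
// Sketch: enabling EP context dumping via the session config keys defined above.
#include "onnxruntime_cxx_api.h"

Ort::SessionOptions MakeEpContextSessionOptions() {
  Ort::SessionOptions so;
  so.AddConfigEntry("ep.context_enable", "1");                  // kOrtSessionOptionEpContextEnable
  so.AddConfigEntry("ep.context_file_path", "model_ctx.onnx");  // kOrtSessionOptionEpContextFilePath (placeholder path)
  so.AddConfigEntry("ep.context_embed_mode", "1");              // kOrtSessionOptionEpContextEmbedMode (embed context binary)
  return so;
}
```

On a later run, the dumped *_ctx.onnx model can be loaded directly to skip the EP partitioning/compilation step described above.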
2 changes: 1 addition & 1 deletion js/node/lib/backend.ts
@@ -20,7 +20,7 @@ class OnnxruntimeSessionHandler implements InferenceSessionHandler {
}

async dispose(): Promise<void> {
return Promise.resolve();
this.#inferenceSession.dispose();
}

readonly inputNames: string[];
Expand Down
2 changes: 2 additions & 0 deletions js/node/lib/binding.ts
@@ -28,6 +28,8 @@ export declare namespace Binding {
readonly outputNames: string[];

run(feeds: FeedsType, fetches: FetchesType, options: RunOptions): ReturnType;

dispose(): void;
}

export interface InferenceSessionConstructor {
Expand Down
19 changes: 18 additions & 1 deletion js/node/src/inference_session_wrap.cc
@@ -31,6 +31,7 @@ Napi::Object InferenceSessionWrap::Init(Napi::Env env, Napi::Object exports) {
Napi::Function func = DefineClass(
env, "InferenceSession",
{InstanceMethod("loadModel", &InferenceSessionWrap::LoadModel), InstanceMethod("run", &InferenceSessionWrap::Run),
InstanceMethod("dispose", &InferenceSessionWrap::Dispose),
InstanceAccessor("inputNames", &InferenceSessionWrap::GetInputNames, nullptr, napi_default, nullptr),
InstanceAccessor("outputNames", &InferenceSessionWrap::GetOutputNames, nullptr, napi_default, nullptr)});

@@ -45,14 +46,15 @@ Napi::Object InferenceSessionWrap::Init(Napi::Env env, Napi::Object exports) {
}

InferenceSessionWrap::InferenceSessionWrap(const Napi::CallbackInfo &info)
: Napi::ObjectWrap<InferenceSessionWrap>(info), initialized_(false), session_(nullptr),
: Napi::ObjectWrap<InferenceSessionWrap>(info), initialized_(false), disposed_(false), session_(nullptr),
defaultRunOptions_(nullptr) {}

Napi::Value InferenceSessionWrap::LoadModel(const Napi::CallbackInfo &info) {
Napi::Env env = info.Env();
Napi::HandleScope scope(env);

ORT_NAPI_THROW_ERROR_IF(this->initialized_, env, "Model already loaded. Cannot load model multiple times.");
ORT_NAPI_THROW_ERROR_IF(this->disposed_, env, "Session already disposed.");

size_t argsLength = info.Length();
ORT_NAPI_THROW_TYPEERROR_IF(argsLength == 0, env, "Expect argument: model file path or buffer.");
@@ -129,6 +131,7 @@ Napi::Value InferenceSessionWrap::LoadModel(const Napi::CallbackInfo &info) {
Napi::Value InferenceSessionWrap::GetInputNames(const Napi::CallbackInfo &info) {
Napi::Env env = info.Env();
ORT_NAPI_THROW_ERROR_IF(!this->initialized_, env, "Session is not initialized.");
ORT_NAPI_THROW_ERROR_IF(this->disposed_, env, "Session already disposed.");

Napi::EscapableHandleScope scope(env);
return scope.Escape(CreateNapiArrayFrom(env, inputNames_));
@@ -137,6 +140,7 @@ Napi::Value InferenceSessionWrap::GetInputNames(const Napi::CallbackInfo &info)
Napi::Value InferenceSessionWrap::GetOutputNames(const Napi::CallbackInfo &info) {
Napi::Env env = info.Env();
ORT_NAPI_THROW_ERROR_IF(!this->initialized_, env, "Session is not initialized.");
ORT_NAPI_THROW_ERROR_IF(this->disposed_, env, "Session already disposed.");

Napi::EscapableHandleScope scope(env);
return scope.Escape(CreateNapiArrayFrom(env, outputNames_));
@@ -145,6 +149,7 @@ Napi::Value InferenceSessionWrap::GetOutputNames(const Napi::CallbackInfo &info)
Napi::Value InferenceSessionWrap::Run(const Napi::CallbackInfo &info) {
Napi::Env env = info.Env();
ORT_NAPI_THROW_ERROR_IF(!this->initialized_, env, "Session is not initialized.");
ORT_NAPI_THROW_ERROR_IF(this->disposed_, env, "Session already disposed.");
ORT_NAPI_THROW_TYPEERROR_IF(info.Length() < 2, env, "Expect argument: inputs(feed) and outputs(fetch).");
ORT_NAPI_THROW_TYPEERROR_IF(!info[0].IsObject() || !info[1].IsObject(), env,
"Expect inputs(feed) and outputs(fetch) to be objects.");
@@ -209,6 +214,18 @@ Napi::Value InferenceSessionWrap::Run(const Napi::CallbackInfo &info) {
}
}

Napi::Value InferenceSessionWrap::Dispose(const Napi::CallbackInfo &info) {
Napi::Env env = info.Env();
ORT_NAPI_THROW_ERROR_IF(!this->initialized_, env, "Session is not initialized.");
ORT_NAPI_THROW_ERROR_IF(this->disposed_, env, "Session already disposed.");

this->defaultRunOptions_.reset(nullptr);
this->session_.reset(nullptr);

this->disposed_ = true;
return env.Undefined();
}

Napi::Value InferenceSessionWrap::ListSupportedBackends(const Napi::CallbackInfo &info) {
Napi::Env env = info.Env();
Napi::EscapableHandleScope scope(env);
9 changes: 9 additions & 0 deletions js/node/src/inference_session_wrap.h
@@ -55,13 +55,22 @@ class InferenceSessionWrap : public Napi::ObjectWrap<InferenceSessionWrap> {
*/
Napi::Value Run(const Napi::CallbackInfo &info);

/**
* [sync] dispose the session.
* @param nothing
* @returns nothing
* @throw nothing
*/
Napi::Value Dispose(const Napi::CallbackInfo &info);

// private members

// persistent constructor
static Napi::FunctionReference constructor;

// session objects
bool initialized_;
bool disposed_;
std::unique_ptr<Ort::Session> session_;
std::unique_ptr<Ort::RunOptions> defaultRunOptions_;

@@ -160,7 +160,7 @@ bool IsContextCacheFileExists(const std::string& customer_context_cache_path,
if (!customer_context_cache_path.empty()) {
context_cache_path = ToPathString(customer_context_cache_path);
} else if (!model_pathstring.empty()) {
context_cache_path = model_pathstring + ToPathString("_qnn_ctx.onnx");
context_cache_path = model_pathstring + ToPathString("_ctx.onnx");
}

return std::filesystem::is_regular_file(context_cache_path) && std::filesystem::exists(context_cache_path);
34 changes: 14 additions & 20 deletions onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -114,29 +114,23 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
if (session_options) {
disable_cpu_ep_fallback_ = session_options->config_options.GetConfigOrDefault(
kOrtSessionOptionsDisableCPUEPFallback, "0") == "1";
}

static const std::string CONTEXT_CACHE_ENABLED = "qnn_context_cache_enable";
auto context_cache_enabled_pos = provider_options_map.find(CONTEXT_CACHE_ENABLED);
if (context_cache_enabled_pos != provider_options_map.end()) {
if (context_cache_enabled_pos->second == "1") {
context_cache_enabled_ = true;
LOGS_DEFAULT(VERBOSE) << "Context cache enabled.";
}
}

static const std::string CONTEXT_CACHE_PATH = "qnn_context_cache_path";
auto context_cache_path_pos = provider_options_map.find(CONTEXT_CACHE_PATH);
if (context_cache_path_pos != provider_options_map.end()) {
context_cache_path_cfg_ = context_cache_path_pos->second;
LOGS_DEFAULT(VERBOSE) << "User specified context cache path: " << context_cache_path_cfg_;
}
context_cache_enabled_ = session_options->config_options.GetConfigOrDefault(
kOrtSessionOptionEpContextEnable, "0") == "1";
LOGS_DEFAULT(VERBOSE) << "Context cache enable: " << context_cache_enabled_;

static const std::string CONTEXT_CACHE_EMBED_MODE = "qnn_context_embed_mode";
auto context_cache_embed_mode_pos = provider_options_map.find(CONTEXT_CACHE_EMBED_MODE);
if (context_cache_embed_mode_pos != provider_options_map.end()) {
qnn_context_embed_mode_ = context_cache_embed_mode_pos->second == "1";
std::string embed_mode = session_options->config_options.GetConfigOrDefault(
kOrtSessionOptionEpContextEmbedMode, "1");
if ("1" == embed_mode) {
qnn_context_embed_mode_ = true;
} else if ("0" == embed_mode) {
qnn_context_embed_mode_ = false;
} else {
LOGS_DEFAULT(VERBOSE) << "Invalid ep.context_embed_mode: " << embed_mode << " only 0 or 1 allowed. Set to 1.";
}
LOGS_DEFAULT(VERBOSE) << "User specified context cache embed mode: " << qnn_context_embed_mode_;

context_cache_path_cfg_ = session_options->config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
}

static const std::string BACKEND_PATH = "backend_path";
@@ -193,9 +193,13 @@ def parse_args():

parser.add_argument("--input_model", required=True, help="Path to the input model file")
parser.add_argument("--output_model", required=True, help="Path to the output model file")
parser.add_argument("--block_size", required=False, default=32)
parser.add_argument("--block_size", required=False, default=32, type=int, help="Block size for quantization")
parser.add_argument(
"--symmetric", required=False, default=True, help="Indicate whether to quantize the model symmetrically"
"--symmetric",
required=False,
default=True,
type=bool,
help="Indicate whether to quantize the model symmetrically",
)
parser.add_argument("-v", "--verbose", required=False, action="store_true")
parser.set_defaults(verbose=False)
2 changes: 1 addition & 1 deletion onnxruntime/python/tools/tensorrt/perf/mem_test/run.sh
@@ -14,7 +14,7 @@ s) ORT_SOURCE=${OPTARG};;
esac
done

ONNX_MODEL_TAR_URL="https://github.com/onnx/models/raw/main/vision/classification/squeezenet/model/squeezenet1.0-7.tar.gz"
ONNX_MODEL_TAR_URL="https://github.com/onnx/models/raw/main/archive/vision/classification/squeezenet/model/squeezenet1.0-7.tar.gz"
MODEL_TAR_NAME="squeezenet1.0-7.tar.gz"
ONNX_MODEL="squeezenet.onnx"
ASAN_OPTIONS="protect_shadow_gap=0:new_delete_type_mismatch=0:log_path=asan.log"
30 changes: 18 additions & 12 deletions onnxruntime/test/onnx/main.cc
@@ -50,15 +50,12 @@ void usage() {
"\t-a: Specify custom absolute tolerance values for output value comparison. default: 1e-5\n"
"\t-i: Specify EP specific runtime options as key value pairs. Different runtime options available are: \n"
"\t [QNN only] [backend_path]: QNN backend path. e.g '/folderpath/libQnnHtp.so', '/folderpath/libQnnCpu.so'.\n"
"\t [QNN only] [qnn_context_cache_enable]: 1 to enable cache QNN context. Default to false.\n"
"\t [QNN only] [qnn_context_cache_path]: File path to the qnn context cache. Default to model_file.onnx.bin if not set.\n"
"\t [QNN only] [profiling_level]: QNN profiling level, options: 'basic', 'detailed', default 'off'.\n"
"\t [QNN only] [rpc_control_latency]: QNN rpc control latency. default to 10.\n"
"\t [QNN only] [vtcm_mb]: QNN VTCM size in MB. default to 0(not set).\n"
"\t [QNN only] [htp_performance_mode]: QNN performance mode, options: 'burst', 'balanced', 'default', 'high_performance', \n"
"\t 'high_power_saver', 'low_balanced', 'low_power_saver', 'power_saver', 'sustained_high_performance'. Default to 'default'. \n"
"\t [QNN only] [qnn_context_priority]: QNN context priority, options: 'low', 'normal', 'normal_high', 'high'. Default to 'normal'. \n"
"\t [QNN only] [qnn_context_embed_mode]: 1 means dump the QNN context binary into the Onnx skeleton model.\n"
"\t 0 means dump the QNN context binary into separate bin file and set the path in the Onnx skeleton model.\n"
"\t [QNN only] [qnn_saver_path]: QNN Saver backend path. e.g '/folderpath/libQnnSaver.so'.\n"
"\t [QNN only] [htp_graph_finalization_optimization_mode]: QNN graph finalization optimization mode, options: \n"
@@ -73,6 +70,8 @@ void usage() {
"\t [Example] [For SNPE EP] -e snpe -i \"runtime|CPU priority|low\" \n\n"
"\t-o [optimization level]: Default is 99. Valid values are 0 (disable), 1 (basic), 2 (extended), 99 (all).\n"
"\t\tPlease see onnxruntime_c_api.h (enum GraphOptimizationLevel) for the full list of all optimization levels. "
"\t-f: Enable EP context cache generation.\n"
"\t-b: Disable EP context embed mode.\n"
"\n"
"\t-h: help\n"
"\n"
@@ -179,11 +178,13 @@ int real_main(int argc, char* argv[], Ort::Env& env) {

OrtLoggingLevel logging_level = ORT_LOGGING_LEVEL_ERROR;
bool verbose_logging_required = false;
bool ep_context_enable = false;
bool disable_ep_context_embed_mode = false;

bool pause = false;
{
int ch;
while ((ch = getopt(argc, argv, ORT_TSTR("Ac:hj:Mn:r:e:t:a:xvo:d:i:pz"))) != -1) {
while ((ch = getopt(argc, argv, ORT_TSTR("Ac:hj:Mn:r:e:t:a:xvo:d:i:pzfb"))) != -1) {
switch (ch) {
case 'A':
enable_cpu_mem_arena = false;
@@ -312,6 +313,12 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
case 'z':
set_denormal_as_zero = true;
break;
case 'b':
disable_ep_context_embed_mode = true;
break;
case 'f':
ep_context_enable = true;
break;
case '?':
case 'h':
default:
@@ -386,6 +393,11 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
if (set_denormal_as_zero)
sf.AddConfigEntry(kOrtSessionOptionsConfigSetDenormalAsZero, "1");

if (ep_context_enable)
sf.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");
if (disable_ep_context_embed_mode)
sf.AddConfigEntry(kOrtSessionOptionEpContextEmbedMode, "0");

if (enable_tensorrt) {
#ifdef USE_TENSORRT
OrtCUDAProviderOptions cuda_options;
@@ -466,12 +478,6 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
if (value != "0") {
ORT_THROW("Set to 0 to disable qnn_context_embed_mode.");
}
} else if (key == "qnn_context_cache_enable") {
if (value != "1") {
ORT_THROW("Set to 1 to enable qnn_context_cache_enable.");
}
} else if (key == "qnn_context_cache_path") {
// no validation
} else if (key == "profiling_level") {
std::set<std::string> supported_profiling_level = {"off", "basic", "detailed"};
if (supported_profiling_level.find(value) == supported_profiling_level.end()) {
@@ -507,8 +513,8 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
ORT_THROW("Wrong value for htp_graph_finalization_optimization_mode. select from: " + str);
}
} else {
ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path', 'qnn_context_cache_enable',
'qnn_context_cache_path', 'profiling_level', 'rpc_control_latency', 'vtcm_mb', 'htp_performance_mode',
ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path',
'profiling_level', 'rpc_control_latency', 'vtcm_mb', 'htp_performance_mode',
'qnn_saver_path', 'htp_graph_finalization_optimization_mode', 'qnn_context_priority'])");
}

2 changes: 0 additions & 2 deletions onnxruntime/test/perftest/command_args_parser.cc
@@ -65,8 +65,6 @@ namespace perftest {
"\t [OpenVINO only] [cache_dir]: Explicitly specify the path to dump and load the blobs(Model caching) or cl_cache (Kernel Caching) files feature. If blob files are already present, it will be directly loaded.\n"
"\t [OpenVINO only] [enable_opencl_throttling]: Enables OpenCL queue throttling for GPU device(Reduces the CPU Utilization while using GPU) \n"
"\t [QNN only] [backend_path]: QNN backend path. e.g '/folderpath/libQnnHtp.so', '/folderpath/libQnnCpu.so'.\n"
"\t [QNN only] [qnn_context_cache_enable]: 1 to enable cache QNN context. Default to false.\n"
"\t [QNN only] [qnn_context_cache_path]: File path to the qnn context cache. Default to model_file.onnx.bin if not set.\n"
"\t [QNN only] [profiling_level]: QNN profiling level, options: 'basic', 'detailed', default 'off'.\n"
"\t [QNN only] [rpc_control_latency]: QNN rpc control latency. default to 10.\n"
"\t [QNN only] [vtcm_mb]: QNN VTCM size in MB. default to 0(not set).\n"
10 changes: 2 additions & 8 deletions onnxruntime/test/perftest/ort_test_session.cc
@@ -332,12 +332,6 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
if (value.empty()) {
ORT_THROW("Please provide the QNN backend path.");
}
} else if (key == "qnn_context_cache_enable") {
if (value != "1") {
ORT_THROW("Set to 1 to enable qnn_context_cache_enable.");
}
} else if (key == "qnn_context_cache_path") {
// no validation
} else if (key == "profiling_level") {
std::set<std::string> supported_profiling_level = {"off", "basic", "detailed"};
if (supported_profiling_level.find(value) == supported_profiling_level.end()) {
@@ -373,8 +367,8 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
ORT_THROW("Supported qnn_context_priority: low, normal, normal_high, high");
}
} else {
ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path', 'qnn_context_cache_enable',
'qnn_context_cache_path', 'profiling_level', 'rpc_control_latency', 'vtcm_mb', 'htp_performance_mode',
ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path',
'profiling_level', 'rpc_control_latency', 'vtcm_mb', 'htp_performance_mode',
'qnn_saver_path', 'htp_graph_finalization_optimization_mode', 'qnn_context_priority'])");
}
