From e60587078320b06dba9338d5cc00a41a0a85068b Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 11 Dec 2024 10:24:14 -0800 Subject: [PATCH 1/8] [js/web] Update API for `ort.env.webgpu` (#23026) ### Description This PR is a replacement of #21671. It offers a new way for accessing the following: - `ort.env.webgpu.adapter`: - **deprecating**. There is no point to get the value of it. Once `GPUDevice.adapterInfo` is supported, there is no point to set the value too. - `ort.env.webgpu.device`: - set value of `GPUDevice` if user created it. Use at user's own risk. - get value of `Promise`. if not exist, create a new one. if exist return it. - `ort.env.webgpu.powerPreference`: - **deprecating**. encouraging users to set `ort.env.webgpu.device` if necessary. - `ort.env.webgpu.forceFallbackAdapter`: - **deprecating**. encouraging users to set `ort.env.webgpu.device` if necessary. --- js/common/lib/env.ts | 33 +++++++++++++++++++++++++-------- js/web/test/test-runner.ts | 12 ++++++------ 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/js/common/lib/env.ts b/js/common/lib/env.ts index e70f608ad7030..d6d9f7fa48790 100644 --- a/js/common/lib/env.ts +++ b/js/common/lib/env.ts @@ -45,17 +45,19 @@ export declare namespace Env { * * This setting is available only when WebAssembly SIMD feature is available in current context. * + * @defaultValue `true` + * * @deprecated This property is deprecated. Since SIMD is supported by all major JavaScript engines, non-SIMD * build is no longer provided. This property will be removed in future release. - * @defaultValue `true` */ simd?: boolean; /** * set or get a boolean value indicating whether to enable trace. * - * @deprecated Use `env.trace` instead. If `env.trace` is set, this property will be ignored. * @defaultValue `false` + * + * @deprecated Use `env.trace` instead. If `env.trace` is set, this property will be ignored. */ trace?: boolean; @@ -153,7 +155,7 @@ export declare namespace Env { /** * Set or get the profiling configuration. */ - profiling?: { + profiling: { /** * Set or get the profiling mode. * @@ -176,6 +178,9 @@ export declare namespace Env { * See {@link https://gpuweb.github.io/gpuweb/#dictdef-gpurequestadapteroptions} for more details. * * @defaultValue `undefined` + * + * @deprecated Create your own GPUAdapter, use it to create a GPUDevice instance and set {@link device} property if + * you want to use a specific power preference. */ powerPreference?: 'low-power' | 'high-performance'; /** @@ -187,6 +192,9 @@ export declare namespace Env { * See {@link https://gpuweb.github.io/gpuweb/#dictdef-gpurequestadapteroptions} for more details. * * @defaultValue `undefined` + * + * @deprecated Create your own GPUAdapter, use it to create a GPUDevice instance and set {@link device} property if + * you want to use a specific fallback option. */ forceFallbackAdapter?: boolean; /** @@ -199,16 +207,25 @@ export declare namespace Env { * value will be the GPU adapter that created by the underlying WebGPU backend. * * When use with TypeScript, the type of this property is `GPUAdapter` defined in "@webgpu/types". + * + * @deprecated It is no longer recommended to use this property. The latest WebGPU spec adds `GPUDevice.adapterInfo` + * (https://www.w3.org/TR/webgpu/#dom-gpudevice-adapterinfo), which allows to get the adapter information from the + * device. When it's available, there is no need to set/get the {@link adapter} property. 
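 *
 * Editor's illustrative sketch (not part of this patch): assuming a browser that already exposes
 * `GPUDevice.adapterInfo`, adapter details can be read from the device returned by the new
 * `ort.env.webgpu.device` accessor instead of from this deprecated property:
 *
 *   // resolves to the GPUDevice used (or to be used) by the WebGPU backend
 *   const device = await ort.env.webgpu.device;
 *   // adapter information is exposed on the device itself by the WebGPU spec
 *   console.log(device.adapterInfo?.vendor, device.adapterInfo?.architecture);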
*/ adapter: TryGetGlobalType<'GPUAdapter'>; /** - * Get the device for WebGPU. - * - * This property is only available after the first WebGPU inference session is created. + * Set or get the GPU device for WebGPU. * - * When use with TypeScript, the type of this property is `GPUDevice` defined in "@webgpu/types". + * There are 3 valid scenarios of accessing this property: + * - Set a value before the first WebGPU inference session is created. The value will be used by the WebGPU backend + * to perform calculations. If the value is not a `GPUDevice` object, an error will be thrown. + * - Get the value before the first WebGPU inference session is created. This will try to create a new GPUDevice + * instance. Returns a `Promise` that resolves to a `GPUDevice` object. + * - Get the value after the first WebGPU inference session is created. Returns a resolved `Promise` to the + * `GPUDevice` object used by the WebGPU backend. */ - readonly device: TryGetGlobalType<'GPUDevice'>; + get device(): Promise>; + set device(value: TryGetGlobalType<'GPUDevice'>); /** * Set or get whether validate input content. * diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index d54ba32f9f494..5de39535a5c07 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -586,11 +586,11 @@ export class TensorResultValidator { } } -function createGpuTensorForInput(cpuTensor: ort.Tensor): ort.Tensor { +async function createGpuTensorForInput(cpuTensor: ort.Tensor): Promise { if (!isGpuBufferSupportedType(cpuTensor.type) || Array.isArray(cpuTensor.data)) { throw new Error(`createGpuTensorForInput can not work with ${cpuTensor.type} tensor`); } - const device = ort.env.webgpu.device as GPUDevice; + const device = await ort.env.webgpu.device; const gpuBuffer = device.createBuffer({ // eslint-disable-next-line no-bitwise usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE, @@ -612,14 +612,14 @@ function createGpuTensorForInput(cpuTensor: ort.Tensor): ort.Tensor { }); } -function createGpuTensorForOutput(type: ort.Tensor.Type, dims: readonly number[]) { +async function createGpuTensorForOutput(type: ort.Tensor.Type, dims: readonly number[]) { if (!isGpuBufferSupportedType(type)) { throw new Error(`createGpuTensorForOutput can not work with ${type} tensor`); } const size = calculateTensorSizeInBytes(tensorDataTypeStringToEnum(type), dims)!; - const device = ort.env.webgpu.device as GPUDevice; + const device = await ort.env.webgpu.device; const gpuBuffer = device.createBuffer({ // eslint-disable-next-line no-bitwise usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE, @@ -725,7 +725,7 @@ export async function sessionRun(options: { if (options.ioBinding === 'ml-location' || options.ioBinding === 'ml-tensor') { feeds[name] = await createMLTensorForInput(options.mlContext!, feeds[name]); } else { - feeds[name] = createGpuTensorForInput(feeds[name]); + feeds[name] = await createGpuTensorForInput(feeds[name]); } } } @@ -742,7 +742,7 @@ export async function sessionRun(options: { if (options.ioBinding === 'ml-tensor') { fetches[name] = await createMLTensorForOutput(options.mlContext!, type, dims); } else { - fetches[name] = createGpuTensorForOutput(type, dims); + fetches[name] = await createGpuTensorForOutput(type, dims); } } } From ebb968d34af32e70e361321db37d56ea3bb4ce68 Mon Sep 17 00:00:00 2001 From: Hector Li Date: Wed, 11 Dec 2024 17:26:29 -0800 Subject: [PATCH 2/8] disable the EP context embed model by default in session option 
(#23070) change the default value for session option ep.context_embed_mode to 0 to avoid the model loading memory overhead --- .../core/session/onnxruntime_session_options_config_keys.h | 4 ++-- onnxruntime/core/providers/openvino/contexts.h | 2 +- .../core/providers/openvino/openvino_execution_provider.h | 2 +- .../core/providers/openvino/openvino_provider_factory.cc | 2 +- onnxruntime/core/providers/qnn/qnn_execution_provider.cc | 2 +- .../core/providers/vitisai/vitisai_execution_provider.h | 2 +- onnxruntime/core/session/provider_bridge_ort.cc | 2 +- onnxruntime/test/onnx/main.cc | 5 ++++- 8 files changed, 12 insertions(+), 9 deletions(-) diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h index 6a01602e634f8..8f1bc98ce7b49 100644 --- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h @@ -261,8 +261,8 @@ static const char* const kOrtSessionOptionEpContextEnable = "ep.context_enable"; static const char* const kOrtSessionOptionEpContextFilePath = "ep.context_file_path"; // Flag to specify whether to dump the EP context into the Onnx model. -// "0": dump the EP context into separate file, keep the file name in the Onnx model. -// "1": dump the EP context into the Onnx model. (default). +// "0": dump the EP context into separate file, keep the file name in the Onnx model. (default). +// "1": dump the EP context into the Onnx model. static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.context_embed_mode"; // Specify the EPContext node name prefix to make it unique diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index a2f4b236213cc..4f970bc7bc287 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -18,7 +18,7 @@ struct GlobalContext { bool is_wholly_supported_graph = false; bool enable_opencl_throttling = false; bool disable_dynamic_shapes = false; - bool ep_context_embed_mode = true; + bool ep_context_embed_mode = false; bool export_ep_ctx_blob = false; bool enable_qdq_optimizer = false; bool disable_cpu_fallback = false; diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index bea9badea475a..59dbd141f4782 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -90,7 +90,7 @@ struct OpenVINOExecutionProviderInfo { bool export_ep_ctx_blob_{false}; bool enable_qdq_optimizer_{false}; bool disable_cpu_fallback_{false}; - bool so_epctx_embed_mode_{true}; + bool so_epctx_embed_mode_{false}; OpenVINOExecutionProviderInfo() = delete; diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 57c4e92685c96..66f9bcb7b2a5e 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -53,7 +53,7 @@ struct OpenVINOProviderFactory : IExecutionProviderFactory { std::unique_ptr OpenVINOProviderFactory::CreateProvider() { bool so_disable_cpu_fallback = config_options_.GetConfigOrDefault(kOrtSessionOptionsDisableCPUEPFallback, "0") == "1"; bool so_export_ep_ctx_blob = 
config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1"; - bool so_epctx_embed_mode = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "1") == "1"; + bool so_epctx_embed_mode = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0") == "1"; std::string so_cache_path = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "").c_str(); if (so_export_ep_ctx_blob && !so_cache_path.empty()) { diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index 060bbd4f79bf2..27e195dea73d2 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -204,7 +204,7 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio LOGS_DEFAULT(VERBOSE) << "Context cache enable: " << context_cache_enabled_; std::string embed_mode = session_options->config_options.GetConfigOrDefault( - kOrtSessionOptionEpContextEmbedMode, "1"); + kOrtSessionOptionEpContextEmbedMode, "0"); if ("1" == embed_mode) { qnn_context_embed_mode_ = true; } else if ("0" == embed_mode) { diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h index 9864a40bd1d3b..77dede6035b4c 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h +++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h @@ -52,7 +52,7 @@ class VitisAIExecutionProvider : public IExecutionProvider { std::shared_ptr registry_; // EP context related. bool ep_ctx_enabled_ = false; - bool ep_ctx_embed_mode_ = true; + bool ep_ctx_embed_mode_ = false; std::string ep_ctx_model_path_cfg_{""}; mutable PathString ep_ctx_model_file_loc_{}; // It might need to be called before loading diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index c3832498af584..e0c479dbc7637 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -2242,7 +2242,7 @@ ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_TensorRT_V2, new_tensorrt_options.trt_ep_context_file_path = (context_cache_path.size() == 0) ? 
nullptr : context_cache_path.c_str(); LOGS_DEFAULT(VERBOSE) << "User specified context cache path: " << context_cache_path; - embed_mode = (options->value).config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "1"); + embed_mode = (options->value).config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0"); if ("1" == embed_mode) { new_tensorrt_options.trt_ep_context_embed_mode = 1; } else if ("0" == embed_mode) { diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index ddc453f84feb6..99c3e44e13013 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -454,8 +454,11 @@ int real_main(int argc, char* argv[], Ort::Env& env) { if (ep_context_enable) sf.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1"); - if (disable_ep_context_embed_mode) + if (disable_ep_context_embed_mode) { sf.AddConfigEntry(kOrtSessionOptionEpContextEmbedMode, "0"); + } else { + sf.AddConfigEntry(kOrtSessionOptionEpContextEmbedMode, "1"); + } for (auto& it : session_config_entries) { sf.AddConfigEntry(it.first.c_str(), it.second.c_str()); From 1f88284f96d98b735824c8a9eebee4fc85becf7d Mon Sep 17 00:00:00 2001 From: Ankit Maheshkar Date: Thu, 12 Dec 2024 11:56:32 +0530 Subject: [PATCH 3/8] OVEP 1.21.0 Development Updates (#23080) ### Description OVEP development changes for ORT 1.21 Release ### Motivation and Context - Has Critical Bug Fixes - Improved Performance optimizations for both memory & inference latency (https://github.com/intel/onnxruntime/pull/513) - Enabled Model Compilation using NPUW (https://github.com/intel/onnxruntime/pull/508) - Fixed support for EPContext embed mode 0 for lower memory utilization - Updated NuGet package name as `Intel.ML.OnnxRuntime.OpenVino` - Fixed QDQ Stripping logic on NPU --- cmake/onnxruntime_providers_openvino.cmake | 4 +- .../providers/openvino/backend_manager.cc | 5 +- .../core/providers/openvino/backend_utils.cc | 16 +++--- .../core/providers/openvino/backend_utils.h | 2 +- .../openvino/backends/basic_backend.cc | 57 ++++++++++++++++--- .../openvino/backends/basic_backend.h | 1 - .../openvino/onnx_ctx_model_helper.cc | 13 ++++- .../openvino/onnx_ctx_model_helper.h | 6 +- .../openvino/openvino_execution_provider.h | 2 +- .../openvino/openvino_provider_factory.cc | 4 +- .../core/providers/openvino/ov_allocator.cc | 1 - .../core/providers/openvino/ov_interface.cc | 8 +-- .../core/providers/openvino/ov_interface.h | 2 +- .../openvino/ov_versions/capability.cc | 8 +-- .../openvino/ov_versions/data_ops.cc | 12 ++-- .../providers/openvino/ov_versions/data_ops.h | 3 +- .../qdq_transformations/qdq_stripping.cc | 6 +- tools/ci_build/build.py | 2 +- .../linux-openvino-ci-pipeline.yml | 2 +- .../linux/docker/Dockerfile.ubuntu_openvino | 8 +-- .../nuget/generate_nuspec_for_native_nuget.py | 28 ++++++++- 21 files changed, 138 insertions(+), 52 deletions(-) diff --git a/cmake/onnxruntime_providers_openvino.cmake b/cmake/onnxruntime_providers_openvino.cmake index e500957f864f8..f5fae8d169ccc 100644 --- a/cmake/onnxruntime_providers_openvino.cmake +++ b/cmake/onnxruntime_providers_openvino.cmake @@ -13,8 +13,8 @@ # Header paths find_package(OpenVINO REQUIRED COMPONENTS Runtime ONNX) - if(OpenVINO_VERSION VERSION_LESS 2024.3) - message(FATAL_ERROR "OpenVINO 2024.3 and newer are supported. Please, use latest OpenVINO release") + if(OpenVINO_VERSION VERSION_LESS 2024.4) + message(FATAL_ERROR "OpenVINO 2024.4 and newer are supported. 
Please, use latest OpenVINO release") endif() if(OpenVINO_VERSION VERSION_GREATER_EQUAL 2024.4) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 0ffde116f4efc..a0bcf953938d9 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -70,7 +70,10 @@ BackendManager::BackendManager(const GlobalContext& global_context, i++; } subgraph_context_.subgraph_name = fused_node.Name(); - auto model_proto = GetModelProtoFromFusedNode(fused_node, subgraph, logger); + std::unique_ptr model_proto; + if (!ep_ctx_handle_.IsValidOVEPCtxGraph()) { + model_proto = GetModelProtoFromFusedNode(fused_node, subgraph, logger); + } std::string device_type = openvino_ep::BackendManager::GetGlobalContext().device_type; if (ModelHasSymbolicInputDims(subgraph)) { diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index f772b9c3b0478..b97736f2e124d 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -39,7 +39,7 @@ struct static_cast_int64 { int64_t operator()(const T1& x) const { return static_cast(x); } }; -std::shared_ptr +std::shared_ptr CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext& global_context, std::map>& const_outputs_map) { if (IsCILogEnabled()) { @@ -47,13 +47,13 @@ CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext } const std::string model = model_proto.SerializeAsString(); try { - auto cnn_network = global_context.ie_core.ReadModel(model, global_context.onnx_model_path_name); + auto ov_model = global_context.ie_core.ReadModel(model, global_context.onnx_model_path_name); // Check for Constant Folding - if (!global_context.is_wholly_supported_graph) { + if ((global_context.device_type != "NPU") && !global_context.is_wholly_supported_graph) { ov::pass::ConstantFolding pass_const_obj; - pass_const_obj.run_on_model(cnn_network); - auto& results = const_cast(cnn_network.get()->get_results()); + pass_const_obj.run_on_model(ov_model); + auto& results = const_cast(ov_model.get()->get_results()); size_t index = results.size() - 1; for (auto it = results.rbegin(); it != results.rend(); ++it) { @@ -67,12 +67,12 @@ CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext } #ifndef NDEBUG if (IsDebugEnabled()) { - std::string name = cnn_network->get_friendly_name(); + std::string name = ov_model->get_friendly_name(); ov::pass::Serialize serializer(name + ".xml", name + ".bin"); - serializer.run_on_model(cnn_network); + serializer.run_on_model(ov_model); } #endif - return cnn_network; + return ov_model; } catch (std::string const& msg) { ORT_THROW(msg); } diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index 9e65770da7d23..9d58e1ca73abb 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -60,7 +60,7 @@ void FillInputBlob(OVTensorPtr inputBlob, size_t batch_slice_idx, void FillOutputBlob(OVTensorPtr outputBlob, Ort::UnownedValue& output_tensor, size_t batch_slice_idx); -std::shared_ptr +std::shared_ptr CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext& global_context, std::map>& const_outputs_map); diff --git 
a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index 56cceb8cf2a19..435ca83ff69d4 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -48,6 +48,16 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr // Set the inference_num_threads property of the CPU SetNumThreads(device_config); + auto npuw_status = + std::any_of(device_config.begin(), device_config.end(), [&](const std::pair& pair) { + return (pair.first.find("NPU_USE_NPUW") != std::string::npos) && (pair.second.is()) && + (pair.second.as() == "YES"); + }); + + if (npuw_status) { + LOGS_DEFAULT(INFO) << log_tag << "NPUW Enabled during compilation"; + } + try { std::string dev_prec = global_context.device_type + "_" + global_context_.precision_str; @@ -81,7 +91,6 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr device_config, global_context_.ep_context_embed_mode, subgraph_context_.subgraph_name); - ie_cnn_network_ = exe_network_.Get().get_runtime_model(); } else if (global_context_.export_ep_ctx_blob && hw_target.find("NPU") != std::string::npos && !global_context_.has_external_weights) { @@ -106,15 +115,15 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr device_config, subgraph_context_.subgraph_name); } else { // For all other types use ov::Model Type - ie_cnn_network_ = CreateOVModel(*model_proto, global_context_, const_outputs_map_); + auto ov_model = CreateOVModel(*model_proto, global_context_, const_outputs_map_); exe_network_ = global_context_.ie_core.CompileModel( - ie_cnn_network_, hw_target, device_config, subgraph_context_.subgraph_name); + ov_model, hw_target, device_config, subgraph_context_.subgraph_name); } #endif } else { // Full graph is not supported - ie_cnn_network_ = CreateOVModel(*model_proto, global_context_, const_outputs_map_); + auto ov_model = CreateOVModel(*model_proto, global_context_, const_outputs_map_); exe_network_ = global_context_.ie_core.CompileModel( - ie_cnn_network_, hw_target, device_config, subgraph_context_.subgraph_name); + ov_model, hw_target, device_config, subgraph_context_.subgraph_name); } LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin"; } catch (const char* msg) { @@ -145,8 +154,8 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { device_config.emplace(ov::hint::inference_precision("f32")); } if (global_context_.precision_str.find("ACCURACY") != std::string::npos && - global_context_.device_type == "GPU") { - if (global_context_.OpenVINO_Version.at(0) >= 2024 && global_context_.OpenVINO_Version.at(1) >= 1) { + global_context_.device_type.find("GPU") != std::string::npos) { + if (global_context_.OpenVINO_Version.at(0) >= 2024) { device_config.emplace(ov::hint::inference_precision(ov::element::undefined)); device_config.emplace(ov::hint::execution_mode(ov::hint::ExecutionMode::ACCURACY)); } else { @@ -174,7 +183,7 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { device_property = std::make_pair("NPU_COMPILER_TYPE", env_npu_compiler_type); } device_config.emplace(ov::device::properties("NPU", device_property)); -#if (OPENVINO_VERSION_MAJOR >= 2024) && (OPENVINO_VERSION_MINOR > 3) +#if (((OPENVINO_VERSION_MAJOR == 2024) && (OPENVINO_VERSION_MINOR > 3)) || (OPENVINO_VERSION_MAJOR > 2024)) if (global_context_.export_ep_ctx_blob) { global_context_.ie_core.Get().set_property("NPU", ov::intel_npu::bypass_umd_caching(true)); } @@ -184,6 +193,33 @@ void 
BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { if (!global_context_.load_config.empty()) { const std::map& target_config = global_context_.load_config; + if (global_context_.device_type.find("NPU") != std::string::npos) { + auto npuw_config = target_config.at("NPU"); + + // Check if "NPU_USE_NPUW" exists and is set to "YES" + auto npu_use_npuw_it = npuw_config.find("NPU_USE_NPUW"); + if (npu_use_npuw_it != npuw_config.end() && + npu_use_npuw_it->second.is() && + npu_use_npuw_it->second.as() == "YES") { + // Only add NPUW-related keys if NPU_USE_NPUW is "YES" + for (const auto& [key, value] : npuw_config) { + if (key.find("NPUW") != std::string::npos) { + if (!value.is()) { + LOGS_DEFAULT(ERROR) << "Invalid value type for key: " << key; + continue; + } + device_config[key] = value; + } + } + } else { + // Check if there are any "NPUW" keys and log a warning + if (std::any_of(npuw_config.begin(), npuw_config.end(), + [&](const auto& pair) { return pair.first.find("NPUW") != std::string::npos; })) { + LOGS_DEFAULT(WARNING) << "Skipping NPUW-related configurations as NPU_USE_NPUW is not set to 'YES'."; + } + } + } + // Parse device types like "AUTO:CPU,GPU" and extract individual devices auto parse_individual_devices = [&](const std::string& device_type) -> std::vector { std::vector devices; @@ -213,6 +249,9 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { auto set_target_properties = [&](const std::string& device, const ov::AnyMap& config_options, const std::vector& supported_properties) { for (const auto& [key, value] : config_options) { + if (key.find("NPUW") != std::string::npos) { + continue; + } if (is_supported_and_mutable(key, supported_properties)) { global_context_.ie_core.Get().set_property(device, ov::AnyMap{{key, value}}); } else { @@ -378,7 +417,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque if ((it == ort_ov_tensor_map.end()) || (it != ort_ov_tensor_map.end() && (it->second.ort_ptr != tensor.GetTensorRawData()))) { ov_tensor_data_t ov_tensor_data; - auto input = graph_input_info.at(input_idx); + const auto& input = graph_input_info.at(input_idx); ov_tensor_data.tensor_ptr = std::make_shared(input.get_element_type(), input.get_shape(), const_cast(tensor.GetTensorRawData())); diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h index 12502a1d83c5d..3fcf6e4384d52 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.h +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h @@ -58,7 +58,6 @@ class BasicBackend : public IBackend { GlobalContext& global_context_; SubGraphContext subgraph_context_; mutable std::mutex compute_lock_; - std::shared_ptr ie_cnn_network_; OVExeNetwork exe_network_; std::map> const_outputs_map_; std::unique_ptr inferRequestsQueue_; diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc index 42a2b5d30c25c..6d159db3b390d 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc @@ -99,7 +99,9 @@ Status EPCtxHandler::ImportBlobFromEPCtxModel(const GraphViewer& graph_viewer, b auto node = graph_viewer.GetNode(0); auto& attrs = node->GetAttributes(); ORT_ENFORCE(attrs.count(EP_CACHE_CONTEXT) > 0); - model_stream_ = std::make_shared(attrs.at(EP_CACHE_CONTEXT).s()); + + ep_cache_context_attribute_ = 
&attrs.at(EP_CACHE_CONTEXT); + ep_context_embed_mode = static_cast(attrs.at(EMBED_MODE).i()); LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Read blob from EPContext Node"; @@ -107,6 +109,15 @@ Status EPCtxHandler::ImportBlobFromEPCtxModel(const GraphViewer& graph_viewer, b return Status::OK(); } +const std::string& EPCtxHandler::GetModelBlobStream() const { + static std::string empty; + if (ep_cache_context_attribute_ != nullptr) { + return ep_cache_context_attribute_->s(); + } else { + return empty; + } +} + bool EPCtxHandler::CheckForOVEPCtxNode(const GraphViewer& graph_viewer, std::string openvino_sdk_version) const { for (int i = 0; i < graph_viewer.MaxNodeIndex(); ++i) { auto node = graph_viewer.GetNode(i); diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h index c7ee943dff761..caab33b7db775 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h @@ -23,7 +23,7 @@ static const char SOURCE[] = "source"; class EPCtxHandler { public: EPCtxHandler() = default; - EPCtxHandler(const EPCtxHandler&) = default; + EPCtxHandler(const EPCtxHandler&) = delete; Status ExportEPCtxModel(const GraphViewer& graph_viewer, const std::string& graph_name, const logging::Logger& logger, @@ -33,11 +33,11 @@ class EPCtxHandler { Status ImportBlobFromEPCtxModel(const GraphViewer& graph_viewer, bool& ep_context_embed_mode); bool CheckForOVEPCtxNode(const GraphViewer& graph_viewer, std::string openvino_sdk_version) const; bool IsValidOVEPCtxGraph() const { return is_valid_ep_ctx_graph_; } - [[nodiscard]] const std::shared_ptr GetModelBlobStream() const { return model_stream_; } + const std::string& GetModelBlobStream() const; private: bool is_valid_ep_ctx_graph_{false}; - std::shared_ptr model_stream_; + const onnx::AttributeProto* ep_cache_context_attribute_; }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index 59dbd141f4782..d5c22a4e2a9e4 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -159,7 +159,7 @@ struct OpenVINOExecutionProviderInfo { device_type_ = std::move(dev_type); } else if (dev_type.find("HETERO") == 0 || dev_type.find("MULTI") == 0 || dev_type.find("AUTO") == 0) { std::vector devices = parseDevices(dev_type, available_devices); - device_type_ = dev_type; + device_type_ = std::move(dev_type); } else { ORT_THROW("Invalid device string: " + dev_type); } diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 66f9bcb7b2a5e..5855cb594a08e 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -57,7 +57,7 @@ std::unique_ptr OpenVINOProviderFactory::CreateProvider() { std::string so_cache_path = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "").c_str(); if (so_export_ep_ctx_blob && !so_cache_path.empty()) { - cache_dir_ = so_cache_path; + cache_dir_ = std::move(so_cache_path); auto file_path = std::filesystem::path(cache_dir_); // ep_context_file_path_ file extension must be .onnx if (file_path.extension().generic_string() == ".onnx") { @@ -248,7 +248,7 @@ struct OpenVINO_Provider : Provider { 
LOGS_DEFAULT(WARNING) << "Unsupported JSON value type for key: " << inner_key << ". Skipping key."; } } - target_map[key] = inner_map; + target_map[key] = std::move(inner_map); } } catch (const nlohmann::json::parse_error& e) { // Handle syntax errors in JSON diff --git a/onnxruntime/core/providers/openvino/ov_allocator.cc b/onnxruntime/core/providers/openvino/ov_allocator.cc index 6700244b754d8..0e5ff8ff98efb 100644 --- a/onnxruntime/core/providers/openvino/ov_allocator.cc +++ b/onnxruntime/core/providers/openvino/ov_allocator.cc @@ -39,7 +39,6 @@ void* OVRTAllocator::Alloc(size_t size) { } catch (const ov::Exception& e) { ORT_THROW(std::string("Alloc failed: ") + e.what()); } - return nullptr; } void OVRTAllocator::Free(void* p) { diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 7e8681d304abf..12ab7ecede031 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -109,7 +109,7 @@ OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, } } -OVExeNetwork OVCore::ImportModel(std::shared_ptr model_stream, +OVExeNetwork OVCore::ImportModel(const std::string& model_string, std::string hw_target, const ov::AnyMap& device_config, bool embed_mode, @@ -117,10 +117,10 @@ OVExeNetwork OVCore::ImportModel(std::shared_ptr model_strea try { ov::CompiledModel obj; if (embed_mode) { - obj = oe.import_model(*model_stream, hw_target, device_config); + std::istringstream model_stream(model_string); + obj = oe.import_model(model_stream, hw_target, device_config); } else { - std::string blob_file_path = (*model_stream).str(); - std::ifstream modelStream(blob_file_path, std::ios_base::binary | std::ios_base::in); + std::ifstream modelStream(model_string, std::ios_base::binary | std::ios_base::in); obj = oe.import_model(modelStream, hw_target, {}); diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index f4da4ea3e3244..c3417003f8e1f 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -54,7 +54,7 @@ class OVCore { ov::AnyMap& device_config, const std::string& name); // OV Interface for Import model Stream - OVExeNetwork ImportModel(std::shared_ptr model_stream, + OVExeNetwork ImportModel(const std::string& model_string, std::string hw_target, const ov::AnyMap& device_config, bool embed_mode, diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.cc b/onnxruntime/core/providers/openvino/ov_versions/capability.cc index 95c7466e02f2f..3e780f74145ae 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.cc @@ -35,14 +35,14 @@ GetCapability::GetCapability(const GraphViewer& graph_viewer_param, device_type_ = "CPU"; if (enable_qdq_optimizer) npu_qdq_optimizer_enabled = true; } -#if OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 3 - data_ops_ = new DataOps(graph_viewer_, V_2024_3, device_type_, npu_qdq_optimizer_enabled); -#elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 4 +#if OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 4 data_ops_ = new DataOps(graph_viewer_, V_2024_4, device_type_, npu_qdq_optimizer_enabled); #elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 5 data_ops_ = new DataOps(graph_viewer_, V_2024_5, device_type_, npu_qdq_optimizer_enabled); +#elif 
OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR == 0 + data_ops_ = new DataOps(graph_viewer_, V_2025_0, device_type_, npu_qdq_optimizer_enabled); #else - data_ops_ = new DataOps(graph_viewer_, V_2024_5, device_type_, npu_qdq_optimizer_enabled); + data_ops_ = new DataOps(graph_viewer_, V_2025_0, device_type_, npu_qdq_optimizer_enabled); #endif } diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc index b2c5fd6f83167..f118f057ac11e 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc @@ -355,6 +355,7 @@ void DataOps::populate_op_mode_supported() { no_dimension_supported_.push_back({"Floor", V_2020_4, {"All"}}); no_dimension_supported_.push_back({"Gather", V_2020_4, {"All"}}); no_dimension_supported_.push_back({"Identity", V_2023_0, {"All"}}); + no_dimension_supported_.push_back({"If", V_2022_3, {"CPU", "GPU"}}); no_dimension_supported_.push_back({"Less", V_2022_1, {"CPU"}}); no_dimension_supported_.push_back({"Loop", V_2021_4, {"All"}}); no_dimension_supported_.push_back({"Min", V_2020_4, {"All"}}); @@ -387,7 +388,7 @@ void DataOps::populate_op_mode_supported() { // populate unsupportedmode_t { - UnsupportedOpMode obj = {{V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5}, + UnsupportedOpMode obj = {{V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5, V_2025_0}, [this](const Node* node, const InitializedTensorSet&) { // If the Input of ReduceMax op is UINT8, it is rejected (Due to output mismatch) for (size_t i = 0; i < node->InputDefs().size(); i++) { @@ -402,7 +403,8 @@ void DataOps::populate_op_mode_supported() { op_list_.insert({"ReduceMax", obj}); } { - UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5}, + UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, + V_2024_3, V_2024_4, V_2024_5, V_2025_0}, [this](const Node* node, const InitializedTensorSet&) { const auto& input_arg = node->InputDefs()[1]; auto shape = input_arg->Shape(); @@ -419,7 +421,8 @@ void DataOps::populate_op_mode_supported() { op_list_.insert({"Reshape", obj}); } { - UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5}, + UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, + V_2024_3, V_2024_4, V_2024_5, V_2025_0}, [this](const Node* node, const InitializedTensorSet&) { // If the operator is unsqueeze // If axes is an input, then we cannot produce a static graph. 
@@ -434,7 +437,8 @@ void DataOps::populate_op_mode_supported() { op_list_.insert({"Unsqueeze", obj}); } { - UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5}, + UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5, + V_2025_0}, [this](const Node* node, const InitializedTensorSet&) { // check for attributes auto& upsample_attr = node->GetAttributes(); diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.h b/onnxruntime/core/providers/openvino/ov_versions/data_ops.h index a2db56deca7cd..07fa36f355d55 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.h +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.h @@ -32,7 +32,8 @@ enum versionNum { V_2024_2, V_2024_3, V_2024_4, - V_2024_5 + V_2024_5, + V_2025_0 }; using VersionNum = enum versionNum; diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index decfe91c598be..387aaf9985b4c 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -30,6 +30,10 @@ constexpr std::string_view DuplicateDQ = "/duplicated"; constexpr ONNX_NAMESPACE::TensorProto_DataType DT_UINT16 = ONNX_NAMESPACE::TensorProto_DataType_UINT16; constexpr ONNX_NAMESPACE::TensorProto_DataType DT_INT16 = ONNX_NAMESPACE::TensorProto_DataType_INT16; +constexpr ONNX_NAMESPACE::TensorProto_DataType DT_UINT8 = ONNX_NAMESPACE::TensorProto_DataType_UINT8; +constexpr ONNX_NAMESPACE::TensorProto_DataType DT_INT8 = ONNX_NAMESPACE::TensorProto_DataType_INT8; +constexpr ONNX_NAMESPACE::TensorProto_DataType DT_UINT4 = ONNX_NAMESPACE::TensorProto_DataType_UINT4; +constexpr ONNX_NAMESPACE::TensorProto_DataType DT_INT4 = ONNX_NAMESPACE::TensorProto_DataType_INT4; // Return the data type of the qdq node. 
// Check output type of Q and input type of DQ to determine it as zero_point is an optional input and may not exist @@ -218,7 +222,7 @@ static bool DQFeedsASupportedOp(const Node* dq_node) { } else { return true; } - } else if (op_type == "Add") { + } else if (op_type == "Add" && !(GetQDQDataType(dq_node) == DT_UINT16 || GetQDQDataType(dq_node) == DT_INT16)) { // Add => keeps all DQs return true; } diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 6ee37b8b0519e..3527a89ca7a7b 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -2332,7 +2332,7 @@ def build_nuget_package( target_name = "/t:CreateWindowsAIPackage" elif use_openvino: execution_provider = "/p:ExecutionProvider=openvino" - package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.OpenVino" + package_name = "/p:OrtPackageId=Intel.ML.OnnxRuntime.OpenVino" elif use_tensorrt: execution_provider = "/p:ExecutionProvider=tensorrt" package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.TensorRT" diff --git a/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml index 9ee589a3d6ef3..c7b814f3dd52c 100644 --- a/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml @@ -33,5 +33,5 @@ jobs: parameters: AgentPool : 'Linux-CPU-2019' JobName: 'Linux_CI_Dev' - RunDockerBuildArgs: '-o ubuntu22.04 -p 3.10 -d openvino -v 2024.4.0 -x "--use_openvino CPU --build_wheel"' + RunDockerBuildArgs: '-o ubuntu22.04 -p 3.10 -d openvino -v 2024.5.0 -x "--use_openvino CPU --build_wheel"' TimeoutInMinutes: 120 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino index 8f3dcb69d6c56..643c0d66d01f5 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino @@ -1,7 +1,7 @@ ARG UBUNTU_VERSION=22.04 FROM ubuntu:${UBUNTU_VERSION} -ARG OPENVINO_VERSION=2024.4.0 +ARG OPENVINO_VERSION=2024.5.0 ARG PYTHON_VERSION=3.10 ADD scripts /tmp/scripts @@ -19,9 +19,9 @@ ENV IE_PLUGINS_PATH=$INTEL_OPENVINO_DIR/runtime/lib/intel64 ENV DEBIAN_FRONTEND=noninteractive RUN cd /opt && mkdir -p intel && cd intel && \ - wget https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.4/linux/l_openvino_toolkit_ubuntu22_2024.4.0.16579.c3152d32c9c_x86_64.tgz && \ - tar xzf l_openvino_toolkit_ubuntu22_2024.4.0.16579.c3152d32c9c_x86_64.tgz && rm -rf l_openvino_toolkit_ubuntu22_2024.4.0.16579.c3152d32c9c_x86_64.tgz && \ - mv l_openvino_toolkit_ubuntu22_2024.4.0.16579.c3152d32c9c_x86_64 openvino_2024.4.0 && \ + wget https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.5/linux/l_openvino_toolkit_ubuntu22_2024.5.0.17288.7975fa5da0c_x86_64.tgz && \ + tar xzf l_openvino_toolkit_ubuntu22_2024.5.0.17288.7975fa5da0c_x86_64.tgz && rm -rf l_openvino_toolkit_ubuntu22_2024.5.0.17288.7975fa5da0c_x86_64.tgz && \ + mv l_openvino_toolkit_ubuntu22_2024.5.0.17288.7975fa5da0c_x86_64 openvino_2024.5.0 && \ cd $INTEL_OPENVINO_DIR/install_dependencies && ./install_openvino_dependencies.sh -y WORKDIR /root diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py index ba125f4e2d980..11842f34ce45b 100644 --- a/tools/nuget/generate_nuspec_for_native_nuget.py +++ b/tools/nuget/generate_nuspec_for_native_nuget.py @@ -182,6 +182,8 @@ def 
generate_description(line_list, package_name): description = "This package contains Linux native shared library artifacts for ONNX Runtime with CUDA." elif "Microsoft.ML.OnnxRuntime.Gpu.Windows" in package_name: description = "This package contains Windows native shared library artifacts for ONNX Runtime with CUDA." + elif "Intel.ML.OnnxRuntime" in package_name: + description = "This package contains native shared library artifacts for ONNX Runtime with OpenVINO." elif "Microsoft.ML.OnnxRuntime" in package_name: # This is a Microsoft.ML.OnnxRuntime.* package description = ( "This package contains native shared library artifacts for all supported platforms of ONNX Runtime." @@ -715,7 +717,7 @@ def generate_files(line_list, args): ) if args.execution_provider == "openvino": - get_env_var("INTEL_OPENVINO_DIR") + openvino_path = get_env_var("INTEL_OPENVINO_DIR") files_list.append( "' ) + if is_windows(): + dll_list_path = os.path.join(openvino_path, "runtime\\bin\\intel64\\Release\\") + tbb_list_path = os.path.join(openvino_path, "runtime\\3rdparty\\tbb\\bin\\") + for dll_element in os.listdir(dll_list_path): + if dll_element.endswith("dll"): + files_list.append( + "' + ) + for tbb_element in os.listdir(tbb_list_path): + if tbb_element.endswith("dll"): + files_list.append( + "' + ) + if args.execution_provider == "cuda" or is_cuda_gpu_win_sub_package and not is_ado_packaging_build: files_list.append( "version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_3) { - graph_count = binary_info->contextBinaryInfoV3.numGraphs; - graphs_info = binary_info->contextBinaryInfoV3.graphs; - } else if (binary_info->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_2) { - graph_count = binary_info->contextBinaryInfoV2.numGraphs; - graphs_info = binary_info->contextBinaryInfoV2.graphs; - } else if (binary_info->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_1) { + if (binary_info->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_1) { graph_count = binary_info->contextBinaryInfoV1.numGraphs; graphs_info = binary_info->contextBinaryInfoV1.graphs; - } else { + } +#if QNN_API_VERSION_MAJOR == 2 && (QNN_API_VERSION_MINOR >= 15) // starts from 2.22 + else if (binary_info->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_2) { + graph_count = binary_info->contextBinaryInfoV2.numGraphs; + graphs_info = binary_info->contextBinaryInfoV2.graphs; + } +#endif +#if QNN_API_VERSION_MAJOR == 2 && (QNN_API_VERSION_MINOR >= 21) // starts from 2.28 + else if (binary_info->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_3) { + graph_count = binary_info->contextBinaryInfoV3.numGraphs; + graphs_info = binary_info->contextBinaryInfoV3.graphs; + } +#endif + else { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported context binary info version."); } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.cc b/onnxruntime/core/providers/qnn/builder/qnn_model.cc index 75973c7031d62..4f73e4c532ed4 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_model.cc @@ -325,28 +325,35 @@ Status QnnModel::DeserializeGraphInfoFromBinaryInfo(const QnnSystemContext_Graph Qnn_Tensor_t* output_tensors = nullptr; uint32_t graph_input_num = 0; uint32_t graph_output_num = 0; - if (qnn_sys_ctx_graph_info.version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_3) { - graph_name.assign(qnn_sys_ctx_graph_info.graphInfoV3.graphName); - graph_input_num = qnn_sys_ctx_graph_info.graphInfoV3.numGraphInputs; - graph_output_num = qnn_sys_ctx_graph_info.graphInfoV3.numGraphOutputs; + if 
(qnn_sys_ctx_graph_info.version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_1) { + graph_name.assign(qnn_sys_ctx_graph_info.graphInfoV1.graphName); + graph_input_num = qnn_sys_ctx_graph_info.graphInfoV1.numGraphInputs; + graph_output_num = qnn_sys_ctx_graph_info.graphInfoV1.numGraphOutputs; - input_tensors = qnn_sys_ctx_graph_info.graphInfoV3.graphInputs; - output_tensors = qnn_sys_ctx_graph_info.graphInfoV3.graphOutputs; - } else if (qnn_sys_ctx_graph_info.version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_2) { + input_tensors = qnn_sys_ctx_graph_info.graphInfoV1.graphInputs; + output_tensors = qnn_sys_ctx_graph_info.graphInfoV1.graphOutputs; + } +#if QNN_API_VERSION_MAJOR == 2 && (QNN_API_VERSION_MINOR >= 18) // start from 2.25 + else if (qnn_sys_ctx_graph_info.version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_2) { graph_name.assign(qnn_sys_ctx_graph_info.graphInfoV2.graphName); graph_input_num = qnn_sys_ctx_graph_info.graphInfoV2.numGraphInputs; graph_output_num = qnn_sys_ctx_graph_info.graphInfoV2.numGraphOutputs; input_tensors = qnn_sys_ctx_graph_info.graphInfoV2.graphInputs; output_tensors = qnn_sys_ctx_graph_info.graphInfoV2.graphOutputs; - } else if (qnn_sys_ctx_graph_info.version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_1) { - graph_name.assign(qnn_sys_ctx_graph_info.graphInfoV1.graphName); - graph_input_num = qnn_sys_ctx_graph_info.graphInfoV1.numGraphInputs; - graph_output_num = qnn_sys_ctx_graph_info.graphInfoV1.numGraphOutputs; + } +#endif +#if QNN_API_VERSION_MAJOR == 2 && (QNN_API_VERSION_MINOR >= 21) // start from 2.28 + else if (qnn_sys_ctx_graph_info.version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_3) { + graph_name.assign(qnn_sys_ctx_graph_info.graphInfoV3.graphName); + graph_input_num = qnn_sys_ctx_graph_info.graphInfoV3.numGraphInputs; + graph_output_num = qnn_sys_ctx_graph_info.graphInfoV3.numGraphOutputs; - input_tensors = qnn_sys_ctx_graph_info.graphInfoV1.graphInputs; - output_tensors = qnn_sys_ctx_graph_info.graphInfoV1.graphOutputs; - } else { + input_tensors = qnn_sys_ctx_graph_info.graphInfoV3.graphInputs; + output_tensors = qnn_sys_ctx_graph_info.graphInfoV3.graphOutputs; + } +#endif + else { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported context graph info version."); } ORT_RETURN_IF(nullptr == input_tensors, "Graph from cached context doesn't have any inputs."); From 2a36fd4f6e0aabfb4a00c4c20ab5b9bd474c60b8 Mon Sep 17 00:00:00 2001 From: Hector Li Date: Thu, 12 Dec 2024 21:12:02 -0800 Subject: [PATCH 7/8] Fix the ctx_gen tool to make sure all generated ctx.onnx have max_size (#23097) ### Description Fix the qnn_ctx_gen tool to make sure all generated ctx.onnx have max_size --- onnxruntime/test/qnn_ctx_gen/main.cc | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/onnxruntime/test/qnn_ctx_gen/main.cc b/onnxruntime/test/qnn_ctx_gen/main.cc index d568d5e78688a..3be0bd253c8a4 100644 --- a/onnxruntime/test/qnn_ctx_gen/main.cc +++ b/onnxruntime/test/qnn_ctx_gen/main.cc @@ -33,8 +33,11 @@ static void CheckStatus(const Status& status) { // from the last context cache Onnx model, find the EPContext node with main_context=1, // and get the QNN context binary file name, this context binary contains all graphs from all Onnx models +// get the max spill fill buffer size static void GetLastContextBinaryFileName(const std::basic_string last_onnx_ctx_file, - std::string& last_ctx_bin_file) { + std::string& last_ctx_bin_file, + int64_t& max_size) { + max_size = 0; std::shared_ptr ctx_model; 
CheckStatus(Model::Load(ToPathString(last_onnx_ctx_file), ctx_model, nullptr, (*((OrtEnv*)*ort_env.get())->GetEnvironment().GetLoggingManager()).DefaultLogger())); @@ -43,6 +46,7 @@ static void GetLastContextBinaryFileName(const std::basic_string last if (node.OpType() == "EPContext") { NodeAttrHelper node_helper(node); int64_t is_main_context = node_helper.Get("main_context", static_cast(0)); + max_size = node_helper.Get("max_size", static_cast(0)); if (1 == is_main_context) { last_ctx_bin_file = node_helper.Get("ep_cache_context", ""); return; @@ -55,7 +59,8 @@ static void GetLastContextBinaryFileName(const std::basic_string last // the last QNN context binary file // Remove not used QNN context binary file, only keep the last one which contains all graphs static void UpdateEpContextModel(const std::vector>& ep_ctx_files, - const std::string& last_qnn_ctx_binary_file_name) { + const std::string& last_qnn_ctx_binary_file_name, + int64_t max_size) { for (auto ep_ctx_file : ep_ctx_files) { std::shared_ptr ctx_model; auto path_str = ToPathString(ep_ctx_file); @@ -75,6 +80,8 @@ static void UpdateEpContextModel(const std::vector> std::remove(file_path.string().c_str()); node.ClearAttribute("ep_cache_context"); node.AddAttribute("ep_cache_context", last_qnn_ctx_binary_file_name); + node.ClearAttribute("max_size"); + node.AddAttribute("max_size", max_size); } } } @@ -181,7 +188,8 @@ int real_main(int argc, char* argv[]) { // Get the last context binary file name std::string last_qnn_ctx_binary_file_name; - GetLastContextBinaryFileName(ep_ctx_files.back(), last_qnn_ctx_binary_file_name); + int64_t max_size = 0; + GetLastContextBinaryFileName(ep_ctx_files.back(), last_qnn_ctx_binary_file_name, max_size); std::cout << "The last context binary file: " << last_qnn_ctx_binary_file_name << std::endl; if (last_qnn_ctx_binary_file_name.empty()) { throw Ort::Exception("Can't find QNN context binary file from the Onnx model.", OrtErrorCode::ORT_FAIL); @@ -191,7 +199,7 @@ int real_main(int argc, char* argv[]) { // Update generated context cache Onnx model to make the main EPContext node point to // the last QNN context binary file // Remove not used QNN context binary file, only keep the last one which contains all graphs - UpdateEpContextModel(ep_ctx_files, last_qnn_ctx_binary_file_name); + UpdateEpContextModel(ep_ctx_files, last_qnn_ctx_binary_file_name, max_size); } ORT_CATCH(const Ort::Exception& e) { fprintf(stderr, "Failed to generate context cache file: %s \n", e.what()); From 62e7e24f172a062242acae11575f7ea11529dd09 Mon Sep 17 00:00:00 2001 From: "genmingz@AMD" Date: Fri, 13 Dec 2024 13:13:43 +0800 Subject: [PATCH 8/8] Add attrProto.release_s interface (#22977) ### Description Add AttributeProto.release_s interface, which is used to obtain the string in the attribute using move semantics instead of copying it ### Motivation and Context The ep_context node stores a lot of information in attributes, which may cause the memory usage to increase. 
Use this interface to avoid memory waste --------- Co-authored-by: GenMing Zhong Co-authored-by: genmingz --- .../core/providers/shared_library/provider_interfaces.h | 1 + .../core/providers/shared_library/provider_wrappedtypes.h | 1 + onnxruntime/core/providers/vitisai/imp/attr_proto.cc | 4 ++++ onnxruntime/core/providers/vitisai/imp/attr_proto.h | 1 + onnxruntime/core/providers/vitisai/imp/global_api.cc | 7 +++++++ .../core/providers/vitisai/include/vaip/vaip_ort_api.h | 3 ++- onnxruntime/core/session/provider_bridge_ort.cc | 1 + 7 files changed, 17 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 7ab93d56cfe26..d182d0b9173bd 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -390,6 +390,7 @@ struct ProviderHost { virtual void AttributeProto__set_name(ONNX_NAMESPACE::AttributeProto* p, const ::std::string& value) = 0; virtual void AttributeProto__set_type(ONNX_NAMESPACE::AttributeProto* p, ONNX_NAMESPACE::AttributeProto_AttributeType value) = 0; virtual ONNX_NAMESPACE::TensorProto* AttributeProto__add_tensors(ONNX_NAMESPACE::AttributeProto* p) = 0; + virtual std::string* AttributeProto__release_s(ONNX_NAMESPACE::AttributeProto* p) = 0; // GraphProto virtual std::unique_ptr GraphProto__construct() = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index a82ddfe64c64b..54249f0864cd7 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -122,6 +122,7 @@ struct AttributeProto final { void set_name(const ::std::string& value) { return g_host->AttributeProto__set_name(this, value); } void set_type(AttributeProto_AttributeType value) { return g_host->AttributeProto__set_type(this, value); } TensorProto* add_tensors() { return g_host->AttributeProto__add_tensors(this); } + std::string* release_s() { return g_host->AttributeProto__release_s(this); } typedef AttributeProto_AttributeType AttributeType; static constexpr AttributeType UNDEFINED = AttributeProto_AttributeType_UNDEFINED; diff --git a/onnxruntime/core/providers/vitisai/imp/attr_proto.cc b/onnxruntime/core/providers/vitisai/imp/attr_proto.cc index a9275b24ce91f..2b9ddf8ad147f 100644 --- a/onnxruntime/core/providers/vitisai/imp/attr_proto.cc +++ b/onnxruntime/core/providers/vitisai/imp/attr_proto.cc @@ -104,4 +104,8 @@ std::vector attr_proto_get_strings(const ONNX_NAMESPACE::AttributeP } return ret; } +std::string* attr_proto_release_string(ONNX_NAMESPACE::AttributeProto* attr) { + vai_assert(attr->type() == ONNX_NAMESPACE::AttributeProto_AttributeType_STRING, attr->name()); + return attr->release_s(); +} } // namespace vaip diff --git a/onnxruntime/core/providers/vitisai/imp/attr_proto.h b/onnxruntime/core/providers/vitisai/imp/attr_proto.h index bb2883512037b..08d980ec94c14 100644 --- a/onnxruntime/core/providers/vitisai/imp/attr_proto.h +++ b/onnxruntime/core/providers/vitisai/imp/attr_proto.h @@ -23,5 +23,6 @@ const ONNX_NAMESPACE::TensorProto& attr_proto_get_tensor(const ONNX_NAMESPACE::A gsl::span attr_proto_get_ints(const ONNX_NAMESPACE::AttributeProto& attr); gsl::span attr_proto_get_floats(const ONNX_NAMESPACE::AttributeProto& attr); std::vector attr_proto_get_strings(const ONNX_NAMESPACE::AttributeProto& 
attr); +std::string* attr_proto_release_string(ONNX_NAMESPACE::AttributeProto* attr); } // namespace vaip diff --git a/onnxruntime/core/providers/vitisai/imp/global_api.cc b/onnxruntime/core/providers/vitisai/imp/global_api.cc index 3e802e5a77203..51dc79c569589 100644 --- a/onnxruntime/core/providers/vitisai/imp/global_api.cc +++ b/onnxruntime/core/providers/vitisai/imp/global_api.cc @@ -449,6 +449,13 @@ vaip_core::OrtApiForVaip* create_org_api_hook() { return vaip_core::DllSafe(model_proto.SerializeAsString()); }; the_global_api.model_proto_delete = [](ONNX_NAMESPACE::ModelProto* p) { delete p; }; + the_global_api.attr_proto_release_string = [](ONNX_NAMESPACE::AttributeProto* attr) -> vaip_core::DllSafe { + auto pstr = vaip::attr_proto_release_string(attr); + std::string local_str = std::move(*pstr); + pstr = nullptr; + return vaip_core::DllSafe(std::move(local_str)); + }; + if (!s_library_vitisaiep.vaip_get_version) { return reinterpret_cast(&(the_global_api.host_)); } else { diff --git a/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h b/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h index 288cfd6850d06..9425c08dceebc 100644 --- a/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h +++ b/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h @@ -13,7 +13,7 @@ struct OrtApi; namespace vaip_core { -#define VAIP_ORT_API_MAJOR (11u) +#define VAIP_ORT_API_MAJOR (12u) #define VAIP_ORT_API_MINOR (0u) #define VAIP_ORT_API_PATCH (0u) struct OrtApiForVaip { @@ -234,6 +234,7 @@ struct OrtApiForVaip { ModelProto* (*model_to_proto)(Model& model); // [95] DllSafe (*model_proto_serialize_as_string)(ModelProto& model_proto); // [96] void (*model_proto_delete)(ModelProto* p); // [97] + DllSafe (*attr_proto_release_string)(AttributeProto* attr); // [98] }; #ifndef USE_VITISAI diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index e0c479dbc7637..1444c1976d447 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -497,6 +497,7 @@ struct ProviderHostImpl : ProviderHost { void AttributeProto__set_name(ONNX_NAMESPACE::AttributeProto* p, const ::std::string& value) override { return p->set_name(value); } void AttributeProto__set_type(ONNX_NAMESPACE::AttributeProto* p, ONNX_NAMESPACE::AttributeProto_AttributeType value) override { return p->set_type(value); } ONNX_NAMESPACE::TensorProto* AttributeProto__add_tensors(ONNX_NAMESPACE::AttributeProto* p) override { return p->add_tensors(); } + std::string* AttributeProto__release_s(ONNX_NAMESPACE::AttributeProto* p) override { return p->release_s(); } // GraphProto (wrapped) std::unique_ptr GraphProto__construct() override { return std::make_unique(); }
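// Editor's illustrative sketch (not part of the patches above): how a caller might take ownership of a
// large `ep_cache_context` attribute string with the new release_s() accessor instead of copying it.
// `ExtractContextBlob` and its parameter are hypothetical, and the include path is an assumption;
// protobuf's release_s() hands back a heap-allocated std::string* that the caller owns, so it is wrapped
// in a unique_ptr here.
#include <memory>
#include <string>
#include <utility>
#include "onnx/onnx_pb.h"  // assumed include path for ONNX_NAMESPACE::AttributeProto

std::string ExtractContextBlob(ONNX_NAMESPACE::AttributeProto& ep_ctx_attr) {
  // ep_ctx_attr.s() returns a const reference that would still have to be copied to outlive the proto;
  // release_s() detaches the underlying string, so no second copy of the (potentially large) blob is made.
  std::unique_ptr<std::string> released(ep_ctx_attr.release_s());
  return std::move(*released);
}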