From e60587078320b06dba9338d5cc00a41a0a85068b Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 11 Dec 2024 10:24:14 -0800 Subject: [PATCH 1/8] [js/web] Update API for `ort.env.webgpu` (#23026) ### Description This PR is a replacement of #21671. It offers a new way for accessing the following: - `ort.env.webgpu.adapter`: - **deprecating**. There is no point to get the value of it. Once `GPUDevice.adapterInfo` is supported, there is no point to set the value too. - `ort.env.webgpu.device`: - set value of `GPUDevice` if user created it. Use at user's own risk. - get value of `Promise`. if not exist, create a new one. if exist return it. - `ort.env.webgpu.powerPreference`: - **deprecating**. encouraging users to set `ort.env.webgpu.device` if necessary. - `ort.env.webgpu.forceFallbackAdapter`: - **deprecating**. encouraging users to set `ort.env.webgpu.device` if necessary. --- js/common/lib/env.ts | 33 +++++++++++++++++++++++++-------- js/web/test/test-runner.ts | 12 ++++++------ 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/js/common/lib/env.ts b/js/common/lib/env.ts index e70f608ad7030..d6d9f7fa48790 100644 --- a/js/common/lib/env.ts +++ b/js/common/lib/env.ts @@ -45,17 +45,19 @@ export declare namespace Env { * * This setting is available only when WebAssembly SIMD feature is available in current context. * + * @defaultValue `true` + * * @deprecated This property is deprecated. Since SIMD is supported by all major JavaScript engines, non-SIMD * build is no longer provided. This property will be removed in future release. - * @defaultValue `true` */ simd?: boolean; /** * set or get a boolean value indicating whether to enable trace. * - * @deprecated Use `env.trace` instead. If `env.trace` is set, this property will be ignored. * @defaultValue `false` + * + * @deprecated Use `env.trace` instead. If `env.trace` is set, this property will be ignored. */ trace?: boolean; @@ -153,7 +155,7 @@ export declare namespace Env { /** * Set or get the profiling configuration. */ - profiling?: { + profiling: { /** * Set or get the profiling mode. * @@ -176,6 +178,9 @@ export declare namespace Env { * See {@link https://gpuweb.github.io/gpuweb/#dictdef-gpurequestadapteroptions} for more details. * * @defaultValue `undefined` + * + * @deprecated Create your own GPUAdapter, use it to create a GPUDevice instance and set {@link device} property if + * you want to use a specific power preference. */ powerPreference?: 'low-power' | 'high-performance'; /** @@ -187,6 +192,9 @@ export declare namespace Env { * See {@link https://gpuweb.github.io/gpuweb/#dictdef-gpurequestadapteroptions} for more details. * * @defaultValue `undefined` + * + * @deprecated Create your own GPUAdapter, use it to create a GPUDevice instance and set {@link device} property if + * you want to use a specific fallback option. */ forceFallbackAdapter?: boolean; /** @@ -199,16 +207,25 @@ export declare namespace Env { * value will be the GPU adapter that created by the underlying WebGPU backend. * * When use with TypeScript, the type of this property is `GPUAdapter` defined in "@webgpu/types". + * + * @deprecated It is no longer recommended to use this property. The latest WebGPU spec adds `GPUDevice.adapterInfo` + * (https://www.w3.org/TR/webgpu/#dom-gpudevice-adapterinfo), which allows to get the adapter information from the + * device. When it's available, there is no need to set/get the {@link adapter} property. 
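 *
 * Editor's illustrative sketch (not part of this patch): assuming a browser that already exposes
 * `GPUDevice.adapterInfo`, adapter details can be read from the device returned by the new
 * `ort.env.webgpu.device` accessor instead of from this deprecated property:
 *
 *   // resolves to the GPUDevice used (or to be used) by the WebGPU backend
 *   const device = await ort.env.webgpu.device;
 *   // adapter information is exposed on the device itself by the WebGPU spec
 *   console.log(device.adapterInfo?.vendor, device.adapterInfo?.architecture);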
*/ adapter: TryGetGlobalType<'GPUAdapter'>; /** - * Get the device for WebGPU. - * - * This property is only available after the first WebGPU inference session is created. + * Set or get the GPU device for WebGPU. * - * When use with TypeScript, the type of this property is `GPUDevice` defined in "@webgpu/types". + * There are 3 valid scenarios of accessing this property: + * - Set a value before the first WebGPU inference session is created. The value will be used by the WebGPU backend + * to perform calculations. If the value is not a `GPUDevice` object, an error will be thrown. + * - Get the value before the first WebGPU inference session is created. This will try to create a new GPUDevice + * instance. Returns a `Promise` that resolves to a `GPUDevice` object. + * - Get the value after the first WebGPU inference session is created. Returns a resolved `Promise` to the + * `GPUDevice` object used by the WebGPU backend. */ - readonly device: TryGetGlobalType<'GPUDevice'>; + get device(): Promise>; + set device(value: TryGetGlobalType<'GPUDevice'>); /** * Set or get whether validate input content. * diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index d54ba32f9f494..5de39535a5c07 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -586,11 +586,11 @@ export class TensorResultValidator { } } -function createGpuTensorForInput(cpuTensor: ort.Tensor): ort.Tensor { +async function createGpuTensorForInput(cpuTensor: ort.Tensor): Promise { if (!isGpuBufferSupportedType(cpuTensor.type) || Array.isArray(cpuTensor.data)) { throw new Error(`createGpuTensorForInput can not work with ${cpuTensor.type} tensor`); } - const device = ort.env.webgpu.device as GPUDevice; + const device = await ort.env.webgpu.device; const gpuBuffer = device.createBuffer({ // eslint-disable-next-line no-bitwise usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE, @@ -612,14 +612,14 @@ function createGpuTensorForInput(cpuTensor: ort.Tensor): ort.Tensor { }); } -function createGpuTensorForOutput(type: ort.Tensor.Type, dims: readonly number[]) { +async function createGpuTensorForOutput(type: ort.Tensor.Type, dims: readonly number[]) { if (!isGpuBufferSupportedType(type)) { throw new Error(`createGpuTensorForOutput can not work with ${type} tensor`); } const size = calculateTensorSizeInBytes(tensorDataTypeStringToEnum(type), dims)!; - const device = ort.env.webgpu.device as GPUDevice; + const device = await ort.env.webgpu.device; const gpuBuffer = device.createBuffer({ // eslint-disable-next-line no-bitwise usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE, @@ -725,7 +725,7 @@ export async function sessionRun(options: { if (options.ioBinding === 'ml-location' || options.ioBinding === 'ml-tensor') { feeds[name] = await createMLTensorForInput(options.mlContext!, feeds[name]); } else { - feeds[name] = createGpuTensorForInput(feeds[name]); + feeds[name] = await createGpuTensorForInput(feeds[name]); } } } @@ -742,7 +742,7 @@ export async function sessionRun(options: { if (options.ioBinding === 'ml-tensor') { fetches[name] = await createMLTensorForOutput(options.mlContext!, type, dims); } else { - fetches[name] = createGpuTensorForOutput(type, dims); + fetches[name] = await createGpuTensorForOutput(type, dims); } } } From ebb968d34af32e70e361321db37d56ea3bb4ce68 Mon Sep 17 00:00:00 2001 From: Hector Li Date: Wed, 11 Dec 2024 17:26:29 -0800 Subject: [PATCH 2/8] disable the EP context embed model by default in session option 
(#23070) change the default value for session option ep.context_embed_mode to 0 to avoid the model loading memory overhead --- .../core/session/onnxruntime_session_options_config_keys.h | 4 ++-- onnxruntime/core/providers/openvino/contexts.h | 2 +- .../core/providers/openvino/openvino_execution_provider.h | 2 +- .../core/providers/openvino/openvino_provider_factory.cc | 2 +- onnxruntime/core/providers/qnn/qnn_execution_provider.cc | 2 +- .../core/providers/vitisai/vitisai_execution_provider.h | 2 +- onnxruntime/core/session/provider_bridge_ort.cc | 2 +- onnxruntime/test/onnx/main.cc | 5 ++++- 8 files changed, 12 insertions(+), 9 deletions(-) diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h index 6a01602e634f8..8f1bc98ce7b49 100644 --- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h @@ -261,8 +261,8 @@ static const char* const kOrtSessionOptionEpContextEnable = "ep.context_enable"; static const char* const kOrtSessionOptionEpContextFilePath = "ep.context_file_path"; // Flag to specify whether to dump the EP context into the Onnx model. -// "0": dump the EP context into separate file, keep the file name in the Onnx model. -// "1": dump the EP context into the Onnx model. (default). +// "0": dump the EP context into separate file, keep the file name in the Onnx model. (default). +// "1": dump the EP context into the Onnx model. static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.context_embed_mode"; // Specify the EPContext node name prefix to make it unique diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index a2f4b236213cc..4f970bc7bc287 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -18,7 +18,7 @@ struct GlobalContext { bool is_wholly_supported_graph = false; bool enable_opencl_throttling = false; bool disable_dynamic_shapes = false; - bool ep_context_embed_mode = true; + bool ep_context_embed_mode = false; bool export_ep_ctx_blob = false; bool enable_qdq_optimizer = false; bool disable_cpu_fallback = false; diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index bea9badea475a..59dbd141f4782 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -90,7 +90,7 @@ struct OpenVINOExecutionProviderInfo { bool export_ep_ctx_blob_{false}; bool enable_qdq_optimizer_{false}; bool disable_cpu_fallback_{false}; - bool so_epctx_embed_mode_{true}; + bool so_epctx_embed_mode_{false}; OpenVINOExecutionProviderInfo() = delete; diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 57c4e92685c96..66f9bcb7b2a5e 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -53,7 +53,7 @@ struct OpenVINOProviderFactory : IExecutionProviderFactory { std::unique_ptr OpenVINOProviderFactory::CreateProvider() { bool so_disable_cpu_fallback = config_options_.GetConfigOrDefault(kOrtSessionOptionsDisableCPUEPFallback, "0") == "1"; bool so_export_ep_ctx_blob = 
config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1"; - bool so_epctx_embed_mode = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "1") == "1"; + bool so_epctx_embed_mode = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0") == "1"; std::string so_cache_path = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "").c_str(); if (so_export_ep_ctx_blob && !so_cache_path.empty()) { diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index 060bbd4f79bf2..27e195dea73d2 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -204,7 +204,7 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio LOGS_DEFAULT(VERBOSE) << "Context cache enable: " << context_cache_enabled_; std::string embed_mode = session_options->config_options.GetConfigOrDefault( - kOrtSessionOptionEpContextEmbedMode, "1"); + kOrtSessionOptionEpContextEmbedMode, "0"); if ("1" == embed_mode) { qnn_context_embed_mode_ = true; } else if ("0" == embed_mode) { diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h index 9864a40bd1d3b..77dede6035b4c 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h +++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h @@ -52,7 +52,7 @@ class VitisAIExecutionProvider : public IExecutionProvider { std::shared_ptr registry_; // EP context related. bool ep_ctx_enabled_ = false; - bool ep_ctx_embed_mode_ = true; + bool ep_ctx_embed_mode_ = false; std::string ep_ctx_model_path_cfg_{""}; mutable PathString ep_ctx_model_file_loc_{}; // It might need to be called before loading diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index c3832498af584..e0c479dbc7637 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -2242,7 +2242,7 @@ ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_TensorRT_V2, new_tensorrt_options.trt_ep_context_file_path = (context_cache_path.size() == 0) ? 
nullptr : context_cache_path.c_str(); LOGS_DEFAULT(VERBOSE) << "User specified context cache path: " << context_cache_path; - embed_mode = (options->value).config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "1"); + embed_mode = (options->value).config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0"); if ("1" == embed_mode) { new_tensorrt_options.trt_ep_context_embed_mode = 1; } else if ("0" == embed_mode) { diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index ddc453f84feb6..99c3e44e13013 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -454,8 +454,11 @@ int real_main(int argc, char* argv[], Ort::Env& env) { if (ep_context_enable) sf.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1"); - if (disable_ep_context_embed_mode) + if (disable_ep_context_embed_mode) { sf.AddConfigEntry(kOrtSessionOptionEpContextEmbedMode, "0"); + } else { + sf.AddConfigEntry(kOrtSessionOptionEpContextEmbedMode, "1"); + } for (auto& it : session_config_entries) { sf.AddConfigEntry(it.first.c_str(), it.second.c_str()); From 1f88284f96d98b735824c8a9eebee4fc85becf7d Mon Sep 17 00:00:00 2001 From: Ankit Maheshkar Date: Thu, 12 Dec 2024 11:56:32 +0530 Subject: [PATCH 3/8] OVEP 1.21.0 Development Updates (#23080) ### Description OVEP development changes for ORT 1.21 Release ### Motivation and Context - Has Critical Bug Fixes - Improved Performance optimizations for both memory & inference latency (https://github.com/intel/onnxruntime/pull/513) - Enabled Model Compilation using NPUW (https://github.com/intel/onnxruntime/pull/508) - Fixed support for EPContext embed mode 0 for lower memory utilization - Updated NuGet package name as `Intel.ML.OnnxRuntime.OpenVino` - Fixed QDQ Stripping logic on NPU --- cmake/onnxruntime_providers_openvino.cmake | 4 +- .../providers/openvino/backend_manager.cc | 5 +- .../core/providers/openvino/backend_utils.cc | 16 +++--- .../core/providers/openvino/backend_utils.h | 2 +- .../openvino/backends/basic_backend.cc | 57 ++++++++++++++++--- .../openvino/backends/basic_backend.h | 1 - .../openvino/onnx_ctx_model_helper.cc | 13 ++++- .../openvino/onnx_ctx_model_helper.h | 6 +- .../openvino/openvino_execution_provider.h | 2 +- .../openvino/openvino_provider_factory.cc | 4 +- .../core/providers/openvino/ov_allocator.cc | 1 - .../core/providers/openvino/ov_interface.cc | 8 +-- .../core/providers/openvino/ov_interface.h | 2 +- .../openvino/ov_versions/capability.cc | 8 +-- .../openvino/ov_versions/data_ops.cc | 12 ++-- .../providers/openvino/ov_versions/data_ops.h | 3 +- .../qdq_transformations/qdq_stripping.cc | 6 +- tools/ci_build/build.py | 2 +- .../linux-openvino-ci-pipeline.yml | 2 +- .../linux/docker/Dockerfile.ubuntu_openvino | 8 +-- .../nuget/generate_nuspec_for_native_nuget.py | 28 ++++++++- 21 files changed, 138 insertions(+), 52 deletions(-) diff --git a/cmake/onnxruntime_providers_openvino.cmake b/cmake/onnxruntime_providers_openvino.cmake index e500957f864f8..f5fae8d169ccc 100644 --- a/cmake/onnxruntime_providers_openvino.cmake +++ b/cmake/onnxruntime_providers_openvino.cmake @@ -13,8 +13,8 @@ # Header paths find_package(OpenVINO REQUIRED COMPONENTS Runtime ONNX) - if(OpenVINO_VERSION VERSION_LESS 2024.3) - message(FATAL_ERROR "OpenVINO 2024.3 and newer are supported. Please, use latest OpenVINO release") + if(OpenVINO_VERSION VERSION_LESS 2024.4) + message(FATAL_ERROR "OpenVINO 2024.4 and newer are supported. 
Please, use latest OpenVINO release") endif() if(OpenVINO_VERSION VERSION_GREATER_EQUAL 2024.4) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 0ffde116f4efc..a0bcf953938d9 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -70,7 +70,10 @@ BackendManager::BackendManager(const GlobalContext& global_context, i++; } subgraph_context_.subgraph_name = fused_node.Name(); - auto model_proto = GetModelProtoFromFusedNode(fused_node, subgraph, logger); + std::unique_ptr model_proto; + if (!ep_ctx_handle_.IsValidOVEPCtxGraph()) { + model_proto = GetModelProtoFromFusedNode(fused_node, subgraph, logger); + } std::string device_type = openvino_ep::BackendManager::GetGlobalContext().device_type; if (ModelHasSymbolicInputDims(subgraph)) { diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index f772b9c3b0478..b97736f2e124d 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -39,7 +39,7 @@ struct static_cast_int64 { int64_t operator()(const T1& x) const { return static_cast(x); } }; -std::shared_ptr +std::shared_ptr CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext& global_context, std::map>& const_outputs_map) { if (IsCILogEnabled()) { @@ -47,13 +47,13 @@ CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext } const std::string model = model_proto.SerializeAsString(); try { - auto cnn_network = global_context.ie_core.ReadModel(model, global_context.onnx_model_path_name); + auto ov_model = global_context.ie_core.ReadModel(model, global_context.onnx_model_path_name); // Check for Constant Folding - if (!global_context.is_wholly_supported_graph) { + if ((global_context.device_type != "NPU") && !global_context.is_wholly_supported_graph) { ov::pass::ConstantFolding pass_const_obj; - pass_const_obj.run_on_model(cnn_network); - auto& results = const_cast(cnn_network.get()->get_results()); + pass_const_obj.run_on_model(ov_model); + auto& results = const_cast(ov_model.get()->get_results()); size_t index = results.size() - 1; for (auto it = results.rbegin(); it != results.rend(); ++it) { @@ -67,12 +67,12 @@ CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext } #ifndef NDEBUG if (IsDebugEnabled()) { - std::string name = cnn_network->get_friendly_name(); + std::string name = ov_model->get_friendly_name(); ov::pass::Serialize serializer(name + ".xml", name + ".bin"); - serializer.run_on_model(cnn_network); + serializer.run_on_model(ov_model); } #endif - return cnn_network; + return ov_model; } catch (std::string const& msg) { ORT_THROW(msg); } diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index 9e65770da7d23..9d58e1ca73abb 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -60,7 +60,7 @@ void FillInputBlob(OVTensorPtr inputBlob, size_t batch_slice_idx, void FillOutputBlob(OVTensorPtr outputBlob, Ort::UnownedValue& output_tensor, size_t batch_slice_idx); -std::shared_ptr +std::shared_ptr CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext& global_context, std::map>& const_outputs_map); diff --git 
a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index 56cceb8cf2a19..435ca83ff69d4 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -48,6 +48,16 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr // Set the inference_num_threads property of the CPU SetNumThreads(device_config); + auto npuw_status = + std::any_of(device_config.begin(), device_config.end(), [&](const std::pair& pair) { + return (pair.first.find("NPU_USE_NPUW") != std::string::npos) && (pair.second.is()) && + (pair.second.as() == "YES"); + }); + + if (npuw_status) { + LOGS_DEFAULT(INFO) << log_tag << "NPUW Enabled during compilation"; + } + try { std::string dev_prec = global_context.device_type + "_" + global_context_.precision_str; @@ -81,7 +91,6 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr device_config, global_context_.ep_context_embed_mode, subgraph_context_.subgraph_name); - ie_cnn_network_ = exe_network_.Get().get_runtime_model(); } else if (global_context_.export_ep_ctx_blob && hw_target.find("NPU") != std::string::npos && !global_context_.has_external_weights) { @@ -106,15 +115,15 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr device_config, subgraph_context_.subgraph_name); } else { // For all other types use ov::Model Type - ie_cnn_network_ = CreateOVModel(*model_proto, global_context_, const_outputs_map_); + auto ov_model = CreateOVModel(*model_proto, global_context_, const_outputs_map_); exe_network_ = global_context_.ie_core.CompileModel( - ie_cnn_network_, hw_target, device_config, subgraph_context_.subgraph_name); + ov_model, hw_target, device_config, subgraph_context_.subgraph_name); } #endif } else { // Full graph is not supported - ie_cnn_network_ = CreateOVModel(*model_proto, global_context_, const_outputs_map_); + auto ov_model = CreateOVModel(*model_proto, global_context_, const_outputs_map_); exe_network_ = global_context_.ie_core.CompileModel( - ie_cnn_network_, hw_target, device_config, subgraph_context_.subgraph_name); + ov_model, hw_target, device_config, subgraph_context_.subgraph_name); } LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin"; } catch (const char* msg) { @@ -145,8 +154,8 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { device_config.emplace(ov::hint::inference_precision("f32")); } if (global_context_.precision_str.find("ACCURACY") != std::string::npos && - global_context_.device_type == "GPU") { - if (global_context_.OpenVINO_Version.at(0) >= 2024 && global_context_.OpenVINO_Version.at(1) >= 1) { + global_context_.device_type.find("GPU") != std::string::npos) { + if (global_context_.OpenVINO_Version.at(0) >= 2024) { device_config.emplace(ov::hint::inference_precision(ov::element::undefined)); device_config.emplace(ov::hint::execution_mode(ov::hint::ExecutionMode::ACCURACY)); } else { @@ -174,7 +183,7 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { device_property = std::make_pair("NPU_COMPILER_TYPE", env_npu_compiler_type); } device_config.emplace(ov::device::properties("NPU", device_property)); -#if (OPENVINO_VERSION_MAJOR >= 2024) && (OPENVINO_VERSION_MINOR > 3) +#if (((OPENVINO_VERSION_MAJOR == 2024) && (OPENVINO_VERSION_MINOR > 3)) || (OPENVINO_VERSION_MAJOR > 2024)) if (global_context_.export_ep_ctx_blob) { global_context_.ie_core.Get().set_property("NPU", ov::intel_npu::bypass_umd_caching(true)); } @@ -184,6 +193,33 @@ void 
BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { if (!global_context_.load_config.empty()) { const std::map& target_config = global_context_.load_config; + if (global_context_.device_type.find("NPU") != std::string::npos) { + auto npuw_config = target_config.at("NPU"); + + // Check if "NPU_USE_NPUW" exists and is set to "YES" + auto npu_use_npuw_it = npuw_config.find("NPU_USE_NPUW"); + if (npu_use_npuw_it != npuw_config.end() && + npu_use_npuw_it->second.is() && + npu_use_npuw_it->second.as() == "YES") { + // Only add NPUW-related keys if NPU_USE_NPUW is "YES" + for (const auto& [key, value] : npuw_config) { + if (key.find("NPUW") != std::string::npos) { + if (!value.is()) { + LOGS_DEFAULT(ERROR) << "Invalid value type for key: " << key; + continue; + } + device_config[key] = value; + } + } + } else { + // Check if there are any "NPUW" keys and log a warning + if (std::any_of(npuw_config.begin(), npuw_config.end(), + [&](const auto& pair) { return pair.first.find("NPUW") != std::string::npos; })) { + LOGS_DEFAULT(WARNING) << "Skipping NPUW-related configurations as NPU_USE_NPUW is not set to 'YES'."; + } + } + } + // Parse device types like "AUTO:CPU,GPU" and extract individual devices auto parse_individual_devices = [&](const std::string& device_type) -> std::vector { std::vector devices; @@ -213,6 +249,9 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { auto set_target_properties = [&](const std::string& device, const ov::AnyMap& config_options, const std::vector& supported_properties) { for (const auto& [key, value] : config_options) { + if (key.find("NPUW") != std::string::npos) { + continue; + } if (is_supported_and_mutable(key, supported_properties)) { global_context_.ie_core.Get().set_property(device, ov::AnyMap{{key, value}}); } else { @@ -378,7 +417,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque if ((it == ort_ov_tensor_map.end()) || (it != ort_ov_tensor_map.end() && (it->second.ort_ptr != tensor.GetTensorRawData()))) { ov_tensor_data_t ov_tensor_data; - auto input = graph_input_info.at(input_idx); + const auto& input = graph_input_info.at(input_idx); ov_tensor_data.tensor_ptr = std::make_shared(input.get_element_type(), input.get_shape(), const_cast(tensor.GetTensorRawData())); diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h index 12502a1d83c5d..3fcf6e4384d52 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.h +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h @@ -58,7 +58,6 @@ class BasicBackend : public IBackend { GlobalContext& global_context_; SubGraphContext subgraph_context_; mutable std::mutex compute_lock_; - std::shared_ptr ie_cnn_network_; OVExeNetwork exe_network_; std::map> const_outputs_map_; std::unique_ptr inferRequestsQueue_; diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc index 42a2b5d30c25c..6d159db3b390d 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc @@ -99,7 +99,9 @@ Status EPCtxHandler::ImportBlobFromEPCtxModel(const GraphViewer& graph_viewer, b auto node = graph_viewer.GetNode(0); auto& attrs = node->GetAttributes(); ORT_ENFORCE(attrs.count(EP_CACHE_CONTEXT) > 0); - model_stream_ = std::make_shared(attrs.at(EP_CACHE_CONTEXT).s()); + + ep_cache_context_attribute_ = 
&attrs.at(EP_CACHE_CONTEXT); + ep_context_embed_mode = static_cast(attrs.at(EMBED_MODE).i()); LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Read blob from EPContext Node"; @@ -107,6 +109,15 @@ Status EPCtxHandler::ImportBlobFromEPCtxModel(const GraphViewer& graph_viewer, b return Status::OK(); } +const std::string& EPCtxHandler::GetModelBlobStream() const { + static std::string empty; + if (ep_cache_context_attribute_ != nullptr) { + return ep_cache_context_attribute_->s(); + } else { + return empty; + } +} + bool EPCtxHandler::CheckForOVEPCtxNode(const GraphViewer& graph_viewer, std::string openvino_sdk_version) const { for (int i = 0; i < graph_viewer.MaxNodeIndex(); ++i) { auto node = graph_viewer.GetNode(i); diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h index c7ee943dff761..caab33b7db775 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h @@ -23,7 +23,7 @@ static const char SOURCE[] = "source"; class EPCtxHandler { public: EPCtxHandler() = default; - EPCtxHandler(const EPCtxHandler&) = default; + EPCtxHandler(const EPCtxHandler&) = delete; Status ExportEPCtxModel(const GraphViewer& graph_viewer, const std::string& graph_name, const logging::Logger& logger, @@ -33,11 +33,11 @@ class EPCtxHandler { Status ImportBlobFromEPCtxModel(const GraphViewer& graph_viewer, bool& ep_context_embed_mode); bool CheckForOVEPCtxNode(const GraphViewer& graph_viewer, std::string openvino_sdk_version) const; bool IsValidOVEPCtxGraph() const { return is_valid_ep_ctx_graph_; } - [[nodiscard]] const std::shared_ptr GetModelBlobStream() const { return model_stream_; } + const std::string& GetModelBlobStream() const; private: bool is_valid_ep_ctx_graph_{false}; - std::shared_ptr model_stream_; + const onnx::AttributeProto* ep_cache_context_attribute_; }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index 59dbd141f4782..d5c22a4e2a9e4 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -159,7 +159,7 @@ struct OpenVINOExecutionProviderInfo { device_type_ = std::move(dev_type); } else if (dev_type.find("HETERO") == 0 || dev_type.find("MULTI") == 0 || dev_type.find("AUTO") == 0) { std::vector devices = parseDevices(dev_type, available_devices); - device_type_ = dev_type; + device_type_ = std::move(dev_type); } else { ORT_THROW("Invalid device string: " + dev_type); } diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 66f9bcb7b2a5e..5855cb594a08e 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -57,7 +57,7 @@ std::unique_ptr OpenVINOProviderFactory::CreateProvider() { std::string so_cache_path = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "").c_str(); if (so_export_ep_ctx_blob && !so_cache_path.empty()) { - cache_dir_ = so_cache_path; + cache_dir_ = std::move(so_cache_path); auto file_path = std::filesystem::path(cache_dir_); // ep_context_file_path_ file extension must be .onnx if (file_path.extension().generic_string() == ".onnx") { @@ -248,7 +248,7 @@ struct OpenVINO_Provider : Provider { 
LOGS_DEFAULT(WARNING) << "Unsupported JSON value type for key: " << inner_key << ". Skipping key."; } } - target_map[key] = inner_map; + target_map[key] = std::move(inner_map); } } catch (const nlohmann::json::parse_error& e) { // Handle syntax errors in JSON diff --git a/onnxruntime/core/providers/openvino/ov_allocator.cc b/onnxruntime/core/providers/openvino/ov_allocator.cc index 6700244b754d8..0e5ff8ff98efb 100644 --- a/onnxruntime/core/providers/openvino/ov_allocator.cc +++ b/onnxruntime/core/providers/openvino/ov_allocator.cc @@ -39,7 +39,6 @@ void* OVRTAllocator::Alloc(size_t size) { } catch (const ov::Exception& e) { ORT_THROW(std::string("Alloc failed: ") + e.what()); } - return nullptr; } void OVRTAllocator::Free(void* p) { diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 7e8681d304abf..12ab7ecede031 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -109,7 +109,7 @@ OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, } } -OVExeNetwork OVCore::ImportModel(std::shared_ptr model_stream, +OVExeNetwork OVCore::ImportModel(const std::string& model_string, std::string hw_target, const ov::AnyMap& device_config, bool embed_mode, @@ -117,10 +117,10 @@ OVExeNetwork OVCore::ImportModel(std::shared_ptr model_strea try { ov::CompiledModel obj; if (embed_mode) { - obj = oe.import_model(*model_stream, hw_target, device_config); + std::istringstream model_stream(model_string); + obj = oe.import_model(model_stream, hw_target, device_config); } else { - std::string blob_file_path = (*model_stream).str(); - std::ifstream modelStream(blob_file_path, std::ios_base::binary | std::ios_base::in); + std::ifstream modelStream(model_string, std::ios_base::binary | std::ios_base::in); obj = oe.import_model(modelStream, hw_target, {}); diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index f4da4ea3e3244..c3417003f8e1f 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -54,7 +54,7 @@ class OVCore { ov::AnyMap& device_config, const std::string& name); // OV Interface for Import model Stream - OVExeNetwork ImportModel(std::shared_ptr model_stream, + OVExeNetwork ImportModel(const std::string& model_string, std::string hw_target, const ov::AnyMap& device_config, bool embed_mode, diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.cc b/onnxruntime/core/providers/openvino/ov_versions/capability.cc index 95c7466e02f2f..3e780f74145ae 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.cc @@ -35,14 +35,14 @@ GetCapability::GetCapability(const GraphViewer& graph_viewer_param, device_type_ = "CPU"; if (enable_qdq_optimizer) npu_qdq_optimizer_enabled = true; } -#if OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 3 - data_ops_ = new DataOps(graph_viewer_, V_2024_3, device_type_, npu_qdq_optimizer_enabled); -#elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 4 +#if OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 4 data_ops_ = new DataOps(graph_viewer_, V_2024_4, device_type_, npu_qdq_optimizer_enabled); #elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 5 data_ops_ = new DataOps(graph_viewer_, V_2024_5, device_type_, npu_qdq_optimizer_enabled); +#elif 
OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR == 0 + data_ops_ = new DataOps(graph_viewer_, V_2025_0, device_type_, npu_qdq_optimizer_enabled); #else - data_ops_ = new DataOps(graph_viewer_, V_2024_5, device_type_, npu_qdq_optimizer_enabled); + data_ops_ = new DataOps(graph_viewer_, V_2025_0, device_type_, npu_qdq_optimizer_enabled); #endif } diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc index b2c5fd6f83167..f118f057ac11e 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc @@ -355,6 +355,7 @@ void DataOps::populate_op_mode_supported() { no_dimension_supported_.push_back({"Floor", V_2020_4, {"All"}}); no_dimension_supported_.push_back({"Gather", V_2020_4, {"All"}}); no_dimension_supported_.push_back({"Identity", V_2023_0, {"All"}}); + no_dimension_supported_.push_back({"If", V_2022_3, {"CPU", "GPU"}}); no_dimension_supported_.push_back({"Less", V_2022_1, {"CPU"}}); no_dimension_supported_.push_back({"Loop", V_2021_4, {"All"}}); no_dimension_supported_.push_back({"Min", V_2020_4, {"All"}}); @@ -387,7 +388,7 @@ void DataOps::populate_op_mode_supported() { // populate unsupportedmode_t { - UnsupportedOpMode obj = {{V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5}, + UnsupportedOpMode obj = {{V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5, V_2025_0}, [this](const Node* node, const InitializedTensorSet&) { // If the Input of ReduceMax op is UINT8, it is rejected (Due to output mismatch) for (size_t i = 0; i < node->InputDefs().size(); i++) { @@ -402,7 +403,8 @@ void DataOps::populate_op_mode_supported() { op_list_.insert({"ReduceMax", obj}); } { - UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5}, + UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, + V_2024_3, V_2024_4, V_2024_5, V_2025_0}, [this](const Node* node, const InitializedTensorSet&) { const auto& input_arg = node->InputDefs()[1]; auto shape = input_arg->Shape(); @@ -419,7 +421,8 @@ void DataOps::populate_op_mode_supported() { op_list_.insert({"Reshape", obj}); } { - UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5}, + UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, + V_2024_3, V_2024_4, V_2024_5, V_2025_0}, [this](const Node* node, const InitializedTensorSet&) { // If the operator is unsqueeze // If axes is an input, then we cannot produce a static graph. 
@@ -434,7 +437,8 @@ void DataOps::populate_op_mode_supported() { op_list_.insert({"Unsqueeze", obj}); } { - UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5}, + UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5, + V_2025_0}, [this](const Node* node, const InitializedTensorSet&) { // check for attributes auto& upsample_attr = node->GetAttributes(); diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.h b/onnxruntime/core/providers/openvino/ov_versions/data_ops.h index a2db56deca7cd..07fa36f355d55 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.h +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.h @@ -32,7 +32,8 @@ enum versionNum { V_2024_2, V_2024_3, V_2024_4, - V_2024_5 + V_2024_5, + V_2025_0 }; using VersionNum = enum versionNum; diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index decfe91c598be..387aaf9985b4c 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -30,6 +30,10 @@ constexpr std::string_view DuplicateDQ = "/duplicated"; constexpr ONNX_NAMESPACE::TensorProto_DataType DT_UINT16 = ONNX_NAMESPACE::TensorProto_DataType_UINT16; constexpr ONNX_NAMESPACE::TensorProto_DataType DT_INT16 = ONNX_NAMESPACE::TensorProto_DataType_INT16; +constexpr ONNX_NAMESPACE::TensorProto_DataType DT_UINT8 = ONNX_NAMESPACE::TensorProto_DataType_UINT8; +constexpr ONNX_NAMESPACE::TensorProto_DataType DT_INT8 = ONNX_NAMESPACE::TensorProto_DataType_INT8; +constexpr ONNX_NAMESPACE::TensorProto_DataType DT_UINT4 = ONNX_NAMESPACE::TensorProto_DataType_UINT4; +constexpr ONNX_NAMESPACE::TensorProto_DataType DT_INT4 = ONNX_NAMESPACE::TensorProto_DataType_INT4; // Return the data type of the qdq node. 
// Check output type of Q and input type of DQ to determine it as zero_point is an optional input and may not exist @@ -218,7 +222,7 @@ static bool DQFeedsASupportedOp(const Node* dq_node) { } else { return true; } - } else if (op_type == "Add") { + } else if (op_type == "Add" && !(GetQDQDataType(dq_node) == DT_UINT16 || GetQDQDataType(dq_node) == DT_INT16)) { // Add => keeps all DQs return true; } diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 6ee37b8b0519e..3527a89ca7a7b 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -2332,7 +2332,7 @@ def build_nuget_package( target_name = "/t:CreateWindowsAIPackage" elif use_openvino: execution_provider = "/p:ExecutionProvider=openvino" - package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.OpenVino" + package_name = "/p:OrtPackageId=Intel.ML.OnnxRuntime.OpenVino" elif use_tensorrt: execution_provider = "/p:ExecutionProvider=tensorrt" package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.TensorRT" diff --git a/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml index 9ee589a3d6ef3..c7b814f3dd52c 100644 --- a/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml @@ -33,5 +33,5 @@ jobs: parameters: AgentPool : 'Linux-CPU-2019' JobName: 'Linux_CI_Dev' - RunDockerBuildArgs: '-o ubuntu22.04 -p 3.10 -d openvino -v 2024.4.0 -x "--use_openvino CPU --build_wheel"' + RunDockerBuildArgs: '-o ubuntu22.04 -p 3.10 -d openvino -v 2024.5.0 -x "--use_openvino CPU --build_wheel"' TimeoutInMinutes: 120 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino index 8f3dcb69d6c56..643c0d66d01f5 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino @@ -1,7 +1,7 @@ ARG UBUNTU_VERSION=22.04 FROM ubuntu:${UBUNTU_VERSION} -ARG OPENVINO_VERSION=2024.4.0 +ARG OPENVINO_VERSION=2024.5.0 ARG PYTHON_VERSION=3.10 ADD scripts /tmp/scripts @@ -19,9 +19,9 @@ ENV IE_PLUGINS_PATH=$INTEL_OPENVINO_DIR/runtime/lib/intel64 ENV DEBIAN_FRONTEND=noninteractive RUN cd /opt && mkdir -p intel && cd intel && \ - wget https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.4/linux/l_openvino_toolkit_ubuntu22_2024.4.0.16579.c3152d32c9c_x86_64.tgz && \ - tar xzf l_openvino_toolkit_ubuntu22_2024.4.0.16579.c3152d32c9c_x86_64.tgz && rm -rf l_openvino_toolkit_ubuntu22_2024.4.0.16579.c3152d32c9c_x86_64.tgz && \ - mv l_openvino_toolkit_ubuntu22_2024.4.0.16579.c3152d32c9c_x86_64 openvino_2024.4.0 && \ + wget https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.5/linux/l_openvino_toolkit_ubuntu22_2024.5.0.17288.7975fa5da0c_x86_64.tgz && \ + tar xzf l_openvino_toolkit_ubuntu22_2024.5.0.17288.7975fa5da0c_x86_64.tgz && rm -rf l_openvino_toolkit_ubuntu22_2024.5.0.17288.7975fa5da0c_x86_64.tgz && \ + mv l_openvino_toolkit_ubuntu22_2024.5.0.17288.7975fa5da0c_x86_64 openvino_2024.5.0 && \ cd $INTEL_OPENVINO_DIR/install_dependencies && ./install_openvino_dependencies.sh -y WORKDIR /root diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py index ba125f4e2d980..11842f34ce45b 100644 --- a/tools/nuget/generate_nuspec_for_native_nuget.py +++ b/tools/nuget/generate_nuspec_for_native_nuget.py @@ -182,6 +182,8 @@ def 
generate_description(line_list, package_name): description = "This package contains Linux native shared library artifacts for ONNX Runtime with CUDA." elif "Microsoft.ML.OnnxRuntime.Gpu.Windows" in package_name: description = "This package contains Windows native shared library artifacts for ONNX Runtime with CUDA." + elif "Intel.ML.OnnxRuntime" in package_name: + description = "This package contains native shared library artifacts for ONNX Runtime with OpenVINO." elif "Microsoft.ML.OnnxRuntime" in package_name: # This is a Microsoft.ML.OnnxRuntime.* package description = ( "This package contains native shared library artifacts for all supported platforms of ONNX Runtime." @@ -715,7 +717,7 @@ def generate_files(line_list, args): ) if args.execution_provider == "openvino": - get_env_var("INTEL_OPENVINO_DIR") + openvino_path = get_env_var("INTEL_OPENVINO_DIR") files_list.append( "' ) + if is_windows(): + dll_list_path = os.path.join(openvino_path, "runtime\\bin\\intel64\\Release\\") + tbb_list_path = os.path.join(openvino_path, "runtime\\3rdparty\\tbb\\bin\\") + for dll_element in os.listdir(dll_list_path): + if dll_element.endswith("dll"): + files_list.append( + "' + ) + for tbb_element in os.listdir(tbb_list_path): + if tbb_element.endswith("dll"): + files_list.append( + "' + ) + if args.execution_provider == "cuda" or is_cuda_gpu_win_sub_package and not is_ado_packaging_build: files_list.append( "version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_3) { - graph_count = binary_info->contextBinaryInfoV3.numGraphs; - graphs_info = binary_info->contextBinaryInfoV3.graphs; - } else if (binary_info->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_2) { - graph_count = binary_info->contextBinaryInfoV2.numGraphs; - graphs_info = binary_info->contextBinaryInfoV2.graphs; - } else if (binary_info->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_1) { + if (binary_info->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_1) { graph_count = binary_info->contextBinaryInfoV1.numGraphs; graphs_info = binary_info->contextBinaryInfoV1.graphs; - } else { + } +#if QNN_API_VERSION_MAJOR == 2 && (QNN_API_VERSION_MINOR >= 15) // starts from 2.22 + else if (binary_info->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_2) { + graph_count = binary_info->contextBinaryInfoV2.numGraphs; + graphs_info = binary_info->contextBinaryInfoV2.graphs; + } +#endif +#if QNN_API_VERSION_MAJOR == 2 && (QNN_API_VERSION_MINOR >= 21) // starts from 2.28 + else if (binary_info->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_3) { + graph_count = binary_info->contextBinaryInfoV3.numGraphs; + graphs_info = binary_info->contextBinaryInfoV3.graphs; + } +#endif + else { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported context binary info version."); } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.cc b/onnxruntime/core/providers/qnn/builder/qnn_model.cc index 75973c7031d62..4f73e4c532ed4 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_model.cc @@ -325,28 +325,35 @@ Status QnnModel::DeserializeGraphInfoFromBinaryInfo(const QnnSystemContext_Graph Qnn_Tensor_t* output_tensors = nullptr; uint32_t graph_input_num = 0; uint32_t graph_output_num = 0; - if (qnn_sys_ctx_graph_info.version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_3) { - graph_name.assign(qnn_sys_ctx_graph_info.graphInfoV3.graphName); - graph_input_num = qnn_sys_ctx_graph_info.graphInfoV3.numGraphInputs; - graph_output_num = qnn_sys_ctx_graph_info.graphInfoV3.numGraphOutputs; + if 
(qnn_sys_ctx_graph_info.version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_1) { + graph_name.assign(qnn_sys_ctx_graph_info.graphInfoV1.graphName); + graph_input_num = qnn_sys_ctx_graph_info.graphInfoV1.numGraphInputs; + graph_output_num = qnn_sys_ctx_graph_info.graphInfoV1.numGraphOutputs; - input_tensors = qnn_sys_ctx_graph_info.graphInfoV3.graphInputs; - output_tensors = qnn_sys_ctx_graph_info.graphInfoV3.graphOutputs; - } else if (qnn_sys_ctx_graph_info.version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_2) { + input_tensors = qnn_sys_ctx_graph_info.graphInfoV1.graphInputs; + output_tensors = qnn_sys_ctx_graph_info.graphInfoV1.graphOutputs; + } +#if QNN_API_VERSION_MAJOR == 2 && (QNN_API_VERSION_MINOR >= 18) // start from 2.25 + else if (qnn_sys_ctx_graph_info.version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_2) { graph_name.assign(qnn_sys_ctx_graph_info.graphInfoV2.graphName); graph_input_num = qnn_sys_ctx_graph_info.graphInfoV2.numGraphInputs; graph_output_num = qnn_sys_ctx_graph_info.graphInfoV2.numGraphOutputs; input_tensors = qnn_sys_ctx_graph_info.graphInfoV2.graphInputs; output_tensors = qnn_sys_ctx_graph_info.graphInfoV2.graphOutputs; - } else if (qnn_sys_ctx_graph_info.version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_1) { - graph_name.assign(qnn_sys_ctx_graph_info.graphInfoV1.graphName); - graph_input_num = qnn_sys_ctx_graph_info.graphInfoV1.numGraphInputs; - graph_output_num = qnn_sys_ctx_graph_info.graphInfoV1.numGraphOutputs; + } +#endif +#if QNN_API_VERSION_MAJOR == 2 && (QNN_API_VERSION_MINOR >= 21) // start from 2.28 + else if (qnn_sys_ctx_graph_info.version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_3) { + graph_name.assign(qnn_sys_ctx_graph_info.graphInfoV3.graphName); + graph_input_num = qnn_sys_ctx_graph_info.graphInfoV3.numGraphInputs; + graph_output_num = qnn_sys_ctx_graph_info.graphInfoV3.numGraphOutputs; - input_tensors = qnn_sys_ctx_graph_info.graphInfoV1.graphInputs; - output_tensors = qnn_sys_ctx_graph_info.graphInfoV1.graphOutputs; - } else { + input_tensors = qnn_sys_ctx_graph_info.graphInfoV3.graphInputs; + output_tensors = qnn_sys_ctx_graph_info.graphInfoV3.graphOutputs; + } +#endif + else { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported context graph info version."); } ORT_RETURN_IF(nullptr == input_tensors, "Graph from cached context doesn't have any inputs."); From 2a36fd4f6e0aabfb4a00c4c20ab5b9bd474c60b8 Mon Sep 17 00:00:00 2001 From: Hector Li Date: Thu, 12 Dec 2024 21:12:02 -0800 Subject: [PATCH 7/8] Fix the ctx_gen tool to make sure all generated ctx.onnx have max_size (#23097) ### Description Fix the qnn_ctx_gen tool to make sure all generated ctx.onnx have max_size --- onnxruntime/test/qnn_ctx_gen/main.cc | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/onnxruntime/test/qnn_ctx_gen/main.cc b/onnxruntime/test/qnn_ctx_gen/main.cc index d568d5e78688a..3be0bd253c8a4 100644 --- a/onnxruntime/test/qnn_ctx_gen/main.cc +++ b/onnxruntime/test/qnn_ctx_gen/main.cc @@ -33,8 +33,11 @@ static void CheckStatus(const Status& status) { // from the last context cache Onnx model, find the EPContext node with main_context=1, // and get the QNN context binary file name, this context binary contains all graphs from all Onnx models +// get the max spill fill buffer size static void GetLastContextBinaryFileName(const std::basic_string last_onnx_ctx_file, - std::string& last_ctx_bin_file) { + std::string& last_ctx_bin_file, + int64_t& max_size) { + max_size = 0; std::shared_ptr ctx_model; 
CheckStatus(Model::Load(ToPathString(last_onnx_ctx_file), ctx_model, nullptr, (*((OrtEnv*)*ort_env.get())->GetEnvironment().GetLoggingManager()).DefaultLogger())); @@ -43,6 +46,7 @@ static void GetLastContextBinaryFileName(const std::basic_string last if (node.OpType() == "EPContext") { NodeAttrHelper node_helper(node); int64_t is_main_context = node_helper.Get("main_context", static_cast(0)); + max_size = node_helper.Get("max_size", static_cast(0)); if (1 == is_main_context) { last_ctx_bin_file = node_helper.Get("ep_cache_context", ""); return; @@ -55,7 +59,8 @@ static void GetLastContextBinaryFileName(const std::basic_string last // the last QNN context binary file // Remove not used QNN context binary file, only keep the last one which contains all graphs static void UpdateEpContextModel(const std::vector>& ep_ctx_files, - const std::string& last_qnn_ctx_binary_file_name) { + const std::string& last_qnn_ctx_binary_file_name, + int64_t max_size) { for (auto ep_ctx_file : ep_ctx_files) { std::shared_ptr ctx_model; auto path_str = ToPathString(ep_ctx_file); @@ -75,6 +80,8 @@ static void UpdateEpContextModel(const std::vector> std::remove(file_path.string().c_str()); node.ClearAttribute("ep_cache_context"); node.AddAttribute("ep_cache_context", last_qnn_ctx_binary_file_name); + node.ClearAttribute("max_size"); + node.AddAttribute("max_size", max_size); } } } @@ -181,7 +188,8 @@ int real_main(int argc, char* argv[]) { // Get the last context binary file name std::string last_qnn_ctx_binary_file_name; - GetLastContextBinaryFileName(ep_ctx_files.back(), last_qnn_ctx_binary_file_name); + int64_t max_size = 0; + GetLastContextBinaryFileName(ep_ctx_files.back(), last_qnn_ctx_binary_file_name, max_size); std::cout << "The last context binary file: " << last_qnn_ctx_binary_file_name << std::endl; if (last_qnn_ctx_binary_file_name.empty()) { throw Ort::Exception("Can't find QNN context binary file from the Onnx model.", OrtErrorCode::ORT_FAIL); @@ -191,7 +199,7 @@ int real_main(int argc, char* argv[]) { // Update generated context cache Onnx model to make the main EPContext node point to // the last QNN context binary file // Remove not used QNN context binary file, only keep the last one which contains all graphs - UpdateEpContextModel(ep_ctx_files, last_qnn_ctx_binary_file_name); + UpdateEpContextModel(ep_ctx_files, last_qnn_ctx_binary_file_name, max_size); } ORT_CATCH(const Ort::Exception& e) { fprintf(stderr, "Failed to generate context cache file: %s \n", e.what()); From 62e7e24f172a062242acae11575f7ea11529dd09 Mon Sep 17 00:00:00 2001 From: "genmingz@AMD" Date: Fri, 13 Dec 2024 13:13:43 +0800 Subject: [PATCH 8/8] Add attrProto.release_s interface (#22977) ### Description Add AttributeProto.release_s interface, which is used to obtain the string in the attribute using move semantics instead of copying it ### Motivation and Context The ep_context node stores a lot of information in attributes, which may cause the memory usage to increase. 
Use this interface to avoid memory waste --------- Co-authored-by: GenMing Zhong Co-authored-by: genmingz --- .../core/providers/shared_library/provider_interfaces.h | 1 + .../core/providers/shared_library/provider_wrappedtypes.h | 1 + onnxruntime/core/providers/vitisai/imp/attr_proto.cc | 4 ++++ onnxruntime/core/providers/vitisai/imp/attr_proto.h | 1 + onnxruntime/core/providers/vitisai/imp/global_api.cc | 7 +++++++ .../core/providers/vitisai/include/vaip/vaip_ort_api.h | 3 ++- onnxruntime/core/session/provider_bridge_ort.cc | 1 + 7 files changed, 17 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 7ab93d56cfe26..d182d0b9173bd 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -390,6 +390,7 @@ struct ProviderHost { virtual void AttributeProto__set_name(ONNX_NAMESPACE::AttributeProto* p, const ::std::string& value) = 0; virtual void AttributeProto__set_type(ONNX_NAMESPACE::AttributeProto* p, ONNX_NAMESPACE::AttributeProto_AttributeType value) = 0; virtual ONNX_NAMESPACE::TensorProto* AttributeProto__add_tensors(ONNX_NAMESPACE::AttributeProto* p) = 0; + virtual std::string* AttributeProto__release_s(ONNX_NAMESPACE::AttributeProto* p) = 0; // GraphProto virtual std::unique_ptr GraphProto__construct() = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index a82ddfe64c64b..54249f0864cd7 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -122,6 +122,7 @@ struct AttributeProto final { void set_name(const ::std::string& value) { return g_host->AttributeProto__set_name(this, value); } void set_type(AttributeProto_AttributeType value) { return g_host->AttributeProto__set_type(this, value); } TensorProto* add_tensors() { return g_host->AttributeProto__add_tensors(this); } + std::string* release_s() { return g_host->AttributeProto__release_s(this); } typedef AttributeProto_AttributeType AttributeType; static constexpr AttributeType UNDEFINED = AttributeProto_AttributeType_UNDEFINED; diff --git a/onnxruntime/core/providers/vitisai/imp/attr_proto.cc b/onnxruntime/core/providers/vitisai/imp/attr_proto.cc index a9275b24ce91f..2b9ddf8ad147f 100644 --- a/onnxruntime/core/providers/vitisai/imp/attr_proto.cc +++ b/onnxruntime/core/providers/vitisai/imp/attr_proto.cc @@ -104,4 +104,8 @@ std::vector attr_proto_get_strings(const ONNX_NAMESPACE::AttributeP } return ret; } +std::string* attr_proto_release_string(ONNX_NAMESPACE::AttributeProto* attr) { + vai_assert(attr->type() == ONNX_NAMESPACE::AttributeProto_AttributeType_STRING, attr->name()); + return attr->release_s(); +} } // namespace vaip diff --git a/onnxruntime/core/providers/vitisai/imp/attr_proto.h b/onnxruntime/core/providers/vitisai/imp/attr_proto.h index bb2883512037b..08d980ec94c14 100644 --- a/onnxruntime/core/providers/vitisai/imp/attr_proto.h +++ b/onnxruntime/core/providers/vitisai/imp/attr_proto.h @@ -23,5 +23,6 @@ const ONNX_NAMESPACE::TensorProto& attr_proto_get_tensor(const ONNX_NAMESPACE::A gsl::span attr_proto_get_ints(const ONNX_NAMESPACE::AttributeProto& attr); gsl::span attr_proto_get_floats(const ONNX_NAMESPACE::AttributeProto& attr); std::vector attr_proto_get_strings(const ONNX_NAMESPACE::AttributeProto& 
attr); +std::string* attr_proto_release_string(ONNX_NAMESPACE::AttributeProto* attr); } // namespace vaip diff --git a/onnxruntime/core/providers/vitisai/imp/global_api.cc b/onnxruntime/core/providers/vitisai/imp/global_api.cc index 3e802e5a77203..51dc79c569589 100644 --- a/onnxruntime/core/providers/vitisai/imp/global_api.cc +++ b/onnxruntime/core/providers/vitisai/imp/global_api.cc @@ -449,6 +449,13 @@ vaip_core::OrtApiForVaip* create_org_api_hook() { return vaip_core::DllSafe(model_proto.SerializeAsString()); }; the_global_api.model_proto_delete = [](ONNX_NAMESPACE::ModelProto* p) { delete p; }; + the_global_api.attr_proto_release_string = [](ONNX_NAMESPACE::AttributeProto* attr) -> vaip_core::DllSafe { + auto pstr = vaip::attr_proto_release_string(attr); + std::string local_str = std::move(*pstr); + pstr = nullptr; + return vaip_core::DllSafe(std::move(local_str)); + }; + if (!s_library_vitisaiep.vaip_get_version) { return reinterpret_cast(&(the_global_api.host_)); } else { diff --git a/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h b/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h index 288cfd6850d06..9425c08dceebc 100644 --- a/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h +++ b/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h @@ -13,7 +13,7 @@ struct OrtApi; namespace vaip_core { -#define VAIP_ORT_API_MAJOR (11u) +#define VAIP_ORT_API_MAJOR (12u) #define VAIP_ORT_API_MINOR (0u) #define VAIP_ORT_API_PATCH (0u) struct OrtApiForVaip { @@ -234,6 +234,7 @@ struct OrtApiForVaip { ModelProto* (*model_to_proto)(Model& model); // [95] DllSafe (*model_proto_serialize_as_string)(ModelProto& model_proto); // [96] void (*model_proto_delete)(ModelProto* p); // [97] + DllSafe (*attr_proto_release_string)(AttributeProto* attr); // [98] }; #ifndef USE_VITISAI diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index e0c479dbc7637..1444c1976d447 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -497,6 +497,7 @@ struct ProviderHostImpl : ProviderHost { void AttributeProto__set_name(ONNX_NAMESPACE::AttributeProto* p, const ::std::string& value) override { return p->set_name(value); } void AttributeProto__set_type(ONNX_NAMESPACE::AttributeProto* p, ONNX_NAMESPACE::AttributeProto_AttributeType value) override { return p->set_type(value); } ONNX_NAMESPACE::TensorProto* AttributeProto__add_tensors(ONNX_NAMESPACE::AttributeProto* p) override { return p->add_tensors(); } + std::string* AttributeProto__release_s(ONNX_NAMESPACE::AttributeProto* p) override { return p->release_s(); } // GraphProto (wrapped) std::unique_ptr GraphProto__construct() override { return std::make_unique(); }
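// Editor's illustrative sketch (not part of the patches above): how a caller might take ownership of a
// large `ep_cache_context` attribute string with the new release_s() accessor instead of copying it.
// `ExtractContextBlob` and its parameter are hypothetical, and the include path is an assumption;
// protobuf's release_s() hands back a heap-allocated std::string* that the caller owns, so it is wrapped
// in a unique_ptr here.
#include <memory>
#include <string>
#include <utility>
#include "onnx/onnx_pb.h"  // assumed include path for ONNX_NAMESPACE::AttributeProto

std::string ExtractContextBlob(ONNX_NAMESPACE::AttributeProto& ep_ctx_attr) {
  // ep_ctx_attr.s() returns a const reference that would still have to be copied to outlive the proto;
  // release_s() detaches the underlying string, so no second copy of the (potentially large) blob is made.
  std::unique_ptr<std::string> released(ep_ctx_attr.release_s());
  return std::move(*released);
}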