Skip to content

Commit

Permalink
OVEP 1.21.0 Development Updates (microsoft#23080)
Browse files Browse the repository at this point in the history
### Description
OVEP development changes for ORT 1.21 Release
 
 
### Motivation and Context
- Includes critical bug fixes
- Improved performance optimizations for both memory usage and inference latency
(intel#513)
- Enabled Model Compilation using NPUW
(intel#508)
- Fixed support for EPContext embed mode 0 for lower memory utilization
- Updated the NuGet package name to `Intel.ML.OnnxRuntime.OpenVino`
- Fixed QDQ Stripping logic on NPU
  • Loading branch information
ankitm3k authored Dec 12, 2024
1 parent ebb968d commit 1f88284
Show file tree
Hide file tree
Showing 21 changed files with 138 additions and 52 deletions.
4 changes: 2 additions & 2 deletions cmake/onnxruntime_providers_openvino.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@

# Header paths
find_package(OpenVINO REQUIRED COMPONENTS Runtime ONNX)
if(OpenVINO_VERSION VERSION_LESS 2024.3)
message(FATAL_ERROR "OpenVINO 2024.3 and newer are supported. Please, use latest OpenVINO release")
if(OpenVINO_VERSION VERSION_LESS 2024.4)
message(FATAL_ERROR "OpenVINO 2024.4 and newer are supported. Please, use latest OpenVINO release")
endif()

if(OpenVINO_VERSION VERSION_GREATER_EQUAL 2024.4)
Expand Down
5 changes: 4 additions & 1 deletion onnxruntime/core/providers/openvino/backend_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,10 @@ BackendManager::BackendManager(const GlobalContext& global_context,
i++;
}
subgraph_context_.subgraph_name = fused_node.Name();
auto model_proto = GetModelProtoFromFusedNode(fused_node, subgraph, logger);
std::unique_ptr<onnx::ModelProto> model_proto;
if (!ep_ctx_handle_.IsValidOVEPCtxGraph()) {
model_proto = GetModelProtoFromFusedNode(fused_node, subgraph, logger);
}
std::string device_type = openvino_ep::BackendManager::GetGlobalContext().device_type;

if (ModelHasSymbolicInputDims(subgraph)) {
Expand Down
16 changes: 8 additions & 8 deletions onnxruntime/core/providers/openvino/backend_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,21 +39,21 @@ struct static_cast_int64 {
int64_t operator()(const T1& x) const { return static_cast<int64_t>(x); }
};

std::shared_ptr<OVNetwork>
std::shared_ptr<const OVNetwork>
CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext& global_context,
std::map<std::string, std::shared_ptr<ov::Node>>& const_outputs_map) {
if (IsCILogEnabled()) {
std::cout << "CreateNgraphFunc" << std::endl;
}
const std::string model = model_proto.SerializeAsString();
try {
auto cnn_network = global_context.ie_core.ReadModel(model, global_context.onnx_model_path_name);
auto ov_model = global_context.ie_core.ReadModel(model, global_context.onnx_model_path_name);

// Check for Constant Folding
if (!global_context.is_wholly_supported_graph) {
if ((global_context.device_type != "NPU") && !global_context.is_wholly_supported_graph) {
ov::pass::ConstantFolding pass_const_obj;
pass_const_obj.run_on_model(cnn_network);
auto& results = const_cast<ov::ResultVector&>(cnn_network.get()->get_results());
pass_const_obj.run_on_model(ov_model);
auto& results = const_cast<ov::ResultVector&>(ov_model.get()->get_results());
size_t index = results.size() - 1;

for (auto it = results.rbegin(); it != results.rend(); ++it) {
Expand All @@ -67,12 +67,12 @@ CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext
}
#ifndef NDEBUG
if (IsDebugEnabled()) {
std::string name = cnn_network->get_friendly_name();
std::string name = ov_model->get_friendly_name();
ov::pass::Serialize serializer(name + ".xml", name + ".bin");
serializer.run_on_model(cnn_network);
serializer.run_on_model(ov_model);
}
#endif
return cnn_network;
return ov_model;
} catch (std::string const& msg) {
ORT_THROW(msg);
}
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/openvino/backend_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ void FillInputBlob(OVTensorPtr inputBlob, size_t batch_slice_idx,
void FillOutputBlob(OVTensorPtr outputBlob, Ort::UnownedValue& output_tensor,
size_t batch_slice_idx);

std::shared_ptr<OVNetwork>
std::shared_ptr<const OVNetwork>
CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto,
const GlobalContext& global_context,
std::map<std::string, std::shared_ptr<ov::Node>>& const_outputs_map);
Expand Down
57 changes: 48 additions & 9 deletions onnxruntime/core/providers/openvino/backends/basic_backend.cc
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,16 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
// Set the inference_num_threads property of the CPU
SetNumThreads(device_config);

auto npuw_status =
std::any_of(device_config.begin(), device_config.end(), [&](const std::pair<std::string, ov::Any>& pair) {
return (pair.first.find("NPU_USE_NPUW") != std::string::npos) && (pair.second.is<std::string>()) &&
(pair.second.as<std::string>() == "YES");
});

if (npuw_status) {
LOGS_DEFAULT(INFO) << log_tag << "NPUW Enabled during compilation";
}

try {
std::string dev_prec = global_context.device_type + "_" + global_context_.precision_str;

Expand Down Expand Up @@ -81,7 +91,6 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
device_config,
global_context_.ep_context_embed_mode,
subgraph_context_.subgraph_name);
ie_cnn_network_ = exe_network_.Get().get_runtime_model();
} else if (global_context_.export_ep_ctx_blob &&
hw_target.find("NPU") != std::string::npos &&
!global_context_.has_external_weights) {
Expand All @@ -106,15 +115,15 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
device_config,
subgraph_context_.subgraph_name);
} else { // For all other types use ov::Model Type
ie_cnn_network_ = CreateOVModel(*model_proto, global_context_, const_outputs_map_);
auto ov_model = CreateOVModel(*model_proto, global_context_, const_outputs_map_);
exe_network_ = global_context_.ie_core.CompileModel(
ie_cnn_network_, hw_target, device_config, subgraph_context_.subgraph_name);
ov_model, hw_target, device_config, subgraph_context_.subgraph_name);
}
#endif
} else { // Full graph is not supported
ie_cnn_network_ = CreateOVModel(*model_proto, global_context_, const_outputs_map_);
auto ov_model = CreateOVModel(*model_proto, global_context_, const_outputs_map_);
exe_network_ = global_context_.ie_core.CompileModel(
ie_cnn_network_, hw_target, device_config, subgraph_context_.subgraph_name);
ov_model, hw_target, device_config, subgraph_context_.subgraph_name);
}
LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin";
} catch (const char* msg) {
Expand Down Expand Up @@ -145,8 +154,8 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) {
device_config.emplace(ov::hint::inference_precision("f32"));
}
if (global_context_.precision_str.find("ACCURACY") != std::string::npos &&
global_context_.device_type == "GPU") {
if (global_context_.OpenVINO_Version.at(0) >= 2024 && global_context_.OpenVINO_Version.at(1) >= 1) {
global_context_.device_type.find("GPU") != std::string::npos) {
if (global_context_.OpenVINO_Version.at(0) >= 2024) {
device_config.emplace(ov::hint::inference_precision(ov::element::undefined));
device_config.emplace(ov::hint::execution_mode(ov::hint::ExecutionMode::ACCURACY));
} else {
Expand Down Expand Up @@ -174,7 +183,7 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) {
device_property = std::make_pair("NPU_COMPILER_TYPE", env_npu_compiler_type);
}
device_config.emplace(ov::device::properties("NPU", device_property));
#if (OPENVINO_VERSION_MAJOR >= 2024) && (OPENVINO_VERSION_MINOR > 3)
#if (((OPENVINO_VERSION_MAJOR == 2024) && (OPENVINO_VERSION_MINOR > 3)) || (OPENVINO_VERSION_MAJOR > 2024))
if (global_context_.export_ep_ctx_blob) {
global_context_.ie_core.Get().set_property("NPU", ov::intel_npu::bypass_umd_caching(true));
}
Expand All @@ -184,6 +193,33 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) {
if (!global_context_.load_config.empty()) {
const std::map<std::string, ov::AnyMap>& target_config = global_context_.load_config;

if (global_context_.device_type.find("NPU") != std::string::npos) {
  // The "NPU" section of load_config is optional (the config may only carry
  // settings for other devices), so look it up with find(); at() would throw
  // std::out_of_range when the section is missing.
  const auto npu_section = target_config.find("NPU");
  if (npu_section != target_config.end()) {
    // Bind by reference: the property map is only read here, no copy needed.
    const ov::AnyMap& npuw_config = npu_section->second;

    // Check if "NPU_USE_NPUW" exists and is set to "YES"
    const auto npu_use_npuw_it = npuw_config.find("NPU_USE_NPUW");
    if (npu_use_npuw_it != npuw_config.end() &&
        npu_use_npuw_it->second.is<std::string>() &&
        npu_use_npuw_it->second.as<std::string>() == "YES") {
      // Only add NPUW-related keys if NPU_USE_NPUW is "YES"
      for (const auto& [key, value] : npuw_config) {
        if (key.find("NPUW") != std::string::npos) {
          // Only string-typed values are forwarded; anything else is reported and dropped.
          if (!value.is<std::string>()) {
            LOGS_DEFAULT(ERROR) << "Invalid value type for key: " << key;
            continue;
          }
          device_config[key] = value;
        }
      }
    } else {
      // Check if there are any "NPUW" keys and log a warning
      if (std::any_of(npuw_config.begin(), npuw_config.end(),
                      [&](const auto& pair) { return pair.first.find("NPUW") != std::string::npos; })) {
        LOGS_DEFAULT(WARNING) << "Skipping NPUW-related configurations as NPU_USE_NPUW is not set to 'YES'.";
      }
    }
  }
}

// Parse device types like "AUTO:CPU,GPU" and extract individual devices
auto parse_individual_devices = [&](const std::string& device_type) -> std::vector<std::string> {
std::vector<std::string> devices;
Expand Down Expand Up @@ -213,6 +249,9 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) {
auto set_target_properties = [&](const std::string& device, const ov::AnyMap& config_options,
const std::vector<ov::PropertyName>& supported_properties) {
for (const auto& [key, value] : config_options) {
if (key.find("NPUW") != std::string::npos) {
continue;
}
if (is_supported_and_mutable(key, supported_properties)) {
global_context_.ie_core.Get().set_property(device, ov::AnyMap{{key, value}});
} else {
Expand Down Expand Up @@ -378,7 +417,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
if ((it == ort_ov_tensor_map.end()) ||
(it != ort_ov_tensor_map.end() && (it->second.ort_ptr != tensor.GetTensorRawData()))) {
ov_tensor_data_t ov_tensor_data;
auto input = graph_input_info.at(input_idx);
const auto& input = graph_input_info.at(input_idx);
ov_tensor_data.tensor_ptr = std::make_shared<ov::Tensor>(input.get_element_type(), input.get_shape(),
const_cast<void*>(tensor.GetTensorRawData()));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ class BasicBackend : public IBackend {
GlobalContext& global_context_;
SubGraphContext subgraph_context_;
mutable std::mutex compute_lock_;
std::shared_ptr<const OVNetwork> ie_cnn_network_;
OVExeNetwork exe_network_;
std::map<std::string, std::shared_ptr<ov::Node>> const_outputs_map_;
std::unique_ptr<InferRequestsQueue> inferRequestsQueue_;
Expand Down
13 changes: 12 additions & 1 deletion onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -99,14 +99,25 @@ Status EPCtxHandler::ImportBlobFromEPCtxModel(const GraphViewer& graph_viewer, b
auto node = graph_viewer.GetNode(0);
auto& attrs = node->GetAttributes();
ORT_ENFORCE(attrs.count(EP_CACHE_CONTEXT) > 0);
model_stream_ = std::make_shared<std::istringstream>(attrs.at(EP_CACHE_CONTEXT).s());

ep_cache_context_attribute_ = &attrs.at(EP_CACHE_CONTEXT);

ep_context_embed_mode = static_cast<bool>(attrs.at(EMBED_MODE).i());
LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Read blob from EPContext Node";

is_valid_ep_ctx_graph_ = true;
return Status::OK();
}

// Returns the string payload of the recorded EP_CACHE_CONTEXT attribute, or a
// reference to an empty string when no attribute has been recorded.
const std::string& EPCtxHandler::GetModelBlobStream() const {
  // Function-local static so the fallback can be returned by reference.
  static std::string empty;
  if (ep_cache_context_attribute_ == nullptr) {
    return empty;
  }
  return ep_cache_context_attribute_->s();
}

bool EPCtxHandler::CheckForOVEPCtxNode(const GraphViewer& graph_viewer, std::string openvino_sdk_version) const {
for (int i = 0; i < graph_viewer.MaxNodeIndex(); ++i) {
auto node = graph_viewer.GetNode(i);
Expand Down
6 changes: 3 additions & 3 deletions onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ static const char SOURCE[] = "source";
class EPCtxHandler {
public:
EPCtxHandler() = default;
EPCtxHandler(const EPCtxHandler&) = default;
EPCtxHandler(const EPCtxHandler&) = delete;
Status ExportEPCtxModel(const GraphViewer& graph_viewer,
const std::string& graph_name,
const logging::Logger& logger,
Expand All @@ -33,11 +33,11 @@ class EPCtxHandler {
Status ImportBlobFromEPCtxModel(const GraphViewer& graph_viewer, bool& ep_context_embed_mode);
bool CheckForOVEPCtxNode(const GraphViewer& graph_viewer, std::string openvino_sdk_version) const;
bool IsValidOVEPCtxGraph() const { return is_valid_ep_ctx_graph_; }
[[nodiscard]] const std::shared_ptr<std::istringstream> GetModelBlobStream() const { return model_stream_; }
const std::string& GetModelBlobStream() const;

private:
bool is_valid_ep_ctx_graph_{false};
std::shared_ptr<std::istringstream> model_stream_;
// Non-owning pointer to the node's EP_CACHE_CONTEXT attribute. It is compared
// against nullptr by GetModelBlobStream(), so it must start out null rather
// than indeterminate on a default-constructed handler.
const onnx::AttributeProto* ep_cache_context_attribute_{nullptr};
};

} // namespace openvino_ep
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ struct OpenVINOExecutionProviderInfo {
device_type_ = std::move(dev_type);
} else if (dev_type.find("HETERO") == 0 || dev_type.find("MULTI") == 0 || dev_type.find("AUTO") == 0) {
std::vector<std::string> devices = parseDevices(dev_type, available_devices);
device_type_ = dev_type;
device_type_ = std::move(dev_type);
} else {
ORT_THROW("Invalid device string: " + dev_type);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ std::unique_ptr<IExecutionProvider> OpenVINOProviderFactory::CreateProvider() {
std::string so_cache_path = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "").c_str();

if (so_export_ep_ctx_blob && !so_cache_path.empty()) {
cache_dir_ = so_cache_path;
cache_dir_ = std::move(so_cache_path);
auto file_path = std::filesystem::path(cache_dir_);
// ep_context_file_path_ file extension must be .onnx
if (file_path.extension().generic_string() == ".onnx") {
Expand Down Expand Up @@ -248,7 +248,7 @@ struct OpenVINO_Provider : Provider {
LOGS_DEFAULT(WARNING) << "Unsupported JSON value type for key: " << inner_key << ". Skipping key.";
}
}
target_map[key] = inner_map;
target_map[key] = std::move(inner_map);
}
} catch (const nlohmann::json::parse_error& e) {
// Handle syntax errors in JSON
Expand Down
1 change: 0 additions & 1 deletion onnxruntime/core/providers/openvino/ov_allocator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ void* OVRTAllocator::Alloc(size_t size) {
} catch (const ov::Exception& e) {
ORT_THROW(std::string("Alloc failed: ") + e.what());
}
return nullptr;
}

void OVRTAllocator::Free(void* p) {
Expand Down
8 changes: 4 additions & 4 deletions onnxruntime/core/providers/openvino/ov_interface.cc
Original file line number Diff line number Diff line change
Expand Up @@ -109,18 +109,18 @@ OVExeNetwork OVCore::CompileModel(const std::string& onnx_model,
}
}

OVExeNetwork OVCore::ImportModel(std::shared_ptr<std::istringstream> model_stream,
OVExeNetwork OVCore::ImportModel(const std::string& model_string,
std::string hw_target,
const ov::AnyMap& device_config,
bool embed_mode,
std::string name) {
try {
ov::CompiledModel obj;
if (embed_mode) {
obj = oe.import_model(*model_stream, hw_target, device_config);
std::istringstream model_stream(model_string);
obj = oe.import_model(model_stream, hw_target, device_config);
} else {
std::string blob_file_path = (*model_stream).str();
std::ifstream modelStream(blob_file_path, std::ios_base::binary | std::ios_base::in);
std::ifstream modelStream(model_string, std::ios_base::binary | std::ios_base::in);
obj = oe.import_model(modelStream,
hw_target,
{});
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/openvino/ov_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ class OVCore {
ov::AnyMap& device_config,
const std::string& name);
// OV Interface for Import model Stream
OVExeNetwork ImportModel(std::shared_ptr<std::istringstream> model_stream,
OVExeNetwork ImportModel(const std::string& model_string,
std::string hw_target,
const ov::AnyMap& device_config,
bool embed_mode,
Expand Down
8 changes: 4 additions & 4 deletions onnxruntime/core/providers/openvino/ov_versions/capability.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,14 @@ GetCapability::GetCapability(const GraphViewer& graph_viewer_param,
device_type_ = "CPU";
if (enable_qdq_optimizer) npu_qdq_optimizer_enabled = true;
}
#if OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 3
data_ops_ = new DataOps(graph_viewer_, V_2024_3, device_type_, npu_qdq_optimizer_enabled);
#elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 4
#if OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 4
data_ops_ = new DataOps(graph_viewer_, V_2024_4, device_type_, npu_qdq_optimizer_enabled);
#elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 5
data_ops_ = new DataOps(graph_viewer_, V_2024_5, device_type_, npu_qdq_optimizer_enabled);
#elif OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR == 0
data_ops_ = new DataOps(graph_viewer_, V_2025_0, device_type_, npu_qdq_optimizer_enabled);
#else
data_ops_ = new DataOps(graph_viewer_, V_2024_5, device_type_, npu_qdq_optimizer_enabled);
data_ops_ = new DataOps(graph_viewer_, V_2025_0, device_type_, npu_qdq_optimizer_enabled);
#endif
}

Expand Down
12 changes: 8 additions & 4 deletions onnxruntime/core/providers/openvino/ov_versions/data_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,7 @@ void DataOps::populate_op_mode_supported() {
no_dimension_supported_.push_back({"Floor", V_2020_4, {"All"}});
no_dimension_supported_.push_back({"Gather", V_2020_4, {"All"}});
no_dimension_supported_.push_back({"Identity", V_2023_0, {"All"}});
no_dimension_supported_.push_back({"If", V_2022_3, {"CPU", "GPU"}});
no_dimension_supported_.push_back({"Less", V_2022_1, {"CPU"}});
no_dimension_supported_.push_back({"Loop", V_2021_4, {"All"}});
no_dimension_supported_.push_back({"Min", V_2020_4, {"All"}});
Expand Down Expand Up @@ -387,7 +388,7 @@ void DataOps::populate_op_mode_supported() {

// populate unsupportedmode_t
{
UnsupportedOpMode obj = {{V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5},
UnsupportedOpMode obj = {{V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5, V_2025_0},
[this](const Node* node, const InitializedTensorSet&) {
// If the Input of ReduceMax op is UINT8, it is rejected (Due to output mismatch)
for (size_t i = 0; i < node->InputDefs().size(); i++) {
Expand All @@ -402,7 +403,8 @@ void DataOps::populate_op_mode_supported() {
op_list_.insert({"ReduceMax", obj});
}
{
UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5},
UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2,
V_2024_3, V_2024_4, V_2024_5, V_2025_0},
[this](const Node* node, const InitializedTensorSet&) {
const auto& input_arg = node->InputDefs()[1];
auto shape = input_arg->Shape();
Expand All @@ -419,7 +421,8 @@ void DataOps::populate_op_mode_supported() {
op_list_.insert({"Reshape", obj});
}
{
UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5},
UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2,
V_2024_3, V_2024_4, V_2024_5, V_2025_0},
[this](const Node* node, const InitializedTensorSet&) {
// If the operator is unsqueeze
// If axes is an input, then we cannot produce a static graph.
Expand All @@ -434,7 +437,8 @@ void DataOps::populate_op_mode_supported() {
op_list_.insert({"Unsqueeze", obj});
}
{
UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5},
UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5,
V_2025_0},
[this](const Node* node, const InitializedTensorSet&) {
// check for attributes
auto& upsample_attr = node->GetAttributes();
Expand Down
3 changes: 2 additions & 1 deletion onnxruntime/core/providers/openvino/ov_versions/data_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ enum versionNum {
V_2024_2,
V_2024_3,
V_2024_4,
V_2024_5
V_2024_5,
V_2025_0
};

using VersionNum = enum versionNum;
Expand Down
Loading

0 comments on commit 1f88284

Please sign in to comment.