OVEP release LNL 1.2.1 (microsoft#22027)
Error codes are added to catch compilation errors and signal that a recompile is needed.
Remote tensors are added to enable direct memory access for NPU inferencing.
UMD bypass caching, enabled with OpenVINO 2024.4, eliminates the need for disk caching (a minimal sketch follows below).
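
A minimal sketch of the UMD bypass setting this release turns on when an EP context blob is exported (standalone OpenVINO usage, assuming version 2024.4+; the header path and model path are illustrative, see basic_backend.cc below for the in-tree equivalent):

```cpp
#include <openvino/openvino.hpp>
#include <openvino/runtime/intel_npu/properties.hpp>  // assumed header for ov::intel_npu properties

int main() {
  ov::Core core;
  // Ask the NPU plugin to skip its user-mode-driver (UMD) cache so no duplicate
  // compiled blob is written to disk alongside the exported EP context blob.
  core.set_property("NPU", ov::intel_npu::bypass_umd_caching(true));

  auto model = core.read_model("model.xml");         // hypothetical model path
  auto compiled = core.compile_model(model, "NPU");   // compiles without populating the UMD cache
  (void)compiled;
  return 0;
}
```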

### Motivation and Context
These changes are needed to ensure backward compatibility.
UMD bypass caching eliminates driver-level caching.
Remote tensors improve inference performance on NPU.

---------

Co-authored-by: Preetha Veeramalai <[email protected]>
Co-authored-by: Srirammaswamy <[email protected]>
Co-authored-by: saurabh <[email protected]>
Co-authored-by: Javier E. Martinez <[email protected]>
Co-authored-by: Eric Crawford <[email protected]>
Co-authored-by: jatinwadhwa921 <[email protected]>
7 people authored Sep 11, 2024
1 parent b800328 commit 0309c5f
Showing 13 changed files with 338 additions and 43 deletions.
4 changes: 4 additions & 0 deletions cmake/onnxruntime_providers_openvino.cmake
@@ -21,6 +21,10 @@
message(FATAL_ERROR "OpenVINO 2024.0 and newer are supported. Please, use latest OpenVINO release")
endif()

if(OpenVINO_VERSION VERSION_GREATER_EQUAL 2024.4)
add_definitions(-DUSE_OVEP_NPU_MEMORY=1)
endif()

if (WIN32)
unset(CMAKE_MAP_IMPORTED_CONFIG_RELWITHDEBINFO)
endif()
2 changes: 2 additions & 0 deletions include/onnxruntime/core/framework/allocator.h
@@ -50,6 +50,8 @@ constexpr const char* HIP = "Hip";
constexpr const char* HIP_PINNED = "HipPinned";
constexpr const char* OpenVINO_CPU = "OpenVINO_CPU";
constexpr const char* OpenVINO_GPU = "OpenVINO_GPU";
constexpr const char* OpenVINO_RT = "OpenVINO_RT";
constexpr const char* OpenVINO_RT_NPU = "OpenVINO_RT_NPU";
constexpr const char* WEBGPU_BUFFER = "WebGPU_Buffer";

constexpr size_t kAllocAlignment = 256;
4 changes: 4 additions & 0 deletions onnxruntime/core/framework/allocator.cc
@@ -145,6 +145,10 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtA
*out = new OrtMemoryInfo(
name1, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, static_cast<OrtDevice::DeviceId>(id1)), id1,
mem_type1);
} else if (strcmp(name1, onnxruntime::OpenVINO_RT_NPU) == 0) {
*out = new OrtMemoryInfo(
name1, type, OrtDevice(OrtDevice::NPU, OrtDevice::MemType::DEFAULT, static_cast<OrtDevice::DeviceId>(id1)), id1,
mem_type1);
} else if (strcmp(name1, onnxruntime::CUDA_PINNED) == 0) {
*out = new OrtMemoryInfo(
onnxruntime::CUDA_PINNED, type, OrtDevice(OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, static_cast<OrtDevice::DeviceId>(id1)),
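
The two hunks above register "OpenVINO_RT_NPU" as an allocator name that resolves to an NPU OrtDevice. A minimal sketch of constructing the matching memory info from application code (C++ API; the device id 0 is illustrative):

```cpp
#include <onnxruntime_cxx_api.h>

int main() {
  // "OpenVINO_RT_NPU" is now recognized by CreateMemoryInfo and mapped to OrtDevice::NPU.
  Ort::MemoryInfo npu_mem_info("OpenVINO_RT_NPU", OrtDeviceAllocator,
                               /*device_id=*/0, OrtMemTypeDefault);
  // This memory info identifies NPU-resident buffers served by the OpenVINO EP allocator.
  return 0;
}
```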
38 changes: 32 additions & 6 deletions onnxruntime/core/providers/openvino/backend_manager.cc
@@ -5,6 +5,7 @@
#include <algorithm>
#include <cassert>
#include <fstream>
#include <regex>
#include <sstream>
#include <unordered_map>
#include <unordered_set>
@@ -107,12 +108,15 @@ BackendManager::BackendManager(const GlobalContext& global_context,
subgraph_context_,
ep_ctx_handle_);
} catch (const OnnxRuntimeException& ex) {
std::string exception_str = ex.what();
bool eligible_for_cpu_fallback = device_type.find("NPU") != std::string::npos &&
!GetGlobalContext().disable_cpu_fallback &&
!ep_ctx_handle_.IsValidOVEPCtxGraph();
#if defined(OPENVINO_DISABLE_NPU_FALLBACK)
ORT_THROW(ex.what());
eligible_for_cpu_fallback = false;
#else
if (device_type.find("NPU") != std::string::npos &&
!GetGlobalContext().disable_cpu_fallback) {
LOGS_DEFAULT(WARNING) << ex.what();
if (eligible_for_cpu_fallback) {
LOGS_DEFAULT(VERBOSE) << exception_str;
LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU."
<< "Falling back to OV CPU for execution";
GetGlobalContext().device_type = "CPU";
@@ -125,10 +129,32 @@ BackendManager::BackendManager(const GlobalContext& global_context,
} catch (std::string const& msg) {
ORT_THROW(msg);
}
} else {
ORT_THROW(ex.what());
}
#endif
if (!eligible_for_cpu_fallback) {
if (device_type.find("NPU") != std::string::npos &&
exception_str.find("intel_npu") != std::string::npos) {
// Handle NPU device related errors
#ifndef NDEBUG
ORT_THROW(exception_str + "\nModel needs to be recompiled\n");
#else
std::string error_message = "UNKNOWN NPU ERROR";
std::string error_code = "code 0x0";
std::regex error_message_pattern(R"(\bZE_\w*\b)");
std::regex error_code_pattern("code 0x[0-9a-fA-F]+");
std::smatch matches;
if (std::regex_search(exception_str, matches, error_message_pattern)) {
error_message = matches[0];
}
if (std::regex_search(exception_str, matches, error_code_pattern)) {
error_code = matches[0];
}
throw std::runtime_error(error_message + ", " + error_code + "\nModel needs to be recompiled\n");
#endif
} else {
ORT_THROW(exception_str);
}
}
}
}
if (global_context_.export_ep_ctx_blob && !ep_ctx_handle_.IsValidOVEPCtxGraph()) {
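
In release builds, the new handler condenses intel_npu driver exceptions to a Level Zero result name plus error code before asking for a recompile. A standalone sketch of that extraction (the sample exception text is invented):

```cpp
#include <iostream>
#include <regex>
#include <string>

int main() {
  // Invented example of an intel_npu plugin exception message.
  std::string exception_str =
      "intel_npu: ZE_RESULT_ERROR_DEVICE_LOST, code 0x70000001: graph execution failed";

  std::string error_message = "UNKNOWN NPU ERROR";
  std::string error_code = "code 0x0";
  std::regex error_message_pattern(R"(\bZE_\w*\b)");     // Level Zero result name, e.g. ZE_RESULT_ERROR_*
  std::regex error_code_pattern("code 0x[0-9a-fA-F]+");  // hexadecimal driver error code
  std::smatch matches;
  if (std::regex_search(exception_str, matches, error_message_pattern)) {
    error_message = matches[0];
  }
  if (std::regex_search(exception_str, matches, error_code_pattern)) {
    error_code = matches[0];
  }
  // Prints the condensed form followed by the recompile hint.
  std::cout << error_message << ", " << error_code << "\nModel needs to be recompiled\n";
  return 0;
}
```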
161 changes: 134 additions & 27 deletions onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -48,14 +48,6 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
// Set the inference_num_threads property of the CPU
SetNumThreads(device_config);

#ifndef NDEBUG
if (IsDebugEnabled()) {
std::string file_name = subgraph_context.subgraph_name + "_static.onnx";
std::fstream outfile(file_name, std::ios::out | std::ios::trunc | std::ios::binary);
model_proto->SerializeToOstream(outfile);
}
#endif

try {
std::string dev_prec = global_context.device_type + "_" + global_context_.precision_str;

@@ -180,6 +172,11 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) {
device_property = std::make_pair("NPU_COMPILER_TYPE", env_npu_compiler_type);
}
device_config.emplace(ov::device::properties("NPU", device_property));
#if (OPENVINO_VERSION_MAJOR >= 2024) && (OPENVINO_VERSION_MINOR > 3)
if (global_context_.export_ep_ctx_blob) {
global_context_.ie_core.Get().set_property("NPU", ov::intel_npu::bypass_umd_caching(true));
}
#endif
}
}

@@ -295,16 +292,104 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
ORT_THROW(msg);
}
} else {
OVTensorPtr graph_input_blob;
try {
graph_input_blob = infer_request->GetTensor(input_name);
} catch (const char* msg) {
ORT_THROW(msg);
if ((global_context_.device_type.find("CPU") != std::string::npos ||
global_context_.device_type.find("GPU") != std::string::npos)) {
OVTensorPtr graph_input_blob;
try {
graph_input_blob = infer_request->GetTensor(input_name);
} catch (const char* msg) {
ORT_THROW(msg);
}
FillInputBlob(std::move(graph_input_blob), batch_slice_idx, std::move(input_name), context, subgraph_context_);
} else {
auto tensor = context.GetInput(subgraph_context_.input_names.at(input_name));
auto allocator_name = tensor.GetTensorMemoryInfo().GetAllocatorName();
ov_tensor_data_t ov_tensor_key;
ort_tensor_key_t ort_tensor_key{tensor.GetTensorRawData(), allocator_name};
if (const auto& it = ort_ov_tensor_map.find(ort_tensor_key); it != ort_ov_tensor_map.end()) {
ov_tensor_key = it->second;
} else {
// Does this make sense for both types of allocators?
auto input = graph_input_info.at(input_idx);
if (allocator_name == OpenVINO_RT_NPU) {
ov_tensor_key.copy_needed = false;
ov_tensor_key.tensor_ptr = std::make_shared<ov::Tensor>(input.get_element_type(), input.get_shape(),
(void*)tensor.GetTensorRawData());
} else {
ov_tensor_key.copy_needed = true;
ov_tensor_key.tensor_ptr = std::make_shared<ov::Tensor>(input.get_element_type(), input.get_shape());
}
ort_ov_tensor_map.emplace(ort_tensor_key, ov_tensor_key);

if (ov_tensor_key.copy_needed) {
const char* ort_tensor_data = tensor.GetTensorData<char>();
size_t tensor_data_size = ov_tensor_key.tensor_ptr->get_byte_size();
auto ort_batch_memory_offset = ort_tensor_data + tensor_data_size * batch_slice_idx;
std::memcpy(ov_tensor_key.tensor_ptr->data(), ort_batch_memory_offset, tensor_data_size);
}

try {
infer_request->SetTensor(input_name, ov_tensor_key.tensor_ptr);
} catch (const char* msg) {
ORT_THROW(msg);
}
}
}
FillInputBlob(std::move(graph_input_blob), batch_slice_idx, std::move(input_name), context, subgraph_context_);
}
input_idx++;
}
if (global_context_.device_type.find("NPU") != std::string::npos) {
// Set the output blob as remote blob
auto graph_output_info = exe_network_.Get().outputs();
auto output_idx = 0;
for (auto output_info_iter = graph_output_info.begin();
output_info_iter != graph_output_info.end(); ++output_info_iter) {
auto output_names = output_info_iter->get_names();
std::string onnx_output_name;
std::string output_name;
// using the output name retrieved from ONNX original to match with the output names returned by OV tensors
for (auto it = subgraph_context_.output_names.begin(); it != subgraph_context_.output_names.end(); ++it) {
onnx_output_name = it->first;
if (output_names.find(onnx_output_name) != output_names.end()) {
// Assigning the output_name
output_name = it->first;
break;
}
}
size_t batch_size = 1;
Ort::UnownedValue tensor = GetOutputTensor(context,
batch_size,
infer_request,
output_name,
subgraph_context_.output_names);
auto allocator_name = tensor.GetTensorMemoryInfo().GetAllocatorName();

ov_tensor_data_t ov_tensor_data;
ort_tensor_key_t ort_tensor_key{tensor.GetTensorRawData(), allocator_name};
if (const auto& it = ort_ov_tensor_map.find(ort_tensor_key); it != ort_ov_tensor_map.end()) {
ov_tensor_data = it->second;
} else {
auto output = graph_output_info.at(output_idx);
if (allocator_name == OpenVINO_RT_NPU) {
ov_tensor_data.copy_needed = false;
ov_tensor_data.tensor_ptr = std::make_shared<ov::Tensor>(output.get_element_type(), output.get_shape(),
(void*)tensor.GetTensorRawData());
} else {
ov_tensor_data.copy_needed = true;
ov_tensor_data.tensor_ptr = std::make_shared<ov::Tensor>(output.get_element_type(), output.get_shape());
}
ort_ov_tensor_map.emplace(ort_tensor_key, ov_tensor_data);

try {
infer_request->SetTensor(output_name, ov_tensor_data.tensor_ptr);
} catch (const char* msg) {
ORT_THROW(msg);
}
}
output_idx++;
}
}

// Start Async inference
infer_request->StartAsync();
} catch (const char* msg) {
@@ -454,20 +539,42 @@ void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRe
" doesn't exist in the "
"list of OpenVINO output tensor names");
}
try {
graph_output_blob = infer_request->GetTensor(output_name);
} catch (const char* msg) {
ORT_THROW(msg);
}
size_t batch_size = 1;
Ort::UnownedValue output_tensor =
GetOutputTensor(context, batch_size, infer_request, std::move(output_name), subgraph_context_.output_names);
auto mem_info = output_tensor.GetTensorMemoryInfo();
if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
return;
if ((global_context_.device_type.find("CPU") != std::string::npos ||
global_context_.device_type.find("GPU") != std::string::npos)) {
try {
graph_output_blob = infer_request->GetTensor(output_name);
} catch (const char* msg) {
ORT_THROW(msg);
}
size_t batch_size = 1;
Ort::UnownedValue output_tensor =
GetOutputTensor(context, batch_size, infer_request, std::move(output_name), subgraph_context_.output_names);
auto mem_info = output_tensor.GetTensorMemoryInfo();
if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
return;
} else {
size_t batch_slice = 0;
FillOutputBlob(std::move(graph_output_blob), output_tensor, batch_slice);
}
} else {
size_t batch_slice = 0;
FillOutputBlob(std::move(graph_output_blob), output_tensor, batch_slice);
size_t batch_size = 1;
Ort::UnownedValue output_tensor =
GetOutputTensor(context, batch_size, infer_request, std::move(output_name), subgraph_context_.output_names);
auto allocator_name = output_tensor.GetTensorMemoryInfo().GetAllocatorName();
ov_tensor_data_t ov_tensor_data;
ort_tensor_key_t ort_tensor_key{output_tensor.GetTensorRawData(), allocator_name};
if (const auto& it = ort_ov_tensor_map.find(ort_tensor_key); it != ort_ov_tensor_map.end()) {
ov_tensor_data = it->second;
} else {
ORT_THROW(log_tag + "Expected all outputs to have associated OV::Tensor's");
}

if (ov_tensor_data.copy_needed) {
auto ort_tensor_data = output_tensor.GetTensorMutableData<char>();
size_t tensor_data_size = ov_tensor_data.tensor_ptr->get_byte_size();
auto ort_batch_memory_offset = ort_tensor_data /*+ tensor_data_size * batch_size*/;
std::memcpy(ort_batch_memory_offset, ov_tensor_data.tensor_ptr->data(), tensor_data_size);
}
}
}

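
The binding logic above reduces to one decision per ORT tensor: if its buffer was allocated with OpenVINO_RT_NPU, wrap it in an ov::Tensor with no copy; otherwise stage it through a freshly allocated ov::Tensor that is copied around inference. A reduced sketch of that decision with the ORT plumbing stripped away (function name is illustrative, not the EP's API):

```cpp
#include <memory>
#include <string>
#include <openvino/openvino.hpp>

// Reduced form of the per-tensor decision made in StartAsyncInference / CompleteAsyncInference.
std::shared_ptr<ov::Tensor> WrapOrStage(const ov::Output<const ov::Node>& port,
                                        void* ort_data,
                                        const std::string& allocator_name,
                                        bool& copy_needed) {
  if (allocator_name == "OpenVINO_RT_NPU") {
    copy_needed = false;  // zero-copy: the ov::Tensor aliases the ORT buffer directly
    return std::make_shared<ov::Tensor>(port.get_element_type(), port.get_shape(), ort_data);
  }
  copy_needed = true;  // staging: the caller memcpys into/out of this tensor around inference
  return std::make_shared<ov::Tensor>(port.get_element_type(), port.get_shape());
}
```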
9 changes: 9 additions & 0 deletions onnxruntime/core/providers/openvino/backends/basic_backend.h
@@ -11,6 +11,7 @@
#include <string>
#include <condition_variable>
#include <mutex>
#include <map>

#include "core/session/onnxruntime_cxx_api.h"
#include "core/providers/openvino/contexts.h"
@@ -20,6 +21,11 @@
namespace onnxruntime {
namespace openvino_ep {

struct ov_tensor_data_t {
OVTensorPtr tensor_ptr;
bool copy_needed;
};

class InferRequestsQueue;
class BasicBackend : public IBackend {
public:
@@ -60,6 +66,9 @@ class BasicBackend : public IBackend {
#if defined IO_BUFFER_ENABLED
OVRemoteContextPtr remote_context_;
#endif

using ort_tensor_key_t = std::pair<const void*, const std::string>;
std::map<ort_tensor_key_t, ov_tensor_data_t> ort_ov_tensor_map;
};

class InferRequestsQueue {
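
ort_ov_tensor_map keys each binding on the ORT tensor's raw data pointer plus allocator name, so repeated Run() calls over the same buffer reuse the already-wrapped ov::Tensor instead of rebuilding it. A small sketch of that lookup pattern (value type simplified; not the EP's exact code):

```cpp
#include <map>
#include <string>
#include <utility>

using ort_tensor_key_t = std::pair<const void*, const std::string>;

struct cached_binding_t {  // stand-in for ov_tensor_data_t; the OVTensorPtr is omitted here
  bool copy_needed = true;
};

std::map<ort_tensor_key_t, cached_binding_t> ort_ov_tensor_map;

// Returns the cached binding for a buffer, creating it on first use.
cached_binding_t& GetOrCreateBinding(const void* ort_data, const std::string& allocator_name) {
  ort_tensor_key_t key{ort_data, allocator_name};
  if (auto it = ort_ov_tensor_map.find(key); it != ort_ov_tensor_map.end()) {
    return it->second;
  }
  cached_binding_t binding;
  binding.copy_needed = (allocator_name != "OpenVINO_RT_NPU");  // zero-copy only for NPU remote buffers
  return ort_ov_tensor_map.emplace(key, binding).first->second;
}
```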
17 changes: 17 additions & 0 deletions onnxruntime/core/providers/openvino/openvino_execution_provider.cc
@@ -10,6 +10,9 @@
#include "core/providers/openvino/onnx_ctx_model_helper.h"
#include "core/providers/openvino/ov_versions/capability.h"
#include "openvino/core/version.hpp"
#ifdef USE_OVEP_NPU_MEMORY
#include "core/providers/openvino/ov_allocator.h"
#endif

#define MEMCPY_S(dest, src, destsz, srcsz) memcpy(dest, src, std::min(destsz, srcsz))

@@ -180,4 +183,18 @@ common::Status OpenVINOExecutionProvider::Compile(
return Status::OK();
}

#ifdef USE_OVEP_NPU_MEMORY
std::vector<AllocatorPtr> OpenVINOExecutionProvider::CreatePreferredAllocators() {
AllocatorCreationInfo npu_allocator_info{
[this](OrtDevice::DeviceId device_id) {
return std::make_unique<OVRTAllocator>(global_context_->ie_core.Get(), OrtDevice::NPU, device_id, OpenVINO_RT_NPU);
},
0,
};

// fill in allocator
return std::vector<AllocatorPtr>{CreateAllocator(npu_allocator_info)};
}
#endif

} // namespace onnxruntime
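
With the preferred allocator registered, a session using the OpenVINO EP on NPU can hand out device-friendly buffers to the application. A hedged sketch of requesting one through the C++ API (session creation elided; whether the named allocator is available depends on the build and EP configuration):

```cpp
#include <onnxruntime_cxx_api.h>
#include <vector>

void RunWithNpuInput(Ort::Session& session) {
  // Memory info matching the allocator the OpenVINO EP registers for NPU.
  Ort::MemoryInfo npu_mem_info("OpenVINO_RT_NPU", OrtDeviceAllocator,
                               /*device_id=*/0, OrtMemTypeDefault);
  // Fetch the EP-provided allocator from the session and create a tensor on it,
  // so inference can bind the buffer without an extra host copy.
  Ort::Allocator npu_allocator(session, npu_mem_info);
  std::vector<int64_t> shape{1, 3, 224, 224};  // illustrative shape
  Ort::Value input = Ort::Value::CreateTensor(npu_allocator, shape.data(), shape.size(),
                                              ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT);
  // ... fill `input` and pass it to session.Run() as usual.
}
```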
onnxruntime/core/providers/openvino/openvino_execution_provider.h
@@ -189,7 +189,9 @@ class OpenVINOExecutionProvider : public IExecutionProvider {
const void* GetExecutionHandle() const noexcept override {
return nullptr;
}

#ifdef USE_OVEP_NPU_MEMORY
std::vector<AllocatorPtr> CreatePreferredAllocators() override;
#endif
private:
std::unique_ptr<openvino_ep::GlobalContext> global_context_;
openvino_ep::EPCtxHandler ep_ctx_handle_{};