OVEP release LNL 1.2.1 (microsoft#22027)
Error codes are added to catch compilation errors and signal that a recompile is needed.
Remote tensors are added to enable direct memory access for NPU inferencing.
UMD bypass caching, enabled with OpenVINO 2024.4, eliminates the need for disk caching (a minimal sketch follows below).
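
A minimal sketch of the UMD bypass setting this release turns on when an EP context blob is exported (standalone OpenVINO usage, assuming version 2024.4+; the header path and model path are illustrative, see basic_backend.cc below for the in-tree equivalent):

```cpp
#include <openvino/openvino.hpp>
#include <openvino/runtime/intel_npu/properties.hpp>  // assumed header for ov::intel_npu properties

int main() {
  ov::Core core;
  // Ask the NPU plugin to skip its user-mode-driver (UMD) cache so no duplicate
  // compiled blob is written to disk alongside the exported EP context blob.
  core.set_property("NPU", ov::intel_npu::bypass_umd_caching(true));

  auto model = core.read_model("model.xml");         // hypothetical model path
  auto compiled = core.compile_model(model, "NPU");   // compiles without populating the UMD cache
  (void)compiled;
  return 0;
}
```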

### Motivation and Context
These changes are needed to ensure backward compatibility.
UMD bypass caching eliminates driver-level caching.
Remote tensors improve inference performance on NPU.

---------

Co-authored-by: Preetha Veeramalai <[email protected]>
Co-authored-by: Srirammaswamy <[email protected]>
Co-authored-by: saurabh <[email protected]>
Co-authored-by: Javier E. Martinez <[email protected]>
Co-authored-by: Eric Crawford <[email protected]>
Co-authored-by: jatinwadhwa921 <[email protected]>
7 people authored Sep 11, 2024
1 parent b800328 commit 0309c5f
Showing 13 changed files with 338 additions and 43 deletions.
4 changes: 4 additions & 0 deletions cmake/onnxruntime_providers_openvino.cmake
@@ -21,6 +21,10 @@
message(FATAL_ERROR "OpenVINO 2024.0 and newer are supported. Please, use latest OpenVINO release")
endif()

if(OpenVINO_VERSION VERSION_GREATER_EQUAL 2024.4)
add_definitions(-DUSE_OVEP_NPU_MEMORY=1)
endif()

if (WIN32)
unset(CMAKE_MAP_IMPORTED_CONFIG_RELWITHDEBINFO)
endif()
2 changes: 2 additions & 0 deletions include/onnxruntime/core/framework/allocator.h
@@ -50,6 +50,8 @@ constexpr const char* HIP = "Hip";
constexpr const char* HIP_PINNED = "HipPinned";
constexpr const char* OpenVINO_CPU = "OpenVINO_CPU";
constexpr const char* OpenVINO_GPU = "OpenVINO_GPU";
constexpr const char* OpenVINO_RT = "OpenVINO_RT";
constexpr const char* OpenVINO_RT_NPU = "OpenVINO_RT_NPU";
constexpr const char* WEBGPU_BUFFER = "WebGPU_Buffer";

constexpr size_t kAllocAlignment = 256;
4 changes: 4 additions & 0 deletions onnxruntime/core/framework/allocator.cc
@@ -145,6 +145,10 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtA
*out = new OrtMemoryInfo(
name1, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, static_cast<OrtDevice::DeviceId>(id1)), id1,
mem_type1);
} else if (strcmp(name1, onnxruntime::OpenVINO_RT_NPU) == 0) {
*out = new OrtMemoryInfo(
name1, type, OrtDevice(OrtDevice::NPU, OrtDevice::MemType::DEFAULT, static_cast<OrtDevice::DeviceId>(id1)), id1,
mem_type1);
} else if (strcmp(name1, onnxruntime::CUDA_PINNED) == 0) {
*out = new OrtMemoryInfo(
onnxruntime::CUDA_PINNED, type, OrtDevice(OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, static_cast<OrtDevice::DeviceId>(id1)),
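
The two hunks above register "OpenVINO_RT_NPU" as an allocator name that resolves to an NPU OrtDevice. A minimal sketch of constructing the matching memory info from application code (C++ API; the device id 0 is illustrative):

```cpp
#include <onnxruntime_cxx_api.h>

int main() {
  // "OpenVINO_RT_NPU" is now recognized by CreateMemoryInfo and mapped to OrtDevice::NPU.
  Ort::MemoryInfo npu_mem_info("OpenVINO_RT_NPU", OrtDeviceAllocator,
                               /*device_id=*/0, OrtMemTypeDefault);
  // This memory info identifies NPU-resident buffers served by the OpenVINO EP allocator.
  return 0;
}
```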
38 changes: 32 additions & 6 deletions onnxruntime/core/providers/openvino/backend_manager.cc
@@ -5,6 +5,7 @@
#include <algorithm>
#include <cassert>
#include <fstream>
#include <regex>
#include <sstream>
#include <unordered_map>
#include <unordered_set>
@@ -107,12 +108,15 @@ BackendManager::BackendManager(const GlobalContext& global_context,
subgraph_context_,
ep_ctx_handle_);
} catch (const OnnxRuntimeException& ex) {
std::string exception_str = ex.what();
bool eligible_for_cpu_fallback = device_type.find("NPU") != std::string::npos &&
!GetGlobalContext().disable_cpu_fallback &&
!ep_ctx_handle_.IsValidOVEPCtxGraph();
#if defined(OPENVINO_DISABLE_NPU_FALLBACK)
ORT_THROW(ex.what());
eligible_for_cpu_fallback = false;
#else
if (device_type.find("NPU") != std::string::npos &&
!GetGlobalContext().disable_cpu_fallback) {
LOGS_DEFAULT(WARNING) << ex.what();
if (eligible_for_cpu_fallback) {
LOGS_DEFAULT(VERBOSE) << exception_str;
LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU."
<< "Falling back to OV CPU for execution";
GetGlobalContext().device_type = "CPU";
@@ -125,10 +129,32 @@ BackendManager::BackendManager(const GlobalContext& global_context,
} catch (std::string const& msg) {
ORT_THROW(msg);
}
} else {
ORT_THROW(ex.what());
}
#endif
if (!eligible_for_cpu_fallback) {
if (device_type.find("NPU") != std::string::npos &&
exception_str.find("intel_npu") != std::string::npos) {
// Handle NPU device related errors
#ifndef NDEBUG
ORT_THROW(exception_str + "\nModel needs to be recompiled\n");
#else
std::string error_message = "UNKNOWN NPU ERROR";
std::string error_code = "code 0x0";
std::regex error_message_pattern(R"(\bZE_\w*\b)");
std::regex error_code_pattern("code 0x[0-9a-fA-F]+");
std::smatch matches;
if (std::regex_search(exception_str, matches, error_message_pattern)) {
error_message = matches[0];
}
if (std::regex_search(exception_str, matches, error_code_pattern)) {
error_code = matches[0];
}
throw std::runtime_error(error_message + ", " + error_code + "\nModel needs to be recompiled\n");
#endif
} else {
ORT_THROW(exception_str);
}
}
}
}
if (global_context_.export_ep_ctx_blob && !ep_ctx_handle_.IsValidOVEPCtxGraph()) {
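
In release builds, the new handler condenses intel_npu driver exceptions to a Level Zero result name plus error code before asking for a recompile. A standalone sketch of that extraction (the sample exception text is invented):

```cpp
#include <iostream>
#include <regex>
#include <string>

int main() {
  // Invented example of an intel_npu plugin exception message.
  std::string exception_str =
      "intel_npu: ZE_RESULT_ERROR_DEVICE_LOST, code 0x70000001: graph execution failed";

  std::string error_message = "UNKNOWN NPU ERROR";
  std::string error_code = "code 0x0";
  std::regex error_message_pattern(R"(\bZE_\w*\b)");     // Level Zero result name, e.g. ZE_RESULT_ERROR_*
  std::regex error_code_pattern("code 0x[0-9a-fA-F]+");  // hexadecimal driver error code
  std::smatch matches;
  if (std::regex_search(exception_str, matches, error_message_pattern)) {
    error_message = matches[0];
  }
  if (std::regex_search(exception_str, matches, error_code_pattern)) {
    error_code = matches[0];
  }
  // Prints the condensed form followed by the recompile hint.
  std::cout << error_message << ", " << error_code << "\nModel needs to be recompiled\n";
  return 0;
}
```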
161 changes: 134 additions & 27 deletions onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -48,14 +48,6 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
// Set the inference_num_threads property of the CPU
SetNumThreads(device_config);

#ifndef NDEBUG
if (IsDebugEnabled()) {
std::string file_name = subgraph_context.subgraph_name + "_static.onnx";
std::fstream outfile(file_name, std::ios::out | std::ios::trunc | std::ios::binary);
model_proto->SerializeToOstream(outfile);
}
#endif

try {
std::string dev_prec = global_context.device_type + "_" + global_context_.precision_str;

@@ -180,6 +172,11 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) {
device_property = std::make_pair("NPU_COMPILER_TYPE", env_npu_compiler_type);
}
device_config.emplace(ov::device::properties("NPU", device_property));
#if (OPENVINO_VERSION_MAJOR >= 2024) && (OPENVINO_VERSION_MINOR > 3)
if (global_context_.export_ep_ctx_blob) {
global_context_.ie_core.Get().set_property("NPU", ov::intel_npu::bypass_umd_caching(true));
}
#endif
}
}

@@ -295,16 +292,104 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
ORT_THROW(msg);
}
} else {
OVTensorPtr graph_input_blob;
try {
graph_input_blob = infer_request->GetTensor(input_name);
} catch (const char* msg) {
ORT_THROW(msg);
if ((global_context_.device_type.find("CPU") != std::string::npos ||
global_context_.device_type.find("GPU") != std::string::npos)) {
OVTensorPtr graph_input_blob;
try {
graph_input_blob = infer_request->GetTensor(input_name);
} catch (const char* msg) {
ORT_THROW(msg);
}
FillInputBlob(std::move(graph_input_blob), batch_slice_idx, std::move(input_name), context, subgraph_context_);
} else {
auto tensor = context.GetInput(subgraph_context_.input_names.at(input_name));
auto allocator_name = tensor.GetTensorMemoryInfo().GetAllocatorName();
ov_tensor_data_t ov_tensor_key;
ort_tensor_key_t ort_tensor_key{tensor.GetTensorRawData(), allocator_name};
if (const auto& it = ort_ov_tensor_map.find(ort_tensor_key); it != ort_ov_tensor_map.end()) {
ov_tensor_key = it->second;
} else {
// Does this make sense for both types of allocators?
auto input = graph_input_info.at(input_idx);
if (allocator_name == OpenVINO_RT_NPU) {
ov_tensor_key.copy_needed = false;
ov_tensor_key.tensor_ptr = std::make_shared<ov::Tensor>(input.get_element_type(), input.get_shape(),
(void*)tensor.GetTensorRawData());
} else {
ov_tensor_key.copy_needed = true;
ov_tensor_key.tensor_ptr = std::make_shared<ov::Tensor>(input.get_element_type(), input.get_shape());
}
ort_ov_tensor_map.emplace(ort_tensor_key, ov_tensor_key);

if (ov_tensor_key.copy_needed) {
const char* ort_tensor_data = tensor.GetTensorData<char>();
size_t tensor_data_size = ov_tensor_key.tensor_ptr->get_byte_size();
auto ort_batch_memory_offset = ort_tensor_data + tensor_data_size * batch_slice_idx;
std::memcpy(ov_tensor_key.tensor_ptr->data(), ort_batch_memory_offset, tensor_data_size);
}

try {
infer_request->SetTensor(input_name, ov_tensor_key.tensor_ptr);
} catch (const char* msg) {
ORT_THROW(msg);
}
}
}
FillInputBlob(std::move(graph_input_blob), batch_slice_idx, std::move(input_name), context, subgraph_context_);
}
input_idx++;
}
if (global_context_.device_type.find("NPU") != std::string::npos) {
// Set the output blob as remote blob
auto graph_output_info = exe_network_.Get().outputs();
auto output_idx = 0;
for (auto output_info_iter = graph_output_info.begin();
output_info_iter != graph_output_info.end(); ++output_info_iter) {
auto output_names = output_info_iter->get_names();
std::string onnx_output_name;
std::string output_name;
// using the output name retrieved from ONNX original to match with the output names returned by OV tensors
for (auto it = subgraph_context_.output_names.begin(); it != subgraph_context_.output_names.end(); ++it) {
onnx_output_name = it->first;
if (output_names.find(onnx_output_name) != output_names.end()) {
// Assigning the output_name
output_name = it->first;
break;
}
}
size_t batch_size = 1;
Ort::UnownedValue tensor = GetOutputTensor(context,
batch_size,
infer_request,
output_name,
subgraph_context_.output_names);
auto allocator_name = tensor.GetTensorMemoryInfo().GetAllocatorName();

ov_tensor_data_t ov_tensor_data;
ort_tensor_key_t ort_tensor_key{tensor.GetTensorRawData(), allocator_name};
if (const auto& it = ort_ov_tensor_map.find(ort_tensor_key); it != ort_ov_tensor_map.end()) {
ov_tensor_data = it->second;
} else {
auto output = graph_output_info.at(output_idx);
if (allocator_name == OpenVINO_RT_NPU) {
ov_tensor_data.copy_needed = false;
ov_tensor_data.tensor_ptr = std::make_shared<ov::Tensor>(output.get_element_type(), output.get_shape(),
(void*)tensor.GetTensorRawData());
} else {
ov_tensor_data.copy_needed = true;
ov_tensor_data.tensor_ptr = std::make_shared<ov::Tensor>(output.get_element_type(), output.get_shape());
}
ort_ov_tensor_map.emplace(ort_tensor_key, ov_tensor_data);

try {
infer_request->SetTensor(output_name, ov_tensor_data.tensor_ptr);
} catch (const char* msg) {
ORT_THROW(msg);
}
}
output_idx++;
}
}

// Start Async inference
infer_request->StartAsync();
} catch (const char* msg) {
@@ -454,20 +539,42 @@ void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRe
" doesn't exist in the "
"list of OpenVINO output tensor names");
}
try {
graph_output_blob = infer_request->GetTensor(output_name);
} catch (const char* msg) {
ORT_THROW(msg);
}
size_t batch_size = 1;
Ort::UnownedValue output_tensor =
GetOutputTensor(context, batch_size, infer_request, std::move(output_name), subgraph_context_.output_names);
auto mem_info = output_tensor.GetTensorMemoryInfo();
if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
return;
if ((global_context_.device_type.find("CPU") != std::string::npos ||
global_context_.device_type.find("GPU") != std::string::npos)) {
try {
graph_output_blob = infer_request->GetTensor(output_name);
} catch (const char* msg) {
ORT_THROW(msg);
}
size_t batch_size = 1;
Ort::UnownedValue output_tensor =
GetOutputTensor(context, batch_size, infer_request, std::move(output_name), subgraph_context_.output_names);
auto mem_info = output_tensor.GetTensorMemoryInfo();
if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
return;
} else {
size_t batch_slice = 0;
FillOutputBlob(std::move(graph_output_blob), output_tensor, batch_slice);
}
} else {
size_t batch_slice = 0;
FillOutputBlob(std::move(graph_output_blob), output_tensor, batch_slice);
size_t batch_size = 1;
Ort::UnownedValue output_tensor =
GetOutputTensor(context, batch_size, infer_request, std::move(output_name), subgraph_context_.output_names);
auto allocator_name = output_tensor.GetTensorMemoryInfo().GetAllocatorName();
ov_tensor_data_t ov_tensor_data;
ort_tensor_key_t ort_tensor_key{output_tensor.GetTensorRawData(), allocator_name};
if (const auto& it = ort_ov_tensor_map.find(ort_tensor_key); it != ort_ov_tensor_map.end()) {
ov_tensor_data = it->second;
} else {
ORT_THROW(log_tag + "Expected all outputs to have associated OV::Tensor's");
}

if (ov_tensor_data.copy_needed) {
auto ort_tensor_data = output_tensor.GetTensorMutableData<char>();
size_t tensor_data_size = ov_tensor_data.tensor_ptr->get_byte_size();
auto ort_batch_memory_offset = ort_tensor_data /*+ tensor_data_size * batch_size*/;
std::memcpy(ort_batch_memory_offset, ov_tensor_data.tensor_ptr->data(), tensor_data_size);
}
}
}

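
The binding logic above reduces to one decision per ORT tensor: if its buffer was allocated with OpenVINO_RT_NPU, wrap it in an ov::Tensor with no copy; otherwise stage it through a freshly allocated ov::Tensor that is copied around inference. A reduced sketch of that decision with the ORT plumbing stripped away (function name is illustrative, not the EP's API):

```cpp
#include <memory>
#include <string>
#include <openvino/openvino.hpp>

// Reduced form of the per-tensor decision made in StartAsyncInference / CompleteAsyncInference.
std::shared_ptr<ov::Tensor> WrapOrStage(const ov::Output<const ov::Node>& port,
                                        void* ort_data,
                                        const std::string& allocator_name,
                                        bool& copy_needed) {
  if (allocator_name == "OpenVINO_RT_NPU") {
    copy_needed = false;  // zero-copy: the ov::Tensor aliases the ORT buffer directly
    return std::make_shared<ov::Tensor>(port.get_element_type(), port.get_shape(), ort_data);
  }
  copy_needed = true;  // staging: the caller memcpys into/out of this tensor around inference
  return std::make_shared<ov::Tensor>(port.get_element_type(), port.get_shape());
}
```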
9 changes: 9 additions & 0 deletions onnxruntime/core/providers/openvino/backends/basic_backend.h
@@ -11,6 +11,7 @@
#include <string>
#include <condition_variable>
#include <mutex>
#include <map>

#include "core/session/onnxruntime_cxx_api.h"
#include "core/providers/openvino/contexts.h"
@@ -20,6 +21,11 @@
namespace onnxruntime {
namespace openvino_ep {

struct ov_tensor_data_t {
OVTensorPtr tensor_ptr;
bool copy_needed;
};

class InferRequestsQueue;
class BasicBackend : public IBackend {
public:
@@ -60,6 +66,9 @@ class BasicBackend : public IBackend {
#if defined IO_BUFFER_ENABLED
OVRemoteContextPtr remote_context_;
#endif

using ort_tensor_key_t = std::pair<const void*, const std::string>;
std::map<ort_tensor_key_t, ov_tensor_data_t> ort_ov_tensor_map;
};

class InferRequestsQueue {
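
ort_ov_tensor_map keys each binding on the ORT tensor's raw data pointer plus allocator name, so repeated Run() calls over the same buffer reuse the already-wrapped ov::Tensor instead of rebuilding it. A small sketch of that lookup pattern (value type simplified; not the EP's exact code):

```cpp
#include <map>
#include <string>
#include <utility>

using ort_tensor_key_t = std::pair<const void*, const std::string>;

struct cached_binding_t {  // stand-in for ov_tensor_data_t; the OVTensorPtr is omitted here
  bool copy_needed = true;
};

std::map<ort_tensor_key_t, cached_binding_t> ort_ov_tensor_map;

// Returns the cached binding for a buffer, creating it on first use.
cached_binding_t& GetOrCreateBinding(const void* ort_data, const std::string& allocator_name) {
  ort_tensor_key_t key{ort_data, allocator_name};
  if (auto it = ort_ov_tensor_map.find(key); it != ort_ov_tensor_map.end()) {
    return it->second;
  }
  cached_binding_t binding;
  binding.copy_needed = (allocator_name != "OpenVINO_RT_NPU");  // zero-copy only for NPU remote buffers
  return ort_ov_tensor_map.emplace(key, binding).first->second;
}
```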
17 changes: 17 additions & 0 deletions onnxruntime/core/providers/openvino/openvino_execution_provider.cc
@@ -10,6 +10,9 @@
#include "core/providers/openvino/onnx_ctx_model_helper.h"
#include "core/providers/openvino/ov_versions/capability.h"
#include "openvino/core/version.hpp"
#ifdef USE_OVEP_NPU_MEMORY
#include "core/providers/openvino/ov_allocator.h"
#endif

#define MEMCPY_S(dest, src, destsz, srcsz) memcpy(dest, src, std::min(destsz, srcsz))

@@ -180,4 +183,18 @@ common::Status OpenVINOExecutionProvider::Compile(
return Status::OK();
}

#ifdef USE_OVEP_NPU_MEMORY
std::vector<AllocatorPtr> OpenVINOExecutionProvider::CreatePreferredAllocators() {
AllocatorCreationInfo npu_allocator_info{
[this](OrtDevice::DeviceId device_id) {
return std::make_unique<OVRTAllocator>(global_context_->ie_core.Get(), OrtDevice::NPU, device_id, OpenVINO_RT_NPU);
},
0,
};

// fill in allocator
return std::vector<AllocatorPtr>{CreateAllocator(npu_allocator_info)};
}
#endif

} // namespace onnxruntime
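
With the preferred allocator registered, a session using the OpenVINO EP on NPU can hand out device-friendly buffers to the application. A hedged sketch of requesting one through the C++ API (session creation elided; whether the named allocator is available depends on the build and EP configuration):

```cpp
#include <onnxruntime_cxx_api.h>
#include <vector>

void RunWithNpuInput(Ort::Session& session) {
  // Memory info matching the allocator the OpenVINO EP registers for NPU.
  Ort::MemoryInfo npu_mem_info("OpenVINO_RT_NPU", OrtDeviceAllocator,
                               /*device_id=*/0, OrtMemTypeDefault);
  // Fetch the EP-provided allocator from the session and create a tensor on it,
  // so inference can bind the buffer without an extra host copy.
  Ort::Allocator npu_allocator(session, npu_mem_info);
  std::vector<int64_t> shape{1, 3, 224, 224};  // illustrative shape
  Ort::Value input = Ort::Value::CreateTensor(npu_allocator, shape.data(), shape.size(),
                                              ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT);
  // ... fill `input` and pass it to session.Run() as usual.
}
```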
onnxruntime/core/providers/openvino/openvino_execution_provider.h
@@ -189,7 +189,9 @@ class OpenVINOExecutionProvider : public IExecutionProvider {
const void* GetExecutionHandle() const noexcept override {
return nullptr;
}

#ifdef USE_OVEP_NPU_MEMORY
std::vector<AllocatorPtr> CreatePreferredAllocators() override;
#endif
private:
std::unique_ptr<openvino_ep::GlobalContext> global_context_;
openvino_ep::EPCtxHandler ep_ctx_handle_{};