diff --git a/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp b/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp
index 70856cba9839dc..2a0ab83992afec 100644
--- a/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp
+++ b/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp
@@ -75,6 +75,9 @@ class ZeroInferRequest final : public SyncInferRequest {
     mutable std::vector<std::optional<bool>> _inputLevelZeroTensorCreatedLocally;
     mutable std::vector<std::optional<bool>> _outputLevelZeroTensorCreatedLocally;
 
+    mutable std::vector<void*> _originalAddressInputLevelZeroTensor;
+    mutable std::vector<void*> _originalAddressOutputLevelZeroTensor;
+
     ze_device_properties_t _properties = {};
     std::shared_ptr<const zeroMemory::HostMemAllocator> _inputAllocator;
     std::shared_ptr<const zeroMemory::HostMemAllocator> _outputAllocator;
diff --git a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
index 42626e4ad3d6d5..fa2ab1acf6e6c2 100644
--- a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
+++ b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
@@ -14,12 +14,6 @@
 
 namespace intel_npu {
 
-struct TensorData {
-    void* mem;
-    size_t size;
-    bool levelZeroTensorCreatedLocally = true;
-};
-
 struct Pipeline {
 public:
     Pipeline(const Config& config,
diff --git a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
index 536f64beb362cd..85f64bb0f8177a 100644
--- a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
+++ b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
@@ -107,6 +107,8 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>&
       _levelZeroOutputTensors(_metadata.outputs.size(), nullptr),
       _inputLevelZeroTensorCreatedLocally(_metadata.inputs.size(), std::nullopt),
       _outputLevelZeroTensorCreatedLocally(_metadata.outputs.size(), std::nullopt),
+      _originalAddressInputLevelZeroTensor(_metadata.inputs.size(), nullptr),
+      _originalAddressOutputLevelZeroTensor(_metadata.outputs.size(), nullptr),
       _profilingPool(_initStructs, _graph, zeroProfiling::POOL_SIZE),
       _profilingQuery(_initStructs, 0) {
     _logger.debug("ZeroInferRequest::ZeroInferRequest - SyncInferRequest");
@@ -184,6 +186,7 @@ void ZeroInferRequest::create_pipeline() {
                                                            INPUT,
                                                            *_inputAllocator,
                                                            _graph->get_batch_size());
+        _inputLevelZeroTensorCreatedLocally.at(inputIndex) = true;
     }
 
     for (size_t outputIndex = 0; outputIndex < _metadata.outputs.size(); ++outputIndex) {
@@ -198,6 +201,32 @@ void ZeroInferRequest::create_pipeline() {
                                                                  OUTPUT,
                                                                  *_outputAllocator,
                                                                  _graph->get_batch_size());
+        _outputLevelZeroTensorCreatedLocally.at(outputIndex) = true;
+    }
+
+    for (size_t inputIndex = 0; inputIndex < _metadata.inputs.size(); ++inputIndex) {
+        if (is_batched_input(inputIndex)) {
+            continue;
+        }
+
+        auto levelZeroRemoteTensor = std::dynamic_pointer_cast<ZeroRemoteTensor>(get_level_zero_input(inputIndex));
+        if (levelZeroRemoteTensor == nullptr) {
+            _originalAddressInputLevelZeroTensor.at(inputIndex) = get_level_zero_input(inputIndex)->data();
+        } else {
+            void* levelZeroBuffer = extract_object(levelZeroRemoteTensor->get_properties(), ov::intel_npu::mem_handle);
+            _originalAddressInputLevelZeroTensor.at(inputIndex) = levelZeroBuffer;
+        }
+    }
+
+    for (size_t outputIndex = 0; outputIndex < _metadata.outputs.size(); ++outputIndex) {
+        auto levelZeroRemoteTensor =
+            std::dynamic_pointer_cast<ZeroRemoteTensor>(_levelZeroOutputTensors.at(outputIndex));
+        if (levelZeroRemoteTensor == nullptr) {
+            _originalAddressOutputLevelZeroTensor.at(outputIndex) = _levelZeroOutputTensors.at(outputIndex)->data();
+        } else {
+            void* levelZeroBuffer = extract_object(levelZeroRemoteTensor->get_properties(), ov::intel_npu::mem_handle);
+            _originalAddressOutputLevelZeroTensor.at(outputIndex) = levelZeroBuffer;
+        }
     }
 
     // Find the corresponding command queue group.
@@ -226,7 +255,7 @@ void ZeroInferRequest::set_tensor_data(const std::shared_ptr<ov::ITensor>& tenso
                                        const bool isInput) {
     OV_ITT_TASK_CHAIN(ZERO_SET_TENSOR, itt::domains::LevelZeroBackend, "set_tensor", "set_tensor_data");
     auto& levelZeroTensors = isInput ? get_level_zero_input(index) : _levelZeroOutputTensors.at(index);
-    auto& tensorsData =
+    auto& tensorCreatedLocally =
         isInput ? _inputLevelZeroTensorCreatedLocally.at(index) : _outputLevelZeroTensorCreatedLocally.at(index);
 
     bool setTensorData = false;
@@ -243,7 +272,7 @@ void ZeroInferRequest::set_tensor_data(const std::shared_ptr<ov::ITensor>& tenso
     if (!setTensorData) {
         // make sure that the L0 tensor was allocated locally and is not received from the user when receiving
        // random tensor
-        if (tensorsData.has_value() && !tensorsData) {
+        if (tensorCreatedLocally.has_value() && !(*tensorCreatedLocally)) {
            _logger.debug("ZeroInferRequest::set_tensor_data - create locally L0 tensor");
 
            OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "allocate tensor");
@@ -259,16 +288,21 @@ void ZeroInferRequest::set_tensor_data(const std::shared_ptr<ov::ITensor>& tenso
     }
 
     if (setTensorData) {
-        tensorsData = levelZeroTensorCreatedLocally;
+        tensorCreatedLocally = levelZeroTensorCreatedLocally;
 
         if (_pipelineIsCreated) {
             _logger.debug("ZeroInferRequest::infer_async - update command list");
 
+            auto& updateOriginalAddress = isInput ? _originalAddressInputLevelZeroTensor.at(index)
+                                                  : _originalAddressOutputLevelZeroTensor.at(index);
+
             OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "updateCommandList");
             _pipeline->updateCommandList(levelZeroTensors->data(),
                                          levelZeroTensors->get_byte_size(),
                                          isInput ? _graph->get_input_descriptors().at(index).idx
                                                  : _graph->get_output_descriptors().at(index).idx);
+
+            updateOriginalAddress = levelZeroTensors->data();
         }
     }
 }
@@ -290,16 +324,25 @@ void ZeroInferRequest::set_remote_tensor_data(const std::shared_ptr<ZeroRemoteT
         _pipeline->updateCommandList(data,
                                      tensor->get_byte_size(),
                                      isInput ? _graph->get_input_descriptors().at(index).idx
                                              : _graph->get_output_descriptors().at(index).idx);
+
+        updateOriginalAddress = data;
     }
 }
@@ -408,6 +451,7 @@ void ZeroInferRequest::set_tensors(const ov::Output<const ov::Node>& port,
 
         if (_pipelineIsCreated) {
             OV_ITT_TASK_NEXT(SET_TENSORS, "updateCommandList");
+
             _pipeline->updateCommandList(data, _graph->get_input_descriptors().at(foundPort.idx).idx, i);
         }
     }
@@ -438,6 +482,8 @@ ov::SoPtr<ov::ITensor> ZeroInferRequest::get_tensor(const ov::Output<const ov::
                                        _graph->get_batch_size());
+    tensorCreatedLocally = true;
+
     return levelZeroTensors;
 }
@@ -555,11 +603,62 @@ void ZeroInferRequest::infer_async() {
                 OV_ITT_TASK_NEXT(ZERO_INFER, "memcpy");
                 std::memcpy(levelZeroBuffer, userBuffer, userTensor.at(SINGLE_TENSOR)->get_byte_size());
             }
+
+            if (_originalAddressInputLevelZeroTensor.at(inputIndex) != levelZeroBuffer) {
+                _logger.debug("Update input graph descriptor with the new tensor");
+                _pipeline->updateCommandList(levelZeroBuffer,
+                                             levelZeroTensor->get_byte_size(),
+                                             _graph->get_input_descriptors().at(inputIndex).idx);
+
+                _originalAddressInputLevelZeroTensor.at(inputIndex) = levelZeroBuffer;
+            }
+        } else {
+            void* remoteLevelZeroBuffer =
+                extract_object(levelZeroRemoteTensor->get_properties(), ov::intel_npu::mem_handle);
+
+            if (_originalAddressInputLevelZeroTensor.at(inputIndex) != remoteLevelZeroBuffer) {
+                _logger.debug("Update input graph descriptor with the new remote tensor");
+                _pipeline->updateCommandList(remoteLevelZeroBuffer,
+                                             levelZeroRemoteTensor->get_byte_size(),
+                                             _graph->get_input_descriptors().at(inputIndex).idx);
+
+                _originalAddressInputLevelZeroTensor.at(inputIndex) = remoteLevelZeroBuffer;
+            }
         }
 
         ++inputIndex;
     }
 
+    size_t outputIndex = 0;
+
+    for (const auto& levelZeroTensor : _levelZeroOutputTensors) {
+        auto levelZeroRemoteTensor = std::dynamic_pointer_cast<ZeroRemoteTensor>(levelZeroTensor);
+        if (levelZeroRemoteTensor == nullptr) {
+            if (_originalAddressOutputLevelZeroTensor.at(outputIndex) != levelZeroTensor->data()) {
+                _logger.debug("Update output graph descriptor with the new tensor");
+                _pipeline->updateCommandList(levelZeroTensor->data(),
+                                             levelZeroTensor->get_byte_size(),
+                                             _graph->get_output_descriptors().at(outputIndex).idx);
+
+                _originalAddressOutputLevelZeroTensor.at(outputIndex) = levelZeroTensor->data();
+            }
+        } else {
+            void* remoteLevelZeroBuffer =
+                extract_object(levelZeroRemoteTensor->get_properties(), ov::intel_npu::mem_handle);
+
+            if (_originalAddressOutputLevelZeroTensor.at(outputIndex) != remoteLevelZeroBuffer) {
+                _logger.debug("Update output graph descriptor with the new remote tensor");
+                _pipeline->updateCommandList(remoteLevelZeroBuffer,
+                                             levelZeroRemoteTensor->get_byte_size(),
+                                             _graph->get_output_descriptors().at(outputIndex).idx);
+
+                _originalAddressOutputLevelZeroTensor.at(outputIndex) = remoteLevelZeroBuffer;
+            }
+        }
+
+        ++outputIndex;
+    }
+
     OV_ITT_TASK_NEXT(ZERO_INFER, "push");
     _pipeline->push();
 }
diff --git a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp
index d29327a5e126ef..81d3f30e20b548 100644
--- a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp
+++ b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp
@@ -13,6 +13,22 @@
 #include "intel_npu/utils/logger/logger.hpp"
 #include "intel_npu/utils/zero/zero_api.hpp"
 #include "intel_npu/utils/zero/zero_types.hpp"
+#include "zero_remote_tensor.hpp"
+
+namespace {
+
+template <typename Type>
+Type extract_object(const ov::AnyMap& params, const ov::Property<Type>& p) {
+    auto itrHandle = params.find(p.name());
+    ov::Any res = nullptr;
+    if (itrHandle == params.end()) {
+        OPENVINO_THROW("No parameter ", p.name(), " found in parameters map");
+    }
+    res = itrHandle->second;
+    return res.as<Type>();
+}
+
+}  // namespace
 
 namespace intel_npu {
 
@@ -59,15 +75,33 @@ Pipeline::Pipeline(const Config& config,
     size_t ioIndex = 0;
     for (const auto& desc : graph->get_input_descriptors()) {
         if (inputTensorsData.at(ioIndex).size() > 1) {
-            graph->set_argument_value(desc.idx, inputTensorsData.at(ioIndex).at(i)->data());
+            void* data = nullptr;
+            auto remoteTensor = std::dynamic_pointer_cast<ZeroRemoteTensor>(inputTensorsData.at(ioIndex).at(i));
+            if (remoteTensor == nullptr) {
+                data = inputTensorsData.at(ioIndex).at(i)->data();
+
+            } else {
+                data = extract_object(remoteTensor->get_properties(), ov::intel_npu::mem_handle);
+            }
+
+            graph->set_argument_value(desc.idx, data);
 
             ++ioIndex;
             continue;
         }
 
+        void* data = nullptr;
+        auto remoteTensor = std::dynamic_pointer_cast<ZeroRemoteTensor>(inputTensorsData.at(ioIndex).at(0));
+        if (remoteTensor == nullptr) {
+            data = inputTensorsData.at(ioIndex).at(0)->data();
+
+        } else {
+            data = extract_object(remoteTensor->get_properties(), ov::intel_npu::mem_handle);
+        }
+
         graph->set_argument_value(
             desc.idx,
-            static_cast<unsigned char*>(inputTensorsData.at(ioIndex).at(0)->data()) +
+            static_cast<unsigned char*>(data) +
                 (i * inputTensorsData.at(ioIndex).at(0)->get_byte_size()) / _number_of_command_lists);
 
         ++ioIndex;
@@ -75,9 +109,18 @@ Pipeline::Pipeline(const Config& config,
     ioIndex = 0;
     for (const auto& desc : graph->get_output_descriptors()) {
+        void* data = nullptr;
+        auto remoteTensor = std::dynamic_pointer_cast<ZeroRemoteTensor>(outputTensorsData.at(ioIndex));
+        if (remoteTensor == nullptr) {
+            data = outputTensorsData.at(ioIndex)->data();
+
+        } else {
+            data = extract_object(remoteTensor->get_properties(), ov::intel_npu::mem_handle);
+        }
+
         graph->set_argument_value(
             desc.idx,
-            static_cast<unsigned char*>(outputTensorsData.at(ioIndex)->data()) +
+            static_cast<unsigned char*>(data) +
                 (i * outputTensorsData.at(ioIndex)->get_byte_size()) / _number_of_command_lists);
 
         ++ioIndex;
     }
diff --git a/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp b/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp
index 07466677b9d547..acf7b7742c934e 100644
--- a/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp
+++ b/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp
@@ -23,6 +23,7 @@
 #include "openvino/opsets/opset8.hpp"
 #include "openvino/runtime/compiled_model.hpp"
 #include "openvino/runtime/core.hpp"
+#include "openvino/runtime/intel_npu/level_zero/level_zero.hpp"
 #include "overload/overload_test_utils_npu.hpp"
 
 using CompilationParams = std::tuple<std::string,  // Device name
+TEST_P(InferRequestRunTests, checkResultsAfterIOBlobReallocation) {
+    // Skip test according to plugin specific disabledTestPatterns() (if any)
+    SKIP_IF_CURRENT_TEST_IS_DISABLED()
+
+    auto original_shape = Shape{1, 10, 10, 10};
+    auto dummy_shape = Shape{1, 50, 100, 100};
+    auto shape_size = ov::shape_size(original_shape);
+    auto model = createModel(element::f32, original_shape, "N...");
+
+    auto context = core->get_default_context(target_device);
+
+    compiled_model = core->compile_model(model, target_device, configuration);
+    ov::InferRequest inference_request;
+    inference_request = compiled_model.create_infer_request();
+
+    input = compiled_model.input();
+    output = compiled_model.output();
+
+    ov::Tensor input_tensor, first_output_tensor, second_output_tensor;
+    auto in_shape = input.get_shape();
+    auto out_shape = output.get_shape();
+
+    OV_ASSERT_NO_THROW(input_tensor = inference_request.get_tensor(input));
+    auto* input_data = input_tensor.data<float>();
+    for (size_t i = 0; i < shape_size; ++i) {
+        input_data[i] = 5.f;
+    }
+
+    OV_ASSERT_NO_THROW(inference_request.infer());
+    OV_ASSERT_NO_THROW(first_output_tensor = inference_request.get_tensor(output));
+    // create dummy Tensors to force the driver to allocate memory for the initial tensor somewhere else
+    [[maybe_unused]] auto l0_host_dummy_tensor_0 = context.create_host_tensor(ov::element::f32, dummy_shape);
+    [[maybe_unused]] auto l0_host_dummy_tensor_1 = context.create_host_tensor(ov::element::f32, dummy_shape);
+    [[maybe_unused]] auto l0_host_dummy_tensor_2 = context.create_host_tensor(ov::element::f32, dummy_shape);
+    [[maybe_unused]] auto l0_host_dummy_tensor_3 = context.create_host_tensor(ov::element::f32, dummy_shape);
+    [[maybe_unused]] auto l0_host_dummy_tensor_4 = context.create_host_tensor(ov::element::f32, dummy_shape);
+    [[maybe_unused]] auto l0_host_dummy_tensor_5 = context.create_host_tensor(ov::element::f32, dummy_shape);
+    [[maybe_unused]] auto l0_host_dummy_tensor_6 = context.create_host_tensor(ov::element::f32, dummy_shape);
+    [[maybe_unused]] auto l0_host_dummy_tensor_7 = context.create_host_tensor(ov::element::f32, dummy_shape);
+
+    auto* actual = first_output_tensor.data<float>();
+    for (size_t i = 0; i < shape_size; ++i) {
+        EXPECT_NEAR(actual[i], 6.f, 1e-5) << "Expected=6, actual=" << actual[i] << " for index " << i;
+    }
+
+    // imitates blob reallocation
+    OV_ASSERT_NO_THROW(input_tensor.set_shape({1, 50, 20, 20}));
+    OV_ASSERT_NO_THROW(input_tensor.set_shape(in_shape));
+
+    OV_ASSERT_NO_THROW(second_output_tensor = inference_request.get_tensor(output));
+    OV_ASSERT_NO_THROW(second_output_tensor.set_shape({1, 20, 20, 20}));
+    OV_ASSERT_NO_THROW(second_output_tensor.set_shape(out_shape));
+
+    OV_ASSERT_NO_THROW(input_tensor = inference_request.get_tensor(input));
+    input_data = input_tensor.data<float>();
+    for (size_t i = 0; i < shape_size; ++i) {
+        input_data[i] = 9.f;
+    }
+
+    OV_ASSERT_NO_THROW(inference_request.infer());
+    OV_ASSERT_NO_THROW(second_output_tensor = inference_request.get_tensor(output));
+
+    actual = second_output_tensor.data<float>();
+    for (size_t i = 0; i < shape_size; ++i) {
+        EXPECT_NEAR(actual[i], 10.f, 1e-5) << "Expected=10, actual=" << actual[i] << " for index " << i;
+    }
+}
+
+TEST_P(InferRequestRunTests, remoteCheckResultsAfterIOBlobReallocation) {
+    // Skip test according to plugin specific disabledTestPatterns() (if any)
+    SKIP_IF_CURRENT_TEST_IS_DISABLED()
+
+    auto original_shape = Shape{1, 10, 10, 10};
+    auto dummy_shape = Shape{1, 50, 100, 100};
+    auto shape_size = ov::shape_size(original_shape);
+    auto model = createModel(element::f32, original_shape, "N...");
+
+    auto context = core->get_default_context(target_device).as<ov::intel_npu::level_zero::ZeroContext>();
+
+    compiled_model = core->compile_model(model, target_device, configuration);
+    ov::InferRequest inference_request;
+    inference_request = compiled_model.create_infer_request();
+
+    input = compiled_model.input();
+    output = compiled_model.output();
+
+    auto in_shape = input.get_shape();
+    auto out_shape = output.get_shape();
+
+    auto l0_host_input_tensor =
+        context.create_l0_host_tensor(ov::element::f32, original_shape, ov::intel_npu::TensorType::INPUT);
+    auto l0_host_output_tensor = context.create_l0_host_tensor(ov::element::f32, original_shape);
+
+    auto* host_tensor = l0_host_input_tensor.get();
+    auto* input_data = reinterpret_cast<float*>(host_tensor);
+    for (size_t i = 0; i < shape_size; ++i) {
+        input_data[i] = 5.f;
+    }
+
+    inference_request.set_input_tensor(l0_host_input_tensor);
+    inference_request.set_output_tensor(l0_host_output_tensor);
+
+    OV_ASSERT_NO_THROW(inference_request.infer());
+
+    // create dummy Tensors to force the driver to allocate memory for the initial tensor somewhere else
+    [[maybe_unused]] auto l0_host_dummy_tensor_0 = context.create_host_tensor(ov::element::f32, dummy_shape);
+    [[maybe_unused]] auto l0_host_dummy_tensor_1 = context.create_host_tensor(ov::element::f32, dummy_shape);
+    [[maybe_unused]] auto l0_host_dummy_tensor_2 = context.create_host_tensor(ov::element::f32, dummy_shape);
+    [[maybe_unused]] auto l0_host_dummy_tensor_3 = context.create_host_tensor(ov::element::f32, dummy_shape);
+    [[maybe_unused]] auto l0_host_dummy_tensor_4 = context.create_host_tensor(ov::element::f32, dummy_shape);
+    [[maybe_unused]] auto l0_host_dummy_tensor_5 = context.create_host_tensor(ov::element::f32, dummy_shape);
+    [[maybe_unused]] auto l0_host_dummy_tensor_6 = context.create_host_tensor(ov::element::f32, dummy_shape);
+    [[maybe_unused]] auto l0_host_dummy_tensor_7 = context.create_host_tensor(ov::element::f32, dummy_shape);
+
+    auto* actual_host_tensor = l0_host_output_tensor.get();
+    auto* actual = reinterpret_cast<float*>(actual_host_tensor);
+    for (size_t i = 0; i < shape_size; ++i) {
+        EXPECT_NEAR(actual[i], 6.f, 1e-5) << "Expected=6, actual=" << actual[i] << " for index " << i;
+    }
+
+    // imitates blob reallocation
+    OV_ASSERT_NO_THROW(l0_host_input_tensor.set_shape({1, 50, 20, 20}));
+    OV_ASSERT_NO_THROW(l0_host_input_tensor.set_shape(in_shape));
+
+    OV_ASSERT_NO_THROW(l0_host_output_tensor.set_shape({1, 20, 20, 20}));
+    OV_ASSERT_NO_THROW(l0_host_output_tensor.set_shape(out_shape));
+
+    host_tensor = l0_host_input_tensor.get();
+    input_data = reinterpret_cast<float*>(host_tensor);
+    for (size_t i = 0; i < shape_size; ++i) {
+        input_data[i] = 9.f;
+    }
+
+    OV_ASSERT_NO_THROW(inference_request.infer());
+
+    actual_host_tensor = l0_host_output_tensor.get();
+    actual = reinterpret_cast<float*>(actual_host_tensor);
+    for (size_t i = 0; i < shape_size; ++i) {
+        EXPECT_NEAR(actual[i], 10.f, 1e-5) << "Expected=10, actual=" << actual[i] << " for index " << i;
+    }
+}
+
 using BatchingRunTests = InferRequestRunTests;
 
 TEST_P(BatchingRunTests, CheckBatchingSupportInfer) {