Skip to content

Commit

Permalink
Update command list if someone else allocated another tensor
Browse files Browse the repository at this point in the history
Signed-off-by: Bogdan Pereanu <[email protected]>
  • Loading branch information
pereanub committed Dec 11, 2024
1 parent 42de1ad commit dbe899b
Show file tree
Hide file tree
Showing 5 changed files with 294 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ class ZeroInferRequest final : public SyncInferRequest {
mutable std::vector<std::optional<bool>> _inputLevelZeroTensorCreatedLocally;
mutable std::vector<std::optional<bool>> _outputLevelZeroTensorCreatedLocally;

mutable std::vector<void*> _originalAddressInputLevelZeroTensor;
mutable std::vector<void*> _originalAddressOutputLevelZeroTensor;

ze_device_properties_t _properties = {};
std::shared_ptr<const zeroMemory::HostMemAllocator> _inputAllocator;
std::shared_ptr<const zeroMemory::HostMemAllocator> _outputAllocator;
Expand Down
6 changes: 0 additions & 6 deletions src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,6 @@

namespace intel_npu {

// Plain descriptor of the raw buffer backing a level-zero tensor, as passed to
// the pipeline when building/patching command lists.
struct TensorData {
// Base address of the buffer (level-zero accessible memory).
void* mem;
// Size of the buffer in bytes.
size_t size;
// True when the plugin allocated this buffer itself; false when it was
// received from the user (e.g. a remote tensor), in which case the plugin
// must not reallocate it.
bool levelZeroTensorCreatedLocally = true;
};

struct Pipeline {
public:
Pipeline(const Config& config,
Expand Down
105 changes: 102 additions & 3 deletions src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>&
_levelZeroOutputTensors(_metadata.outputs.size(), nullptr),
_inputLevelZeroTensorCreatedLocally(_metadata.inputs.size(), std::nullopt),
_outputLevelZeroTensorCreatedLocally(_metadata.outputs.size(), std::nullopt),
_originalAddressInputLevelZeroTensor(_metadata.inputs.size(), nullptr),
_originalAddressOutputLevelZeroTensor(_metadata.outputs.size(), nullptr),
_profilingPool(_initStructs, _graph, zeroProfiling::POOL_SIZE),
_profilingQuery(_initStructs, 0) {
_logger.debug("ZeroInferRequest::ZeroInferRequest - SyncInferRequest");
Expand Down Expand Up @@ -184,6 +186,7 @@ void ZeroInferRequest::create_pipeline() {
INPUT,
*_inputAllocator,
_graph->get_batch_size());
_inputLevelZeroTensorCreatedLocally.at(inputIndex) = true;
}

for (size_t outputIndex = 0; outputIndex < _metadata.outputs.size(); ++outputIndex) {
Expand All @@ -198,6 +201,32 @@ void ZeroInferRequest::create_pipeline() {
OUTPUT,
*_outputAllocator,
_graph->get_batch_size());
_outputLevelZeroTensorCreatedLocally.at(outputIndex) = true;
}

// Record the original buffer address of every level-zero input tensor so that
// infer_async() can later detect when the underlying allocation changed (e.g.
// someone else allocated another tensor) and patch the command list.
for (size_t inputIndex = 0; inputIndex < _metadata.inputs.size(); ++inputIndex) {
// Batched inputs are handled through per-batch tensors; no single base
// address to track here.
if (is_batched_input(inputIndex)) {
continue;
}

auto levelZeroRemoteTensor = std::dynamic_pointer_cast<ZeroRemoteTensor>(get_level_zero_input(inputIndex));
if (levelZeroRemoteTensor == nullptr) {
// Local tensor: the address comes straight from the tensor's data pointer.
_originalAddressInputLevelZeroTensor.at(inputIndex) = get_level_zero_input(inputIndex)->data();
} else {
// Remote tensor: the device-visible address is carried in its properties.
void* levelZeroBuffer = extract_object(levelZeroRemoteTensor->get_properties(), ov::intel_npu::mem_handle);
_originalAddressInputLevelZeroTensor.at(inputIndex) = levelZeroBuffer;
}
}

// Same bookkeeping for the level-zero output tensors.
for (size_t outputIndex = 0; outputIndex < _metadata.outputs.size(); ++outputIndex) {
auto levelZeroRemoteTensor =
std::dynamic_pointer_cast<ZeroRemoteTensor>(_levelZeroOutputTensors.at(outputIndex));
if (levelZeroRemoteTensor == nullptr) {
_originalAddressOutputLevelZeroTensor.at(outputIndex) = _levelZeroOutputTensors.at(outputIndex)->data();
} else {
void* levelZeroBuffer = extract_object(levelZeroRemoteTensor->get_properties(), ov::intel_npu::mem_handle);
_originalAddressOutputLevelZeroTensor.at(outputIndex) = levelZeroBuffer;
}
}

// Find the corresponding command queue group.
Expand Down Expand Up @@ -226,7 +255,7 @@ void ZeroInferRequest::set_tensor_data(const std::shared_ptr<ov::ITensor>& tenso
const bool isInput) {
OV_ITT_TASK_CHAIN(ZERO_SET_TENSOR, itt::domains::LevelZeroBackend, "set_tensor", "set_tensor_data");
auto& levelZeroTensors = isInput ? get_level_zero_input(index) : _levelZeroOutputTensors.at(index);
auto& tensorsData =
auto& tensorCreatedLocally =
isInput ? _inputLevelZeroTensorCreatedLocally.at(index) : _outputLevelZeroTensorCreatedLocally.at(index);

bool setTensorData = false;
Expand All @@ -243,7 +272,7 @@ void ZeroInferRequest::set_tensor_data(const std::shared_ptr<ov::ITensor>& tenso
if (!setTensorData) {
// make sure that the L0 tensor was allocated locally and is not received from the user when receiving
// random tensor
if (tensorsData.has_value() && !tensorsData) {
if (tensorCreatedLocally.has_value() && !(*tensorCreatedLocally)) {
_logger.debug("ZeroInferRequest::set_tensor_data - create locally L0 tensor");
OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "allocate tensor");

Expand All @@ -259,16 +288,21 @@ void ZeroInferRequest::set_tensor_data(const std::shared_ptr<ov::ITensor>& tenso
}

if (setTensorData) {
tensorsData = levelZeroTensorCreatedLocally;
tensorCreatedLocally = levelZeroTensorCreatedLocally;

if (_pipelineIsCreated) {
_logger.debug("ZeroInferRequest::infer_async - update command list");

auto& updateOriginalAddress = isInput ? _originalAddressInputLevelZeroTensor.at(index)
: _originalAddressOutputLevelZeroTensor.at(index);

OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "updateCommandList");
_pipeline->updateCommandList(levelZeroTensors->data(),
levelZeroTensors->get_byte_size(),
isInput ? _graph->get_input_descriptors().at(index).idx
: _graph->get_output_descriptors().at(index).idx);

updateOriginalAddress = levelZeroTensors->data();
}
}
}
Expand All @@ -290,16 +324,25 @@ void ZeroInferRequest::set_remote_tensor_data(const std::shared_ptr<ZeroRemoteTe
}

auto& levelZeroTensors = isInput ? get_level_zero_input(index) : _levelZeroOutputTensors.at(index);
auto& tensorCreatedLocally =
isInput ? _inputLevelZeroTensorCreatedLocally.at(index) : _outputLevelZeroTensorCreatedLocally.at(index);

levelZeroTensors = tensor;
tensorCreatedLocally = false;

if (_pipelineIsCreated) {
_logger.debug("ZeroInferRequest::infer_async - update command list");

auto& updateOriginalAddress =
isInput ? _originalAddressInputLevelZeroTensor.at(index) : _originalAddressOutputLevelZeroTensor.at(index);

OV_ITT_TASK_NEXT(ZERO_SET_REMOTE_TENSOR, "updateCommandList");
_pipeline->updateCommandList(
data,
tensor->get_byte_size(),
isInput ? _graph->get_input_descriptors().at(index).idx : _graph->get_output_descriptors().at(index).idx);

updateOriginalAddress = data;
}
}

Expand Down Expand Up @@ -408,6 +451,7 @@ void ZeroInferRequest::set_tensors(const ov::Output<const ov::Node>& port,

if (_pipelineIsCreated) {
OV_ITT_TASK_NEXT(SET_TENSORS, "updateCommandList");

_pipeline->updateCommandList(data, _graph->get_input_descriptors().at(foundPort.idx).idx, i);
}
}
Expand Down Expand Up @@ -438,13 +482,17 @@ ov::SoPtr<ov::ITensor> ZeroInferRequest::get_tensor(const ov::Output<const ov::N
_logger.debug("ZeroInferRequest::get_tensor - tensor is not allocated, create the tensor");

auto& levelZeroTensors = isInput ? get_level_zero_input(ioIndex) : _levelZeroOutputTensors.at(ioIndex);
auto& tensorCreatedLocally =
isInput ? _inputLevelZeroTensorCreatedLocally.at(ioIndex) : _outputLevelZeroTensorCreatedLocally.at(ioIndex);

levelZeroTensors = allocate_tensor(isInput ? _metadata.inputs.at(ioIndex) : _metadata.outputs.at(ioIndex),
ioIndex,
isInput,
isInput ? *_inputAllocator : *_outputAllocator,
_graph->get_batch_size());

tensorCreatedLocally = true;

return levelZeroTensors;
}

Expand Down Expand Up @@ -555,11 +603,62 @@ void ZeroInferRequest::infer_async() {
OV_ITT_TASK_NEXT(ZERO_INFER, "memcpy");
std::memcpy(levelZeroBuffer, userBuffer, userTensor.at(SINGLE_TENSOR)->get_byte_size());
}

if (_originalAddressInputLevelZeroTensor.at(inputIndex) != levelZeroBuffer) {
_logger.debug("Update input graph descriptor with the new tensor");
_pipeline->updateCommandList(levelZeroBuffer,
levelZeroTensor->get_byte_size(),
_graph->get_input_descriptors().at(inputIndex).idx);

_originalAddressInputLevelZeroTensor.at(inputIndex) = levelZeroBuffer;
}
} else {
void* remoteLevelZeroBuffer =
extract_object(levelZeroRemoteTensor->get_properties(), ov::intel_npu::mem_handle);

if (_originalAddressInputLevelZeroTensor.at(inputIndex) != remoteLevelZeroBuffer) {
_logger.debug("Update input graph descriptor with the new remote tensor");
_pipeline->updateCommandList(remoteLevelZeroBuffer,
levelZeroRemoteTensor->get_byte_size(),
_graph->get_input_descriptors().at(inputIndex).idx);

_originalAddressInputLevelZeroTensor.at(inputIndex) = remoteLevelZeroBuffer;
}
}

++inputIndex;
}

size_t outputIndex = 0;

// If the buffer backing any level-zero output tensor changed since the
// pipeline was created (someone else allocated another tensor), patch the
// command list with the new address and remember it.
for (const auto& levelZeroTensor : _levelZeroOutputTensors) {
    auto levelZeroRemoteTensor = std::dynamic_pointer_cast<ZeroRemoteTensor>(levelZeroTensor);
    if (levelZeroRemoteTensor == nullptr) {
        if (_originalAddressOutputLevelZeroTensor.at(outputIndex) != levelZeroTensor->data()) {
            _logger.debug("Update output graph descriptor with the new tensor");
            _pipeline->updateCommandList(levelZeroTensor->data(),
                                         levelZeroTensor->get_byte_size(),
                                         _graph->get_output_descriptors().at(outputIndex).idx);

            _originalAddressOutputLevelZeroTensor.at(outputIndex) = levelZeroTensor->data();
        }
    } else {
        // Remote tensors expose their device-visible address through properties.
        void* remoteLevelZeroBuffer =
            extract_object(levelZeroRemoteTensor->get_properties(), ov::intel_npu::mem_handle);

        // BUGFIX: compare against the OUTPUT address table, not the input one.
        // The original code read _originalAddressInputLevelZeroTensor here while
        // updating the output table below, so a changed remote output buffer
        // could be missed (or re-patched on every inference).
        if (_originalAddressOutputLevelZeroTensor.at(outputIndex) != remoteLevelZeroBuffer) {
            _logger.debug("Update output graph descriptor with the new remote tensor");
            _pipeline->updateCommandList(remoteLevelZeroBuffer,
                                         levelZeroRemoteTensor->get_byte_size(),
                                         _graph->get_output_descriptors().at(outputIndex).idx);

            _originalAddressOutputLevelZeroTensor.at(outputIndex) = remoteLevelZeroBuffer;
        }
    }

    ++outputIndex;
}

OV_ITT_TASK_NEXT(ZERO_INFER, "push");
_pipeline->push();
}
Expand Down
49 changes: 46 additions & 3 deletions src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,22 @@
#include "intel_npu/utils/logger/logger.hpp"
#include "intel_npu/utils/zero/zero_api.hpp"
#include "intel_npu/utils/zero/zero_types.hpp"
#include "zero_remote_tensor.hpp"

namespace {

/// Looks up property @p p in the @p params map and converts the stored value
/// to the property's concrete type.
///
/// @param params  Property map to search (e.g. remote-tensor properties).
/// @param p       Typed property whose name is used as the lookup key.
/// @return The value converted via ov::Any::as<Type>().
/// @throws an OpenVINO exception (via OPENVINO_THROW) when the key is absent.
template <typename Type>
Type extract_object(const ov::AnyMap& params, const ov::Property<Type>& p) {
    const auto itrHandle = params.find(p.name());
    if (itrHandle == params.end()) {
        OPENVINO_THROW("No parameter ", p.name(), " found in parameters map");
    }
    // Return directly instead of staging through a default-initialized ov::Any
    // (the original constructed `ov::Any res = nullptr;` before even checking
    // that the key exists — a dead initialization on the throwing path).
    return itrHandle->second.as<Type>();
}

}  // namespace

namespace intel_npu {

Expand Down Expand Up @@ -59,25 +75,52 @@ Pipeline::Pipeline(const Config& config,
size_t ioIndex = 0;
for (const auto& desc : graph->get_input_descriptors()) {
if (inputTensorsData.at(ioIndex).size() > 1) {
graph->set_argument_value(desc.idx, inputTensorsData.at(ioIndex).at(i)->data());
void* data = nullptr;
auto remoteTensor = std::dynamic_pointer_cast<ZeroRemoteTensor>(inputTensorsData.at(ioIndex).at(i));
if (remoteTensor == nullptr) {
data = inputTensorsData.at(ioIndex).at(i)->data();

} else {
data = extract_object(remoteTensor->get_properties(), ov::intel_npu::mem_handle);
}

graph->set_argument_value(desc.idx, data);

++ioIndex;
continue;
}

void* data = nullptr;
auto remoteTensor = std::dynamic_pointer_cast<ZeroRemoteTensor>(inputTensorsData.at(ioIndex).at(0));
if (remoteTensor == nullptr) {
data = inputTensorsData.at(ioIndex).at(0)->data();

} else {
data = extract_object(remoteTensor->get_properties(), ov::intel_npu::mem_handle);
}

graph->set_argument_value(
desc.idx,
static_cast<unsigned char*>(inputTensorsData.at(ioIndex).at(0)->data()) +
static_cast<unsigned char*>(data) +
(i * inputTensorsData.at(ioIndex).at(0)->get_byte_size()) / _number_of_command_lists);

++ioIndex;
}

ioIndex = 0;
for (const auto& desc : graph->get_output_descriptors()) {
void* data = nullptr;
auto remoteTensor = std::dynamic_pointer_cast<ZeroRemoteTensor>(outputTensorsData.at(ioIndex));
if (remoteTensor == nullptr) {
data = outputTensorsData.at(ioIndex)->data();

} else {
data = extract_object(remoteTensor->get_properties(), ov::intel_npu::mem_handle);
}

graph->set_argument_value(
desc.idx,
static_cast<unsigned char*>(outputTensorsData.at(ioIndex)->data()) +
static_cast<unsigned char*>(data) +
(i * outputTensorsData.at(ioIndex)->get_byte_size()) / _number_of_command_lists);
++ioIndex;
}
Expand Down
Loading

0 comments on commit dbe899b

Please sign in to comment.