Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NVIDIA] Add Multi-graph Feature to NVIDIA Plugin #710

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
2f0468c
[NVIDIA] Change CudaGraphTopologyRunner/SubGraph to aggregation from …
Aug 25, 2023
df7a759
[NVIDIA] Add new constructor to SubGraph
Aug 25, 2023
fcc6096
[NVIDIA] Update CudaGraphContext to use vectors
Aug 25, 2023
590db80
[NVIDIA] Extract inputs/outputs to TensorMappingContext
Aug 25, 2023
05e13e7
[NVIDIA] Update SubGraph to use direct exec_sequence_ and std::shared…
Aug 25, 2023
9c0ed5d
[NVIDIA] Add CudaGraphInfo and update CudaGraphContext to use them
Aug 25, 2023
2ff31ec
[NVIDIA] Fix single-graph tests
Aug 25, 2023
b035b80
[NVIDIA] Add CudaMultiGraphTest test
Aug 25, 2023
6af3486
[NVIDIA] Add SubGraph::IsCudaGraphCompatible() cache
Aug 25, 2023
cfba412
[NVIDIA] Add execute_sequence/capture_sequence member functions to Pr…
Aug 25, 2023
e305986
[NVIDIA] Enable TensorIterator to use Profiler::execute_sequence()
Aug 25, 2023
26703db
[NVIDIA] Enable SubGraph to use Profiler::execute/capture_sequence()
Aug 25, 2023
39baa30
[NVIDIA] Extract ITopologyRunner into the separate header
Aug 25, 2023
a493f44
[NVIDIA] Update tests to include cuda_eager_topology_runner.hpp
Aug 25, 2023
8277039
[NVIDIA] Add IExecutionDelegator
Aug 25, 2023
3e4a839
[NVIDIA] Add cuda_perf_counts.hpp
Aug 25, 2023
3d820b1
[NVIDIA] Add SimpleExecutionDelegator class and use it when profiling…
Aug 25, 2023
d326be2
[NVIDIA] Update tests to use SimpleExecutionDelegator
Aug 25, 2023
5d27fc7
[NVIDIA] Add updateExecSequence() to TensorIteratorOp
Aug 25, 2023
9395549
[NVIDIA] Update TensorIteratorOp::IsCudaGraphCompatible() to use SubG…
Aug 25, 2023
93c99d2
[NVIDIA] Add rebase fixes
Aug 28, 2023
1d59b6e
[NVIDIA] Add comment fixes
Aug 28, 2023
6f5a407
[NVIDIA] Rename functions to correspond to OV coding style
Aug 28, 2023
7cf15dd
[NVIDIA] Add number_of_cuda_graphs property
Aug 29, 2023
8f32a6b
[NVIDIA] Fix and update SimpleExecutionDelegator
Aug 30, 2023
cbba769
[NVIDIA] Fix build error on some configurations
Aug 31, 2023
5699b60
[NVIDIA] Temporary disable CUDA graph compatibility for TensorIterator
Aug 31, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions modules/nvidia_plugin/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,9 @@ Please refer to OpenVINO documentation for details.

All parameters must be set before calling `ov::Core::compile_model()` in order to take effect.

### Plugin specific properties
* `ov::nvidia_gpu::number_of_cuda_graphs` - Read-only property showing the number of CUDA Graphs used for the current model

## Compile options

During compilation of the openvino_nvidia_gpu_plugin, user could specify the following options:
Expand Down
7 changes: 6 additions & 1 deletion modules/nvidia_plugin/include/nvidia/properties.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,12 @@ static constexpr Property<bool, PropertyMutability::RW> operation_benchmark{"NVI
/**
* @brief Specifies if NVIDIA plugin attempts to use CUDA Graph feature to speed up sequential network inferences
*/
static constexpr ov::Property<bool, ov::PropertyMutability::RW> use_cuda_graph{"NVIDIA_USE_CUDA_GRAPH"};
static constexpr Property<bool, PropertyMutability::RW> use_cuda_graph{"NVIDIA_USE_CUDA_GRAPH"};

/**
* @brief Read-only property showing number of used CUDA Graphs
*/
static constexpr Property<size_t, PropertyMutability::RO> number_of_cuda_graphs{"NVIDIA_NUMBER_OF_CUDA_GRAPHS"};

} // namespace nvidia_gpu
} // namespace ov
21 changes: 14 additions & 7 deletions modules/nvidia_plugin/src/cuda/graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,22 @@ cudaGraph_t Graph::createNativeWithFlags(unsigned int flags) {
return g;
}

bool operator==(const Graph &rhs, const Graph &lhs) { return rhs.get() == lhs.get(); }

GraphExec::GraphExec(const Graph &g)
#if !defined(NDEBUG) || defined(_DEBUG)
try
try
#endif
:
Handle(cudaGraphInstantiate, cudaGraphExecDestroy, g.get(), static_cast<cudaGraphNode_t*>(nullptr),
: Handle(cudaGraphInstantiate,
cudaGraphExecDestroy,
g.get(),
static_cast<cudaGraphNode_t *>(nullptr),
#if !defined(NDEBUG) || defined(_DEBUG)
errorMsg_, kErrorStringLen)
errorMsg_,
kErrorStringLen)
#else
static_cast<char*>(nullptr), static_cast<size_t>(0ul))
static_cast<char *>(nullptr),
static_cast<size_t>(0ul))
#endif
{
}
Expand All @@ -56,8 +62,9 @@ void GraphExec::launch(const Stream &stream) const {
throwIfError(cudaGraphLaunch(get(), stream.get()));
}

GraphCapture::GraphCaptureScope::GraphCaptureScope(GraphCapture &graphCapture) :
graphCapture_ { graphCapture } {
bool operator==(const GraphExec &lhs, const GraphExec &rhs) { return rhs.get() == lhs.get(); }

GraphCapture::GraphCaptureScope::GraphCaptureScope(GraphCapture &graphCapture) : graphCapture_{graphCapture} {
throwIfError(cudaStreamBeginCapture(graphCapture_.stream_.get(), cudaStreamCaptureModeThreadLocal));
}

Expand Down
6 changes: 6 additions & 0 deletions modules/nvidia_plugin/src/cuda/graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ class Graph: public Handle<cudaGraph_t> {
public:
Graph(unsigned int flags);

friend bool operator==(const Graph& lhs, const Graph& rhs);

friend GraphCapture;

private:
Expand All @@ -26,6 +28,7 @@ class Graph: public Handle<cudaGraph_t> {
static cudaGraph_t createNativeWithFlags(unsigned int flags);
};

bool operator==(const Graph& rhs, const Graph& lhs);

class GraphExec: public Handle<cudaGraphExec_t> {
public:
Expand All @@ -35,13 +38,16 @@ class GraphExec: public Handle<cudaGraphExec_t> {

void launch(const Stream& stream) const;

friend bool operator==(const GraphExec& lhs, const GraphExec& rhs);

#if !defined(NDEBUG) || defined(_DEBUG)
private:
static constexpr std::size_t kErrorStringLen = 1024;
char errorMsg_[kErrorStringLen];
#endif
};

bool operator==(const GraphExec& lhs, const GraphExec& rhs);

class GraphCapture {
public:
Expand Down
21 changes: 12 additions & 9 deletions modules/nvidia_plugin/src/cuda_compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@
#include <utility>

#include "cuda_compiled_model.hpp"
#include "cuda_eager_topology_runner.hpp"
#include "cuda_graph_topology_runner.hpp"
#include "cuda_itt.hpp"
#include "cuda_operation_registry.hpp"
#include "cuda_perf_counts.hpp"
#include "cuda_plugin.hpp"
#include "memory_manager/cuda_immutable_memory_block_builder.hpp"
#include "memory_manager/cuda_memory_manager.hpp"
Expand Down Expand Up @@ -55,7 +57,8 @@ CompiledModel::CompiledModel(const std::shared_ptr<const ov::Model>& model,
cuda_stream_executor_(std::move(wait_executor)),
loaded_from_cache_(loaded_from_cache),
use_cuda_graph_{get_property(ov::nvidia_gpu::use_cuda_graph.name()).as<bool>() &&
!get_property(ov::enable_profiling.name()).as<bool>()} {
!get_property(ov::enable_profiling.name()).as<bool>()},
number_of_cuda_graphs_{0} {
try {
compile_model(model);
init_executor(); // creates thread-based executor using for async requests
Expand Down Expand Up @@ -129,13 +132,9 @@ void CompiledModel::compile_model(const std::shared_ptr<const ov::Model>& model)
const auto creationContext = CreationContext{device, opBenchOption};

if (use_cuda_graph_) {
try {
topology_runner_ = std::make_unique<CudaGraphTopologyRunner>(creationContext, model_);
// TODO: Add CudaGraphTopologyRunner validation
} catch (const CudaGraphTopologyRunner::CudaGraphIncompatible&) {
topology_runner_ = std::make_unique<EagerTopologyRunner>(creationContext, model_);
use_cuda_graph_ = false;
}
auto cudaGraphTopologyRunner = std::make_unique<CudaGraphTopologyRunner>(creationContext, model_);
number_of_cuda_graphs_ = cudaGraphTopologyRunner->GetCudaGraphsCount();
topology_runner_ = std::move(cudaGraphTopologyRunner);
} else {
topology_runner_ = std::make_unique<EagerTopologyRunner>(creationContext, model_);
}
Expand Down Expand Up @@ -256,7 +255,7 @@ size_t CompiledModel::get_optimal_number_of_streams(size_t const_blob_size,
}

std::shared_ptr<MemoryPool> CompiledModel::create_memory_pool() {
const auto& memory_manager = topology_runner_->GetSubGraph().memoryManager();
const auto& memory_manager = *(topology_runner_->GetSubGraph().memoryManager());
const auto const_blob_size = memory_manager.immutableTensors().memoryModel()->deviceMemoryBlockSize();
const auto immutable_work_buffers_size = memory_manager.immutableWorkbuffers().memoryModel()->deviceMemoryBlockSize();
const auto& memory_model = memory_manager.mutableTensorsMemoryModel();
Expand Down Expand Up @@ -306,6 +305,8 @@ ov::Any CompiledModel::get_property(const std::string& name) const {
supported_properties.push_back(
ov::PropertyName(ov::optimal_number_of_infer_requests.name(), PropertyMutability::RO));
supported_properties.push_back(ov::PropertyName(ov::loaded_from_cache.name(), PropertyMutability::RO));
supported_properties.push_back(ov::PropertyName(ov::nvidia_gpu::number_of_cuda_graphs.name(),
PropertyMutability::RO));
auto rw_properties = config_.get_rw_properties();
for (auto& rw_property : rw_properties)
supported_properties.emplace_back(ov::PropertyName(rw_property, PropertyMutability::RO));
Expand Down Expand Up @@ -333,6 +334,8 @@ ov::Any CompiledModel::get_property(const std::string& name) const {
return decltype(ov::execution_devices)::value_type{get_plugin()->get_device_name() + "." + std::to_string(config_.get_device_id())};
} else if (ov::loaded_from_cache == name) {
return decltype(ov::loaded_from_cache)::value_type{loaded_from_cache_};
} else if (ov::nvidia_gpu::number_of_cuda_graphs == name) {
return decltype(ov::nvidia_gpu::number_of_cuda_graphs)::value_type{number_of_cuda_graphs_};
} else {
return config_.get(name);
}
Expand Down
8 changes: 4 additions & 4 deletions modules/nvidia_plugin/src/cuda_compiled_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,17 @@

#pragma once

#include "openvino/runtime/icompiled_model.hpp"
#include "openvino/runtime/threading/itask_executor.hpp"

#include "cuda_async_infer_request.hpp"
#include "cuda_config.hpp"
#include "cuda_eager_topology_runner.hpp"
#include "cuda_infer_request.hpp"
#include "cuda_itopology_runner.hpp"
#include "cuda_op_buffers_extractor.hpp"
#include "memory_manager/cuda_device_mem_block.hpp"
#include "memory_manager/cuda_memory_manager.hpp"
#include "memory_manager/cuda_memory_pool.hpp"
#include "memory_manager/model/cuda_memory_model.hpp"
#include "openvino/runtime/icompiled_model.hpp"
#include "openvino/runtime/threading/itask_executor.hpp"
#include "ops/subgraph.hpp"

namespace ov {
Expand Down Expand Up @@ -78,6 +77,7 @@ class CompiledModel : public ov::ICompiledModel {
std::shared_ptr<MemoryPool> memory_pool_;
const bool loaded_from_cache_;
bool use_cuda_graph_;
size_t number_of_cuda_graphs_;
};

} // namespace nvidia_gpu
Expand Down
10 changes: 2 additions & 8 deletions modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp
Original file line number Diff line number Diff line change
@@ -1,21 +1,15 @@
// Copyright (C) 2018-2021 Intel Corporation
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <ops/subgraph.hpp>
#include "cuda_itopology_runner.hpp"

namespace ov {
namespace nvidia_gpu {

struct ITopologyRunner {
virtual void Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0;
virtual void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0;
virtual const SubGraph& GetSubGraph() const = 0;
virtual ~ITopologyRunner() = default;
};

class EagerTopologyRunner final : public SubGraph, public ITopologyRunner {
public:
EagerTopologyRunner(const CreationContext& context, const std::shared_ptr<const ov::Model>& model);
Expand Down
132 changes: 132 additions & 0 deletions modules/nvidia_plugin/src/cuda_graph_context.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "cuda_graph_context.hpp"

namespace ov {
namespace nvidia_gpu {

// Discards every captured CUDA graph and rewinds the insertion cursor,
// returning the context to its freshly constructed state.
void CudaGraphContext::reset() {
    currentGraphIndex_ = 0;
    graphs_.clear();
}

// Opens a slot for the next CUDA graph: a default-constructed CudaGraphInfo is
// appended and the cursor is pointed at it, so subsequent add_parameter()/
// add_result()/add_graph() calls target the new slot.
void CudaGraphContext::start_next_graph_addition() {
    const auto newIndex = graphs_.size();
    graphs_.emplace_back();
    currentGraphIndex_ = newIndex;
}

// Records a host-to-device upload node (a Parameter tensor) into the graph slot
// currently being captured, keyed by tensor name so the source pointer can be
// re-bound later via update_capture().
// Fix: corrected the typo "incosistency" in the assertion message.
void CudaGraphContext::add_parameter(const std::string& tensorName,
                                     const CUDA::Stream& stream,
                                     CUDA::DevicePointer<void*> dst,
                                     const void* src,
                                     std::size_t size) {
    OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size inconsistency");
    graphs_[currentGraphIndex_].add_parameter(tensorName, stream, dst, src, size);
}

// Records a device-to-host download node (a Result tensor) into the graph slot
// currently being captured, keyed by tensor name so the destination pointer can
// be re-bound later via update_capture().
// Fix: corrected the typo "incosistency" in the assertion message.
void CudaGraphContext::add_result(const std::string& tensorName,
                                  const CUDA::Stream& stream,
                                  void* dst,
                                  CUDA::DevicePointer<const void*> src,
                                  std::size_t size) {
    OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size inconsistency");
    graphs_[currentGraphIndex_].add_result(tensorName, stream, dst, src, size);
}

// Stores the captured graph into the current slot; CudaGraphInfo::set_graph()
// also instantiates the executable form of the graph.
// Fix: corrected the typo "incosistency" in the assertion message.
void CudaGraphContext::add_graph(const CUDA::Graph& graph) {
    OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size inconsistency");
    graphs_[currentGraphIndex_].set_graph(graph);
}

// The context counts as initialized once the most recently added graph slot
// holds both a captured graph and its instantiated executable.
bool CudaGraphContext::is_initialized() const {
    if (graphs_.empty()) {
        return false;
    }
    return graphs_.back().is_initialized();
}

// Re-binds every graph's parameter/result nodes to the tensor addresses of the
// given mapping context (tensor buffers may change between inference requests).
// Fix: removed GitHub review-UI text ("nkogteva marked this conversation as
// resolved. / Show resolved Hide resolved") that had been pasted into the loop
// body and would not compile.
void CudaGraphContext::update_capture(const TensorMappingContext& context) {
    for (currentGraphIndex_ = 0; currentGraphIndex_ < graphs_.size(); ++currentGraphIndex_) {
        graphs_[currentGraphIndex_].update_capture(context);
    }
}

// Launches the index-th captured graph on the given stream and remembers the
// index in currentGraphIndex_ (presumably declared mutable, since this member
// function is const — confirm in the header).
// Fix: corrected the typo "incosistency" in the assertion message.
void CudaGraphContext::launch(std::size_t index, const CUDA::Stream& stream) const {
    currentGraphIndex_ = index;
    OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size inconsistency");
    graphs_[currentGraphIndex_].launch(stream);
}

std::size_t CudaGraphContext::get_params_count() const {
std::size_t res = 0;
for (const auto& graph : graphs_) {
res += graph.get_params_count();
}
return res;
}

std::size_t CudaGraphContext::get_results_count() const {
std::size_t res = 0;
for (const auto& graph : graphs_) {
res += graph.get_results_count();
}
return res;
}

std::size_t CudaGraphContext::get_graphs_count() const {
    // One entry per captured CUDA graph; backs the number_of_cuda_graphs property.
    return graphs_.size();
}

// Captures a host-to-device upload as a graph node and indexes it by tensor
// name, so the source pointer can later be patched without re-capturing.
void CudaGraphContext::CudaGraphInfo::add_parameter(const std::string& tensorName,
                                                    const CUDA::Stream& stream,
                                                    CUDA::DevicePointer<void*> dst,
                                                    const void* src,
                                                    std::size_t size) {
    CUDA::CaptureInfo capture{stream};
    parameterNodes_.emplace(tensorName, capture.addUploadNode(dst, src, size));
}

// Captures a device-to-host download as a graph node and indexes it by tensor
// name, so the destination pointer can later be patched without re-capturing.
void CudaGraphContext::CudaGraphInfo::add_result(const std::string& tensorName,
                                                 const CUDA::Stream& stream,
                                                 void* dst,
                                                 CUDA::DevicePointer<const void*> src,
                                                 std::size_t size) {
    CUDA::CaptureInfo capture{stream};
    resultNodes_.emplace(tensorName, capture.addDownloadNode(dst, src, size));
}

void CudaGraphContext::CudaGraphInfo::set_graph(const CUDA::Graph& graph) {
    // Keep a copy of the captured graph and instantiate its executable form;
    // after this call is_initialized() returns true.
    graph_.emplace(graph);
    graphExec_.emplace(graph);
}

bool CudaGraphContext::CudaGraphInfo::is_initialized() const {
    // Both the captured graph and its instantiated executable must be present.
    return graph_.has_value() && graphExec_.has_value();
}

// Patches the recorded upload/download nodes with the tensor addresses from
// the given mapping context. value() throws std::bad_optional_access if
// set_graph() was never called.
void CudaGraphContext::CudaGraphInfo::update_capture(const TensorMappingContext& context) {
    // Re-bind upload nodes to the current input tensor data pointers...
    for (auto&& [name, uploadNode] : parameterNodes_) {
        uploadNode.update_src(graphExec_.value(), context.get_input_tensor(name)->data());
    }
    // ...and download nodes to the current output tensor data pointers.
    for (auto&& [name, downloadNode] : resultNodes_) {
        downloadNode.update_dst(graphExec_.value(), context.get_output_tensor(name)->data());
    }
}

void CudaGraphContext::CudaGraphInfo::launch(const CUDA::Stream& stream) const {
    // value() throws std::bad_optional_access if set_graph() was never called.
    graphExec_.value().launch(stream);
}

std::size_t CudaGraphContext::CudaGraphInfo::get_params_count() const {
    // Number of upload (Parameter) nodes recorded for this graph.
    return parameterNodes_.size();
}

std::size_t CudaGraphContext::CudaGraphInfo::get_results_count() const {
    // Number of download (Result) nodes recorded for this graph.
    return resultNodes_.size();
}

// Member-wise equality of two graph slots. NOTE(review): CUDA::Graph/GraphExec
// equality compares raw handles (see operator== in cuda/graph.cpp), so two
// separately captured but otherwise identical graphs compare unequal — confirm
// this handle-identity semantics is what callers expect.
bool operator==(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs) {
return lhs.graph_ == rhs.graph_ && lhs.graphExec_ == rhs.graphExec_ && lhs.parameterNodes_ == rhs.parameterNodes_ &&
lhs.resultNodes_ == rhs.resultNodes_;
}

bool operator!=(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs) {
    // Defined via operator== so the two relations can never disagree.
    return !(lhs == rhs);
}

// Two contexts are equal iff their graph collections compare equal element-wise.
bool operator==(const CudaGraphContext& lhs, const CudaGraphContext& rhs) {
    return lhs.graphs_ == rhs.graphs_;
}

bool operator!=(const CudaGraphContext& lhs, const CudaGraphContext& rhs) {
    // Negation of element-wise equality.
    return !(lhs == rhs);
}

} // namespace nvidia_gpu
} // namespace ov
Loading
Loading