diff --git a/modules/nvidia_plugin/README.md b/modules/nvidia_plugin/README.md index 6d1cc9356..3cdbd8dbb 100644 --- a/modules/nvidia_plugin/README.md +++ b/modules/nvidia_plugin/README.md @@ -179,6 +179,9 @@ Please refer to OpenVINO documentation for details. All parameters must be set before calling `ov::Core::compile_model()` in order to take effect. +### Plugin-specific properties +* `ov::nvidia_gpu::number_of_cuda_graphs` - Read-only property showing the number of CUDA Graphs used for the current model + ## Compile options During compilation of the openvino_nvidia_gpu_plugin, user could specify the following options: diff --git a/modules/nvidia_plugin/include/nvidia/properties.hpp b/modules/nvidia_plugin/include/nvidia/properties.hpp index 685843e82..601ea892d 100644 --- a/modules/nvidia_plugin/include/nvidia/properties.hpp +++ b/modules/nvidia_plugin/include/nvidia/properties.hpp @@ -27,7 +27,12 @@ static constexpr Property operation_benchmark{"NVI /** * @brief Specifies if NVIDIA plugin attempts to use CUDA Graph feature to speed up sequential network inferences */ -static constexpr ov::Property use_cuda_graph{"NVIDIA_USE_CUDA_GRAPH"}; +static constexpr Property use_cuda_graph{"NVIDIA_USE_CUDA_GRAPH"}; + +/** + * @brief Read-only property showing the number of CUDA Graphs used for the current model + */ +static constexpr Property number_of_cuda_graphs{"NVIDIA_NUMBER_OF_CUDA_GRAPHS"}; } // namespace nvidia_gpu } // namespace ov diff --git a/modules/nvidia_plugin/src/cuda/graph.cpp b/modules/nvidia_plugin/src/cuda/graph.cpp index 4e9976d6b..4e307d3ae 100644 --- a/modules/nvidia_plugin/src/cuda/graph.cpp +++ b/modules/nvidia_plugin/src/cuda/graph.cpp @@ -27,16 +27,22 @@ cudaGraph_t Graph::createNativeWithFlags(unsigned int flags) { return g; } +bool operator==(const Graph &rhs, const Graph &lhs) { return rhs.get() == lhs.get(); } + GraphExec::GraphExec(const Graph &g) #if !defined(NDEBUG) || defined(_DEBUG) -try + try #endif -: -Handle(cudaGraphInstantiate, cudaGraphExecDestroy, g.get(), static_cast(nullptr), + : Handle(cudaGraphInstantiate, + cudaGraphExecDestroy, + g.get(), + static_cast(nullptr), #if !defined(NDEBUG) || defined(_DEBUG) - errorMsg_, kErrorStringLen) + errorMsg_, + kErrorStringLen) #else - static_cast(nullptr), static_cast(0ul)) + static_cast(nullptr), + static_cast(0ul)) #endif { } @@ -56,8 +62,9 @@ void GraphExec::launch(const Stream &stream) const { throwIfError(cudaGraphLaunch(get(), stream.get())); } -GraphCapture::GraphCaptureScope::GraphCaptureScope(GraphCapture &graphCapture) : - graphCapture_ { graphCapture } { +bool operator==(const GraphExec &lhs, const GraphExec &rhs) { return rhs.get() == lhs.get(); } + +GraphCapture::GraphCaptureScope::GraphCaptureScope(GraphCapture &graphCapture) : graphCapture_{graphCapture} { throwIfError(cudaStreamBeginCapture(graphCapture_.stream_.get(), cudaStreamCaptureModeThreadLocal)); } diff --git a/modules/nvidia_plugin/src/cuda/graph.hpp b/modules/nvidia_plugin/src/cuda/graph.hpp index dfb0de4eb..220c79820 100644 --- a/modules/nvidia_plugin/src/cuda/graph.hpp +++ b/modules/nvidia_plugin/src/cuda/graph.hpp @@ -16,6 +16,8 @@ class Graph: public Handle { public: Graph(unsigned int flags); + friend bool operator==(const Graph& lhs, const Graph& rhs); + friend GraphCapture; private: @@ -26,6 +28,7 @@ class Graph: public Handle { static cudaGraph_t createNativeWithFlags(unsigned int flags); }; +bool operator==(const Graph& rhs, const Graph& lhs); class GraphExec: public Handle { public: @@ -35,6 +38,8 @@ class GraphExec: public Handle { void launch(const 
Stream& stream) const; + friend bool operator==(const GraphExec& lhs, const GraphExec& rhs); + #if !defined(NDEBUG) || defined(_DEBUG) private: static constexpr std::size_t kErrorStringLen = 1024; @@ -42,6 +47,7 @@ class GraphExec: public Handle { #endif }; +bool operator==(const GraphExec& lhs, const GraphExec& rhs); class GraphCapture { public: diff --git a/modules/nvidia_plugin/src/cuda_compiled_model.cpp b/modules/nvidia_plugin/src/cuda_compiled_model.cpp index af728506e..f0e139991 100644 --- a/modules/nvidia_plugin/src/cuda_compiled_model.cpp +++ b/modules/nvidia_plugin/src/cuda_compiled_model.cpp @@ -15,9 +15,11 @@ #include #include "cuda_compiled_model.hpp" +#include "cuda_eager_topology_runner.hpp" #include "cuda_graph_topology_runner.hpp" #include "cuda_itt.hpp" #include "cuda_operation_registry.hpp" +#include "cuda_perf_counts.hpp" #include "cuda_plugin.hpp" #include "memory_manager/cuda_immutable_memory_block_builder.hpp" #include "memory_manager/cuda_memory_manager.hpp" @@ -55,7 +57,8 @@ CompiledModel::CompiledModel(const std::shared_ptr& model, cuda_stream_executor_(std::move(wait_executor)), loaded_from_cache_(loaded_from_cache), use_cuda_graph_{get_property(ov::nvidia_gpu::use_cuda_graph.name()).as() && - !get_property(ov::enable_profiling.name()).as()} { + !get_property(ov::enable_profiling.name()).as()}, + number_of_cuda_graphs_{0} { try { compile_model(model); init_executor(); // creates thread-based executor using for async requests @@ -129,13 +132,9 @@ void CompiledModel::compile_model(const std::shared_ptr& model) const auto creationContext = CreationContext{device, opBenchOption}; if (use_cuda_graph_) { - try { - topology_runner_ = std::make_unique(creationContext, model_); - // TODO: Add CudaGraphTopologyRunner validation - } catch (const CudaGraphTopologyRunner::CudaGraphIncompatible&) { - topology_runner_ = std::make_unique(creationContext, model_); - use_cuda_graph_ = false; - } + auto cudaGraphTopologyRunner = std::make_unique(creationContext, model_); + number_of_cuda_graphs_ = cudaGraphTopologyRunner->GetCudaGraphsCount(); + topology_runner_ = std::move(cudaGraphTopologyRunner); } else { topology_runner_ = std::make_unique(creationContext, model_); } @@ -256,7 +255,7 @@ size_t CompiledModel::get_optimal_number_of_streams(size_t const_blob_size, } std::shared_ptr CompiledModel::create_memory_pool() { - const auto& memory_manager = topology_runner_->GetSubGraph().memoryManager(); + const auto& memory_manager = *(topology_runner_->GetSubGraph().memoryManager()); const auto const_blob_size = memory_manager.immutableTensors().memoryModel()->deviceMemoryBlockSize(); const auto immutable_work_buffers_size = memory_manager.immutableWorkbuffers().memoryModel()->deviceMemoryBlockSize(); const auto& memory_model = memory_manager.mutableTensorsMemoryModel(); @@ -306,6 +305,8 @@ ov::Any CompiledModel::get_property(const std::string& name) const { supported_properties.push_back( ov::PropertyName(ov::optimal_number_of_infer_requests.name(), PropertyMutability::RO)); supported_properties.push_back(ov::PropertyName(ov::loaded_from_cache.name(), PropertyMutability::RO)); + supported_properties.push_back(ov::PropertyName(ov::nvidia_gpu::number_of_cuda_graphs.name(), + PropertyMutability::RO)); auto rw_properties = config_.get_rw_properties(); for (auto& rw_property : rw_properties) supported_properties.emplace_back(ov::PropertyName(rw_property, PropertyMutability::RO)); @@ -333,6 +334,8 @@ ov::Any CompiledModel::get_property(const std::string& name) const { return 
decltype(ov::execution_devices)::value_type{get_plugin()->get_device_name() + "." + std::to_string(config_.get_device_id())}; } else if (ov::loaded_from_cache == name) { return decltype(ov::loaded_from_cache)::value_type{loaded_from_cache_}; + } else if (ov::nvidia_gpu::number_of_cuda_graphs == name) { + return decltype(ov::nvidia_gpu::number_of_cuda_graphs)::value_type{number_of_cuda_graphs_}; } else { return config_.get(name); } diff --git a/modules/nvidia_plugin/src/cuda_compiled_model.hpp b/modules/nvidia_plugin/src/cuda_compiled_model.hpp index 1ad8fa6ef..804704d5b 100644 --- a/modules/nvidia_plugin/src/cuda_compiled_model.hpp +++ b/modules/nvidia_plugin/src/cuda_compiled_model.hpp @@ -4,18 +4,17 @@ #pragma once -#include "openvino/runtime/icompiled_model.hpp" -#include "openvino/runtime/threading/itask_executor.hpp" - #include "cuda_async_infer_request.hpp" #include "cuda_config.hpp" -#include "cuda_eager_topology_runner.hpp" #include "cuda_infer_request.hpp" +#include "cuda_itopology_runner.hpp" #include "cuda_op_buffers_extractor.hpp" #include "memory_manager/cuda_device_mem_block.hpp" #include "memory_manager/cuda_memory_manager.hpp" #include "memory_manager/cuda_memory_pool.hpp" #include "memory_manager/model/cuda_memory_model.hpp" +#include "openvino/runtime/icompiled_model.hpp" +#include "openvino/runtime/threading/itask_executor.hpp" #include "ops/subgraph.hpp" namespace ov { @@ -78,6 +77,7 @@ class CompiledModel : public ov::ICompiledModel { std::shared_ptr memory_pool_; const bool loaded_from_cache_; bool use_cuda_graph_; + size_t number_of_cuda_graphs_; }; } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp b/modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp index 8b795cee4..a5419d2c2 100644 --- a/modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp +++ b/modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp @@ -1,21 +1,15 @@ -// Copyright (C) 2018-2021 Intel Corporation +// Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once #include +#include "cuda_itopology_runner.hpp" namespace ov { namespace nvidia_gpu { -struct ITopologyRunner { - virtual void Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0; - virtual void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0; - virtual const SubGraph& GetSubGraph() const = 0; - virtual ~ITopologyRunner() = default; -}; - class EagerTopologyRunner final : public SubGraph, public ITopologyRunner { public: EagerTopologyRunner(const CreationContext& context, const std::shared_ptr& model); diff --git a/modules/nvidia_plugin/src/cuda_graph_context.cpp b/modules/nvidia_plugin/src/cuda_graph_context.cpp new file mode 100644 index 000000000..e1f9e2487 --- /dev/null +++ b/modules/nvidia_plugin/src/cuda_graph_context.cpp @@ -0,0 +1,132 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "cuda_graph_context.hpp" + +namespace ov { +namespace nvidia_gpu { + +void CudaGraphContext::reset() { + graphs_.clear(); + currentGraphIndex_ = 0; +} + +void CudaGraphContext::start_next_graph_addition() { + currentGraphIndex_ = graphs_.size(); + graphs_.emplace_back(); +} + +void CudaGraphContext::add_parameter(const std::string& tensorName, + const CUDA::Stream& stream, + CUDA::DevicePointer dst, + const void* src, + std::size_t size) { + OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size 
inconsistency"); + graphs_[currentGraphIndex_].add_parameter(tensorName, stream, dst, src, size); +} + +void CudaGraphContext::add_result(const std::string& tensorName, + const CUDA::Stream& stream, + void* dst, + CUDA::DevicePointer src, + std::size_t size) { + OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size inconsistency"); + graphs_[currentGraphIndex_].add_result(tensorName, stream, dst, src, size); +} + +void CudaGraphContext::add_graph(const CUDA::Graph& graph) { + OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size inconsistency"); + graphs_[currentGraphIndex_].set_graph(graph); +} + +bool CudaGraphContext::is_initialized() const { + const auto size = graphs_.size(); + return size != 0 && graphs_[size - 1].is_initialized(); +} + +void CudaGraphContext::update_capture(const TensorMappingContext& context) { + for (currentGraphIndex_ = 0; currentGraphIndex_ < graphs_.size(); ++currentGraphIndex_) { + graphs_[currentGraphIndex_].update_capture(context); + } +} + +void CudaGraphContext::launch(std::size_t index, const CUDA::Stream& stream) const { + currentGraphIndex_ = index; + OPENVINO_ASSERT(currentGraphIndex_ < graphs_.size(), "Graph index/vector size inconsistency"); + graphs_[currentGraphIndex_].launch(stream); +} + +std::size_t CudaGraphContext::get_params_count() const { + std::size_t res = 0; + for (const auto& graph : graphs_) { + res += graph.get_params_count(); + } + return res; +} + +std::size_t CudaGraphContext::get_results_count() const { + std::size_t res = 0; + for (const auto& graph : graphs_) { + res += graph.get_results_count(); + } + return res; +} + +std::size_t CudaGraphContext::get_graphs_count() const { return graphs_.size(); } + +void CudaGraphContext::CudaGraphInfo::add_parameter(const std::string& tensorName, + const CUDA::Stream& stream, + CUDA::DevicePointer dst, + const void* src, + std::size_t size) { + CUDA::CaptureInfo captureInfo{stream}; + parameterNodes_.emplace(tensorName, captureInfo.addUploadNode(dst, src, size)); +} + +void CudaGraphContext::CudaGraphInfo::add_result(const std::string& tensorName, + const CUDA::Stream& stream, + void* dst, + CUDA::DevicePointer src, + std::size_t size) { + CUDA::CaptureInfo captureInfo{stream}; + resultNodes_.emplace(tensorName, captureInfo.addDownloadNode(dst, src, size)); +} + +void CudaGraphContext::CudaGraphInfo::set_graph(const CUDA::Graph& graph) { + graph_.emplace(graph); + graphExec_.emplace(graph); +} + +bool CudaGraphContext::CudaGraphInfo::is_initialized() const { return graph_.has_value() && graphExec_.has_value(); } + +void CudaGraphContext::CudaGraphInfo::update_capture(const TensorMappingContext& context) { + for (auto&& [tensorName, node] : parameterNodes_) { + node.update_src(graphExec_.value(), (context.get_input_tensor(tensorName)->data())); + } + for (auto&& [tensorName, node] : resultNodes_) { + node.update_dst(graphExec_.value(), context.get_output_tensor(tensorName)->data()); + } +} + +void CudaGraphContext::CudaGraphInfo::launch(const CUDA::Stream& stream) const { graphExec_.value().launch(stream); } + +std::size_t CudaGraphContext::CudaGraphInfo::get_params_count() const { return parameterNodes_.size(); } + +std::size_t CudaGraphContext::CudaGraphInfo::get_results_count() const { return resultNodes_.size(); } + +bool operator==(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs) { + return lhs.graph_ == rhs.graph_ && lhs.graphExec_ == rhs.graphExec_ && lhs.parameterNodes_ == rhs.parameterNodes_ && + 
lhs.resultNodes_ == rhs.resultNodes_; +} + +bool operator!=(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs) { + return !(lhs == rhs); +} + +bool operator==(const CudaGraphContext& lhs, const CudaGraphContext& rhs) { return lhs.graphs_ == rhs.graphs_; } + +bool operator!=(const CudaGraphContext& lhs, const CudaGraphContext& rhs) { return !(lhs == rhs); } + +} // namespace nvidia_gpu +} // namespace ov diff --git a/modules/nvidia_plugin/src/cuda_graph_context.hpp b/modules/nvidia_plugin/src/cuda_graph_context.hpp index c62505662..682e18e70 100644 --- a/modules/nvidia_plugin/src/cuda_graph_context.hpp +++ b/modules/nvidia_plugin/src/cuda_graph_context.hpp @@ -5,18 +5,94 @@ #pragma once #include -#include -#include +#include "cuda_tensor_mapping_context.hpp" namespace ov { namespace nvidia_gpu { -struct CudaGraphContext { - std::optional graphExec{}; - std::optional graph{}; - std::map parameterNodes; - std::map resultNodes; +class CudaGraphContext { +public: + void reset(); + + void start_next_graph_addition(); + + void add_parameter(const std::string& tensorName, + const CUDA::Stream& stream, + CUDA::DevicePointer dst, + const void* src, + std::size_t size); + + void add_result(const std::string& tensorName, + const CUDA::Stream& stream, + void* dst, + CUDA::DevicePointer src, + std::size_t size); + + void add_graph(const CUDA::Graph& graph); + + bool is_initialized() const; + + void update_capture(const TensorMappingContext& context); + + void launch(std::size_t index, const CUDA::Stream& stream) const; + + std::size_t get_params_count() const; + std::size_t get_results_count() const; + std::size_t get_graphs_count() const; + + friend bool operator==(const CudaGraphContext& lhs, const CudaGraphContext& rhs); + friend bool operator!=(const CudaGraphContext& lhs, const CudaGraphContext& rhs); + +private: + class CudaGraphInfo { + public: + void add_parameter(const std::string& tensorName, + const CUDA::Stream& stream, + CUDA::DevicePointer dst, + const void* src, + std::size_t size); + + void add_result(const std::string& tensorName, + const CUDA::Stream& stream, + void* dst, + CUDA::DevicePointer src, + std::size_t size); + + void set_graph(const CUDA::Graph& graph); + + bool is_initialized() const; + + void update_capture(const TensorMappingContext& context); + + void launch(const CUDA::Stream& stream) const; + + std::size_t get_params_count() const; + std::size_t get_results_count() const; + + friend bool operator==(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); + friend bool operator!=(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); + + private: + std::optional graph_{}; + std::optional graphExec_{}; + std::map parameterNodes_; + std::map resultNodes_; + }; + + friend bool operator==(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); + friend bool operator!=(const CudaGraphInfo& lhs, const CudaGraphInfo& rhs); + + std::vector graphs_{}; + mutable std::size_t currentGraphIndex_ = 0; }; -} // namespace nvidia_gpu -} // namespace ov +bool operator==(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs); + +bool operator!=(const CudaGraphContext::CudaGraphInfo& lhs, const CudaGraphContext::CudaGraphInfo& rhs); + +bool operator==(const CudaGraphContext& lhs, const CudaGraphContext& rhs); + +bool operator!=(const CudaGraphContext& lhs, const CudaGraphContext& rhs); + +} // namespace nvidia_gpu +} // namespace ov diff --git a/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp 
b/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp index 501817c33..1e851ec41 100644 --- a/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp +++ b/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp @@ -3,56 +3,99 @@ // #include "cuda_graph_topology_runner.hpp" -#include "cuda/graph.hpp" + #include "cuda/event.hpp" -#include "cuda_profiler.hpp" namespace ov { namespace nvidia_gpu { -CudaGraphTopologyRunner::CudaGraphTopologyRunner(const CreationContext& context, const std::shared_ptr& model) - : SubGraph(context, model) { - if (!IsCudaGraphCompatible()) - throw CudaGraphIncompatible{"The topology is incompatible with CUDA graphs."}; +CudaGraphTopologyRunner::CudaGraphTopologyRunner(const CreationContext& context, + const std::shared_ptr& model) + : orig_subgraph_{context, model}, + cuda_graphs_count_{0} { + std::vector sequences; + SubGraph::ExecSequence currentSequence; + const auto& origSequence = orig_subgraph_.getExecSequence(); + const auto totalSize = origSequence.size(); + OPENVINO_ASSERT(totalSize != 0, "ExecSequence size is 0"); + + bool isLastOpCompatible = origSequence[0]->IsCudaGraphCompatible(); + currentSequence.push_back(origSequence[0]); + for (size_t i = 1; i < totalSize; ++i) { + const auto& op = origSequence[i]; + if (op->IsCudaGraphCompatible() != isLastOpCompatible) { + isLastOpCompatible = !isLastOpCompatible; + sequences.emplace_back(std::move(currentSequence)); + currentSequence.clear(); + } + currentSequence.push_back(op); + } + sequences.emplace_back(std::move(currentSequence)); + + const auto& memoryManager = orig_subgraph_.memoryManager(); + for (auto&& sequence : sequences) { + subgraphs_.emplace_back(context, model, std::move(sequence), memoryManager); + if (subgraphs_[subgraphs_.size() - 1].IsCudaGraphCompatible()) { + ++cuda_graphs_count_; + } + } } void CudaGraphTopologyRunner::Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const { - context.getCudaGraphContext().graphExec.value().launch(context.getThreadContext().stream()); + const auto& stream = context.getThreadContext().stream(); + std::size_t graphIndex = 0; + for (auto& subgraph : subgraphs_) { + if (subgraph.IsCudaGraphCompatible()) { + context.getCudaGraphContext().launch(graphIndex, stream); + graphIndex++; + } else { + Workbuffers workbuffers{}; + workbuffers.mutable_buffers.emplace_back(memoryBlock.view().data()); + subgraph.Execute(context, {}, {}, workbuffers); + } + } } -void CudaGraphTopologyRunner::Capture(InferenceRequestContext &context, - const DeviceMemBlock &memoryBlock) const { - CUDA::GraphCapture capture{context.getThreadContext().stream()}; - { - auto scope = capture.getScope(); - context.getProfiler().set_cuda_event_record_mode(CUDA::Event::RecordMode::External); - Workbuffers workbuffers{}; - workbuffers.mutable_buffers.emplace_back(memoryBlock.view().data()); - SubGraph::Capture(context, {}, {}, workbuffers); +void CudaGraphTopologyRunner::Capture(InferenceRequestContext& context, + const DeviceMemBlock& memoryBlock) const { + const auto& stream = context.getThreadContext().stream(); + auto& graphContext = context.getCudaGraphContext(); + + graphContext.reset(); + for (const auto& subgraph : subgraphs_) { + if (subgraph.IsCudaGraphCompatible()) { + graphContext.start_next_graph_addition(); + CUDA::GraphCapture capture{stream}; + { + auto scope = capture.getScope(); + Workbuffers workbuffers{}; + workbuffers.mutable_buffers.emplace_back(memoryBlock.view().data()); + subgraph.Capture(context, {}, {}, workbuffers); + } + const 
auto& graph = capture.getGraph(); + graphContext.add_graph(graph); + } } - const auto& graph = capture.getGraph(); - context.getCudaGraphContext().graph.emplace(graph); - context.getCudaGraphContext().graphExec.emplace(graph); + OPENVINO_ASSERT(graphContext.get_graphs_count() == GetCudaGraphsCount(), + "CudaGraphTopologyRunner/CudaGraphContext graphs count mismatch"); } const SubGraph& CudaGraphTopologyRunner::GetSubGraph() const { - return *this; + return orig_subgraph_; } -void CudaGraphTopologyRunner::UpdateContext(InferenceRequestContext &context, const DeviceMemBlock &memoryBlock) const { - if (context.getCudaGraphContext().graphExec) +std::size_t CudaGraphTopologyRunner::GetCudaGraphsCount() const { return cuda_graphs_count_; } + +void CudaGraphTopologyRunner::UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const { + if (context.getCudaGraphContext().is_initialized()) { UpdateCapture(context); - else + } else { Capture(context, memoryBlock); + } } -void CudaGraphTopologyRunner::UpdateCapture(InferenceRequestContext &context) const { - CudaGraphContext& graphContext = context.getCudaGraphContext(); - for (auto& pair : graphContext.parameterNodes) - pair.second.update_src(graphContext.graphExec.value(), - const_cast(context.get_input_tensor(pair.first)->data())); - for (auto& pair : graphContext.resultNodes) - pair.second.update_dst(graphContext.graphExec.value(), context.get_output_tensor(pair.first)->data()); +void CudaGraphTopologyRunner::UpdateCapture(InferenceRequestContext& context) const { + context.getCudaGraphContext().update_capture(context.getTensorMappingContext()); } } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/cuda_graph_topology_runner.hpp b/modules/nvidia_plugin/src/cuda_graph_topology_runner.hpp index a58dcef6f..8e7cd1b85 100644 --- a/modules/nvidia_plugin/src/cuda_graph_topology_runner.hpp +++ b/modules/nvidia_plugin/src/cuda_graph_topology_runner.hpp @@ -4,17 +4,13 @@ #pragma once -#include "cuda_eager_topology_runner.hpp" +#include "cuda_itopology_runner.hpp" namespace ov { namespace nvidia_gpu { -class CudaGraphTopologyRunner final : public SubGraph, public ITopologyRunner { +class CudaGraphTopologyRunner final : public ITopologyRunner { public: - struct CudaGraphIncompatible : public std::runtime_error { - using std::runtime_error::runtime_error; - }; - CudaGraphTopologyRunner(const CreationContext& context, const std::shared_ptr& model); ~CudaGraphTopologyRunner() override = default; @@ -22,9 +18,15 @@ class CudaGraphTopologyRunner final : public SubGraph, public ITopologyRunner { void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override; const SubGraph& GetSubGraph() const override; + std::size_t GetCudaGraphsCount() const; + private: void Capture(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const; void UpdateCapture(InferenceRequestContext& context) const; + + std::vector subgraphs_; + SubGraph orig_subgraph_; + std::size_t cuda_graphs_count_; }; } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/cuda_iexecution_delegator.hpp b/modules/nvidia_plugin/src/cuda_iexecution_delegator.hpp new file mode 100644 index 000000000..7147da7d3 --- /dev/null +++ b/modules/nvidia_plugin/src/cuda_iexecution_delegator.hpp @@ -0,0 +1,86 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +#include "openvino/runtime/profiling_info.hpp" +#include 
"ops/subgraph.hpp" + +namespace ov { +namespace nvidia_gpu { + +/** + * Interface for Profiler class or other Delegators + */ +class IExecutionDelegator { +public: + /** + * Virtual destructor for the interface + */ + virtual ~IExecutionDelegator() = default; + + /** + * Start time measurement of stage + */ + virtual void set_stream(const CUDA::Stream& stream) = 0; + + /** + * Start time measurement of stage + */ + virtual void start_stage() = 0; + + /** + * Stop time measurement of stage + * @param stage Stage for which time measurement was performed + */ + virtual void stop_stage(PerfStages stage) = 0; + + /** + * Execute sequence from SubGraph/TensorIterator class + * @param subGraphPtr Pointer to SubGraph + * @param memoryManager Reference to MemoryManager + * @param buffer Reference to orkbuffers::mutable_buffer + * @param context Reference to InferenceRequestContext + */ + virtual void execute_sequence(const SubGraph* subGraphPtr, + const MemoryManager& memoryManager, + const Workbuffers::mutable_buffer& buffer, + const InferenceRequestContext& context) = 0; + + /** + * Capture sequence from SubGraph/TensorIterator class + * @param subGraphPtr Pointer to SubGraph + * @param memoryManager Reference to MemoryManager + * @param buffer Reference to orkbuffers::mutable_buffer + * @param context Reference to InferenceRequestContext + */ + virtual void capture_sequence(const SubGraph* subGraphPtr, + const MemoryManager& memoryManager, + const Workbuffers::mutable_buffer& buffer, + InferenceRequestContext& context) = 0; + + /** + * Returns performance counters + * @return Performance counters + */ + virtual const std::vector get_performance_counts() const = 0; + + /** + * Processes performance events into performance counters + */ + virtual void process_events() = 0; + + /** + * Set CUDA event record mode + * @param mode Value of CUDA::Event::RecordMode to set + */ + virtual void set_cuda_event_record_mode(CUDA::Event::RecordMode mode) = 0; +}; + +} // namespace nvidia_gpu +} // namespace ov diff --git a/modules/nvidia_plugin/src/cuda_infer_request.cpp b/modules/nvidia_plugin/src/cuda_infer_request.cpp index 455b537aa..0f37f2c68 100644 --- a/modules/nvidia_plugin/src/cuda_infer_request.cpp +++ b/modules/nvidia_plugin/src/cuda_infer_request.cpp @@ -21,6 +21,8 @@ #include "cuda_graph_topology_runner.hpp" #include "cuda_itt.hpp" #include "cuda_plugin.hpp" +#include "cuda_profiler.hpp" +#include "cuda_simple_execution_delegator.hpp" #include "nvidia/properties.hpp" #include "openvino/runtime/make_tensor.hpp" @@ -38,13 +40,23 @@ void allocate_tensor_impl(ov::SoPtr& tensor, const ov::element::Typ tensor->set_shape(shape); } } + +inline std::unique_ptr create_execution_delegator(bool is_profiling_enabled, + const SubGraph& subGraph) { + if (is_profiling_enabled) { + return std::make_unique(subGraph); + } + return std::make_unique(); +} + } // namespace CudaInferRequest::CudaInferRequest(const std::shared_ptr& compiled_model) : ov::ISyncInferRequest(compiled_model), cancellation_token_{[this] { memory_proxy_.reset(); }}, - profiler_{compiled_model->get_property(ov::enable_profiling.name()).as(), - compiled_model->get_topology_runner().GetSubGraph()}, + executionDelegator_{ + create_execution_delegator(compiled_model->get_property(ov::enable_profiling.name()).as(), + compiled_model->get_topology_runner().GetSubGraph())}, is_benchmark_mode_{compiled_model->get_property(ov::nvidia_gpu::operation_benchmark.name()).as()} { create_infer_request(); } @@ -86,8 +98,8 @@ void 
CudaInferRequest::create_infer_request() { } void CudaInferRequest::infer_preprocess() { - OV_ITT_SCOPED_TASK(itt::domains::nvidia_gpu, _profilingTask[Profiler::Preprocess]); - profiler_.start_stage(); + OV_ITT_SCOPED_TASK(itt::domains::nvidia_gpu, _profilingTask[PerfStages::Preprocess]); + executionDelegator_->start_stage(); convert_batched_tensors(); check_tensors(); @@ -133,13 +145,13 @@ void CudaInferRequest::infer_preprocess() { else output_tensors_.at(i) = std::make_shared(element_type, shape); } - profiler_.stop_stage(Profiler::Preprocess); + executionDelegator_->stop_stage(PerfStages::Preprocess); } void CudaInferRequest::start_pipeline(const ThreadContext& threadContext) { try { - OV_ITT_SCOPED_TASK(itt::domains::nvidia_gpu, _profilingTask[Profiler::StartPipeline]) - profiler_.start_stage(); + OV_ITT_SCOPED_TASK(itt::domains::nvidia_gpu, _profilingTask[PerfStages::StartPipeline]) + executionDelegator_->start_stage(); auto compiled_model = get_nvidia_model(); memory_proxy_ = compiled_model->memory_pool_->WaitAndGet(cancellation_token_); auto& memory = memory_proxy_->Get(); @@ -151,12 +163,12 @@ void CudaInferRequest::start_pipeline(const ThreadContext& threadContext) { compiled_model->output_index_, threadContext, cancellation_token_, - profiler_, + *executionDelegator_, cudaGraphContext, is_benchmark_mode_}; topology_runner.UpdateContext(inferRequestContext, memory); topology_runner.Run(inferRequestContext, memory); - profiler_.stop_stage(Profiler::StartPipeline); + executionDelegator_->stop_stage(PerfStages::StartPipeline); } catch (...) { // TODO: // Log error once logger is available @@ -166,17 +178,17 @@ void CudaInferRequest::start_pipeline(const ThreadContext& threadContext) { } void CudaInferRequest::wait_pipeline(const ThreadContext& threadContext) { - OV_ITT_SCOPED_TASK(itt::domains::nvidia_gpu, _profilingTask[Profiler::WaitPipeline]) - profiler_.start_stage(); + OV_ITT_SCOPED_TASK(itt::domains::nvidia_gpu, _profilingTask[PerfStages::WaitPipeline]) + executionDelegator_->start_stage(); // TODO: probably all time will be spent in synchonize, out of reach of ThrowIfCanceled threadContext.stream().synchronize(); memory_proxy_.reset(); - profiler_.stop_stage(Profiler::WaitPipeline); + executionDelegator_->stop_stage(PerfStages::WaitPipeline); } void CudaInferRequest::infer_postprocess() { - OV_ITT_SCOPED_TASK(itt::domains::nvidia_gpu, _profilingTask[Profiler::Postprocess]); - profiler_.start_stage(); + OV_ITT_SCOPED_TASK(itt::domains::nvidia_gpu, _profilingTask[PerfStages::Postprocess]); + executionDelegator_->start_stage(); OPENVINO_ASSERT(get_outputs().size() == output_tensors_.size()); OPENVINO_ASSERT(get_outputs().size() == get_nvidia_model()->model_->get_results().size()); @@ -197,8 +209,8 @@ void CudaInferRequest::infer_postprocess() { OPENVINO_ASSERT(true, "NVIDIA plugin doesn't support RemoteTensor."); } } - profiler_.stop_stage(Profiler::Postprocess); - profiler_.process_events(); + executionDelegator_->stop_stage(PerfStages::Postprocess); + executionDelegator_->process_events(); } void CudaInferRequest::cancel() { @@ -233,7 +245,7 @@ std::vector> CudaInferRequest::query_state() const } std::vector CudaInferRequest::get_profiling_info() const { - return profiler_.get_performance_counts(); + return executionDelegator_->get_performance_counts(); } } // namespace nvidia_gpu } // namespace ov diff --git a/modules/nvidia_plugin/src/cuda_infer_request.hpp b/modules/nvidia_plugin/src/cuda_infer_request.hpp index 8175afd3c..0010620f9 100644 --- 
a/modules/nvidia_plugin/src/cuda_infer_request.hpp +++ b/modules/nvidia_plugin/src/cuda_infer_request.hpp @@ -22,7 +22,7 @@ #include "cancellation_token.hpp" #include "cuda_config.hpp" #include "cuda_operation_base.hpp" -#include "cuda_profiler.hpp" +#include "cuda_iexecution_delegator.hpp" #include "memory_manager/cuda_memory_manager.hpp" #include "memory_manager/cuda_memory_pool.hpp" #include "utils/perf_timing.hpp" @@ -58,10 +58,10 @@ class CudaInferRequest : public ov::ISyncInferRequest { std::shared_ptr get_nvidia_model(); void create_infer_request(); - std::array _profilingTask; + std::array(PerfStages::NumOfStages)> _profilingTask; std::optional memory_proxy_; CancellationToken cancellation_token_; - Profiler profiler_; + std::unique_ptr executionDelegator_; std::vector> input_tensors_; std::vector> output_tensors_; bool is_benchmark_mode_; diff --git a/modules/nvidia_plugin/src/cuda_inference_request_context.hpp b/modules/nvidia_plugin/src/cuda_inference_request_context.hpp index 7103040ef..6eea4f1e6 100644 --- a/modules/nvidia_plugin/src/cuda_inference_request_context.hpp +++ b/modules/nvidia_plugin/src/cuda_inference_request_context.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2021 Intel Corporation +// Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -7,13 +7,14 @@ #include #include "cancellation_token.hpp" +#include "cuda_tensor_mapping_context.hpp" #include "cuda_thread_context.hpp" #include "cuda_graph_context.hpp" namespace ov { namespace nvidia_gpu { -class Profiler; +class IExecutionDelegator; class InferenceRequestContext { public: @@ -29,18 +30,16 @@ class InferenceRequestContext { const std::map& outputMapping, const ThreadContext& threadContext, CancellationToken& token, - Profiler& profiler, + IExecutionDelegator& executionDelegator, CudaGraphContext& cudaGraphContext, bool isBenchmarkMode = false) : threadContext{threadContext}, token{token}, - profiler{profiler}, - blob_inputs{inputs}, - inputs_mapping{inputMapping}, - blob_outputs{outputs}, - outputs_mapping{outputMapping}, + executionDelegator{executionDelegator}, + tensor_mapping_context_{inputs, inputMapping, outputs, outputMapping}, cuda_graph_context_{cudaGraphContext}, is_benchmark_mode_{isBenchmarkMode} {} + // don't allow storing references to temporary template InferenceRequestContext(std::vector>&& inputs, @@ -48,51 +47,26 @@ class InferenceRequestContext { std::vector>&& outputs, std::map&& outputMapping, Args... 
args) = delete; + InferenceRequestContext(std::vector>&& inputs, std::map&& inputMapping, std::vector>&& outputs, std::map&& outputMapping, const ThreadContext& threadContext) = delete; - /** - * @brief get_input_tensor(name) returns an tensor blob with the given name - */ - std::shared_ptr get_input_tensor(const std::string& input_name) const { - return blob_inputs.at(inputs_mapping.at(input_name)); - } - /** - * @brief get_output_tensor(name) returns an output tensor with the given name - */ - std::shared_ptr get_output_tensor(const std::string& output_name) const { - return blob_outputs.at(outputs_mapping.at(output_name)); - } - /** - * @brief has_input_tensor(name) returns true if it contains an input tensor with the given name - */ - bool has_input_tensor(const std::string& input_name) const noexcept { - return inputs_mapping.find(input_name) != inputs_mapping.end(); - } - /** - * @brief has_output_tensor(name) returns true if contains an output tensor with the given name - */ - bool has_output_tensor(const std::string& output_name) const noexcept { - return outputs_mapping.find(output_name) != outputs_mapping.end(); - } const ThreadContext& getThreadContext() const noexcept { return threadContext; } [[nodiscard]] ov::nvidia_gpu::CancellationToken& getCancellationToken() const noexcept { return token; } - [[nodiscard]] Profiler& getProfiler() const noexcept { return profiler; } + [[nodiscard]] IExecutionDelegator& getExecutionDelegator() const noexcept { return executionDelegator; } [[nodiscard]] bool isBenchmarkMode() const noexcept { return is_benchmark_mode_; } + [[nodiscard]] const TensorMappingContext& getTensorMappingContext() const { return tensor_mapping_context_; } [[nodiscard]] const CudaGraphContext& getCudaGraphContext() const { return cuda_graph_context_; } [[nodiscard]] CudaGraphContext& getCudaGraphContext() { return cuda_graph_context_; } private: const ThreadContext& threadContext; CancellationToken& token; - Profiler& profiler; - const std::vector>& blob_inputs; - const std::map& inputs_mapping; - const std::vector>& blob_outputs; - const std::map& outputs_mapping; + IExecutionDelegator& executionDelegator; + const TensorMappingContext tensor_mapping_context_; CudaGraphContext& cuda_graph_context_; bool is_benchmark_mode_; }; diff --git a/modules/nvidia_plugin/src/cuda_itopology_runner.hpp b/modules/nvidia_plugin/src/cuda_itopology_runner.hpp new file mode 100644 index 000000000..04cb61bbc --- /dev/null +++ b/modules/nvidia_plugin/src/cuda_itopology_runner.hpp @@ -0,0 +1,20 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ov { +namespace nvidia_gpu { + +struct ITopologyRunner { + virtual void Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0; + virtual void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0; + virtual const SubGraph& GetSubGraph() const = 0; + virtual ~ITopologyRunner() = default; +}; + +} // namespace nvidia_gpu +} // namespace ov diff --git a/modules/nvidia_plugin/src/cuda_perf_counts.hpp b/modules/nvidia_plugin/src/cuda_perf_counts.hpp new file mode 100644 index 000000000..755419056 --- /dev/null +++ b/modules/nvidia_plugin/src/cuda_perf_counts.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ov { +namespace nvidia_gpu { + +static const char PERF_COUNTER_NAME[] = 
"nvidia_perf_counter"; + +enum class PerfStages { Preprocess, Postprocess, StartPipeline, WaitPipeline, NumOfStages }; + +struct PerfCounts { + std::chrono::microseconds total_duration; + uint32_t num; + std::string impl_type; + std::string runtime_precision; + + PerfCounts() : total_duration{0}, num(0) {} + + uint64_t average() const { return (num == 0) ? 0 : total_duration.count() / num; } +}; + +} // namespace nvidia_gpu +} // namespace ov diff --git a/modules/nvidia_plugin/src/cuda_profiler.cpp b/modules/nvidia_plugin/src/cuda_profiler.cpp index a9fe3cbdf..be8a3a61c 100644 --- a/modules/nvidia_plugin/src/cuda_profiler.cpp +++ b/modules/nvidia_plugin/src/cuda_profiler.cpp @@ -34,21 +34,19 @@ std::pair make_profile_info(std::string stage_na } } // namespace -Profiler::Profiler(bool perfCount, const SubGraph& graph) : perf_count_{perfCount} { +Profiler::Profiler(const SubGraph& graph) { std::vector execSequence; collect_subgraphs(graph, execSequence); - if (perf_count_) { - for (size_t i = 0; i < execSequence.size(); ++i) { - auto& op = *execSequence[i]; - perf_counters_.emplace(make_profile_info(op)); - execution_order_.push_back(op.GetName()); - } + for (size_t i = 0; i < execSequence.size(); ++i) { + auto& op = *execSequence[i]; + perf_counters_.emplace(make_profile_info(op)); + execution_order_.push_back(op.GetName()); } } void Profiler::process_events() { - if (!perf_count_ || infer_count_ == 0) return; + if (infer_count_ == 0) return; auto ms_to_us = [](float timing) { return std::chrono::duration_cast(std::chrono::duration{timing}); }; @@ -109,19 +107,44 @@ void Profiler::process_events() { auto result_ms = result_timing == layer_timing.cend() ? zero_time : time_per_infer_us(result_timing->second); // Adding some overall performance counters - auto stage_time_ms = [this](const Stages& stage) { - return std::chrono::microseconds(static_cast(durations_[stage].count())); + auto stage_time_ms = [this](PerfStages stage) { + const auto i = static_cast(stage); + return std::chrono::microseconds(static_cast(durations_[i].count())); }; auto insert_stage = [&](const std::pair& value) { auto const result = stage_counters_.insert(value); if (!result.second) { result.first->second = value.second; } }; - insert_stage(make_profile_info("1. input preprocessing", zero_time, stage_time_ms(Preprocess))); + insert_stage(make_profile_info("1. input preprocessing", zero_time, stage_time_ms(PerfStages::Preprocess))); insert_stage(make_profile_info("2. input transfer to a device", parameter_ms)); - insert_stage(make_profile_info("3. execution time", time_per_infer_us(exec_timing_.measure()), stage_time_ms(StartPipeline))); + insert_stage(make_profile_info("3. execution time", time_per_infer_us(exec_timing_.measure()), stage_time_ms(PerfStages::StartPipeline))); insert_stage(make_profile_info("4. output transfer from a device", result_ms)); - insert_stage(make_profile_info("5. output postprocessing", zero_time, stage_time_ms(Postprocess))); + insert_stage(make_profile_info("5. 
output postprocessing", zero_time, stage_time_ms(PerfStages::Postprocess))); +} + +void Profiler::execute_sequence(const SubGraph* subGraphPtr, + const MemoryManager& memoryManager, + const Workbuffers::mutable_buffer& buffer, + const InferenceRequestContext& context) { + for (const auto& op : create_exec_sequence(subGraphPtr)) { + const auto& inTensors = memoryManager.inputTensorPointers(*op, buffer); + const auto& outTensors = memoryManager.outputTensorPointers(*op, buffer); + const auto& workBuffers = memoryManager.workBuffers(*op, buffer); + op->execute(context, inTensors, outTensors, workBuffers); + } +} + +void Profiler::capture_sequence(const SubGraph* subGraphPtr, + const MemoryManager& memoryManager, + const Workbuffers::mutable_buffer& buffer, + InferenceRequestContext& context) { + for (const auto& op : create_exec_sequence(subGraphPtr)) { + const auto& inputTensors = memoryManager.inputTensorPointers(*op, buffer); + const auto& outputTensors = memoryManager.outputTensorPointers(*op, buffer); + const auto& workBuffers = memoryManager.workBuffers(*op, buffer); + op->capture(context, inputTensors, outputTensors, workBuffers); + } } Profiler::ProfilerSequence Profiler::create_exec_sequence(const SubGraph* subGraphPtr) { @@ -144,17 +167,6 @@ void Profiler::collect_subgraphs(const SubGraph& graph, std::vector& allExecSequence) { - std::vector perfSteps; - const auto& execSequence = graph.getExecSequence(); - for (const auto& execStep : execSequence) { - if (!dynamic_cast(execStep.get()) && !dynamic_cast(execStep.get())) { - collect_node_visitor(execStep, perfSteps, allExecSequence); - } - } - subgraph_perf_steps_map_.emplace_back(&graph, std::move(perfSteps)); -} - void Profiler::collect_node_visitor(const OperationBase::Ptr& execStep, std::vector& perfSteps, std::vector& allExecSequence) { diff --git a/modules/nvidia_plugin/src/cuda_profiler.hpp b/modules/nvidia_plugin/src/cuda_profiler.hpp index 60a48eb32..5eaa61dd0 100644 --- a/modules/nvidia_plugin/src/cuda_profiler.hpp +++ b/modules/nvidia_plugin/src/cuda_profiler.hpp @@ -1,47 +1,22 @@ -// Copyright (C) 2018-2021 Intel Corporation +// Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include -#include - -#include "openvino/runtime/profiling_info.hpp" -#include "openvino/runtime/exec_model_info.hpp" - #include #include -#include -#include "cuda_eager_topology_runner.hpp" -#include "cuda_operation_base.hpp" +#include "cuda_iexecution_delegator.hpp" namespace ov { namespace nvidia_gpu { -static const char PERF_COUNTER_NAME[] = "nvidia_perf_counter"; - -struct PerfCounts { - std::chrono::microseconds total_duration; - uint32_t num; - std::string impl_type; - std::string runtime_precision; - - PerfCounts() : total_duration{0}, num(0) {} - - uint64_t average() const { - return (num == 0) ? 0 : total_duration.count() / num; - } -}; - /** * Creates profiler sequence and stores profiler results. 
*/ -class Profiler { +class Profiler : public IExecutionDelegator { public: - enum Stages { Preprocess, Postprocess, StartPipeline, WaitPipeline, NumOfStages }; - using PerformaceCounters = std::map; using Duration = std::chrono::duration; using Time = std::chrono::steady_clock; @@ -53,59 +28,85 @@ class Profiler { * Constructor of Profiler class * @param perfCount Option that indicates if performance counters are enabled */ - explicit Profiler(bool perfCount, const SubGraph& graph); + explicit Profiler(const SubGraph& graph); /** * Start time measurement of stage */ - void set_stream(const CUDA::Stream& stream) { active_stream_ = &stream; } + void set_stream(const CUDA::Stream& stream) override { active_stream_ = &stream; } /** * Start time measurement of stage */ - void start_stage() { start_ = Time::now(); } + void start_stage() override { start_ = Time::now(); } /** * Stop time measurement of stage * @param stage Stage for which time measurement was performed */ - void stop_stage(Stages stage) { durations_[stage] = Time::now() - start_; } + void stop_stage(PerfStages stage) override { durations_[static_cast(stage)] = Time::now() - start_; } /** - * Creates profiler sequence and increase infer request counter - * @return ProfilerSequence for single InferRequest + * Execute sequence from SubGraph/TensorIterator class + * @param subGraphPtr Pointer to SubGraph + * @param memoryManager Reference to MemoryManager + * @param buffer Reference to orkbuffers::mutable_buffer + * @param context Reference to InferenceRequestContext */ - Profiler::ProfilerSequence create_exec_sequence(const SubGraph* subGraphPtr); + void execute_sequence(const SubGraph* subGraphPtr, + const MemoryManager& memoryManager, + const Workbuffers::mutable_buffer& buffer, + const InferenceRequestContext& context) override; + + /** + * Capture sequence from SubGraph/TensorIterator class + * @param subGraphPtr Pointer to SubGraph + * @param memoryManager Reference to MemoryManager + * @param buffer Reference to orkbuffers::mutable_buffer + * @param context Reference to InferenceRequestContext + */ + void capture_sequence(const SubGraph* subGraphPtr, + const MemoryManager& memoryManager, + const Workbuffers::mutable_buffer& buffer, + InferenceRequestContext& context) override; /** * Returns performance counters * @return Performance counters */ - [[nodiscard]] const std::vector get_performance_counts() const; + [[nodiscard]] const std::vector get_performance_counts() const override; /** * Processes performance events into performance counters */ - void process_events(); + void process_events() override; - void set_cuda_event_record_mode(CUDA::Event::RecordMode mode) { cuda_event_record_mode_ = mode; } + /** + * Set CUDA event record mode + * @param mode Value of CUDA::Event::RecordMode to set + */ + void set_cuda_event_record_mode(CUDA::Event::RecordMode mode) override { cuda_event_record_mode_ = mode; } private: + /** + * Creates profiler sequence and increase infer request counter + * @return ProfilerSequence for single InferRequest + */ + Profiler::ProfilerSequence create_exec_sequence(const SubGraph* subGraphPtr); + void collect_subgraphs(const SubGraph& graph, std::vector& vector); - void collect_subgraphs(const TensorIteratorOp& graph, std::vector& allExecSequence); void collect_node_visitor(const OperationBase::Ptr& execStep, std::vector& perfSteps, std::vector& allExecSequence); const CUDA::Stream* active_stream_ = nullptr; - const bool perf_count_; std::vector>> subgraph_perf_steps_map_; PerformaceCounters 
perf_counters_{}; PerformaceCounters stage_counters_{}; std::vector execution_order_{}; utils::PerformaceTiming exec_timing_{}; // for performance counters - std::array durations_; + std::array(PerfStages::NumOfStages)> durations_; Time::time_point start_{}; size_t infer_count_{}; CUDA::Event::RecordMode cuda_event_record_mode_ {CUDA::Event::RecordMode::Default}; @@ -127,24 +128,16 @@ class Profiler::ProfileExecStep { */ template void execute(TArgs&&... args) const { - if (this->profiler_.perf_count_) { - timing_.setStart(*this->profiler_.active_stream_, profiler_.cuda_event_record_mode_); - exec_step_.Execute(std::forward(args)...); - timing_.setStop(*this->profiler_.active_stream_, profiler_.cuda_event_record_mode_); - } else { - exec_step_.Execute(std::forward(args)...); - } + timing_.setStart(*this->profiler_.active_stream_, profiler_.cuda_event_record_mode_); + exec_step_.Execute(std::forward(args)...); + timing_.setStop(*this->profiler_.active_stream_, profiler_.cuda_event_record_mode_); } template void capture(TArgs&&... args) const { - if (this->profiler_.perf_count_) { - timing_.setStart(*this->profiler_.active_stream_, profiler_.cuda_event_record_mode_); - exec_step_.Capture(std::forward(args)...); - timing_.setStop(*this->profiler_.active_stream_, profiler_.cuda_event_record_mode_); - } else { - exec_step_.Capture(std::forward(args)...); - } + timing_.setStart(*this->profiler_.active_stream_, profiler_.cuda_event_record_mode_); + exec_step_.Capture(std::forward(args)...); + timing_.setStop(*this->profiler_.active_stream_, profiler_.cuda_event_record_mode_); } /** @@ -197,9 +190,7 @@ class Profiler::ProfilerSequence { * @param stream CUDA stream */ ProfilerSequence(Profiler& profiler, size_t index) : profiler_{profiler}, index_{index} { - if (profiler_.perf_count_) { - profiler_.exec_timing_.setStart(*profiler_.active_stream_, profiler.cuda_event_record_mode_); - } + profiler_.exec_timing_.setStart(*profiler_.active_stream_, profiler.cuda_event_record_mode_); } /** @@ -207,9 +198,7 @@ class Profiler::ProfilerSequence { * Stops time measurement */ ~ProfilerSequence() { - if (profiler_.perf_count_) { - profiler_.exec_timing_.setStop(*profiler_.active_stream_, profiler_.cuda_event_record_mode_); - } + profiler_.exec_timing_.setStop(*profiler_.active_stream_, profiler_.cuda_event_record_mode_); } /** diff --git a/modules/nvidia_plugin/src/cuda_simple_execution_delegator.hpp b/modules/nvidia_plugin/src/cuda_simple_execution_delegator.hpp new file mode 100644 index 000000000..97d174b76 --- /dev/null +++ b/modules/nvidia_plugin/src/cuda_simple_execution_delegator.hpp @@ -0,0 +1,98 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +#include "cuda_iexecution_delegator.hpp" + +namespace ov { +namespace nvidia_gpu { + +/** + * Basic implementaion for IExecutionDelegator interface + */ +class SimpleExecutionDelegator : public IExecutionDelegator { +public: + /** + * Constructor of SimpleExecutionDelegator class + */ + SimpleExecutionDelegator() = default; + + /** + * Dummy set_stream implementation + */ + void set_stream(const CUDA::Stream& stream) override{}; + + /** + * Dummy start_stage implementation + */ + void start_stage() override {} + + /** + * Dummy stop_stage implementation + */ + virtual void stop_stage(PerfStages stage) override{}; + + /** + * Execute sequence from SubGraph/TensorIterator class + * @param subGraphPtr Pointer to SubGraph + * @param memoryManager Reference to MemoryManager 
+ * @param buffer Reference to Workbuffers::mutable_buffer + * @param context Reference to InferenceRequestContext + */ + virtual void execute_sequence(const SubGraph* subGraphPtr, + const MemoryManager& memoryManager, + const Workbuffers::mutable_buffer& buffer, + const InferenceRequestContext& context) override { + for (auto& op : subGraphPtr->getExecSequence()) { + const auto& inputTensors = memoryManager.inputTensorPointers(*op, buffer); + const auto& outputTensors = memoryManager.outputTensorPointers(*op, buffer); + const auto& workBuffers = memoryManager.workBuffers(*op, buffer); + op->Execute(context, inputTensors, outputTensors, workBuffers); + } + }; + + /** + * Capture sequence from SubGraph/TensorIterator class + * @param subGraphPtr Pointer to SubGraph + * @param memoryManager Reference to MemoryManager + * @param buffer Reference to Workbuffers::mutable_buffer + * @param context Reference to InferenceRequestContext + */ + virtual void capture_sequence(const SubGraph* subGraphPtr, + const MemoryManager& memoryManager, + const Workbuffers::mutable_buffer& buffer, + InferenceRequestContext& context) override { + for (auto& op : subGraphPtr->getExecSequence()) { + const auto& inputTensors = memoryManager.inputTensorPointers(*op, buffer); + const auto& outputTensors = memoryManager.outputTensorPointers(*op, buffer); + const auto& workBuffers = memoryManager.workBuffers(*op, buffer); + op->Capture(context, inputTensors, outputTensors, workBuffers); + } + }; + + /** + * Dummy get_performance_counts implementation + */ + virtual const std::vector get_performance_counts() const override { + return std::vector{}; + }; + + /** + * Dummy process_events implementation + */ + virtual void process_events() override{}; + + /** + * Dummy set_cuda_event_record_mode implementation + */ + virtual void set_cuda_event_record_mode(CUDA::Event::RecordMode mode) override{}; +}; + +} // namespace nvidia_gpu +} // namespace ov diff --git a/modules/nvidia_plugin/src/cuda_tensor_mapping_context.hpp b/modules/nvidia_plugin/src/cuda_tensor_mapping_context.hpp new file mode 100644 index 000000000..6260ce53b --- /dev/null +++ b/modules/nvidia_plugin/src/cuda_tensor_mapping_context.hpp @@ -0,0 +1,55 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/runtime/tensor.hpp" + +namespace ov { +namespace nvidia_gpu { + +class TensorMappingContext { + using TensorVec = std::vector>; + using MappingMap = std::map; + +public: + TensorMappingContext(const TensorVec& inputs, + const MappingMap& inputMapping, + const TensorVec& outputs, + const MappingMap& outputMapping) + : blob_inputs{inputs}, inputs_mapping{inputMapping}, blob_outputs{outputs}, outputs_mapping{outputMapping} {} + /** + * @brief get_input_tensor(name) returns a tensor blob with the given name + */ + inline std::shared_ptr get_input_tensor(const std::string& input_name) const { + return blob_inputs.at(inputs_mapping.at(input_name)); + } + /** + * @brief get_output_tensor(name) returns an output tensor with the given name + */ + inline std::shared_ptr get_output_tensor(const std::string& output_name) const { + return blob_outputs.at(outputs_mapping.at(output_name)); + } + /** + * @brief has_input_tensor(name) returns true if it contains an input tensor with the given name + */ + inline bool has_input_tensor(const std::string& input_name) const noexcept { + return inputs_mapping.find(input_name) != inputs_mapping.end(); + } + /** + * @brief has_output_tensor(name) returns true if it 
contains an output tensor with the given name + */ + inline bool has_output_tensor(const std::string& output_name) const noexcept { + return outputs_mapping.find(output_name) != outputs_mapping.end(); + } + +private: + const TensorVec& blob_inputs; + const MappingMap& inputs_mapping; + const TensorVec& blob_outputs; + const MappingMap& outputs_mapping; +}; + +} // namespace nvidia_gpu +} // namespace ov diff --git a/modules/nvidia_plugin/src/memory_manager/cuda_device_mem_block.hpp b/modules/nvidia_plugin/src/memory_manager/cuda_device_mem_block.hpp index c3cbba365..666d94162 100644 --- a/modules/nvidia_plugin/src/memory_manager/cuda_device_mem_block.hpp +++ b/modules/nvidia_plugin/src/memory_manager/cuda_device_mem_block.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -13,6 +13,8 @@ namespace ov { namespace nvidia_gpu { +class CudaGraphContext; + /** * @brief Allocates and owns continuous memory blob on CUDA device. * Uses MemoryModel to determine a size of memory to allocate and diff --git a/modules/nvidia_plugin/src/memory_manager/model/cuda_memory_model.hpp b/modules/nvidia_plugin/src/memory_manager/model/cuda_memory_model.hpp index a8dbd8dfb..834e56896 100644 --- a/modules/nvidia_plugin/src/memory_manager/model/cuda_memory_model.hpp +++ b/modules/nvidia_plugin/src/memory_manager/model/cuda_memory_model.hpp @@ -9,6 +9,7 @@ #include #include #include +#include namespace ov { namespace nvidia_gpu { diff --git a/modules/nvidia_plugin/src/ops/parameter.cpp b/modules/nvidia_plugin/src/ops/parameter.cpp index c9c2120d4..54c5dbe14 100644 --- a/modules/nvidia_plugin/src/ops/parameter.cpp +++ b/modules/nvidia_plugin/src/ops/parameter.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2021 Intel Corporation +// Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -27,8 +27,8 @@ void ParameterOp::Execute(const InferenceRequestContext& context, const Workbuffers&) const { OPENVINO_ASSERT(inputs.size() == 0, "Node name: ", GetName()); OPENVINO_ASSERT(outputs.size() == 1, "Node name: ", GetName()); - OPENVINO_ASSERT(context.has_input_tensor(input_tensor_name_), "Node name: ", GetName()); - auto tensor = context.get_input_tensor(input_tensor_name_); + OPENVINO_ASSERT(context.getTensorMappingContext().has_input_tensor(input_tensor_name_), "Node name: ", GetName()); + auto tensor = context.getTensorMappingContext().get_input_tensor(input_tensor_name_); context.getThreadContext().stream().upload(outputs[0], tensor->data(), tensor->get_byte_size()); } @@ -40,11 +40,10 @@ void ParameterOp::Capture(InferenceRequestContext &context, Inputs inputs, Outpu const Workbuffers&) const { OPENVINO_ASSERT(inputs.size() == 0, "Node name: ", GetName()); OPENVINO_ASSERT(outputs.size() == 1, "Node name: ", GetName()); - OPENVINO_ASSERT(context.has_input_tensor(input_tensor_name_), "Node name: ", GetName()); - auto tensor = context.get_input_tensor(input_tensor_name_); - CUDA::CaptureInfo captureInfo{context.getThreadContext().stream()}; - context.getCudaGraphContext().parameterNodes.emplace(std::make_pair(input_tensor_name_, - captureInfo.addUploadNode(outputs[0], tensor->data(), tensor->get_byte_size()))); + OPENVINO_ASSERT(context.getTensorMappingContext().has_input_tensor(input_tensor_name_), "Node name: ", GetName()); + auto tensor = context.getTensorMappingContext().get_input_tensor(input_tensor_name_); + context.getCudaGraphContext().add_parameter( + input_tensor_name_, 
context.getThreadContext().stream(), outputs[0], tensor->data(), tensor->get_byte_size()); } OPERATION_REGISTER(ParameterOp, Parameter); diff --git a/modules/nvidia_plugin/src/ops/result.cpp b/modules/nvidia_plugin/src/ops/result.cpp index 6f111db2f..9644447a0 100644 --- a/modules/nvidia_plugin/src/ops/result.cpp +++ b/modules/nvidia_plugin/src/ops/result.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2021 Intel Corporation +// Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -35,8 +35,8 @@ void ResultOp::Execute(const InferenceRequestContext& context, OPENVINO_ASSERT(outputs.size() == 0, "Node name: ", GetName()); std::shared_ptr tensor; for (const auto& outputName : output_tensor_names_) { - if (context.has_output_tensor(outputName)) { - tensor = context.get_output_tensor(outputName); + if (context.getTensorMappingContext().has_output_tensor(outputName)) { + tensor = context.getTensorMappingContext().get_output_tensor(outputName); break; } } @@ -121,16 +121,15 @@ void ResultOp::Capture(InferenceRequestContext& context, std::shared_ptr tensor; std::string outputTensorName{}; for (const auto& outputName : output_tensor_names_) { - if (context.has_output_tensor(outputName)) { - tensor = context.get_output_tensor(outputName); + if (context.getTensorMappingContext().has_output_tensor(outputName)) { + tensor = context.getTensorMappingContext().get_output_tensor(outputName); outputTensorName = outputName; break; } } OPENVINO_ASSERT(tensor != nullptr, "Node name: ", GetName()); - CUDA::CaptureInfo captureInfo{context.getThreadContext().stream()}; - context.getCudaGraphContext().resultNodes.emplace(std::make_pair(outputTensorName, - captureInfo.addDownloadNode(tensor->data(), inputs[0], tensor->get_byte_size()))); + context.getCudaGraphContext().add_result( + outputTensorName, context.getThreadContext().stream(), tensor->data(), inputs[0], tensor->get_byte_size()); } OPERATION_REGISTER(ResultOp, Result); diff --git a/modules/nvidia_plugin/src/ops/subgraph.cpp b/modules/nvidia_plugin/src/ops/subgraph.cpp index 140450bfa..012cbb7a4 100644 --- a/modules/nvidia_plugin/src/ops/subgraph.cpp +++ b/modules/nvidia_plugin/src/ops/subgraph.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2021 Intel Corporation +// Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include #include @@ -31,9 +31,15 @@ SubGraph::SubGraph(const CreationContext& context, SubGraph::SubGraph(const CreationContext& context, const std::shared_ptr& model) : OperationBase(context, nullptr), model_{model} { - initExecuteSequence(context, false, false); + initExecuteSequence(context, false, false); } +SubGraph::SubGraph(const CreationContext& context, + const std::shared_ptr& model, + ExecSequence&& sequence, + std::shared_ptr memoryManager) + : OperationBase{context, nullptr}, model_{model}, exec_sequence_{sequence}, memory_manager_{memoryManager} {} + void SubGraph::initExecuteSequence(const CreationContext& context, bool isStableParams, bool isStableResults) { static constexpr auto InitNeeded = IOperationExec::WorkbufferStatus::InitNeeded; @@ -128,15 +134,9 @@ void SubGraph::Capture(InferenceRequestContext &context, Inputs, Outputs, const auto& memoryManager = *memory_manager_; auto& mutableBuffer = workbuffers.mutable_buffers.at(0); - auto& cancellationToken = context.getCancellationToken(); - auto& profiler = context.getProfiler(); - profiler.set_stream(stream); - for (auto& op : 
profiler.create_exec_sequence(this)) { - auto inputTensors = memoryManager.inputTensorPointers(*op, mutableBuffer); - auto outputTensors = memoryManager.outputTensorPointers(*op, mutableBuffer); - auto workBuffers = memoryManager.workBuffers(*op, mutableBuffer); - op->capture(context, inputTensors, outputTensors, workBuffers); - } + auto& executionDelegator = context.getExecutionDelegator(); + executionDelegator.set_stream(stream); + executionDelegator.capture_sequence(this, memoryManager, mutableBuffer, context); } WorkbufferRequest SubGraph::GetWorkBufferRequest() const { @@ -149,24 +149,22 @@ void SubGraph::Execute(const InferenceRequestContext& context, Inputs, Outputs, const auto& memoryManager = *memory_manager_; auto& mutableBuffer = workbuffers.mutable_buffers.at(0); - auto& cancellationToken = context.getCancellationToken(); - auto& profiler = context.getProfiler(); - profiler.set_stream(stream); - for (auto& op : profiler.create_exec_sequence(this)) { - auto inputTensors = memoryManager.inputTensorPointers(*op, mutableBuffer); - auto outputTensors = memoryManager.outputTensorPointers(*op, mutableBuffer); - auto workBuffers = memoryManager.workBuffers(*op, mutableBuffer); - op->execute(context, inputTensors, outputTensors, workBuffers); - } + auto& executionDelegator = context.getExecutionDelegator(); + executionDelegator.set_stream(stream); + executionDelegator.execute_sequence(this, memoryManager, mutableBuffer, context); } bool SubGraph::IsCudaGraphCompatible() const { - for (const auto& op : exec_sequence_) { - if (!op->IsCudaGraphCompatible()) { - return false; + if (is_cuda_graph_compatible_ == CompatibleState::NOT_INITIALIZED) { + is_cuda_graph_compatible_ = CompatibleState::COMPATIBLE; + for (const auto& op : exec_sequence_) { + if (!op->IsCudaGraphCompatible()) { + is_cuda_graph_compatible_ = CompatibleState::NOT_COMPATIBLE; + break; + } } } - return true; + return is_cuda_graph_compatible_ == CompatibleState::COMPATIBLE; } } // namespace nvidia_gpu diff --git a/modules/nvidia_plugin/src/ops/subgraph.hpp b/modules/nvidia_plugin/src/ops/subgraph.hpp index 022d46566..87d03cccc 100644 --- a/modules/nvidia_plugin/src/ops/subgraph.hpp +++ b/modules/nvidia_plugin/src/ops/subgraph.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2021 Intel Corporation +// Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -15,7 +15,16 @@ namespace nvidia_gpu { class SubGraph : public OperationBase { public: - virtual ~SubGraph() = 0; + using ExecSequence = std::vector; + + SubGraph(const CreationContext& context, const std::shared_ptr& model); + + SubGraph(const CreationContext& context, + const std::shared_ptr& model, + ExecSequence&& sequence, + std::shared_ptr memoryManager); + + virtual ~SubGraph() = default; void Execute(const InferenceRequestContext& context, Inputs inputTensors, @@ -29,12 +38,14 @@ class SubGraph : public OperationBase { bool IsCudaGraphCompatible() const override; - const MemoryManager& memoryManager() const { return *memory_manager_; } + inline std::shared_ptr memoryManager() const { return memory_manager_; } + + inline const std::vector& getExecSequence() const { return exec_sequence_; } + + inline const std::shared_ptr getModel() const { return model_; }; const std::vector& getParams() const; - const std::vector& getExecSequence() const; const std::vector& getResults() const; - const std::shared_ptr getModel() const { return model_; }; private: void initSharedImmutableWorkbuffers(const std::vector& init_sequence); @@ -49,7 +60,6 @@ class 
SubGraph : public OperationBase { const SubGraphOp& node, IndexCollection&& inputIds, IndexCollection&& outputIds); - SubGraph(const CreationContext& context, const std::shared_ptr& function); WorkbufferRequest GetWorkBufferRequest() const override; @@ -67,18 +77,18 @@ class SubGraph : public OperationBase { ov::Shape shape_{}; }; - std::unique_ptr memory_manager_; + enum class CompatibleState { NOT_INITIALIZED = -1, NOT_COMPATIBLE, COMPATIBLE }; + + std::shared_ptr memory_manager_; std::vector params_; std::vector params_info_; std::vector exec_sequence_; std::vector results_; std::vector results_info_; std::shared_ptr model_; -}; -inline SubGraph::~SubGraph() {} - -inline const std::vector& SubGraph::getExecSequence() const { return exec_sequence_; } + mutable CompatibleState is_cuda_graph_compatible_ = CompatibleState::NOT_INITIALIZED; +}; } // namespace nvidia_gpu } // namespace ov diff --git a/modules/nvidia_plugin/src/ops/tensor_iterator.cpp b/modules/nvidia_plugin/src/ops/tensor_iterator.cpp index 91c4e84b1..c92238aec 100644 --- a/modules/nvidia_plugin/src/ops/tensor_iterator.cpp +++ b/modules/nvidia_plugin/src/ops/tensor_iterator.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2021 Intel Corporation +// Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include #include @@ -131,6 +131,8 @@ TensorIteratorOp::TensorIteratorOp(const CreationContext& context, kernelmap_outputs_.emplace(outputIdx, kernel::Insert(element_type, props, max_threads_per_block_)); } } + + updateExecSequence(); } void TensorIteratorOp::Execute(const InferenceRequestContext& context, @@ -141,8 +143,8 @@ void TensorIteratorOp::Execute(const InferenceRequestContext& context, const auto& memoryManager = *memory_manager_; auto& mutableBuffer = workbuffers.mutable_buffers.at(0); auto& cancellationToken = context.getCancellationToken(); - auto& profiler = context.getProfiler(); - profiler.set_stream(stream); + auto& executionDelegator = context.getExecutionDelegator(); + executionDelegator.set_stream(stream); // First iteration for (const auto inputIdx : invariant_inputs_) { @@ -155,7 +157,6 @@ void TensorIteratorOp::Execute(const InferenceRequestContext& context, } } - const auto& execSequence = profiler.create_exec_sequence(this); for (int64_t iter = 0; iter < num_iterations_; ++iter) { // Input mapping of ports @@ -166,12 +167,7 @@ void TensorIteratorOp::Execute(const InferenceRequestContext& context, } // Inner loop - for (const auto& op : execSequence) { - auto inTensors = memoryManager.inputTensorPointers(*op, mutableBuffer); - auto outTensors = memoryManager.outputTensorPointers(*op, mutableBuffer); - auto workBuffers = memoryManager.workBuffers(*op, mutableBuffer); - op->execute(context, inTensors, outTensors, workBuffers); - } + executionDelegator.execute_sequence(this, memoryManager, mutableBuffer, context); // Back-edge mapping for (auto& [resultIdx, paramIdx] : results_parameters_map_) { @@ -195,7 +191,16 @@ void TensorIteratorOp::Execute(const InferenceRequestContext& context, } } -bool TensorIteratorOp::IsCudaGraphCompatible() const { return true; } +// TODO: Investigate problem with multi-graphs in some networks +// benchmark_app may hang in throughput mode +bool TensorIteratorOp::IsCudaGraphCompatible() const { return false; } + +void TensorIteratorOp::Capture(InferenceRequestContext& context, + Inputs inputTensors, + Outputs outputTensors, + const Workbuffers& workbuffers) const { + Execute(context, 
inputTensors, outputTensors, workbuffers); +} WorkbufferRequest TensorIteratorOp::GetWorkBufferRequest() const { std::vector immutable_sizes; @@ -305,11 +310,14 @@ void TensorIteratorOp::copyResult(const CUDA::Stream& stream, } } -void TensorIteratorOp::Capture(InferenceRequestContext& context, - Inputs inputTensors, - Outputs outputTensors, - const Workbuffers& workbuffers) const { - Execute(context, inputTensors, outputTensors, workbuffers); +void TensorIteratorOp::updateExecSequence() { + std::vector newExecSequence; + for (const auto& op : exec_sequence_) { + if (!dynamic_cast(op.get()) && !dynamic_cast(op.get())) { + newExecSequence.emplace_back(op); + } + } + exec_sequence_ = std::move(newExecSequence); } OPERATION_REGISTER(TensorIteratorOp, TensorIterator); diff --git a/modules/nvidia_plugin/src/ops/tensor_iterator.hpp b/modules/nvidia_plugin/src/ops/tensor_iterator.hpp index 7ea7c96f8..d172f360f 100644 --- a/modules/nvidia_plugin/src/ops/tensor_iterator.hpp +++ b/modules/nvidia_plugin/src/ops/tensor_iterator.hpp @@ -63,6 +63,8 @@ class TensorIteratorOp : public SubGraph { std::size_t resultIdx, std::size_t outputIdx) const; + void updateExecSequence(); + size_t max_threads_per_block_; const int64_t num_iterations_; std::vector inputs_info_; diff --git a/modules/nvidia_plugin/tests/functional/shared_tests_instances/single_layer_tests/activation_slt.cpp b/modules/nvidia_plugin/tests/functional/shared_tests_instances/single_layer_tests/activation_slt.cpp index e8d200390..3db1ca94d 100644 --- a/modules/nvidia_plugin/tests/functional/shared_tests_instances/single_layer_tests/activation_slt.cpp +++ b/modules/nvidia_plugin/tests/functional/shared_tests_instances/single_layer_tests/activation_slt.cpp @@ -10,12 +10,11 @@ #include #include #include -#include #include #include #include #include -#include +#include #include #include #include @@ -2670,11 +2669,10 @@ struct ClampBenchmark : testing::Test { std::vector> emptyTensor; std::map emptyMapping; ov::nvidia_gpu::CancellationToken token{}; - ov::nvidia_gpu::EagerTopologyRunner graph{ov::nvidia_gpu::CreationContext{CUDA::Device{}, false}, {}}; - ov::nvidia_gpu::Profiler profiler{false, graph}; + ov::nvidia_gpu::SimpleExecutionDelegator simpleExecutionDelegator{}; ov::nvidia_gpu::CudaGraphContext cudaGraphContext; ov::nvidia_gpu::InferenceRequestContext context{ - emptyTensor, emptyMapping, emptyTensor, emptyMapping, threadContext, token, profiler, cudaGraphContext}; + emptyTensor, emptyMapping, emptyTensor, emptyMapping, threadContext, token, simpleExecutionDelegator, cudaGraphContext}; std::vector inHost(tesnorSize); std::random_device rDevice; diff --git a/modules/nvidia_plugin/tests/functional/shared_tests_instances/single_layer_tests/gather.cpp b/modules/nvidia_plugin/tests/functional/shared_tests_instances/single_layer_tests/gather.cpp index 113a23729..bd3d74bcd 100644 --- a/modules/nvidia_plugin/tests/functional/shared_tests_instances/single_layer_tests/gather.cpp +++ b/modules/nvidia_plugin/tests/functional/shared_tests_instances/single_layer_tests/gather.cpp @@ -6,10 +6,9 @@ #include -#include #include #include -#include +#include #include #include @@ -713,13 +712,12 @@ void test_one_shape(const GatherTestParams& params, bool is_v7) { std::vector outputs{out_alloc}; ov::nvidia_gpu::CancellationToken token{}; - ov::nvidia_gpu::EagerTopologyRunner graph{ov::nvidia_gpu::CreationContext{CUDA::Device{}, false}, {}}; - ov::nvidia_gpu::Profiler profiler{false, graph}; + ov::nvidia_gpu::SimpleExecutionDelegator simpleExecutionDelegator{}; 
std::vector> emptyTensor; std::map emptyMapping; ov::nvidia_gpu::CudaGraphContext cudaGraphContext; ov::nvidia_gpu::InferenceRequestContext context{ - emptyTensor, emptyMapping, emptyTensor, emptyMapping, threadContext, token, profiler, cudaGraphContext}; + emptyTensor, emptyMapping, emptyTensor, emptyMapping, threadContext, token, simpleExecutionDelegator, cudaGraphContext}; std::vector indices = generate_indices(params); std::vector dict(dict_size); std::random_device r_device; diff --git a/modules/nvidia_plugin/tests/functional/shared_tests_instances/single_layer_tests/lstm_cell.cpp b/modules/nvidia_plugin/tests/functional/shared_tests_instances/single_layer_tests/lstm_cell.cpp index 7f9c66625..41505597f 100644 --- a/modules/nvidia_plugin/tests/functional/shared_tests_instances/single_layer_tests/lstm_cell.cpp +++ b/modules/nvidia_plugin/tests/functional/shared_tests_instances/single_layer_tests/lstm_cell.cpp @@ -1,11 +1,11 @@ -// Copyright (C) 2018-2021 Intel Corporation +// Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include "single_layer_tests/lstm_cell.hpp" #include -#include +#include #include #include #include @@ -279,12 +279,11 @@ void testOneShape(const LSTMCellTestParams& params) { std::vector> emptyTensor; std::map emptyMapping; - ov::nvidia_gpu::EagerTopologyRunner graph{ov::nvidia_gpu::CreationContext{CUDA::Device{}, false}, {}}; ov::nvidia_gpu::CancellationToken token{}; - ov::nvidia_gpu::Profiler profiler{false, graph}; + ov::nvidia_gpu::SimpleExecutionDelegator simpleExecutionDelegator{}; ov::nvidia_gpu::CudaGraphContext cudaGraphContext; ov::nvidia_gpu::InferenceRequestContext context{ - emptyTensor, emptyMapping, emptyTensor, emptyMapping, threadContext, token, profiler, cudaGraphContext}; + emptyTensor, emptyMapping, emptyTensor, emptyMapping, threadContext, token, simpleExecutionDelegator, cudaGraphContext}; std::vector x_host(x_size); std::vector hi_host(hi_size); std::vector ci_host(ci_size); diff --git a/modules/nvidia_plugin/tests/functional/shared_tests_instances/single_layer_tests/lstm_sequence.cpp b/modules/nvidia_plugin/tests/functional/shared_tests_instances/single_layer_tests/lstm_sequence.cpp index 4eede00e7..a72e597d0 100644 --- a/modules/nvidia_plugin/tests/functional/shared_tests_instances/single_layer_tests/lstm_sequence.cpp +++ b/modules/nvidia_plugin/tests/functional/shared_tests_instances/single_layer_tests/lstm_sequence.cpp @@ -3,7 +3,7 @@ // #include "single_layer_tests/lstm_sequence.hpp" -#include +#include #include #include #include diff --git a/modules/nvidia_plugin/tests/functional/shared_tests_instances/single_layer_tests/range.cpp b/modules/nvidia_plugin/tests/functional/shared_tests_instances/single_layer_tests/range.cpp index bac509966..f4cb18c09 100644 --- a/modules/nvidia_plugin/tests/functional/shared_tests_instances/single_layer_tests/range.cpp +++ b/modules/nvidia_plugin/tests/functional/shared_tests_instances/single_layer_tests/range.cpp @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include @@ -251,13 +251,18 @@ MATCHER_P(FloatNearPointwise, tol, "Out of range") { TEST_P(CudaRangeLayerTest, CompareWithRefs) { ASSERT_TRUE(outputSize > 0); ov::nvidia_gpu::CancellationToken token{}; - ov::nvidia_gpu::EagerTopologyRunner graph{ov::nvidia_gpu::CreationContext{CUDA::Device{}, false}, {}}; - ov::nvidia_gpu::Profiler profiler{false, graph}; + ov::nvidia_gpu::SimpleExecutionDelegator simpleExecutionDelegator{}; std::vector> emptyTensor; std::map emptyMapping; 
ov::nvidia_gpu::CudaGraphContext cudaGraphContext; - ov::nvidia_gpu::InferenceRequestContext context{ - emptyTensor, emptyMapping, emptyTensor, emptyMapping, threadContext, token, profiler, cudaGraphContext}; + ov::nvidia_gpu::InferenceRequestContext context{emptyTensor, + emptyMapping, + emptyTensor, + emptyMapping, + threadContext, + token, + simpleExecutionDelegator, + cudaGraphContext}; auto& stream = context.getThreadContext().stream(); CudaRangeLayerTest::upload(stream, startParamAlloc, &start, start_type, 1); CudaRangeLayerTest::upload(stream, stopParamAlloc, &stop, Type_t::f32, 1); diff --git a/modules/nvidia_plugin/tests/unit/concat.cpp b/modules/nvidia_plugin/tests/unit/concat.cpp index be048fbcc..7ef75372a 100644 --- a/modules/nvidia_plugin/tests/unit/concat.cpp +++ b/modules/nvidia_plugin/tests/unit/concat.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include #include @@ -49,11 +49,16 @@ struct ConcatTest : testing::Test { auto concatOp = dynamic_cast(operation.get()); ASSERT_TRUE(concatOp); CancellationToken token{}; - EagerTopologyRunner graph{CreationContext{CUDA::Device{}, false}, {}}; - Profiler profiler{false, graph}; + SimpleExecutionDelegator simpleExecutionDelegator{}; ov::nvidia_gpu::CudaGraphContext cudaGraphContext{}; - InferenceRequestContext context{ - empty_tensor, empty_mapping, empty_tensor, empty_mapping, threadContext, token, profiler, cudaGraphContext}; + InferenceRequestContext context{empty_tensor, + empty_mapping, + empty_tensor, + empty_mapping, + threadContext, + token, + simpleExecutionDelegator, + cudaGraphContext}; const auto& stream = threadContext.stream(); std::vector inputs{}; std::vector outputs{}; diff --git a/modules/nvidia_plugin/tests/unit/convert_benchmark.cpp b/modules/nvidia_plugin/tests/unit/convert_benchmark.cpp index cff3d2d21..dbcd9e74d 100644 --- a/modules/nvidia_plugin/tests/unit/convert_benchmark.cpp +++ b/modules/nvidia_plugin/tests/unit/convert_benchmark.cpp @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include @@ -42,12 +42,11 @@ TEST_F(ConvertTest, DISABLED_benchmark) { constexpr int kNumAttempts = 200; auto& stream = threadContext.stream(); - ov::nvidia_gpu::EagerTopologyRunner graph{ov::nvidia_gpu::CreationContext{CUDA::Device{}, false}, {}}; ov::nvidia_gpu::CancellationToken token{}; - ov::nvidia_gpu::Profiler profiler{false, graph}; + ov::nvidia_gpu::SimpleExecutionDelegator simpleExecutionDelegator{}; ov::nvidia_gpu::CudaGraphContext cudaGraphContext{}; ov::nvidia_gpu::InferenceRequestContext context{ - emptyTensor, emptyMapping, emptyTensor, emptyMapping, threadContext, token, profiler, cudaGraphContext}; + emptyTensor, emptyMapping, emptyTensor, emptyMapping, threadContext, token, simpleExecutionDelegator, cudaGraphContext}; using Type_t = ov::element::Type_t; constexpr Type_t supported_types[] = {Type_t::boolean, diff --git a/modules/nvidia_plugin/tests/unit/cuda_graph_topology_runner_test.cpp b/modules/nvidia_plugin/tests/unit/cuda_graph_topology_runner_test.cpp index 2b94e3dfa..2c3192de6 100644 --- a/modules/nvidia_plugin/tests/unit/cuda_graph_topology_runner_test.cpp +++ b/modules/nvidia_plugin/tests/unit/cuda_graph_topology_runner_test.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include "test_networks.hpp" using namespace ov::nvidia_gpu; @@ -48,7 +48,7 @@ class CudaGraphTopologyRunnerTest : public Test { CancellationToken cancellationToken_ {}; CudaGraphContext cudaGraphContext_ {}; CudaGraphTopologyRunner runner_ 
{creationContext_, model_}; - Profiler profiler_ {false, runner_.GetSubGraph()}; + SimpleExecutionDelegator simpleExecutionDelegator_ {}; std::vector> inputTensors_ {PopulateTensors(model_->inputs())}; std::vector> outputTensors_ {PopulateTensors(model_->outputs())}; std::map inputIndeces_ {PopulateInputIndices(model_)}; @@ -59,15 +59,15 @@ class CudaGraphTopologyRunnerTest : public Test { outputIndeces_, threadContext_, cancellationToken_, - profiler_, + simpleExecutionDelegator_, cudaGraphContext_, false}; - DeviceMemBlock deviceMemBlock_ {runner_.GetSubGraph().memoryManager().mutableTensorsMemoryModel()}; + DeviceMemBlock deviceMemBlock_ {runner_.GetSubGraph().memoryManager()->mutableTensorsMemoryModel()}; }; TEST_F(CudaGraphTopologyRunnerTest, InstantiateGraphExec) { runner_.UpdateContext(inferRequestContext_, deviceMemBlock_); - EXPECT_TRUE(inferRequestContext_.getCudaGraphContext().graphExec.has_value()); + EXPECT_TRUE(inferRequestContext_.getCudaGraphContext().is_initialized()); } TEST_F(CudaGraphTopologyRunnerTest, BasicRun) { @@ -77,51 +77,47 @@ TEST_F(CudaGraphTopologyRunnerTest, BasicRun) { TEST_F(CudaGraphTopologyRunnerTest, CheckGraphExecIsInstantiatedOnce) { runner_.UpdateContext(inferRequestContext_, deviceMemBlock_); - CUDA::GraphExec* exec = &inferRequestContext_.getCudaGraphContext().graphExec.value(); + const auto oldCudaGraphContext = &inferRequestContext_.getCudaGraphContext(); runner_.UpdateContext(inferRequestContext_, deviceMemBlock_); - EXPECT_EQ(&inferRequestContext_.getCudaGraphContext().graphExec.value(), exec); + EXPECT_EQ(&inferRequestContext_.getCudaGraphContext(), oldCudaGraphContext); } TEST_F(CudaGraphTopologyRunnerTest, CheckMemcpyNodesArePopulated) { runner_.UpdateContext(inferRequestContext_, deviceMemBlock_); - EXPECT_GT(inferRequestContext_.getCudaGraphContext().parameterNodes.size(), 0); - EXPECT_GT(inferRequestContext_.getCudaGraphContext().resultNodes.size(), 0); + EXPECT_GT(inferRequestContext_.getCudaGraphContext().get_params_count(), 0); + EXPECT_GT(inferRequestContext_.getCudaGraphContext().get_results_count(), 0); } TEST_F(CudaGraphTopologyRunnerTest, CheckMemcpyNodesAreUpdated) { runner_.UpdateContext(inferRequestContext_, deviceMemBlock_); - auto paramNodes = cudaGraphContext_.parameterNodes; - auto resultNodes = cudaGraphContext_.resultNodes; - std::vector> inputTensors {PopulateTensors(model_->inputs())}; - std::vector> outputTensors {PopulateTensors(model_->outputs())}; + const auto oldCudaGraphContext = cudaGraphContext_; + std::vector> inputTensors{PopulateTensors(model_->inputs())}; + std::vector> outputTensors{PopulateTensors(model_->outputs())}; InferenceRequestContext inferRequestContext{inputTensors, - inputIndeces_, - outputTensors, - outputIndeces_, - threadContext_, - cancellationToken_, - profiler_, - cudaGraphContext_, - false}; + inputIndeces_, + outputTensors, + outputIndeces_, + threadContext_, + cancellationToken_, + simpleExecutionDelegator_, + cudaGraphContext_, + false}; runner_.UpdateContext(inferRequestContext, deviceMemBlock_); - EXPECT_NE(cudaGraphContext_.parameterNodes, paramNodes); - EXPECT_NE(cudaGraphContext_.resultNodes, resultNodes); + EXPECT_NE(cudaGraphContext_, oldCudaGraphContext); } TEST_F(CudaGraphTopologyRunnerTest, CheckMemcpyNodesAreNotUpdatedIfPointersUnchanged) { runner_.UpdateContext(inferRequestContext_, deviceMemBlock_); - auto paramNodes = cudaGraphContext_.parameterNodes; - auto resultNodes = cudaGraphContext_.resultNodes; + const auto oldCudaGraphContext = cudaGraphContext_; 
InferenceRequestContext inferRequestContext{inputTensors_, - inputIndeces_, - outputTensors_, - outputIndeces_, - threadContext_, - cancellationToken_, - profiler_, - cudaGraphContext_, - false}; + inputIndeces_, + outputTensors_, + outputIndeces_, + threadContext_, + cancellationToken_, + simpleExecutionDelegator_, + cudaGraphContext_, + false}; runner_.UpdateContext(inferRequestContext, deviceMemBlock_); - EXPECT_EQ(cudaGraphContext_.parameterNodes, paramNodes); - EXPECT_EQ(cudaGraphContext_.resultNodes, resultNodes); + EXPECT_EQ(cudaGraphContext_, oldCudaGraphContext); } diff --git a/modules/nvidia_plugin/tests/unit/cuda_multi_graph_test.cpp b/modules/nvidia_plugin/tests/unit/cuda_multi_graph_test.cpp new file mode 100644 index 000000000..f4e4a0cf7 --- /dev/null +++ b/modules/nvidia_plugin/tests/unit/cuda_multi_graph_test.cpp @@ -0,0 +1,275 @@ +// Copyright (C) 2020-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "cuda_graph_topology_runner.hpp" +#include "cuda_simple_execution_delegator.hpp" +#include "ngraph_functions/builders.hpp" +#include "ngraph_functions/utils/data_utils.hpp" +#include "ops/parameter.hpp" +#include "ops/result.hpp" + +using namespace ov::nvidia_gpu; +using namespace testing; + +namespace { + +constexpr int TO = 10; +constexpr int FROM = -10; +constexpr int SEED = 1; + +constexpr std::size_t INPUTS_COUNT = 4; +constexpr int64_t CONCAT_AXIS = 0; + +constexpr float THRESHOLD = 0.01f; + +inline ov::float16* getMutablePtr(ov::Tensor& tensor) { return static_cast(tensor.data()); } + +inline const ov::float16* getConstPtr(const ov::Tensor& tensor) { + return static_cast(tensor.data()); +} + +void generateInput(ov::Tensor& tensor, int to = TO, int from = FROM, int seed = SEED) { + // This test supports only FP16 precision + EXPECT_EQ(tensor.get_element_type(), ov::element::Type_t::f16); + auto* ptr = getMutablePtr(tensor); + std::mt19937 engine(seed); + std::uniform_real_distribution dist(from, to); + std::generate(ptr, ptr + tensor.get_size(), [&dist, &engine]() { return ov::float16{dist(engine)}; }); +} + +void validateOutput(const ov::Tensor& tensor, const std::vector& refVector, float threshold) { + // This test supports only FP16 precision + EXPECT_EQ(tensor.get_element_type(), ov::element::Type_t::f16); + const auto size = tensor.get_size(); + EXPECT_EQ(size, refVector.size()); + const auto* ptr = getConstPtr(tensor); + bool areEqual = std::equal(ptr, ptr + size, refVector.cbegin(), [threshold](auto val1, auto val2) { + return std::abs(val1 - val2) < threshold; + }); + EXPECT_TRUE(areEqual); +} + +} // namespace + +class AddMul { +public: + static std::shared_ptr createNetwork() { + ov::element::Type prc = ov::element::Type_t::f16; + ov::Shape shape{1, 2, 3, 4}; + ov::ParameterVector params; + for (std::size_t i = 0; i < INPUTS_COUNT; ++i) { + params.emplace_back(std::make_shared(prc, shape)); + } + const auto add0 = ngraph::builder::makeEltwise(params[0], params[1], ngraph::helpers::EltwiseTypes::ADD); + const auto add1 = ngraph::builder::makeEltwise(params[2], params[3], ngraph::helpers::EltwiseTypes::ADD); + + const auto mul = ngraph::builder::makeEltwise(add0, add1, ngraph::helpers::EltwiseTypes::MULTIPLY); + const auto result = std::make_shared(mul); + return std::make_shared(result, params, "AddMul"); + } + + static void checkContext(const CudaGraphContext& cudaGraphContext) { + // AddMul network should have a single CUDA Graph + EXPECT_EQ(cudaGraphContext.get_graphs_count(), 1); + } + + static void 
checkSubGraph(const SubGraph& subGraph) { + // Original SubGraph for AddMul network should be CUDA Graph compatible + EXPECT_TRUE(subGraph.IsCudaGraphCompatible()); + } + + static std::vector> calcRefs( + const std::vector>& inputTensors) { + EXPECT_EQ(inputTensors.size(), INPUTS_COUNT); + const auto size = inputTensors[0]->get_size(); + std::vector> result{std::vector(size)}; + std::array inputs; + for (std::size_t i = 0; i < INPUTS_COUNT; ++i) { + inputs[i] = getConstPtr(*inputTensors[i]); + } + EXPECT_EQ(result.size(), 1); + auto& output = result[0]; + for (std::size_t i = 0; i < size; ++i) { + output[i] = (inputs[0][i] + inputs[1][i]) * ((inputs[2][i] + inputs[3][i])); + } + return result; + } +}; + +class AddConcat { +public: + static std::shared_ptr createNetwork() { + ov::element::Type prc = ov::element::Type_t::f16; + ov::Shape shape{1, 2, 3, 4}; + ov::ParameterVector params; + for (std::size_t i = 0; i < INPUTS_COUNT; ++i) { + params.emplace_back(std::make_shared(prc, shape)); + } + const auto add0 = ngraph::builder::makeEltwise(params[0], params[1], ngraph::helpers::EltwiseTypes::ADD); + const auto add1 = ngraph::builder::makeEltwise(params[2], params[3], ngraph::helpers::EltwiseTypes::ADD); + + constexpr int64_t axis = CONCAT_AXIS; + const auto concat = + std::make_shared(ngraph::helpers::convert2OutputVector({add0, add1}), axis); + const auto result = std::make_shared(concat); + return std::make_shared(result, params, "AddConcat"); + } + + static void checkContext(const CudaGraphContext& cudaGraphContext) { + // AddConcat network should have more than one CUDA Graph + EXPECT_GT(cudaGraphContext.get_graphs_count(), 1); + } + + static void checkSubGraph(const SubGraph& subGraph) { + // Original SubGraph for AddConcat network should not be CUDA Graph compatible + EXPECT_FALSE(subGraph.IsCudaGraphCompatible()); + } + + static std::vector> calcRefs( + const std::vector>& inputTensors) { + EXPECT_EQ(inputTensors.size(), INPUTS_COUNT); + const auto size = inputTensors[0]->get_size(); + std::vector> result{std::vector(2 * size)}; + std::array inputs; + for (std::size_t i = 0; i < INPUTS_COUNT; ++i) { + inputs[i] = getConstPtr(*inputTensors[i]); + } + std::vector addResult0(size); + std::vector addResult1(size); + for (std::size_t i = 0; i < size; ++i) { + addResult0[i] = (inputs[0][i] + inputs[1][i]); + addResult1[i] = (inputs[2][i] + inputs[3][i]); + } + EXPECT_EQ(result.size(), 1); + auto& output = result[0]; + std::copy(addResult0.cbegin(), addResult0.cend(), output.begin()); + std::copy(addResult1.cbegin(), addResult1.cend(), output.begin() + size); + return result; + } +}; + +template +class CudaMultiGraphTest : public Test { +protected: + static std::map populateInputIndices(std::shared_ptr model) { + std::map inputIndices; + for (const auto& parameter : model->get_parameters()) { + const auto& parameter_index = model->get_parameter_index(parameter); + inputIndices.emplace(ParameterOp::GetInputTensorName(*parameter), parameter_index); + } + return inputIndices; + } + + static std::map populateOutputIndices(std::shared_ptr model) { + std::map outputIndices; + for (auto& result : model->get_results()) { + const auto& result_index = model->get_result_index(result->input_value(0)); + for (const auto& outputName : ResultOp::GetOutputTensorName(*result)) { + outputIndices.emplace(outputName, result_index); + } + } + return outputIndices; + } + + static std::vector> populateTensors(const std::vector>& nodes) { + std::vector> result; + for (const auto& node : nodes) { + 
result.push_back(std::make_shared(node.get_element_type(), node.get_shape())); + } + return result; + } + + void generateInputs() { + for (auto& input : inputTensors_) { + generateInput(*input, TO, FROM, currentSeed_); + ++currentSeed_; + } + } + + void updateContext() { runner_.UpdateContext(*inferRequestContext_, deviceMemBlock_); } + + void checkConditions() { + Network::checkContext(cudaGraphContext_); + Network::checkSubGraph(runner_.GetSubGraph()); + } + + void run() { runner_.Run(*inferRequestContext_, deviceMemBlock_); } + + void calcRefs() { refOutputs_ = Network::calcRefs(inputTensors_); } + + void validate(float threshold = THRESHOLD) { + const auto size = outputTensors_.size(); + EXPECT_EQ(size, refOutputs_.size()); + for (std::size_t i = 0; i < size; ++i) { + validateOutput(*outputTensors_[i], refOutputs_[i], THRESHOLD); + } + } + + void updateTensors() { + inputTensors_ = {populateTensors(model_->inputs())}; + outputTensors_ = {populateTensors(model_->outputs())}; + inferRequestContext_ = std::make_unique(inputTensors_, + inputIndices_, + outputTensors_, + outputIndices_, + threadContext_, + cancellationToken_, + simpleExecutionDelegator_, + cudaGraphContext_, + false); + } + + void runTest() { + generateInputs(); + updateContext(); + checkConditions(); + run(); + calcRefs(); + validate(); + + updateTensors(); + generateInputs(); + updateContext(); + checkConditions(); + run(); + calcRefs(); + validate(); + } + + std::shared_ptr model_{Network::createNetwork()}; + CreationContext creationContext_{{}, false}; + ThreadContext threadContext_{{}}; + CancellationToken cancellationToken_{}; + CudaGraphContext cudaGraphContext_{}; + CudaGraphTopologyRunner runner_{creationContext_, model_}; + SimpleExecutionDelegator simpleExecutionDelegator_{}; + std::vector> inputTensors_{populateTensors(model_->inputs())}; + std::vector> outputTensors_{populateTensors(model_->outputs())}; + std::map inputIndices_{populateInputIndices(model_)}; + std::map outputIndices_{populateOutputIndices(model_)}; + std::unique_ptr inferRequestContext_ = + std::make_unique(inputTensors_, + inputIndices_, + outputTensors_, + outputIndices_, + threadContext_, + cancellationToken_, + simpleExecutionDelegator_, + cudaGraphContext_, + false); + DeviceMemBlock deviceMemBlock_{runner_.GetSubGraph().memoryManager()->mutableTensorsMemoryModel()}; + + std::vector> refOutputs_; + int currentSeed_ = SEED; +}; + +using AddMulMultiGraphTest = CudaMultiGraphTest; + +TEST_F(AddMulMultiGraphTest, AddMulTest) { runTest(); } + +using AddConcatMultiGraphTest = CudaMultiGraphTest; + +TEST_F(AddConcatMultiGraphTest, AddConcatTest) { runTest(); } diff --git a/modules/nvidia_plugin/tests/unit/is_cuda_graph_compatible.cpp b/modules/nvidia_plugin/tests/unit/is_cuda_graph_compatible.cpp index 7609aecf3..a650be79a 100644 --- a/modules/nvidia_plugin/tests/unit/is_cuda_graph_compatible.cpp +++ b/modules/nvidia_plugin/tests/unit/is_cuda_graph_compatible.cpp @@ -6,6 +6,7 @@ #include #include "cuda_compiled_model.hpp" +#include "cuda_simple_execution_delegator.hpp" #include "cuda_runtime.h" #include "cuda_operation_registry.hpp" #include "cuda_profiler.hpp" @@ -86,13 +87,18 @@ struct ReluIsCudaGraphCompatibleTest : IsCudaGraphCompatibleTest { std::vector outputs{outAlloc}; CancellationToken token{}; - EagerTopologyRunner graph{creationContext, {}}; - Profiler profiler{false, graph}; + SimpleExecutionDelegator simpleExecutionDelegator{}; std::vector> emptyTensor; std::map emptyMapping; ov::nvidia_gpu::CudaGraphContext cudaGraphContext{}; - 
InferenceRequestContext context{ - emptyTensor, emptyMapping, emptyTensor, emptyMapping, threadContext, token, profiler, cudaGraphContext}; + InferenceRequestContext context{emptyTensor, + emptyMapping, + emptyTensor, + emptyMapping, + threadContext, + token, + simpleExecutionDelegator, + cudaGraphContext}; // Generate input std::vector input(inSize); @@ -163,13 +169,18 @@ struct ConcatIsCudaGraphCompatibleTest : IsCudaGraphCompatibleTest { std::vector outputs{outAlloc}; CancellationToken token{}; - EagerTopologyRunner graph{creationContext, {}}; - Profiler profiler{false, graph}; + SimpleExecutionDelegator simpleExecutionDelegator{}; std::vector> emptyTensor; std::map emptyMapping; ov::nvidia_gpu::CudaGraphContext cudaGraphContext{}; - InferenceRequestContext context{ - emptyTensor, emptyMapping, emptyTensor, emptyMapping, threadContext, token, profiler, cudaGraphContext}; + InferenceRequestContext context{emptyTensor, + emptyMapping, + emptyTensor, + emptyMapping, + threadContext, + token, + simpleExecutionDelegator, + cudaGraphContext}; // Generate inputs std::vector input1(inSize1); diff --git a/modules/nvidia_plugin/tests/unit/limits.cpp b/modules/nvidia_plugin/tests/unit/limits.cpp index 8597fb6dd..ff4bb9978 100644 --- a/modules/nvidia_plugin/tests/unit/limits.cpp +++ b/modules/nvidia_plugin/tests/unit/limits.cpp @@ -13,12 +13,11 @@ #include #include #include -#include #include #include #include #include -#include +#include #include #include #include @@ -149,11 +148,10 @@ void run_zero_div_test() { std::vector> emptyTensor; std::map emptyMapping; ov::nvidia_gpu::CancellationToken token{}; - ov::nvidia_gpu::EagerTopologyRunner graph{ov::nvidia_gpu::CreationContext{CUDA::Device{}, false}, {}}; - ov::nvidia_gpu::Profiler profiler{false, graph}; + ov::nvidia_gpu::SimpleExecutionDelegator simpleExecutionDelegator{}; ov::nvidia_gpu::CudaGraphContext cudaGraphContext; ov::nvidia_gpu::InferenceRequestContext context{ - emptyTensor, emptyMapping, emptyTensor, emptyMapping, threadContext, token, profiler, cudaGraphContext}; + emptyTensor, emptyMapping, emptyTensor, emptyMapping, threadContext, token, simpleExecutionDelegator, cudaGraphContext}; auto& stream = context.getThreadContext().stream(); stream.upload(in1_alloc, in1.data(), size_bytes); stream.upload(in2_alloc, in2.data(), sizeof(T)); diff --git a/modules/nvidia_plugin/tests/unit/logical_not_benchmark.cpp b/modules/nvidia_plugin/tests/unit/logical_not_benchmark.cpp index 835f50797..dd47befe6 100644 --- a/modules/nvidia_plugin/tests/unit/logical_not_benchmark.cpp +++ b/modules/nvidia_plugin/tests/unit/logical_not_benchmark.cpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include @@ -51,11 +51,16 @@ struct LogicalNotBenchmark : testing::Test { TEST_F(LogicalNotBenchmark, DISABLED_benchmark) { constexpr int kNumAttempts = 20; ov::nvidia_gpu::CancellationToken token{}; - ov::nvidia_gpu::EagerTopologyRunner graph{ov::nvidia_gpu::CreationContext{CUDA::Device{}, false}, {}}; - ov::nvidia_gpu::Profiler profiler{false, graph}; + ov::nvidia_gpu::SimpleExecutionDelegator simpleExecutionDelegator{}; ov::nvidia_gpu::CudaGraphContext cudaGraphContext{}; - ov::nvidia_gpu::InferenceRequestContext context{ - emptyTensor, emptyMapping, emptyTensor, emptyMapping, threadContext, token, profiler, cudaGraphContext}; + ov::nvidia_gpu::InferenceRequestContext context{emptyTensor, + emptyMapping, + emptyTensor, + emptyMapping, + threadContext, + token, + simpleExecutionDelegator, + cudaGraphContext}; auto& stream = 
context.getThreadContext().stream(); std::vector in(length); std::random_device r_device; diff --git a/modules/nvidia_plugin/tests/unit/parameter.cpp b/modules/nvidia_plugin/tests/unit/parameter.cpp index 29ba6d6ec..07ac8713c 100644 --- a/modules/nvidia_plugin/tests/unit/parameter.cpp +++ b/modules/nvidia_plugin/tests/unit/parameter.cpp @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include @@ -61,11 +61,16 @@ TEST_F(ParameterRegistryTest, GetOperationBuilder_Available) { TEST_F(ParameterTest, canExecuteSync) { CancellationToken token{}; - EagerTopologyRunner graph{CreationContext{CUDA::Device{}, false}, {}}; - Profiler profiler{false, graph}; + SimpleExecutionDelegator simpleExecutionDelegator{}; ov::nvidia_gpu::CudaGraphContext cudaGraphContext{}; - InferenceRequestContext context{tensors, tensors_mapping, empty_tensor, empty_mapping, threadContext, - token, profiler, cudaGraphContext}; + InferenceRequestContext context{tensors, + tensors_mapping, + empty_tensor, + empty_mapping, + threadContext, + token, + simpleExecutionDelegator, + cudaGraphContext}; auto& stream = context.getThreadContext().stream(); operation->Execute(context, inputs, outputs, {}); auto data = std::make_unique(size); @@ -76,11 +81,16 @@ TEST_F(ParameterTest, canExecuteSync) { TEST_F(ParameterTest, canExecuteAsync) { CancellationToken token{}; - ov::nvidia_gpu::EagerTopologyRunner graph{CreationContext{CUDA::Device{}, false}, {}}; - ov::nvidia_gpu::Profiler profiler{false, graph}; + ov::nvidia_gpu::SimpleExecutionDelegator simpleExecutionDelegator{}; ov::nvidia_gpu::CudaGraphContext cudaGraphContext{}; - InferenceRequestContext context{tensors, tensors_mapping, empty_tensor, empty_mapping, threadContext, - token, profiler, cudaGraphContext}; + InferenceRequestContext context{tensors, + tensors_mapping, + empty_tensor, + empty_mapping, + threadContext, + token, + simpleExecutionDelegator, + cudaGraphContext}; auto& stream = context.getThreadContext().stream(); operation->Execute(context, inputs, outputs, {}); auto data = std::make_unique(size); diff --git a/modules/nvidia_plugin/tests/unit/pooling_tests.cpp b/modules/nvidia_plugin/tests/unit/pooling_tests.cpp index a484f37af..9fa56b32a 100644 --- a/modules/nvidia_plugin/tests/unit/pooling_tests.cpp +++ b/modules/nvidia_plugin/tests/unit/pooling_tests.cpp @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include @@ -89,11 +89,16 @@ struct PoolingTest : testing::Test { CUDA::Device device{}; const bool optimizeOption = false; CancellationToken token{}; - EagerTopologyRunner graph{CreationContext{CUDA::Device{}, false}, {}}; - Profiler profiler{false, graph}; + SimpleExecutionDelegator simpleExecutionDelegator{}; ov::nvidia_gpu::CudaGraphContext cudaGraphContext{}; - InferenceRequestContext context{ - empty_tensor, empty_mapping, empty_tensor, empty_mapping, threadContext, token, profiler, cudaGraphContext}; + InferenceRequestContext context{empty_tensor, + empty_mapping, + empty_tensor, + empty_mapping, + threadContext, + token, + simpleExecutionDelegator, + cudaGraphContext}; auto& registry{OperationRegistry::getInstance()}; auto const_input = std::make_shared(ov::element::f32, Shape{in_shape}); const size_t spatial_dims = in_shape.size() - 2; diff --git a/modules/nvidia_plugin/tests/unit/relu.cpp b/modules/nvidia_plugin/tests/unit/relu.cpp index 3d737a756..948ac367e 100644 --- a/modules/nvidia_plugin/tests/unit/relu.cpp +++ b/modules/nvidia_plugin/tests/unit/relu.cpp @@ -9,7 +9,7 @@ #include #include #include 
-#include +#include #include #include #include @@ -62,11 +62,16 @@ struct ReluTest : testing::Test { TEST_F(ReluTest, canExecuteSync) { ov::nvidia_gpu::CancellationToken token{}; - ov::nvidia_gpu::EagerTopologyRunner graph{ov::nvidia_gpu::CreationContext{CUDA::Device{}, false}, {}}; - ov::nvidia_gpu::Profiler profiler{false, graph}; + ov::nvidia_gpu::SimpleExecutionDelegator simpleExecutionDelegator{}; ov::nvidia_gpu::CudaGraphContext cudaGraphContext{}; - ov::nvidia_gpu::InferenceRequestContext context{ - empty_tensor, empty_mapping, empty_tensor, empty_mapping, threadContext, token, profiler, cudaGraphContext}; + ov::nvidia_gpu::InferenceRequestContext context{empty_tensor, + empty_mapping, + empty_tensor, + empty_mapping, + threadContext, + token, + simpleExecutionDelegator, + cudaGraphContext}; auto& stream = context.getThreadContext().stream(); std::array in{-1, 1, -5, 5, 0}; std::array correct; diff --git a/modules/nvidia_plugin/tests/unit/result.cpp b/modules/nvidia_plugin/tests/unit/result.cpp index 64169854c..4a4d77814 100644 --- a/modules/nvidia_plugin/tests/unit/result.cpp +++ b/modules/nvidia_plugin/tests/unit/result.cpp @@ -5,11 +5,10 @@ #include #include -#include #include #include #include -#include +#include #include #include @@ -79,11 +78,10 @@ TEST_F(ResultRegistryTest, GetOperationBuilder_Available) { TEST_F(ResultTest, canExecuteSync) { CancellationToken token{}; - EagerTopologyRunner graph{CreationContext{CUDA::Device{}, false}, {}}; - Profiler profiler{false, graph}; + SimpleExecutionDelegator simpleExecutionDelegator{}; ov::nvidia_gpu::CudaGraphContext cudaGraphContext{}; InferenceRequestContext context{empty_tensor, empty_mapping, tensors, tensors_mapping, threadContext, - token, profiler, cudaGraphContext}; + token, simpleExecutionDelegator, cudaGraphContext}; auto& stream = context.getThreadContext().stream(); stream.upload(inputs[0].as_mutable(), tensor->data(), size); operation->Execute(context, inputs, outputs, {}); @@ -95,11 +93,10 @@ TEST_F(ResultTest, canExecuteSync) { TEST_F(ResultTest, canExecuteAsync) { CancellationToken token{}; - EagerTopologyRunner graph{CreationContext{CUDA::Device{}, false}, {}}; - Profiler profiler{false, graph}; + SimpleExecutionDelegator simpleExecutionDelegator{}; ov::nvidia_gpu::CudaGraphContext cudaGraphContext{}; InferenceRequestContext context{empty_tensor, empty_mapping, tensors, tensors_mapping, threadContext, - token, profiler, cudaGraphContext}; + token, simpleExecutionDelegator, cudaGraphContext}; auto& stream = context.getThreadContext().stream(); stream.upload(inputs[0].as_mutable(), tensor->data(), size); operation->Execute(context, inputs, outputs, {}); diff --git a/modules/nvidia_plugin/tests/unit/select_benchmark.cpp b/modules/nvidia_plugin/tests/unit/select_benchmark.cpp index 6f52140a3..b4d001f85 100644 --- a/modules/nvidia_plugin/tests/unit/select_benchmark.cpp +++ b/modules/nvidia_plugin/tests/unit/select_benchmark.cpp @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include @@ -87,12 +87,11 @@ void fillArrayWithRandomData(std::vector& v) { TEST_F(SelectTest, DISABLED_benchmark) { using microseconds = std::chrono::duration; constexpr int kNumAttempts = 20000; - ov::nvidia_gpu::EagerTopologyRunner graph{ov::nvidia_gpu::CreationContext{CUDA::Device{}, false}, {}}; ov::nvidia_gpu::CancellationToken token{}; - ov::nvidia_gpu::Profiler profiler{false, graph}; + ov::nvidia_gpu::SimpleExecutionDelegator simpleExecutionDelegator{}; ov::nvidia_gpu::CudaGraphContext cudaGraphContext{}; 
ov::nvidia_gpu::InferenceRequestContext context{ - emptyTensor, emptyMapping, emptyTensor, emptyMapping, threadContext, token, profiler, cudaGraphContext}; + emptyTensor, emptyMapping, emptyTensor, emptyMapping, threadContext, token, simpleExecutionDelegator, cudaGraphContext}; auto& stream = context.getThreadContext().stream(); std::vector conditions(bufferLength); diff --git a/modules/nvidia_plugin/tests/unit/sigmoid_benchmark.cpp b/modules/nvidia_plugin/tests/unit/sigmoid_benchmark.cpp index 1520822aa..6b31466a9 100644 --- a/modules/nvidia_plugin/tests/unit/sigmoid_benchmark.cpp +++ b/modules/nvidia_plugin/tests/unit/sigmoid_benchmark.cpp @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include @@ -53,11 +53,16 @@ TEST_F(SigmoidTest, DISABLED_benchmark) { using microseconds = std::chrono::duration; constexpr int kNumAttempts = 20; ov::nvidia_gpu::CancellationToken token{}; - ov::nvidia_gpu::EagerTopologyRunner graph{ov::nvidia_gpu::CreationContext{CUDA::Device{}, false}, {}}; - ov::nvidia_gpu::Profiler profiler{false, graph}; + ov::nvidia_gpu::SimpleExecutionDelegator simpleExecutionDelegator{}; ov::nvidia_gpu::CudaGraphContext cudaGraphContext{}; - ov::nvidia_gpu::InferenceRequestContext context{ - emptyTensor, emptyMapping, emptyTensor, emptyMapping, threadContext, token, profiler, cudaGraphContext}; + ov::nvidia_gpu::InferenceRequestContext context{emptyTensor, + emptyMapping, + emptyTensor, + emptyMapping, + threadContext, + token, + simpleExecutionDelegator, + cudaGraphContext}; auto& stream = context.getThreadContext().stream(); std::array in; std::random_device r_device; diff --git a/modules/nvidia_plugin/tests/unit/strided_slice_benchmark.cpp b/modules/nvidia_plugin/tests/unit/strided_slice_benchmark.cpp index f1c9f131b..af67221b4 100644 --- a/modules/nvidia_plugin/tests/unit/strided_slice_benchmark.cpp +++ b/modules/nvidia_plugin/tests/unit/strided_slice_benchmark.cpp @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include @@ -78,11 +78,16 @@ TEST_F(StridedSliceTest, DISABLED_benchmark) { using microseconds = std::chrono::duration; constexpr int kNumAttempts = 20000; ov::nvidia_gpu::CancellationToken token{}; - ov::nvidia_gpu::EagerTopologyRunner graph{ov::nvidia_gpu::CreationContext{CUDA::Device{}, false}, {}}; - ov::nvidia_gpu::Profiler profiler{false, graph}; + ov::nvidia_gpu::SimpleExecutionDelegator simpleExecutionDelegator{}; ov::nvidia_gpu::CudaGraphContext cudaGraphContext; - ov::nvidia_gpu::InferenceRequestContext context{ - emptyTensor, emptyMapping, emptyTensor, emptyMapping, threadContext, token, profiler, cudaGraphContext}; + ov::nvidia_gpu::InferenceRequestContext context{emptyTensor, + emptyMapping, + emptyTensor, + emptyMapping, + threadContext, + token, + simpleExecutionDelegator, + cudaGraphContext}; auto& stream = context.getThreadContext().stream(); std::vector in(inputBufferLength); std::random_device r_device;