Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cuda graph enhancement #19636

Merged
merged 31 commits into from
Mar 7, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
bcb90a9
cuda graph enhancement
wangyems Feb 24, 2024
03a3a80
update
wangyems Feb 24, 2024
772471c
add python test
wangyems Feb 24, 2024
5dbda60
accept RunOptions as an argument to ReplayGraph
wangyems Feb 28, 2024
773eb0d
Merge branch 'main' of github.com:microsoft/onnxruntime into wangye/c…
wangyems Feb 28, 2024
f2b5c1e
update
wangyems Feb 29, 2024
dfdb78c
update tests
wangyems Mar 1, 2024
d008515
Merge branch 'main' of github.com:microsoft/onnxruntime into wangye/c…
wangyems Mar 1, 2024
5ff5586
update
wangyems Mar 1, 2024
e51c9c4
lint
wangyems Mar 1, 2024
0281e57
review comments
wangyems Mar 1, 2024
e655e35
review comments
wangyems Mar 4, 2024
b180c87
Merge branch 'main' of github.com:microsoft/onnxruntime into wangye/c…
wangyems Mar 4, 2024
3078d2a
update
wangyems Mar 4, 2024
29fd4ff
update
wangyems Mar 4, 2024
74d9e18
update
wangyems Mar 5, 2024
ae1024f
fix trt/rocm build
wangyems Mar 5, 2024
ce68222
fix build
wangyems Mar 5, 2024
5d67e12
review comments
wangyems Mar 5, 2024
7a16be6
Merge branch 'main' of github.com:microsoft/onnxruntime into wangye/c…
wangyems Mar 5, 2024
8993bdf
review comments
wangyems Mar 6, 2024
8884b55
Merge branch 'main' of github.com:microsoft/onnxruntime into wangye/c…
wangyems Mar 6, 2024
1e40998
review comments
wangyems Mar 6, 2024
5d411a3
sync io
wangyems Mar 6, 2024
9725c7e
del session after test()
wangyems Mar 6, 2024
b65e244
update
wangyems Mar 7, 2024
1bbd87f
Merge branch 'main' of github.com:microsoft/onnxruntime into wangye/c…
wangyems Mar 7, 2024
5bcce25
lint
wangyems Mar 7, 2024
e83a37f
review comments & add c++ test
wangyems Mar 7, 2024
2b1219e
Merge branch 'main' of github.com:microsoft/onnxruntime into wangye/c…
wangyems Mar 7, 2024
7d0fca8
fix warnings
wangyems Mar 7, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions include/onnxruntime/core/framework/execution_provider.h
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,12 @@ class IExecutionProvider {
*/
virtual common::Status Sync() const { return Status::OK(); }

/**
Set graph annotation for saving/retrieving executable graphs (e.g., cuda graph).
Currently only CUDA execution provider supports it.
wangyems marked this conversation as resolved.
Show resolved Hide resolved
*/
virtual void SetGraphAnnotation(int) {}
wangyems marked this conversation as resolved.
Show resolved Hide resolved

/**
Called when InferenceSession::Run started
NOTE that due to async execution in provider, the actual work of previous
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,8 @@ static const char* const kOrtRunOptionsConfigQnnPerfModePostRun = "qnn.htp_perf_

// Set RPC control latency for QNN HTP backend
static const char* const kOrtRunOptionsConfigQnnRpcControlLatency = "qnn.rpc_control_latency";

// Set graph annotation id for CUDA EP. Use with enable_cuda_graph=true.
// The value should be an integer. If the value is not set, ORT session only captures one cuda graph.
wangyems marked this conversation as resolved.
Show resolved Hide resolved
wangyems marked this conversation as resolved.
Show resolved Hide resolved
// If the value is set to -1, cuda graph capture/replay is disabled in that run.
static const char* const kOrtRunOptionsConfigCudaGraphAnnotation = "ep.cuda.cuda_graph_annotation";
wangyems marked this conversation as resolved.
Show resolved Hide resolved
28 changes: 23 additions & 5 deletions onnxruntime/core/providers/cuda/cuda_execution_provider.cc
Original file line number Diff line number Diff line change
Expand Up @@ -194,8 +194,19 @@
return regular_run_count_before_graph_capture_ >= min_num_runs_before_cuda_graph_capture_;
}

// Returns true when cuda graph capture must be skipped for the current run,
// i.e. when the underlying CUDAGraph reports that capture is not allowed
// (annotation id set to the skip sentinel).
bool CUDAExecutionProvider::PerThreadContext::IsGraphCaptureSkippedOnRun() const {
  const bool capture_allowed = cuda_graph_.IsGraphCaptureAllowedOnRun();
  return !capture_allowed;
}

// Records the cuda graph annotation id for this run on the per-thread context
// and forwards it to the underlying CUDAGraph so that capture/replay can be
// keyed by that id. Signature wrapped to stay within the 120-character limit
// flagged by cpplint.
void CUDAExecutionProvider::PerThreadContext::SetCudaGraphAnnotationId(
    GraphAnnotationOptional_t cuda_graph_annotation_id) {
  cuda_graph_annotation_id_ = cuda_graph_annotation_id;
  cuda_graph_.SetGraphAnnotation(cuda_graph_annotation_id_);
}

// Begins cuda graph capture for this thread's context.
// When no annotation id is set (single-graph-per-session mode) the previously
// captured graph is destroyed first; in annotated mode the existing graphs are
// kept so multiple graphs can coexist, keyed by annotation id.
// NOTE(review): the scraped diff contained both the pre- and post-change lines;
// this is the post-change (conditional Reset) version.
void CUDAExecutionProvider::PerThreadContext::CaptureBegin() {
  if (!cuda_graph_annotation_id_.has_value()) {
    cuda_graph_.Reset();
  }
  cuda_graph_.CaptureBegin();
}

Expand All @@ -205,12 +216,15 @@
}

// Reports whether a cuda graph is available for replay on this thread.
// In annotated mode the per-annotation-id map inside CUDAGraph is consulted;
// otherwise the legacy single-graph flag is used.
bool CUDAExecutionProvider::PerThreadContext::IsGraphCaptured() const {
  if (!cuda_graph_annotation_id_.has_value()) {
    return is_graph_captured_;
  }
  return cuda_graph_.IsAdditionalGraphCaptured(*cuda_graph_annotation_id_);
}

// Replays the captured cuda graph for this thread, passing the current
// annotation id so CUDAGraph can pick the matching executable graph.
// Enforces that a graph has actually been captured first.
// NOTE(review): the scraped diff contained both the pre- and post-change
// return lines; this is the post-change (annotation-aware) version.
Status CUDAExecutionProvider::PerThreadContext::ReplayGraph() {
  ORT_ENFORCE(IsGraphCaptured());
  return cuda_graph_.Replay(cuda_graph_annotation_id_);
}

void CUDAExecutionProvider::PerThreadContext::IncrementRegularRunCountBeforeGraphCapture() {
Expand Down Expand Up @@ -389,15 +403,15 @@
// Called at the start of InferenceSession::Run. Binds the CUDA device for the
// calling thread, then begins cuda graph capture when capture is enabled,
// not skipped for this run (annotation id != skip sentinel), allowed (warm-up
// runs completed), and no graph has been captured yet.
// Condition wrapped across two lines to satisfy the 120-character cpplint
// limit flagged in review.
Status CUDAExecutionProvider::OnRunStart(const onnxruntime::RunOptions& /*run_options*/) {
  // always set CUDA device when session::Run() in case it runs in a worker thread
  CUDA_RETURN_IF_ERROR(cudaSetDevice(GetDeviceId()));
  if (IsGraphCaptureEnabled() && !GetPerThreadContext().IsGraphCaptureSkippedOnRun() &&
      GetPerThreadContext().IsGraphCaptureAllowed() && !GetPerThreadContext().IsGraphCaptured()) {
    LOGS(*GetLogger(), INFO) << "Capturing the cuda graph for this model";
    GetPerThreadContext().CaptureBegin();
  }
  return Status::OK();
}

Status CUDAExecutionProvider::OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& /*run_options*/) {
if (IsGraphCaptureEnabled() && !GetPerThreadContext().IsGraphCaptured()) {
if (IsGraphCaptureEnabled() && !GetPerThreadContext().IsGraphCaptureSkippedOnRun() && !GetPerThreadContext().IsGraphCaptured()) {

Check warning on line 414 in onnxruntime/core/providers/cuda/cuda_execution_provider.cc

View workflow job for this annotation

GitHub Actions / Lint C++

[cpplint] reported by reviewdog 🐶 Lines should be <= 120 characters long [whitespace/line_length] [2] Raw Output: onnxruntime/core/providers/cuda/cuda_execution_provider.cc:414: Lines should be <= 120 characters long [whitespace/line_length] [2]
if (GetPerThreadContext().IsGraphCaptureAllowed()) {
GetPerThreadContext().CaptureEnd();
// CUDA work issued to a capturing stream doesn’t actually run on the GPU,
Expand Down Expand Up @@ -430,7 +444,11 @@
}

// True when the provider option enable_cuda_graph was set to 1.
// NOTE(review): the scraped diff contained both the pre-change
// (`return info_.enable_cuda_graph;`) and post-change lines; this is the
// post-change version — presumably enable_cuda_graph became an integer
// option, so the explicit `== 1` comparison is intended. Confirm against
// CUDAExecutionProviderInfo.
bool CUDAExecutionProvider::IsGraphCaptureEnabled() const {
  return info_.enable_cuda_graph == 1;
}

// Propagates a run-level cuda graph annotation id to this thread's context,
// wrapping the raw id in the optional type the context expects.
void CUDAExecutionProvider::SetGraphAnnotation(GraphAnnotation_t cuda_graph_annotation_id) {
  GraphAnnotationOptional_t annotation{cuda_graph_annotation_id};
  GetPerThreadContext().SetCudaGraphAnnotationId(annotation);
}

bool CUDAExecutionProvider::IsGraphCaptured() const {
Expand Down
4 changes: 4 additions & 0 deletions onnxruntime/core/providers/cuda/cuda_execution_provider.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ class CUDAExecutionProvider : public IExecutionProvider {
bool IsGraphCaptureEnabled() const override;
bool IsGraphCaptured() const override;
Status ReplayGraph() override;
void SetGraphAnnotation(GraphAnnotation_t graph_annotation_id) override;
wangyems marked this conversation as resolved.
Show resolved Hide resolved
void RegisterStreamHandlers(IStreamCommandHandleRegistry& stream_handle_registry, AllocatorMap& allocators) const override;
OrtDevice GetOrtDeviceByMemType(OrtMemType mem_type) const override;
std::vector<AllocatorPtr> CreatePreferredAllocators() override;
Expand Down Expand Up @@ -177,6 +178,8 @@ class CUDAExecutionProvider : public IExecutionProvider {
}

bool IsGraphCaptureAllowed() const;
bool IsGraphCaptureSkippedOnRun() const;
void SetCudaGraphAnnotationId(GraphAnnotationOptional_t cuda_graph_annotation_id);
void CaptureBegin();
void CaptureEnd();
bool IsGraphCaptured() const;
Expand All @@ -202,6 +205,7 @@ class CUDAExecutionProvider : public IExecutionProvider {
CUDAGraph cuda_graph_;
bool is_graph_captured_ = false;
int regular_run_count_before_graph_capture_ = 0;
GraphAnnotationOptional_t cuda_graph_annotation_id_;

// There is chance that the second regular run allocates GPU memory for causes like:
// (1) memory pattern is enabled. (2) arena allocation for stream.
Expand Down
86 changes: 80 additions & 6 deletions onnxruntime/core/providers/cuda/cuda_graph.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,20 @@
stream_ = stream;
}

// Stores the annotation id (possibly empty) that subsequent CaptureBegin /
// CaptureEnd / Replay calls use to key additional executable graphs.
void CUDAGraph::SetGraphAnnotation(GraphAnnotationOptional_t cuda_graph_annotation_id) {
  cuda_graph_annotation_id_ = cuda_graph_annotation_id;
}

void CUDAGraph::CaptureBegin() {
ORT_ENFORCE(!has_graph_exec_,
"This cuda graph has already captured a graph. "
"Create a new instance to capture a new graph.");
if (!cuda_graph_annotation_id_.has_value()) {
ORT_ENFORCE(!has_graph_exec_,
"This cuda graph has already captured a graph. "
"Create a new instance to capture a new graph.");
} else {
if (!IsGraphCaptureAllowedOnRun()) {
return;
wangyems marked this conversation as resolved.
Show resolved Hide resolved
}
}

CUDA_CALL_THROW(cudaStreamSynchronize(stream_));
// For now cuda graph can only work with a single thread. In the future, we
Expand All @@ -30,6 +40,29 @@
}

void CUDAGraph::CaptureEnd() {
if (!IsGraphCaptureAllowedOnRun()) {
wangyems marked this conversation as resolved.
Show resolved Hide resolved
return;
}

if (cuda_graph_annotation_id_.has_value()) {
CUDA_CALL_THROW(cudaStreamEndCapture(stream_, &additional_graph_));
if (additional_graph_ == NULL) {
ORT_THROW("CUDAGraph::CaptureEnd: additional_graph_ is NULL");
}

cudaGraphExec_t graph_exec = NULL;

has_additional_graph_ = true;
CUDA_CALL_THROW(cudaGraphInstantiate(&graph_exec, additional_graph_, NULL, NULL, 0));
CUDA_CALL_THROW(cudaGraphDestroy(additional_graph_));
has_additional_graph_ = false;

GraphAnnotation_t cuda_graph_id = cuda_graph_annotation_id_.value();
graph_exec_map_.emplace(cuda_graph_id, graph_exec);
hariharans29 marked this conversation as resolved.
Show resolved Hide resolved

return;
}

CUDA_CALL_THROW(cudaStreamEndCapture(stream_, &graph_));
if (graph_ == NULL) {
ORT_THROW("CUDAGraph::CaptureEnd: graph_ is NULL");
Expand All @@ -42,15 +75,42 @@
has_graph_ = false;
}

Status CUDAGraph::Replay() {
// Launches the captured cuda graph on this CUDAGraph's stream and synchronizes.
// - If capture is disabled for this run (skip sentinel annotation), returns OK
//   without doing anything.
// - If an annotation id is supplied, launches the executable graph stored
//   under that id in graph_exec_map_, failing if none was captured.
// - Otherwise launches the legacy single graph_exec_.
// Long log line wrapped to satisfy the 120-character cpplint limit flagged
// in review.
Status CUDAGraph::Replay(GraphAnnotationOptional_t cuda_graph_annotation_id) {
  if (!IsGraphCaptureAllowedOnRun()) {
    return Status::OK();
  }
  // Although this function is not thread safe, the lock is not needed here because
  // CUDA EP maintains a separate cuda graph per thread
  if (cuda_graph_annotation_id.has_value()) {
    LOGS_DEFAULT(INFO) << "Replaying CUDA graph on stream " << stream_
                       << " with cuda_graph_annotation_id " << *cuda_graph_annotation_id;
    auto it = graph_exec_map_.find(*cuda_graph_annotation_id);
    if (it == graph_exec_map_.end()) {
      return ORT_MAKE_STATUS(ONNXRUNTIME,
                             FAIL,
                             "CUDAGraph::Replay: graph_exec_map_ does not contain the cuda_graph_annotation_id");
    }
    CUDA_RETURN_IF_ERROR(cudaGraphLaunch(it->second, stream_));
  } else {
    LOGS_DEFAULT(INFO) << "Replaying CUDA graph on stream " << stream_;
    CUDA_RETURN_IF_ERROR(cudaGraphLaunch(graph_exec_, stream_));
  }

  CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream_));
  return Status::OK();
}

// True when an executable graph keyed by this annotation id has already been
// instantiated and stored in graph_exec_map_.
bool CUDAGraph::IsAdditionalGraphCaptured(GraphAnnotation_t cuda_graph_annotation_id) const {
  return graph_exec_map_.count(cuda_graph_annotation_id) != 0;
}

// Decides whether cuda graph capture/replay is permitted for the current run.
// Removed a leftover commented-out std::cout debug line from the original.
bool CUDAGraph::IsGraphCaptureAllowedOnRun() const {
  // No annotation id set: legacy single-graph mode, capture is always allowed.
  if (!cuda_graph_annotation_id_.has_value()) {
    return true;
  }
  // The sentinel annotation id disables capture/replay for this run.
  return *cuda_graph_annotation_id_ != kDefaultSkipGraphCapture;
}

void CUDAGraph::Reset() {
if (has_graph_) {
CUDA_CALL_THROW(cudaGraphDestroy(graph_));
Expand All @@ -62,8 +122,22 @@
}
}

void CUDAGraph::ResetAdditional() {
wangyems marked this conversation as resolved.
Show resolved Hide resolved
if (has_additional_graph_) {
CUDA_CALL_THROW(cudaGraphDestroy(additional_graph_));
has_additional_graph_ = false;
}
if (!graph_exec_map_.empty()) {
for (auto& it : graph_exec_map_) {
CUDA_CALL_THROW(cudaGraphExecDestroy(it.second));
}
graph_exec_map_.clear();
}
}

// Destroys both the legacy single graph state (Reset) and all
// annotation-keyed executable graphs (ResetAdditional).
CUDAGraph::~CUDAGraph() {
  Reset();
  ResetAdditional();
}

} // namespace onnxruntime
20 changes: 18 additions & 2 deletions onnxruntime/core/providers/cuda/cuda_graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,23 +4,34 @@
#pragma once

#include "core/common/common.h"
#include "core/common/optional.h"
#include "core/platform/ort_mutex.h"
#include "core/providers/cuda/cuda_pch.h"

namespace onnxruntime {

using CaptureId_t = unsigned long long;
using GraphAnnotation_t = int;
wangyems marked this conversation as resolved.
Show resolved Hide resolved
using GraphAnnotationOptional_t = optional<GraphAnnotation_t>;

constexpr GraphAnnotation_t kDefaultSkipGraphCapture = -1;
wangyems marked this conversation as resolved.
Show resolved Hide resolved

struct CUDAGraph {
CUDAGraph(){};
CUDAGraph(cudaStream_t stream);
~CUDAGraph();

void SetStream(cudaStream_t stream);
void SetGraphAnnotation(GraphAnnotationOptional_t cuda_graph_annotation_id);
wangyems marked this conversation as resolved.
Show resolved Hide resolved

void CaptureBegin();
void CaptureEnd();
Status Replay();
Status Replay(GraphAnnotationOptional_t cuda_graph_annotation_id);

void Reset();
void ResetAdditional();

bool IsAdditionalGraphCaptured(GraphAnnotation_t cuda_graph_annotation_id) const;
bool IsGraphCaptureAllowedOnRun() const;

private:
cudaGraph_t graph_ = NULL;
Expand All @@ -29,6 +40,11 @@
bool has_graph_ = false;
bool has_graph_exec_ = false;

cudaGraph_t additional_graph_ = NULL;
wangyems marked this conversation as resolved.
Show resolved Hide resolved
std::unordered_map<GraphAnnotation_t, cudaGraphExec_t> graph_exec_map_;

Check warning on line 44 in onnxruntime/core/providers/cuda/cuda_graph.h

View workflow job for this annotation

GitHub Actions / Lint C++

[cpplint] reported by reviewdog 🐶 Add #include <unordered_map> for unordered_map<> [build/include_what_you_use] [4] Raw Output: onnxruntime/core/providers/cuda/cuda_graph.h:44: Add #include <unordered_map> for unordered_map<> [build/include_what_you_use] [4]
GraphAnnotationOptional_t cuda_graph_annotation_id_;
bool has_additional_graph_ = false;

cudaStream_t stream_ = nullptr; // Does not own the stream
};

Expand Down
11 changes: 11 additions & 0 deletions onnxruntime/core/session/inference_session.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2385,6 +2385,16 @@ Status InferenceSession::Run(const RunOptions& run_options,
auto* inter_tp = (control_spinning) ? inter_op_thread_pool_.get() : nullptr;
ThreadPoolSpinningSwitch runs_refcounter_and_tp_spin_control(intra_tp, inter_tp, current_num_runs_);

if (cached_execution_provider_for_graph_replay_.IsGraphCaptureEnabled()) {
// Cuda graph annotation is only considered when enable_cuda_graph is set to true in session options
const std::string& graph_annotation_str =
run_options.config_options.GetConfigOrDefault(kOrtRunOptionsConfigCudaGraphAnnotation, "");
// If graph annotation is not provided, fall back to the one cuda graph per session behavior
if (!graph_annotation_str.empty()) {
cached_execution_provider_for_graph_replay_.SetGraphAnnotation(std::stoi(graph_annotation_str));
wangyems marked this conversation as resolved.
Show resolved Hide resolved
}
}

// Check if this Run() is simply going to be a CUDA Graph replay.
if (cached_execution_provider_for_graph_replay_.IsGraphCaptured()) {
LOGS(*session_logger_, INFO) << "Replaying the captured "
Expand Down Expand Up @@ -2552,6 +2562,7 @@ Status InferenceSession::Run(const RunOptions& run_options,
// N is defined in min_num_runs_before_hip_graph_capture_ for ROCM EP,
// and the value could be different for other EP.
if (retval.IsOK() && cached_execution_provider_for_graph_replay_.IsGraphCaptureEnabled() &&
!cached_execution_provider_for_graph_replay_.IsGraphCaptureSkippedOnRun() &&
!cached_execution_provider_for_graph_replay_.IsGraphCaptured()) {
LOGS(*session_logger_, INFO) << "Start another run for necessary memory allocation or graph capture.";
ORT_RETURN_IF_ERROR(Run(run_options, feed_names, feeds, output_names, p_fetches, p_fetches_device_info));
Expand Down
13 changes: 13 additions & 0 deletions onnxruntime/core/session/inference_session.h
Original file line number Diff line number Diff line change
Expand Up @@ -863,6 +863,14 @@ class InferenceSession {
cached_execution_provider_for_graph_replay_ = execution_provider;
}

// Caches the run-level graph annotation id and forwards it to the EP that
// owns graph capture/replay.
// Fix: validate the cached EP pointer BEFORE assigning graph_annotation_id_,
// so a throwing call leaves the previously cached id untouched instead of
// mutating state on the error path.
void SetGraphAnnotation(int graph_annotation_id) {
  if (!cached_execution_provider_for_graph_replay_) {
    ORT_THROW("Cached EP instance for graph replay is not set yet before calling SetGraphAnnotation()");
  }
  graph_annotation_id_ = graph_annotation_id;
  cached_execution_provider_for_graph_replay_->SetGraphAnnotation(graph_annotation_id_);
}

// True when a graph-replay-capable EP is cached and that EP reports cuda
// graph capture enabled. Split across a local to keep lines short.
bool IsGraphCaptureEnabled() const {
  const auto* ep = cached_execution_provider_for_graph_replay_;
  return ep != nullptr && ep->IsGraphCaptureEnabled();
}
Expand All @@ -871,6 +879,10 @@ class InferenceSession {
return cached_execution_provider_for_graph_replay_ != nullptr && cached_execution_provider_for_graph_replay_->IsGraphCaptured();
}

// True when a graph-replay-capable EP is cached and the current run's
// annotation id is -1, the sentinel that disables cuda graph capture/replay
// for that run (per kOrtRunOptionsConfigCudaGraphAnnotation docs).
// NOTE(review): presumably -1 should reference kDefaultSkipGraphCapture from
// cuda_graph.h rather than a magic number — confirm the constant is visible here.
bool IsGraphCaptureSkippedOnRun() const {
  return cached_execution_provider_for_graph_replay_ != nullptr && graph_annotation_id_ == -1;
}

Status ReplayGraph() {
ORT_ENFORCE(IsGraphCaptured());
if (cached_execution_provider_for_graph_replay_) {
Expand All @@ -884,6 +896,7 @@ class InferenceSession {
}

IExecutionProvider* cached_execution_provider_for_graph_replay_ = nullptr;
int graph_annotation_id_ = 0;
wangyems marked this conversation as resolved.
Show resolved Hide resolved
};

CachedExecutionProviderForGraphReplay cached_execution_provider_for_graph_replay_;
Expand Down
Loading
Loading