Enable Cuda in Graphics Implementation for TensorRT backend #100

Open · wants to merge 10 commits into base: main
13 changes: 13 additions & 0 deletions CMakeLists.txt
@@ -37,6 +37,8 @@ set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard which feat
option(TRITON_ENABLE_GPU "Enable GPU support in backend." ON)
option(TRITON_ENABLE_STATS "Include statistics collections in backend." ON)
option(TRITON_ENABLE_NVTX "Include nvtx markers collection in backend." OFF)
option(TRITON_ENABLE_CIG "Enable Cuda in Graphics (CiG) support in backend." OFF)

set(TRITON_TENSORRT_LIB_PATHS "" CACHE PATH "Paths to TensorRT libraries. Multiple paths may be specified by separating them with a semicolon.")
set(TRITON_TENSORRT_INCLUDE_PATHS "" CACHE PATH "Paths to TensorRT includes. Multiple paths may be specified by separating them with a semicolon.")

@@ -271,6 +273,17 @@ target_link_libraries(
CUDA::cudart
)

if(${TRITON_ENABLE_CIG})
target_compile_definitions(
triton-tensorrt-backend
PRIVATE TRITON_ENABLE_CIG
)
target_link_libraries(
triton-tensorrt-backend
PRIVATE
CUDA::cuda_driver
)
Contributor: These settings could be achieved with a generator expression, couldn't they?

Author: What is a generator expression?
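For context: a generator expression is a $<...> construct that CMake evaluates at build-system generation time, so a condition can be attached directly to a usage requirement instead of wrapping it in if()/endif(). A sketch of the form the reviewer is likely alluding to (untested; assumes TRITON_ENABLE_CIG stays a plain boolean option):

```cmake
# Hypothetical equivalent of the guarded block above: the compile definition
# and the driver-library link apply only when TRITON_ENABLE_CIG is true.
target_compile_definitions(
  triton-tensorrt-backend
  PRIVATE $<$<BOOL:${TRITON_ENABLE_CIG}>:TRITON_ENABLE_CIG>
)
target_link_libraries(
  triton-tensorrt-backend
  PRIVATE $<$<BOOL:${TRITON_ENABLE_CIG}>:CUDA::cuda_driver>
)
```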

endif()

#
# Install
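With these additions, a CiG-enabled build of the backend would be configured along these lines (a sketch; the flag names are taken from this diff, all other options elided):

```shell
cmake -DTRITON_ENABLE_GPU=ON -DTRITON_ENABLE_CIG=ON ..
```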
37 changes: 28 additions & 9 deletions src/instance_state.cc
@@ -257,7 +257,13 @@ ModelInstanceState::ModelInstanceState(

ModelInstanceState::~ModelInstanceState()
{
cudaSetDevice(DeviceId());
#ifdef TRITON_ENABLE_CIG
// Set device if CiG is disabled
if (!model_state_->isCiGEnabled())
#endif // TRITON_ENABLE_CIG
{
cudaSetDevice(DeviceId());
}
for (auto& io_binding_infos : io_binding_infos_) {
for (auto& io_binding_info : io_binding_infos) {
if (!io_binding_info.IsDynamicShapeOutput() &&
@@ -424,7 +430,13 @@ ModelInstanceState::Run(
payload_.reset(new Payload(next_set_, requests, request_count));
SET_TIMESTAMP(payload_->compute_start_ns_);

cudaSetDevice(DeviceId());
#ifdef TRITON_ENABLE_CIG
// Set device if CiG is disabled
if (!model_state_->isCiGEnabled())
#endif // TRITON_ENABLE_CIG
{
cudaSetDevice(DeviceId());
Contributor: Do you mind sharing the reasoning for avoiding the set-device calls? Wouldn't that cause the issue of the model not being placed / executed on the selected device (based on the model config)?

Author:

  1. The intended use of CUDA context sharing is targeted only at single-GPU (RTX end-user) systems; I wanted to avoid complications beyond this use case.
  2. When we call cudaSetDevice(), the CUDA runtime resets the thread to using the default CUDA context.

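To illustrate point 2, a minimal standalone sketch (here the shared context is created with cuCtxCreate only to keep the snippet self-contained; in the CiG case it would be provided by the graphics application):

```cpp
#include <cuda.h>
#include <cuda_runtime_api.h>

int main() {
  cuInit(0);
  CUdevice dev;
  cuDeviceGet(&dev, 0);

  // Stand-in for the application-provided CiG context.
  CUcontext shared_ctx;
  cuCtxCreate(&shared_ctx, 0, dev);  // shared_ctx is now current

  // Per the explanation above, the runtime call rebinds this thread to
  // device 0's primary context, displacing shared_ctx -- which is why the
  // backend skips cudaSetDevice() whenever a CiG context is in use.
  cudaSetDevice(0);
  cudaFree(0);  // force runtime initialization on this thread

  CUcontext current;
  cuCtxGetCurrent(&current);
  // Expected here: current != shared_ctx.

  cuCtxDestroy(shared_ctx);
  return 0;
}
```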
}
#ifdef TRITON_ENABLE_STATS
{
SET_TIMESTAMP(payload_->compute_start_ns_);
@@ -1551,13 +1563,20 @@ ModelInstanceState::EvaluateTensorRTContext(
TRITONSERVER_Error*
ModelInstanceState::InitStreamsAndEvents()
{
// Set the device before preparing the context.
auto cuerr = cudaSetDevice(DeviceId());
if (cuerr != cudaSuccess) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL, (std::string("unable to set device for ") +
Name() + ": " + cudaGetErrorString(cuerr))
.c_str());
#ifdef TRITON_ENABLE_CIG
// Set device if CiG is disabled
if (!model_state_->isCiGEnabled())
#endif // TRITON_ENABLE_CIG
{
// Set the device before preparing the context.
auto cuerr = cudaSetDevice(DeviceId());
if (cuerr != cudaSuccess) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to set device for ") + Name() + ": " +
cudaGetErrorString(cuerr))
.c_str());
}
}

// Create CUDA streams associated with the instance
69 changes: 46 additions & 23 deletions src/model_state.cc
@@ -175,7 +175,13 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model)
ModelState::~ModelState()
{
for (auto& device_engine : device_engines_) {
cudaSetDevice(device_engine.first.first);
#ifdef TRITON_ENABLE_CIG
Reviewer:
I know I had asked about using macros to enable this, but I would like to avoid this kind of guard - if we can use a single method and then have two different implementations of that method / object, I would prefer that to having the macros embedded in the functions / methods.
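One shape that suggestion could take (a hypothetical sketch, not code from this PR): confine the guard to a single helper so every call site stays macro-free.

```cpp
// Hypothetical helper for the backend sources: the TRITON_ENABLE_CIG guard
// lives in exactly one place, and the destructor / Run() /
// InitStreamsAndEvents() call sites simply call SetDeviceUnlessCiG().
inline TRITONSERVER_Error*
SetDeviceUnlessCiG(TensorRTModel* model_state, int device_id)
{
#ifdef TRITON_ENABLE_CIG
  if (model_state->isCiGEnabled()) {
    return nullptr;  // device selection is owned by the shared CiG context
  }
#endif  // TRITON_ENABLE_CIG
  cudaError_t cuerr = cudaSetDevice(device_id);
  if (cuerr != cudaSuccess) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INTERNAL,
        (std::string("unable to set device ") + std::to_string(device_id) +
         ": " + cudaGetErrorString(cuerr))
            .c_str());
  }
  return nullptr;  // success
}
```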

Author: Fixed.

// Set device if CiG is disabled
if (!isCiGEnabled())
#endif // TRITON_ENABLE_CIG
{
cudaSetDevice(device_engine.first.first);
}
auto& runtime = device_engine.second.first;
auto& engine = device_engine.second.second;
// Need to reset explicitly to ensure proper destruction order
@@ -209,6 +215,20 @@ ModelState::CreateEngine(
// We share the engine (for models that don't have dynamic shapes) and
// runtime across instances that have access to the same GPU/NVDLA.
if (eit->second.second == nullptr) {
auto cuerr = cudaSetDevice(gpu_device);
if (cuerr != cudaSuccess) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to set device for ") + Name() + ": " +
cudaGetErrorString(cuerr))
.c_str());
#ifdef TRITON_ENABLE_CIG
// Set device if CiG is disabled
if (!isCiGEnabled())
#endif // TRITON_ENABLE_CIG
{
auto cuerr = cudaSetDevice(gpu_device);
if (cuerr != cudaSuccess) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to set device for ") + Name() + ": " +
cudaGetErrorString(cuerr))
.c_str());
}
}

const bool new_runtime = (eit->second.first == nullptr);
RETURN_IF_ERROR(LoadPlan(
model_path, dla_core_id, &eit->second.first, &eit->second.second,
@@ -321,13 +332,19 @@ ModelState::AutoCompleteConfig()
" to auto-complete config for " + Name())
.c_str()));

cuerr = cudaSetDevice(device_id);
if (cuerr != cudaSuccess) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to set CUDA device to GPU ") +
std::to_string(device_id) + " : " + cudaGetErrorString(cuerr))
.c_str());
#ifdef TRITON_ENABLE_CIG
// Set device if CiG is disabled
if (!isCiGEnabled())
#endif // TRITON_ENABLE_CIG
{
cuerr = cudaSetDevice(device_id);
if (cuerr != cudaSuccess) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to set CUDA device to GPU ") +
std::to_string(device_id) + " : " + cudaGetErrorString(cuerr))
.c_str());
}
}

std::string artifact_name;
@@ -373,13 +390,19 @@ ModelState::AutoCompleteConfig()

RETURN_IF_ERROR(AutoCompleteConfigHelper(model_path));

cuerr = cudaSetDevice(current_device);
if (cuerr != cudaSuccess) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to revert CUDA device to GPU ") +
std::to_string(current_device) + " : " + cudaGetErrorString(cuerr))
.c_str());
#ifdef TRITON_ENABLE_CIG
// Set device if CiG is disabled
if (!isCiGEnabled())
@nnshah1 (Sep 25, 2024):
@tanmayv25, @ashishk98 - is there a way to have a single scoped object,

ScopedCudaDeviceContext

that internally checks whether there is an application context and, if so, uses push / pop - and if not, uses cudaSetDevice?

We don't currently use them in the same locations - but I am wondering whether that would be possible. I think it would be cleaner logically - basically, an 'application_context' takes the place of the 'device', but otherwise the logic remains the same.

    ScopedObject(Device);
    ScopedObject(Context);
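A minimal sketch of such an object (hypothetical; it reuses the isCiGEnabled() / Push / PopCiGContext members added by this PR and mirrors the error handling of ScopedRuntimeCiGContext):

```cpp
// Hypothetical unified scope guard: pushes the application-provided
// context when one exists, otherwise falls back to cudaSetDevice().
struct ScopedCudaDeviceContext {
  ScopedCudaDeviceContext(TensorRTModel* model_state, int device_id)
      : model_state_(model_state)
  {
    if (model_state_->isCiGEnabled()) {
      THROW_IF_BACKEND_MODEL_ERROR(model_state_->PushCiGContext());
    } else {
      cudaSetDevice(device_id);  // plain device selection, as before
    }
  }
  ~ScopedCudaDeviceContext()
  {
    if (model_state_->isCiGEnabled()) {
      THROW_IF_BACKEND_MODEL_ERROR(model_state_->PopCiGContext());
    }
  }
  TensorRTModel* model_state_;
};
```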

Author: We can take a look at this in the next iteration.

#endif // TRITON_ENABLE_CIG
{
cuerr = cudaSetDevice(current_device);
if (cuerr != cudaSuccess) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to revert CUDA device to GPU ") +
std::to_string(current_device) + " : " + cudaGetErrorString(cuerr))
.c_str());
}
}

if (TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_VERBOSE)) {
13 changes: 13 additions & 0 deletions src/tensorrt.cc
@@ -318,6 +318,9 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)
DeviceMemoryTracker::TrackThreadMemoryUsage(lusage.get());
}

#ifdef TRITON_ENABLE_CIG
ScopedRuntimeCiGContext cig_scope(model_state);
#endif // TRITON_ENABLE_CIG

// With each instance we create a ModelInstanceState object and
// associate it with the TRITONBACKEND_ModelInstance.
@@ -353,6 +356,12 @@ TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance)
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
"TRITONBACKEND_ModelInstanceFinalize: delete instance state");
if (!instance_state) {
return nullptr;
}
#ifdef TRITON_ENABLE_CIG
ScopedRuntimeCiGContext cig_scope(instance_state->StateForModel());
#endif // TRITON_ENABLE_CIG

delete instance_state;

@@ -377,6 +386,10 @@ TRITONBACKEND_ModelInstanceExecute(
instance, reinterpret_cast<void**>(&instance_state)));
ModelState* model_state = instance_state->StateForModel();

#ifdef TRITON_ENABLE_CIG
ScopedRuntimeCiGContext cig_scope(model_state);
#endif // TRITON_ENABLE_CIG

// For TensorRT backend, the executing instance may not closely tie to
// TRITONBACKEND_ModelInstance, the instance will be assigned based on
// execution policy.
23 changes: 23 additions & 0 deletions src/tensorrt_model.cc
@@ -26,6 +26,8 @@

#include "tensorrt_model.h"

#include <sstream>

namespace triton { namespace backend { namespace tensorrt {

TensorRTModel::Priority
@@ -54,6 +56,10 @@ TensorRTModel::TensorRTModel(TRITONBACKEND_Model* triton_model)
use_cuda_graphs_(false), gather_kernel_buffer_threshold_(0),
separate_output_stream_(false), eager_batching_(false),
busy_wait_events_(false)
#ifdef TRITON_ENABLE_CIG
,
cig_ctx_(nullptr)
#endif // TRITON_ENABLE_CIG
{
ParseModelConfig();
}
@@ -90,6 +96,23 @@ TensorRTModel::ParseModelConfig()
}
}

#ifdef TRITON_ENABLE_CIG
triton::common::TritonJson::Value parameters;
if (model_config_.Find("parameters", &parameters)) {
triton::common::TritonJson::Value value;
std::string ptr_value;
if (parameters.Find("CIG_CONTEXT_PTR", &value)) {
RETURN_IF_ERROR(value.MemberAsString("string_value", &ptr_value));
Reviewer:
@ashishk98 instead of directly converting here as a special case, I would prefer to use something similar to what is done in the trt-llm backend:

https://github.com/triton-inference-server/tensorrtllm_backend/blob/8ffb174c0fe88e677eeed7928348e20be548f3f6/inflight_batcher_llm/src/model_state.cc#L204

There, a template method converts a parameter to a value - I think the code will be a little clearer to follow.

Also - can we convert to and from a 64-bit integer? So something like:

model_state->GetParameter<uint64>("CUDA_CONTEXT");

It also strikes me that although we currently use value.MemberAsString(), we could use value.MemberAsUint("string_value", &ptr_value) instead (https://github.com/triton-inference-server/common/blob/578491fc3944f77d16a6a38e3d7691c485c47ba0/include/triton/common/triton_json.h#L927).

So three things: 1) add a templated GetParameter() method, 2) use MemberAsUint for the uint64 template, and 3) officially transfer uint64 values and convert them to and from a context.

Author: I have added a GetParameter call for std::string instead of UINT64. This is because when we add the parameter to the model config, it is directly converted into a hex string rather than a numeric string; while parsing the pointer, MemberAsUint therefore fails because it is handed a hex string to parse.
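A sketch of what that specialization might look like (hypothetical; modeled on the trt-llm pattern linked above and the TritonJson accessors, not the exact code in this PR):

```cpp
// Hypothetical templated parameter lookup; only the std::string
// specialization is shown. `parameters` is the "parameters" object from
// the model configuration, as found elsewhere in this file.
template <typename T>
TRITONSERVER_Error* GetParameter(
    triton::common::TritonJson::Value& parameters, const std::string& name,
    T& value);

template <>
TRITONSERVER_Error*
GetParameter<std::string>(
    triton::common::TritonJson::Value& parameters, const std::string& name,
    std::string& value)
{
  triton::common::TritonJson::Value param;
  RETURN_IF_ERROR(parameters.MemberAsObject(name.c_str(), &param));
  // Model-config parameters carry their payload in "string_value".
  return param.MemberAsString("string_value", &value);
}
```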

std::stringstream ss;
ss << ptr_value;
void* ctx_ptr;
ss >> ctx_ptr;
cig_ctx_ = static_cast<CUcontext>(ctx_ptr);
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "CiG Context pointer is set");
}
}
#endif // TRITON_ENABLE_CIG

return nullptr; // Success
}

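For illustration, the corresponding entry in a model's config.pbtxt would look roughly like this (the address shown is a made-up example; the embedding application would supply the real value of its CUDA context pointer):

```protobuf
parameters: {
  key: "CIG_CONTEXT_PTR"
  value: { string_value: "0x7f2a64000000" }
}
```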
61 changes: 61 additions & 0 deletions src/tensorrt_model.h
@@ -25,6 +25,10 @@
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

#ifdef TRITON_ENABLE_CIG
#include <cuda.h>
#endif // TRITON_ENABLE_CIG

#include "triton/backend/backend_model.h"

namespace triton { namespace backend { namespace tensorrt {
@@ -53,6 +57,41 @@ class TensorRTModel : public BackendModel {
bool EagerBatching() { return eager_batching_; }
bool BusyWaitEvents() { return busy_wait_events_; }

#ifdef TRITON_ENABLE_CIG
//! The following functions are related to CiG (CUDA in Graphics) context
//! sharing for the gaming use case. Creating a shared context reduces
//! context-switching overhead and leads to better performance of model
//! execution alongside a graphics workload.
CUcontext GetCiGContext() { return cig_ctx_; }
Reviewer:
@ashishk98 question: is this specific to CiG - or could it be applied to any application-provided CUDA context?

bool isCiGEnabled() { return cig_ctx_ != nullptr; }

inline TRITONSERVER_Error* PushCiGContext()
{
if (CUDA_SUCCESS != cuCtxPushCurrent(cig_ctx_)) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to push CiG context for ") + Name()).c_str());
}
return nullptr;
}

inline TRITONSERVER_Error* PopCiGContext()
{
CUcontext oldCtx{};
if (CUDA_SUCCESS != cuCtxPopCurrent(&oldCtx)) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to pop CiG context for ") + Name()).c_str());
}
if (oldCtx != cig_ctx_) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("popping the wrong CiG context for ") + Name()).c_str());
}
return nullptr;
}
#endif // TRITON_ENABLE_CIG

protected:
common::TritonJson::Value graph_specs_;
Priority priority_;
@@ -61,6 +100,28 @@ class TensorRTModel : public BackendModel {
bool separate_output_stream_;
bool eager_batching_;
bool busy_wait_events_;
#ifdef TRITON_ENABLE_CIG
CUcontext cig_ctx_;
#endif // TRITON_ENABLE_CIG
};

#ifdef TRITON_ENABLE_CIG
struct ScopedRuntimeCiGContext {
ScopedRuntimeCiGContext(TensorRTModel* model_state)
: model_state_(model_state)
{
if (model_state_->isCiGEnabled()) {
THROW_IF_BACKEND_MODEL_ERROR(model_state_->PushCiGContext());
}
}
~ScopedRuntimeCiGContext()
{
if (model_state_->isCiGEnabled()) {
THROW_IF_BACKEND_MODEL_ERROR(model_state_->PopCiGContext());
}
}
TensorRTModel* model_state_;
};
#endif // TRITON_ENABLE_CIG

}}} // namespace triton::backend::tensorrt