Enable Cuda in Graphics Implementation for TensorRT backend #100
base: main
Changes from all commits
134bf33
7353671
ed1296d
05e3786
9ae5a09
89ab580
b624b98
1f1ae7e
b42a8d6
d9aff26
Diff of the TensorRTModel implementation file:
@@ -90,6 +90,14 @@ TensorRTModel::ParseModelConfig()
    }
  }

#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
  std::string ptr_str = "";
  RETURN_IF_ERROR(GetParameter("CUDA_CONTEXT_PTR", ptr_str));
  cuda_ctx = static_cast<CUcontext>(StringToPointer(ptr_str));
  // cuda_ctx = static_cast<CUcontext>(reinterpret_cast<void*>(ptr_str));
  LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "Cuda Context pointer is set");
#endif  // TRITON_ENABLE_CUDA_CTX_SHARING

  return nullptr;  // Success
}
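For context: the CUDA_CONTEXT_PTR parameter read above is expected to carry the address of a CUcontext created by the embedding application (the graphics process that shares its CUDA context). Below is a minimal sketch of how that application might produce the string, assuming it uses the same iostream pointer formatting that the backend's StringToPointer helper (shown in the header diff further down) parses back; the helper name PointerToString is illustrative and not part of this PR.

#include <cuda.h>

#include <sstream>
#include <string>

// Hypothetical helper on the application side: format the shared CUcontext
// handle as text so it can be passed to the backend as the model config
// parameter CUDA_CONTEXT_PTR (its string_value field).
std::string PointerToString(CUcontext ctx)
{
  std::stringstream ss;
  // operator<< on a void* produces an implementation-defined representation
  // (typically hex, e.g. "0x7f..."), which operator>> on a void* reads back.
  ss << static_cast<void*>(ctx);
  return ss.str();
}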
@@ -120,4 +128,28 @@ TensorRTModel::GetCudaStreamPriority()
  return cuda_stream_priority;
}

template <>
TRITONSERVER_Error*
TensorRTModel::GetParameter<std::string>(
    std::string const& name, std::string& str_value)
{
  triton::common::TritonJson::Value parameters;
  TRITONSERVER_Error* err =
      model_config_.MemberAsObject("parameters", &parameters);

[Review comment] Use RETURN_IF_ERROR Macro.

  if (err != nullptr) {
    return err;
    // throw std::runtime_error("Model config doesn't have a parameters
    // section");
  }
  triton::common::TritonJson::Value value;
  err = parameters.MemberAsObject(name.c_str(), &value);

[Review comment] Use RETURN_IF_ERROR macro

  if (err != nullptr) {
    return err;
    // std::string errStr = "Cannot find parameter with name: " + name;
    // throw std::runtime_error(errStr);
  }
  value.MemberAsString("string_value", &str_value);
  return nullptr;
}

}}}  // namespace triton::backend::tensorrt
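Both review comments above ask for the RETURN_IF_ERROR macro. A possible shape of the specialization with that suggestion applied is sketched below, assuming the RETURN_IF_ERROR macro from the Triton backend common utilities, which returns from the enclosing function whenever the wrapped call yields a non-null TRITONSERVER_Error*; this is a sketch, not code from this PR.

template <>
TRITONSERVER_Error*
TensorRTModel::GetParameter<std::string>(
    std::string const& name, std::string& str_value)
{
  triton::common::TritonJson::Value parameters;
  // Propagate any error from the config lookups instead of handling it inline.
  RETURN_IF_ERROR(model_config_.MemberAsObject("parameters", &parameters));
  triton::common::TritonJson::Value value;
  RETURN_IF_ERROR(parameters.MemberAsObject(name.c_str(), &value));
  RETURN_IF_ERROR(value.MemberAsString("string_value", &str_value));
  return nullptr;  // Success
}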
Diff of the TensorRTModel header file:
@@ -25,6 +25,11 @@
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
#include <cuda.h>
#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
#include <sstream>

#include "triton/backend/backend_model.h"

namespace triton { namespace backend { namespace tensorrt {

@@ -34,6 +39,14 @@ class TensorRTModel : public BackendModel {
  TensorRTModel(TRITONBACKEND_Model* triton_model);
  virtual ~TensorRTModel() = default;

  template <typename T>
  TRITONSERVER_Error* GetParameter(std::string const& name, T& value)
  {
    assert(false);
    auto dummy = T();
    return dummy;
  }

  TRITONSERVER_Error* SetTensorRTModelConfig();

  TRITONSERVER_Error* ParseModelConfig();

@@ -53,6 +66,65 @@ class TensorRTModel : public BackendModel {
  bool EagerBatching() { return eager_batching_; }
  bool BusyWaitEvents() { return busy_wait_events_; }

  template <>
  TRITONSERVER_Error* GetParameter<std::string>(
      std::string const& name, std::string& str_value);

  void* StringToPointer(std::string& str)
  {
    std::stringstream ss;
    ss << str;

    void* ctx_ptr;
    ss >> ctx_ptr;
    return ctx_ptr;
  }

  //! Following functions are related to Cuda (Cuda in Graphics) context sharing
  //! for gaming use case. Creating a shared contexts reduces context switching
  //! overhead and leads to better performance of model execution along side
  //! Graphics workload.

[Review comment] related to custom cuda context sharing?

  bool isCudaContextSharingEnabled()
  {
#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
    return cuda_ctx != nullptr;
#else
    return false;
#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
  }

  inline TRITONSERVER_Error* PushCudaContext()
  {
#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
    if (CUDA_SUCCESS != cuCtxPushCurrent(cuda_ctx)) {
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_INTERNAL,
          (std::string("unable to push Cuda context for ") + Name()).c_str());
    }
#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
    return nullptr;
  }

  inline TRITONSERVER_Error* PopCudaContext()
  {
#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
    CUcontext oldCtx{};
    if (CUDA_SUCCESS != cuCtxPopCurrent(&oldCtx)) {
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_INTERNAL,
          (std::string("unable to pop Cuda context for ") + Name()).c_str());
    }
    if (oldCtx != cuda_ctx) {
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_INTERNAL,
          (std::string("popping the wrong Cuda context for ") + Name())
              .c_str());
    }
#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
    return nullptr;
  }

 protected:
  common::TritonJson::Value graph_specs_;
  Priority priority_;

@@ -61,6 +133,30 @@ class TensorRTModel : public BackendModel {
  bool separate_output_stream_;
  bool eager_batching_;
  bool busy_wait_events_;
#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
  CUcontext cuda_ctx = nullptr;
#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
};

struct ScopedRuntimeCudaContext {
  ScopedRuntimeCudaContext(TensorRTModel* model_state)
      : model_state_(model_state)
  {
#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
    if (model_state_->isCudaContextSharingEnabled()) {
      THROW_IF_BACKEND_MODEL_ERROR(model_state_->PushCudaContext());
    }
#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
  }
  ~ScopedRuntimeCudaContext()
  {
#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
    if (model_state_->isCudaContextSharingEnabled()) {
      THROW_IF_BACKEND_MODEL_ERROR(model_state_->PopCudaContext());
    }
#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
  }
  TensorRTModel* model_state_;
};

}}}  // namespace triton::backend::tensorrt
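For illustration, ScopedRuntimeCudaContext is an RAII guard: constructing it pushes the shared context (when sharing is enabled) and destroying it pops that context again. A minimal usage sketch follows; the function name RunInference and the model_state parameter are assumptions for illustration, not call sites taken from this PR.

// Hypothetical call site in the backend. While `scope` is alive and context
// sharing is enabled, the CUDA context supplied via CUDA_CONTEXT_PTR is
// current on the calling thread, so TensorRT work issued here runs inside
// the application's context.
TRITONSERVER_Error*
RunInference(TensorRTModel* model_state)
{
  ScopedRuntimeCudaContext scope(model_state);

  // ... enqueue TensorRT execution, input/output copies, etc. ...

  return nullptr;  // `scope` pops the shared context when it goes out of scope
}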
[Review comment] Do you mind sharing the reasoning behind avoiding the set device calls? Wouldn't that cause the issue of the model not being placed / executed on the selected device (based on the model config)?