From 568cc51c89d5baa868234875d4cee43f53eccfb6 Mon Sep 17 00:00:00 2001
From: Sai Kiran Polisetty
Date: Tue, 15 Oct 2024 23:22:58 +0530
Subject: [PATCH] Remove src_trt8 folder

---
 CMakeLists.txt                       |  117 +-
 src_trt8/filesystem.cc               |   71 -
 src_trt8/filesystem.h                |   39 -
 src_trt8/instance_state.cc           | 4197 --------------------------
 src_trt8/instance_state.h            |  550 ----
 src_trt8/io_binding_info.cc          |  249 --
 src_trt8/io_binding_info.h           |  118 -
 src_trt8/libtriton_tensorrt.ldscript |   30 -
 src_trt8/loader.cc                   |   97 -
 src_trt8/loader.h                    |   60 -
 src_trt8/logging.cc                  |   68 -
 src_trt8/logging.h                   |   52 -
 src_trt8/model_state.cc              |  932 ------
 src_trt8/model_state.h               |  146 -
 src_trt8/output_allocator.cc         |   77 -
 src_trt8/output_allocator.h          |   71 -
 src_trt8/semaphore.h                 |   59 -
 src_trt8/shared_library.cc           |  177 --
 src_trt8/shared_library.h            |   41 -
 src_trt8/tensorrt.cc                 |  418 ---
 src_trt8/tensorrt_model.cc           |  123 -
 src_trt8/tensorrt_model.h            |   66 -
 src_trt8/tensorrt_model_instance.cc  |   79 -
 src_trt8/tensorrt_model_instance.h   |   53 -
 src_trt8/tensorrt_utils.cc           |  509 ----
 src_trt8/tensorrt_utils.h            |  151 -
 26 files changed, 33 insertions(+), 8517 deletions(-)
 delete mode 100644 src_trt8/filesystem.cc
 delete mode 100644 src_trt8/filesystem.h
 delete mode 100644 src_trt8/instance_state.cc
 delete mode 100644 src_trt8/instance_state.h
 delete mode 100644 src_trt8/io_binding_info.cc
 delete mode 100644 src_trt8/io_binding_info.h
 delete mode 100644 src_trt8/libtriton_tensorrt.ldscript
 delete mode 100644 src_trt8/loader.cc
 delete mode 100644 src_trt8/loader.h
 delete mode 100644 src_trt8/logging.cc
 delete mode 100644 src_trt8/logging.h
 delete mode 100644 src_trt8/model_state.cc
 delete mode 100644 src_trt8/model_state.h
 delete mode 100644 src_trt8/output_allocator.cc
 delete mode 100644 src_trt8/output_allocator.h
 delete mode 100644 src_trt8/semaphore.h
 delete mode 100644 src_trt8/shared_library.cc
 delete mode 100644 src_trt8/shared_library.h
 delete mode 100644 src_trt8/tensorrt.cc
 delete mode 100644 src_trt8/tensorrt_model.cc
 delete mode 100644 src_trt8/tensorrt_model.h
 delete mode 100644 src_trt8/tensorrt_model_instance.cc
 delete mode 100644 src_trt8/tensorrt_model_instance.h
 delete mode 100644 src_trt8/tensorrt_utils.cc
 delete mode 100644 src_trt8/tensorrt_utils.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bc1c10d..3368050 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -118,91 +118,40 @@ if(${TRITON_ENABLE_NVTX})
   add_definitions(-DTRITON_ENABLE_NVTX=1)
 endif() # TRITON_ENABLE_NVTX
 
-if(NOT DEFINED ENV{TRT_VERSION})
-  set(ENV{TRT_VERSION} 10.0.1.6)
-endif()
-
-message(STATUS "TRT_VERSION envvar is $ENV{TRT_VERSION}")
-
-#
-# FIXME: Update src folder to build with TRT8 and then remove
-# src_trt8 folder.
-# Currently the iGPU builds are still using TensorRT 8. This means
-# we need to keep supporting TRT 8 for this backend. As there has
-# been significant clean-ups when moving from TRT 8 to TRT 10
-# we are maintaining a previous version of backend for now.
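Aside (illustrative, not part of this patch): the removed FIXME above proposes making the single `src` tree build against both TRT 8 and TRT 10 instead of keeping a duplicated `src_trt8` tree. A minimal, hypothetical sketch of one way to do that is compile-time branching on the TensorRT major version via the `NV_TENSORRT_MAJOR` macro from `NvInferVersion.h` (link against `nvinfer` for `getInferLibVersion()`):

```cpp
// Hypothetical sketch only; assumes a TensorRT installation providing
// NvInferVersion.h (defines NV_TENSORRT_MAJOR) and NvInferRuntime.h.
#include <NvInferRuntime.h>
#include <NvInferVersion.h>

#include <cstdio>

int main()
{
  // Compile-time branch on the TensorRT major version -- one way a single
  // source tree could keep a TRT 8 code path alongside TRT 10.
#if NV_TENSORRT_MAJOR >= 10
  std::printf("built against TensorRT %d (TRT 10+ code path)\n", NV_TENSORRT_MAJOR);
#else
  std::printf("built against TensorRT %d (legacy TRT 8 code path)\n", NV_TENSORRT_MAJOR);
#endif
  // Runtime check of the TensorRT library that is actually loaded.
  std::printf("linked TensorRT library version: %d\n",
              static_cast<int>(getInferLibVersion()));
  return 0;
}
```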
-# -if("$ENV{TRT_VERSION}" VERSION_LESS 10) - set(SOURCE_DIR "src_trt8") - add_library( - triton-tensorrt-backend SHARED - ${SOURCE_DIR}/tensorrt.cc - ${SOURCE_DIR}/model_state.cc - ${SOURCE_DIR}/tensorrt_model.cc - ${SOURCE_DIR}/tensorrt_model.h - ${SOURCE_DIR}/instance_state.cc - ${SOURCE_DIR}/tensorrt_model_instance.cc - ${SOURCE_DIR}/tensorrt_model_instance.h - ${SOURCE_DIR}/tensorrt_utils.cc - ${SOURCE_DIR}/tensorrt_utils.h - ${SOURCE_DIR}/filesystem.h - ${SOURCE_DIR}/filesystem.cc - ${SOURCE_DIR}/semaphore.h - ${SOURCE_DIR}/shared_library.h - ${SOURCE_DIR}/shared_library.cc - ${SOURCE_DIR}/loader.cc - ${SOURCE_DIR}/loader.h - ${SOURCE_DIR}/logging.cc - ${SOURCE_DIR}/logging.h - ${SOURCE_DIR}/output_allocator.cc - ${SOURCE_DIR}/output_allocator.h - ${SOURCE_DIR}/io_binding_info.cc - ${SOURCE_DIR}/io_binding_info.h - ) - # Shared library implementing the Triton Backend API - configure_file(${SOURCE_DIR}/libtriton_tensorrt.ldscript libtriton_tensorrt.ldscript COPYONLY) - - target_include_directories( - triton-tensorrt-backend - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_DIR} - ) -else() - set(SOURCE_DIR "src") - add_library( - triton-tensorrt-backend SHARED - ${SOURCE_DIR}/tensorrt.cc - ${SOURCE_DIR}/model_state.cc - ${SOURCE_DIR}/tensorrt_model.cc - ${SOURCE_DIR}/tensorrt_model.h - ${SOURCE_DIR}/instance_state.cc - ${SOURCE_DIR}/tensorrt_model_instance.cc - ${SOURCE_DIR}/tensorrt_model_instance.h - ${SOURCE_DIR}/shape_tensor.cc - ${SOURCE_DIR}/shape_tensor.h - ${SOURCE_DIR}/tensorrt_utils.cc - ${SOURCE_DIR}/tensorrt_utils.h - ${SOURCE_DIR}/filesystem.h - ${SOURCE_DIR}/filesystem.cc - ${SOURCE_DIR}/semaphore.h - ${SOURCE_DIR}/shared_library.h - ${SOURCE_DIR}/shared_library.cc - ${SOURCE_DIR}/loader.cc - ${SOURCE_DIR}/loader.h - ${SOURCE_DIR}/logging.cc - ${SOURCE_DIR}/logging.h - ${SOURCE_DIR}/output_allocator.cc - ${SOURCE_DIR}/output_allocator.h - ${SOURCE_DIR}/io_binding_info.cc - ${SOURCE_DIR}/io_binding_info.h - ) - # Shared library implementing the Triton Backend API - configure_file(${SOURCE_DIR}/libtriton_tensorrt.ldscript libtriton_tensorrt.ldscript COPYONLY) +add_library( + triton-tensorrt-backend SHARED + src/tensorrt.cc + src/model_state.cc + src/tensorrt_model.cc + src/tensorrt_model.h + src/instance_state.cc + src/tensorrt_model_instance.cc + src/tensorrt_model_instance.h + src/shape_tensor.cc + src/shape_tensor.h + src/tensorrt_utils.cc + src/tensorrt_utils.h + src/filesystem.h + src/filesystem.cc + src/semaphore.h + src/shared_library.h + src/shared_library.cc + src/loader.cc + src/loader.h + src/logging.cc + src/logging.h + src/output_allocator.cc + src/output_allocator.h + src/io_binding_info.cc + src/io_binding_info.h +) +# Shared library implementing the Triton Backend API +configure_file(src/libtriton_tensorrt.ldscript libtriton_tensorrt.ldscript COPYONLY) - target_include_directories( - triton-tensorrt-backend - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_DIR} - ) -endif() +target_include_directories( + triton-tensorrt-backend + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src +) add_library( TritonTensorRTBackend::triton-tensorrt-backend ALIAS triton-tensorrt-backend diff --git a/src_trt8/filesystem.cc b/src_trt8/filesystem.cc deleted file mode 100644 index 4eb22ab..0000000 --- a/src_trt8/filesystem.cc +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include "logging.h" -#include "mutex" -#include "shared_library.h" - -/// FIXME: Duplication of core/src/shared_library.cc -/// Separate shared_library to common library and delete this - -#ifdef _WIN32 -// suppress the min and max definitions in Windef.h. -#define NOMINMAX -#include -#else -#include -#endif - -namespace triton { namespace backend { namespace tensorrt { - -std::string -DirName(const std::string& path) -{ - if (path.empty()) { - return path; - } - - size_t last = path.size() - 1; - while ((last > 0) && (path[last] == '/')) { - last -= 1; - } - - if (path[last] == '/') { - return std::string("/"); - } - - const size_t idx = path.find_last_of("/", last); - if (idx == std::string::npos) { - return std::string("."); - } - if (idx == 0) { - return std::string("/"); - } - - return path.substr(0, idx); -} - -}}} // namespace triton::backend::tensorrt diff --git a/src_trt8/filesystem.h b/src_trt8/filesystem.h deleted file mode 100644 index 678a4b4..0000000 --- a/src_trt8/filesystem.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. 
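Aside (illustrative, not part of this patch): the `DirName()` helper being deleted above mirrors POSIX `dirname()` semantics. A self-contained sketch of the same logic, with a few expected results (paths are made up for illustration):

```cpp
#include <cassert>
#include <string>

// Same logic as the removed DirName(): strip trailing slashes, then return
// everything before the last remaining '/'.
static std::string DirName(const std::string& path)
{
  if (path.empty()) {
    return path;
  }
  size_t last = path.size() - 1;
  while ((last > 0) && (path[last] == '/')) {
    last -= 1;
  }
  if (path[last] == '/') {
    return std::string("/");
  }
  const size_t idx = path.find_last_of("/", last);
  if (idx == std::string::npos) {
    return std::string(".");
  }
  if (idx == 0) {
    return std::string("/");
  }
  return path.substr(0, idx);
}

int main()
{
  assert(DirName("/models/resnet/1/model.plan") == "/models/resnet/1");
  assert(DirName("/models/resnet/1/") == "/models/resnet");
  assert(DirName("model.plan") == ".");
  assert(DirName("/model.plan") == "/");
  return 0;
}
```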
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include - -/// FIXME: Duplication of core/src/filesystem.h -/// Separate filesystem to common library and delete this - -namespace triton { namespace backend { namespace tensorrt { - -/// Get the dirname of a path. -/// \param path The path. -/// \return all but the last segment of the path. -std::string DirName(const std::string& path); - -}}} // namespace triton::backend::tensorrt diff --git a/src_trt8/instance_state.cc b/src_trt8/instance_state.cc deleted file mode 100644 index 24f851f..0000000 --- a/src_trt8/instance_state.cc +++ /dev/null @@ -1,4197 +0,0 @@ -// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include "instance_state.h" - -#include "tensorrt_utils.h" -#include "triton/common/nvtx.h" - -namespace triton { namespace backend { namespace tensorrt { - -namespace { - -TRITONSERVER_Error* -CreateCudaEvent( - const std::string& event_name, unsigned int event_flags, cudaEvent_t* event) -{ - // Not adding 'cudaEventBlockingSync' to reduce gaps between the - // time of event record and the time of signaling blocking thread. - // The busy waiting only happens when there is an inflight request. 
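Aside (illustrative, not part of this patch): the comment above refers to the trade-off between busy-waiting and blocking on CUDA event synchronization. A minimal sketch of the two flag choices with the CUDA runtime API (build with nvcc or link against cudart):

```cpp
#include <cuda_runtime.h>

#include <cstdio>

int main()
{
  cudaEvent_t spin_event;
  cudaEvent_t blocking_event;

  // Without cudaEventBlockingSync, a later cudaEventSynchronize() busy-waits
  // on the CPU, keeping the gap between event completion and thread wake-up
  // small -- the behavior the backend prefers while a request is inflight.
  cudaError_t err =
      cudaEventCreateWithFlags(&spin_event, cudaEventDisableTiming);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "event create failed: %s\n", cudaGetErrorString(err));
    return 1;
  }

  // With cudaEventBlockingSync, the waiting thread yields instead of
  // spinning: lower CPU usage, but longer wake-up latency.
  err = cudaEventCreateWithFlags(
      &blocking_event, cudaEventDisableTiming | cudaEventBlockingSync);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "event create failed: %s\n", cudaGetErrorString(err));
    return 1;
  }

  cudaEventDestroy(spin_event);
  cudaEventDestroy(blocking_event);
  return 0;
}
```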
- auto cuerr = cudaEventCreateWithFlags(event, event_flags); - if (cuerr != cudaSuccess) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to create CUDA event for ") + event_name + ": " + - cudaGetErrorString(cuerr)) - .c_str()); - } - return nullptr; -} - -} // namespace - -#ifdef TRITON_ENABLE_STATS -#define FAIL_ALL_AND_RETURN_IF_ERROR( \ - REQUESTS, REQUEST_COUNT, RESPONSES, S, LOG_MSG) \ - do { \ - TRITONSERVER_Error* farie_err_ = (S); \ - if (farie_err_ != nullptr) { \ - for (uint32_t r = 0; r < REQUEST_COUNT; ++r) { \ - if (RESPONSES[r] != nullptr) { \ - LOG_IF_ERROR( \ - TRITONBACKEND_ResponseSend( \ - RESPONSES[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ - farie_err_), \ - "failed to send TensorRT backend response"); \ - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, (LOG_MSG)); \ - } \ - LOG_IF_ERROR( \ - TRITONBACKEND_ModelInstanceReportStatistics( \ - TritonModelInstance(), REQUESTS[r], false /* success */, 0, 0, \ - 0, 0), \ - "failed reporting request statistics"); \ - LOG_IF_ERROR( \ - TRITONBACKEND_RequestRelease( \ - REQUESTS[r], TRITONSERVER_REQUEST_RELEASE_ALL), \ - "failed releasing request"); \ - REQUESTS[r] = nullptr; \ - } \ - TRITONSERVER_ErrorDelete(farie_err_); \ - return; \ - } \ - } while (false) - -void CUDART_CB -TimestampCaptureCallback(void* data) -{ - SET_TIMESTAMP(*(reinterpret_cast(data))); -} - -#else -#define FAIL_ALL_AND_RETURN_IF_ERROR( \ - REQUESTS, REQUEST_COUNT, RESPONSES, S, LOG_MSG) \ - do { \ - TRITONSERVER_Error* farie_err_ = (S); \ - if (farie_err_ != nullptr) { \ - for (uint32_t r = 0; r < REQUEST_COUNT; ++r) { \ - if (RESPONSES[r] != nullptr) { \ - LOG_IF_ERROR( \ - TRITONBACKEND_ResponseSend( \ - RESPONSES[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ - farie_err_), \ - "failed to send TensorRT backend response"); \ - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, (LOG_MSG)); \ - } \ - LOG_IF_ERROR( \ - TRITONBACKEND_RequestRelease( \ - REQUESTS[r], TRITONSERVER_REQUEST_RELEASE_ALL), \ - "failed releasing request"); \ - REQUESTS[r] = nullptr; \ - } \ - TRITONSERVER_ErrorDelete(farie_err_); \ - return; \ - } \ - } while (false) - -#endif // TRITON_ENABLE_STATS - -TRITONSERVER_Error* -ModelInstanceState::Create( - ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, - ModelInstanceState** state) -{ - try { - *state = new ModelInstanceState(model_state, triton_model_instance); - } - catch (const BackendModelInstanceException& ex) { - RETURN_ERROR_IF_TRUE( - ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, - std::string("unexpected nullptr in BackendModelInstanceException")); - RETURN_IF_ERROR(ex.err_); - } - - // If the model configuration doesn't have an explicit model file - // specified then use the default name. 
- std::string cc_model_filename = (*state)->ArtifactFilename(); - if (cc_model_filename.empty()) { - cc_model_filename = "model.plan"; - } - - auto model_path = JoinPath( - {model_state->RepositoryPath(), std::to_string(model_state->Version()), - cc_model_filename}); - - { - bool exists; - RETURN_IF_ERROR(FileExists(model_path, &exists)); - RETURN_ERROR_IF_FALSE( - exists, TRITONSERVER_ERROR_UNAVAILABLE, - std::string("unable to find '") + model_path + - "' for model instance '" + (*state)->Name() + "'"); - } - - (*state)->InitSemaphore(); - RETURN_IF_ERROR((*state)->InitStreamsAndEvents()); - RETURN_IF_ERROR(model_state->CreateEngine( - (*state)->DeviceId(), (*state)->DLACoreId(), model_path, - (*state)->EnginePtr())); - - // Create TRT API interface once able to obtain implicit batch info, - // all TRT operations must be done after the interface is instantiated. - if (UseTensorRTv1API((*state)->Engine())) { - (*state)->interface_.reset(new TRTv1Interface(*state)); - } else { - (*state)->interface_.reset(new TRTv3Interface(*state)); - } - RETURN_IF_ERROR((*state)->InitOptimizationProfiles()); - RETURN_IF_ERROR((*state)->ValidateIO()); - RETURN_IF_ERROR((*state)->InitIOBindingBuffers()); - - (*state)->completion_thread_ = - std::thread(&ModelInstanceState::ProcessResponse, *state); - - // CUDA 10.1 starts to support CUDA graphs. - // If enabled, build CUDA graphs with a set of graph specs. -#ifdef TRITON_ENABLE_CUDA_GRAPH - if (model_state->UseCudaGraphs()) { - RETURN_IF_ERROR((*state)->InitializeCudaGraph()); - } -#endif - - model_state->RegisterInstance((*state)->DeviceId(), *state); - - std::string profiles_desc; - (*state)->GetConfiguredProfiles(&profiles_desc); - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Created instance ") + (*state)->Name() + " on GPU " + - std::to_string((*state)->DeviceId()) + " with stream priority " + - std::to_string((*state)->CudaStreamPriority()) + - " and optimization profile" + profiles_desc) - .c_str()); - - return nullptr; // success -} - -ModelInstanceState::ModelInstanceState( - ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) - : TensorRTModelInstance(model_state, triton_model_instance), - total_bindings_(0), num_expected_bindings_(0), - uses_implicit_state_(false), model_state_(model_state) -{ - // 'coalesce_request_input_' is set at backend level - { - TRITONBACKEND_Model* model; - THROW_IF_BACKEND_INSTANCE_ERROR( - TRITONBACKEND_ModelInstanceModel(triton_model_instance, &model)); - TRITONBACKEND_Backend* backend; - THROW_IF_BACKEND_INSTANCE_ERROR( - TRITONBACKEND_ModelBackend(model, &backend)); - void* state; - THROW_IF_BACKEND_INSTANCE_ERROR( - TRITONBACKEND_BackendState(backend, &state)); - coalesce_request_input_ = - reinterpret_cast(state)->coalesce_request_input_; - } - - if (Kind() != TRITONSERVER_INSTANCEGROUPKIND_GPU) { - throw triton::backend::BackendModelInstanceException(TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("unable to load model '") + model_state_->Name() + - "', TensorRT backend supports only GPU device") - .c_str())); - } - - signal_stream_ = nullptr; - input_copy_stream_ = nullptr; - output_copy_stream_ = nullptr; - num_copy_streams_ = (model_state_->SeparateOutputStream()) ? 
2 : 1; - next_buffer_binding_set_ = 0; - - next_set_ = 0; - for (size_t idx = 0; idx < EVENT_SET_COUNT; idx++) { - events_[idx].input_ready_ = nullptr; - events_[idx].ready_for_input_ = nullptr; - events_[idx].output_ready_ = nullptr; - events_[idx].ready_for_output_ = nullptr; - events_[idx].compute_input_start_ = nullptr; - events_[idx].compute_input_end_ = nullptr; - events_[idx].compute_output_start_ = nullptr; - } - support_batching_ = (model_state_->MaxBatchSize() > 0); - - TRITONSERVER_Error* err = - SupportsIntegratedZeroCopy(DeviceId(), &zero_copy_support_); - if (err != nullptr) { - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, TRITONSERVER_ErrorMessage(err)); - TRITONSERVER_ErrorDelete(err); - err = nullptr; - zero_copy_support_ = false; - } else if (zero_copy_support_) { - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "Zero copy optimization is enabled"); - } else { - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "Zero copy optimization is disabled"); - } -} - -ModelInstanceState::~ModelInstanceState() -{ - cudaSetDevice(DeviceId()); - for (auto& io_binding_infos : io_binding_infos_) { - for (auto& io_binding_info : io_binding_infos) { - if (!io_binding_info.IsDynamicShapeOutput() && - io_binding_info.GetBuffer() != nullptr) { - cudaError_t err = cudaSuccess; - if (io_binding_info.GetMemoryType() == TRITONSERVER_MEMORY_GPU) { - err = cudaFree(io_binding_info.GetBuffer()); - } else { - err = cudaFreeHost(io_binding_info.GetBuffer()); - } - if (err != cudaSuccess) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - (std::string("Failed to free allocated memory for '") + Name() + - "': " + cudaGetErrorString(err)) - .c_str()); - } - } - } - } - - for (auto& trt_context : trt_contexts_) { - for (const auto& cuda_graph_execs : trt_context.second.cuda_graph_execs_) { - for (const auto& pr : cuda_graph_execs) { - cudaError_t err = cudaGraphExecDestroy(pr.second.cuda_graph_exec_); - if (err != cudaSuccess) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - (std::string("Failed to destroy cuda graph exec: ") + - +cudaGetErrorString(err)) - .c_str()); - } - } - } - trt_context.second.cuda_graph_execs_.clear(); - } - - if (stream_ != nullptr) { - cudaError_t err = cudaStreamDestroy(stream_); - if (err != cudaSuccess) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - (std::string("Failed to destroy cuda stream: ") + - +cudaGetErrorString(err)) - .c_str()); - } - stream_ = nullptr; - } - - if (signal_stream_ != nullptr) { - cudaError_t err = cudaStreamDestroy(signal_stream_); - if (err != cudaSuccess) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - (std::string("Failed to destroy cuda signal stream: ") + - +cudaGetErrorString(err)) - .c_str()); - } - signal_stream_ = nullptr; - } - - if (input_copy_stream_ != nullptr) { - cudaError_t err = cudaStreamDestroy(input_copy_stream_); - if (err != cudaSuccess) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - (std::string("Failed to destroy cuda input copy stream: ") + - +cudaGetErrorString(err)) - .c_str()); - } - input_copy_stream_ = nullptr; - } - - if (output_copy_stream_ != nullptr) { - cudaError_t err = cudaStreamDestroy(output_copy_stream_); - if (err != cudaSuccess) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - (std::string("Failed to destroy cuda output copy stream: ") + - +cudaGetErrorString(err)) - .c_str()); - } - output_copy_stream_ = nullptr; - } - - DestroyEventSet(); - - // Notify the completion thread to exit - completion_queue_.Put(std::move(std::unique_ptr())); - if (completion_thread_.joinable()) { - completion_thread_.join(); - } -} - -void -ModelInstanceState::ProcessRequests( 
- TRITONBACKEND_Request** requests, const uint32_t request_count) -{ - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("TRITONBACKEND_ModelExecute: Issuing ") + Name() + " with " + - std::to_string(request_count) + " requests") - .c_str()); - - Run(requests, request_count); - - bool run_failed = true; - for (size_t i = 0; i < request_count; ++i) { - if (requests[i] != nullptr) { - run_failed = false; - break; - } - } - - // On error, handle the response here instead of delegating to the - // completion thread as the completion thread will wait on CUDA - // events unconditionally, which can be ignored on error. - if (run_failed) { - // On inference error, place the slot back in the queue - // immediately, as all work for the slot should be ignored. - semaphore_->Release(); - } else { - auto event_set_idx = next_set_; - next_set_ = (event_set_idx + 1) % EVENT_SET_COUNT; - payload_->requests_list_.reserve(payload_->request_count_); - for (uint32_t i = 0; i < payload_->request_count_; i++) { - payload_->requests_list_.push_back(payload_->requests_[i]); - } - payload_->requests_ = &payload_->requests_list_[0]; - // Put the details needed by the ProcessResponse thread on the - // queue - completion_queue_.Put(std::move(payload_)); - next_buffer_binding_set_ = - (next_buffer_binding_set_ + 1) % num_copy_streams_; - - // Wait until the states are updated. Barrier is only - // engaged when model has an implicit state. - if (barrier_.get() != nullptr) { - barrier_->get_future().wait(); - } - } -} - -void -ModelInstanceState::Run( - TRITONBACKEND_Request** requests, const uint32_t request_count) -{ - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("TRITONBACKEND_ModelExecute: Running ") + Name() + " with " + - std::to_string(request_count) + " requests") - .c_str()); - - NVTX_RANGE(nvtx_, "Run " + Name()); - - // Should set a barrier so that instance execution can be blocked until - // the state update is completed. - if (uses_implicit_state_) { - barrier_.reset(new std::promise()); - } - - // Need to move the TRITONBACKEND_Request objects, as the lifetime - // must be extended until ProcessResponse completes. - payload_.reset(new Payload(next_set_, requests, request_count)); - SET_TIMESTAMP(payload_->compute_start_ns_); - - cudaSetDevice(DeviceId()); -#ifdef TRITON_ENABLE_STATS - { - SET_TIMESTAMP(payload_->compute_start_ns_); - cudaEventRecord(events_[next_set_].compute_input_start_, signal_stream_); - } -#endif // TRITON_ENABLE_STATS - - const int max_batch_size = StateForModel()->MaxBatchSize(); - - // For each request collect the total batch size for this inference - // execution. The batch-size, number of inputs, and size of each - // input has already been checked so don't need to do that here. - payload_->total_batch_size_ = 0; - for (size_t i = 0; i < payload_->request_count_; i++) { - // If we get a nullptr request then something is badly wrong. Fail - // and release all requests. 
- if (payload_->requests_[i] == nullptr) { - RequestsRespondWithError( - payload_->requests_, payload_->request_count_, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - std::string( - "null request given to TensorRT backend for '" + Name() + "'") - .c_str())); - return; - } - - if (max_batch_size > 0) { - // Retrieve the batch size from one of the inputs, - // if the model support batching, the first dimension size is - // batch size - TRITONBACKEND_Input* input; - auto err = - TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input); - if (err == nullptr) { - const int64_t* shape; - err = TRITONBACKEND_InputProperties( - input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); - payload_->total_batch_size_ += shape[0]; - } - if (err != nullptr) { - RequestsRespondWithError(requests, request_count, err); - return; - } - } else { - payload_->total_batch_size_ += 1; - } - } - - // If there are no valid requests then no need to run the - // inference. This should never happen unless called with an empty - // 'requests' for some reason. - if (payload_->total_batch_size_ == 0) { - return; - } - - // Make sure the maximum batch size is not exceeded. The - // total_batch_size must be 1 for models that don't support batching - // (i.e. max_batch_size == 0). If max_batch_size is exceeded then - // scheduler has done something badly wrong so fail and release all - // requests. - if ((payload_->total_batch_size_ != 1) && - (payload_->total_batch_size_ > (size_t)max_batch_size)) { - RequestsRespondWithError( - payload_->requests_, payload_->request_count_, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - std::string( - "batch size " + std::to_string(payload_->total_batch_size_) + - " for '" + Name() + "', max allowed is " + - std::to_string(max_batch_size)) - .c_str())); - return; - } - - std::map> request_shape_values; - // Scheduler ensures all the requests have identical shape values so - // use values from first shape tensor - TRITONSERVER_Error* err = GetRequestShapeValues( - payload_->total_batch_size_, payload_->requests_[0], - &request_shape_values); - if (err != nullptr) { - RequestsRespondWithError( - payload_->requests_, payload_->request_count_, err); - return; - } - - std::map::iterator citr; - err = GetMostOptimizedProfile( - payload_->total_batch_size_, payload_->requests_, - payload_->request_count_, request_shape_values, &citr); - - if (err != nullptr) { - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, TRITONSERVER_ErrorMessage(err)); - TRITONSERVER_ErrorDelete(err); - err = nullptr; - } - - int binding_offset = citr->first * num_expected_bindings_; - - // At this point we are committed to running inference with all - // 'requests'. Create a response for each request. During input - // processing if there is an error with any request that error will - // be sent immediately with the corresponding response (and the - // response pointer will then be nullptr). The request object - // itself will not be released until after all inferencing is done - // (below) as we may need to access the request object when - // determine how to process outputs (for example, even if we don't - // need the outputs for a request that has an error, we do need to - // know the size of those outputs associated with the request so we - // can skip them in the output tensors). 
- payload_->responses_.reserve(payload_->request_count_); - - for (size_t i = 0; i < payload_->request_count_; i++) { - TRITONBACKEND_Response* response; - auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); - if (err == nullptr) { - payload_->responses_.emplace_back(response); - } else { - payload_->responses_.emplace_back(nullptr); - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response"); - TRITONSERVER_ErrorDelete(err); - } - } - - // Calculate the set of event used with the current buffer set - // in previous execution - int prev_set = (EVENT_SET_COUNT - - (buffer_bindings_.size() % EVENT_SET_COUNT) + next_set_) % - EVENT_SET_COUNT; - auto prev_input_ready_event = model_state_->EagerBatching() - ? events_[prev_set].ready_for_input_ - : nullptr; - std::vector input_dims{(int64_t)payload_->total_batch_size_}; - payload_->collector_.reset(new BackendInputCollector( - payload_->requests_, payload_->request_count_, &payload_->responses_, - model_state_->TritonMemoryManager(), model_state_->EnablePinnedInput(), - input_copy_stream_, events_[next_set_].input_ready_, - prev_input_ready_event, model_state_->GatherKernelBufferThreshold(), - HostPolicyName().c_str(), zero_copy_support_, coalesce_request_input_)); - // For each input, concatenate input values from each request into - // the corresponding binding. - for (int io_index = 0; io_index < num_expected_bindings_; ++io_index) { - auto& io_binding_info = - io_binding_infos_[next_buffer_binding_set_][io_index]; - int binding_index = binding_offset + io_index; - - if (io_binding_info.IsDynamicShapeOutput()) { - citr->second.context_->setOutputAllocator( - io_binding_info.GetName().c_str(), io_binding_info.GetAllocator()); - } - - if (!engine_->bindingIsInput(binding_index)) { - continue; - } - - const std::string& name = engine_->getBindingName(io_index); - - TRITONSERVER_Error* err = nullptr; - // Set the shape binding if needed. If unable to set the shape - // binding then fail all requests. 
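Aside (illustrative, not part of this patch): `binding_index` above is computed as `binding_offset + io_index` because, in the pre-TensorRT-10 bindings API this TRT 8 code targets, each optimization profile owns a full copy of the engine's bindings. A tiny worked example of that layout (numbers are hypothetical):

```cpp
#include <cstdio>

int main()
{
  // Hypothetical engine: 3 I/O tensors per profile, profile 1 selected.
  const int num_expected_bindings = 3;  // bindings per profile
  const int selected_profile = 1;       // profile chosen for this batch

  const int binding_offset = selected_profile * num_expected_bindings;
  for (int io_index = 0; io_index < num_expected_bindings; ++io_index) {
    // Profile 1 therefore uses engine binding indices 3, 4 and 5.
    std::printf("io %d -> engine binding %d\n", io_index,
                binding_offset + io_index);
  }
  return 0;
}
```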
- if (engine_->isShapeBinding(binding_index)) { - auto it = request_shape_values.find(io_index); - if (it != request_shape_values.end()) { - err = ValidateShapeValues( - it->second, citr->second.min_shapes_[binding_index], - citr->second.max_shapes_[binding_index], - citr->second.nb_shape_values_, support_batching_); - } else { - err = TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to find shape values for shape input '") + - name + "' in request for " + Name()) - .c_str()); - FAIL_ALL_AND_RETURN_IF_ERROR( - payload_->requests_, payload_->request_count_, payload_->responses_, - err, "missing shape values for the shape tensor"); - } - if (err != nullptr) { - FAIL_ALL_AND_RETURN_IF_ERROR( - payload_->requests_, payload_->request_count_, payload_->responses_, - err, "invalid shape values encountered for shape inputs"); - } else { - // [FIXME] formalize it, the 'buffer_' may be set directly while forming - // the shape value - memcpy( - io_binding_info.GetBuffer(), &(it->second[0]), - sizeof(int32_t) * it->second.size()); - citr->second.context_->setInputTensorAddress( - name.c_str(), io_binding_info.GetBuffer()); - // [FIXME] should be replaced by above, see another use of - // setInputShapeBinding for detail - citr->second.context_->setInputShapeBinding( - binding_index, - reinterpret_cast(io_binding_info.GetBuffer())); - } - } - - // Skip the upcoming section if it is a shape tensor - if (engine_->isShapeInferenceIO(name.c_str())) { - continue; - } - - // FIXME inefficient as looping in this way may iterate the same - // source_input multiple times - if (io_binding_info.GetBatchInput() != nullptr) { - std::vector shape; - const auto& batch_input = io_binding_info.GetBatchInput()->first; - auto& allocated_memory = io_binding_info.GetBatchInput()->second; - TRITONSERVER_MemoryType mem_type = allocated_memory->MemoryType(); - int64_t mem_type_id = allocated_memory->MemoryTypeId(); - char* input_buffer = allocated_memory->MemoryPtr(); - FAIL_ALL_AND_RETURN_IF_ERROR( - payload_->requests_, payload_->request_count_, payload_->responses_, - payload_->collector_->BatchInputShape(batch_input, &shape), - "error getting the batch input shape"); - FAIL_ALL_AND_RETURN_IF_ERROR( - payload_->requests_, payload_->request_count_, payload_->responses_, - interface_->SetBindingDimensions( - name, shape, citr->second, io_index, binding_index, &input_dims), - "error setting the binding dimension"); - - TRITONSERVER_DataType datatype = batch_input.DataType(); - size_t total_byte_size = GetByteSize(datatype, shape); - - const char* dst_buffer; - size_t dst_buffer_byte_size; - TRITONSERVER_MemoryType dst_memory_type; - int64_t dst_memory_type_id; - FAIL_ALL_AND_RETURN_IF_ERROR( - payload_->requests_, payload_->request_count_, payload_->responses_, - payload_->collector_->ProcessBatchInput( - batch_input, input_buffer, total_byte_size, - {{mem_type, mem_type_id}}, &dst_buffer, &dst_buffer_byte_size, - &dst_memory_type, &dst_memory_type_id), - "error setting the batch input value"); - - if ((batch_input.BatchInputKind() != - BatchInput::Kind::BATCH_MAX_ELEMENT_COUNT_AS_SHAPE) && - (io_binding_info.GetMemoryType() == TRITONSERVER_MEMORY_GPU)) { - bool cuda_used = false; - - FAIL_ALL_AND_RETURN_IF_ERROR( - payload_->requests_, payload_->request_count_, payload_->responses_, - CopyBuffer( - name, mem_type, mem_type_id, io_binding_info.GetMemoryType(), - io_binding_info.GetMemoryTypeId(), total_byte_size, - input_buffer, io_binding_info.GetBuffer(), input_copy_stream_, - &cuda_used), - 
"error copying the batch input buffer"); - if (cuda_used) { - cudaEventRecord(events_[next_set_].input_ready_, input_copy_stream_); - } - } - } else if (io_binding_info.IsBufferRagged()) { - std::vector ragged_shape{0}; - TRITONSERVER_DataType datatype; - for (size_t req_idx = 0; req_idx < payload_->request_count_; req_idx++) { - TRITONBACKEND_Input* repr_input; - FAIL_ALL_AND_RETURN_IF_ERROR( - payload_->requests_, payload_->request_count_, payload_->responses_, - TRITONBACKEND_RequestInput( - payload_->requests_[req_idx], name.c_str(), &repr_input), - (std::string("failed to obtain the input '") + name + "'").c_str()); - - TRITONSERVER_DataType temp_dt; - const int64_t* shape; - uint32_t dims_count; - FAIL_ALL_AND_RETURN_IF_ERROR( - payload_->requests_, payload_->request_count_, payload_->responses_, - TRITONBACKEND_InputProperties( - repr_input, nullptr, &temp_dt, &shape, &dims_count, nullptr, - nullptr), - (std::string("failed to obtain the input properties for '") + name + - "'") - .c_str()); - - ragged_shape[0] += backend::GetElementCount(shape, dims_count); - if (req_idx == 0) { - datatype = temp_dt; - } - } - - FAIL_ALL_AND_RETURN_IF_ERROR( - payload_->requests_, payload_->request_count_, payload_->responses_, - interface_->SetBindingDimensions( - name, ragged_shape, citr->second, io_index, binding_index, - &input_dims), - "error setting the binding dimension"); - - size_t total_byte_size = GetByteSize(datatype, ragged_shape); - - payload_->collector_->ProcessTensor( - name.c_str(), static_cast(io_binding_info.GetBuffer()), - total_byte_size, io_binding_info.GetMemoryType(), - io_binding_info.GetMemoryTypeId()); - } else { - TRITONBACKEND_Input* repr_input; - FAIL_ALL_AND_RETURN_IF_ERROR( - payload_->requests_, payload_->request_count_, payload_->responses_, - TRITONBACKEND_RequestInput( - payload_->requests_[0], name.c_str(), &repr_input), - (std::string("failed to obtain the representative input '") + name + - "'") - .c_str()); - - // Get the shape of the input. The request has already checked - // that the request shape is valid so don't need to do it here. - TRITONSERVER_DataType datatype; - const int64_t* shape; - uint32_t dims_count; - FAIL_ALL_AND_RETURN_IF_ERROR( - payload_->requests_, payload_->request_count_, payload_->responses_, - TRITONBACKEND_InputProperties( - repr_input, nullptr, &datatype, &shape, &dims_count, nullptr, - nullptr), - (std::string("failed to obtain the representative input " - "properties for '") + - name + "'") - .c_str()); - - // The shape for the entire input batch, [total_batch_size, ...] 
- std::vector batchn_shape; - uint64_t dim_idx = 0; - batchn_shape.reserve(dims_count); - if (support_batching_) { - if (!engine_->isShapeBinding(binding_index)) { - batchn_shape.push_back(payload_->total_batch_size_); - dim_idx = 1; - } - } - while (dim_idx < dims_count) { - batchn_shape.push_back(shape[dim_idx++]); - } - - // Set the binding dimension so that output dimensions can be - // obtained - if (!engine_->isShapeBinding(binding_index)) { - FAIL_ALL_AND_RETURN_IF_ERROR( - payload_->requests_, payload_->request_count_, payload_->responses_, - interface_->SetBindingDimensions( - name, batchn_shape, citr->second, io_index, binding_index, - &input_dims), - "error setting the binding dimension"); - } - - size_t total_byte_size = 0; - if (io_binding_info.GetFormat().is_linear_format_) { - total_byte_size = GetByteSize(datatype, batchn_shape); - } else { - batchn_shape[io_binding_info.GetFormat().vectorized_dim_] += - (io_binding_info.GetFormat().components_per_element_ - - (batchn_shape[io_binding_info.GetFormat().vectorized_dim_] % - io_binding_info.GetFormat().components_per_element_)); - total_byte_size = GetByteSize(datatype, batchn_shape); - } - - payload_->collector_->ProcessTensor( - name.c_str(), static_cast(io_binding_info.GetBuffer()), - total_byte_size, io_binding_info.GetMemoryType(), - io_binding_info.GetMemoryTypeId()); - } - } - payload_->collector_->Finalize(); - - const TensorRTContext::CudaGraph* cuda_graph = nullptr; - bool found_exact = false; - // FIXME closest_cuda_graph - FindClosestCudaGraph(citr->second, input_dims, &cuda_graph, &found_exact); - // [DLIS-4283] should re-visit below... - // is below even necessary if we are going to launch a CUDA graph? - // regular input buffer has been filled and batch input buffer has been set, - // unless the below is something that must be changed based on selected graph - if ((cuda_graph != nullptr) && !found_exact) { - size_t input_idx = 0; - for (int io_index = 0; io_index < num_expected_bindings_; ++io_index) { - auto& io_binding_info = - io_binding_infos_[next_buffer_binding_set_][io_index]; - int binding_index = binding_offset + io_index; - if (!engine_->bindingIsInput(binding_index) || - engine_->isShapeBinding(binding_index)) { - continue; - } - FAIL_ALL_AND_RETURN_IF_ERROR( - payload_->requests_, payload_->request_count_, payload_->responses_, - interface_->SetBindingDimensions( - "CUDA graph input", cuda_graph->input_dims_[input_idx], - citr->second, io_index, binding_index, nullptr), - "error setting the binding dimension"); - - // Initialize additional entries in batch input - if (io_binding_info.GetBatchInput() != nullptr) { - const auto& batch_input = io_binding_info.GetBatchInput()->first; - const size_t total_byte_size = GetByteSize( - batch_input.DataType(), cuda_graph->input_dims_[input_idx]); - - auto& allocated_memory = io_binding_info.GetBatchInput()->second; - TRITONSERVER_MemoryType mem_type = allocated_memory->MemoryType(); - int64_t mem_type_id = allocated_memory->MemoryTypeId(); - char* input_buffer = allocated_memory->MemoryPtr(); - - const char* dst_buffer; - size_t dst_buffer_byte_size; - TRITONSERVER_MemoryType dst_memory_type; - int64_t dst_memory_type_id; - FAIL_ALL_AND_RETURN_IF_ERROR( - payload_->requests_, payload_->request_count_, payload_->responses_, - payload_->collector_->ProcessBatchInput( - batch_input, input_buffer, total_byte_size, - {{mem_type, mem_type_id}}, &dst_buffer, &dst_buffer_byte_size, - &dst_memory_type, &dst_memory_type_id), - "error setting the bath input value"); - - 
if ((batch_input.BatchInputKind() != - BatchInput::Kind::BATCH_MAX_ELEMENT_COUNT_AS_SHAPE) && - (io_binding_info.GetMemoryType() == TRITONSERVER_MEMORY_GPU)) { - bool cuda_used = false; - FAIL_ALL_AND_RETURN_IF_ERROR( - payload_->requests_, payload_->request_count_, - payload_->responses_, - CopyBuffer( - "CUDA graph batch input", mem_type, mem_type_id, - io_binding_info.GetMemoryType(), - io_binding_info.GetMemoryTypeId(), total_byte_size, - input_buffer, io_binding_info.GetBuffer(), input_copy_stream_, - &cuda_used), - "error copying the batch input buffer"); - if (cuda_used) { - cudaEventRecord( - events_[next_set_].input_ready_, input_copy_stream_); - } - } - } - input_idx++; - } - } - - // Ensure inputs are ready before execution. - // Output buffers are guaranteed to be available at this point when - // the execution and output copy are on the same stream. - cudaStreamWaitEvent(stream_, events_[next_set_].input_ready_, 0); - -#ifdef TRITON_ENABLE_STATS - cudaStreamWaitEvent(signal_stream_, events_[next_set_].input_ready_, 0); - cudaEventRecord(events_[next_set_].compute_input_end_, signal_stream_); -#endif // TRITON_ENABLE_STATS - - // Wait for the output buffers to be available at this point when - // the execution and output copy are on separate streams - if (model_state_->SeparateOutputStream()) { - cudaStreamWaitEvent(stream_, events_[next_set_].output_ready_, 0); - } - // Async execute the inference using a CUDA graph if available for - // the batch-size, otherwise execution normally. - if (cuda_graph != nullptr) { - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("Context with profile ") + citr->second.profile_name_ + - " [" + std::to_string(citr->first) + "] is launching CUDA graph for " + - Name()) - .c_str()); - - cudaError_t cuda_err = - cudaGraphLaunch(cuda_graph->cuda_graph_exec_, stream_); - if (cuda_err != cudaSuccess) { - cudaStreamSynchronize(stream_); - FAIL_ALL_AND_RETURN_IF_ERROR( - payload_->requests_, payload_->request_count_, payload_->responses_, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to execute graph for inference ") + name_ + - ": " + cudaGetErrorString(cuda_err)) - .c_str()), - "failed to run TRT inference"); - } - // [DLIS-4283] [FIXME] should not need this as TRT adopted the new CUDA - // graph API that exposed event activity - // Event recorded during CUDA graph capture is not visible outside - // of the graph, need to explicitly record it. 
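Aside (illustrative, not part of this patch): the `cudaGraphLaunch()` path above relies on the CUDA stream-capture workflow. A minimal standalone sketch using the CUDA 12 runtime signatures (`cudaGraphInstantiate` took additional arguments in older toolkits); the memset stands in for the captured inference work:

```cpp
#include <cuda_runtime.h>

#include <cstdio>

#define CHECK(call)                                           \
  do {                                                        \
    cudaError_t e_ = (call);                                  \
    if (e_ != cudaSuccess) {                                  \
      std::fprintf(stderr, "%s\n", cudaGetErrorString(e_));   \
      return 1;                                               \
    }                                                         \
  } while (0)

int main()
{
  cudaStream_t stream;
  CHECK(cudaStreamCreate(&stream));

  float* buf = nullptr;
  CHECK(cudaMalloc(&buf, 1024 * sizeof(float)));

  // Capture the work issued on 'stream' into a graph instead of running it.
  CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
  CHECK(cudaMemsetAsync(buf, 0, 1024 * sizeof(float), stream));
  cudaGraph_t graph;
  CHECK(cudaStreamEndCapture(stream, &graph));

  // Instantiate once, then replay cheaply -- analogous to what the backend
  // caches per profile/shape and launches with cudaGraphLaunch().
  cudaGraphExec_t graph_exec;
  CHECK(cudaGraphInstantiate(&graph_exec, graph, 0));
  CHECK(cudaGraphLaunch(graph_exec, stream));
  CHECK(cudaStreamSynchronize(stream));

  CHECK(cudaGraphExecDestroy(graph_exec));
  CHECK(cudaGraphDestroy(graph));
  CHECK(cudaFree(buf));
  CHECK(cudaStreamDestroy(stream));
  return 0;
}
```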
- cudaEventRecord(events_[next_set_].ready_for_input_, stream_); - } else { - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("Context with profile ") + citr->second.profile_name_ + - " [" + std::to_string(citr->first) + "] is being executed for " + - Name()) - .c_str()); - - if (!citr->second.context_->allInputDimensionsSpecified()) { - FAIL_ALL_AND_RETURN_IF_ERROR( - payload_->requests_, payload_->request_count_, payload_->responses_, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - "failed to specify the dimensions of all input " - "bindings"), - "failed to run TRT inference"); - } - if (!citr->second.context_->allInputShapesSpecified()) { - FAIL_ALL_AND_RETURN_IF_ERROR( - payload_->requests_, payload_->request_count_, payload_->responses_, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - "failed to specify the values for all input shape " - "tensors"), - "failed to run TRT inference"); - } - - if (!interface_->Enqueue(citr->second.context_.get())) { - cudaStreamSynchronize(stream_); - FAIL_ALL_AND_RETURN_IF_ERROR( - payload_->requests_, payload_->request_count_, payload_->responses_, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to enqueue for inference ") + Name()) - .c_str()), - "failed to run TRT inference"); - } - } - - cudaEventRecord(events_[next_set_].ready_for_output_, stream_); - -#ifdef TRITON_ENABLE_STATS - cudaStreamWaitEvent(signal_stream_, events_[next_set_].ready_for_output_, 0); - cudaEventRecord(events_[next_set_].compute_output_start_, signal_stream_); -#endif // TRITON_ENABLE_STATS - - // Collect the names of requested outputs. Do not include outputs - // for requests that have already responded with an error. - std::vector> request_required_outputs( - payload_->request_count_); - for (size_t idx = 0; idx < payload_->request_count_; idx++) { - const auto& request = payload_->requests_[idx]; - auto& response = payload_->responses_[idx]; - if (response != nullptr) { - uint32_t output_count; - RESPOND_AND_SET_NULL_IF_ERROR( - &response, TRITONBACKEND_RequestOutputCount(request, &output_count)); - if (response != nullptr) { - for (uint32_t output_idx = 0; output_idx < output_count; output_idx++) { - const char* output_name; - RESPOND_AND_SET_NULL_IF_ERROR( - &response, TRITONBACKEND_RequestOutputName( - request, output_idx, &output_name)); - if (response != nullptr) { - request_required_outputs[idx].insert(output_name); - } - } - } - } - } - - // Wait for the inference to be completed before copying output if - // output copy is on a separate stream - if (model_state_->SeparateOutputStream()) { - cudaStreamWaitEvent( - output_copy_stream_, events_[next_set_].ready_for_output_, 0); - } - - const auto output_stream = - model_state_->SeparateOutputStream() ? 
output_copy_stream_ : stream_; - - // For each requested output, verify that the output can accept the - // actual model output and then copy that output from the GPU - payload_->responder_.reset(new BackendOutputResponder( - payload_->requests_, payload_->request_count_, &payload_->responses_, - model_state_->TritonMemoryManager(), model_state_->MaxBatchSize() > 0, - model_state_->EnablePinnedOutput(), output_stream, - events_[next_set_].output_ready_, zero_copy_support_)); - for (int io_index = 0; io_index < num_expected_bindings_; ++io_index) { - auto& io_binding_info = - io_binding_infos_[next_buffer_binding_set_][io_index]; - int binding_index = binding_offset + io_index; - if (engine_->bindingIsInput(binding_index)) { - continue; - } - - const std::string& name = engine_->getBindingName(io_index); - - nvinfer1::Dims dims; - dims = citr->second.context_->getBindingDimensions(binding_index); - - // Make sure each output is of the expected size and copy it into - // the payload responses. - bool cuda_copy = false; - if (engine_->isShapeBinding(binding_index)) { - // Custom handling for shape tensors - // Obtain the shape value - if (dims.nbDims != 0) { - auto shape_value_ptr = reinterpret_cast( - citr->second.context_->getTensorAddress(name.c_str())); - - // The first shape value must be equal to the total batch_size - if (support_batching_ && - payload_->total_batch_size_ != (uint32_t)*shape_value_ptr) { - FAIL_ALL_AND_RETURN_IF_ERROR( - payload_->requests_, payload_->request_count_, - payload_->responses_, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("failed to retrieve the output shape values " - "from binding '") + - Name() + "'") - .c_str()), - "failed to run TRT response"); - } - - std::vector batchn_shape; - if (support_batching_) { - batchn_shape.push_back(payload_->total_batch_size_); - batchn_shape.push_back(dims.d[0] - 1); - } else { - batchn_shape.push_back(dims.d[0]); - } - - for (size_t idx = 0; idx < payload_->responses_.size(); idx++) { - auto& request = payload_->requests_[idx]; - auto& response = payload_->responses_[idx]; - - if (support_batching_) { - TRITONBACKEND_Input* input; - TRITONBACKEND_RequestInputByIndex(request, 0, &input); - const int64_t* shape; - RESPOND_AND_SET_NULL_IF_ERROR( - &response, TRITONBACKEND_InputProperties( - input, nullptr, nullptr, &shape, nullptr, - nullptr, nullptr)); - batchn_shape[0] = shape[0]; - } - - const size_t tensor_element_cnt = - backend::GetElementCount(batchn_shape); - - TRITONSERVER_DataType dt = ConvertTrtTypeToDataType( - engine_->getBindingDataType(binding_index)); - - // Only need an response tensor for requested outputs. - if ((response != nullptr) && - (request_required_outputs[idx].find(name) != - request_required_outputs[idx].end())) { - TRITONBACKEND_Output* response_output = nullptr; - RESPOND_AND_SET_NULL_IF_ERROR( - &response, TRITONBACKEND_ResponseOutput( - response, &response_output, name.c_str(), dt, - batchn_shape.data(), batchn_shape.size())); - cuda_copy |= SetOutputShapeTensorBuffer( - shape_value_ptr, &response, response_output, tensor_element_cnt, - batchn_shape[0], stream_); - } - } - } - } else if (io_binding_info.IsBufferRagged()) { - // FIXME add correctness checking like below - io_binding_info.SetBatchOutput(model_state_->FindBatchOutput(name)); - - // Process the output tensors with pinned memory address if zero-copy is - // supported, otherwise use device memory. Perform memory copies - // asynchronously and wait for model execution. 
- payload_->responder_->ProcessBatchOutput( - name, *(io_binding_info.GetBatchOutput()), - static_cast(io_binding_info.GetBuffer()), - io_binding_info.GetMemoryType(), io_binding_info.GetMemoryTypeId()); - } else { - std::vector batchn_shape; - - if (engine_->hasImplicitBatchDimension() && support_batching_) { - batchn_shape.push_back(payload_->total_batch_size_); - } - - for (int i = 0; i < dims.nbDims; ++i) { - batchn_shape.push_back(dims.d[i]); - } - - TRITONSERVER_DataType dt = - ConvertTrtTypeToDataType(engine_->getBindingDataType(binding_index)); - - // FIXME process reformat-free output, need to update output - // process code to accept batch1_byte_size and request batch - // size to break down output buffer properly. - size_t batch1_byte_size = GetByteSize(dt, batchn_shape); - if (support_batching_) { - batch1_byte_size /= payload_->total_batch_size_; - } - - if (io_binding_info.GetByteSize() < - (batch1_byte_size * payload_->total_batch_size_) && - !io_binding_info.IsDynamicShapeOutput()) { - FAIL_ALL_AND_RETURN_IF_ERROR( - payload_->requests_, payload_->request_count_, payload_->responses_, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unexpected size for output '") + name + - "', byte-size " + - std::to_string(io_binding_info.GetByteSize()) + - " is less than " + - std::to_string(payload_->total_batch_size_) + " * " + - std::to_string(batch1_byte_size)) - .c_str()), - "failed to run TRT response"); - } - - if (io_binding_info.IsRequestedOutputTensor()) { - // Process the output tensors with pinned memory address if zero-copy is - // supported, otherwise use device memory. Perform memory copies - // asynchronously and wait for model execution. - payload_->responder_->ProcessTensor( - name, dt, batchn_shape, - static_cast(io_binding_info.GetBuffer()), - io_binding_info.GetMemoryType(), io_binding_info.GetMemoryTypeId()); - } - - if (io_binding_info.IsStateOutput()) { - auto updated_states = payload_->responder_->ProcessStateTensor( - name, dt, batchn_shape, - static_cast(io_binding_info.GetBuffer()), - io_binding_info.GetMemoryType(), io_binding_info.GetMemoryTypeId()); - payload_->seq_states_.insert( - payload_->seq_states_.end(), updated_states.begin(), - updated_states.end()); - } - } - } -} - -bool -ModelInstanceState::SetOutputShapeTensorBuffer( - const int32_t* content, TRITONBACKEND_Response** response, - TRITONBACKEND_Output* response_output, const size_t tensor_element_count, - const int64_t batch_size, cudaStream_t stream) -{ - bool cuda_copy = false; - - const size_t expected_byte_size = tensor_element_count * sizeof(int32_t); - - // Allocate a buffer large enough to hold the serialized tensor. - TRITONSERVER_MemoryType actual_memory_type = TRITONSERVER_MEMORY_CPU; - int64_t actual_memory_type_id = 0; - - void* buffer; - auto err = TRITONBACKEND_OutputBuffer( - response_output, &buffer, expected_byte_size, &actual_memory_type, - &actual_memory_type_id); - if (err != nullptr) { - RESPOND_AND_SET_NULL_IF_ERROR(response, err); - return cuda_copy; - } - - const size_t nb_shape_values = tensor_element_count / batch_size; - - // Copy the serialized tensor into the allocated buffer. - bool cuda_used = false; - size_t content_offset = support_batching_ ? 
1 : 0; - size_t buffer_offset = 0; - for (int i = 0; i < batch_size; i++) { - err = CopyBuffer( - "Shape tensor output", TRITONSERVER_MEMORY_CPU /* src_memory_type */, - 0 /* src_memory_type_id */, actual_memory_type, actual_memory_type_id, - nb_shape_values * sizeof(int32_t), (void*)(content + content_offset), - (void*)((char*)buffer + buffer_offset), stream_, &cuda_used); - cuda_copy |= cuda_used; - buffer_offset += (nb_shape_values * sizeof(int32_t)); - } - - if (err != nullptr) { - RESPOND_AND_SET_NULL_IF_ERROR(response, err); - return cuda_copy; - } - - return cuda_copy; -} - -void -ModelInstanceState::ProcessResponse() -{ - while (true) { - NVTX_RANGE(nvtx_, "ProcessResponse " + Name()); - auto payload = std::move(completion_queue_.Get()); - if (payload.get() == nullptr) { - break; - } - auto& event_set = events_[payload->event_set_idx_]; - - // The model execution associated with the current slot - // has consumed the inputs. Put the slot back into the available - // slots so that it can begin enqueuing new memcpys into the input - // buffers - cudaEventSynchronize(event_set.ready_for_input_); - - // This will be empty unless TRITONSERVER_RESET_BINDING_BUFFERS is set to 1 - for (auto& buffer_binding_pair : payload->buffer_input_binding_pairs_) { - cudaMemsetAsync( - buffer_binding_pair.first, 0, buffer_binding_pair.second, - input_copy_stream_); - } - - semaphore_->Release(); - NVTX_MARKER("plan_input_available"); - - // Call Finalize() here to defer CUDA synchronization as much as - // possible - payload->responder_->Finalize(); - cudaEventSynchronize(event_set.output_ready_); - NVTX_MARKER("plan_output_ready"); - - // Update the states - for (auto& state : payload->seq_states_) { - FAIL_ALL_AND_RETURN_IF_ERROR( - payload->requests_, payload->request_count_, payload->responses_, - TRITONBACKEND_StateUpdate(state), "failed to update state"); - } - // Signal the state update completion. - if (barrier_.get() != nullptr) { - barrier_->set_value(); - } - - // Compute ends when the output data copy is completed - uint64_t compute_end_ns = 0; -#ifdef TRITON_ENABLE_STATS - cudaEventSynchronize(event_set.compute_output_start_); - float compute_infer = 0; - LOG_IF_CUDA_ERROR( - cudaEventElapsedTime( - &compute_infer, event_set.compute_input_end_, - event_set.compute_output_start_), - "elapsed failed"); - - float compute_input = 0; - LOG_IF_CUDA_ERROR( - cudaEventElapsedTime( - &compute_input, event_set.compute_input_start_, - event_set.compute_input_end_), - "elapsed time failed."); - - payload->compute_input_end_ns_ = - payload->compute_start_ns_ + (compute_input * 1e6); - payload->compute_output_start_ns_ = - payload->compute_input_end_ns_ + (compute_infer * 1e6); - SET_TIMESTAMP(compute_end_ns); -#endif // TRITON_ENABLE_STATS - - // Send all the responses that haven't already been sent because - // of an earlier error. Note that the responses are not set to - // nullptr here as we need that indication below to determine if - // the request we successful or not. - for (auto& response : payload->responses_) { - if (response != nullptr) { - LOG_IF_ERROR( - TRITONBACKEND_ResponseSend( - response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr), - "failed to send TensorRT backend response"); - } - } - - // Report statistics for each request. 
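Aside (illustrative, not part of this patch): the timing code in `ProcessResponse()` above works because `cudaEventElapsedTime()` reports milliseconds, hence the `* 1e6` conversion to nanoseconds before adding to the wall-clock timestamps. A minimal sketch of that measurement pattern:

```cpp
#include <cuda_runtime.h>

#include <cstdint>
#include <cstdio>

int main()
{
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Events used for timing must NOT be created with cudaEventDisableTiming.
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start, stream);
  // ... GPU work would be enqueued on 'stream' here ...
  cudaEventRecord(stop, stream);
  cudaEventSynchronize(stop);

  float elapsed_ms = 0.0f;
  cudaEventElapsedTime(&elapsed_ms, start, stop);

  // Same conversion as in ProcessResponse(): milliseconds -> nanoseconds.
  const uint64_t elapsed_ns = static_cast<uint64_t>(elapsed_ms * 1e6);
  std::printf(
      "segment took %f ms (%llu ns)\n", elapsed_ms,
      static_cast<unsigned long long>(elapsed_ns));

  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  cudaStreamDestroy(stream);
  return 0;
}
```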
- for (uint32_t r = 0; r < payload->request_count_; ++r) { - auto& request = payload->requests_[r]; - LOG_IF_ERROR( - TRITONBACKEND_ModelInstanceReportStatistics( - TritonModelInstance(), request, - (payload->responses_[r] != nullptr) /* success */, - payload->compute_start_ns_, payload->compute_input_end_ns_, - payload->compute_output_start_ns_, compute_end_ns), - "failed reporting request statistics"); - - LOG_IF_ERROR( - TRITONBACKEND_RequestRelease( - request, TRITONSERVER_REQUEST_RELEASE_ALL), - "failed releasing request"); - } - - // Report the entire batch statistics. - LOG_IF_ERROR( - TRITONBACKEND_ModelInstanceReportBatchStatistics( - TritonModelInstance(), payload->total_batch_size_, - payload->compute_start_ns_, payload->compute_input_end_ns_, - payload->compute_output_start_ns_, compute_end_ns), - "failed reporting batch request statistics"); - - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("TRITONBACKEND_ModelExecute: model ") + Name() + - " released " + std::to_string(payload->request_count_) + " requests") - .c_str()); - } -} - -TRITONSERVER_Error* -ModelInstanceState::GetRequestShapeValues( - size_t total_batch_size, TRITONBACKEND_Request* request, - std::map>* request_shape_values) -{ - // Visit all the inputs and extract the shape values present in the - // request - uint32_t input_count; - RETURN_IF_ERROR(TRITONBACKEND_RequestInputCount(request, &input_count)); - for (uint32_t i = 0; i < input_count; i++) { - TRITONBACKEND_Input* input; - TRITONBACKEND_RequestInputByIndex(request, i, &input); - const char* input_name; - TRITONSERVER_DataType datatype; - const int64_t* shape; - uint32_t dims_count; - uint64_t byte_size; - uint32_t buffer_count; - RETURN_IF_ERROR(TRITONBACKEND_InputProperties( - input, &input_name, &datatype, &shape, &dims_count, &byte_size, - &buffer_count)); - - int io_index = engine_->getBindingIndex(input_name); - if (engine_->isShapeBinding(io_index)) { - auto it = - request_shape_values->emplace(io_index, std::vector()).first; - if (support_batching_) { - it->second.push_back((int32_t)total_batch_size); - } - - // For now being conservative and requiring that shape tensors - // be in a single buffer on the CPU. We can handle more cases in - // future if necessary. - if (buffer_count != 1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("shape tensor for input '") + input_name + - "' must be in single contiguous buffer on CPU") - .c_str()); - } - - size_t data_byte_size; - TRITONSERVER_MemoryType data_memory_type; - int64_t data_memory_id; - const char* data_buffer; - RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( - input, 0 /* idx */, reinterpret_cast(&data_buffer), - &data_byte_size, &data_memory_type, &data_memory_id)); - - if ((data_buffer == nullptr) || - (data_memory_type == TRITONSERVER_MEMORY_GPU)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("shape tensor for input '") + input_name + - "' must be in single contiguous buffer on CPU") - .c_str()); - } - - // Shape tensors datatype is INT32. 
- int64_t element_cnt = backend::GetElementCount(shape, dims_count); - if (support_batching_) { - element_cnt /= shape[0]; - } - const size_t expected_byte_size = - element_cnt * GetByteSize(TRITONSERVER_TYPE_INT32, {1}); - - bool includes_batch_shape_value = false; - if (expected_byte_size != data_byte_size) { - if (expected_byte_size == (data_byte_size - sizeof(int32_t))) { - includes_batch_shape_value = true; - } else { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("shape tensor for input '") + input_name + - "' expected byte size is " + std::to_string(expected_byte_size) + - " [ or " + std::to_string(expected_byte_size + sizeof(int32_t)) + - " if input includes batch shape value] " + ", got " + - std::to_string(data_byte_size)) - .c_str()); - } - } - - const int32_t* dims = reinterpret_cast(data_buffer); - int64_t offset = includes_batch_shape_value ? 1 : 0; - for (int64_t i = offset; i < element_cnt; ++i) { - it->second.push_back(dims[i]); - } - } - } - - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::GetMostOptimizedProfile( - size_t total_batch_size, TRITONBACKEND_Request** requests, - uint32_t request_count, - const std::map>& request_shape_values, - std::map::iterator* citr) -{ - // Returns the TensorRT context that uses profile with shortest - // Manhattan distance in terms of input dimensions [TODO] traverse - // it with more efficient data structure (i.e. K-D tree) - *citr = trt_contexts_.begin(); - if (trt_contexts_.size() != 1) { - int64_t shortest_distance = LLONG_MAX; - for (auto cit = trt_contexts_.begin(); cit != trt_contexts_.end(); cit++) { - int64_t current_distance = 0; - EvaluateTensorRTContext( - cit, total_batch_size, requests, request_count, request_shape_values, - ¤t_distance); - if (current_distance < shortest_distance) { - *citr = cit; - shortest_distance = current_distance; - } - } - if (shortest_distance == LLONG_MAX) { - std::string profiles_str; - for (const auto& trt_context : trt_contexts_) { - profiles_str += - (" " + trt_context.second.profile_name_ + "[" + - std::to_string(trt_context.first) + "]"); - } - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("failed to find any Optimization Profile among [") + - profiles_str + - "] to support the " - "requested dimensions (or shape values), proceeding with " - "first " - "profile.") - .c_str()); - } - } - - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("Optimization profile ") + (*citr)->second.profile_name_ + - " [" + std::to_string((*citr)->first) + "] is selected for " + Name()) - .c_str()); - return nullptr; -} - - -TRITONSERVER_Error* -ModelInstanceState::EvaluateTensorRTContext( - std::map::iterator& citr, size_t total_batch_size, - TRITONBACKEND_Request** requests, uint32_t request_count, - const std::map>& request_shape_values, - int64_t* error_distance) -{ - *error_distance = 0; - - // Visit all the inputs and extract the shape values present in the - // request - uint32_t input_count; - RETURN_IF_ERROR(TRITONBACKEND_RequestInputCount(requests[0], &input_count)); - for (uint32_t i = 0; i < input_count; i++) { - TRITONBACKEND_Input* input; - TRITONBACKEND_RequestInputByIndex(requests[0], i, &input); - const char* input_name; - const int64_t* input_shape; - uint32_t input_dims_count; - RETURN_IF_ERROR(TRITONBACKEND_InputProperties( - input, &input_name, nullptr, &input_shape, &input_dims_count, nullptr, - nullptr)); - std::vector input_shape_vec; - for (uint32_t dim_idx = 0; dim_idx < input_dims_count; 
dim_idx++) { - input_shape_vec.push_back(*(input_shape + dim_idx)); - } - if (support_batching_) { - input_shape_vec[0] = total_batch_size; - } - - int io_index = engine_->getBindingIndex(input_name); - auto& io_binding_info = - io_binding_infos_[next_buffer_binding_set_][io_index]; - if (io_binding_info.IsBufferRagged()) { - std::vector shape_vec{0}; - for (uint32_t req_idx = 0; req_idx < request_count; req_idx++) { - TRITONBACKEND_Input* repr_input; - RETURN_IF_ERROR(TRITONBACKEND_RequestInput( - requests[req_idx], input_name, &repr_input)); - const int64_t* shape; - uint32_t dims_count; - RETURN_IF_ERROR(TRITONBACKEND_InputProperties( - repr_input, nullptr, nullptr, &shape, &dims_count, nullptr, - nullptr)); - shape_vec[0] += backend::GetElementCount(shape, dims_count); - } - auto err = ValidateDimension( - shape_vec, citr->second.min_dims_[io_index], - citr->second.max_dims_[io_index], false); - if (err != nullptr) { - *error_distance = LLONG_MAX; - TRITONSERVER_ErrorDelete(err); - break; - } else { - const auto& opt_dims = citr->second.opt_dims_[io_index]; - *error_distance += std::abs(opt_dims.d[0] - shape_vec[0]); - } - } else { - TRITONSERVER_Error* err; - err = ValidateDimension( - input_shape_vec, citr->second.min_dims_[io_index], - citr->second.max_dims_[io_index], false); - - bool valid_bs = true; - TRITONSERVER_Error* shape_err = nullptr; - bool missing_shape_values = false; - if (engine_->isShapeBinding(io_index)) { - auto it = request_shape_values.find(io_index); - if (it != request_shape_values.end()) { - shape_err = ValidateShapeValues( - it->second, citr->second.min_shapes_[io_index], - citr->second.max_shapes_[io_index], citr->second.nb_shape_values_, - support_batching_); - valid_bs = - (!support_batching_) || (((int32_t)total_batch_size >= - *citr->second.min_shapes_[io_index]) && - ((int64_t)total_batch_size <= - *citr->second.max_shapes_[io_index])); - } else { - missing_shape_values = true; - } - } - - if ((err != nullptr) || (shape_err != nullptr) || !valid_bs || - missing_shape_values) { - *error_distance = LLONG_MAX; - if (err != nullptr) { - TRITONSERVER_ErrorDelete(err); - } - if (shape_err != nullptr) { - TRITONSERVER_ErrorDelete(shape_err); - } - break; - } else { - const auto& opt_dims = citr->second.opt_dims_[io_index]; - *error_distance += std::abs(opt_dims.d[0] - (int64_t)total_batch_size); - for (int idx = 1; idx < opt_dims.nbDims; idx++) { - *error_distance += - std::abs(opt_dims.d[idx] - input_shape_vec[idx - 1]); - } - if (engine_->isShapeBinding(io_index)) { - const auto* opt_shape_values = citr->second.opt_shapes_[io_index]; - *error_distance += - std::abs(*opt_shape_values - (int64_t)total_batch_size); - auto it = request_shape_values.find(io_index); - for (size_t idx = 1; idx < citr->second.nb_shape_values_; idx++) { - *error_distance += - std::abs(*(opt_shape_values + idx) - it->second[idx - 1]); - } - } - } - } - } - - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::InitStreamsAndEvents() -{ - // Set the device before preparing the context. - auto cuerr = cudaSetDevice(DeviceId()); - if (cuerr != cudaSuccess) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, (std::string("unable to set device for ") + - Name() + ": " + cudaGetErrorString(cuerr)) - .c_str()); - } - - // Create CUDA streams associated with the instance - cuda_stream_priority_ = model_state_->GetCudaStreamPriority(); - - // The stream created by default has set priority of 0. 
Destroy the - // the default stream and create a new stream with requested - // priority. - // FIXME, This should be moved to backend repo to directly build - // cuda stream with required priority. - if (cuda_stream_priority_ != 0) { - if (stream_ != nullptr) { - cudaError_t err = cudaStreamDestroy(stream_); - if (err != cudaSuccess) { - TRITONSERVER_LogMessage( - TRITONSERVER_LOG_ERROR, __FILE__, __LINE__, - (std::string("~BackendModelInstance: ") + name_ + - " failed to destroy cuda stream: " + cudaGetErrorString(err)) - .c_str()); - } - stream_ = nullptr; - RETURN_IF_ERROR( - CreateCudaStream(DeviceId(), cuda_stream_priority_, &stream_)); - } - } -#ifdef TRITON_ENABLE_STATS - RETURN_IF_ERROR( - CreateCudaStream(DeviceId(), cuda_stream_priority_, &signal_stream_)); -#endif // TRITON_ENABLE_STATS - RETURN_IF_ERROR( - CreateCudaStream(DeviceId(), cuda_stream_priority_, &input_copy_stream_)); - if (model_state_->SeparateOutputStream()) { - RETURN_IF_ERROR(CreateCudaStream( - DeviceId(), cuda_stream_priority_, &output_copy_stream_)); - } - // Create CUDA events associated with the execution states - RETURN_IF_ERROR(InitEventSet(model_state_->BusyWaitEvents())); - - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::InitEventSet(bool busy_wait_events) -{ - unsigned int event_flags = - (busy_wait_events ? cudaEventDefault : cudaEventBlockingSync) | - cudaEventDisableTiming; - - for (size_t idx = 0; idx < EVENT_SET_COUNT; idx++) { - RETURN_IF_ERROR(CreateCudaEvent( - "Set " + std::to_string(idx) + " ready for input", event_flags, - &events_[idx].ready_for_input_)); - RETURN_IF_ERROR(CreateCudaEvent( - "Set " + std::to_string(idx) + " input ready", event_flags, - &events_[idx].input_ready_)); - RETURN_IF_ERROR(CreateCudaEvent( - "Set " + std::to_string(idx) + " ready for output", event_flags, - &events_[idx].ready_for_output_)); - RETURN_IF_ERROR(CreateCudaEvent( - "Set " + std::to_string(idx) + " output ready", event_flags, - &events_[idx].output_ready_)); -#ifdef TRITON_ENABLE_STATS - RETURN_IF_ERROR(CreateCudaEvent( - "Set " + std::to_string(idx) + " compute output start", - cudaEventDefault, &events_[idx].compute_output_start_)); - RETURN_IF_ERROR(CreateCudaEvent( - "Set " + std::to_string(idx) + " compute input start", cudaEventDefault, - &events_[idx].compute_input_start_)); - RETURN_IF_ERROR(CreateCudaEvent( - "Set " + std::to_string(idx) + " compute input end", cudaEventDefault, - &events_[idx].compute_input_end_)); -#endif // TRITON_ENABLE_STATS - } - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::DestroyEventSet() -{ - for (size_t idx = 0; idx < EVENT_SET_COUNT; idx++) { - if (events_[idx].ready_for_input_ != nullptr) { - cudaEventDestroy(events_[idx].ready_for_input_); - } - if (events_[idx].input_ready_ != nullptr) { - cudaEventDestroy(events_[idx].input_ready_); - } - if (events_[idx].ready_for_output_ != nullptr) { - cudaEventDestroy(events_[idx].ready_for_output_); - } - if (events_[idx].output_ready_ != nullptr) { - cudaEventDestroy(events_[idx].output_ready_); - } -#ifdef TRITON_ENABLE_STATS - if (events_[idx].compute_output_start_ != nullptr) { - cudaEventDestroy(events_[idx].compute_output_start_); - } - if (events_[idx].compute_input_start_ != nullptr) { - cudaEventDestroy(events_[idx].compute_input_start_); - } - if (events_[idx].compute_input_end_ != nullptr) { - cudaEventDestroy(events_[idx].compute_input_end_); - } -#endif - } - return nullptr; -} - -void -ModelInstanceState::InitSemaphore() -{ - // If eager batching is set, we add to the 
semaphore resource count - // which allows to start preparing next batch before the previous - // batch has completed. The number of duplicates are limited by - // number of event sets to prevent too many iterations are run - // ahead and to avoid interference of the event communication in - // the previous execution - int sem_count = (model_state_->EagerBatching()) ? EVENT_SET_COUNT : 1; - semaphore_.reset(new Semaphore(sem_count)); -} - -TRITONSERVER_Error* -ModelInstanceState::InitOptimizationProfiles() -{ - total_bindings_ = engine_->getNbBindings(); - const int total_profiles = engine_->getNbOptimizationProfiles(); - - // TRT sets the optimization profile index to be 0 implicitly with - // the first context creation. As currently triton supports one - // context per engine, in order to set the specified profile_index, - // another context is created and the previous context is destroyed. - std::shared_ptr default_trt_context( - engine_->createExecutionContext()); - if (default_trt_context == nullptr) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to create TensorRT context: ") + - model_state_->GetTensorRTLogger().LastErrorMsg()) - .c_str()); - } - - num_expected_bindings_ = total_bindings_ / total_profiles; - - std::vector> profile_name_index; - // No optimization profile is set for this TensorRT plan - if (ProfileNames().empty()) { - profile_name_index.emplace_back("default", 0); - } else { - for (const auto& profile_name : ProfileNames()) { - int profile_index = 0; - RETURN_IF_ERROR(GetProfileIndex(profile_name, &profile_index)); - profile_name_index.emplace_back(profile_name, profile_index); - } - } - - // Create one TRT context for each specified profile - for (const auto& name_index : profile_name_index) { - const auto& profile_name = name_index.first; - const int profile_index = name_index.second; - auto res = trt_contexts_.emplace( - profile_index, TensorRTContext( - profile_name, profile_index, num_expected_bindings_, - EVENT_SET_COUNT)); - if (!res.second) { - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - (profile_name + " maps to profile index " + - std::to_string(profile_index) + " which has been mapped by " + - res.first->second.profile_name_ + - ", existing optimization profile will be reused") - .c_str()); - continue; - } - if (profile_index == 0) { - res.first->second.context_ = std::move(default_trt_context); - } else { - res.first->second.context_.reset(engine_->createExecutionContext()); - if (res.first->second.context_ == nullptr) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to create TensorRT context: ") + - model_state_->GetTensorRTLogger().LastErrorMsg()) - .c_str()); - } - if (!res.first->second.context_->setOptimizationProfileAsync( - profile_index, stream_)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("Can not set the specified optimization " - "profile ") + - profile_name + "[" + std::to_string(profile_index) + "] for " + - name_ + ". 
Expected optimization profile index range 0-" + - std::to_string(engine_->getNbOptimizationProfiles() - 1)) - .c_str()); - } - cudaStreamSynchronize(CudaStream()); - } - - // Store the profile dimensions for later initializing the input bindings - for (int io_index = 0; io_index < num_expected_bindings_; io_index++) { - const auto binding_index = - profile_index * num_expected_bindings_ + io_index; - if (engine_->bindingIsInput(binding_index)) { - RETURN_IF_ERROR( - GetProfileDimensions(io_index, profile_index, &res.first->second)); - } - } - } - - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::ValidateIO() -{ - // Collect all the expected input and allowed output tensor names - // and validate that the model configuration specifies only those. - std::set allowed_inputs, allowed_outputs, allowed_shape_tensors; - for (int i = 0; i < num_expected_bindings_; ++i) { - if (engine_->bindingIsInput(i)) { - allowed_inputs.emplace(engine_->getBindingName(i)); - } else { - allowed_outputs.emplace(engine_->getBindingName(i)); - } - if (engine_->isExecutionBinding(i)) { - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("Detected ") + engine_->getBindingName(i) + - " as execution binding for " + Name()) - .c_str()); - } - if (engine_->isShapeBinding(i)) { - allowed_shape_tensors.emplace(engine_->getBindingName(i)); - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("Detected ") + engine_->getBindingName(i) + - " as shape binding for " + Name()) - .c_str()); - } - } - - triton::common::TritonJson::Value config_inputs; - RETURN_IF_ERROR( - model_state_->ModelConfig().MemberAsArray("input", &config_inputs)); - if (allowed_inputs.size() < config_inputs.ArraySize()) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - std::string( - "unable to load model '" + model_state_->Name() + - "', configuration expects " + - std::to_string(config_inputs.ArraySize()) + - " inputs, model provides at most " + - std::to_string(allowed_inputs.size())) - .c_str()); - } - - for (size_t i = 0; i < config_inputs.ArraySize(); i++) { - triton::common::TritonJson::Value io; - RETURN_IF_ERROR(config_inputs.IndexAsObject(i, &io)); - RETURN_IF_ERROR(CheckAllowedModelInput(io, allowed_inputs)); - } - - triton::common::TritonJson::Value config_outputs; - RETURN_IF_ERROR( - model_state_->ModelConfig().MemberAsArray("output", &config_outputs)); - for (size_t i = 0; i < config_outputs.ArraySize(); i++) { - triton::common::TritonJson::Value io; - RETURN_IF_ERROR(config_outputs.IndexAsObject(i, &io)); - RETURN_IF_ERROR(CheckAllowedModelOutput(io, allowed_outputs)); - } - - RETURN_IF_ERROR(ValidateIOHelper( - config_inputs, allowed_shape_tensors, true /* is_input */)); - RETURN_IF_ERROR(ValidateIOHelper( - config_outputs, allowed_shape_tensors, false /* is_input */)); - - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::ValidateIOHelper( - common::TritonJson::Value& ios, - const std::set& allowed_shape_tensors, const bool is_input) -{ - std::string type = is_input ? 
"input" : "output"; - for (size_t i = 0; i < ios.ArraySize(); i++) { - triton::common::TritonJson::Value io; - RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); - - std::string io_name; - RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); - - std::string io_data_type; - RETURN_IF_ERROR(io.MemberAsString("data_type", &io_data_type)); - if (!ConvertDataTypeToTrtType( - ModelConfigDataTypeToTritonServerDataType(io_data_type)) - .first) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unsupported datatype") + io_data_type + " for " + type + - " '" + io_name + "' for model '" + model_state_->Name() + "'") - .c_str()); - } - - // Check the shape tensor specification - if (allowed_shape_tensors.find(io_name) != allowed_shape_tensors.end()) { - bool is_shape_tensor = false; - RETURN_IF_ERROR(io.MemberAsBool("is_shape_tensor", &is_shape_tensor)); - if (!is_shape_tensor) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (type + " '" + io_name + "' for model '" + model_state_->Name() + - "' is a shape tensor but the model configuration " - "doesn't mark " - "it as a shape tensor.") - .c_str()); - } - } else { - bool is_shape_tensor = false; - RETURN_IF_ERROR(io.MemberAsBool("is_shape_tensor", &is_shape_tensor)); - if (is_shape_tensor) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (type + " '" + io_name + "' for model '" + model_state_->Name() + - "' is incorrectly marked as a shape tensor in the model " - "configuration.") - .c_str()); - } - } - } - - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::InitIOBindingBuffers() -{ - triton::common::TritonJson::Value config_inputs; - RETURN_IF_ERROR( - model_state_->ModelConfig().MemberAsArray("input", &config_inputs)); - triton::common::TritonJson::Value config_outputs; - RETURN_IF_ERROR( - model_state_->ModelConfig().MemberAsArray("output", &config_outputs)); - - // Initialize the inputs and outputs. Make sure the model matches - // what is in the configuration. Allocate memory for the maximum - // possible batch size: min(engine maximum, config maximum) - io_binding_infos_.push_back( - std::vector(num_expected_bindings_)); - buffer_bindings_.push_back(std::vector(total_bindings_, nullptr)); - - // Use an additional set of buffers if a separate stream is used for - // output - if (model_state_->SeparateOutputStream()) { - io_binding_infos_.push_back( - std::vector(num_expected_bindings_)); - buffer_bindings_.push_back(std::vector(total_bindings_, nullptr)); - } - - // Sequence State should be processed at the end. 
- for (int s = 0; s < num_copy_streams_; s++) { - next_buffer_binding_set_ = s; - RETURN_IF_ERROR(InitializeConfigShapeInputBindings(config_inputs)); - RETURN_IF_ERROR(InitializeConfigExecuteInputBindings(config_inputs)); - RETURN_IF_ERROR( - InitializeSequenceControlInputBindings(model_state_->ModelConfig())); - RETURN_IF_ERROR(InitializeBatchInputBindings(model_state_->ModelConfig())); - RETURN_IF_ERROR( - InitializeSequenceStateInputBindings(model_state_->ModelConfig())); - } - - for (const auto& trt_context : trt_contexts_) { - if (!trt_context.second.context_->allInputDimensionsSpecified()) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - "failed to specify the dimensions of all input bindings"); - } - if (!trt_context.second.context_->allInputShapesSpecified()) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - "failed to specify the values of all input shape tensors"); - } - } - - // Validate the batch dimension against the implicit batch dimension - // if available. - if (engine_->hasImplicitBatchDimension() && - (model_state_->MaxBatchSize() > engine_->getMaxBatchSize())) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("unexpected configuration maximum batch size ") + - std::to_string(model_state_->MaxBatchSize()) + " for '" + Name() + - "', model maximum is " + std::to_string(engine_->getMaxBatchSize())) - .c_str()); - } - - // Batch output must be processed before other outputs and sequence state - // should be processed at the end. - for (int s = 0; s < num_copy_streams_; s++) { - next_buffer_binding_set_ = s; - RETURN_IF_ERROR(InitializeBatchOutputBindings(model_state_->ModelConfig())); - RETURN_IF_ERROR(InitializeConfigShapeOutputBindings(config_outputs)); - RETURN_IF_ERROR(InitializeConfigExecuteOutputBindings(config_outputs)); - RETURN_IF_ERROR( - InitializeSequenceStateOutputBindings(model_state_->ModelConfig())); - } - next_buffer_binding_set_ = 0; - // Make sure every index which corresponds to an execution binding - // is initialized. - for (int s = 0; s < num_copy_streams_; ++s) { - for (int i = 0; i < num_expected_bindings_; ++i) { - if (!io_binding_infos_[s][i].IsBufferAllocated() && - engine_->isExecutionBinding(i)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("expected configuration for ") + - std::string((engine_->bindingIsInput(i) ? 
"input" : "output")) + - " '" + engine_->getBindingName(i) + "' for " + Name()) - .c_str()); - } - } - } - - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::InitializeConfigShapeInputBindings( - common::TritonJson::Value& config_inputs) -{ - for (size_t i = 0; i < config_inputs.ArraySize(); i++) { - triton::common::TritonJson::Value io; - RETURN_IF_ERROR(config_inputs.IndexAsObject(i, &io)); - std::string io_name; - RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); - std::string io_data_type; - RETURN_IF_ERROR(io.MemberAsString("data_type", &io_data_type)); - common::TritonJson::Value model_config_dims; - common::TritonJson::Value reshape; - if (io.Find("reshape", &reshape)) { - RETURN_IF_ERROR(reshape.MemberAsArray("shape", &model_config_dims)); - } else { - RETURN_IF_ERROR(io.MemberAsArray("dims", &model_config_dims)); - } - - RETURN_IF_ERROR(InitializeShapeInputBinding( - io_name, ModelConfigDataTypeToTritonServerDataType(io_data_type), - model_config_dims)); - } - - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::InitializeConfigExecuteInputBindings( - common::TritonJson::Value& config_inputs) -{ - for (size_t i = 0; i < config_inputs.ArraySize(); i++) { - triton::common::TritonJson::Value io; - RETURN_IF_ERROR(config_inputs.IndexAsObject(i, &io)); - std::string io_name; - RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); - std::string io_datatype; - RETURN_IF_ERROR(io.MemberAsString("data_type", &io_datatype)); - common::TritonJson::Value model_config_dims; - common::TritonJson::Value reshape; - if (io.Find("reshape", &reshape)) { - RETURN_IF_ERROR(reshape.MemberAsArray("shape", &model_config_dims)); - } else { - RETURN_IF_ERROR(io.MemberAsArray("dims", &model_config_dims)); - } - bool io_allow_ragged_batch = false; - triton::common::TritonJson::Value allow_ragged_batch; - if (io.Find("allow_ragged_batch", &allow_ragged_batch)) { - RETURN_IF_ERROR(allow_ragged_batch.AsBool(&io_allow_ragged_batch)); - } - - RETURN_IF_ERROR(InitializeExecuteInputBinding( - io_name, io_datatype, model_config_dims, false, io_allow_ragged_batch)); - } - - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::InitializeSequenceControlInputBindings( - common::TritonJson::Value& config) -{ - common::TritonJson::Value sequence_batching; - if (model_state_->ModelConfig().Find( - "sequence_batching", &sequence_batching)) { - std::vector boolean_kinds{ - "CONTROL_SEQUENCE_START", "CONTROL_SEQUENCE_END", - "CONTROL_SEQUENCE_READY"}; - - for (const auto& control_kind : boolean_kinds) { - const bool required = false; - - std::string tensor_name; - std::string tensor_datatype; - RETURN_IF_ERROR(GetBooleanSequenceControlProperties( - sequence_batching, model_state_->Name(), control_kind, required, - &tensor_name, &tensor_datatype, nullptr, nullptr, nullptr, nullptr, - nullptr, nullptr)); - if (!tensor_name.empty()) { - // Control tensors must have shape [1]. 
- common::TritonJson::Value dims{ - triton::common::TritonJson::ValueType::ARRAY}; - RETURN_IF_ERROR(dims.AppendInt(1)); - - RETURN_IF_ERROR(InitializeExecuteInputBinding( - tensor_name, tensor_datatype, dims, true)); - } - } - - std::vector typdef_kinds{"CONTROL_SEQUENCE_CORRID"}; - - for (const auto& control_kind : typdef_kinds) { - const bool required = false; - - std::string tensor_name; - std::string tensor_datatype; - RETURN_IF_ERROR(GetTypedSequenceControlProperties( - sequence_batching, model_state_->Name(), control_kind, required, - &tensor_name, &tensor_datatype)); - if (!tensor_name.empty()) { - // Control tensors must have shape [1]. - common::TritonJson::Value dims{ - triton::common::TritonJson::ValueType::ARRAY}; - RETURN_IF_ERROR(dims.AppendInt(1)); - - RETURN_IF_ERROR(InitializeExecuteInputBinding( - tensor_name, tensor_datatype, dims, true)); - } - } - } - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::InitializeSequenceStateInputBindings( - common::TritonJson::Value& config) -{ - common::TritonJson::Value sequence_batching; - if (model_state_->ModelConfig().Find( - "sequence_batching", &sequence_batching)) { - triton::common::TritonJson::Value states; - if (sequence_batching.Find("state", &states)) { - for (size_t i = 0; i < states.ArraySize(); i++) { - triton::common::TritonJson::Value io; - RETURN_IF_ERROR(states.IndexAsObject(i, &io)); - std::string input_name; - RETURN_IF_ERROR(io.MemberAsString("input_name", &input_name)); - std::string io_datatype; - RETURN_IF_ERROR(io.MemberAsString("data_type", &io_datatype)); - - common::TritonJson::Value model_config_dims; - RETURN_IF_ERROR(io.MemberAsArray("dims", &model_config_dims)); - - RETURN_IF_ERROR(InitializeExecuteInputBinding( - input_name, io_datatype, model_config_dims, false /* is_control */, - false /* is_ragged */, true /* is_state */)); - } - } - } - - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::InitializeSequenceStateOutputBindings( - common::TritonJson::Value& config) -{ - common::TritonJson::Value sequence_batching; - if (model_state_->ModelConfig().Find( - "sequence_batching", &sequence_batching)) { - triton::common::TritonJson::Value states; - if (sequence_batching.Find("state", &states)) { - for (size_t i = 0; i < states.ArraySize(); i++) { - uses_implicit_state_ = true; - triton::common::TritonJson::Value io; - RETURN_IF_ERROR(states.IndexAsObject(i, &io)); - std::string output_name; - RETURN_IF_ERROR(io.MemberAsString("output_name", &output_name)); - std::string io_datatype; - RETURN_IF_ERROR(io.MemberAsString("data_type", &io_datatype)); - - common::TritonJson::Value model_config_dims; - RETURN_IF_ERROR(io.MemberAsArray("dims", &model_config_dims)); - - RETURN_IF_ERROR(InitializeExecuteOutputBinding( - output_name, io_datatype, model_config_dims, true /* is_state */)); - } - } - } - - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::InitializeBatchInputBindings( - common::TritonJson::Value& config) -{ - std::vector batch_inputs; - BatchInput::ParseFromModelConfig(config, &batch_inputs); - for (const auto& batch_input : batch_inputs) { - for (const auto& tensor_name : batch_input.TargetNames()) { - TRITONSERVER_DataType tensor_datatype = batch_input.DataType(); - common::TritonJson::Value dims{ - triton::common::TritonJson::ValueType::ARRAY}; - // Different batch input expects different shape, note that - // we are setting config input shape here so the batch dimension - // is not included - if (model_state_->MaxBatchSize() == 0) { - // If the model doesn't 
support batching, the range of some - // batch input kind is convergent to a fixed value, need to - // specify the fixed value in such case. - // Note that batch input is intended to be used with batching model, - // the following is more for completeness. - switch (batch_input.BatchInputKind()) { - case BatchInput::Kind::BATCH_ELEMENT_COUNT: - case BatchInput::Kind::BATCH_ACCUMULATED_ELEMENT_COUNT: - RETURN_IF_ERROR(dims.AppendInt(1)); - break; - case BatchInput::Kind::BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO: - RETURN_IF_ERROR(dims.AppendInt(2)); - break; - case BatchInput::Kind::BATCH_MAX_ELEMENT_COUNT_AS_SHAPE: - RETURN_IF_ERROR(dims.AppendInt(-1)); - break; - case BatchInput::Kind::BATCH_ITEM_SHAPE: - case BatchInput::Kind::BATCH_ITEM_SHAPE_FLATTEN: { - // Compiler doesn't like switch case fall through, - // add conditional handling - if (batch_input.BatchInputKind() == - BatchInput::Kind::BATCH_ITEM_SHAPE) { - RETURN_IF_ERROR(dims.AppendInt(1)); - } - triton::common::TritonJson::Value inputs; - RETURN_IF_ERROR(config.MemberAsArray("input", &inputs)); - for (size_t i = 0; i < inputs.ArraySize(); ++i) { - triton::common::TritonJson::Value input; - RETURN_IF_ERROR(inputs.IndexAsObject(i, &input)); - std::string input_name; - RETURN_IF_ERROR(input.MemberAsString("name", &input_name)); - if (input_name == batch_input.SourceInputs()[0]) { - // Get input config shape - common::TritonJson::Value model_config_dims; - common::TritonJson::Value reshape; - if (input.Find("reshape", &reshape)) { - RETURN_IF_ERROR( - reshape.MemberAsArray("shape", &model_config_dims)); - } else { - RETURN_IF_ERROR( - input.MemberAsArray("dims", &model_config_dims)); - } - if (model_config_dims.ArraySize() != 0) { - RETURN_IF_ERROR( - dims.AppendInt(model_config_dims.ArraySize())); - } - break; - } - } - break; - } - default: - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string( - "batch input type '" + batch_input.BatchInputKindString() + - "' is not supported")) - .c_str()); - } - } else { - // For most type 'dims' will be empty as the full shape - // of the batch input is [-1] which will be covered by - // batch dimension. 
- switch (batch_input.BatchInputKind()) { - case BatchInput::Kind::BATCH_ELEMENT_COUNT: - case BatchInput::Kind::BATCH_ACCUMULATED_ELEMENT_COUNT: - case BatchInput::Kind::BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO: - case BatchInput::Kind::BATCH_MAX_ELEMENT_COUNT_AS_SHAPE: - case BatchInput::Kind::BATCH_ITEM_SHAPE_FLATTEN: - break; - case BatchInput::Kind::BATCH_ITEM_SHAPE: { - triton::common::TritonJson::Value inputs; - RETURN_IF_ERROR(config.MemberAsArray("input", &inputs)); - for (size_t i = 0; i < inputs.ArraySize(); ++i) { - triton::common::TritonJson::Value input; - RETURN_IF_ERROR(inputs.IndexAsObject(i, &input)); - std::string input_name; - RETURN_IF_ERROR(input.MemberAsString("name", &input_name)); - if (input_name == batch_input.SourceInputs()[0]) { - // Get input config shape - common::TritonJson::Value model_config_dims; - common::TritonJson::Value reshape; - if (input.Find("reshape", &reshape)) { - RETURN_IF_ERROR( - reshape.MemberAsArray("shape", &model_config_dims)); - } else { - RETURN_IF_ERROR( - input.MemberAsArray("dims", &model_config_dims)); - } - if (model_config_dims.ArraySize() != 0) { - RETURN_IF_ERROR( - dims.AppendInt(model_config_dims.ArraySize())); - } - break; - } - } - break; - } - default: - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string( - "batch input type '" + batch_input.BatchInputKindString() + - "' is not supported")) - .c_str()); - } - } - int io_index = engine_->getBindingIndex(tensor_name.c_str()); - auto& io_binding_info = - io_binding_infos_[next_buffer_binding_set_][io_index]; - io_binding_info.SetName(tensor_name); - // Special handling hint for InitializeExecuteInputBinding() - io_binding_info.SetBatchInput(batch_input); - - std::string data_type_str("TYPE_"); - data_type_str.append(TRITONSERVER_DataTypeString(tensor_datatype)); - RETURN_IF_ERROR(InitializeExecuteInputBinding( - tensor_name, data_type_str, dims, false)); - - - BackendMemory* bm; - if (io_binding_info.GetMemoryType() != TRITONSERVER_MEMORY_GPU) { - // zero-copy is used so the input buffer is direct-writable - BackendMemory::Create( - model_state_->TritonMemoryManager(), - BackendMemory::AllocationType::CPU_PINNED_POOL, - 0 /* memory_type_id */, io_binding_info.GetBuffer(), - io_binding_info.GetByteSize(), &bm); - } else { - BackendMemory::Create( - model_state_->TritonMemoryManager(), - {BackendMemory::AllocationType::CPU_PINNED_POOL}, - 0 /* memory_type_id */, io_binding_info.GetByteSize(), &bm); - } - io_binding_info.GetBatchInput()->second.reset(bm); - } - } - - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::InitializeBatchOutputBindings( - common::TritonJson::Value& config) -{ - std::vector batch_outputs; - BatchOutput::ParseFromModelConfig(config, &batch_outputs); - for (const auto& io : batch_outputs) { - for (const auto& name : io.TargetNames()) { - // FIXME Currently not handling the case that batch output is - // shape tensor - int io_index = engine_->getBindingIndex(name.c_str()); - auto& io_binding_info = - io_binding_infos_[next_buffer_binding_set_][io_index]; - io_binding_info.SetName(name); - if (engine_->isShapeBinding(io_index)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string( - "batch output '" + name + "' can not be shape binding")) - .c_str()); - } - - // Whether the output needs to be scattered based on input - if (io.BatchOutputKind() != - BatchOutput::Kind::BATCH_SCATTER_WITH_INPUT_SHAPE) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("batch 
output kind other than " - "BATCH_SCATTER_WITH_INPUT_SHAPE is not " - "supported for ") + - name) - .c_str()); - } - // Set hints to for InitializeBatchOutputBindings() - io_binding_info.SetIsBufferRagged(true); - io_binding_info.SetIoShapeMapping( - std::make_pair(io.SourceInputs()[0], std::vector())); - } - } - - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::InitializeConfigShapeOutputBindings( - common::TritonJson::Value& config_output) -{ - for (size_t i = 0; i < config_output.ArraySize(); i++) { - triton::common::TritonJson::Value io; - RETURN_IF_ERROR(config_output.IndexAsObject(i, &io)); - - // the maximum byte sizes across all profiles - int64_t max_byte_size = 0; - - std::string io_name; - RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); - - bool is_shape_tensor = false; - RETURN_IF_ERROR(io.MemberAsBool("is_shape_tensor", &is_shape_tensor)); - // Skip if this output is not a shape tensor - if (!is_shape_tensor) { - continue; - } - - int io_index = engine_->getBindingIndex(io_name.c_str()); - auto& io_binding_info = - io_binding_infos_[next_buffer_binding_set_][io_index]; - io_binding_info.SetName(io_name); - for (auto& trt_context : trt_contexts_) { - auto& profile_index = trt_context.first; - auto& context = trt_context.second; - int binding_index = num_expected_bindings_ * profile_index + io_index; - if (binding_index < 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_NOT_FOUND, - (std::string("output '") + io_name + "' not found for " + Name()) - .c_str()); - } - - if (io_binding_info.IsBufferAllocated()) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("output '") + io_name + - "' has already appeared as an input or output for " + Name()) - .c_str()); - } - - if (engine_->bindingIsInput(binding_index)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("output '") + io_name + - "' is expected to be an output in model for " + Name()) - .c_str()); - } - - std::string io_data_type; - RETURN_IF_ERROR(io.MemberAsString("data_type", &io_data_type)); - - if (io_data_type.compare("TYPE_INT32") != 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("unexpected datatype '") + io_data_type + - " in model configuration for shape output '" + io_name + - "', expecting TYPE_INT32 for " + Name()) - .c_str()); - } - - TRITONSERVER_DataType dt = - ConvertTrtTypeToDataType(engine_->getBindingDataType(binding_index)); - TRITONSERVER_DataType config_dt = - ModelConfigDataTypeToTritonServerDataType(io_data_type); - if ((dt == TRITONSERVER_TYPE_INVALID) || (dt != config_dt)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("unexpected datatype TYPE_") + - TRITONSERVER_DataTypeString(dt) + " for inference output '" + - io_name + "', expecting TYPE_" + - TRITONSERVER_DataTypeString(config_dt) + " for " + Name()) - .c_str()); - } - - // [DLIS-4283] Note that the initialize-binding functions are - // configuring the binding infos, may be group in separate class? 
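For readers unfamiliar with the flat binding layout these functions rely on: a TensorRT 8 engine reports total bindings = optimization profiles x bindings-per-profile, so the per-profile binding index is derived with the same formula that appears throughout the code above. A small illustrative sketch of that arithmetic; the function names are hypothetical.

#include <cassert>

// Sketch of the TRT 8 binding layout: one flat list of bindings, repeated
// once per optimization profile.
int BindingsPerProfile(int total_bindings, int total_profiles)
{
  return total_bindings / total_profiles;  // i.e. num_expected_bindings_
}

int ProfileBindingIndex(
    int profile_index, int bindings_per_profile, int io_index)
{
  // Same formula as 'num_expected_bindings_ * profile_index + io_index'.
  return profile_index * bindings_per_profile + io_index;
}

int main()
{
  // e.g. an engine with 4 I/O tensors and 3 optimization profiles
  const int per_profile = BindingsPerProfile(12, 3);
  assert(per_profile == 4);
  assert(ProfileBindingIndex(2, per_profile, 1) == 9);
  return 0;
}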
- RETURN_IF_ERROR( - interface_->SetFormat(binding_index, &io_binding_info.GetFormat())); - - common::TritonJson::Value model_config_dims; - common::TritonJson::Value reshape; - if (io.Find("reshape", &reshape)) { - RETURN_IF_ERROR(reshape.MemberAsArray("shape", &model_config_dims)); - } else { - RETURN_IF_ERROR(io.MemberAsArray("dims", &model_config_dims)); - } - - nvinfer1::Dims engine_dims = engine_->getBindingDimensions(binding_index); - if (ContainsWildcard(engine_dims)) { - context.is_dynamic_per_binding_[io_index] = true; - io_binding_info.SetIsDynamicShapeOutput(true); - } - - RETURN_IF_ERROR(CompareShapeDimsSupported( - Name(), io_name, engine_dims, model_config_dims, support_batching_)); - - if (!io_binding_info.IsDynamicShapeOutput()) { - const nvinfer1::Dims output_dim = - context.context_->getBindingDimensions(binding_index); - std::vector dim_vec; - DimsToDimVec(output_dim, &dim_vec); - int64_t byte_size = GetByteSize(dt, dim_vec); - - max_byte_size = std::max(max_byte_size, byte_size); - } - } - - if (!io_binding_info.IsDynamicShapeOutput()) { - // [DLIS-4283] review below comment - // Allocate CUDA memory. Use cudaHostAlloc if zero copy - // supported. For static output tensors, we rely on - // buffer_bindings_ being non-nullptr to indicate that the - // buffer has been correctly initialized so even for zero-sized - // tensors always allocate something. - void* buffer = nullptr; - cudaError_t err = cudaSuccess; - // [DLIS-4283] by knowing that shape tensor should be on host, ideally - // should perform allocation based on getTensorLocation(), in such way - // the handling can be unified between shape tensor and execution tensor - err = cudaHostAlloc( - &buffer, std::max((int64_t)1, max_byte_size), cudaHostAllocDefault); - if (err != cudaSuccess) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to allocate memory for output '") + io_name + - "' for " + Name() + ": " + cudaGetErrorString(err)) - .c_str()); - } - - io_binding_info.SetByteSize(max_byte_size); - io_binding_info.SetBuffer(buffer); - io_binding_info.SetDeviceBuffer(buffer); - } else { - auto allocator = std::make_unique(zero_copy_support_); - io_binding_info.SetBuffer(std::move(allocator)); - } - io_binding_info.SetMemoryType(TRITONSERVER_MEMORY_CPU_PINNED); - io_binding_info.SetMemoryTypeId(0); - - // [DLIS-4283] below is v1 handling and v1 doesn't support shape tensor, - // can be removed. 
- // Set buffer bindings of all optimization profile since buffer - // is allocated - for (auto& trt_context : trt_contexts_) { - auto binding_index = - num_expected_bindings_ * trt_context.first + io_index; - buffer_bindings_[next_buffer_binding_set_][binding_index] = - io_binding_info.GetDeviceBuffer(); - if (!io_binding_info.IsDynamicShapeOutput()) { - // [DLIS-4283] revisit below, note that 'device_buffer_' is actually - // not on device for shape tensor, the name can be misleading, perhaps - // something like 'trt_enqueue_buffer' - RETURN_ERROR_IF_FALSE( - trt_context.second.context_->setTensorAddress( - io_name.c_str(), io_binding_info.GetDeviceBuffer()), - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("'") + io_name + - "' does not map to an input or output tensor")); - } - } - } - - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::InitializeConfigExecuteOutputBindings( - common::TritonJson::Value& config_output) -{ - for (size_t i = 0; i < config_output.ArraySize(); i++) { - triton::common::TritonJson::Value io; - RETURN_IF_ERROR(config_output.IndexAsObject(i, &io)); - - std::string io_name; - RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); - - std::string io_datatype; - RETURN_IF_ERROR(io.MemberAsString("data_type", &io_datatype)); - - bool is_shape_tensor = false; - RETURN_IF_ERROR(io.MemberAsBool("is_shape_tensor", &is_shape_tensor)); - - common::TritonJson::Value model_config_dims; - common::TritonJson::Value reshape; - if (io.Find("reshape", &reshape)) { - RETURN_IF_ERROR(reshape.MemberAsArray("shape", &model_config_dims)); - } else { - RETURN_IF_ERROR(io.MemberAsArray("dims", &model_config_dims)); - } - - // Skip if the output is specified to be a shape tensor - if (is_shape_tensor) { - continue; - } - - RETURN_IF_ERROR(InitializeExecuteOutputBinding( - io_name, io_datatype, model_config_dims)); - } - - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::InitializeExecuteInputBinding( - const std::string& input_name, const std::string& input_datatype, - common::TritonJson::Value& input_dims, const bool is_control, - const bool is_ragged, const bool is_state) -{ - // the maximum byte sizes across all profiles - int64_t max_byte_size = 0; - int io_index = engine_->getBindingIndex(input_name.c_str()); - auto& io_binding_info = io_binding_infos_[next_buffer_binding_set_][io_index]; - io_binding_info.SetName(input_name); - - if (io_binding_info.IsBufferAllocated() && is_state) { - // The input bindings for the given state input is already allocated, - // hence, no need to proceed further. 
- return nullptr; - } - - for (auto& trt_context : trt_contexts_) { - auto& profile_index = trt_context.first; - auto& context = trt_context.second; - int binding_index = num_expected_bindings_ * profile_index + io_index; - if (io_index < 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_NOT_FOUND, - (std::string("input '") + input_name + "' not found for " + Name()) - .c_str()); - } - - // Skip if shape binding is encountered - if (engine_->isShapeBinding(binding_index)) { - return nullptr; - } - - if (io_binding_info.IsBufferAllocated()) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("input '") + input_name + - "' has already appeared as an input or output for " + Name()) - .c_str()); - } - - - if (!engine_->bindingIsInput(binding_index)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("input '") + input_name + - "' is expected to be an input in model for " + Name()) - .c_str()); - } - - TRITONSERVER_DataType dt = - ConvertTrtTypeToDataType(engine_->getBindingDataType(binding_index)); - TRITONSERVER_DataType config_dt = - ModelConfigDataTypeToTritonServerDataType(input_datatype); - if ((dt == TRITONSERVER_TYPE_INVALID) || (dt != config_dt)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("unexpected datatype TYPE_") + - TRITONSERVER_DataTypeString(dt) + " for inference input '" + - input_name + "', expecting TYPE_" + - TRITONSERVER_DataTypeString(config_dt) + " for " + Name()) - .c_str()); - } - - RETURN_IF_ERROR( - interface_->SetFormat(binding_index, &io_binding_info.GetFormat())); - - // Detect whether dynamic or not - nvinfer1::Dims engine_dims = engine_->getBindingDimensions(binding_index); - if (ContainsWildcard(engine_dims)) { - context.is_dynamic_per_binding_[io_index] = true; - } - - if (!(is_control && context.is_dynamic_per_binding_[io_index])) { - if (!is_ragged) { - RETURN_IF_ERROR(CompareDimsSupported( - StateForModel()->Name(), input_name, engine_dims, input_dims, - support_batching_, (!engine_->hasImplicitBatchDimension()), - false /* compare_exact */)); - } else { - // For ragged input, the input will be concatenated and - // flatten, so expecting engine dims to be one dimensional. 
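As noted above, ragged inputs are flattened and concatenated across the batched requests, so the engine-side shape collapses to a single dimension whose extent is the summed element count. A small sketch of how that extent relates to the per-request shapes; RaggedBatchExtent is a hypothetical helper used only for illustration.

#include <cstdint>
#include <vector>

// Sketch: with allow_ragged_batch, each request's input is flattened and
// concatenated, so the engine sees a one-dimensional tensor whose extent is
// the total element count across the batched requests.
int64_t RaggedBatchExtent(
    const std::vector<std::vector<int64_t>>& request_shapes)
{
  int64_t total_elements = 0;
  for (const auto& shape : request_shapes) {
    int64_t count = 1;
    for (int64_t d : shape) {
      count *= d;
    }
    total_elements += count;
  }
  return total_elements;
}

// Example: two requests with shapes {3, 4} and {5, 4} batch to extent 32.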
- int64_t input_dims_0; - RETURN_IF_ERROR(input_dims.IndexAsInt(0, &input_dims_0)); - if ((engine_dims.nbDims != 1) || (engine_dims.d[0] != input_dims_0)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("model '") + model_state_->Name() + "', tensor '" + - input_name + - "': for the model to support ragged input, the engine " - "shape" - " is: " + - DimsDebugString(engine_dims) + - " while the model config shape is: " + - DimsJsonToString(input_dims)) - .c_str()); - } - } - } else { - TRITONSERVER_Error* err = - ValidateControlDimsDynamic(engine_dims, support_batching_); - if (err != nullptr) { - TRITONSERVER_Error* full_err = TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("unexpected shape ") + DimsDebugString(engine_dims) + - " for control input '" + input_name + "' for model " + - model_state_->Name() + ": " + TRITONSERVER_ErrorMessage(err)) - .c_str()); - TRITONSERVER_ErrorDelete(err); - return full_err; - } - } - - std::vector full_config_dim; - if (is_ragged) { - // if ragged, full shape is defined to be [-1] - full_config_dim = {-1}; - } else if (io_binding_info.GetBatchInput() != nullptr) { - // if batch input, "config shape" has been populated in 'input_dims', - // but if batching is supported, batch dimension doesn't necessary - // limited by config max batch size - RETURN_IF_ERROR(DimsJsonToDimVec(input_dims, &full_config_dim)); - if (model_state_->MaxBatchSize() > 0) { - full_config_dim.insert(full_config_dim.begin(), -1); - } - } else { - RETURN_IF_ERROR(DimsJsonToDimVec(input_dims, &full_config_dim)); - if (model_state_->MaxBatchSize() > 0) { - full_config_dim.insert( - full_config_dim.begin(), model_state_->MaxBatchSize()); - } - } - - std::vector maximum_dims; - RETURN_IF_ERROR(interface_->ConfigureInputDimensions( - &context, io_index, binding_index, full_config_dim, &maximum_dims)); - - int64_t byte_size = 0; - if (io_binding_info.GetFormat().is_linear_format_) { - byte_size = GetByteSize(dt, maximum_dims); - } else { - maximum_dims[io_binding_info.GetFormat().vectorized_dim_] += - (io_binding_info.GetFormat().components_per_element_ - - (maximum_dims[io_binding_info.GetFormat().vectorized_dim_] % - io_binding_info.GetFormat().components_per_element_)); - byte_size = GetByteSize(dt, maximum_dims); - } - - if (byte_size == -1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to calculate size for input '") + input_name + - "' for " + Name()) - .c_str()); - } - max_byte_size = std::max(max_byte_size, byte_size); - } - - // Allocate CUDA memory. Use cudaHostAlloc if zero copy supported. - // We rely on buffer_bindings_ being non-nullptr to indicate that - // the buffer has been correctly initialized so even for zero-sized - // tensors always allocate something. 
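The non-linear (vectorized) format branch above pads the vectorized dimension before computing the buffer size. A small sketch of that padding, using the same expression as the code; note that, as written, it adds a full component group even when the dimension is already a multiple of components_per_element. PaddedElementCount is a hypothetical helper for illustration only.

#include <cstdint>
#include <vector>

// Sketch: pad the vectorized dimension the same way the branch above does,
// then compute the element count. The byte size would be this count times
// the element size of the tensor datatype.
int64_t PaddedElementCount(
    std::vector<int64_t> dims, int vectorized_dim, int components_per_element)
{
  dims[vectorized_dim] +=
      components_per_element - (dims[vectorized_dim] % components_per_element);
  int64_t count = 1;
  for (int64_t d : dims) {
    count *= d;
  }
  return count;
}

// Example: dims {8, 3, 224, 224} with the second dimension vectorized in
// groups of 4 is padded to {8, 4, 224, 224} before the size is computed.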
- void* buffer = nullptr; - cudaError_t err = cudaSuccess; - if (zero_copy_support_) { - err = cudaHostAlloc( - &buffer, std::max((int64_t)1, max_byte_size), cudaHostAllocMapped); - } else { - err = cudaMalloc(&buffer, std::max((int64_t)1, max_byte_size)); - } - if (err != cudaSuccess) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to allocate memory for input '") + input_name + - "' for " + Name() + ": " + cudaGetErrorString(err)) - .c_str()); - } - err = cudaMemset((uint8_t*)buffer, 0, max_byte_size); - if (err != cudaSuccess) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to set memory for input '") + input_name + - "' for " + Name() + ": " + cudaGetErrorString(err)) - .c_str()); - } - - io_binding_info.SetByteSize(max_byte_size); - io_binding_info.SetBuffer(buffer); - io_binding_info.SetDeviceBuffer(buffer); - io_binding_info.SetIsBufferRagged(is_ragged); - if (zero_copy_support_) { - io_binding_info.SetMemoryType(TRITONSERVER_MEMORY_CPU_PINNED); - io_binding_info.SetMemoryTypeId(0); - err = cudaHostGetDevicePointer( - io_binding_info.GetDeviceBufferAddr(), io_binding_info.GetBuffer(), 0); - if (err != cudaSuccess) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to get mapped device address for input '") + - input_name + "' for " + Name() + ": " + cudaGetErrorString(err)) - .c_str()); - } - } else { - io_binding_info.SetMemoryType(TRITONSERVER_MEMORY_GPU); - io_binding_info.SetMemoryTypeId(DeviceId()); - } - if (io_binding_info.IsBufferRagged() && - !io_binding_info.GetFormat().is_linear_format_) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unexpected allow-ragged for non-linear input '") + - input_name + "' for " + Name()) - .c_str()); - } - - // Set buffer bindings of all optimization profile since buffer is - // allocated - for (auto& trt_context : trt_contexts_) { - auto binding_index = num_expected_bindings_ * trt_context.first + io_index; - buffer_bindings_[next_buffer_binding_set_][binding_index] = - io_binding_info.GetDeviceBuffer(); - RETURN_ERROR_IF_FALSE( - trt_context.second.context_->setTensorAddress( - input_name.c_str(), io_binding_info.GetDeviceBuffer()), - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("'") + input_name + - "' does not map to an input or output tensor")); - } - - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::InitializeExecuteOutputBinding( - const std::string& output_name, const std::string& output_datatype, - common::TritonJson::Value& output_dims, bool is_state) -{ - // the maximum byte sizes across all profiles - int64_t max_byte_size = 0; - - int io_index = engine_->getBindingIndex(output_name.c_str()); - - auto& io_binding_info = io_binding_infos_[next_buffer_binding_set_][io_index]; - io_binding_info.SetName(output_name); - - // State output is initialized before the requested output tensor. - if (is_state) { - io_binding_info.SetIsStateOutput(true); - } else { - io_binding_info.SetIsRequestedOutputTensor(true); - } - - if (io_binding_info.IsBufferAllocated() && is_state) { - // The input bindings for the given state input is already allocated, - // hence, no need to proceed further. - return nullptr; - } - - // Check whether the output shape is data-dependent. 
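The zero-copy branch above relies on mapped pinned host memory so that a single allocation is visible to both host and device. A minimal standalone sketch of that CUDA pattern, assuming the device supports mapped pinned memory (what the backend tracks as zero_copy_support_); this is illustrative code, not backend code.

#include <cuda_runtime.h>
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main()
{
  const int64_t byte_size = 0;  // even zero-sized tensors get a real buffer

  void* host_buffer = nullptr;
  cudaError_t err = cudaHostAlloc(
      &host_buffer, std::max<int64_t>(1, byte_size), cudaHostAllocMapped);
  if (err != cudaSuccess) {
    fprintf(stderr, "cudaHostAlloc failed: %s\n", cudaGetErrorString(err));
    return 1;
  }

  // The device-visible alias of the same allocation; this is what would be
  // handed to the binding table / setTensorAddress() as the device buffer.
  void* device_buffer = nullptr;
  err = cudaHostGetDevicePointer(&device_buffer, host_buffer, 0 /* flags */);
  if (err != cudaSuccess) {
    fprintf(stderr, "cudaHostGetDevicePointer failed: %s\n",
            cudaGetErrorString(err));
    return 1;
  }

  printf("host %p maps to device %p\n", host_buffer, device_buffer);
  cudaFreeHost(host_buffer);
  return 0;
}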
- for (auto& trt_context : trt_contexts_) { - if (ContainsWildcard( - trt_context.second.context_->getTensorShape(output_name.c_str()))) { - io_binding_info.SetIsDynamicShapeOutput(true); - break; - } - } - - for (auto& trt_context : trt_contexts_) { - auto& profile_index = trt_context.first; - auto& context = trt_context.second; - int binding_index = num_expected_bindings_ * profile_index + io_index; - - if (binding_index < 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_NOT_FOUND, - (std::string("output '") + output_name + "' not found for " + Name()) - .c_str()); - } - - if (engine_->bindingIsInput(binding_index)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("output '") + output_name + - "' is expected to be an input in model for " + Name()) - .c_str()); - } - - if (io_binding_info.IsBufferAllocated()) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("output '") + output_name + - "' has already appeared as an input or output for " + Name()) - .c_str()); - } - - nvinfer1::Dims engine_dims = engine_->getBindingDimensions(binding_index); - if (ContainsWildcard(engine_dims)) { - context.is_dynamic_per_binding_[io_index] = true; - } - - TRITONSERVER_DataType dt = - ConvertTrtTypeToDataType(engine_->getBindingDataType(binding_index)); - TRITONSERVER_DataType config_dt = - ModelConfigDataTypeToTritonServerDataType(output_datatype); - if ((dt == TRITONSERVER_TYPE_INVALID) || (dt != config_dt)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("unexpected datatype TYPE_") + - TRITONSERVER_DataTypeString(dt) + " for inference output '" + - output_name + "', expecting TYPE_" + - TRITONSERVER_DataTypeString(config_dt) + " for " + Name()) - .c_str()); - } - - RETURN_IF_ERROR( - interface_->SetFormat(binding_index, &io_binding_info.GetFormat())); - - // Skip 'batch_output' validation as it is not exact match to - // model dims - if (!io_binding_info.IsBufferRagged()) { - RETURN_IF_ERROR(CompareDimsSupported( - StateForModel()->Name(), output_name, engine_dims, output_dims, - support_batching_, (!engine_->hasImplicitBatchDimension()), - false /* compare_exact */)); - } - - if (io_binding_info.IsBufferRagged() && - !io_binding_info.GetFormat().is_linear_format_) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("unexpected allow-ragged for non-linear output '") + - output_name + "' for " + Name()) - .c_str()); - } - if (!io_binding_info.IsDynamicShapeOutput()) { - int64_t byte_size = interface_->GetFullByteSize( - context.context_.get(), output_name, binding_index); - if (byte_size == -1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to allocate memory for output '") + - output_name + "' for " + Name()) - .c_str()); - } - max_byte_size = std::max(max_byte_size, byte_size); - } - } - - cudaError_t err = cudaSuccess; - - if (!io_binding_info.IsDynamicShapeOutput()) { - // Allocate CUDA memory. Use cudaHostAlloc if zero copy supported. - // For static output tensors, we rely on buffer_bindings_ being - // non-nullptr to indicate that the buffer has been correctly initialized - // so even for zero-sized tensors always allocate something. 
- void* buffer = nullptr; - if (zero_copy_support_) { - err = cudaHostAlloc( - &buffer, std::max((int64_t)1, max_byte_size), cudaHostAllocMapped); - } else { - err = cudaMalloc(&buffer, std::max((int64_t)1, max_byte_size)); - } - - if (err != cudaSuccess) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to allocate memory for output '") + output_name + - "' for " + Name() + ": " + cudaGetErrorString(err)) - .c_str()); - } - io_binding_info.SetByteSize(max_byte_size); - io_binding_info.SetBuffer(buffer); - io_binding_info.SetDeviceBuffer(buffer); - } else { - auto allocator = std::make_unique(zero_copy_support_); - for (auto& trt_context : trt_contexts_) { - trt_context.second.context_->setOutputAllocator( - output_name.c_str(), allocator.get()); - } - io_binding_info.SetBuffer(std::move(allocator)); - } - - // Whether the output needs to be scattered based on input - if (io_binding_info.IsBufferRagged()) { - std::vector output_shape; - if (support_batching_) { - output_shape.push_back(-1); - } - for (size_t i = 0; i < output_dims.ArraySize(); i++) { - int64_t dim; - RETURN_IF_ERROR(output_dims.IndexAsInt(i, &dim)); - output_shape.push_back(dim); - } - io_binding_info.GetIoShapeMapping().second = output_shape; - } - if (zero_copy_support_) { - io_binding_info.SetMemoryType(TRITONSERVER_MEMORY_CPU_PINNED); - io_binding_info.SetMemoryTypeId(0); - if (!io_binding_info.IsDynamicShapeOutput()) { - err = cudaHostGetDevicePointer( - io_binding_info.GetDeviceBufferAddr(), io_binding_info.GetBuffer(), - 0); - if (err != cudaSuccess) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to get mapped device address for output '") + - output_name + " for " + Name() + ": " + cudaGetErrorString(err)) - .c_str()); - } - } - } else { - io_binding_info.SetMemoryType(TRITONSERVER_MEMORY_GPU); - io_binding_info.SetMemoryTypeId(DeviceId()); - } - - // Set buffer bindings of all optimization profile since buffer is - // allocated - for (auto& trt_context : trt_contexts_) { - auto binding_index = num_expected_bindings_ * trt_context.first + io_index; - buffer_bindings_[next_buffer_binding_set_][binding_index] = - io_binding_info.GetDeviceBuffer(); - if (!io_binding_info.IsDynamicShapeOutput()) { - RETURN_ERROR_IF_FALSE( - trt_context.second.context_->setTensorAddress( - output_name.c_str(), io_binding_info.GetDeviceBuffer()), - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("'") + output_name + - "' does not map to an input or output tensor")); - } - } - - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::InitializeShapeInputBinding( - const std::string& input_name, const TRITONSERVER_DataType input_datatype, - common::TritonJson::Value& model_config_dims) -{ - // the maximum byte sizes across all profiles - int64_t max_byte_size = 0; - int io_index = engine_->getBindingIndex(input_name.c_str()); - - auto& io_binding_info = io_binding_infos_[next_buffer_binding_set_][io_index]; - io_binding_info.SetName(input_name); - for (auto& trt_context : trt_contexts_) { - auto& profile_index = trt_context.first; - auto& context = trt_context.second; - int binding_index = num_expected_bindings_ * profile_index + io_index; - if (io_index < 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_NOT_FOUND, - (std::string("input '") + input_name + "' not found for " + Name()) - .c_str()); - } - - if (io_binding_info.IsBufferAllocated()) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("input '") + 
input_name + - "' has already appeared as an input or output for " + Name()) - .c_str()); - } - - if (!engine_->bindingIsInput(binding_index)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("input '") + input_name + - "' is expected to be an input in model for " + Name()) - .c_str()); - } - - // Skip if the binding is not a shape tensor - if (!engine_->isShapeBinding(binding_index)) { - return nullptr; - } - - if (input_datatype != TRITONSERVER_TYPE_INT32) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("unexpected datatype TYPE_") + - TRITONSERVER_DataTypeString(input_datatype) + - " in model configuration for shape input '" + input_name + - "', expecting TYPE_INT32 for " + Name()) - .c_str()); - } - - TRITONSERVER_DataType dt = - ConvertTrtTypeToDataType(engine_->getBindingDataType(binding_index)); - if ((dt == TRITONSERVER_TYPE_INVALID) || (dt != input_datatype)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("unexpected datatype TYPE_") + - TRITONSERVER_DataTypeString(dt) + " in engine for shape input '" + - input_name + "', expecting TYPE_" + - TRITONSERVER_DataTypeString(input_datatype) + " for " + Name()) - .c_str()); - } - - RETURN_IF_ERROR( - interface_->SetFormat(binding_index, &io_binding_info.GetFormat())); - - nvinfer1::Dims engine_dims = engine_->getBindingDimensions(binding_index); - if (ContainsWildcard(engine_dims)) { - context.is_dynamic_per_binding_[io_index] = true; - } - - RETURN_IF_ERROR(CompareShapeDimsSupported( - name_, input_name, engine_dims, model_config_dims, support_batching_)); - - if (!context.context_->setBindingDimensions( - binding_index, context.max_dims_[io_index])) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("trt failed to set binding dimension to ") + - DimsDebugString(context.max_dims_[io_index]) + " for input '" + - input_name + "' for " + Name()) - .c_str()); - } - - context.nb_shape_values_ = (context.max_dims_[io_index].nbDims == 0) - ? 1 - : context.max_dims_[io_index].d[0]; - context.max_shapes_[io_index] = engine_->getProfileShapeValues( - binding_index, profile_index, nvinfer1::OptProfileSelector::kMAX); - context.min_shapes_[io_index] = engine_->getProfileShapeValues( - binding_index, profile_index, nvinfer1::OptProfileSelector::kMIN); - context.opt_shapes_[io_index] = engine_->getProfileShapeValues( - binding_index, profile_index, nvinfer1::OptProfileSelector::kOPT); - - // Set shape tensor address to buffer that contains max allowed value so - // later shape inference will return max output shape / size for - // pre-allocation. - if (!context.context_->setInputTensorAddress( - input_name.c_str(), context.max_shapes_[io_index])) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("trt failed to set the input shape binding for '") + - input_name + "' for " + Name()) - .c_str()); - } - // [FIXME] setInputShapeBinding() is deprecated and - // setInputTensorAddress() above alone should be sufficient. 
There is bug - // in TRT that getMaxOutputSize() won't work without - // setInputShapeBinding() for shape tensor, ETA fix in 8.5.2 - if (!context.context_->setInputShapeBinding( - binding_index, context.max_shapes_[io_index])) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("trt failed to set the input shape binding for '") + - input_name + "' for " + Name()) - .c_str()); - } - - if (engine_->isExecutionBinding(binding_index)) { - int64_t byte_size = 0; - if (io_binding_info.GetFormat().is_linear_format_) { - std::vector dim_vec; - DimsToDimVec( - context.context_->getBindingDimensions(binding_index), &dim_vec); - byte_size = GetByteSize(dt, dim_vec); - } else { - auto component_count = - GetElementCount(context.context_->getStrides(binding_index)); - component_count *= - engine_->getBindingComponentsPerElement(binding_index); - byte_size = component_count * - engine_->getBindingBytesPerComponent(binding_index); - } - max_byte_size = std::max(max_byte_size, byte_size); - } - } - - if (max_byte_size != 0) { - // Allocate CUDA memory. Use cudaHostAlloc if zero copy supported. - // We rely on buffer_bindings_ being non-nullptr to indicate that - // the buffer has been correctly initialized so even for zero-sized - // tensors always allocate something. - void* buffer = nullptr; - cudaError_t err = cudaSuccess; - // [DLIS-4283] by knowing that shape tensor should be on host, ideally - // should perform allocation based on getTensorLocation(), in such way - // the handling can be unified between shape tensor and execution tensor - auto cuda_flag = - zero_copy_support_ ? cudaHostAllocMapped : cudaHostAllocDefault; - err = - cudaHostAlloc(&buffer, std::max((int64_t)1, max_byte_size), cuda_flag); - if (err != cudaSuccess) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to allocate memory for input '") + input_name + - "' for " + Name() + ": " + cudaGetErrorString(err)) - .c_str()); - } - - io_binding_info.SetByteSize(max_byte_size); - io_binding_info.SetBuffer(buffer); - io_binding_info.SetDeviceBuffer(buffer); - io_binding_info.SetMemoryType(TRITONSERVER_MEMORY_CPU_PINNED); - io_binding_info.SetMemoryTypeId(0); - - // Different from other tensors that setTensorAdress() will be set to the - // allocated buffer at the end, we keep it to be the 'max_shapes_' set - // above because the buffer content will affect the output shape - // inference. We rely on Enqueue() function to make sure the correct - // buffer is set during runtime. - - // [DLIS-4283] below is v1 handling and v1 doesn't support shape tensor, - // can be removed. 
- // Set buffer bindings of all optimization profile since buffer is - // allocated - for (auto& trt_context : trt_contexts_) { - auto binding_index = - num_expected_bindings_ * trt_context.first + io_index; - buffer_bindings_[next_buffer_binding_set_][binding_index] = - io_binding_info.GetDeviceBuffer(); - } - } - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::GetProfileDimensions( - const int io_index, const int profile_index, TensorRTContext* context) -{ - int binding_index = (profile_index * num_expected_bindings_) + io_index; - context->max_dims_[io_index] = engine_->getProfileDimensions( - binding_index, profile_index, nvinfer1::OptProfileSelector::kMAX); - context->min_dims_[io_index] = engine_->getProfileDimensions( - binding_index, profile_index, nvinfer1::OptProfileSelector::kMIN); - context->opt_dims_[io_index] = engine_->getProfileDimensions( - binding_index, profile_index, nvinfer1::OptProfileSelector::kOPT); - return nullptr; -} - -void -ModelInstanceState::GetConfiguredProfiles(std::string* profiles_desc) -{ - profiles_desc->clear(); - for (const auto& trt_context : trt_contexts_) { - (*profiles_desc) += - (" " + trt_context.second.profile_name_ + "[" + - std::to_string(trt_context.first) + "];"); - } -} - -void -ModelInstanceState::FindClosestCudaGraph( - const TensorRTContext& trt_context, - const std::vector& cuda_graph_key, - const TensorRTContext::CudaGraph** cuda_graph, bool* found_exact) -{ - *cuda_graph = nullptr; - auto itr = - trt_context.cuda_graph_execs_[next_set_].lower_bound(cuda_graph_key); - if (itr != trt_context.cuda_graph_execs_[next_set_].end()) { - *found_exact = (itr->first == cuda_graph_key); - if (*found_exact) { - *cuda_graph = &itr->second; - return; - } else if (allow_inexact_match_) { - // For vector as key, returned lower bound may not satisfy - // requirements that all dims must be >= actual dims - for (; itr != trt_context.cuda_graph_execs_[next_set_].end(); itr++) { - bool found = true; - for (size_t key_idx = 0; key_idx < cuda_graph_key.size(); key_idx++) { - if ((cuda_graph_key[key_idx] > itr->first[key_idx]) || - (cuda_graph_key[key_idx] < - itr->second.lower_bound_key_[key_idx])) { - found = false; - break; - } - } - if (found) { - *cuda_graph = &itr->second; - return; - } - } - } - } - return; -} - -// CUDA 10.1 starts to support CUDA graphs. -#ifdef TRITON_ENABLE_CUDA_GRAPH -TRITONSERVER_Error* -ModelInstanceState::InitializeCudaGraph() -{ - std::vector graph_specs; - RETURN_IF_ERROR(InitializeGraphSpecs(&graph_specs, &allow_inexact_match_)); - - // CUDA graph will be captured for every TRT contexts as CUDA graph - // is merely capturing GPU activities for a given execution. - for (auto& graph_spec : graph_specs) { - for (auto& trt_context : trt_contexts_) { - graph_spec.captured_ = - interface_->BuildCudaGraph(&(trt_context.second), graph_spec); - } - } - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::InitializeGraphSpecs( - std::vector* graph_specs, bool* allow_inexact_match) -{ - *allow_inexact_match = false; - graph_specs->clear(); - - if (model_state_->GraphSpecs().ArraySize() == 0) { - // No graph spec is provided, use default specs - // Graphs are most likely to help for small batch sizes so by - // default build for batch sizes 1, 2, 3, 4, 6, 8, 12, 16, - // 'max_batch_size'. If preferred batch size is specified, then - // the batch sizes will be 1, preferred batch sizes, - // 'max_batch_size'. 
- std::set cuda_graph_batch_sizes; - if (model_state_->MaxBatchSize() == 0) { - cuda_graph_batch_sizes = {0}; - } else { - cuda_graph_batch_sizes = {1}; - } - - common::TritonJson::Value dynamic_batching; - common::TritonJson::Value sequence_batching; - common::TritonJson::Value oldest; - if (model_state_->ModelConfig().Find( - "dynamic_batching", &dynamic_batching)) { - common::TritonJson::Value pbs; - RETURN_IF_ERROR( - dynamic_batching.MemberAsArray("preferred_batch_size", &pbs)); - for (size_t i = 0; i < pbs.ArraySize(); i++) { - int64_t bs; - RETURN_IF_ERROR(pbs.IndexAsInt(i, &bs)); - cuda_graph_batch_sizes.emplace(bs); - } - } else if ( - model_state_->ModelConfig().Find( - "sequence_batching", &sequence_batching) && - sequence_batching.Find("oldest", &oldest)) { - common::TritonJson::Value pbs; - RETURN_IF_ERROR(oldest.MemberAsArray("preferred_batch_size", &pbs)); - for (size_t i = 0; i < pbs.ArraySize(); i++) { - int64_t bs; - RETURN_IF_ERROR(pbs.IndexAsInt(i, &bs)); - cuda_graph_batch_sizes.emplace(bs); - } - } else { - cuda_graph_batch_sizes = {1, 2, 3, 4, 6, 8, 12, 16}; - if (model_state_->MaxBatchSize() == 0) { - cuda_graph_batch_sizes.emplace(0); - } - } - if (model_state_->MaxBatchSize() > 0) { - cuda_graph_batch_sizes.emplace(model_state_->MaxBatchSize()); - } - - for (const auto bs : cuda_graph_batch_sizes) { - if (bs <= model_state_->MaxBatchSize()) { - graph_specs->emplace_back(); - graph_specs->back().batch_size_ = bs; - graph_specs->back().lower_bound_batch_size_ = bs; - } - } - } else { - for (size_t i = 0; i < model_state_->GraphSpecs().ArraySize(); i++) { - common::TritonJson::Value config_spec; - RETURN_IF_ERROR( - model_state_->GraphSpecs().IndexAsObject(i, &config_spec)); - graph_specs->emplace_back(); - auto& graph_spec = graph_specs->back(); - RETURN_IF_ERROR( - config_spec.MemberAsInt("batch_size", &graph_spec.batch_size_)); - common::TritonJson::Value inputs; - if (config_spec.Find("input", &inputs)) { - std::vector input_names; - RETURN_IF_ERROR(inputs.Members(&input_names)); - for (const auto& input_name : input_names) { - common::TritonJson::Value input; - RETURN_IF_ERROR(inputs.MemberAsObject(input_name.c_str(), &input)); - std::vector input_shape; - common::TritonJson::Value dims; - RETURN_IF_ERROR(input.MemberAsArray("dim", &dims)); - for (size_t i = 0; i < dims.ArraySize(); i++) { - std::string dim; - RETURN_IF_ERROR(dims.IndexAsString(i, &dim)); - input_shape.push_back(std::stoi(dim)); - } - graph_spec.shapes_[input_name] = std::move(input_shape); - } - } - - common::TritonJson::Value lower_bound_spec; - if (config_spec.Find("graph_lower_bound", &lower_bound_spec)) { - *allow_inexact_match = true; - RETURN_IF_ERROR(lower_bound_spec.MemberAsInt( - "batch_size", &graph_spec.lower_bound_batch_size_)); - common::TritonJson::Value inputs; - if (lower_bound_spec.Find("input", &inputs)) { - std::vector input_names; - RETURN_IF_ERROR(inputs.Members(&input_names)); - for (const auto& input_name : input_names) { - common::TritonJson::Value input; - RETURN_IF_ERROR(inputs.MemberAsObject(input_name.c_str(), &input)); - std::vector input_shape; - common::TritonJson::Value dims; - RETURN_IF_ERROR(input.MemberAsArray("dim", &dims)); - for (size_t i = 0; i < dims.ArraySize(); i++) { - std::string dim; - RETURN_IF_ERROR(dims.IndexAsString(i, &dim)); - input_shape.push_back(std::stoi(dim)); - } - graph_spec.lower_bound_shapes_[input_name] = std::move(input_shape); - } - } - } else { - graph_spec.lower_bound_batch_size_ = graph_spec.batch_size_; - 
graph_spec.lower_bound_shapes_ = graph_spec.shapes_; - } - } - } - for (const auto& graph_spec : *graph_specs) { - RETURN_IF_ERROR(ValidateGraphSpec(graph_spec)); - } - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::ValidateGraphSpec(const GraphSpec& graph_spec) -{ - if (model_state_->MaxBatchSize() == 0) { - if ((graph_spec.batch_size_ != 0) || - (graph_spec.lower_bound_batch_size_ != 0)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - "graph spec expects 'batch_size' to be 0 if " - "'max_batch_size' is 0"); - } - } else if ( - ((graph_spec.batch_size_ > model_state_->MaxBatchSize()) || - (graph_spec.batch_size_ < 1)) || - ((graph_spec.lower_bound_batch_size_ > model_state_->MaxBatchSize()) || - (graph_spec.lower_bound_batch_size_ < 1))) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("graph spec expects 'batch_size' to be >= 1 and <= ") + - std::to_string(model_state_->MaxBatchSize())) - .c_str()); - } - if (graph_spec.lower_bound_batch_size_ > graph_spec.batch_size_) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - "graph lower bound spec expects 'batch_size' to be <= graph " - "spec " - "'batch_size'"); - } - for (const auto& input : graph_spec.shapes_) { - const auto lit = graph_spec.lower_bound_shapes_.find(input.first); - if (lit == graph_spec.lower_bound_shapes_.end()) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("graph lower bound spec expects shape for input '") + - input.first + "'") - .c_str()); - } else { - if (lit->second.size() != input.second.size()) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("graph lower bound spec expects to have '") + - std::to_string(input.second.size()) + " dimensions, got " + - std::to_string(lit->second.size())) - .c_str()); - } - for (size_t idx = 0; idx < input.second.size(); idx++) { - if ((lit->second[idx] < 0) || (input.second[idx] < 0)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("graph spec expects input '") + input.first + - "' to have dimension >= 0") - .c_str()); - } - if (lit->second[idx] > input.second[idx]) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("graph lower bound spec expects input '") + - input.first + - "' to have dimension <= " + std::to_string(input.second[idx])) - .c_str()); - } - } - } - } - return nullptr; -} - -bool -TRTv1Interface::BuildCudaGraph( - TensorRTContext* trt_context, const GraphSpec& graph_spec) -{ - // 1 is special case as non-batching model has 'max_batch_size == 0' - int batch_size = (graph_spec.batch_size_ == 0) ? 1 : graph_spec.batch_size_; - std::vector cuda_graph_key{batch_size}; - auto cuda_graph = TensorRTContext::CudaGraph(); - int lower_bound_batch_size = (graph_spec.lower_bound_batch_size_ == 0) - ? 
1 - : graph_spec.lower_bound_batch_size_; - cuda_graph.lower_bound_key_ = {lower_bound_batch_size}; - for (int io_index = 0; io_index < instance_->num_expected_bindings_; - ++io_index) { - // FIXME handle shape tensor properly, for now if model uses shape - // tensor then cuda graph is not captured - if (instance_->engine_->isShapeBinding(io_index)) { - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - (std::string("Detected shape tensor, CUDA graph is not " - "captured for ") + - instance_->Name()) - .c_str()); - return false; - } - } - - // Enqueue to TRT to setup resources properly BEFORE capturing CUDA - // graph - for (int s = 0; s < instance_->num_copy_streams_; s++) { - if (!trt_context->context_->enqueue( - batch_size, instance_->buffer_bindings_[s].data(), - instance_->CudaStream(), nullptr)) { - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - (std::string("unable to record CUDA graph for ") + instance_->Name()) - .c_str()); - return false; - } - cudaStreamSynchronize(instance_->CudaStream()); - } - - bool captured = true; - for (int set_idx = 0; set_idx < EVENT_SET_COUNT; set_idx++) { - // The same spec has been captured - if (trt_context->cuda_graph_execs_[set_idx].find(cuda_graph_key) != - trt_context->cuda_graph_execs_[set_idx].end()) { - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - (std::string("Detected duplicated CUDA graph specification for ") + - instance_->Name() + ", skipping the duplicated specification") - .c_str()); - - return true; - } - // Use second set of buffers to capture cuda graph if - // double-buffering - auto buffer_binding_index = set_idx % instance_->num_copy_streams_; - cudaGraph_t graph; - // Using cudaStreamCaptureModeThreadLocal mode to confine the graph - // capture to this thread and avoid interference from other potentially - // unsafe cuda calls. - auto cuerr = cudaStreamBeginCapture( - instance_->CudaStream(), cudaStreamCaptureModeThreadLocal); - if (cuerr != cudaSuccess) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - (std::string("unable to start CUDA graph for ") + instance_->Name() + - ": " + cudaGetErrorString(cuerr)) - .c_str()); - captured = false; - } else { - auto context = trt_context->context_; - if (!context->enqueue( - batch_size, - instance_->buffer_bindings_[buffer_binding_index].data(), - instance_->CudaStream(), - &instance_->events_[set_idx].ready_for_input_)) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - (std::string("unable to record CUDA graph for ") + - instance_->Name()) - .c_str()); - captured = false; - } - - cuerr = cudaStreamEndCapture(instance_->CudaStream(), &graph); - if (captured == false) { - if (cuerr != cudaErrorStreamCaptureInvalidated) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - (std::string("stream capture is not invalidated for ") + - instance_->Name() + ": " + cudaGetErrorString(cuerr)) - .c_str()); - } - // There has been an error during graph capture. Below call resets the - // sticky error from the cuda runtime. - cudaGetLastError(); - // Verify if the error has been cleared successfully. 
- auto cuerr2 = cudaGetLastError(); - if (cuerr2 != cudaSuccess) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - (std::string("unable to clear cuda runtime error for ") + - instance_->Name() + ": " + cudaGetErrorString(cuerr2)) - .c_str()); - } - } - if (cuerr != cudaSuccess) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - (std::string("unable to finish CUDA graph for ") + - instance_->Name() + ": " + cudaGetErrorString(cuerr)) - .c_str()); - captured = false; - } - - if (captured) { - cudaGraphExec_t graph_exec; -#if CUDART_VERSION >= 12000 - cuerr = cudaGraphInstantiate(&graph_exec, graph, 0); -#else - cuerr = cudaGraphInstantiate(&graph_exec, graph, NULL, NULL, 0); -#endif - if (cuerr != cudaSuccess) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - (std::string("unable to instantiate CUDA graph for ") + - instance_->Name() + ": " + cudaGetErrorString(cuerr)) - .c_str()); - captured = false; - } else { - cuda_graph.cuda_graph_exec_ = graph_exec; - - trt_context->cuda_graph_execs_[set_idx].insert( - std::make_pair(cuda_graph_key, cuda_graph)); - } - cuerr = cudaGraphDestroy(graph); - if (cuerr != cudaSuccess) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - (std::string("unable to destroy graph for ") + instance_->Name() + - ": " + cudaGetErrorString(cuerr)) - .c_str()); - captured = false; - } - } - } - } - - if (captured) { - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("captured CUDA graph for ") + instance_->Name() + - ", batch size " + std::to_string(batch_size)) - .c_str()); - } - - return captured; -} - -bool -TRTv3Interface::BuildCudaGraph( - TensorRTContext* trt_context, const GraphSpec& graph_spec) -{ - // FIXME handle shape tensor properly, for now if model uses shape - // tensor then cuda graph is not captured - for (int i = 0; i < instance_->num_expected_bindings_; ++i) { - if (instance_->engine_->isShapeBinding(i)) { - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - (std::string("Detected shape tensor, CUDA graph is not " - "captured for") + - instance_->Name()) - .c_str()); - return false; - } - } - - std::vector cuda_graph_key; - auto cuda_graph = TensorRTContext::CudaGraph(); - auto err = - SetCudaGraphShape(trt_context, graph_spec, &cuda_graph_key, &cuda_graph); - if (err != nullptr) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - (std::string("Failed to set cuda graph shape for ") + - instance_->Name() + TRITONSERVER_ErrorMessage(err)) - .c_str()); - TRITONSERVER_ErrorDelete(err); - return false; - } - - // [DLIS-4283] keeping current event / buffer idx for later recovery, - // however do we need to restore them? - int curr_buffer_binding_set = instance_->next_buffer_binding_set_; - size_t curr_event_set_idx = instance_->next_set_; - - // Enqueue to TRT to setup resources properly BEFORE capturing CUDA - // graph, so there hasn't been association with input consumed event yet. - // [DLIS-4283] shouldn't need to repeat for all sets of bindings, reason - // being that the resource setup is more likely to be related to the - // execution context and not the binding buffers - for (int s = 0; s < instance_->num_copy_streams_; s++) { - // Use helper function to setTensorAddress if different buffer binding - // should be used. 
- instance_->next_buffer_binding_set_ = s; - if (!Enqueue(trt_context->context_.get())) { - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - (std::string("unable to record CUDA graph for ") + instance_->Name()) - .c_str()); - return false; - } - cudaStreamSynchronize(instance_->CudaStream()); - } - - bool captured = true; - - for (int set_idx = 0; set_idx < EVENT_SET_COUNT; set_idx++) { - cudaGraph_t graph; - instance_->next_buffer_binding_set_ = - set_idx % instance_->num_copy_streams_; - instance_->next_set_ = set_idx; - // Using cudaStreamCaptureModeThreadLocal mode to confine the graph - // capture to this thread and avoid interference from other potentially - // unsafe cuda calls. - auto cuerr = cudaStreamBeginCapture( - instance_->CudaStream(), cudaStreamCaptureModeThreadLocal); - if (cuerr != cudaSuccess) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - (std::string("unable to start CUDA graph for ") + instance_->Name() + - ": " + cudaGetErrorString(cuerr)) - .c_str()); - captured = false; - } else { - auto context = trt_context->context_; - if (!Enqueue(context.get())) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - (std::string("unable to record CUDA graph for ") + - instance_->Name()) - .c_str()); - captured = false; - } - - cuerr = cudaStreamEndCapture(instance_->CudaStream(), &graph); - if (captured == false) { - if (cuerr != cudaErrorStreamCaptureInvalidated) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - (std::string("stream capture is not invalidated for ") + - instance_->Name() + ": " + cudaGetErrorString(cuerr)) - .c_str()); - } - // There has been an error during graph capture. Below call resets the - // sticky error from the cuda runtime. - cudaGetLastError(); - // Verify if the error has been cleared successfully. - auto cuerr2 = cudaGetLastError(); - if (cuerr2 != cudaSuccess) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - (std::string("unable to clear cuda runtime error for ") + - instance_->Name() + ": " + cudaGetErrorString(cuerr2)) - .c_str()); - } - } - if (cuerr != cudaSuccess) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - (std::string("unable to finish CUDA graph for ") + - instance_->Name() + ": " + cudaGetErrorString(cuerr)) - .c_str()); - captured = false; - } - - if (captured) { - cudaGraphExec_t graph_exec; -#if CUDART_VERSION >= 12000 - cuerr = cudaGraphInstantiate(&graph_exec, graph, 0); -#else - cuerr = cudaGraphInstantiate(&graph_exec, graph, NULL, NULL, 0); -#endif - if (cuerr != cudaSuccess) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - (std::string("unable to instantiate CUDA graph for ") + - instance_->Name() + ": " + cudaGetErrorString(cuerr)) - .c_str()); - captured = false; - } else { - cuda_graph.cuda_graph_exec_ = graph_exec; - trt_context->cuda_graph_execs_[set_idx].insert( - std::make_pair(cuda_graph_key, cuda_graph)); - } - cuerr = cudaGraphDestroy(graph); - if (cuerr != cudaSuccess) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - (std::string("unable to destroy graph for ") + instance_->Name() + - ": " + cudaGetErrorString(cuerr)) - .c_str()); - captured = false; - } - } - } - } - instance_->next_buffer_binding_set_ = curr_buffer_binding_set; - instance_->next_set_ = curr_event_set_idx; - - if (captured) { - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("captured CUDA graph for ") + instance_->Name() + - ", batch size " + std::to_string(graph_spec.batch_size_)) - .c_str()); - } - - return captured; -} - -TRITONSERVER_Error* -TRTv3Interface::SetCudaGraphShape( - TensorRTContext* trt_context, const GraphSpec& graph_spec, - std::vector* cuda_graph_key, - 
TensorRTContext::CudaGraph* cuda_graph) -{ - int batch_size = graph_spec.batch_size_; - int binding_offset = - trt_context->profile_idx_ * instance_->num_expected_bindings_; - *cuda_graph_key = std::vector{batch_size}; - auto& lower_bound_key = cuda_graph->lower_bound_key_; - lower_bound_key.push_back( - (graph_spec.lower_bound_batch_size_ == 0) - ? 1 - : graph_spec.lower_bound_batch_size_); - for (int io_index = 0; io_index < instance_->num_expected_bindings_; - io_index++) { - auto& io_binding_info = instance_->io_binding_infos_[0][io_index]; - auto binding_index = binding_offset + io_index; - if (!instance_->engine_->bindingIsInput(binding_index)) { - continue; - } - // Empty shapes indicates the graph spec is added by default, - // for default graph spec, opt dims are used. - if (graph_spec.shapes_.empty()) { - auto shape = trt_context->opt_dims_[io_index]; - if (batch_size != 0) { - shape.d[0] = batch_size; - } - if (!trt_context->context_->setBindingDimensions(binding_index, shape)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("trt failed to set binding dimension to ") + - DimsDebugString(shape) + " for binding " + - std::to_string(binding_index) + " for " + instance_->Name()) - .c_str()); - } - std::vector dims; - DimsToDimVec(shape, &dims); - cuda_graph->input_dims_.emplace_back(dims); - cuda_graph_key->insert(cuda_graph_key->end(), dims.begin(), dims.end()); - lower_bound_key.insert(lower_bound_key.end(), dims.begin(), dims.end()); - } else { - const std::string& name = instance_->engine_->getBindingName(io_index); - auto it = graph_spec.shapes_.find(name); - if (it != graph_spec.shapes_.end()) { - // For ragged / batch input, assume the shape in graph spec is proper - // shape after ragged. - if (io_binding_info.IsBufferRagged() || - (io_binding_info.GetBatchInput() != nullptr)) { - cuda_graph->input_dims_.emplace_back(); - } else { - cuda_graph->input_dims_.emplace_back(); - if (batch_size != 0) { - cuda_graph->input_dims_.back().push_back(batch_size); - } - lower_bound_key.push_back(lower_bound_key[0]); - } - auto& shape = cuda_graph->input_dims_.back(); - shape.insert(shape.end(), it->second.begin(), it->second.end()); - nvinfer1::Dims trt_shape; - DimVecToDims(shape, &trt_shape); - if (!trt_context->context_->setBindingDimensions( - binding_index, trt_shape)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("trt failed to set binding dimension to ") + - DimsDebugString(trt_shape) + " for binding " + - std::to_string(binding_index) + " for " + instance_->Name()) - .c_str()); - } - cuda_graph_key->insert( - cuda_graph_key->end(), shape.begin(), shape.end()); - auto lit = graph_spec.lower_bound_shapes_.find(name); - lower_bound_key.insert( - lower_bound_key.end(), lit->second.begin(), lit->second.end()); - } else { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("trt failed to set binding dimension for " - "unknown input '") + - name + "' for " + instance_->Name()) - .c_str()); - } - } - } - return nullptr; -} - -#endif // TRITON_ENABLE_CUDA_GRAPH - -bool -TRTv1Interface::Enqueue(nvinfer1::IExecutionContext* context) -{ - return context->enqueue( - instance_->payload_->total_batch_size_, - instance_->buffer_bindings_[instance_->next_buffer_binding_set_].data(), - instance_->stream_, - &instance_->events_[instance_->next_set_].ready_for_input_); -} - -int64_t -TRTv1Interface::GetFullByteSize( - nvinfer1::IExecutionContext* context, const std::string& tensor_name, - int32_t 
binding_index) -{ - auto dt = ConvertTrtTypeToDataType( - instance_->engine_->getBindingDataType(binding_index)); - const nvinfer1::Dims output_dim = - context->getBindingDimensions(binding_index); - std::vector dim_vec; - DimsToDimVec(output_dim, &dim_vec); - - std::vector dim_vec_with_mbs; - if (instance_->support_batching_) { - dim_vec_with_mbs.push_back(instance_->model_state_->MaxBatchSize()); - } - dim_vec_with_mbs.insert( - dim_vec_with_mbs.end(), dim_vec.begin(), dim_vec.end()); - return GetByteSize(dt, dim_vec_with_mbs); -} - -TRITONSERVER_Error* -TRTv1Interface::SetFormat(int binding_index, TensorFormat* format) -{ - format->is_linear_format_ = - (instance_->engine_->getBindingFormat(binding_index) == - nvinfer1::TensorFormat::kLINEAR); - if (!format->is_linear_format_) { - format->vectorized_dim_ = - instance_->engine_->getBindingVectorizedDim(binding_index); - format->components_per_element_ = - instance_->engine_->getBindingComponentsPerElement(binding_index); - if (format->vectorized_dim_ == -1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("unexpected vectorized dim is -1 for non-linear " - "tensor '") + - instance_->engine_->getBindingName(binding_index) + "' for " + - instance_->Name()) - .c_str()); - } - // +1 for implicit batch dimension to take full shape dim as reference - if (instance_->support_batching_) { - format->vectorized_dim_++; - } - } - return nullptr; -} - -TRITONSERVER_Error* -TRTv1Interface::ConfigureInputDimensions( - TensorRTContext* context, int io_index, int binding_index, - std::vector full_config_dims, std::vector* maximum_dims) -{ - // Below is more of a sanity check, v1 only support fixed shape tensor so - // config shape must be exact match of engine shape. - - // v1 slightly different that TRT dim doesn't contain batch dimension - TRITONSERVER_Error* err = ValidateDimension( - full_config_dims, context->min_dims_[io_index], - context->max_dims_[io_index], - instance_->support_batching_ /* skip_first_dimension */); - if (err != nullptr) { - TRITONSERVER_Error* full_err = TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("model configuration specified invalid shape for " - "input '") + - instance_->engine_->getBindingName(io_index) + "' for model " + - instance_->model_state_->Name() + - ". Error details: " + TRITONSERVER_ErrorMessage(err)) - .c_str()); - - TRITONSERVER_ErrorDelete(err); - return full_err; - } - *maximum_dims = full_config_dims; - return nullptr; -} - -TRITONSERVER_Error* -TRTv1Interface::SetBindingDimensions( - const std::string& input_name, const std::vector& shape, - const TensorRTContext& trt_context, const size_t io_index, - const size_t binding_index, std::vector* cuda_graph_key) -{ - // NO-OP, v1 engine doesn't support dynamic shape, so input shape won't - // need to be set at runtime, for the same reason 'cuda_graph_key' won't - // be changed at runtime as long as the batch size has set. 
- return nullptr; -} - -bool -TRTv3Interface::Enqueue(nvinfer1::IExecutionContext* context) -{ - if (SetTensorAddress(context)) { - if (context->setInputConsumedEvent( - instance_->events_[instance_->next_set_].ready_for_input_)) { - return context->enqueueV3(instance_->stream_); - } - } - return false; -} - -bool -TRTv3Interface::SetTensorAddress(nvinfer1::IExecutionContext* context) -{ - const auto& io_binding_info = - instance_->io_binding_infos_[instance_->next_buffer_binding_set_]; - for (const auto& info : io_binding_info) { - if (!context->setTensorAddress( - info.GetName().c_str(), info.GetDeviceBuffer())) { - return false; - } - } - return true; -} - -int64_t -TRTv3Interface::GetFullByteSize( - nvinfer1::IExecutionContext* context, const std::string& tensor_name, - int32_t binding_index) -{ - return context->getMaxOutputSize(tensor_name.c_str()); -} - -TRITONSERVER_Error* -TRTv3Interface::SetFormat(int binding_index, TensorFormat* format) -{ - format->is_linear_format_ = - (instance_->engine_->getBindingFormat(binding_index) == - nvinfer1::TensorFormat::kLINEAR); - if (!format->is_linear_format_) { - format->vectorized_dim_ = - instance_->engine_->getBindingVectorizedDim(binding_index); - format->components_per_element_ = - instance_->engine_->getBindingComponentsPerElement(binding_index); - if (format->vectorized_dim_ == -1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("unexpected vectorized dim is -1 for non-linear " - "tensor '") + - instance_->engine_->getBindingName(binding_index) + "' for " + - instance_->Name()) - .c_str()); - } - } - return nullptr; -} - -TRITONSERVER_Error* -TRTv3Interface::ConfigureInputDimensions( - TensorRTContext* context, int io_index, int binding_index, - std::vector full_config_dims, std::vector* maximum_dims) -{ - // [FIXME] WAR: max batch size needs to be special handled as it is really - // a dynamic shape, setting it to fixed value will interfere with below - // dimension checkings - const auto batch_dim = full_config_dims[0]; - if (instance_->support_batching_) { - full_config_dims[0] = -1; - } - TRITONSERVER_Error* err = ValidateDimension( - full_config_dims, context->min_dims_[io_index], - context->max_dims_[io_index], false /* skip_first_dimension */); - if (err != nullptr) { - TRITONSERVER_Error* full_err = TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("model configuration specified invalid shape for " - "input '") + - instance_->engine_->getBindingName(io_index) + "' for model " + - instance_->model_state_->Name() + - ". Error details: " + TRITONSERVER_ErrorMessage(err)) - .c_str()); - - TRITONSERVER_ErrorDelete(err); - return full_err; - } - - // Only "prune" maximum dims for the input given context max dim and - // config shape, user may provide fixed config value even for dynamic dim, - // to reduce the size of allocation. 
- RETURN_IF_ERROR(MaximumDims( - context->max_dims_[io_index], full_config_dims, maximum_dims)); - // WAR, see [FIXME] above - if (instance_->support_batching_ && (batch_dim != -1)) { - (*maximum_dims)[0] = std::min((*maximum_dims)[0], batch_dim); - } - DimVecToDims(*maximum_dims, &context->max_dims_[io_index]); - - if (!context->context_->setBindingDimensions( - binding_index, context->max_dims_[io_index])) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("trt failed to set binding dimension to ") + - DimsDebugString(context->max_dims_[io_index]) + " for input '" + - instance_->engine_->getBindingName(io_index) + "' for " + - instance_->Name()) - .c_str()); - } - - return nullptr; -} - -TRITONSERVER_Error* -TRTv3Interface::MaximumDims( - const nvinfer1::Dims& max_profile_dims, const std::vector& dims, - std::vector* max_dims) -{ - if (max_profile_dims.nbDims != (int32_t)dims.size()) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("can not maximize dimension ") + - backend::ShapeToString(dims) + " to " + - DimsDebugString(max_profile_dims) + " due to incompatibility.") - .c_str()); - } - - for (uint64_t i = 0; i < dims.size(); ++i) { - if (dims[i] == WILDCARD_DIM) { - max_dims->emplace_back(max_profile_dims.d[i]); - } else { - if (dims[i] <= max_profile_dims.d[i]) { - max_dims->emplace_back(dims[i]); - } else { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("can not maximize dimension ") + - backend::ShapeToString(dims) + " to " + - DimsDebugString(max_profile_dims) + " due to incompatibility.") - .c_str()); - } - } - } - return nullptr; -} - -TRITONSERVER_Error* -TRTv3Interface::SetBindingDimensions( - const std::string& input_name, const std::vector& shape, - const TensorRTContext& trt_context, const size_t io_index, - const size_t binding_index, std::vector* cuda_graph_key) -{ - if (cuda_graph_key != nullptr) { - cuda_graph_key->insert(cuda_graph_key->end(), shape.begin(), shape.end()); - } - - nvinfer1::Dims this_dim; - // Set the binding dimension so that output dimensions can be - // obtained - if (!DimVecToDims(shape, &this_dim)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("failed to create dims object for ") + - ShapeToString(shape) + " for input '" + input_name + "' for " + - instance_->name_ + ".") - .c_str()); - } - TRITONSERVER_Error* err = ValidateDimension( - this_dim, trt_context.min_dims_[io_index], - trt_context.max_dims_[io_index]); - if (err != nullptr) { - TRITONSERVER_Error* full_err = TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("request specifies invalid shape for input '") + - input_name + "' for " + instance_->name_ + - ". Error details: " + TRITONSERVER_ErrorMessage(err)) - .c_str()); - TRITONSERVER_ErrorDelete(err); - return full_err; - } - - if (!trt_context.is_dynamic_per_binding_[io_index]) { - // No need to set dimension for the binding that does not include - // dynamic shape. 
- return nullptr; - } - - if (!trt_context.context_->setBindingDimensions(binding_index, this_dim)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("trt failed to set binding dimension to ") + - DimsDebugString(this_dim) + " for input '" + input_name + "' for " + - instance_->Name()) - .c_str()); - } - - return nullptr; -} -}}} // namespace triton::backend::tensorrt diff --git a/src_trt8/instance_state.h b/src_trt8/instance_state.h deleted file mode 100644 index 3e7dfc0..0000000 --- a/src_trt8/instance_state.h +++ /dev/null @@ -1,550 +0,0 @@ -// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#pragma once - -#include - -#include -#include -#include - -#include "io_binding_info.h" -#include "model_state.h" -#include "semaphore.h" -#include "tensorrt_model_instance.h" -#include "triton/backend/backend_input_collector.h" -#include "triton/backend/backend_output_responder.h" - -namespace triton { namespace backend { namespace tensorrt { - -// Number of CUDA event set for each instance. -// We need three sets to avoid event overlaps between issue -// and response threads. 
-static constexpr int EVENT_SET_COUNT = 3; - -// -// BackendConfiguration -// -// Struct to hold values specified via backend config -struct BackendConfiguration { - static const BackendConfiguration& RetrieveFrom( - TRITONBACKEND_Backend* backend) - { - void* state = nullptr; - THROW_IF_BACKEND_INSTANCE_ERROR( - TRITONBACKEND_BackendState(backend, &state)); - return *reinterpret_cast(state); - } - - static const BackendConfiguration& RetrieveFrom(TRITONBACKEND_Model* model) - { - TRITONBACKEND_Backend* backend = nullptr; - THROW_IF_BACKEND_INSTANCE_ERROR( - TRITONBACKEND_ModelBackend(model, &backend)); - return RetrieveFrom(backend); - } - - static const BackendConfiguration& RetrieveFrom( - TRITONBACKEND_ModelInstance* instance) - { - TRITONBACKEND_Model* model = nullptr; - THROW_IF_BACKEND_INSTANCE_ERROR( - TRITONBACKEND_ModelInstanceModel(instance, &model)); - return RetrieveFrom(model); - } - - bool coalesce_request_input_{false}; - bool enable_memory_tracker_{false}; -}; - -class ModelInstanceState; -// A struct to hold TensorRT execution context and its meta data, a -// backend context can have multiple of this struct if multiple -// optimization profiles is specified. -struct TensorRTContext { - TensorRTContext( - const std::string& profile_name, const int profile_idx, - const int binding_cnts, const int event_set_cnts) - : profile_name_(profile_name), profile_idx_(profile_idx), - cuda_graph_execs_(event_set_cnts), min_dims_(binding_cnts), - max_dims_(binding_cnts), opt_dims_(binding_cnts), - min_shapes_(binding_cnts), max_shapes_(binding_cnts), - opt_shapes_(binding_cnts), is_dynamic_per_binding_(binding_cnts) - { - } - std::string profile_name_; - int profile_idx_; - std::shared_ptr context_{nullptr}; - - // Struct that holds cudaGraphExec_t and the dimensions of the - // inputs used to capture the graph - struct CudaGraph { - CudaGraph() : cuda_graph_exec_(nullptr) {} - std::vector lower_bound_key_; - // Store in the order of the bindng index - std::vector> input_dims_; - cudaGraphExec_t cuda_graph_exec_; - }; - - // The key is packed input dimensions prepended by batch size, so - // that uniqueness is guaranteed and the CUDA graphs are sorted to - // provide convenience to find the closest CUDA graph in the - // future. - // - // vector is used to map index of event sets to corresponding - // collection. - // [DLIS-4283] need to be careful in the case of having multiple sets of - // binding buffer, see BuildCudaGraph that in such a case - // (num_copy_streams_ != 1), the index of binding set is tied to the same - // value for index of event set. This is fine for now as there is at most - // 2 sets of bindings (the same as number of event sets), but wrong CUDA graph - // may be used if we change to mismatching number such that the assumption - // fails. 
- std::vector, CudaGraph>> cuda_graph_execs_; - - // Min Dimensions per bindings - std::vector min_dims_{}; - - // Max Dimensions per bindings - std::vector max_dims_{}; - - // Optimized Dimensions per bindings - std::vector opt_dims_{}; - - // Min shape values per bindings - std::vector min_shapes_{}; - - // Max shape values per bindings - std::vector max_shapes_{}; - - // Optimized shape values per bindings - std::vector opt_shapes_{}; - - // The number of shape values - size_t nb_shape_values_{0}; - - // Whether or not the binding contains a dynamic shape - std::vector is_dynamic_per_binding_{}; -}; - -struct GraphSpec { - int64_t batch_size_{0}; - std::map> shapes_{}; - int64_t lower_bound_batch_size_{0}; - std::map> lower_bound_shapes_{}; - bool captured_{false}; -}; - -// [DLIS-4283] temporary workaround to separate TRT v1 and TRT v3 usage -// in polymorphic style -class TRTInterface { - public: - TRTInterface(ModelInstanceState* i) : instance_(i) {} - - // NOTE: before calling this function, members of 'instance_' must be properly - // set for all variants of IExecutionContext::enqueue(), i.e. input shapes and - // I/O bindings - virtual bool Enqueue(nvinfer1::IExecutionContext* context) = 0; - - // This function will be called to specify the runtime shape of the input and - // adding metadata into existing 'cuda_graph_key' for graph lookup. - virtual TRITONSERVER_Error* SetBindingDimensions( - const std::string& input_name, const std::vector& shape, - const TensorRTContext& trt_context, const size_t io_index, - const size_t binding_index, std::vector* cuda_graph_key) = 0; - - // Return the max byte size of the binding - virtual int64_t GetFullByteSize( - nvinfer1::IExecutionContext* context, const std::string& tensor_name, - int32_t binding_index) = 0; - - virtual TRITONSERVER_Error* SetFormat( - int binding_index, TensorFormat* format) = 0; - - // get max input shape of the binding buffer based on the given - // TensorRTContext (optimization profile),member of 'context' may be updated - virtual TRITONSERVER_Error* ConfigureInputDimensions( - TensorRTContext* context, int io_index, int binding_index, - std::vector full_config_dims, - std::vector* maximum_dims) = 0; - - protected: - ModelInstanceState* instance_; - -#ifdef TRITON_ENABLE_CUDA_GRAPH - public: - virtual bool BuildCudaGraph( - TensorRTContext* trt_context, const GraphSpec& graph_spec) = 0; -#endif // TRITON_ENABLE_CUDA_GRAPH -}; - -class TRTv1Interface : public TRTInterface { - public: - TRTv1Interface(ModelInstanceState* i) : TRTInterface(i) {} - bool Enqueue(nvinfer1::IExecutionContext* context) override; - TRITONSERVER_Error* SetBindingDimensions( - const std::string& input_name, const std::vector& shape, - const TensorRTContext& trt_context, const size_t io_index, - const size_t binding_index, - std::vector* cuda_graph_key) override; - int64_t GetFullByteSize( - nvinfer1::IExecutionContext* context, const std::string& tensor_name, - int32_t binding_index) override; - TRITONSERVER_Error* SetFormat( - int binding_index, TensorFormat* format) override; - TRITONSERVER_Error* ConfigureInputDimensions( - TensorRTContext* context, int io_index, int binding_index, - std::vector full_config_dims, - std::vector* maximum_dims) override; -#ifdef TRITON_ENABLE_CUDA_GRAPH - public: - bool BuildCudaGraph( - TensorRTContext* trt_context, const GraphSpec& graph_spec) override; -#endif // TRITON_ENABLE_CUDA_GRAPH -}; - -class TRTv3Interface : public TRTInterface { - public: - TRTv3Interface(ModelInstanceState* i) : TRTInterface(i) {} 
- bool Enqueue(nvinfer1::IExecutionContext* context) override; - TRITONSERVER_Error* SetBindingDimensions( - const std::string& input_name, const std::vector& shape, - const TensorRTContext& trt_context, const size_t io_index, - const size_t binding_index, - std::vector* cuda_graph_key) override; - int64_t GetFullByteSize( - nvinfer1::IExecutionContext* context, const std::string& tensor_name, - int32_t binding_index) override; - TRITONSERVER_Error* SetFormat( - int binding_index, TensorFormat* format) override; - TRITONSERVER_Error* ConfigureInputDimensions( - TensorRTContext* context, int io_index, int binding_index, - std::vector full_config_dims, - std::vector* maximum_dims) override; - - private: - // Helper function to be called in Enqueue(). In v3, the binding buffer is set - // in execution context instead of being provided on enqueue, so in the case - // where different buffers are used alternatively in execution, we need to set - // tensor address to proper buffers. - bool SetTensorAddress(nvinfer1::IExecutionContext* context); - TRITONSERVER_Error* MaximumDims( - const nvinfer1::Dims& max_profile_dims, const std::vector& dims, - std::vector* max_dims); -#ifdef TRITON_ENABLE_CUDA_GRAPH - public: - bool BuildCudaGraph( - TensorRTContext* trt_context, const GraphSpec& graph_spec) override; - - private: - TRITONSERVER_Error* SetCudaGraphShape( - TensorRTContext* trt_context, const GraphSpec& graph_spec, - std::vector* cuda_graph_key, - TensorRTContext::CudaGraph* cuda_graph); -#endif // TRITON_ENABLE_CUDA_GRAPH -}; - -// -// ModelInstanceState -// -// State associated with a model instance. An object of this class is -// created and associated with each TRITONBACKEND_ModelInstance. -// -class ModelInstanceState : public TensorRTModelInstance { - public: - // GPU device number that indicates that no gpu is available for a - // context (which is an invalid state since TensorRT requires a - // GPU). - static constexpr int NO_GPU_DEVICE = -1; - - // GPU device number that indicates model will be loaded on GPUs - // as specified in model graph - static constexpr int MODEL_DEVICE = -2; - - static TRITONSERVER_Error* Create( - ModelState* model_state, - TRITONBACKEND_ModelInstance* triton_model_instance, - ModelInstanceState** state); - virtual ~ModelInstanceState(); - - // Get the state of the model that corresponds to this instance. 
- ModelState* StateForModel() const { return model_state_; } - - std::shared_ptr* EnginePtr() { return &engine_; } - std::shared_ptr Engine() { return engine_; } - - void ProcessRequests( - TRITONBACKEND_Request** requests, const uint32_t request_count); - - void Run(TRITONBACKEND_Request** requests, const uint32_t request_count); - - Semaphore* SemaphorePtr() { return semaphore_.get(); } - - protected: - friend class TRTv1Interface; - friend class TRTv3Interface; - - ModelInstanceState( - ModelState* model_state, - TRITONBACKEND_ModelInstance* triton_model_instance); - - void InitSemaphore(); - TRITONSERVER_Error* InitStreamsAndEvents(); - TRITONSERVER_Error* InitEventSet(bool busy_wait_events); - TRITONSERVER_Error* DestroyEventSet(); - TRITONSERVER_Error* InitOptimizationProfiles(); - - TRITONSERVER_Error* ValidateIO(); - TRITONSERVER_Error* ValidateIOHelper( - common::TritonJson::Value& ios, - const std::set& allowed_shape_tensors, const bool is_input); - - TRITONSERVER_Error* InitIOBindingBuffers(); - TRITONSERVER_Error* InitializeConfigShapeInputBindings( - common::TritonJson::Value& config_inputs); - TRITONSERVER_Error* InitializeConfigExecuteInputBindings( - common::TritonJson::Value& config_inputs); - TRITONSERVER_Error* InitializeSequenceControlInputBindings( - common::TritonJson::Value& config); - TRITONSERVER_Error* InitializeSequenceStateInputBindings( - common::TritonJson::Value& config); - TRITONSERVER_Error* InitializeBatchInputBindings( - common::TritonJson::Value& config); - TRITONSERVER_Error* InitializeBatchOutputBindings( - common::TritonJson::Value& config); - // This function sets the output bindings for shape tensors. - TRITONSERVER_Error* InitializeConfigShapeOutputBindings( - common::TritonJson::Value& config_output); - TRITONSERVER_Error* InitializeConfigExecuteOutputBindings( - common::TritonJson::Value& config_output); - TRITONSERVER_Error* InitializeExecuteInputBinding( - const std::string& input_name, const std::string& input_datatype, - common::TritonJson::Value& input_dims, const bool is_control = false, - const bool is_ragged = false, const bool is_state = false); - TRITONSERVER_Error* InitializeExecuteOutputBinding( - const std::string& output_name, const std::string& output_datatype, - common::TritonJson::Value& output_dims, bool is_state = false); - TRITONSERVER_Error* InitializeShapeInputBinding( - const std::string& input_name, const TRITONSERVER_DataType input_datatype, - common::TritonJson::Value& model_config_dims); - TRITONSERVER_Error* InitializeSequenceStateOutputBindings( - common::TritonJson::Value& config); - - TRITONSERVER_Error* GetProfileDimensions( - const int io_index, const int profile_index, TensorRTContext* context); - - TRITONSERVER_Error* GetRequestShapeValues( - size_t total_batch_size, TRITONBACKEND_Request* request, - std::map>* request_shape_values); - TRITONSERVER_Error* GetMostOptimizedProfile( - size_t total_batch_size, TRITONBACKEND_Request** requests, - uint32_t request_count, - const std::map>& request_shape_values, - std::map::iterator* citr); - TRITONSERVER_Error* EvaluateTensorRTContext( - std::map::iterator& citr, size_t total_batch_size, - TRITONBACKEND_Request** requests, uint32_t request_count, - const std::map>& request_shape_values, - int64_t* error_distance); - - bool SetOutputShapeTensorBuffer( - const int32_t* content, TRITONBACKEND_Response** response, - TRITONBACKEND_Output* response_output, const size_t tensor_element_count, - const int64_t batch_size, cudaStream_t stream); - void ProcessResponse(); - - void 
GetConfiguredProfiles(std::string* profiles_desc); - int CudaStreamPriority() { return cuda_stream_priority_; } - - void FindClosestCudaGraph( - const TensorRTContext& trt_context, - const std::vector& cuda_graph_key, - const TensorRTContext::CudaGraph** cuda_graph, bool* found_exact); - -#ifdef TRITON_ENABLE_CUDA_GRAPH - TRITONSERVER_Error* InitializeCudaGraph(); - TRITONSERVER_Error* InitializeGraphSpecs( - std::vector* graph_specs, bool* allow_inexact_match); - TRITONSERVER_Error* ValidateGraphSpec(const GraphSpec& graph_spec); -#endif // TRITON_ENABLE_CUDA_GRAPH - - // The engine used for the instance. If the model uses dynamic - // shape, then the CUDA engine is owned by the instance. Otherwise, - // the engine is shared across all contexts and it must not be - // destroyed by the instance. In the future version of TensorRT, the - // engine may be shared even in the dynamic shape case. - std::shared_ptr engine_{nullptr}; - - // Map from profile index to the corresponding TensorRT context. Use - // map to ensure each profile index is mapped to exactly one - // TensorRT context. - std::map trt_contexts_{}; - - // Is set true if the configuration supports batching - bool support_batching_{false}; - - // Whether inexact match is allowed for finding CUDA graph - bool allow_inexact_match_{false}; - - // The total number of bindings - int total_bindings_{0}; - - // The number of expected bindings to the model. In case of dynamic - // shapes, it is the number of expected bindings to the configured - // optimization profile. - int num_expected_bindings_{0}; - - int cuda_stream_priority_{0}; - - // Additional CUDA streams to overlap copy and execution. - cudaStream_t input_copy_stream_{}; - cudaStream_t output_copy_stream_{}; - int num_copy_streams_{0}; - - // CUDA stream use to track execution status - cudaStream_t signal_stream_{}; - - // A group of CUDA events that signals different stages of the - // request. One group should be used for one request at any given - // moment. - struct CUDAEventSet { - // CUDA event to signal input buffer availability. - cudaEvent_t ready_for_input_{}; - cudaEvent_t input_ready_{}; - - // CUDA event for capturing correct timestamp. - cudaEvent_t ready_for_output_{}; - cudaEvent_t output_ready_{}; - - // CUDA event for synchronizing the order of timestamp capture. - cudaEvent_t compute_output_start_{}; - cudaEvent_t compute_input_end_{}; - cudaEvent_t compute_input_start_{}; - }; - - // Use two sets of events each for current request and next request. - CUDAEventSet events_[EVENT_SET_COUNT]{}; - size_t next_set_{0}; - - // Completion thread for handling items in the corresponding - // completion queue. One thread per instance so that the thread - // logic is simple as this avoids busy-looping on different model - // executions' event states. - std::thread completion_thread_{}; - - // The details needed by the completion thread to finalize the - // response for a model execution. 
- struct Payload { - explicit Payload( - size_t event_set_idx, TRITONBACKEND_Request** requests, - uint32_t request_count) - : event_set_idx_(event_set_idx), requests_(requests), - request_count_(request_count) - { - } - - // The index to the event set handling the request - size_t event_set_idx_{0}; - - // The total batch size for the request - size_t total_batch_size_{0}; - - // The timestamps for reporting stats - uint64_t compute_start_ns_{0}; - uint64_t compute_input_end_ns_{0}; - uint64_t compute_output_start_ns_{0}; - - // All the composing InferenceRequest objects - std::vector requests_list_{}; - TRITONBACKEND_Request** requests_{}; - uint32_t request_count_{0}; - - // All the generated InferenceResponse objects - std::vector responses_{}; - - // The State objects for the inference requests - std::vector seq_states_{}; - - // The collector and responder of the payload, need to extend - // their lifetime to match the payload to ensure content is intact - // until the end of execution. - std::unique_ptr collector_{nullptr}; - std::unique_ptr responder_{nullptr}; - - std::vector> buffer_input_binding_pairs_{}; - }; - - // Assume that the lifetime of composing completion data to extend - // till the responses are returned. - triton::common::SyncQueue> completion_queue_{}; - - // There will be two sets of input/output buffers when - // separate_output_stream is selected to overlap copy and execution - // safely. - int next_buffer_binding_set_{0}; - - // There are Context::num_expected_bindings_ number of IOBindingInfo - // elements for copy stream. - std::vector> io_binding_infos_{}; - - // [DLIS-4283] no longer needed for v3, but v1 still needs it. Should - // encapsulate to v1 specific handling and gradually remove it from regular - // workflow. - // The pointer to the CUDA buffer for each binding index of the - // TensorRT engine. This is used to match the TensorRT context - // execution declaration while minimizing memory allocation. The - // array size is equal to Context::total_bindings_ One of for each - // copy stream - std::vector> buffer_bindings_{}; - - // The request details of the ongoing model execution - std::unique_ptr payload_{nullptr}; - - // Whether zero copy is supported on this device - bool zero_copy_support_{false}; - - // Whether the input collector will coalesce request inputs as if they form - // one contiguous buffer when possible - bool coalesce_request_input_{false}; - - // Whether or not the model uses implicit state. - bool uses_implicit_state_{false}; - - // Holds up the execution on issue thread unless promise is fulfilled. - std::unique_ptr> barrier_{nullptr}; - - ModelState* model_state_{nullptr}; - - std::unique_ptr interface_{nullptr}; - - // TRT model instance performs execution asynchorously and thus may go - // ahead to prepare further executions. Use semaphore to prevent going too - // far ahead and overwriting resources that are still in use. - std::unique_ptr semaphore_{nullptr}; -}; - -}}} // namespace triton::backend::tensorrt diff --git a/src_trt8/io_binding_info.cc b/src_trt8/io_binding_info.cc deleted file mode 100644 index aaca2b3..0000000 --- a/src_trt8/io_binding_info.cc +++ /dev/null @@ -1,249 +0,0 @@ -// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include "io_binding_info.h" - -namespace triton { namespace backend { namespace tensorrt { - -void -IOBindingInfo::SetName(const std::string& name) -{ - name_ = name; -} - -const std::string& -IOBindingInfo::GetName() const -{ - return name_; -} - -void -IOBindingInfo::SetByteSize(uint64_t byte_size) -{ - byte_size_ = byte_size; -} - -uint64_t -IOBindingInfo::GetByteSize() const -{ - return byte_size_; -} - -void -IOBindingInfo::SetBuffer(void* buffer) -{ - buffer_ = buffer; -} - -void -IOBindingInfo::SetBuffer(std::unique_ptr allocator) -{ - allocator_ = std::move(allocator); - is_dynamic_shape_output_ = true; -} - -void* -IOBindingInfo::GetBuffer() const -{ - if (is_dynamic_shape_output_) { - if (allocator_ == nullptr) { - return nullptr; - } else { - return allocator_->getBuffer(); - } - } else { - return buffer_; - } -} - -void -IOBindingInfo::SetDeviceBuffer(void* device_buffer) -{ - device_buffer_ = device_buffer; -} - -void* -IOBindingInfo::GetDeviceBuffer() const -{ - if (is_dynamic_shape_output_) { - if (allocator_ == nullptr) { - return nullptr; - } else { - return allocator_->getBuffer(); - } - } else { - return device_buffer_; - } -} - -void** -IOBindingInfo::GetDeviceBufferAddr() -{ - if (is_dynamic_shape_output_) { - return allocator_->getBufferAddr(); - } else { - return &device_buffer_; - } -} - -void -IOBindingInfo::SetMemoryType(TRITONSERVER_MemoryType memory_type) -{ - memory_type_ = memory_type; -} - -TRITONSERVER_MemoryType -IOBindingInfo::GetMemoryType() const -{ - return memory_type_; -} - -void -IOBindingInfo::SetMemoryTypeId(int64_t memory_type_id) -{ - memory_type_id_ = memory_type_id; -} - -int64_t -IOBindingInfo::GetMemoryTypeId() const -{ - return memory_type_id_; -} - -void -IOBindingInfo::SetIsBufferRagged(bool is_buffer_ragged) -{ - is_buffer_ragged_ = is_buffer_ragged; -} - -bool -IOBindingInfo::IsBufferRagged() const -{ - return is_buffer_ragged_; -} - -void -IOBindingInfo::SetFormat(const TensorFormat& format) -{ - format_ = format; -} - -TensorFormat& -IOBindingInfo::GetFormat() 
-{ - return format_; -} - -void -IOBindingInfo::SetBatchOutput(const BatchOutput* batch_output) -{ - batch_output_ = batch_output; -} - -const BatchOutput* -IOBindingInfo::GetBatchOutput() const -{ - return batch_output_; -} - -void -IOBindingInfo::SetBatchInput(const BatchInput& batch_input) -{ - batch_input_.reset(new BatchInputData(batch_input, nullptr)); -} - -const std::shared_ptr& -IOBindingInfo::GetBatchInput() const -{ - return batch_input_; -} - -void -IOBindingInfo::SetIoShapeMapping( - const std::pair>& io_shape_mapping) -{ - io_shape_mapping_ = io_shape_mapping; -} - -std::pair>& -IOBindingInfo::GetIoShapeMapping() -{ - return io_shape_mapping_; -} - -void -IOBindingInfo::SetIsStateOutput(bool is_state_output) -{ - is_state_output_ = is_state_output; -} - -bool -IOBindingInfo::IsStateOutput() const -{ - return is_state_output_; -} - -void -IOBindingInfo::SetIsRequestedOutputTensor(bool is_requested_output_tensor) -{ - is_requested_output_tensor_ = is_requested_output_tensor; -} - -bool -IOBindingInfo::IsRequestedOutputTensor() const -{ - return is_requested_output_tensor_; -} - -void -IOBindingInfo::SetIsDynamicShapeOutput(bool is_dynamic_shape_output) -{ - is_dynamic_shape_output_ = is_dynamic_shape_output; -} - -bool -IOBindingInfo::IsDynamicShapeOutput() const -{ - return is_dynamic_shape_output_; -} - -bool -IOBindingInfo::IsBufferAllocated() const -{ - return (buffer_ != nullptr) || (allocator_ != nullptr); -} - -OutputAllocator* -IOBindingInfo::GetAllocator() -{ - if (allocator_) { - return allocator_.get(); - } else { - return nullptr; - } -} - -}}} // namespace triton::backend::tensorrt diff --git a/src_trt8/io_binding_info.h b/src_trt8/io_binding_info.h deleted file mode 100644 index 5ff1c56..0000000 --- a/src_trt8/io_binding_info.h +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
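The IOBindingInfo implementation above is a plain accessor bag, but its one subtlety is the pair of SetBuffer overloads: handing it a raw pointer keeps the binding on a caller-managed buffer, while handing it a std::unique_ptr to an OutputAllocator marks the binding as a dynamic-shape output and makes GetBuffer()/GetDeviceBuffer() serve the allocator's buffer instead. A minimal sketch of the static-buffer path, assuming the deleted headers are on the include path; the tensor name and sizes are illustrative only, not taken from the backend:

#include <cstdint>
#include "io_binding_info.h"

namespace trt = triton::backend::tensorrt;

// Describe a fixed-size GPU output binding; the CUDA allocation itself is
// owned by the caller, IOBindingInfo only records it.
void SketchStaticBinding(void* device_ptr, uint64_t byte_size)
{
  trt::IOBindingInfo info;
  info.SetName("output__0");               // illustrative tensor name
  info.SetByteSize(byte_size);
  info.SetBuffer(device_ptr);              // non-owning raw-pointer overload
  info.SetMemoryType(TRITONSERVER_MEMORY_GPU);
  info.SetMemoryTypeId(0);

  // With the other SetBuffer overload (std::unique_ptr<OutputAllocator>) the
  // binding becomes a dynamic-shape output and these getters would return
  // allocator->getBuffer() instead of the pointer recorded above.
  if (info.IsBufferAllocated()) {
    void* buf = info.GetBuffer();          // equals device_ptr on this path
    (void)buf;
  }
}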
- -#pragma once - -#include -#include -#include - -#include "output_allocator.h" -#include "triton/backend/backend_common.h" -#include "triton/backend/backend_input_collector.h" -#include "triton/backend/backend_output_responder.h" -#include "triton/core/tritonserver.h" - -namespace triton { namespace backend { namespace tensorrt { - -struct TensorFormat { - bool is_linear_format_{true}; - int vectorized_dim_{-1}; - int components_per_element_{1}; -}; - -using BatchInputData = std::pair>; - -class IOBindingInfo { - private: - std::string name_{}; - uint64_t byte_size_{0}; - void* buffer_{nullptr}; - void* device_buffer_{nullptr}; - TRITONSERVER_MemoryType memory_type_{TRITONSERVER_MEMORY_GPU}; - int64_t memory_type_id_{0}; - bool is_buffer_ragged_{false}; - TensorFormat format_{}; - const BatchOutput* batch_output_{nullptr}; - std::shared_ptr batch_input_{nullptr}; - std::pair> io_shape_mapping_{}; - bool is_state_output_{false}; - bool is_requested_output_tensor_{false}; - bool is_dynamic_shape_output_{false}; - std::unique_ptr allocator_{nullptr}; - - public: - void SetName(const std::string& name); - const std::string& GetName() const; - - void SetByteSize(uint64_t byte_size); - uint64_t GetByteSize() const; - - void SetBuffer(void* buffer); - void SetBuffer(std::unique_ptr allocator); - void* GetBuffer() const; - - void SetDeviceBuffer(void* device_buffer); - void* GetDeviceBuffer() const; - void** GetDeviceBufferAddr(); - - void SetMemoryType(TRITONSERVER_MemoryType memory_type); - TRITONSERVER_MemoryType GetMemoryType() const; - - void SetMemoryTypeId(int64_t memory_type_id); - int64_t GetMemoryTypeId() const; - - void SetIsBufferRagged(bool is_buffer_ragged); - bool IsBufferRagged() const; - - void SetFormat(const TensorFormat& format); - TensorFormat& GetFormat(); - - void SetBatchOutput(const BatchOutput* batch_output); - const BatchOutput* GetBatchOutput() const; - - void SetBatchInput(const BatchInput& batch_input); - const std::shared_ptr& GetBatchInput() const; - - void SetIoShapeMapping( - const std::pair>& io_shape_mapping); - std::pair>& GetIoShapeMapping(); - - void SetIsStateOutput(bool is_state_output); - bool IsStateOutput() const; - - void SetIsRequestedOutputTensor(bool is_requested_output_tensor); - bool IsRequestedOutputTensor() const; - - void SetIsDynamicShapeOutput(bool is_dynamic_shape_output); - bool IsDynamicShapeOutput() const; - - bool IsBufferAllocated() const; - - OutputAllocator* GetAllocator(); -}; - -}}} // namespace triton::backend::tensorrt diff --git a/src_trt8/libtriton_tensorrt.ldscript b/src_trt8/libtriton_tensorrt.ldscript deleted file mode 100644 index 748714d..0000000 --- a/src_trt8/libtriton_tensorrt.ldscript +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -{ - global: - TRITONBACKEND_*; - local: *; -}; diff --git a/src_trt8/loader.cc b/src_trt8/loader.cc deleted file mode 100644 index bd09404..0000000 --- a/src_trt8/loader.cc +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -#include "loader.h" - -#include - -#include -#include - -#include "triton/backend/backend_common.h" - -namespace triton { namespace backend { namespace tensorrt { - -TRITONSERVER_Error* -LoadPlan( - const std::string& plan_path, const int64_t dla_core_id, - std::shared_ptr* runtime, - std::shared_ptr* engine, - TensorRTLogger* tensorrt_logger) -{ - // Create runtime only if it is not provided - if (*runtime == nullptr) { - runtime->reset(nvinfer1::createInferRuntime(*tensorrt_logger)); - if (*runtime == nullptr) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to create TensorRT runtime: ") + - tensorrt_logger->LastErrorMsg()) - .c_str()); - } - - if (ModelState::isVersionCompatible() && - !runtime->get()->getEngineHostCodeAllowed()) { - runtime->get()->setEngineHostCodeAllowed(true); - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("Version compatibility enabled for runtime")).c_str()); - } - } - - // Report error if 'dla_core_id' >= number of DLA cores - if (dla_core_id != -1) { - auto dla_core_count = (*runtime)->getNbDLACores(); - if (dla_core_id < dla_core_count) { - (*runtime)->setDLACore(dla_core_id); - } else { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("unable to create TensorRT runtime with DLA Core ID: ") + - std::to_string(dla_core_id) + - ", available number of DLA cores: " + std::to_string(dla_core_count)) - .c_str()); - } - } - - std::string model_data_str; - RETURN_IF_ERROR(ReadTextFile(plan_path, &model_data_str)); - std::vector model_data(model_data_str.begin(), model_data_str.end()); - - engine->reset( - (*runtime)->deserializeCudaEngine(&model_data[0], model_data.size())); - if (*engine == nullptr) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to create TensorRT engine: ") + - tensorrt_logger->LastErrorMsg()) - .c_str()); - } - - return nullptr; -} - -}}} // namespace triton::backend::tensorrt diff --git a/src_trt8/loader.h b/src_trt8/loader.h deleted file mode 100644 index 0e580e1..0000000 --- a/src_trt8/loader.h +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#pragma once - -#include - -#include -#include -#include - -#include "logging.h" -#include "model_state.h" -#include "triton/core/tritonserver.h" - -namespace triton { namespace backend { namespace tensorrt { - -/// Load a TensorRT plan from a serialized plan file and return the -/// corresponding runtime and engine. It is the caller's -/// responsibility to destroy any returned runtime or engine object -/// even if an error is returned. -/// -/// \param plan_path The path to the model plan file. -/// \param dla_core_id The DLA core to use for this runtime. Does not -/// use DLA when set to -1. -/// \param tensorrt_logger The logger to be used by this TensorRT plan. -/// \param runtime Returns the IRuntime object, or nullptr if failed -/// to create. -/// \param engine Returns the ICudaEngine object, or nullptr if failed -/// to create. -/// \return Error status. -TRITONSERVER_Error* LoadPlan( - const std::string& plan_path, const int64_t dla_core_id, - std::shared_ptr* runtime, - std::shared_ptr* engine, - TensorRTLogger* tensorrt_logger); - -}}} // namespace triton::backend::tensorrt diff --git a/src_trt8/logging.cc b/src_trt8/logging.cc deleted file mode 100644 index f690096..0000000 --- a/src_trt8/logging.cc +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
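The LoadPlan declaration documented above carries an ownership contract that is easy to miss: the runtime is only created when the caller passes a null shared_ptr, and the caller must release whatever was created even when an error is returned. A minimal caller sketch under those rules, assuming the deleted loader.h and logging.h are available on the include path and using an illustrative plan path:

#include <memory>
#include <string>

#include <NvInfer.h>

#include "loader.h"
#include "logging.h"
#include "triton/backend/backend_common.h"

namespace trt = triton::backend::tensorrt;

bool SketchLoadEngine(const std::string& plan_path)
{
  trt::TensorRTLogger logger;
  std::shared_ptr<nvinfer1::IRuntime> runtime;    // created by LoadPlan when null
  std::shared_ptr<nvinfer1::ICudaEngine> engine;

  TRITONSERVER_Error* err = trt::LoadPlan(
      plan_path, -1 /* dla_core_id: no DLA */, &runtime, &engine, &logger);
  if (err != nullptr) {
    // Per the contract above, the caller still owns (and must release)
    // anything LoadPlan managed to create before failing.
    LOG_MESSAGE(TRITONSERVER_LOG_ERROR, TRITONSERVER_ErrorMessage(err));
    TRITONSERVER_ErrorDelete(err);
    engine.reset();
    runtime.reset();
    return false;
  }
  return engine != nullptr;
}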
- -#include "logging.h" - -#include "triton/backend/backend_common.h" - -namespace triton { namespace backend { namespace tensorrt { - -void -TensorRTLogger::log(Severity severity, const char* msg) noexcept -{ - switch (severity) { - case Severity::kINTERNAL_ERROR: // fall-through to 'Severity::kERROR' - case Severity::kERROR: - RecordErrorMsg(msg); - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, msg); - break; - case Severity::kWARNING: - LOG_MESSAGE(TRITONSERVER_LOG_WARN, msg); - break; - case Severity::kINFO: - LOG_MESSAGE(TRITONSERVER_LOG_INFO, msg); - break; - case Severity::kVERBOSE: - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, msg); - break; - } -} - -void -TensorRTLogger::RecordErrorMsg(const char* msg) noexcept -{ - std::lock_guard lock(last_error_msg_mu_); - last_error_msg_ = std::string(msg); -} - -std::string -TensorRTLogger::LastErrorMsg() -{ - std::lock_guard lock(last_error_msg_mu_); - return last_error_msg_; -} - -}}} // namespace triton::backend::tensorrt diff --git a/src_trt8/logging.h b/src_trt8/logging.h deleted file mode 100644 index b5145fa..0000000 --- a/src_trt8/logging.h +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#pragma once - -#include - -#include -#include - -namespace triton { namespace backend { namespace tensorrt { - -// Logger for TensorRT API -class TensorRTLogger : public nvinfer1::ILogger { - // Called by TensorRT to log messages - void log(Severity severity, const char* msg) noexcept override; - - public: - // Return the last 'Severity::kERROR' message logged by TensorRT - std::string LastErrorMsg(); - - private: - // Record error messages logged by TensorRT - void RecordErrorMsg(const char* msg) noexcept; - - std::mutex last_error_msg_mu_; // Protect the last error message string. 
- std::string last_error_msg_; -}; - -}}} // namespace triton::backend::tensorrt diff --git a/src_trt8/model_state.cc b/src_trt8/model_state.cc deleted file mode 100644 index 4c9abfb..0000000 --- a/src_trt8/model_state.cc +++ /dev/null @@ -1,932 +0,0 @@ -// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include "model_state.h" - -#include "instance_state.h" -#include "loader.h" -#include "tensorrt_utils.h" - -namespace triton { namespace backend { namespace tensorrt { - -class DeviceBlockingArbitrator : public ExecutionArbitrator { - public: - DeviceBlockingArbitrator() {} - void RegisterInstance( - const int device_id, ModelInstanceState* instance) override - { - // instance may be created in parallel - std::lock_guard lk(mtx_); - auto it = device_instances_.find(device_id); - if (it == device_instances_.end()) { - it = device_instances_.emplace(std::make_pair(device_id, InstanceList())) - .first; - // The first instance of the device should have 1-less items because - // it is naturally ready for next execution. See - // TRITONBACKEND_ModelInstanceExecute for the implication of acquisition. - instance->SemaphorePtr()->Acquire(); - } - it->second.instances_.emplace_back(instance); - } - - std::pair ExecutionState( - const int device_id, ModelInstanceState* instance) override - { - std::lock_guard lk(mtx_); - auto& il = device_instances_[device_id]; - auto current_instance = il.instances_[il.idx_]; - il.idx_ = ((il.idx_ + 1) % il.instances_.size()); - // In DEVICE_BLOCKING, the semaphore of the "next instance" will be used to - // determine whether the execution should wait for resources to be ready for - // next execution. - // Here we simplify the "available instance" look-up by round-robin the - // instances associated with the given device, as the executions are less - // likely to finish out of order. 
- auto semaphore = il.instances_[il.idx_]->SemaphorePtr(); - return {current_instance, semaphore}; - } - - private: - // A list of instances associated with the same device ID an the index - // to the instance to be used for execution. - struct InstanceList { - InstanceList() : idx_(0) {} - std::vector instances_; - int idx_; - }; - // map between device ID and associated instances - std::map device_instances_; - - std::mutex mtx_; -}; - -class InstanceBlockingArbitrator : public ExecutionArbitrator { - public: - InstanceBlockingArbitrator() {} - void RegisterInstance( - const int device_id, ModelInstanceState* instance) override - { - // all instances are naturally ready for next execution after - // initialization. - instance->SemaphorePtr()->Acquire(); - } - std::pair ExecutionState( - const int device_id, ModelInstanceState* instance) override - { - // In BLOCKING, the TRITONBACKEND_ModelInstance and executing instance are - // coupled - return {instance, instance->SemaphorePtr()}; - } -}; - - -TRITONSERVER_Error* -ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) -{ - try { - *state = new ModelState(triton_model); - } - catch (const BackendModelException& ex) { - RETURN_ERROR_IF_TRUE( - ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, - std::string("unexpected nullptr in BackendModelException")); - RETURN_IF_ERROR(ex.err_); - } - - // Auto-complete the configuration if requested... - bool auto_complete_config = false; - RETURN_IF_ERROR(TRITONBACKEND_ModelAutoCompleteConfig( - triton_model, &auto_complete_config)); - - // Server core already detects if a GPU is present and - // corrects the instance groups before backend model is - // initialized. Since the TensorRT backend only works with - // GPU instances, check if the model has a KIND_GPU - // instance group. If KIND_GPU is not present, skip - // autocomplete as the model cannot be loaded. - bool has_instance_kind_gpu = false; - (*state)->InstanceHasKindGPU(&has_instance_kind_gpu); - - if (auto_complete_config && has_instance_kind_gpu) { - RETURN_IF_ERROR((*state)->AutoCompleteConfig()); - RETURN_IF_ERROR((*state)->SetTensorRTModelConfig()); - } - - RETURN_IF_ERROR((*state)->ValidateModelConfig()); - RETURN_IF_ERROR((*state)->ParseParameters()); - - return nullptr; // success -} - -ModelState::ModelState(TRITONBACKEND_Model* triton_model) - : TensorRTModel(triton_model), engine_sharing_(true) -{ - // Obtain backend configuration - TRITONBACKEND_Backend* backend; - THROW_IF_BACKEND_MODEL_ERROR( - TRITONBACKEND_ModelBackend(triton_model, &backend)); - TRITONBACKEND_ExecutionPolicy policy; - THROW_IF_BACKEND_MODEL_ERROR( - TRITONBACKEND_BackendExecutionPolicy(backend, &policy)); - switch (policy) { - case TRITONBACKEND_EXECUTION_BLOCKING: - execution_arbitrator_.reset(new InstanceBlockingArbitrator()); - break; - case TRITONBACKEND_EXECUTION_DEVICE_BLOCKING: { - // [FIXME] ad-hoc way to resolve the issue that in sequence batching, - // Triton instance must tie to the actual execution instance so that the - // model state is properly maintained. 
- triton::common::TritonJson::Value value; - bool found_sequence_batching = - ModelConfig().Find("sequence_batching", &value); - if (found_sequence_batching) { - execution_arbitrator_.reset(new InstanceBlockingArbitrator()); - } else { - execution_arbitrator_.reset(new DeviceBlockingArbitrator()); - } - break; - } - } -} - -ModelState::~ModelState() -{ - for (auto& device_engine : device_engines_) { - cudaSetDevice(device_engine.first.first); - auto& runtime = device_engine.second.first; - auto& engine = device_engine.second.second; - // Need to reset explicitly to ensure proper destruction order - if (engine != nullptr) { - engine.reset(); - } - if (runtime != nullptr) { - runtime.reset(); - } - } -} - -TRITONSERVER_Error* -ModelState::CreateEngine( - int gpu_device, const int64_t dla_core_id, const std::string& model_path, - std::shared_ptr* engine) -{ - // TensorRT engine creation is not thread-safe, so multiple creations - // are serialized with a global lock. - static std::mutex global_context_mu; - std::lock_guard glock(global_context_mu); - - // Create shared engine for the device if haven't tried so. - auto device_pair = std::make_pair(gpu_device, dla_core_id); - auto eit = device_engines_.find(device_pair); - if (eit == device_engines_.end()) { - eit = device_engines_.emplace(device_pair, std::make_pair(nullptr, nullptr)) - .first; - } - - // We share the engine (for models that don't have dynamic shapes) and - // runtime across instances that have access to the same GPU/NVDLA. - if (eit->second.second == nullptr) { - auto cuerr = cudaSetDevice(gpu_device); - if (cuerr != cudaSuccess) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to set device for ") + Name() + ": " + - cudaGetErrorString(cuerr)) - .c_str()); - } - - const bool new_runtime = (eit->second.first == nullptr); - RETURN_IF_ERROR(LoadPlan( - model_path, dla_core_id, &eit->second.first, &eit->second.second, - &tensorrt_logger_)); - *engine = eit->second.second; - - if (new_runtime) { - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("Created new runtime on GPU device ") + - std::to_string(gpu_device) + ", NVDLA core " + - std::to_string(dla_core_id) + " for " + Name()) - .c_str()); - } - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("Created new engine on GPU device ") + - std::to_string(gpu_device) + ", NVDLA core " + - std::to_string(dla_core_id) + " for " + Name()) - .c_str()); - - if (IsEngineSharingEnabled()) { - // This logic runs at least once to validate whether the engine - // can be shared. - bool is_dynamic = false; - for (int idx = 0; idx < eit->second.second->getNbBindings(); idx++) { - auto dims = eit->second.second->getBindingDimensions(idx); - // Detect whether dynamic or not - if (ContainsWildcard(dims)) { - is_dynamic = true; - break; - } - } - if (is_dynamic) { - // Model with dynamic shapes can't share engine - DisableEngineSharing(); - } - } - - if (!IsEngineSharingEnabled()) { - // Set engine to 'nullptr' as hint, but keep runtime as it - // can be used repeatedly - if (eit->second.second != nullptr) { - eit->second.second.reset(); - } - } - } else { - *engine = eit->second.second; - } - - return nullptr; -} - -TRITONSERVER_Error* -ModelState::ValidateModelConfig() -{ - // We have the json DOM for the model configuration... 
- triton::common::TritonJson::WriteBuffer buffer; - RETURN_IF_ERROR(ModelConfig().PrettyWrite(&buffer)); - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("model configuration:\n") + buffer.Contents()).c_str()); - - return nullptr; // success -} - -TRITONSERVER_Error* -ModelState::ParseParameters() -{ - return nullptr; // success -} - - -TRITONSERVER_Error* -ModelState::AutoCompleteConfig() -{ - int current_device; - cudaError_t cuerr = cudaGetDevice(¤t_device); - if (cuerr != cudaSuccess) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to get current CUDA device ") + ": " + - cudaGetErrorString(cuerr)) - .c_str()); - } - - int64_t device_id; - triton::common::TritonJson::Value groups; - RETURN_IF_ERROR(ModelConfig().MemberAsArray("instance_group", &groups)); - triton::common::TritonJson::Value group; - RETURN_IF_ERROR(groups.IndexAsObject(0, &group)); - triton::common::TritonJson::Value gpus; - RETURN_IF_ERROR(group.MemberAsArray("gpus", &gpus)); - RETURN_IF_ERROR(gpus.IndexAsInt(0, &device_id)); - - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string( - "Setting the CUDA device to GPU" + std::to_string(device_id) + - " to auto-complete config for " + Name()) - .c_str())); - - cuerr = cudaSetDevice(device_id); - if (cuerr != cudaSuccess) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to set CUDA device to GPU ") + - std::to_string(device_id) + " : " + cudaGetErrorString(cuerr)) - .c_str()); - } - - std::string artifact_name; - RETURN_IF_ERROR( - ModelConfig().MemberAsString("default_model_filename", &artifact_name)); - - cudaDeviceProp cuprops; - cuerr = cudaGetDeviceProperties(&cuprops, device_id); - if (cuerr != cudaSuccess) { - throw BackendModelException(TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to get CUDA device properties for ") + name_ + - ": " + cudaGetErrorString(cuerr)) - .c_str())); - } - - const std::string cc = - std::to_string(cuprops.major) + "." + std::to_string(cuprops.minor); - - common::TritonJson::Value cc_names; - common::TritonJson::Value cc_name; - if ((ModelConfig().Find("cc_model_filenames", &cc_names)) && - (cc_names.Find(cc.c_str(), &cc_name))) { - RETURN_IF_ERROR(cc_name.AsString(&artifact_name)); - } - - // If the model configuration doesn't have an explicit model file specified - // then use the default name ("model.plan"). 
- std::string cc_model_filename = artifact_name; - if (cc_model_filename.empty()) { - cc_model_filename = "model.plan"; - } else { - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string( - "Using explicit serialized file '" + cc_model_filename + - "' to auto-complete config for " + Name()) - .c_str())); - } - - std::string model_path = JoinPath( - {RepositoryPath(), std::to_string(Version()), cc_model_filename}); - - RETURN_IF_ERROR(AutoCompleteConfigHelper(model_path)); - - cuerr = cudaSetDevice(current_device); - if (cuerr != cudaSuccess) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to revert CUDA device to GPU ") + - std::to_string(current_device) + " : " + cudaGetErrorString(cuerr)) - .c_str()); - } - - if (TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_VERBOSE)) { - triton::common::TritonJson::WriteBuffer buffer; - RETURN_IF_ERROR(ModelConfig().PrettyWrite(&buffer)); - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("post auto-complete:\n") + buffer.Contents()).c_str()); - } - - return nullptr; // success -} - -TRITONSERVER_Error* -ModelState::AutoCompleteConfigHelper(const std::string& model_path) -{ - std::shared_ptr runtime; - std::shared_ptr engine; - if (LoadPlan( - model_path, -1 /* dla_core_id */, &runtime, &engine, - &tensorrt_logger_) != nullptr) { - if (engine.get() != nullptr) { - engine.reset(); - } - if (runtime.get() != nullptr) { - runtime.reset(); - } - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string( - "unable to load plan file to auto complete config: " + model_path) - .c_str())); - } - - size_t input_cnt = 0; - size_t output_cnt = 0; - { - triton::common::TritonJson::Value inputs; - if (ModelConfig().Find("input", &inputs)) { - input_cnt = inputs.ArraySize(); - } - - triton::common::TritonJson::Value config_batch_inputs; - if (ModelConfig().Find("batch_input", &config_batch_inputs)) { - input_cnt += config_batch_inputs.ArraySize(); - } - - triton::common::TritonJson::Value outputs; - if (ModelConfig().Find("output", &outputs)) { - output_cnt = outputs.ArraySize(); - } - } - - int num_profile_bindings = 0; - int num_profiles = 0; - num_profiles = engine->getNbOptimizationProfiles(); - num_profile_bindings = engine->getNbBindings() / num_profiles; - - // For batching support, the number of dimensions specified in model config - // should be 1 less than the number of dimensions present in engine. - // Will use that as a hint to ascertain whether or not to enable batching. - // However, ragged batching is an exception to this rule. A tensor - // allowing ragged batch is in itself a batching hint. 
- bool config_batch_hint = false; - - // The number of IO Tensors with shape specification in config - int tensors_with_config_shape_cnt = 0; - - if ((input_cnt != 0) || (output_cnt != 0)) { - std::vector io_types{"input", "output"}; - std::map> allowed_tensors; - for (int i = 0; i < num_profile_bindings; ++i) { - if (engine->bindingIsInput(i)) { - allowed_tensors["input"].emplace(engine->getBindingName(i)); - } else { - allowed_tensors["output"].emplace(engine->getBindingName(i)); - } - } - - bool io_allow_ragged_batch = false; - for (const auto& io_type : io_types) { - triton::common::TritonJson::Value config_io; - RETURN_IF_ERROR(ModelConfig().MemberAsArray(io_type.c_str(), &config_io)); - for (size_t i = 0; - ((i < config_io.ArraySize()) && (!io_allow_ragged_batch)); i++) { - triton::common::TritonJson::Value io; - RETURN_IF_ERROR(config_io.IndexAsObject(i, &io)); - triton::common::TritonJson::Value allow_ragged_batch; - if (io.Find("allow_ragged_batch", &allow_ragged_batch)) { - RETURN_IF_ERROR(allow_ragged_batch.AsBool(&io_allow_ragged_batch)); - } - if (io_allow_ragged_batch) { - // Treat the presence of tensor allowing ragged batch as - // a hint for batching. - config_batch_hint = true; - } else { - common::TritonJson::Value model_config_dims; - common::TritonJson::Value reshape; - if (io.Find("reshape", &reshape)) { - RETURN_IF_ERROR(reshape.MemberAsArray("shape", &model_config_dims)); - } else { - RETURN_IF_ERROR(io.MemberAsArray("dims", &model_config_dims)); - } - if (model_config_dims.ArraySize() != 0) { - tensors_with_config_shape_cnt++; - } - std::string name; - RETURN_IF_ERROR(io.MemberAsString("name", &name)); - if (io_type.compare("input") == 0) { - RETURN_IF_ERROR( - CheckAllowedModelInput(io, allowed_tensors[io_type])); - } else { - RETURN_IF_ERROR( - CheckAllowedModelOutput(io, allowed_tensors[io_type])); - } - if (model_config_dims.ArraySize() != 0) { - RETURN_IF_ERROR(ExtractBatchHintFromIOConfig( - engine.get(), name, model_config_dims, &config_batch_hint)); - } - } - } - } - } - - int max_batch_size = 0; - bool has_implicit_batch_dim = false; - if (engine->hasImplicitBatchDimension()) { - // If engine has implicit batch dimension then retrieve the value and exit - max_batch_size = engine->getMaxBatchSize(); - has_implicit_batch_dim = (max_batch_size != 1) || (MaxBatchSize() != 0); - } else { - // Assuming the first dimension to be batch dimension, until and unless - // proven the batching is not supported. 
- RETURN_IF_ERROR( - GetMaxSupportedBatchSize(engine.get(), num_profiles, &max_batch_size)); - } - - if (config_batch_hint && max_batch_size == 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("autofill failed for model '") + Name() + - "': model tensor shape configuration hints for dynamic batching " - "but the underlying engine doesn't support batching.") - .c_str()); - } else if ( - (tensors_with_config_shape_cnt != 0) && (!config_batch_hint) && - (!has_implicit_batch_dim)) { - // if an explicit hint for non batching in config io - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - (std::string("The specified dimensions in model config for ") + Name() + - " hints that batching is unavailable") - .c_str()); - max_batch_size = 0; - } - - if (MaxBatchSize() == 0) { - triton::common::TritonJson::Value mbs_value; - ModelConfig().Find("max_batch_size", &mbs_value); - mbs_value.SetInt(max_batch_size); - SetMaxBatchSize(max_batch_size); - } else if (MaxBatchSize() > max_batch_size) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("autofill failed for model '") + Name() + - "': configuration specified max-batch " + - std::to_string(MaxBatchSize()) + - " but TensorRT engine only supports max-batch " + - std::to_string(max_batch_size)) - .c_str()); - } - - // Turn on dynamic batch scheduler if batch size is greater - // than 1 and there is no scheduler defined in the configuration. - if (max_batch_size > 1) { - triton::common::TritonJson::Value value; - bool found_sequence_batching = - ModelConfig().Find("sequence_batching", &value); - bool found_dynamic_batching = - ModelConfig().Find("dynamic_batching", &value); - if (!found_sequence_batching && !found_dynamic_batching) { - triton::common::TritonJson::Value dynamic_batching( - ModelConfig(), triton::common::TritonJson::ValueType::OBJECT); - RETURN_IF_ERROR( - ModelConfig().Add("dynamic_batching", std::move(dynamic_batching))); - } - } - - triton::common::TritonJson::Value ref_inputs( - ModelConfig(), triton::common::TritonJson::ValueType::ARRAY); - RETURN_IF_ERROR(GetRefIO(true /*is_input*/, engine.get(), &ref_inputs)); - triton::common::TritonJson::Value mutable_inputs( - ModelConfig(), triton::common::TritonJson::ValueType::ARRAY); - bool found_inputs = ModelConfig().Find("input", &mutable_inputs); - RETURN_IF_ERROR(FixIO(engine.get(), ref_inputs, &mutable_inputs)); - if (!found_inputs) { - RETURN_IF_ERROR(ModelConfig().Add("input", std::move(mutable_inputs))); - } - - triton::common::TritonJson::Value ref_outputs( - ModelConfig(), triton::common::TritonJson::ValueType::ARRAY); - RETURN_IF_ERROR(GetRefIO(false /*is_input*/, engine.get(), &ref_outputs)); - triton::common::TritonJson::Value mutable_outputs( - ModelConfig(), triton::common::TritonJson::ValueType::ARRAY); - bool found_outputs = ModelConfig().Find("output", &mutable_outputs); - RETURN_IF_ERROR(FixIO(engine.get(), ref_outputs, &mutable_outputs)); - if (!found_outputs) { - RETURN_IF_ERROR(ModelConfig().Add("output", std::move(mutable_outputs))); - } - - if (engine != nullptr) { - engine.reset(); - } - if (runtime != nullptr) { - runtime.reset(); - } - - return nullptr; -} - -TRITONSERVER_Error* -ModelState::GetMaxSupportedBatchSize( - nvinfer1::ICudaEngine* engine, const int num_profiles, int* max_batch_size) -{ - std::set profile_indices; - RETURN_IF_ERROR(GetProfileIndices(num_profiles, &profile_indices)); - - int running_max = 0; - for (const auto profile_index : profile_indices) { - int max_profile_batch_size; - 
RETURN_IF_ERROR( - GetProfileMaxBatchSize(engine, profile_index, &max_profile_batch_size)); - if (max_profile_batch_size > running_max) { - running_max = max_profile_batch_size; - } - } - - *max_batch_size = running_max; - - return nullptr; -} - -TRITONSERVER_Error* -ModelState::GetProfileIndices( - const int num_profiles, std::set* profile_indices) -{ - common::TritonJson::Value groups; - RETURN_IF_ERROR(ModelConfig().MemberAsArray("instance_group", &groups)); - for (size_t i = 0; i < groups.ArraySize(); i++) { - common::TritonJson::Value group; - RETURN_IF_ERROR(groups.IndexAsObject(i, &group)); - common::TritonJson::Value profiles; - RETURN_IF_ERROR(group.MemberAsArray("profile", &profiles)); - for (size_t j = 0; j < profiles.ArraySize(); j++) { - std::string profile; - RETURN_IF_ERROR(profiles.IndexAsString(j, &profile)); - int profile_idx; - RETURN_IF_ERROR(GetProfileIndex(profile, &profile_idx)); - if (profile_idx < 0 || profile_idx >= num_profiles) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to autofill for '") + Name() + - "', configuration specified invalid profile " + profile + - " . Number of profiles supported by TensorRT engine: " + - std::to_string(num_profiles)) - .c_str()); - } - profile_indices->insert(profile_idx); - } - } - - if (profile_indices->empty()) { - // If not specified then use the default. - profile_indices->insert(0); - } - - return nullptr; -} - -TRITONSERVER_Error* -ModelState::GetProfileMaxBatchSize( - nvinfer1::ICudaEngine* engine, int profile_index, int* max_batch_size) -{ - *max_batch_size = INT_MAX; - - int num_profiles = engine->getNbOptimizationProfiles(); - int num_profile_bindings = engine->getNbBindings() / num_profiles; - - // Visit all the bindings of the profile to capture the maximum and - // minimum batch size supported. 
- for (int binding_index = 0; binding_index < num_profile_bindings; - binding_index++) { - int effective_binding_index = - (profile_index * num_profile_bindings) + binding_index; - if (engine->bindingIsInput(effective_binding_index)) { - if (!engine->isShapeBinding(effective_binding_index)) { - nvinfer1::Dims max_shape = engine->getProfileDimensions( - effective_binding_index, profile_index, - nvinfer1::OptProfileSelector::kMAX); - if (*max_batch_size > max_shape.d[0]) { - *max_batch_size = max_shape.d[0]; - } - - } else { - const int32_t* max_shapes = engine->getProfileShapeValues( - effective_binding_index, profile_index, - nvinfer1::OptProfileSelector::kMAX); - if (*max_batch_size > *max_shapes) { - *max_batch_size = *max_shapes; - } - } - } - } - return nullptr; -} - -TRITONSERVER_Error* -ModelState::ExtractBatchHintFromIOConfig( - nvinfer1::ICudaEngine* engine, const std::string& tensor_name, - const common::TritonJson::Value& dims, bool* config_batch_hint) -{ - // look up corresponding io info from model - int num_profiles = engine->getNbOptimizationProfiles(); - int num_profile_bindings = engine->getNbBindings() / num_profiles; - - for (int binding_index = 0; binding_index < num_profile_bindings; - binding_index++) { - if (tensor_name == engine->getBindingName(binding_index)) { - nvinfer1::Dims shape = engine->getBindingDimensions(binding_index); - bool should_batch; - if (!engine->isShapeBinding(binding_index)) { - should_batch = (shape.nbDims == ((int32_t)dims.ArraySize() + 1)); - } else { - int64_t first_dim = 0; - RETURN_IF_ERROR(dims.IndexAsInt(0, &first_dim)); - should_batch = (shape.d[0] == (first_dim + 1)); - } - if (should_batch) { - *config_batch_hint = true; - } - if (*config_batch_hint && (!should_batch)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to autofill for '") + Name() + - "', model tensor configurations are contradicting " + - "each other in terms of whether batching is supported") - .c_str()); - } - } - } - return nullptr; -} - -TRITONSERVER_Error* -ModelState::InstanceHasKindGPU(bool* has_instance_kind_gpu) -{ - *has_instance_kind_gpu = false; - triton::common::TritonJson::Value instance_groups( - ModelConfig(), triton::common::TritonJson::ValueType::ARRAY); - ModelConfig().Find("instance_group", &instance_groups); - - if (instance_groups.ArraySize() > 0) { - // TensorRT backend does not support KIND_CPU at all - // so only check the first instance group kind. 
- triton::common::TritonJson::Value group; - RETURN_IF_ERROR(instance_groups.IndexAsObject(0, &group)); - - triton::common::TritonJson::Value kind; - group.Find("kind", &kind); - std::string kind_str; - RETURN_IF_ERROR(kind.AsString(&kind_str)); - if (kind_str == "KIND_GPU") { - *has_instance_kind_gpu = true; - } - } - - return nullptr; // success -} - - -TRITONSERVER_Error* -ModelState::GetRefIO( - const bool is_input, nvinfer1::ICudaEngine* engine, - triton::common::TritonJson::Value* ref_io) -{ - int num_profiles = engine->getNbOptimizationProfiles(); - int num_profile_bindings = engine->getNbBindings() / num_profiles; - - for (int i = 0; i < num_profile_bindings; ++i) { - nvinfer1::Dims dims = engine->getBindingDimensions(i); - bool is_shape_binding = engine->isShapeBinding(i); - if ((is_input && (!engine->bindingIsInput(i))) || - ((!is_input) && (engine->bindingIsInput(i)))) { - continue; - } - triton::common::TritonJson::Value io( - ModelConfig(), triton::common::TritonJson::ValueType::OBJECT); - std::string input_name{engine->getBindingName(i)}; - RETURN_IF_ERROR( - io.AddString("name", input_name.substr(0, input_name.find(" ")))); - RETURN_IF_ERROR(io.AddString( - "data_type", - ConvertTrtTypeToConfigDataType(engine->getBindingDataType(i)))); - RETURN_IF_ERROR(InitIODims(engine, dims, is_shape_binding, &io)); - RETURN_IF_ERROR(io.AddBool("is_shape_tensor", is_shape_binding)); - - RETURN_IF_ERROR(ref_io->Append(std::move(io))); - } - - return nullptr; -} - -TRITONSERVER_Error* -ModelState::InitIODims( - nvinfer1::ICudaEngine* engine, nvinfer1::Dims& dims, bool is_shape_binding, - triton::common::TritonJson::Value* io) -{ - bool skip_first = - (MaxBatchSize() != 0) && (!engine->hasImplicitBatchDimension()); - triton::common::TritonJson::Value config_dims( - ModelConfig(), triton::common::TritonJson::ValueType::ARRAY); - if (!is_shape_binding) { - for (int didx = (skip_first ? 1 : 0); didx < dims.nbDims; ++didx) { - RETURN_IF_ERROR(config_dims.AppendInt(dims.d[didx])); - } - // If tensor dims are empty then must use a reshape for the - // tensor, since 'dims' is not allowed to be empty. 
- if (config_dims.ArraySize() == 0) { - RETURN_IF_ERROR(config_dims.AppendInt(1)); - triton::common::TritonJson::Value reshape( - ModelConfig(), triton::common::TritonJson::ValueType::OBJECT); - triton::common::TritonJson::Value reshape_dims( - ModelConfig(), triton::common::TritonJson::ValueType::ARRAY); - RETURN_IF_ERROR(reshape.Add("shape", std::move(reshape_dims))); - RETURN_IF_ERROR(io->Add("reshape", std::move(reshape))); - } - } else { - if (dims.nbDims != 0) { - if (skip_first) { - RETURN_IF_ERROR(config_dims.AppendInt(dims.d[0] - 1)); - } else { - RETURN_IF_ERROR(config_dims.AppendInt(dims.d[0])); - } - } - } - RETURN_IF_ERROR(io->Add("dims", std::move(config_dims))); - - return nullptr; -} - -TRITONSERVER_Error* -ModelState::FixIO( - nvinfer1::ICudaEngine* engine, - triton::common::TritonJson::Value& reference_ios, - triton::common::TritonJson::Value* mutable_ios) -{ - if (mutable_ios->ArraySize() == 0) { - RETURN_IF_ERROR(mutable_ios->Swap(reference_ios)); - } else { - for (size_t i = 0; i < mutable_ios->ArraySize(); i++) { - triton::common::TritonJson::Value mutable_io; - RETURN_IF_ERROR(mutable_ios->IndexAsObject(i, &mutable_io)); - std::string io_name; - RETURN_IF_ERROR(mutable_io.MemberAsString("name", &io_name)); - for (size_t j = 0; j < reference_ios.ArraySize(); j++) { - triton::common::TritonJson::Value io_ref; - RETURN_IF_ERROR(reference_ios.IndexAsObject(j, &io_ref)); - std::string ref_name; - RETURN_IF_ERROR(io_ref.MemberAsString("name", &ref_name)); - if (io_name.compare(ref_name) == 0) { - // only set type and shape if they are not set - common::TritonJson::Value data_type; - if (mutable_io.Find("data_type", &data_type)) { - std::string dt_str; - RETURN_IF_ERROR(data_type.AsString(&dt_str)); - if (dt_str.empty() || (dt_str.compare("TYPE_INVALID") == 0)) { - std::string ref_data_type; - RETURN_IF_ERROR( - io_ref.MemberAsString("data_type", &ref_data_type)); - RETURN_IF_ERROR(data_type.SetString(ref_data_type)); - } - } else { - std::string ref_data_type; - RETURN_IF_ERROR(io_ref.MemberAsString("data_type", &ref_data_type)); - RETURN_IF_ERROR(mutable_io.AddString("data_type", ref_data_type)); - } - - common::TritonJson::Value dims; - if (mutable_io.Find("dims", &dims)) { - if (dims.ArraySize() == 0) { - common::TritonJson::Value ref_dims; - RETURN_IF_ERROR(io_ref.MemberAsArray("dims", &ref_dims)); - RETURN_IF_ERROR(dims.Swap(ref_dims)); - common::TritonJson::Value reshape; - if (io_ref.Find("reshape", &reshape)) { - RETURN_IF_ERROR(mutable_io.Add("reshape", std::move(reshape))); - } - } - } else { - common::TritonJson::Value ref_dims; - RETURN_IF_ERROR(io_ref.MemberAsArray("dims", &ref_dims)); - RETURN_IF_ERROR(mutable_io.Add("dims", std::move(ref_dims))); - common::TritonJson::Value reshape; - if (io_ref.Find("reshape", &reshape)) { - RETURN_IF_ERROR(mutable_io.Add("reshape", std::move(reshape))); - } - } - - // Check if the IO is a shape tensor. 
- bool is_shape_tensor = false; - int io_index = engine->getBindingIndex(io_name.c_str()); - if (io_index == -1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("binding for '") + io_name + - "' not found in the model.") - .c_str()); - } - is_shape_tensor = engine->isShapeBinding(io_index); - - common::TritonJson::Value shape_tensor; - if (mutable_io.Find("is_shape_tensor", &shape_tensor)) { - bool shape_tensor_val = false; - RETURN_IF_ERROR(shape_tensor.AsBool(&shape_tensor_val)); - if (shape_tensor_val && (!is_shape_tensor)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("'") + io_name + - "' is incorrectly specified as a shape tensor.") - .c_str()); - } else if (!shape_tensor_val && is_shape_tensor) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("'") + io_name + - "' is incorrectly specified as an execution tensor.") - .c_str()); - } - } else { - RETURN_IF_ERROR( - mutable_io.AddBool("is_shape_tensor", is_shape_tensor)); - } - break; - } - } - } - } - - return nullptr; -} - -}}} // namespace triton::backend::tensorrt diff --git a/src_trt8/model_state.h b/src_trt8/model_state.h deleted file mode 100644 index f3fa646..0000000 --- a/src_trt8/model_state.h +++ /dev/null @@ -1,146 +0,0 @@ -// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#pragma once - -#include - -#include "logging.h" -#include "semaphore.h" -#include "tensorrt_model.h" -#include "tensorrt_model_instance.h" - -namespace triton { namespace backend { namespace tensorrt { - -class ModelInstanceState; -// Helper class to determine the ModelInstanceState to be used for current -// execution and the semaphore to be blocked for next execution. 
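The arbitrator comment above points at the core throttling idea: ExecutionState() returns not only the instance to run on but also the semaphore the issue thread must acquire before starting the next execution, which the completion thread later releases once the instance's buffers are free again. The deleted semaphore.h is not shown in this hunk, so the following is only an illustrative counting semaphore demonstrating that Acquire/Release pattern, not the removed implementation:

#include <condition_variable>
#include <mutex>

// Illustrative counting semaphore; demonstrates the pattern only.
class CountingSemaphore {
 public:
  explicit CountingSemaphore(int count) : count_(count) {}

  // Issue thread: block until a previous execution has released its resources.
  void Acquire()
  {
    std::unique_lock<std::mutex> lk(mu_);
    cv_.wait(lk, [this] { return count_ > 0; });
    --count_;
  }

  // Completion thread: signal that buffers/streams can be reused.
  void Release()
  {
    {
      std::lock_guard<std::mutex> lk(mu_);
      ++count_;
    }
    cv_.notify_one();
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  int count_;
};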
-class ExecutionArbitrator { - public: - virtual void RegisterInstance( - const int device_id, ModelInstanceState* instance) = 0; - virtual std::pair ExecutionState( - const int device_id, ModelInstanceState* instance) = 0; - virtual ~ExecutionArbitrator(){}; -}; - -// -// ModelState -// -// State associated with a model that is using this backend. An object -// of this class is created and associated with each -// TRITONBACKEND_Model. -// -class ModelState : public TensorRTModel { - public: - static TRITONSERVER_Error* Create( - TRITONBACKEND_Model* triton_model, ModelState** state); - virtual ~ModelState(); - - TRITONSERVER_Error* CreateEngine( - int gpu_device, const int64_t dla_core_id, const std::string& model_path, - std::shared_ptr* engine); - - void DisableEngineSharing() { engine_sharing_ = false; } - bool IsEngineSharingEnabled() { return engine_sharing_; } - - static void EnableVersionCompatibility() { is_version_compatible_ = true; } - static bool isVersionCompatible() { return is_version_compatible_; } - - // Register the instance and its associated device ID to execution arbitrator. - void RegisterInstance(const int device_id, ModelInstanceState* instance) - { - execution_arbitrator_->RegisterInstance(device_id, instance); - } - - // Query the execution arbitrator to return the instance for the execution and - // the semaphore to check whether the next execution should be initiated. - // 'device_id', 'instance' are the metadata associated with the - // TRITONBACKEND_ModelInstance. - std::pair ExecutionState( - const int device_id, ModelInstanceState* instance) - { - return execution_arbitrator_->ExecutionState(device_id, instance); - } - - TensorRTLogger& GetTensorRTLogger() { return tensorrt_logger_; } - - private: - ModelState(TRITONBACKEND_Model* triton_model); - - // Auto-complete the model configuration - TRITONSERVER_Error* AutoCompleteConfig(); - TRITONSERVER_Error* AutoCompleteConfigHelper(const std::string& model_path); - TRITONSERVER_Error* GetMaxSupportedBatchSize( - nvinfer1::ICudaEngine* engine, const int num_profiles, - int* max_batch_size); - TRITONSERVER_Error* GetProfileIndices( - const int num_profiles, std::set* profile_indices); - TRITONSERVER_Error* GetProfileMaxBatchSize( - nvinfer1::ICudaEngine* engine, int profile_index, int* max_batch_size); - TRITONSERVER_Error* ExtractBatchHintFromIOConfig( - nvinfer1::ICudaEngine* engine, const std::string& tensor_name, - const common::TritonJson::Value& dims, bool* config_batch_hint); - TRITONSERVER_Error* InstanceHasKindGPU(bool* has_instance_kind_gpu); - TRITONSERVER_Error* GetRefIO( - const bool is_input, nvinfer1::ICudaEngine* engine, - triton::common::TritonJson::Value* ref_io); - TRITONSERVER_Error* InitIODims( - nvinfer1::ICudaEngine* engine, nvinfer1::Dims& dims, - bool is_shape_binding, triton::common::TritonJson::Value* io); - TRITONSERVER_Error* FixIO( - nvinfer1::ICudaEngine* engine, - triton::common::TritonJson::Value& reference_ios, - triton::common::TritonJson::Value* mutable_ios); - - - // Validate that model configuration is supported by this backend. - TRITONSERVER_Error* ValidateModelConfig(); - - // Parses the parameters in config - TRITONSERVER_Error* ParseParameters(); - - // TensorRT logger for this model - TensorRTLogger tensorrt_logger_; - - // CUDA engine shared across all model instances using the same (or no) DLA - // core on same GPU. The first element in the key pair is the GPU ID, the - // second is the DLA core ID. 
- std::map< - std::pair, std::pair< - std::shared_ptr, - std::shared_ptr>> - device_engines_; - bool engine_sharing_; - - std::unique_ptr execution_arbitrator_; - - // Whether the backend should support version-compatible TensorRT models. - static inline bool is_version_compatible_{false}; -}; - - -}}} // namespace triton::backend::tensorrt diff --git a/src_trt8/output_allocator.cc b/src_trt8/output_allocator.cc deleted file mode 100644 index 9d3a077..0000000 --- a/src_trt8/output_allocator.cc +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include "output_allocator.h" - -namespace triton { namespace backend { namespace tensorrt { - -void* -OutputAllocator::reallocateOutput( - char const* tensor_name, void* current_memory, uint64_t size, - uint64_t alignment) noexcept -{ - // When the requested output size is larger than the current size, - // free the allocated memory and attempt to allocate the larger size - // for the underlying output buffer. - if (size > output_size_) { - cudaFree(output_ptr_); - output_ptr_ = nullptr; - output_size_ = 0; - if (zero_copy_support_) { - cudaHostAlloc(&output_ptr_, size, cudaHostAllocMapped); - // If zero copy support is enabled, need to set the buffer to the device - // pointer. - void* device_buffer; - auto err = cudaHostGetDevicePointer(&device_buffer, &output_ptr_, 0); - if (err == cudaSuccess) { - output_ptr_ = device_buffer; - } - } else { - cudaMalloc(&output_ptr_, size); - } - - // If the memory allocation fails, output_ptr_=nullptr and engine - // gracefully fails. 
- if (output_ptr_ != nullptr) { - output_size_ = size; - } - } - return output_ptr_; -} - -void -OutputAllocator::notifyShape( - char const* tensor_name, nvinfer1::Dims const& dims) noexcept -{ - output_dims_ = dims; -} - -OutputAllocator::~OutputAllocator() -{ - cudaFree(output_ptr_); -} - -}}} // namespace triton::backend::tensorrt diff --git a/src_trt8/output_allocator.h b/src_trt8/output_allocator.h deleted file mode 100644 index 6a3fe9e..0000000 --- a/src_trt8/output_allocator.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#pragma once - -#include -#include - -namespace triton { namespace backend { namespace tensorrt { - -class OutputAllocator : public nvinfer1::IOutputAllocator { - // This class extends nvinfer1::IOutputAllocator and its functions - // reallocateOutput and notifyShape. For consistency, all of its - // functions use camel case. 
- public: - OutputAllocator(bool zero_copy_support) - : zero_copy_support_(zero_copy_support) - { - } - // Allocates output dimensions - void* reallocateOutput( - char const* tensor_name, void* current_memory, uint64_t size, - uint64_t alignment) noexcept override; - - // Updates output dimensions - void notifyShape( - char const* tensor_name, nvinfer1::Dims const& dims) noexcept override; - - void* getBuffer() { return output_ptr_; }; - void** getBufferAddr() { return &output_ptr_; }; - - ~OutputAllocator() override; - - private: - // Saved dimensions of the output tensor - nvinfer1::Dims output_dims_{}; - - // Pointer to output, nullptr if memory could not be allocated - void* output_ptr_{nullptr}; - - // Size of allocation pointed to by output - uint64_t output_size_{0}; - - // Boolean flag indicating if zero copy support is enabled - bool zero_copy_support_{false}; -}; - -}}} // namespace triton::backend::tensorrt diff --git a/src_trt8/semaphore.h b/src_trt8/semaphore.h deleted file mode 100644 index 89c99b9..0000000 --- a/src_trt8/semaphore.h +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#pragma once - -#include -#include - -namespace triton { namespace backend { namespace tensorrt { - -class Semaphore { - public: - explicit Semaphore(const int count) : count_(count) {} - - void Release() - { - std::unique_lock lck(mtx_); - count_++; - cv_.notify_one(); - } - - void Acquire() - { - std::unique_lock lck(mtx_); - cv_.wait(lck, [this]() { return (count_ > 0); }); - count_--; - } - - private: - int count_; - - std::mutex mtx_; - std::condition_variable cv_; -}; - -}}} // namespace triton::backend::tensorrt diff --git a/src_trt8/shared_library.cc b/src_trt8/shared_library.cc deleted file mode 100644 index d4c762c..0000000 --- a/src_trt8/shared_library.cc +++ /dev/null @@ -1,177 +0,0 @@ -// Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
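// Illustrative sketch, not part of the deleted sources: the Semaphore above is
// a plain counting semaphore (Acquire() blocks until the count is positive and
// then decrements it; Release() increments it and wakes one waiter). The
// backend uses it to hold back the next batch until a model instance is free
// again; see the comments at the end of TRITONBACKEND_ModelInstanceExecute
// further below. A minimal, hypothetical usage pattern assuming the class
// defined above:
#include <thread>
#include "semaphore.h"
void SemaphoreSketch()
{
  triton::backend::tensorrt::Semaphore idle_instances(0);  // nothing idle yet
  std::thread worker([&idle_instances]() {
    // ... asynchronous TensorRT execution would complete here ...
    idle_instances.Release();  // an instance became free again
  });
  idle_instances.Acquire();  // block until an instance is free for the next batch
  worker.join();
}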
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include "shared_library.h" - -#include "filesystem.h" -#include "logging.h" -#include "mutex" - -/// FIXME: Duplication of core/src/shared_library.cc -/// Separate shared_library to common library and delete this - -#ifdef _WIN32 -// suppress the min and max definitions in Windef.h. 
-#define NOMINMAX -#include -#else -#include -#endif - -namespace triton { namespace backend { namespace tensorrt { - -TRITONSERVER_Error* -SetLibraryDirectory(const std::string& path) -{ -#ifdef _WIN32 - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("SetLibraryDirectory: path = ") + path).c_str()); - if (!SetDllDirectory(path.c_str())) { - LPSTR err_buffer = nullptr; - size_t size = FormatMessageA( - FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | - FORMAT_MESSAGE_IGNORE_INSERTS, - NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), - (LPSTR)&err_buffer, 0, NULL); - std::string errstr(err_buffer, size); - LocalFree(err_buffer); - - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_NOT_FOUND, - ("unable to set dll path " + path + ": " + errstr).c_str()); - } -#endif - - return nullptr; // success -} - -TRITONSERVER_Error* -ResetLibraryDirectory() -{ -#ifdef _WIN32 - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "ResetLibraryDirectory"); - if (!SetDllDirectory(NULL)) { - LPSTR err_buffer = nullptr; - size_t size = FormatMessageA( - FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | - FORMAT_MESSAGE_IGNORE_INSERTS, - NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), - (LPSTR)&err_buffer, 0, NULL); - std::string errstr(err_buffer, size); - LocalFree(err_buffer); - - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_NOT_FOUND, - ("unable to reset dll path: " + errstr).c_str()); - } -#endif - - return nullptr; // success -} - -TRITONSERVER_Error* -OpenLibraryHandle(const std::string& path, void** handle) -{ - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("OpenLibraryHandle: ") + path).c_str()); - -#ifdef _WIN32 - // Need to put shared library directory on the DLL path so that any - // dependencies of the shared library are found - const std::string library_dir = DirName(path); - RETURN_IF_ERROR(SetLibraryDirectory(library_dir)); - - // HMODULE is typedef of void* - // https://docs.microsoft.com/en-us/windows/win32/winprog/windows-data-types - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("OpenLibraryHandle: path = ") + path).c_str()); - *handle = LoadLibrary(path.c_str()); - - // Remove the dll path added above... do this unconditionally before - // check for failure in dll load. 
- RETURN_IF_ERROR(ResetLibraryDirectory()); - - if (*handle == nullptr) { - LPSTR err_buffer = nullptr; - size_t size = FormatMessageA( - FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | - FORMAT_MESSAGE_IGNORE_INSERTS, - NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), - (LPSTR)&err_buffer, 0, NULL); - std::string errstr(err_buffer, size); - LocalFree(err_buffer); - - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_NOT_FOUND, - ("unable to load shared library: " + errstr).c_str()); - } -#else - *handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL); - if (*handle == nullptr) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_NOT_FOUND, - ("unable to load shared library: " + std::string(dlerror())).c_str()); - } -#endif - - return nullptr; // success -} - -TRITONSERVER_Error* -CloseLibraryHandle(void* handle) -{ - if (handle != nullptr) { -#ifdef _WIN32 - if (FreeLibrary((HMODULE)handle) == 0) { - LPSTR err_buffer = nullptr; - size_t size = FormatMessageA( - FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | - FORMAT_MESSAGE_IGNORE_INSERTS, - NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), - (LPSTR)&err_buffer, 0, NULL); - std::string errstr(err_buffer, size); - LocalFree(err_buffer); - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("unable to unload shared library: " + errstr).c_str()); - } -#else - if (dlclose(handle) != 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("unable to unload shared library: " + std::string(dlerror())) - .c_str()); - } -#endif - } - - return nullptr; // success -} - -}}} // namespace triton::backend::tensorrt diff --git a/src_trt8/shared_library.h b/src_trt8/shared_library.h deleted file mode 100644 index 1555cf5..0000000 --- a/src_trt8/shared_library.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
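// Illustrative sketch, not part of the deleted sources: OpenLibraryHandle()
// above wraps dlopen()/LoadLibrary() (adjusting the DLL search path on
// Windows), and TRITONBACKEND_Initialize further below uses it to pre-load the
// TensorRT plugin libraries listed in the backend's "plugins" option. A
// hypothetical call, with "libcustomplugin.so" as a placeholder path; the
// backend itself keeps such handles open for the life of the process:
#include "shared_library.h"
#include "triton/backend/backend_common.h"
TRITONSERVER_Error* LoadPluginLibrarySketch()
{
  void* handle = nullptr;
  RETURN_IF_ERROR(triton::backend::tensorrt::OpenLibraryHandle(
      "libcustomplugin.so", &handle));
  // Plugin creators typically self-register with TensorRT's plugin registry
  // when the library is loaded, so holding the handle is all that is needed.
  return nullptr;  // success; CloseLibraryHandle() is intentionally not called
}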
- -#include - -#include "triton/backend/backend_common.h" - -/// FIXME: Duplication of core/src/shared_library.h -/// Separate shared_library to common library and delete this - -namespace triton { namespace backend { namespace tensorrt { - -TRITONSERVER_Error* SetLibraryDirectory(); -TRITONSERVER_Error* ResetLibraryDirectory(); -TRITONSERVER_Error* OpenLibraryHandle(const std::string& path, void** handle); -TRITONSERVER_Error* CloseLibraryHandle(void* handle); - -}}} // namespace triton::backend::tensorrt diff --git a/src_trt8/tensorrt.cc b/src_trt8/tensorrt.cc deleted file mode 100644 index 2c2d2a4..0000000 --- a/src_trt8/tensorrt.cc +++ /dev/null @@ -1,418 +0,0 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "instance_state.h" -#include "logging.h" -#include "model_state.h" -#include "shared_library.h" -#include "triton/backend/backend_common.h" -#include "triton/backend/device_memory_tracker.h" - -// -// TensorRT Backend that implements the TRITONBACKEND API. -// -namespace triton { namespace backend { namespace tensorrt { - -///////////// - -extern "C" { - -// Implementing TRITONBACKEND_Initialize is optional. The backend -// should initialize any global state that is intended to be shared -// across all models and model instances that use the backend. -TRITONBACKEND_ISPEC TRITONSERVER_Error* -TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) -{ - const char* cname; - RETURN_IF_ERROR(TRITONBACKEND_BackendName(backend, &cname)); - std::string name(cname); - - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("TRITONBACKEND_Initialize: ") + name).c_str()); - - // We should check the backend API version that Triton supports - // vs. what this backend was compiled against. 
- uint32_t api_version_major, api_version_minor; - RETURN_IF_ERROR( - TRITONBACKEND_ApiVersion(&api_version_major, &api_version_minor)); - - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Triton TRITONBACKEND API version: ") + - std::to_string(api_version_major) + "." + - std::to_string(api_version_minor)) - .c_str()); - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("'") + name + "' TRITONBACKEND API version: " + - std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "." + - std::to_string(TRITONBACKEND_API_VERSION_MINOR)) - .c_str()); - - if ((api_version_major != TRITONBACKEND_API_VERSION_MAJOR) || - (api_version_minor < TRITONBACKEND_API_VERSION_MINOR)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_UNSUPPORTED, - "triton backend API version does not support this backend"); - } - - // The backend configuration may contain information needed by the - // backend, such as command-line arguments. - TRITONSERVER_Message* backend_config_message; - RETURN_IF_ERROR( - TRITONBACKEND_BackendConfig(backend, &backend_config_message)); - - const char* buffer; - size_t byte_size; - RETURN_IF_ERROR(TRITONSERVER_MessageSerializeToJson( - backend_config_message, &buffer, &byte_size)); - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("backend configuration:\n") + buffer).c_str()); - - triton::common::TritonJson::Value backend_config; - if (byte_size != 0) { - RETURN_IF_ERROR(backend_config.Parse(buffer, byte_size)); - } - - // Default execution policy, may be overridden by backend config - auto execution_policy = TRITONBACKEND_EXECUTION_DEVICE_BLOCKING; - - std::unique_ptr lconfig(new BackendConfiguration()); - // Check if device memory tracker is explicitly enabled - if (DeviceMemoryTracker::EnableFromBackendConfig(backend_config)) { - lconfig->enable_memory_tracker_ = DeviceMemoryTracker::Init(); - } - - triton::common::TritonJson::Value cmdline; - if (backend_config.Find("cmdline", &cmdline)) { - triton::common::TritonJson::Value value; - std::string value_str; - - if (cmdline.Find("coalesce-request-input", &value)) { - RETURN_IF_ERROR(value.AsString(&value_str)); - RETURN_IF_ERROR( - ParseBoolValue(value_str, &lconfig->coalesce_request_input_)); - } - - if (cmdline.Find("plugins", &value)) { - RETURN_IF_ERROR(value.AsString(&value_str)); - size_t pos = 0; - std::string plugin; - // Load individual plugins - while (value_str.length() > 0) { - pos = value_str.find(";"); - plugin = value_str.substr(0, pos); - void* handle = nullptr; - auto err = OpenLibraryHandle(plugin, &handle); - if (err != nullptr) { - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, TRITONSERVER_ErrorMessage(err)); - TRITONSERVER_ErrorDelete(err); - err = nullptr; - } - - if (pos != std::string::npos) { - pos++; - } - value_str.erase(0, pos); - } - } - - // Set the execution policy according to backend config, default will be - // DEVICE_BLOCKING - if (cmdline.Find("execution-policy", &value)) { - RETURN_IF_ERROR(value.AsString(&value_str)); - if (value_str == "DEVICE_BLOCKING") { - execution_policy = TRITONBACKEND_EXECUTION_DEVICE_BLOCKING; - } else if (value_str == "BLOCKING") { - execution_policy = TRITONBACKEND_EXECUTION_BLOCKING; - } - } - - if (cmdline.Find("version-compatible", &value)) { - bool is_version_compatible{false}; - RETURN_IF_ERROR(value.AsString(&value_str)); - RETURN_IF_ERROR(ParseBoolValue(value_str, &is_version_compatible)); - if (is_version_compatible) { - ModelState::EnableVersionCompatibility(); - } - } - } - - RETURN_IF_ERROR( - TRITONBACKEND_BackendSetExecutionPolicy(backend, 
execution_policy)); - - // Register all the default and custom plugins that come with TensorRT - bool success = true; - std::once_flag onceFlag; - { - std::call_once(onceFlag, [&success] { - TensorRTLogger tensorrt_logger; - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "Registering TensorRT Plugins"); - success = initLibNvInferPlugins(&tensorrt_logger, ""); - }); - } - if (!success) { - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Failed to register TensorRT Plugins"); - } - - RETURN_IF_ERROR(TRITONBACKEND_BackendSetState( - backend, reinterpret_cast(lconfig.get()))); - - lconfig.release(); - return nullptr; // success -} - -// Implementing TRITONBACKEND_Finalize is optional unless state is set -// using TRITONBACKEND_BackendSetState. The backend must free this -// state and perform any other global cleanup. -TRITONBACKEND_ISPEC TRITONSERVER_Error* -TRITONBACKEND_Finalize(TRITONBACKEND_Backend* backend) -{ - if (BackendConfiguration::RetrieveFrom(backend).enable_memory_tracker_) { - DeviceMemoryTracker::Fini(); - } - - void* vstate; - RETURN_IF_ERROR(TRITONBACKEND_BackendState(backend, &vstate)); - delete reinterpret_cast(vstate); - return nullptr; // success -} - -// Implementing TRITONBACKEND_ModelInitialize is optional. The backend -// should initialize any state that is intended to be shared across -// all instances of the model. -TRITONBACKEND_ISPEC TRITONSERVER_Error* -TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model) -{ - const char* cname; - RETURN_IF_ERROR(TRITONBACKEND_ModelName(model, &cname)); - std::string name(cname); - - uint64_t version; - RETURN_IF_ERROR(TRITONBACKEND_ModelVersion(model, &version)); - - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("TRITONBACKEND_ModelInitialize: ") + name + " (version " + - std::to_string(version) + ")") - .c_str()); - - // Utilizing DeviceMemoryTracker behavior that function calls with - // 'nullptr' for usage will be no-ops. - std::unique_ptr lusage; - if (BackendConfiguration::RetrieveFrom(model).enable_memory_tracker_) { - lusage.reset(new DeviceMemoryTracker::MemoryUsage()); - DeviceMemoryTracker::TrackThreadMemoryUsage(lusage.get()); - } - - // With each model we create a ModelState object and associate it - // with the TRITONBACKEND_Model. - ModelState* model_state = nullptr; - RETURN_IF_ERROR(ModelState::Create(model, &model_state)); - RETURN_IF_ERROR( - TRITONBACKEND_ModelSetState(model, reinterpret_cast(model_state))); - - if (lusage) { - DeviceMemoryTracker::UntrackThreadMemoryUsage(lusage.get()); - TRITONSERVER_BufferAttributes** ba_array; - uint32_t ba_len = 0; - RETURN_IF_ERROR(lusage->SerializeToBufferAttributes(&ba_array, &ba_len)); - RETURN_IF_ERROR( - TRITONBACKEND_ModelReportMemoryUsage(model, ba_array, ba_len)); - } - - return nullptr; // success -} - -// Implementing TRITONBACKEND_ModelFinalize is optional unless state -// is set using TRITONBACKEND_ModelSetState. The backend must free -// this state and perform any other cleanup. -TRITONBACKEND_ISPEC TRITONSERVER_Error* -TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model) -{ - void* vstate; - RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate)); - ModelState* model_state = reinterpret_cast(vstate); - - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelFinalize: delete model state"); - - delete model_state; - - return nullptr; // success -} - -// Implementing TRITONBACKEND_ModelInstanceInitialize is optional. The -// backend should initialize any state that is required for a model -// instance. 
-TRITONBACKEND_ISPEC TRITONSERVER_Error* -TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) -{ - const char* cname; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceName(instance, &cname)); - std::string name(cname); - - int32_t device_id; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceDeviceId(instance, &device_id)); - TRITONSERVER_InstanceGroupKind kind; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceKind(instance, &kind)); - - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("TRITONBACKEND_ModelInstanceInitialize: ") + name + " (" + - TRITONSERVER_InstanceGroupKindString(kind) + " device " + - std::to_string(device_id) + ")") - .c_str()); - - // The instance can access the corresponding model as well... here - // we get the model and from that get the model's state. - TRITONBACKEND_Model* model; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model)); - - void* vmodelstate; - RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate)); - ModelState* model_state = reinterpret_cast(vmodelstate); - - // Utilizing DeviceMemoryTracker behavior that function calls with - // 'nullptr' for usage will be no-ops. - std::unique_ptr lusage; - if (BackendConfiguration::RetrieveFrom(instance).enable_memory_tracker_) { - lusage.reset(new DeviceMemoryTracker::MemoryUsage()); - DeviceMemoryTracker::TrackThreadMemoryUsage(lusage.get()); - } - - - // With each instance we create a ModelInstanceState object and - // associate it with the TRITONBACKEND_ModelInstance. - ModelInstanceState* instance_state; - RETURN_IF_ERROR( - ModelInstanceState::Create(model_state, instance, &instance_state)); - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState( - instance, reinterpret_cast(instance_state))); - - if (lusage) { - DeviceMemoryTracker::UntrackThreadMemoryUsage(lusage.get()); - TRITONSERVER_BufferAttributes** ba_array; - uint32_t ba_len = 0; - RETURN_IF_ERROR(lusage->SerializeToBufferAttributes(&ba_array, &ba_len)); - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceReportMemoryUsage( - instance, ba_array, ba_len)); - } - - return nullptr; // success -} - -// Implementing TRITONBACKEND_ModelInstanceFinalize is optional unless -// state is set using TRITONBACKEND_ModelInstanceSetState. The backend -// must free this state and perform any other cleanup. -TRITONBACKEND_ISPEC TRITONSERVER_Error* -TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) -{ - void* vstate; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate)); - ModelInstanceState* instance_state = - reinterpret_cast(vstate); - - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - "TRITONBACKEND_ModelInstanceFinalize: delete instance state"); - - delete instance_state; - - return nullptr; // success -} - -// Implementing TRITONBACKEND_ModelInstanceExecute is required. -TRITONBACKEND_ISPEC TRITONSERVER_Error* -TRITONBACKEND_ModelInstanceExecute( - TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, - const uint32_t request_count) -{ - // Triton will not call this function simultaneously for the same - // 'instance'. But since this backend could be used by multiple - // instances from multiple models the implementation needs to handle - // multiple calls to this function at the same time (with different - // 'instance' objects). Suggested practice for this is to use only - // function-local and model-instance-specific state (obtained from - // 'instance'), which is what we do here. 
- ModelInstanceState* instance_state; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState( - instance, reinterpret_cast(&instance_state))); - ModelState* model_state = instance_state->StateForModel(); - - // For TensorRT backend, the executing instance may not closely tie to - // TRITONBACKEND_ModelInstance, the instance will be assigned based on - // execution policy. - int32_t device_id; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceDeviceId(instance, &device_id)); - auto instance_semaphore = - model_state->ExecutionState(device_id, instance_state); - auto curr_instance = instance_semaphore.first; - auto semaphore = instance_semaphore.second; - - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("model ") + model_state->Name() + ", instance " + - curr_instance->Name() + ", executing " + std::to_string(request_count) + - " requests") - .c_str()); - - // At this point we accept ownership of 'requests', which means that - // even if something goes wrong we must still return success from - // this function. If something does go wrong in processing a - // particular request then we send an error response just for the - // specific request. - curr_instance->ProcessRequests(requests, request_count); - - // Returning from TRITONBACKEND_ModelInstanceExecute signals Triton to start - // preparing next batch. Due to the async execution in TensorRT backend, we - // need to block the function if there is no instance ready for preparing the - // next execution. - // Acquire() implies that the next execution can be initiated, and it is done - // at the end of current execution because we want to form the batch as late - // as possible, otherwise we may get a smaller batch while more requests may - // arrive between when the batch is formed and when batch is executed. - semaphore->Acquire(); - - return nullptr; // success -} - -} // extern "C" -}}} // namespace triton::backend::tensorrt diff --git a/src_trt8/tensorrt_model.cc b/src_trt8/tensorrt_model.cc deleted file mode 100644 index bf2959d..0000000 --- a/src_trt8/tensorrt_model.cc +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
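// Illustrative note, not part of the deleted sources: the cmdline keys parsed
// in TRITONBACKEND_Initialize of tensorrt.cc above ("coalesce-request-input",
// "plugins", "execution-policy", "version-compatible") arrive through Triton's
// backend configuration. Assuming the usual
// --backend-config=<backend>,<key>=<value> syntax, a hypothetical server
// invocation could look like:
//
//   tritonserver --model-repository=/models \
//     --backend-config=tensorrt,plugins="/plugins/libA.so;/plugins/libB.so" \
//     --backend-config=tensorrt,execution-policy=DEVICE_BLOCKING
//
// where the plugin paths are placeholders and entries are ';'-separated, as
// the parsing loop above expects.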
IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include "tensorrt_model.h" - -namespace triton { namespace backend { namespace tensorrt { - -TensorRTModel::Priority -ParsePriority(const std::string& priority) -{ - TensorRTModel::Priority trt_priority = TensorRTModel::Priority::DEFAULT; - - if (priority.compare("PRIORITY_MAX") == 0) { - trt_priority = TensorRTModel::Priority::MAX; - } else if (priority.compare("PRIORITY_MIN") == 0) { - trt_priority = TensorRTModel::Priority::MIN; - } else if (priority.compare("PRIORITY_DEFAULT") != 0) { - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - (std::string( - "TRT backend does not support the provided stream priority '") + - priority + "', using 'PRIORITY_DEFAULT'.") - .c_str()); - } - - return trt_priority; -} - -TensorRTModel::TensorRTModel(TRITONBACKEND_Model* triton_model) - : BackendModel(triton_model), priority_(Priority::DEFAULT), - use_cuda_graphs_(false), gather_kernel_buffer_threshold_(0), - separate_output_stream_(false), eager_batching_(false), - busy_wait_events_(false) -{ - ParseModelConfig(); -} - -TRITONSERVER_Error* -TensorRTModel::SetTensorRTModelConfig() -{ - RETURN_IF_ERROR(SetModelConfig()); - RETURN_IF_ERROR(ParseModelConfig()); - - return nullptr; // Success -} - -TRITONSERVER_Error* -TensorRTModel::ParseModelConfig() -{ - triton::common::TritonJson::Value optimization; - if (model_config_.Find("optimization", &optimization)) { - RETURN_IF_ERROR(optimization.MemberAsUInt( - "gather_kernel_buffer_threshold", &gather_kernel_buffer_threshold_)); - RETURN_IF_ERROR( - optimization.MemberAsBool("eager_batching", &eager_batching_)); - std::string priority; - RETURN_IF_ERROR(optimization.MemberAsString("priority", &priority)); - priority_ = ParsePriority(priority); - triton::common::TritonJson::Value cuda; - if (optimization.Find("cuda", &cuda)) { - RETURN_IF_ERROR(cuda.MemberAsBool("graphs", &use_cuda_graphs_)); - RETURN_IF_ERROR( - cuda.MemberAsBool("busy_wait_events", &busy_wait_events_)); - RETURN_IF_ERROR(cuda.MemberAsArray("graph_spec", &graph_specs_)); - RETURN_IF_ERROR( - cuda.MemberAsBool("output_copy_stream", &separate_output_stream_)); - } - } - - return nullptr; // Success -} - -int -TensorRTModel::GetCudaStreamPriority() -{ - // Default priority is 0 - int cuda_stream_priority = 0; - - int min, max; - cudaError_t cuerr = cudaDeviceGetStreamPriorityRange(&min, &max); - if ((cuerr != cudaErrorNoDevice) && (cuerr != cudaSuccess)) { - return 0; - } - - switch (priority_) { - case TensorRTModel::Priority::MAX: - cuda_stream_priority = max; - break; - case TensorRTModel::Priority::MIN: - cuda_stream_priority = min; - break; - default: - cuda_stream_priority = 0; - break; - } - - return cuda_stream_priority; -} - -}}} // namespace triton::backend::tensorrt diff --git a/src_trt8/tensorrt_model.h b/src_trt8/tensorrt_model.h deleted file mode 100644 index 86c67a2..0000000 --- a/src_trt8/tensorrt_model.h +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
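// Illustrative sketch, not part of the deleted sources: GetCudaStreamPriority()
// above maps the model-config "priority" setting onto the numeric range
// reported by cudaDeviceGetStreamPriorityRange() (in CUDA, the numerically
// smaller bound is the higher priority). A hypothetical way such a value would
// be consumed when creating an instance's CUDA stream:
#include <cuda_runtime_api.h>
cudaError_t CreatePriorityStreamSketch(
    int cuda_stream_priority, cudaStream_t* stream)
{
  // cudaStreamNonBlocking avoids implicit synchronization with the default
  // stream; the priority value comes from GetCudaStreamPriority().
  return cudaStreamCreateWithPriority(
      stream, cudaStreamNonBlocking, cuda_stream_priority);
}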
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#pragma once - -#include "triton/backend/backend_model.h" - -namespace triton { namespace backend { namespace tensorrt { - -class TensorRTModel : public BackendModel { - public: - TensorRTModel(TRITONBACKEND_Model* triton_model); - virtual ~TensorRTModel() = default; - - TRITONSERVER_Error* SetTensorRTModelConfig(); - - TRITONSERVER_Error* ParseModelConfig(); - - // The model configuration. - common::TritonJson::Value& GraphSpecs() { return graph_specs_; } - - enum Priority { DEFAULT = 0, MIN = 1, MAX = 2 }; - Priority ModelPriority() { return priority_; } - int GetCudaStreamPriority(); - bool UseCudaGraphs() { return use_cuda_graphs_; } - size_t GatherKernelBufferThreshold() - { - return gather_kernel_buffer_threshold_; - } - bool SeparateOutputStream() { return separate_output_stream_; } - bool EagerBatching() { return eager_batching_; } - bool BusyWaitEvents() { return busy_wait_events_; } - - protected: - common::TritonJson::Value graph_specs_; - Priority priority_; - bool use_cuda_graphs_; - size_t gather_kernel_buffer_threshold_; - bool separate_output_stream_; - bool eager_batching_; - bool busy_wait_events_; -}; - -}}} // namespace triton::backend::tensorrt diff --git a/src_trt8/tensorrt_model_instance.cc b/src_trt8/tensorrt_model_instance.cc deleted file mode 100644 index 0ebc7c2..0000000 --- a/src_trt8/tensorrt_model_instance.cc +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. 
-// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include "tensorrt_model_instance.h" - -namespace triton { namespace backend { namespace tensorrt { - -TensorRTModelInstance::TensorRTModelInstance( - TensorRTModel* tensorrt_model, - TRITONBACKEND_ModelInstance* triton_model_instance) - : BackendModelInstance(tensorrt_model, triton_model_instance), - tensorrt_model_(tensorrt_model) -{ - uint32_t profile_count; - THROW_IF_BACKEND_INSTANCE_ERROR(TRITONBACKEND_ModelInstanceProfileCount( - triton_model_instance, &profile_count)); - for (uint32_t index = 0; index < profile_count; index++) { - const char* profile_name; - THROW_IF_BACKEND_INSTANCE_ERROR(TRITONBACKEND_ModelInstanceProfileName( - triton_model_instance, index, &profile_name)); - profile_names_.insert(profile_name); - } - uint32_t secondary_device_count; - THROW_IF_BACKEND_INSTANCE_ERROR( - TRITONBACKEND_ModelInstanceSecondaryDeviceCount( - triton_model_instance, &secondary_device_count)); - if (secondary_device_count != 0) { - if (secondary_device_count != 1) { - THROW_IF_BACKEND_INSTANCE_ERROR(TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (Name() + " of model " + tensorrt_model->Name() + - " must have either zero or or one secondary devices") - .c_str())); - } - const char* secondary_device_kind; - int64_t secondary_device_id; - THROW_IF_BACKEND_INSTANCE_ERROR( - TRITONBACKEND_ModelInstanceSecondaryDeviceProperties( - triton_model_instance, 0 /* index */, &secondary_device_kind, - &secondary_device_id)); - - if (strcmp(secondary_device_kind, "KIND_NVDLA") != 0) { - THROW_IF_BACKEND_INSTANCE_ERROR(TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("secondary device for ") + Name() + " of model " + - tensorrt_model->Name() + " must be KIND_NVDLA") - .c_str())); - } else { - dla_core_id_ = secondary_device_id; - } - } else { - dla_core_id_ = -1; - } -} - -}}} // namespace triton::backend::tensorrt diff --git a/src_trt8/tensorrt_model_instance.h b/src_trt8/tensorrt_model_instance.h deleted file mode 100644 index 87a46a8..0000000 --- a/src_trt8/tensorrt_model_instance.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. 
-// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#pragma once - -#include - -#include "tensorrt_model.h" -#include "triton/backend/backend_model_instance.h" - -namespace triton { namespace backend { namespace tensorrt { - -class TensorRTModelInstance : public BackendModelInstance { - public: - TensorRTModelInstance( - TensorRTModel* tensorrt_model, - TRITONBACKEND_ModelInstance* triton_model_instance); - virtual ~TensorRTModelInstance() = default; - - void Initialize(); - TensorRTModel* Model() { return tensorrt_model_; } - std::set& ProfileNames() { return profile_names_; } - int64_t DLACoreId() { return dla_core_id_; } - - protected: - TensorRTModel* tensorrt_model_; - std::set profile_names_; - int64_t dla_core_id_; -}; - -}}} // namespace triton::backend::tensorrt diff --git a/src_trt8/tensorrt_utils.cc b/src_trt8/tensorrt_utils.cc deleted file mode 100644 index 1b8c932..0000000 --- a/src_trt8/tensorrt_utils.cc +++ /dev/null @@ -1,509 +0,0 @@ -// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include "tensorrt_utils.h" - -#include - -#include "triton/backend/backend_common.h" - -namespace triton { namespace backend { namespace tensorrt { - -TRITONSERVER_DataType -ConvertTrtTypeToDataType(nvinfer1::DataType trt_type) -{ - switch (trt_type) { - case nvinfer1::DataType::kFLOAT: - return TRITONSERVER_TYPE_FP32; - case nvinfer1::DataType::kHALF: - return TRITONSERVER_TYPE_FP16; - case nvinfer1::DataType::kINT8: - return TRITONSERVER_TYPE_INT8; - case nvinfer1::DataType::kUINT8: - return TRITONSERVER_TYPE_UINT8; - case nvinfer1::DataType::kINT32: - return TRITONSERVER_TYPE_INT32; - case nvinfer1::DataType::kBOOL: - return TRITONSERVER_TYPE_BOOL; - default: - return TRITONSERVER_TYPE_INVALID; - } -} - -std::string -ConvertTrtTypeToConfigDataType(nvinfer1::DataType trt_type) -{ - switch (trt_type) { - case nvinfer1::DataType::kFLOAT: - return "TYPE_FP32"; - case nvinfer1::DataType::kHALF: - return "TYPE_FP16"; - case nvinfer1::DataType::kINT8: - return "TYPE_INT8"; - case nvinfer1::DataType::kUINT8: - return "TYPE_UINT8"; - case nvinfer1::DataType::kINT32: - return "TYPE_INT32"; - case nvinfer1::DataType::kBOOL: - return "TYPE_BOOL"; - default: - return "TYPE_INVALID"; - } -} - -bool -UseTensorRTv1API(const std::shared_ptr& engine) -{ - // If the engine still uses implicit batch dimension (deprecated), - // TensorRT V1 API must be used to serve the model. 
- return engine->hasImplicitBatchDimension(); -} - -TRITONSERVER_Error* -GetProfileIndex(const std::string& profile_name, int* profile_index) -{ - if (profile_name.empty()) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, "profile name must not be empty"); - } - - try { - *profile_index = stoi(profile_name); - } - catch (const std::invalid_argument& ia) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("unable to parse '") + profile_name + "': " + ia.what()) - .c_str()); - } - - return nullptr; -} - -std::pair -ConvertDataTypeToTrtType(const TRITONSERVER_DataType& dtype) -{ - nvinfer1::DataType trt_type = nvinfer1::DataType::kFLOAT; - switch (dtype) { - case TRITONSERVER_TYPE_FP32: - trt_type = nvinfer1::DataType::kFLOAT; - break; - case TRITONSERVER_TYPE_FP16: - trt_type = nvinfer1::DataType::kHALF; - break; - case TRITONSERVER_TYPE_INT8: - trt_type = nvinfer1::DataType::kINT8; - break; - case TRITONSERVER_TYPE_UINT8: - trt_type = nvinfer1::DataType::kUINT8; - break; - case TRITONSERVER_TYPE_INT32: - trt_type = nvinfer1::DataType::kINT32; - break; - case TRITONSERVER_TYPE_BOOL: - trt_type = nvinfer1::DataType::kBOOL; - break; - default: - return std::make_pair(false, trt_type); - } - return std::make_pair(true, trt_type); -} - -bool -CompareDims(const nvinfer1::Dims& model_dims, const std::vector& dims) -{ - if (model_dims.nbDims != (int32_t)dims.size()) { - return false; - } - - for (int i = 0; i < model_dims.nbDims; ++i) { - if (model_dims.d[i] != dims[i]) { - return false; - } - } - - return true; -} - -bool -CompareDims(const nvinfer1::Dims& ldims, const nvinfer1::Dims& rdims) -{ - if (ldims.nbDims != rdims.nbDims) { - return false; - } - - for (int i = 0; i < ldims.nbDims; ++i) { - if (ldims.d[i] != rdims.d[i]) { - return false; - } - } - - return true; -} - -TRITONSERVER_Error* -CompareDimsSupported( - const std::string& model_name, const std::string& tensor_name, - const nvinfer1::Dims& model_dims, common::TritonJson::Value& dims, - const bool supports_batching, const bool contains_explicit_batch, - const bool compare_exact) -{ - std::vector dims_vec; - for (size_t i = 0; i < dims.ArraySize(); i++) { - int64_t dim; - RETURN_IF_ERROR(dims.IndexAsInt(i, &dim)); - dims_vec.push_back(dim); - } - - RETURN_IF_ERROR(CompareDimsSupported( - model_name, tensor_name, model_dims, dims_vec, supports_batching, - contains_explicit_batch, compare_exact)); - return nullptr; -} - -TRITONSERVER_Error* -CompareDimsSupported( - const std::string& model_name, const std::string& tensor_name, - const nvinfer1::Dims& model_dims, const std::vector& dims, - const bool supports_batching, const bool contains_explicit_batch, - const bool compare_exact) -{ - // If the model configuration expects batching support in the model, - // then the first dimension must be -1. 
- if (supports_batching && contains_explicit_batch) { - if ((model_dims.nbDims == 0)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("model '") + model_name + "', tensor '" + tensor_name + - "': for the model to support batching the shape should have at " - "least 1 dimension; but shape expected by the model is " + - DimsDebugString(model_dims)) - .c_str()); - } - - std::vector full_dims; - full_dims.emplace_back(WILDCARD_DIM); - full_dims.insert(full_dims.end(), dims.begin(), dims.end()); - - bool succ = (model_dims.nbDims == (int32_t)full_dims.size()); - if (succ) { - for (uint64_t i = 0; i < full_dims.size(); ++i) { - const int64_t model_dim = model_dims.d[i]; - if (compare_exact || ((model_dim != -1) && (full_dims[i] != -1))) { - succ &= (model_dim == full_dims[i]); - } - } - } - - if (!succ) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("model '") + model_name + "', tensor '" + tensor_name + - "': the model expects " + std::to_string(model_dims.nbDims) + - " dimensions (shape " + DimsDebugString(model_dims) + - ") but the model configuration specifies " + - std::to_string(full_dims.size()) + - " dimensions (an initial batch dimension because max_batch_size " - "> 0 followed by the explicit tensor shape, making complete " - "shape " + - backend::ShapeToString(full_dims) + ")") - .c_str()); - } - } else { - // ! supports_batching - bool succ = (model_dims.nbDims == (int32_t)dims.size()); - if (succ) { - for (uint64_t i = 0; i < dims.size(); ++i) { - const int64_t model_dim = model_dims.d[i]; - if (compare_exact || ((model_dim != -1) && (dims[i] != -1))) { - succ &= (model_dim == dims[i]); - } - } - } - - if (!succ) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("model '") + model_name + "', tensor '" + tensor_name + - "': the model expects " + std::to_string(model_dims.nbDims) + - " dimensions (shape " + DimsDebugString(model_dims) + - ") but the model configuration specifies " + - std::to_string(dims.size()) + " dimensions (shape " + - backend::ShapeToString(dims) + ")") - .c_str()); - } - } - - return nullptr; -} - -TRITONSERVER_Error* -CompareShapeDimsSupported( - const std::string& model_name, const std::string& tensor_name, - const nvinfer1::Dims& model_dims, common::TritonJson::Value& dims, - const bool supports_batching) -{ - std::vector dims_vec; - for (size_t i = 0; i < dims.ArraySize(); i++) { - int64_t dim; - RETURN_IF_ERROR(dims.IndexAsInt(i, &dim)); - dims_vec.push_back(dim); - } - - const int batch_offset = supports_batching ? 
1 : 0; - bool not_supported = false; - if ((int32_t)dims_vec.size() != model_dims.nbDims) { - not_supported = true; - } else if (model_dims.nbDims == 1) { - if (((dims_vec[0] + batch_offset) != model_dims.d[0]) || - (dims_vec[0] == WILDCARD_DIM)) { - not_supported = true; - } - } else if (model_dims.nbDims > 1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to load model '") + model_name + - "', shape binding '" + tensor_name + - "' can only be 0-d or 1-D tensor, got " + DimsDebugString(model_dims)) - .c_str()); - } - - - if (not_supported) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to load model '") + model_name + "', binding '" + - tensor_name + "' shape expected by framework " + - DimsDebugString(model_dims) + - " doesn't match model configuration shape " + - backend::ShapeToString(dims_vec)) - .c_str()); - } - - return nullptr; -} - -TRITONSERVER_Error* -ValidateDimension( - const nvinfer1::Dims& this_dims, const nvinfer1::Dims& min_dims, - const nvinfer1::Dims& max_dims) -{ - if ((this_dims.nbDims) != max_dims.nbDims) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("model expected ") + std::to_string(max_dims.nbDims) + - " dimensions but received " + std::to_string(this_dims.nbDims) + - " dimensions") - .c_str()); - } - - for (int i = 0; i < this_dims.nbDims; i++) { - if (this_dims.d[i] < min_dims.d[i] || this_dims.d[i] > max_dims.d[i]) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("model expected the shape of dimension ") + - std::to_string(i) + " to be between " + - std::to_string(min_dims.d[i]) + " and " + - std::to_string(max_dims.d[i]) + " but received " + - std::to_string(this_dims.d[i])) - .c_str()); - } - } - return nullptr; -} - -TRITONSERVER_Error* -ValidateControlDimsDynamic( - const nvinfer1::Dims& dims, const bool support_batching) -{ - int expected_first_shape = (support_batching ? -1 : 1); - if (dims.d[0] != expected_first_shape) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("expects the first dimension to be ") + - std::to_string(expected_first_shape) + " but the model specified " + - std::to_string(dims.d[0])) - .c_str()); - } - for (int i = 1; i < dims.nbDims; i++) { - if (dims.d[i] != 1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("expects all dimensions (conditionally first) to be 1 " - "but the model specified shape ") + - DimsDebugString(dims)) - .c_str()); - } - } - return nullptr; -} - -TRITONSERVER_Error* -ValidateShapeValues( - const std::vector& request_shape_values, - const int32_t* min_shape_values, const int32_t* max_shape_values, - size_t nb_shape_values, const bool support_batching) -{ - if (request_shape_values.size() != nb_shape_values) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string( - "mismatch between the number of shape values. Expecting ") + - std::to_string(nb_shape_values) + ". 
Got " + - std::to_string(request_shape_values.size())) - .c_str()); - } - - for (size_t i = 0; i < nb_shape_values; i++) { - if (request_shape_values[i] < *(min_shape_values + i) || - request_shape_values[i] > *(max_shape_values + i)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("The shape value at index ") + std::to_string(i) + - " is expected to be in range from " + - std::to_string(*(min_shape_values + i)) + " to " + - std::to_string(*(max_shape_values + i)) + - ", Got: " + std::to_string(request_shape_values[i])) - .c_str()); - } - } - return nullptr; -} - -void -DimsToDimVec(const nvinfer1::Dims& model_dims, std::vector* dims) -{ - dims->clear(); - for (int i = 0; i < model_dims.nbDims; ++i) { - dims->emplace_back(model_dims.d[i]); - } -} - -TRITONSERVER_Error* -DimsJsonToDimVec( - common::TritonJson::Value& dims_json, std::vector* dims) -{ - dims->clear(); - for (size_t i = 0; i < dims_json.ArraySize(); i++) { - int64_t dim; - RETURN_IF_ERROR(dims_json.IndexAsInt(i, &dim)); - dims->push_back(dim); - } - return nullptr; -} - - -bool -DimVecToDims(const std::vector& dim_vec, nvinfer1::Dims* dims) -{ - if (dim_vec.size() > dims->MAX_DIMS) { - return false; - } else { - dims->nbDims = dim_vec.size(); - for (int i = 0; i < dims->nbDims; ++i) { - dims->d[i] = (int)dim_vec[i]; - } - } - return true; -} - -int64_t -GetElementCount(const nvinfer1::Dims& dims) -{ - int64_t count = 1; - for (int i = 0; i < dims.nbDims; ++i) { - count *= dims.d[i]; - } - return count; -} - -bool -ContainsWildcard(const nvinfer1::Dims& dims) -{ - for (int i = 0; i < dims.nbDims; ++i) { - if (dims.d[i] == WILDCARD_DIM) { - return true; - } - } - return false; -} - -bool -ContainsWildcardAtExplicitBatchDim(const nvinfer1::Dims& dims) -{ - if (dims.d[0] == WILDCARD_DIM) { - return true; - } - return false; -} - - -const std::string -DimsDebugString(const nvinfer1::Dims& dims) -{ - std::vector dims_vec; - DimsToDimVec(dims, &dims_vec); - return ShapeToString(dims_vec); -} - -const std::string -DimsJsonToString(common::TritonJson::Value& dims) -{ - std::vector dims_vec; - auto err = DimsJsonToDimVec(dims, &dims_vec); - if (err != nullptr) { - TRITONSERVER_ErrorDelete(err); - return std::string("UNPARSABLE"); - } - return ShapeToString(dims_vec); -} - -TRITONSERVER_Error* -SupportsIntegratedZeroCopy(const int gpu_id, bool* zero_copy_support) -{ - // Query the device to check if integrated - cudaDeviceProp cuprops; - cudaError_t cuerr = cudaGetDeviceProperties(&cuprops, gpu_id); - if (cuerr != cudaSuccess) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to get CUDA device properties for GPU ID") + - std::to_string(gpu_id) + ": " + cudaGetErrorString(cuerr)) - .c_str()); - } - - // Zero-copy supported only on integrated GPU when it can map host - // memory - if (cuprops.integrated && cuprops.canMapHostMemory) { - *zero_copy_support = true; - } else { - *zero_copy_support = false; - } - - return nullptr; -} - -}}} // namespace triton::backend::tensorrt diff --git a/src_trt8/tensorrt_utils.h b/src_trt8/tensorrt_utils.h deleted file mode 100644 index 0eac47e..0000000 --- a/src_trt8/tensorrt_utils.h +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions
-// are met:
-//  * Redistributions of source code must retain the above copyright
-//    notice, this list of conditions and the following disclaimer.
-//  * Redistributions in binary form must reproduce the above copyright
-//    notice, this list of conditions and the following disclaimer in the
-//    documentation and/or other materials provided with the distribution.
-//  * Neither the name of NVIDIA CORPORATION nor the names of its
-//    contributors may be used to endorse or promote products derived
-//    from this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#pragma once
-
-#include <NvInfer.h>
-
-#include <memory>
-#include <string>
-
-#include "triton/backend/backend_common.h"
-#include "triton/core/tritonserver.h"
-
-namespace triton { namespace backend { namespace tensorrt {
-
-bool UseTensorRTv1API(const std::shared_ptr<nvinfer1::ICudaEngine>& engine);
-
-TRITONSERVER_Error* GetProfileIndex(
-    const std::string& profile_name, int* profile_index);
-
-TRITONSERVER_DataType ConvertTrtTypeToDataType(nvinfer1::DataType trt_type);
-
-std::string ConvertTrtTypeToConfigDataType(nvinfer1::DataType trt_type);
-
-std::pair<bool, nvinfer1::DataType> ConvertDataTypeToTrtType(
-    const TRITONSERVER_DataType& dtype);
-
-bool CompareDims(
-    const nvinfer1::Dims& model_dims, const std::vector<int64_t>& dims);
-bool CompareDims(const nvinfer1::Dims& ldims, const nvinfer1::Dims& rdims);
-
-TRITONSERVER_Error* ValidateDimension(
-    const nvinfer1::Dims& this_dims, const nvinfer1::Dims& min_dims,
-    const nvinfer1::Dims& max_dims);
-
-// 'skip_first_dimension' should be set to true if the model has implicit batch
-// dimension as 'this_dims' should be the full shape (contains batch dimension).
-template <typename T>
-TRITONSERVER_Error* ValidateDimension(
-    const T& this_dims, const nvinfer1::Dims& min_dims,
-    const nvinfer1::Dims& max_dims, const bool skip_first_dimension);
-
-TRITONSERVER_Error* CompareDimsSupported(
-    const std::string& model_name, const std::string& tensor_name,
-    const nvinfer1::Dims& model_dims, common::TritonJson::Value& dims,
-    const bool supports_batching, const bool contains_explicit_batch,
-    const bool compare_exact);
-
-TRITONSERVER_Error* CompareDimsSupported(
-    const std::string& model_name, const std::string& tensor_name,
-    const nvinfer1::Dims& model_dims, const std::vector<int64_t>& dims,
-    const bool supports_batching, const bool contains_explicit_batch,
-    const bool compare_exact);
-
-TRITONSERVER_Error* CompareShapeDimsSupported(
-    const std::string& model_name, const std::string& tensor_name,
-    const nvinfer1::Dims& model_dims, common::TritonJson::Value& dims,
-    const bool supports_batching);
-
-TRITONSERVER_Error* ValidateControlDimsDynamic(
-    const nvinfer1::Dims& dims, const bool support_batching);
-
-TRITONSERVER_Error* ValidateShapeValues(
-    const std::vector<int32_t>& request_shape_values,
-    const int32_t* min_shape_values, const int32_t* max_shape_values,
-    size_t nb_shape_values, const bool support_batching);
-
-void DimsToDimVec(const nvinfer1::Dims& model_dims, std::vector<int64_t>* dims);
-
-TRITONSERVER_Error* DimsJsonToDimVec(
-    common::TritonJson::Value& dims_json, std::vector<int64_t>* dims);
-
-bool DimVecToDims(const std::vector<int64_t>& dim_vec, nvinfer1::Dims* dims);
-
-int64_t GetElementCount(const nvinfer1::Dims& dims);
-
-bool ContainsWildcard(const nvinfer1::Dims& dims);
-
-bool ContainsWildcardAtExplicitBatchDim(const nvinfer1::Dims& dims);
-
-const std::string DimsDebugString(const nvinfer1::Dims& dims);
-
-const std::string DimsJsonToString(common::TritonJson::Value& dims);
-
-TRITONSERVER_Error* SupportsIntegratedZeroCopy(
-    const int gpu_id, bool* zero_copy_support);
-
-//
-// Templates
-//
-
-template <typename T>
-TRITONSERVER_Error*
-ValidateDimension(
-    const T& this_dims, const nvinfer1::Dims& min_dims,
-    const nvinfer1::Dims& max_dims, const bool skip_first_dimension)
-{
-  const int nonbatch_start_idx = (skip_first_dimension ? 1 : 0);
-  if (int(this_dims.size()) != (max_dims.nbDims + nonbatch_start_idx)) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        (std::string("model expected ") +
-         std::to_string(max_dims.nbDims + nonbatch_start_idx) +
-         " dimensions but received " + std::to_string(this_dims.size()) +
-         " dimensions")
-            .c_str());
-  }
-
-  for (int i = 0; i < max_dims.nbDims; i++) {
-    if (this_dims[i] == -1) {
-      continue;
-    }
-    if (this_dims[i + nonbatch_start_idx] < min_dims.d[i] ||
-        this_dims[i + nonbatch_start_idx] > max_dims.d[i]) {
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_INVALID_ARG,
-          (std::string("model expected the shape of dimension ") +
-           std::to_string(i + nonbatch_start_idx) + " to be between " +
-           std::to_string(min_dims.d[i]) + " and " +
-           std::to_string(max_dims.d[i]) + " but received " +
-           std::to_string(this_dims[i + nonbatch_start_idx]))
-              .c_str());
-    }
-  }
-  return nullptr;
-}
-
-}}} // namespace triton::backend::tensorrt