Add INT64 Datatype Support for Shape Tensors in TensorRT Backend #91

Merged: 10 commits, Jun 20, 2024
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -141,6 +141,8 @@ add_library(
${SOURCE_DIR}/instance_state.cc
${SOURCE_DIR}/tensorrt_model_instance.cc
${SOURCE_DIR}/tensorrt_model_instance.h
${SOURCE_DIR}/shape_tensor.cc
${SOURCE_DIR}/shape_tensor.h
${SOURCE_DIR}/tensorrt_utils.cc
${SOURCE_DIR}/tensorrt_utils.h
${SOURCE_DIR}/filesystem.h
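
For context, the new shape_tensor.cc / shape_tensor.h files registered here introduce the ShapeTensor abstraction used throughout the instance_state changes below. The following is a hypothetical sketch of what its interface might look like, inferred only from the call sites in this diff; the actual header added by the PR may differ in names and details.

    // Hypothetical reconstruction of the ShapeTensor interface, inferred from
    // the call sites in this diff; the real shape_tensor.h may differ.
    #include <cstddef>
    #include <cstdint>
    #include <memory>
    #include <string>

    #include "triton/core/tritonserver.h"

    class ShapeTensor {
     public:
      // Fill from a request input buffer holding INT32 or INT64 shape values,
      // optionally prepending the scheduler's total batch size.
      TRITONSERVER_Error* SetDataFromBuffer(
          const char* data_buffer, size_t data_byte_size,
          TRITONSERVER_DataType datatype, size_t element_cnt,
          const std::string& input_name, bool support_batching,
          size_t total_batch_size);

      // Fill from the INT32 optimization-profile values returned by
      // nvinfer1::ICudaEngine::getProfileTensorValues(), widening to INT64
      // when the shape input is declared as TYPE_INT64.
      TRITONSERVER_Error* SetDataFromShapeValues(
          const int32_t* shape_values, TRITONSERVER_DataType datatype,
          size_t nb_shape_values);

      // Accumulated absolute difference from a profile's optimal shape values,
      // with the batch dimension compared against 'total_batch_size'; used
      // when selecting the best optimization profile.
      int64_t GetDistance(
          const ShapeTensor& opt_shapes, int64_t total_batch_size) const;

      const void* GetData() const;  // raw bytes for memcpy and TRT bindings
      size_t GetSize() const;       // total byte size of the stored values
      size_t GetElementCount() const;
      // Assumed helper (not visible in this diff): read one stored value as
      // int64_t regardless of the underlying datatype.
      int64_t GetShapeValue(size_t idx) const;

     private:
      size_t size_{0};
      TRITONSERVER_DataType datatype_{TRITONSERVER_TYPE_INT32};
      std::unique_ptr<char[]> data_;
    };

The point of the abstraction is that callers no longer assume sizeof(int32_t) for shape values; the element size and conversion logic live behind GetData()/GetSize().
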
109 changes: 44 additions & 65 deletions src/instance_state.cc
@@ -500,7 +500,7 @@ ModelInstanceState::Run(
return;
}

std::map<int32_t, std::vector<int32_t>> request_shape_values;
std::map<int32_t, ShapeTensor> request_shape_values;
// Scheduler ensures all the requests have identical shape values so
// use values from first shape tensor
TRITONSERVER_Error* err = GetRequestShapeValues(
@@ -587,8 +587,7 @@ ModelInstanceState::Run(
if (it != request_shape_values.end()) {
err = ValidateShapeValues(
it->second, citr->second.min_shapes_[io_index],
citr->second.max_shapes_[io_index], citr->second.nb_shape_values_,
support_batching_);
citr->second.max_shapes_[io_index], citr->second.nb_shape_values_);
} else {
err = TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
@@ -607,8 +606,8 @@
// [FIXME] formalize it, the 'buffer_' may be set directly while forming
// the shape value
memcpy(
io_binding_info.GetBuffer(), &(it->second[0]),
sizeof(int32_t) * it->second.size());
io_binding_info.GetBuffer(), it->second.GetData(),
it->second.GetSize());
citr->second.context_->setInputTensorAddress(
name.c_str(), io_binding_info.GetBuffer());
}
@@ -1304,7 +1303,7 @@ ModelInstanceState::ProcessResponse()
TRITONSERVER_Error*
ModelInstanceState::GetRequestShapeValues(
size_t total_batch_size, TRITONBACKEND_Request* request,
std::map<int, std::vector<int32_t>>* request_shape_values)
std::map<int, ShapeTensor>* request_shape_values)
{
// Visit all the inputs and extract the shape values present in the
// request
@@ -1325,12 +1324,6 @@ ModelInstanceState::GetRequestShapeValues(

int io_index = io_index_map_[input_name];
if (engine_->isShapeInferenceIO(input_name)) {
auto it =
request_shape_values->emplace(io_index, std::vector<int32_t>()).first;
if (support_batching_) {
it->second.push_back((int32_t)total_batch_size);
}

// For now being conservative and requiring that shape tensors
// be in a single buffer on the CPU. We can handle more cases in
// future if necessary.
@@ -1359,38 +1352,15 @@
.c_str());
}

// FIXME DLIS-6653: With the support of INT64, shape tensors
// can also be of type INT64 and the assumptions that shape
// tensors might not always hold true.
// Assuming input shape tensors datatype as INT32.
int64_t element_cnt = backend::GetElementCount(shape, dims_count);
if (support_batching_) {
element_cnt /= shape[0];
}
const size_t expected_byte_size =
element_cnt * GetByteSize(TRITONSERVER_TYPE_INT32, {1});

bool includes_batch_shape_value = false;
if (expected_byte_size != data_byte_size) {
if (expected_byte_size == (data_byte_size - sizeof(int32_t))) {
includes_batch_shape_value = true;
} else {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("shape tensor for input '") + input_name +
"' expected byte size is " + std::to_string(expected_byte_size) +
" [ or " + std::to_string(expected_byte_size + sizeof(int32_t)) +
" if input includes batch shape value] " + ", got " +
std::to_string(data_byte_size))
.c_str());
}
}

const int32_t* dims = reinterpret_cast<const int32_t*>(data_buffer);
int64_t offset = includes_batch_shape_value ? 1 : 0;
for (int64_t i = offset; i < element_cnt; ++i) {
it->second.push_back(dims[i]);
}
auto it = request_shape_values->emplace(io_index, ShapeTensor()).first;
RETURN_IF_ERROR(it->second.SetDataFromBuffer(
data_buffer, data_byte_size, datatype, element_cnt, input_name,
support_batching_, total_batch_size));
}
}
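
The deleted block above prepended the batch size, validated the expected byte size assuming INT32, and copied the dims by hand; that work is now presumably folded into ShapeTensor::SetDataFromBuffer. Below is a self-contained sketch of the datatype dispatch such a method needs. The names ReadShapeTensor and AppendShapeValues are illustrative only and are not part of the PR; the real method also validates the byte size against element_cnt and handles a request buffer that already carries a batch value.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Copy 'byte_size' bytes of T-typed shape values into 'out', widening to
    // int64_t so downstream code is datatype-agnostic.
    template <typename T>
    static void
    AppendShapeValues(
        std::vector<int64_t>* out, const void* buffer, size_t byte_size)
    {
      const T* vals = static_cast<const T*>(buffer);
      for (size_t i = 0; i < byte_size / sizeof(T); ++i) {
        out->push_back(static_cast<int64_t>(vals[i]));
      }
    }

    // 'is_int64' would come from the shape input's configured datatype,
    // 'total_batch_size' from the scheduler when the model supports batching.
    static std::vector<int64_t>
    ReadShapeTensor(
        const void* buffer, size_t byte_size, bool is_int64,
        bool support_batching, size_t total_batch_size)
    {
      std::vector<int64_t> shape_values;
      if (support_batching) {
        // The scheduler-provided batch size becomes the first shape value.
        shape_values.push_back(static_cast<int64_t>(total_batch_size));
      }
      if (is_int64) {
        AppendShapeValues<int64_t>(&shape_values, buffer, byte_size);
      } else {
        AppendShapeValues<int32_t>(&shape_values, buffer, byte_size);
      }
      return shape_values;
    }
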

@@ -1401,7 +1371,7 @@ TRITONSERVER_Error*
ModelInstanceState::GetMostOptimizedProfile(
size_t total_batch_size, TRITONBACKEND_Request** requests,
uint32_t request_count,
const std::map<int, std::vector<int32_t>>& request_shape_values,
const std::map<int, ShapeTensor>& request_shape_values,
std::map<int, TensorRTContext>::iterator* citr)
{
// Returns the TensorRT context that uses profile with shortest
@@ -1452,7 +1422,7 @@ TRITONSERVER_Error*
ModelInstanceState::EvaluateTensorRTContext(
std::map<int, TensorRTContext>::iterator& citr, size_t total_batch_size,
TRITONBACKEND_Request** requests, uint32_t request_count,
const std::map<int, std::vector<int32_t>>& request_shape_values,
const std::map<int, ShapeTensor>& request_shape_values,
int64_t* error_distance)
{
*error_distance = 0;
@@ -1519,13 +1489,12 @@ ModelInstanceState::EvaluateTensorRTContext(
if (it != request_shape_values.end()) {
shape_err = ValidateShapeValues(
it->second, citr->second.min_shapes_[io_index],
citr->second.max_shapes_[io_index], citr->second.nb_shape_values_,
support_batching_);
valid_bs =
(!support_batching_) || (((int32_t)total_batch_size >=
*citr->second.min_shapes_[io_index]) &&
((int64_t)total_batch_size <=
*citr->second.max_shapes_[io_index]));
citr->second.max_shapes_[io_index],
citr->second.nb_shape_values_);
valid_bs = (!support_batching_) ||
ValidateBatchSize(
total_batch_size, citr->second.min_shapes_[io_index],
citr->second.max_shapes_[io_index]);
} else {
missing_shape_values = true;
}
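
ValidateBatchSize replaces the inline comparison that previously dereferenced raw int32 pointers into the profile's min/max shape values. A hedged sketch of what it might do, relying on the hypothetical GetShapeValue() accessor from the interface sketch near the top of this diff:

    // Sketch only: the batch size must fall within the first (batch) shape
    // value of the profile's min/max shape tensors.
    bool
    ValidateBatchSize(
        size_t total_batch_size, const ShapeTensor& min_shapes,
        const ShapeTensor& max_shapes)
    {
      const int64_t bs = static_cast<int64_t>(total_batch_size);
      return (bs >= min_shapes.GetShapeValue(0)) &&
             (bs <= max_shapes.GetShapeValue(0));
    }
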
@@ -1549,14 +1518,9 @@ ModelInstanceState::EvaluateTensorRTContext(
std::abs(opt_dims.d[idx] - input_shape_vec[idx - 1]);
}
if (engine_->isShapeInferenceIO(input_name)) {
const auto* opt_shape_values = citr->second.opt_shapes_[io_index];
*error_distance +=
std::abs(*opt_shape_values - (int64_t)total_batch_size);
auto it = request_shape_values.find(io_index);
for (size_t idx = 1; idx < citr->second.nb_shape_values_; idx++) {
*error_distance +=
std::abs(*(opt_shape_values + idx) - it->second[idx - 1]);
}
*error_distance += it->second.GetDistance(
citr->second.opt_shapes_[io_index], total_batch_size);
}
}
}
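
Similarly, GetDistance folds the old per-element std::abs loop, including the batch term, into the ShapeTensor class. A rough sketch under the same assumed accessor and member names:

    // Sketch only; needs <cstdlib> for std::abs(int64_t). Batch dimension
    // first, then the remaining shape values element-wise.
    int64_t
    ShapeTensor::GetDistance(
        const ShapeTensor& opt_shapes, int64_t total_batch_size) const
    {
      int64_t distance =
          std::abs(opt_shapes.GetShapeValue(0) - total_batch_size);
      for (size_t i = 1; i < GetElementCount(); ++i) {
        distance += std::abs(opt_shapes.GetShapeValue(i) - GetShapeValue(i));
      }
      return distance;
    }
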
@@ -2996,13 +2960,14 @@ ModelInstanceState::InitializeShapeInputBinding(
return nullptr;
}

if (input_datatype != TRITONSERVER_TYPE_INT32) {
if ((input_datatype != TRITONSERVER_TYPE_INT32) &&
(input_datatype != TRITONSERVER_TYPE_INT64)) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("unexpected datatype TYPE_") +
TRITONSERVER_DataTypeString(input_datatype) +
" in model configuration for shape input '" + input_name +
"', expecting TYPE_INT32 for " + Name())
" in model configuration for shape input '" + input_name +
"', expecting TYPE_INT32 or TYPE_INT64 for " + Name())
.c_str());
}
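
With this relaxed check, a model configuration can declare a shape tensor input as either TYPE_INT32 or TYPE_INT64. An illustrative config.pbtxt fragment (the tensor name and dims are made up):

    input [
      {
        name: "input_shape"
        data_type: TYPE_INT64
        dims: [ 2 ]
        is_shape_tensor: true
      }
    ]
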

@@ -3042,18 +3007,32 @@
context.nb_shape_values_ = (context.max_dims_[io_index].nbDims == 0)
? 1
: context.max_dims_[io_index].d[0];
context.max_shapes_[io_index] = engine_->getProfileTensorValues(
input_name.c_str(), profile_index, nvinfer1::OptProfileSelector::kMAX);
context.min_shapes_[io_index] = engine_->getProfileTensorValues(
input_name.c_str(), profile_index, nvinfer1::OptProfileSelector::kMIN);
context.opt_shapes_[io_index] = engine_->getProfileTensorValues(
input_name.c_str(), profile_index, nvinfer1::OptProfileSelector::kOPT);
context.max_shapes_[io_index] = ShapeTensor();
context.max_shapes_[io_index].SetDataFromShapeValues(
engine_->getProfileTensorValues(
input_name.c_str(), profile_index,
nvinfer1::OptProfileSelector::kMAX),
input_datatype, context.nb_shape_values_);

context.min_shapes_[io_index] = ShapeTensor();
context.min_shapes_[io_index].SetDataFromShapeValues(
engine_->getProfileTensorValues(
input_name.c_str(), profile_index,
nvinfer1::OptProfileSelector::kMIN),
input_datatype, context.nb_shape_values_);

context.opt_shapes_[io_index] = ShapeTensor();
context.opt_shapes_[io_index].SetDataFromShapeValues(
engine_->getProfileTensorValues(
input_name.c_str(), profile_index,
nvinfer1::OptProfileSelector::kOPT),
input_datatype, context.nb_shape_values_);

// Set shape tensor address to buffer that contains max allowed value so
// later shape inference will return max output shape / size for
// pre-allocation.
if (!context.context_->setInputTensorAddress(
input_name.c_str(), context.max_shapes_[io_index])) {
input_name.c_str(), context.max_shapes_[io_index].GetData())) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("trt failed to set the input shape binding for '") +
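
SetDataFromShapeValues has to bridge a datatype mismatch: nvinfer1::ICudaEngine::getProfileTensorValues() returns int32_t values, while the stored shape tensor may be declared TYPE_INT64. A sketch of that conversion, reusing the hypothetical members (size_, datatype_, data_) from the interface sketch near the top of this diff:

    // Sketch only; the real implementation in shape_tensor.cc may differ.
    TRITONSERVER_Error*
    ShapeTensor::SetDataFromShapeValues(
        const int32_t* shape_values, TRITONSERVER_DataType datatype,
        size_t nb_shape_values)
    {
      datatype_ = datatype;
      if (datatype == TRITONSERVER_TYPE_INT64) {
        // Widen the engine's int32 profile values to int64 storage.
        size_ = nb_shape_values * sizeof(int64_t);
        data_.reset(new char[size_]);
        int64_t* dst = reinterpret_cast<int64_t*>(data_.get());
        for (size_t i = 0; i < nb_shape_values; ++i) {
          dst[i] = static_cast<int64_t>(shape_values[i]);
        }
      } else {
        // INT32: copy the values through unchanged (needs <cstring>).
        size_ = nb_shape_values * sizeof(int32_t);
        data_.reset(new char[size_]);
        std::memcpy(data_.get(), shape_values, size_);
      }
      return nullptr;  // success
    }

Storing the profile values in the tensor's declared datatype keeps the buffer handed to setInputTensorAddress() consistent with what shape inference expects.
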
13 changes: 7 additions & 6 deletions src/instance_state.h
@@ -35,6 +35,7 @@
#include "io_binding_info.h"
#include "model_state.h"
#include "semaphore.h"
#include "shape_tensor.h"
#include "tensorrt_model_instance.h"
#include "triton/backend/backend_input_collector.h"
#include "triton/backend/backend_output_responder.h"
@@ -136,13 +137,13 @@ struct TensorRTContext {
std::vector<nvinfer1::Dims> opt_dims_{};

// Min shape values per bindings
std::vector<const int32_t*> min_shapes_{};
std::vector<ShapeTensor> min_shapes_{};

// Max shape values per bindings
std::vector<const int32_t*> max_shapes_{};
std::vector<ShapeTensor> max_shapes_{};

// Optimized shape values per bindings
std::vector<const int32_t*> opt_shapes_{};
std::vector<ShapeTensor> opt_shapes_{};

// The number of shape values
size_t nb_shape_values_{0};
@@ -333,16 +334,16 @@ class ModelInstanceState : public TensorRTModelInstance {

TRITONSERVER_Error* GetRequestShapeValues(
size_t total_batch_size, TRITONBACKEND_Request* request,
std::map<int, std::vector<int32_t>>* request_shape_values);
std::map<int, ShapeTensor>* request_shape_values);
TRITONSERVER_Error* GetMostOptimizedProfile(
size_t total_batch_size, TRITONBACKEND_Request** requests,
uint32_t request_count,
const std::map<int, std::vector<int32_t>>& request_shape_values,
const std::map<int, ShapeTensor>& request_shape_values,
std::map<int, TensorRTContext>::iterator* citr);
TRITONSERVER_Error* EvaluateTensorRTContext(
std::map<int, TensorRTContext>::iterator& citr, size_t total_batch_size,
TRITONBACKEND_Request** requests, uint32_t request_count,
const std::map<int, std::vector<int32_t>>& request_shape_values,
const std::map<int, ShapeTensor>& request_shape_values,
int64_t* error_distance);

bool SetOutputShapeTensorBuffer(