Add INT64 Datatype Support for Shape Tensors in TensorRT Backend #91

Merged: 10 commits, Jun 20, 2024
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -141,6 +141,8 @@ add_library(
${SOURCE_DIR}/instance_state.cc
${SOURCE_DIR}/tensorrt_model_instance.cc
${SOURCE_DIR}/tensorrt_model_instance.h
${SOURCE_DIR}/shape_tensor.cc
${SOURCE_DIR}/shape_tensor.h
${SOURCE_DIR}/tensorrt_utils.cc
${SOURCE_DIR}/tensorrt_utils.h
${SOURCE_DIR}/filesystem.h
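
For context, the new shape_tensor.cc / shape_tensor.h files registered here introduce the ShapeTensor abstraction used throughout the instance_state changes below. The following is a hypothetical sketch of what its interface might look like, inferred only from the call sites in this diff; the actual header added by the PR may differ in names and details.

    // Hypothetical reconstruction of the ShapeTensor interface, inferred from
    // the call sites in this diff; the real shape_tensor.h may differ.
    #include <cstddef>
    #include <cstdint>
    #include <memory>
    #include <string>

    #include "triton/core/tritonserver.h"

    class ShapeTensor {
     public:
      // Fill from a request input buffer holding INT32 or INT64 shape values,
      // optionally prepending the scheduler's total batch size.
      TRITONSERVER_Error* SetDataFromBuffer(
          const char* data_buffer, size_t data_byte_size,
          TRITONSERVER_DataType datatype, size_t element_cnt,
          const std::string& input_name, bool support_batching,
          size_t total_batch_size);

      // Fill from the INT32 optimization-profile values returned by
      // nvinfer1::ICudaEngine::getProfileTensorValues(), widening to INT64
      // when the shape input is declared as TYPE_INT64.
      TRITONSERVER_Error* SetDataFromShapeValues(
          const int32_t* shape_values, TRITONSERVER_DataType datatype,
          size_t nb_shape_values);

      // Accumulated absolute difference from a profile's optimal shape values,
      // with the batch dimension compared against 'total_batch_size'; used
      // when selecting the best optimization profile.
      int64_t GetDistance(
          const ShapeTensor& opt_shapes, int64_t total_batch_size) const;

      const void* GetData() const;  // raw bytes for memcpy and TRT bindings
      size_t GetSize() const;       // total byte size of the stored values
      size_t GetElementCount() const;
      // Assumed helper (not visible in this diff): read one stored value as
      // int64_t regardless of the underlying datatype.
      int64_t GetShapeValue(size_t idx) const;

     private:
      size_t size_{0};
      TRITONSERVER_DataType datatype_{TRITONSERVER_TYPE_INT32};
      std::unique_ptr<char[]> data_;
    };

The point of the abstraction is that callers no longer assume sizeof(int32_t) for shape values; the element size and conversion logic live behind GetData()/GetSize().
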
109 changes: 44 additions & 65 deletions src/instance_state.cc
@@ -500,7 +500,7 @@ ModelInstanceState::Run(
return;
}

std::map<int32_t, std::vector<int32_t>> request_shape_values;
std::map<int32_t, ShapeTensor> request_shape_values;
// Scheduler ensures all the requests have identical shape values so
// use values from first shape tensor
TRITONSERVER_Error* err = GetRequestShapeValues(
@@ -587,8 +587,7 @@ ModelInstanceState::Run(
if (it != request_shape_values.end()) {
err = ValidateShapeValues(
it->second, citr->second.min_shapes_[io_index],
citr->second.max_shapes_[io_index], citr->second.nb_shape_values_,
support_batching_);
citr->second.max_shapes_[io_index], citr->second.nb_shape_values_);
} else {
err = TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
@@ -607,8 +606,8 @@
// [FIXME] formalize it, the 'buffer_' may be set directly while forming
// the shape value
memcpy(
io_binding_info.GetBuffer(), &(it->second[0]),
sizeof(int32_t) * it->second.size());
io_binding_info.GetBuffer(), it->second.GetData(),
it->second.GetSize());
citr->second.context_->setInputTensorAddress(
name.c_str(), io_binding_info.GetBuffer());
}
@@ -1304,7 +1303,7 @@ ModelInstanceState::ProcessResponse()
TRITONSERVER_Error*
ModelInstanceState::GetRequestShapeValues(
size_t total_batch_size, TRITONBACKEND_Request* request,
std::map<int, std::vector<int32_t>>* request_shape_values)
std::map<int, ShapeTensor>* request_shape_values)
{
// Visit all the inputs and extract the shape values present in the
// request
@@ -1325,12 +1324,6 @@ ModelInstanceState::GetRequestShapeValues(

int io_index = io_index_map_[input_name];
if (engine_->isShapeInferenceIO(input_name)) {
auto it =
request_shape_values->emplace(io_index, std::vector<int32_t>()).first;
if (support_batching_) {
it->second.push_back((int32_t)total_batch_size);
}

// For now being conservative and requiring that shape tensors
// be in a single buffer on the CPU. We can handle more cases in
// future if necessary.
@@ -1359,38 +1352,15 @@
.c_str());
}

// FIXME DLIS-6653: With the support of INT64, shape tensors
// can also be of type INT64 and the assumptions that shape
// tensors might not always hold true.
// Assuming input shape tensors datatype as INT32.
int64_t element_cnt = backend::GetElementCount(shape, dims_count);
if (support_batching_) {
element_cnt /= shape[0];
}
const size_t expected_byte_size =
element_cnt * GetByteSize(TRITONSERVER_TYPE_INT32, {1});

bool includes_batch_shape_value = false;
if (expected_byte_size != data_byte_size) {
if (expected_byte_size == (data_byte_size - sizeof(int32_t))) {
includes_batch_shape_value = true;
} else {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("shape tensor for input '") + input_name +
"' expected byte size is " + std::to_string(expected_byte_size) +
" [ or " + std::to_string(expected_byte_size + sizeof(int32_t)) +
" if input includes batch shape value] " + ", got " +
std::to_string(data_byte_size))
.c_str());
}
}

const int32_t* dims = reinterpret_cast<const int32_t*>(data_buffer);
int64_t offset = includes_batch_shape_value ? 1 : 0;
for (int64_t i = offset; i < element_cnt; ++i) {
it->second.push_back(dims[i]);
}
auto it = request_shape_values->emplace(io_index, ShapeTensor()).first;
RETURN_IF_ERROR(it->second.SetDataFromBuffer(
data_buffer, data_byte_size, datatype, element_cnt, input_name,
support_batching_, total_batch_size));
}
}
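
The deleted block above prepended the batch size, validated the expected byte size assuming INT32, and copied the dims by hand; that work is now presumably folded into ShapeTensor::SetDataFromBuffer. Below is a self-contained sketch of the datatype dispatch such a method needs. The names ReadShapeTensor and AppendShapeValues are illustrative only and are not part of the PR; the real method also validates the byte size against element_cnt and handles a request buffer that already carries a batch value.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Copy 'byte_size' bytes of T-typed shape values into 'out', widening to
    // int64_t so downstream code is datatype-agnostic.
    template <typename T>
    static void
    AppendShapeValues(
        std::vector<int64_t>* out, const void* buffer, size_t byte_size)
    {
      const T* vals = static_cast<const T*>(buffer);
      for (size_t i = 0; i < byte_size / sizeof(T); ++i) {
        out->push_back(static_cast<int64_t>(vals[i]));
      }
    }

    // 'is_int64' would come from the shape input's configured datatype,
    // 'total_batch_size' from the scheduler when the model supports batching.
    static std::vector<int64_t>
    ReadShapeTensor(
        const void* buffer, size_t byte_size, bool is_int64,
        bool support_batching, size_t total_batch_size)
    {
      std::vector<int64_t> shape_values;
      if (support_batching) {
        // The scheduler-provided batch size becomes the first shape value.
        shape_values.push_back(static_cast<int64_t>(total_batch_size));
      }
      if (is_int64) {
        AppendShapeValues<int64_t>(&shape_values, buffer, byte_size);
      } else {
        AppendShapeValues<int32_t>(&shape_values, buffer, byte_size);
      }
      return shape_values;
    }
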

@@ -1401,7 +1371,7 @@ TRITONSERVER_Error*
ModelInstanceState::GetMostOptimizedProfile(
size_t total_batch_size, TRITONBACKEND_Request** requests,
uint32_t request_count,
const std::map<int, std::vector<int32_t>>& request_shape_values,
const std::map<int, ShapeTensor>& request_shape_values,
std::map<int, TensorRTContext>::iterator* citr)
{
// Returns the TensorRT context that uses profile with shortest
@@ -1452,7 +1422,7 @@ TRITONSERVER_Error*
ModelInstanceState::EvaluateTensorRTContext(
std::map<int, TensorRTContext>::iterator& citr, size_t total_batch_size,
TRITONBACKEND_Request** requests, uint32_t request_count,
const std::map<int, std::vector<int32_t>>& request_shape_values,
const std::map<int, ShapeTensor>& request_shape_values,
int64_t* error_distance)
{
*error_distance = 0;
@@ -1519,13 +1489,12 @@ ModelInstanceState::EvaluateTensorRTContext(
if (it != request_shape_values.end()) {
shape_err = ValidateShapeValues(
it->second, citr->second.min_shapes_[io_index],
citr->second.max_shapes_[io_index], citr->second.nb_shape_values_,
support_batching_);
valid_bs =
(!support_batching_) || (((int32_t)total_batch_size >=
*citr->second.min_shapes_[io_index]) &&
((int64_t)total_batch_size <=
*citr->second.max_shapes_[io_index]));
citr->second.max_shapes_[io_index],
citr->second.nb_shape_values_);
valid_bs = (!support_batching_) ||
ValidateBatchSize(
total_batch_size, citr->second.min_shapes_[io_index],
citr->second.max_shapes_[io_index]);
} else {
missing_shape_values = true;
}
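
ValidateBatchSize replaces the inline comparison that previously dereferenced raw int32 pointers into the profile's min/max shape values. A hedged sketch of what it might do, relying on the hypothetical GetShapeValue() accessor from the interface sketch near the top of this diff:

    // Sketch only: the batch size must fall within the first (batch) shape
    // value of the profile's min/max shape tensors.
    bool
    ValidateBatchSize(
        size_t total_batch_size, const ShapeTensor& min_shapes,
        const ShapeTensor& max_shapes)
    {
      const int64_t bs = static_cast<int64_t>(total_batch_size);
      return (bs >= min_shapes.GetShapeValue(0)) &&
             (bs <= max_shapes.GetShapeValue(0));
    }
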
@@ -1549,14 +1518,9 @@ ModelInstanceState::EvaluateTensorRTContext(
std::abs(opt_dims.d[idx] - input_shape_vec[idx - 1]);
}
if (engine_->isShapeInferenceIO(input_name)) {
const auto* opt_shape_values = citr->second.opt_shapes_[io_index];
*error_distance +=
std::abs(*opt_shape_values - (int64_t)total_batch_size);
auto it = request_shape_values.find(io_index);
for (size_t idx = 1; idx < citr->second.nb_shape_values_; idx++) {
*error_distance +=
std::abs(*(opt_shape_values + idx) - it->second[idx - 1]);
}
*error_distance += it->second.GetDistance(
citr->second.opt_shapes_[io_index], total_batch_size);
}
}
}
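
Similarly, GetDistance folds the old per-element std::abs loop, including the batch term, into the ShapeTensor class. A rough sketch under the same assumed accessor and member names:

    // Sketch only; needs <cstdlib> for std::abs(int64_t). Batch dimension
    // first, then the remaining shape values element-wise.
    int64_t
    ShapeTensor::GetDistance(
        const ShapeTensor& opt_shapes, int64_t total_batch_size) const
    {
      int64_t distance =
          std::abs(opt_shapes.GetShapeValue(0) - total_batch_size);
      for (size_t i = 1; i < GetElementCount(); ++i) {
        distance += std::abs(opt_shapes.GetShapeValue(i) - GetShapeValue(i));
      }
      return distance;
    }
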
@@ -2996,13 +2960,14 @@ ModelInstanceState::InitializeShapeInputBinding(
return nullptr;
}

if (input_datatype != TRITONSERVER_TYPE_INT32) {
if ((input_datatype != TRITONSERVER_TYPE_INT32) &&
(input_datatype != TRITONSERVER_TYPE_INT64)) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("unexpected datatype TYPE_") +
TRITONSERVER_DataTypeString(input_datatype) +
" in model configuration for shape input '" + input_name +
"', expecting TYPE_INT32 for " + Name())
" in model configuration for shape input '" + input_name +
"', expecting TYPE_INT32 or TYPE_INT64 for " + Name())
.c_str());
}
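
With this relaxed check, a model configuration can declare a shape tensor input as either TYPE_INT32 or TYPE_INT64. An illustrative config.pbtxt fragment (the tensor name and dims are made up):

    input [
      {
        name: "input_shape"
        data_type: TYPE_INT64
        dims: [ 2 ]
        is_shape_tensor: true
      }
    ]
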

@@ -3042,18 +3007,32 @@
context.nb_shape_values_ = (context.max_dims_[io_index].nbDims == 0)
? 1
: context.max_dims_[io_index].d[0];
context.max_shapes_[io_index] = engine_->getProfileTensorValues(
input_name.c_str(), profile_index, nvinfer1::OptProfileSelector::kMAX);
context.min_shapes_[io_index] = engine_->getProfileTensorValues(
input_name.c_str(), profile_index, nvinfer1::OptProfileSelector::kMIN);
context.opt_shapes_[io_index] = engine_->getProfileTensorValues(
input_name.c_str(), profile_index, nvinfer1::OptProfileSelector::kOPT);
context.max_shapes_[io_index] = ShapeTensor();
context.max_shapes_[io_index].SetDataFromShapeValues(
engine_->getProfileTensorValues(
input_name.c_str(), profile_index,
nvinfer1::OptProfileSelector::kMAX),
input_datatype, context.nb_shape_values_);

context.min_shapes_[io_index] = ShapeTensor();
context.min_shapes_[io_index].SetDataFromShapeValues(
engine_->getProfileTensorValues(
input_name.c_str(), profile_index,
nvinfer1::OptProfileSelector::kMIN),
input_datatype, context.nb_shape_values_);

context.opt_shapes_[io_index] = ShapeTensor();
context.opt_shapes_[io_index].SetDataFromShapeValues(
engine_->getProfileTensorValues(
input_name.c_str(), profile_index,
nvinfer1::OptProfileSelector::kOPT),
input_datatype, context.nb_shape_values_);

// Set shape tensor address to buffer that contains max allowed value so
// later shape inference will return max output shape / size for
// pre-allocation.
if (!context.context_->setInputTensorAddress(
input_name.c_str(), context.max_shapes_[io_index])) {
input_name.c_str(), context.max_shapes_[io_index].GetData())) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("trt failed to set the input shape binding for '") +
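
SetDataFromShapeValues has to bridge a datatype mismatch: nvinfer1::ICudaEngine::getProfileTensorValues() returns int32_t values, while the stored shape tensor may be declared TYPE_INT64. A sketch of that conversion, reusing the hypothetical members (size_, datatype_, data_) from the interface sketch near the top of this diff:

    // Sketch only; the real implementation in shape_tensor.cc may differ.
    TRITONSERVER_Error*
    ShapeTensor::SetDataFromShapeValues(
        const int32_t* shape_values, TRITONSERVER_DataType datatype,
        size_t nb_shape_values)
    {
      datatype_ = datatype;
      if (datatype == TRITONSERVER_TYPE_INT64) {
        // Widen the engine's int32 profile values to int64 storage.
        size_ = nb_shape_values * sizeof(int64_t);
        data_.reset(new char[size_]);
        int64_t* dst = reinterpret_cast<int64_t*>(data_.get());
        for (size_t i = 0; i < nb_shape_values; ++i) {
          dst[i] = static_cast<int64_t>(shape_values[i]);
        }
      } else {
        // INT32: copy the values through unchanged (needs <cstring>).
        size_ = nb_shape_values * sizeof(int32_t);
        data_.reset(new char[size_]);
        std::memcpy(data_.get(), shape_values, size_);
      }
      return nullptr;  // success
    }

Storing the profile values in the tensor's declared datatype keeps the buffer handed to setInputTensorAddress() consistent with what shape inference expects.
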
13 changes: 7 additions & 6 deletions src/instance_state.h
@@ -35,6 +35,7 @@
#include "io_binding_info.h"
#include "model_state.h"
#include "semaphore.h"
#include "shape_tensor.h"
#include "tensorrt_model_instance.h"
#include "triton/backend/backend_input_collector.h"
#include "triton/backend/backend_output_responder.h"
@@ -136,13 +137,13 @@ struct TensorRTContext {
std::vector<nvinfer1::Dims> opt_dims_{};

// Min shape values per bindings
std::vector<const int32_t*> min_shapes_{};
std::vector<ShapeTensor> min_shapes_{};

// Max shape values per bindings
std::vector<const int32_t*> max_shapes_{};
std::vector<ShapeTensor> max_shapes_{};

// Optimized shape values per bindings
std::vector<const int32_t*> opt_shapes_{};
std::vector<ShapeTensor> opt_shapes_{};

// The number of shape values
size_t nb_shape_values_{0};
@@ -333,16 +334,16 @@ class ModelInstanceState : public TensorRTModelInstance {

TRITONSERVER_Error* GetRequestShapeValues(
size_t total_batch_size, TRITONBACKEND_Request* request,
std::map<int, std::vector<int32_t>>* request_shape_values);
std::map<int, ShapeTensor>* request_shape_values);
TRITONSERVER_Error* GetMostOptimizedProfile(
size_t total_batch_size, TRITONBACKEND_Request** requests,
uint32_t request_count,
const std::map<int, std::vector<int32_t>>& request_shape_values,
const std::map<int, ShapeTensor>& request_shape_values,
std::map<int, TensorRTContext>::iterator* citr);
TRITONSERVER_Error* EvaluateTensorRTContext(
std::map<int, TensorRTContext>::iterator& citr, size_t total_batch_size,
TRITONBACKEND_Request** requests, uint32_t request_count,
const std::map<int, std::vector<int32_t>>& request_shape_values,
const std::map<int, ShapeTensor>& request_shape_values,
int64_t* error_distance);

bool SetOutputShapeTensorBuffer(