Commit 77a62f2

add trt_dump_ep_context_model, trt_ep_context_embed_mode, trt_ep_context_compute_capability_enable
chilo-ms committed Nov 23, 2023
1 parent 8f7c7ac commit 77a62f2
Showing 9 changed files with 112 additions and 16 deletions.
include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h
@@ -46,4 +46,7 @@ struct OrtTensorRTProviderOptionsV2 {
const char* trt_profile_max_shapes{nullptr}; // Specify the range of the input shapes to build the engine with
const char* trt_profile_opt_shapes{nullptr}; // Specify the range of the input shapes to build the engine with
int trt_cuda_graph_enable{0}; // Enable CUDA graph in ORT TRT
int trt_dump_ep_context_model{0}; // Dump EP context node model
int trt_ep_context_embed_mode{0}; // Specify EP context embed mode. Default 0 = context is engine cache path, 1 = context is engine binary data
int trt_ep_context_compute_capability_enable{1}; // Add GPU compute capability as an EP context node's attribute
};
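For orientation, here is a minimal usage sketch (not part of this commit) showing the three new fields being set. Only the field names and their meanings are taken from the struct above; the include path and the helper function are illustrative, and wiring the struct into a session through the usual C API calls is assumed rather than shown in this diff.

```cpp
// Hypothetical usage sketch; only the three field names come from this diff.
#include "core/providers/tensorrt/tensorrt_provider_options.h"  // include path assumed

void ConfigureEpContextOptions(OrtTensorRTProviderOptionsV2& trt_options) {
  trt_options.trt_dump_ep_context_model = 1;  // dump the EP context node model
  trt_options.trt_ep_context_embed_mode = 0;  // 0 = context is engine cache path, 1 = engine binary data
  trt_options.trt_ep_context_compute_capability_enable = 1;  // record hardware_arch on the node
}
```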
16 changes: 14 additions & 2 deletions onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc
@@ -83,6 +83,8 @@ ONNX_NAMESPACE::ModelProto* CreateCtxNodeModel(const GraphViewer& graph_viewer,
char* engine_data,
size_t size,
const int64_t embed_mode,
bool compute_capability_enable,
int device_id,
const logging::Logger* logger) {
auto model_build = graph_viewer.CreateModel(*logger);
auto& graph_build = model_build->MainGraph();
@@ -102,6 +104,7 @@ ONNX_NAMESPACE::ModelProto* CreateCtxNodeModel(const GraphViewer& graph_viewer,
// Create EP context node attributes
auto attr_0 = ONNX_NAMESPACE::AttributeProto::Create(); // embed_mode
auto attr_1 = ONNX_NAMESPACE::AttributeProto::Create(); // ep_cache_context
auto attr_2 = ONNX_NAMESPACE::AttributeProto::Create(); // hardware_arch
std::string engine_data_str = "";
attr_0->set_name(EMBED_MODE);
attr_0->set_type(onnx::AttributeProto_AttributeType_INT);
@@ -117,10 +120,19 @@ ONNX_NAMESPACE::ModelProto* CreateCtxNodeModel(const GraphViewer& graph_viewer,
attr_1->set_s(engine_cache_path);
}
auto node_attributes = ONNX_NAMESPACE::NodeAttributes::Create();
int num_attributes = 2;
int num_attributes = compute_capability_enable ? 3 : 2;
node_attributes->reserve(num_attributes);
node_attributes->emplace(EMBED_MODE, *attr_0);
node_attributes->emplace(EP_CACHE_CONTEXT, *attr_1);

if (compute_capability_enable) {
cudaDeviceProp prop;
CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id));
attr_2->set_name(COMPUTE_CAPABILITY);
attr_2->set_type(onnx::AttributeProto_AttributeType_STRING);
attr_2->set_s(GetComputeCapacityString(prop));
node_attributes->emplace(COMPUTE_CAPABILITY, *attr_2);
}

// Create EP context node
graph_build.AddNode(EPCONTEXT_OP, EPCONTEXT_OP, "", inputs, outputs, node_attributes.get(), EPCONTEXT_OP_DOMAIN);
@@ -145,7 +157,7 @@ void DumpCtxNodeModel(ONNX_NAMESPACE::ModelProto* model_proto,
std::string string_buf;
model_proto->SerializeToString(string_buf);

// Dump out EP context node model
// Dump EP context node model to disk
std::fstream dump(engine_cache_path + "_wrapper.onnx", std::ios::out | std::ios::trunc | std::ios::binary);
model_proto->SerializeToOstream(dump);
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + engine_cache_path + "_wrapper.onnx";
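The new compute-capability attribute is derived from cudaGetDeviceProperties, as the hunk above shows. For reference, a stand-alone sketch of one plausible derivation follows; the exact formatting is up to GetComputeCapacityString, which this diff does not show, so the "major.minor" form here is an assumption.

```cpp
// Hedged sketch: derive a compute-capability string for a CUDA device.
// The "major.minor" formatting is an assumption; the commit's
// GetComputeCapacityString may format it differently (e.g. "80" for sm_80).
#include <cuda_runtime_api.h>
#include <string>

std::string ComputeCapabilityOf(int device_id) {
  cudaDeviceProp prop{};
  cudaGetDeviceProperties(&prop, device_id);  // error handling omitted for brevity
  return std::to_string(prop.major) + "." + std::to_string(prop.minor);
}
```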
6 changes: 2 additions & 4 deletions onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h
@@ -12,13 +12,9 @@
namespace onnxruntime {

static const std::string EPCONTEXT_OP = "EPContext";
static const std::string MAIN_CONTEXT = "main_context";
static const std::string EMBED_MODE = "embed_mode";
static const std::string EP_CACHE_CONTEXT = "ep_cache_context";
static const std::string EP_SDK_VER = "ep_sdk_version";
static const std::string COMPUTE_CAPABILITY = "hardware_arch";
static const std::string PARTITION_NAME = "partition_name";
static const std::string SOURCE = "source";
static const std::string EPCONTEXT_OP_DOMAIN = "com.microsoft";

bool GraphHasCtxNode(const GraphViewer& graph_viewer);
@@ -29,6 +25,8 @@ ONNX_NAMESPACE::ModelProto* CreateCtxNodeModel(const GraphViewer& graph_viewer,
char* engine_data,
size_t size,
const int64_t embed_mode,
bool compute_capability_enable,
int device_id,
const logging::Logger* logger);
void DumpCtxNodeModel(ONNX_NAMESPACE::ModelProto* model_proto,
const std::string engine_cache_path);
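Putting the constants above together: an EPContext node has op type EPContext in the com.microsoft domain and carries embed_mode, ep_cache_context and (with this commit) hardware_arch attributes. Below is an illustrative sketch of that shape using the ONNX protobuf C++ API; the attribute values are hypothetical, and the real node is built through ORT's graph APIs as shown in the .cc hunks.

```cpp
// Illustrative sketch of the EPContext node's shape; values are hypothetical.
#include "onnx/onnx_pb.h"

onnx::NodeProto MakeEpContextNodeSketch() {
  onnx::NodeProto node;
  node.set_op_type("EPContext");         // EPCONTEXT_OP
  node.set_domain("com.microsoft");      // EPCONTEXT_OP_DOMAIN

  auto* embed = node.add_attribute();    // EMBED_MODE
  embed->set_name("embed_mode");
  embed->set_type(onnx::AttributeProto::INT);
  embed->set_i(0);                       // 0 = ep_cache_context holds a path

  auto* ctx = node.add_attribute();      // EP_CACHE_CONTEXT
  ctx->set_name("ep_cache_context");
  ctx->set_type(onnx::AttributeProto::STRING);
  ctx->set_s("model_sm80.engine");       // engine cache path when embed_mode == 0

  auto* arch = node.add_attribute();     // COMPUTE_CAPABILITY
  arch->set_name("hardware_arch");
  arch->set_type(onnx::AttributeProto::STRING);
  arch->set_s("80");                     // format determined by GetComputeCapacityString
  return node;
}
```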
52 changes: 44 additions & 8 deletions onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -1308,6 +1308,9 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
profile_max_shapes = info.profile_max_shapes;
profile_opt_shapes = info.profile_opt_shapes;
cuda_graph_enable_ = info.cuda_graph_enable;
dump_ep_context_model_ = info.dump_ep_context_model;
ep_context_embed_mode_ = info.ep_context_embed_mode;
ep_context_compute_capability_enable_ = info.ep_context_compute_capability_enable;
} else {
try {
const std::string max_partition_iterations_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kMaxPartitionIterations);
@@ -1461,6 +1464,22 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
if (!cuda_graph_enable_env.empty()) {
cuda_graph_enable_ = (std::stoi(cuda_graph_enable_env) == 0 ? false : true);
}

const std::string dump_ep_context_model_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDumpEpContextModel);
if (!dump_ep_context_model_env.empty()) {
dump_ep_context_model_ = (std::stoi(dump_ep_context_model_env) == 0 ? false : true);
}

const std::string ep_context_embed_mode_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kEpContextEmbedMode);
if (!ep_context_embed_mode_env.empty()) {
ep_context_embed_mode_ = std::stoi(ep_context_embed_mode_env);
}

const std::string ep_context_compute_capability_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kEpContextComputeCapabilityEnable);
if (!ep_context_compute_capability_env.empty()) {
ep_context_compute_capability_enable_ = (std::stoi(ep_context_compute_capability_env) == 0 ? false : true);
}

} catch (const std::invalid_argument& ex) {
LOGS_DEFAULT(WARNING) << "[TensorRT EP] Invalid Argument (from environment variables): " << ex.what();
} catch (const std::out_of_range& ex) {
@@ -2978,11 +2997,12 @@ Status TensorrtExecutionProvider::CreateNodeComputeFromGraph(const GraphViewer&
CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_));
std::string compute_capability = GetComputeCapacity(prop);
const std::string cache_path = GetCachePath(cache_path_, trt_node_name_with_precision);
const std::string cache_path_prefix = cache_path + "_sm" + compute_capability;
const std::string engine_cache_path = cache_path_prefix + ".engine";
const std::string encrypted_engine_cache_path = engine_cache_path + ".encrypted";
const std::string profile_cache_path = cache_path_prefix + ".profile";

if (!has_dynamic_shape) {
const std::string engine_cache_path = cache_path + "_sm" + compute_capability + ".engine";
const std::string encrypted_engine_cache_path = engine_cache_path + ".encrypted";
const std::string profile_cache_path = cache_path + "_sm" + compute_capability + ".profile";
std::string timing_cache_path = "";
bool engine_update = false;
if (timing_cache_enable_) {
@@ -3123,8 +3143,10 @@ Status TensorrtExecutionProvider::CreateNodeComputeFromGraph(const GraphViewer&
reinterpret_cast<char*>(serialized_engine->data()),
serialized_engine->size(),
ep_context_embed_mode_,
ep_context_compute_capability_enable_,
device_id_,
GetLogger())};
DumpCtxNodeModel(model_proto.get(), cache_path + "_sm" + compute_capability);
DumpCtxNodeModel(model_proto.get(), cache_path_prefix);
}
}
}
@@ -3180,8 +3202,21 @@ Status TensorrtExecutionProvider::CreateNodeComputeFromGraph(const GraphViewer&
input_shape_ranges_[fused_node.Name()] = input_implicit_shape_ranges;
profiles_.emplace(fused_node.Name(), std::move(trt_profiles));

// For a model with dynamic shape inputs, TRT EP first creates a model proto that contains the inputs, outputs and an empty engine.
// TRT EP then serializes the model at inference time, because the engine can be updated and the updated engine should be included in the model.
// However, if embed_mode is 0 (the context holds only the engine cache path), TRT EP serializes it here.
if (dump_ep_context_model_ && has_dynamic_shape) {
model_proto_.reset(CreateCtxNodeModel(graph_body_viewer, cache_path + "_sm" + compute_capability, nullptr, 0, ep_context_embed_mode_, GetLogger()));
model_proto_.reset(CreateCtxNodeModel(graph_body_viewer,
engine_cache_path,
nullptr,
0,
ep_context_embed_mode_,
ep_context_compute_capability_enable_,
device_id_,
GetLogger()));
if (ep_context_embed_mode_ == 0) {
DumpCtxNodeModel(model_proto_.get(), cache_path_prefix);
}
}

// Create function state
@@ -3259,9 +3294,10 @@ Status TensorrtExecutionProvider::CreateNodeComputeFromGraph(const GraphViewer&

// Prepare cache name
const std::string cache_path = GetCachePath(trt_state->engine_cache_path, trt_state->trt_node_name_with_precision);
const std::string engine_cache_path = cache_path + "_sm" + compute_capability + ".engine";
const std::string cache_path_prefix = cache_path + "_sm" + compute_capability;
const std::string engine_cache_path = cache_path_prefix + ".engine";
const std::string encrypted_engine_cache_path = engine_cache_path + ".encrypted";
const std::string profile_cache_path = cache_path + "_sm" + compute_capability + ".profile";
const std::string profile_cache_path = cache_path_prefix + ".profile";
std::string timing_cache_path = "";
if (timing_cache_enable_) {
timing_cache_path = GetTimingCachePath(global_cache_path_, prop);
@@ -3497,7 +3533,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeFromGraph(const GraphViewer&

if (dump_ep_context_model_ && ep_context_embed_mode_) {
UpdateCtxNodeModelEngineContext(model_proto_.get(), reinterpret_cast<char*>(serialized_engine->data()), serialized_engine->size());
DumpCtxNodeModel(model_proto_.get(), cache_path + "_sm" + compute_capability);
DumpCtxNodeModel(model_proto_.get(), cache_path_prefix);
}
context_update = true;
}
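A side effect of this change is that the cache-file names are now built once from cache_path_prefix instead of repeating the cache_path + "_sm" + compute_capability concatenation. A small sketch of the resulting names, with hypothetical inputs; the suffixes come from this diff, and "_wrapper.onnx" is the one DumpCtxNodeModel appends.

```cpp
// Sketch of the cache-file naming consolidated behind cache_path_prefix.
// The input values are hypothetical; the suffixes come from this diff.
#include <string>

void CacheNamingSketch() {
  const std::string cache_path = "trt_cache/model_fp16";  // hypothetical GetCachePath() result
  const std::string compute_capability = "80";            // hypothetical, e.g. an sm_80 GPU
  const std::string cache_path_prefix = cache_path + "_sm" + compute_capability;
  const std::string engine_cache_path = cache_path_prefix + ".engine";
  const std::string profile_cache_path = cache_path_prefix + ".profile";
  const std::string ctx_model_path = cache_path_prefix + "_wrapper.onnx";  // DumpCtxNodeModel output
}
```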
onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
@@ -46,6 +46,9 @@ static const std::string kProfilesMinShapes = "ORT_TENSORRT_PROFILE_MIN_SHAPES";
static const std::string kProfilesMaxShapes = "ORT_TENSORRT_PROFILE_MAX_SHAPES";
static const std::string kProfilesOptShapes = "ORT_TENSORRT_PROFILE_OPT_SHAPES";
static const std::string kCudaGraphEnable = "ORT_TENSORRT_CUDA_GRAPH_ENABLE";
static const std::string kDumpEpContextModel = "ORT_DUMP_EP_CONTEXT_MODEL";
static const std::string kEpContextEmbedMode = "ORT_EP_CONTEXT_EMBED_MODE";
static const std::string kEpContextComputeCapabilityEnable = "ORT_EP_CONTEXT_COMPUTE_CAPABILITY_ENABLE";
// Old env variable for backward compatibility
static const std::string kEngineCachePath = "ORT_TENSORRT_ENGINE_CACHE_PATH";
} // namespace tensorrt_env_vars
@@ -315,8 +318,9 @@ class TensorrtExecutionProvider : public IExecutionProvider {
OrtAllocator* alloc_ = nullptr;

// For creating/dumping the EP context node model
bool dump_ep_context_model_ = true;
int ep_context_embed_mode_ = 1;
bool dump_ep_context_model_ = false;
int ep_context_embed_mode_ = 0;
bool ep_context_compute_capability_enable_ = true;
std::unique_ptr<ONNX_NAMESPACE::ModelProto> model_proto_ = ONNX_NAMESPACE::ModelProto::Create();

std::unordered_set<std::string> control_flow_op_set_ = {"If", "Loop", "Scan"};
onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc
@@ -46,6 +46,9 @@ constexpr const char* kProfilesMinShapes = "trt_profile_min_shapes";
constexpr const char* kProfilesMaxShapes = "trt_profile_max_shapes";
constexpr const char* kProfilesOptShapes = "trt_profile_opt_shapes";
constexpr const char* kCudaGraphEnable = "trt_cuda_graph_enable";
constexpr const char* kDumpEpContextModel = "trt_dump_ep_context_model";
constexpr const char* kEpContextEmbedMode = "trt_ep_context_embed_mode";
constexpr const char* kEpContextComputeCapabilityEnable = "trt_ep_context_compute_capability_enable";
} // namespace provider_option_names
} // namespace tensorrt

@@ -97,6 +100,9 @@ TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions
.AddAssignmentToReference(tensorrt::provider_option_names::kProfilesMaxShapes, info.profile_max_shapes)
.AddAssignmentToReference(tensorrt::provider_option_names::kProfilesOptShapes, info.profile_opt_shapes)
.AddAssignmentToReference(tensorrt::provider_option_names::kCudaGraphEnable, info.cuda_graph_enable)
.AddAssignmentToReference(tensorrt::provider_option_names::kDumpEpContextModel, info.dump_ep_context_model)
.AddAssignmentToReference(tensorrt::provider_option_names::kEpContextEmbedMode, info.ep_context_embed_mode)
.AddAssignmentToReference(tensorrt::provider_option_names::kEpContextComputeCapabilityEnable, info.ep_context_compute_capability_enable)
.Parse(options)); // add new provider option here.

return info;
@@ -138,6 +144,9 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const TensorrtE
{tensorrt::provider_option_names::kProfilesMaxShapes, MakeStringWithClassicLocale(info.profile_max_shapes)},
{tensorrt::provider_option_names::kProfilesOptShapes, MakeStringWithClassicLocale(info.profile_opt_shapes)},
{tensorrt::provider_option_names::kCudaGraphEnable, MakeStringWithClassicLocale(info.cuda_graph_enable)},
{tensorrt::provider_option_names::kDumpEpContextModel, MakeStringWithClassicLocale(info.dump_ep_context_model)},
{tensorrt::provider_option_names::kEpContextEmbedMode, MakeStringWithClassicLocale(info.ep_context_embed_mode)},
{tensorrt::provider_option_names::kEpContextComputeCapabilityEnable, MakeStringWithClassicLocale(info.ep_context_compute_capability_enable)},
};
return options;
}
@@ -188,6 +197,9 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const OrtTensor
{tensorrt::provider_option_names::kProfilesMaxShapes, kProfilesMaxShapes_},
{tensorrt::provider_option_names::kProfilesOptShapes, kProfilesOptShapes_},
{tensorrt::provider_option_names::kCudaGraphEnable, MakeStringWithClassicLocale(info.trt_cuda_graph_enable)},
{tensorrt::provider_option_names::kDumpEpContextModel, MakeStringWithClassicLocale(info.trt_dump_ep_context_model)},
{tensorrt::provider_option_names::kEpContextEmbedMode, MakeStringWithClassicLocale(info.trt_ep_context_embed_mode)},
{tensorrt::provider_option_names::kEpContextComputeCapabilityEnable, MakeStringWithClassicLocale(info.trt_ep_context_compute_capability_enable)},
};
return options;
}
@@ -279,5 +291,8 @@ void TensorrtExecutionProviderInfo::UpdateProviderOptions(void* provider_options
trt_provider_options_v2.trt_profile_opt_shapes = copy_string_if_needed(internal_options.profile_opt_shapes);

trt_provider_options_v2.trt_cuda_graph_enable = internal_options.cuda_graph_enable;
trt_provider_options_v2.trt_dump_ep_context_model = internal_options.dump_ep_context_model;
trt_provider_options_v2.trt_ep_context_embed_mode = internal_options.ep_context_embed_mode;
trt_provider_options_v2.trt_ep_context_compute_capability_enable = internal_options.ep_context_compute_capability_enable;
}
} // namespace onnxruntime
onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h
@@ -51,6 +51,9 @@ struct TensorrtExecutionProviderInfo {
std::string profile_max_shapes{""};
std::string profile_opt_shapes{""};
bool cuda_graph_enable{false};
bool dump_ep_context_model{false};
int ep_context_embed_mode{0};
bool ep_context_compute_capability_enable{true};

static TensorrtExecutionProviderInfo FromProviderOptions(const ProviderOptions& options);
static ProviderOptions ToProviderOptions(const TensorrtExecutionProviderInfo& info);
onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc
@@ -116,6 +116,9 @@ struct Tensorrt_Provider : Provider {
info.profile_max_shapes = options.trt_profile_max_shapes == nullptr ? "" : options.trt_profile_max_shapes;
info.profile_opt_shapes = options.trt_profile_opt_shapes == nullptr ? "" : options.trt_profile_opt_shapes;
info.cuda_graph_enable = options.trt_cuda_graph_enable != 0;
info.dump_ep_context_model = options.trt_dump_ep_context_model != 0;
info.ep_context_embed_mode = options.trt_ep_context_embed_mode;
info.ep_context_compute_capability_enable = options.trt_ep_context_compute_capability_enable != 0;

return std::make_shared<TensorrtProviderFactory>(info);
}
22 changes: 22 additions & 0 deletions onnxruntime/python/onnxruntime_pybind_state.cc
@@ -717,6 +717,28 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
} else {
ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_cuda_graph_enable' should be 'True' or 'False'. Default value is 'False'.\n");
}
} else if (option.first == "trt_dump_ep_context_model") {
if (option.second == "True" || option.second == "true") {
params.trt_dump_ep_context_model = true;
} else if (option.second == "False" || option.second == "false") {
params.trt_dump_ep_context_model = false;
} else {
ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_dump_ep_context_model' should be 'True' or 'False'. Default value is 'False'.\n");
}
} else if (option.first == "trt_ep_context_embed_mode") {
if (!option.second.empty()) {
params.trt_ep_context_embed_mode = std::stoi(option.second);
} else {
ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_ep_context_embed_mode' should be a positive integer number i.e. '1'.\n");
}
} else if (option.first == "trt_ep_context_compute_capability_enable") {
if (option.second == "True" || option.second == "true") {
params.trt_ep_context_compute_capability_enable = true;
} else if (option.second == "False" || option.second == "false") {
params.trt_ep_context_compute_capability_enable = false;
} else {
ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_ep_context_compute_capability_enable' should be 'True' or 'False'. Default value is 'False'.\n");
}
} else {
ORT_THROW("Invalid TensorRT EP option: ", option.first);
}
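From the Python side these options arrive as strings, so the parsing above accepts 'True'/'true' and 'False'/'false' for the two booleans and an integer string for the embed mode. A sketch of the key/value pairs the code consumes; the keys and accepted literals come from this diff, while the container choice is illustrative.

```cpp
// Illustrative map of the string-valued options parsed above.
#include <string>
#include <unordered_map>

const std::unordered_map<std::string, std::string> kTrtEpContextOptions = {
    {"trt_dump_ep_context_model", "True"},                 // 'True'/'true' or 'False'/'false'
    {"trt_ep_context_embed_mode", "1"},                    // parsed with std::stoi
    {"trt_ep_context_compute_capability_enable", "False"}, // 'True'/'true' or 'False'/'false'
};
```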
