From 9b9f881475a12991d0abba0095c26dad8a4de5e9 Mon Sep 17 00:00:00 2001
From: Chi Lo <54722500+chilo-ms@users.noreply.github.com>
Date: Tue, 3 Dec 2024 21:58:43 -0800
Subject: [PATCH] [TensorRT EP] Use TRT/CUDA/ORT version from runtime instead
 of build time to generate hash value (#22921)

Use the TensorRT and CUDA versions fetched at **runtime** to generate the
hash value that determines the engine cache name.

Previously, the versions were captured at compile/build time, which can
cause problems in some cases: TRT EP records the TRT version it was built
against, but users can switch to a different TRT version at run time.
Because TRT EP keeps checking the "fixed" build-time version rather than
the version actually in use, it can end up loading an incompatible TRT
engine cache. See the GitHub issue here:
https://github.com/microsoft/onnxruntime/issues/22382#issuecomment-2404140754
---
 .../providers/tensorrt/tensorrt_execution_provider.cc  |  6 ++++--
 .../providers/tensorrt/tensorrt_execution_provider.h   |  1 +
 .../tensorrt/tensorrt_execution_provider_utils.h       |  7 +++----
 .../test/providers/tensorrt/tensorrt_basic_test.cc     | 10 +++++++---
 4 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index d979d53347c4f..1b432dad44263 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -1726,8 +1726,10 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
   }
 
   trt_version_ = getInferLibVersion();
+  CUDA_CALL_THROW(cudaRuntimeGetVersion(&cuda_version_));
 
   LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] TensorRT version is " << trt_version_;
+  LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] CUDA version is " << cuda_version_;
 
   LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] TensorRT provider options: "
                         << "device_id: " << device_id_
@@ -2466,13 +2468,13 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph,
   // So, simply return the ComputeCapability here.
   if (graph.NumberOfNodes() == 1 && GraphHasCtxNode(graph)) {
     SubGraph_t supported_node_vector = {{0}, true};
-    std::unique_ptr<IndexedSubGraph> sub_graph = GetSubGraph(supported_node_vector, graph, TRTGenerateId(graph), 0);
+    std::unique_ptr<IndexedSubGraph> sub_graph = GetSubGraph(supported_node_vector, graph, TRTGenerateId(graph, std::to_string(trt_version_), std::to_string(cuda_version_)), 0);
     result.push_back(ComputeCapability::Create(std::move(sub_graph)));
     return result;
   }
 
   // Generate unique kernel name for TRT graph
-  HashValue model_hash = TRTGenerateId(graph);
+  HashValue model_hash = TRTGenerateId(graph, std::to_string(trt_version_), std::to_string(cuda_version_));
 
   // Get supported node list from TensorRT parser
   const int number_of_ort_nodes = graph.NumberOfNodes();
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
index 9e3a03417d917..d3e0b0fba8891 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
@@ -333,6 +333,7 @@ class TensorrtExecutionProvider : public IExecutionProvider {
 
   // The format is as for TENSORRT_VERSION: (MAJOR * 100 + MINOR) * 100 + PATCH
   int32_t trt_version_;
+  int32_t cuda_version_;
 
   // The OrtAllocator object will be get during ep compute time
   // and should be kept for the lifetime of TRT EP object.
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h
index 95abcd1bad2b8..5a7b135fd92cd 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h
@@ -520,7 +520,7 @@ void RemoveCachesByType(const std::string& root, std::string file_extension) {
  * compiled kernels, so the name must be unique and deterministic across models and sessions.
  * */
-HashValue TRTGenerateId(const GraphViewer& graph_viewer) {
+HashValue TRTGenerateId(const GraphViewer& graph_viewer, std::string trt_version, std::string cuda_version) {
   HashValue model_hash = 0;
 
   // find the top level graph
@@ -583,12 +583,11 @@ HashValue TRTGenerateId(const GraphViewer& graph_viewer) {
 #endif
 
 #ifdef CUDA_VERSION
-  hash_str(std::to_string(CUDA_VERSION));
+  hash_str(cuda_version);
 #endif
 
 #if defined(NV_TENSORRT_MAJOR) && defined(NV_TENSORRT_MINOR)
-  std::string TRT_VERSION = std::to_string(NV_TENSORRT_MAJOR) + "." + std::to_string(NV_TENSORRT_MINOR);
-  hash_str(TRT_VERSION);
+  hash_str(trt_version);
 #endif
 
   model_hash = hash[0] | (uint64_t(hash[1]) << 32);
diff --git a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc
index 63327a028c6f4..0022d7fc0e184 100644
--- a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc
+++ b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc
@@ -342,8 +342,12 @@ TEST(TensorrtExecutionProviderTest, TRTModelIdGeneratorUsingModelHashing) {
   Graph& graph = model->MainGraph();
   GraphViewer viewer(graph);
 
+  std::string trt_version = std::to_string(NV_TENSORRT_MAJOR) + "." + std::to_string(NV_TENSORRT_MINOR);
+  std::string cuda_version = std::to_string(CUDA_VERSION);
+  std::string ort_version = ORT_VERSION;
+
   // get the hash for the model when loaded from file
-  HashValue model_hash = TRTGenerateId(viewer);
+  HashValue model_hash = TRTGenerateId(viewer, trt_version, cuda_version);
   ASSERT_NE(model_hash, 0);
 
   // now load the model from bytes and check the hash differs
@@ -358,7 +362,7 @@ TEST(TensorrtExecutionProviderTest, TRTModelIdGeneratorUsingModelHashing) {
   // Test loading same model from file and byte steam. Hash values should be different
   Graph& graph2 = model2->MainGraph();
   GraphViewer viewer2(graph2);
-  HashValue model_hash2 = TRTGenerateId(viewer2);
+  HashValue model_hash2 = TRTGenerateId(viewer2, trt_version, cuda_version);
   ASSERT_NE(model_hash, model_hash2);
 
   // Test loading same model from different path, see if hash values are same as well
@@ -367,7 +371,7 @@ TEST(TensorrtExecutionProviderTest, TRTModelIdGeneratorUsingModelHashing) {
   ASSERT_TRUE(Model::Load(model_path, model3, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK());
   Graph& graph3 = model3->MainGraph();
   GraphViewer viewer3(graph3);
-  HashValue model_hash3 = TRTGenerateId(viewer3);
+  HashValue model_hash3 = TRTGenerateId(viewer3, trt_version, cuda_version);
   ASSERT_EQ(model_hash, model_hash3) << "model 1&3 are same models and they have same hash, no matter where they are loaded";
 }
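
Note for reviewers (not part of the patch): a minimal standalone sketch of the two runtime queries the change relies on, `getInferLibVersion()` from TensorRT and `cudaRuntimeGetVersion()` from the CUDA runtime. The `main()` harness and the plain error handling are mine for illustration; the EP itself wraps the CUDA call in `CUDA_CALL_THROW`. It assumes TensorRT and the CUDA runtime are installed and the binary links against nvinfer and cudart.

```cpp
// Query the TRT/CUDA versions actually loaded at runtime. The encodings
// match the comments in the EP header:
//   getInferLibVersion()    -> (MAJOR * 100 + MINOR) * 100 + PATCH
//   cudaRuntimeGetVersion() ->  MAJOR * 1000 + MINOR * 10
#include <NvInfer.h>           // declares getInferLibVersion()
#include <cuda_runtime_api.h>  // declares cudaRuntimeGetVersion()
#include <iostream>
#include <string>

int main() {
  // Version of the TensorRT library loaded now, not the one built against
  // (e.g. 100300 for TensorRT 10.3.0).
  int32_t trt_version = getInferLibVersion();

  // Version of the CUDA runtime in use (e.g. 12060 for CUDA 12.6).
  int cuda_version = 0;
  if (cudaRuntimeGetVersion(&cuda_version) != cudaSuccess) {
    std::cerr << "failed to query CUDA runtime version\n";
    return 1;
  }

  // These are the strings the patched TRTGenerateId() mixes into the model
  // hash, so swapping TRT or CUDA in place yields a new hash and cache name.
  std::cout << "trt_version: " << std::to_string(trt_version)
            << ", cuda_version: " << std::to_string(cuda_version) << std::endl;
  return 0;
}
```

Because these values come from the loaded libraries rather than from `NV_TENSORRT_MAJOR`/`CUDA_VERSION` macros fixed at build time, upgrading TRT or CUDA underneath an existing ORT build changes the hash and forces a fresh engine cache instead of silently reusing an incompatible one.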