diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json
index 56abe33989d7c..eb74178b3e032 100644
--- a/cgmanifests/generated/cgmanifest.json
+++ b/cgmanifests/generated/cgmanifest.json
@@ -216,7 +216,7 @@
"component": {
"type": "git",
"git": {
- "commitHash": "bacfaaa951653cd4e72efe727a543567cb38f7de",
+ "commitHash": "06adf4461ac84035bee658c6cf5df39f7ab6071d",
"repositoryUrl": "https://github.com/onnx/onnx-tensorrt.git"
},
"comments": "onnx_tensorrt"
diff --git a/cmake/deps.txt b/cmake/deps.txt
index b9511af4664ce..d213b09034f02 100644
--- a/cmake/deps.txt
+++ b/cmake/deps.txt
@@ -38,7 +38,7 @@ mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01
neural_speed;https://github.com/intel/neural-speed/archive/refs/tags/v0.3.zip;5ec64e3071edc7347ebd8a81679cf06e2bb9b851
onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.16.0.zip;a6d8b619459fb4657f8bec7d1c6d95ad6d4c069d
#use the latest commit of 10.0-GA
-onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/eb43908b02a296ea0594432f06e9d3fac288d672.zip;94d07871810a36a5bc70a1def5c50504101c9bd1
+onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/06adf4461ac84035bee658c6cf5df39f7ab6071d.zip;46dceef659d75d276e7914a8057c2282269d5e7b
protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa
protoc_win64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip;b4521f7ada5b260380f94c4bd7f1b7684c76969a
protoc_win32;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win32.zip;3688010318192c46ce73213cdfb6b3e5656da874
diff --git a/cmake/deps_update_and_upload.py b/cmake/deps_update_and_upload.py
index 63df3f6f03869..c11ed95ca3122 100644
--- a/cmake/deps_update_and_upload.py
+++ b/cmake/deps_update_and_upload.py
@@ -6,9 +6,9 @@
#
# Run without --do-upload once to verify downloading. Use --do-upload when you are ready to publish.
# E.g.:
-# python cmake/deps_update_and_upload.py --root-path C:/temp/onnxruntime_deps --version 1.0.82
+# python cmake/deps_update_and_upload.py --root-path C:/temp/onnxruntime_deps --version 1.0.164
# # check contents of C:/temp/onnxruntime_deps
-# python cmake/deps_update_and_upload.py --root-path C:/temp/onnxruntime_deps --version 1.0.82 --no-download --do-upload
+# python cmake/deps_update_and_upload.py --root-path C:/temp/onnxruntime_deps --version 1.0.164 --no-download --do-upload
#
# Next, update the version number in tools/ci_build/github/azure-pipelines/templates/download-deps.yml.
diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md
index fc559411df190..da959740a4e23 100644
--- a/docs/ContribOperators.md
+++ b/docs/ContribOperators.md
@@ -1597,6 +1597,8 @@ This version of the operator has been available since version 1 of the 'com.micr
Usually each single EPContext associate with a graph partition.But for some case like QNN, it has single EPContext contains all partitions.In that case, the node with ep_cache_context should set main_context=1. Other nodes set main_context=0 and skip ep_cache_context.The path is relative to this Onnx file. Default is 1.
notes : string
(Optional) Some notes for the model
+onnx_model_filename : string
+(Optional) Filename of the original ONNX model.
partition_name : string
(Optional) partitioned graph name.
source : string
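
For illustration, a minimal sketch (not part of this change) of reading the new attribute back from a dumped EP context model with the onnx Python package; the file name "model_ctx.onnx" is a placeholder.

import onnx

ctx_model = onnx.load("model_ctx.onnx")  # placeholder path to a dumped EP context model
for node in ctx_model.graph.node:
    if node.op_type == "EPContext" and node.domain == "com.microsoft":
        attrs = {a.name: a for a in node.attribute}
        if "onnx_model_filename" in attrs:
            # Filename of the original ONNX model that holds the weights used for refitting.
            print(attrs["onnx_model_filename"].s.decode("utf-8"))
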
diff --git a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h
index 32a9f06464ace..d008058821be3 100644
--- a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h
+++ b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h
@@ -64,10 +64,21 @@ struct OrtTensorRTProviderOptionsV2 {
* - if "trt_engine_cache_path" is "" -> the engine cache will be saved to "./context_model_dir"
* - if "trt_engine_cache_path" is "engine_dir" -> the engine cache will be saved to "./context_model_dir/engine_dir"
*
+ * 3. In the case of building weight-stripped engines, the same security reasons as listed in 1) apply to the
+ * "onnx_model_filename" node attribute of the EP context node, which contains the filename of the ONNX model that
+ * holds the weights needed for the refit process. Users can specify a folder path relative to the current working
+ * directory by means of the "trt_onnx_model_folder_path" option.
+ *
*/
- int trt_dump_ep_context_model{0}; // Dump EP context node model
- const char* trt_ep_context_file_path{nullptr}; // Specify file name to dump EP context node model. Can be a path or a file name or a file name with path.
- int trt_ep_context_embed_mode{0}; // Specify EP context embed mode. Default 0 = context is engine cache path, 1 = context is engine binary data
+ int trt_dump_ep_context_model{0}; // Dump EP context node model
+ const char* trt_ep_context_file_path{nullptr}; // Specify file name to dump EP context node model. Can be a path or a file name or a file name with path.
+ int trt_ep_context_embed_mode{0}; // Specify EP context embed mode. Default 0 = context is engine cache path, 1 = context is engine binary data
+ int trt_weight_stripped_engine_enable{0}; // Enable weight-stripped engine build. Default 0 = false,
+ // nonzero = true
+ const char* trt_onnx_model_folder_path{nullptr}; // Folder path relative to the current working directory for
+ // the ONNX model containing the weights (applicable only when
+ // the "trt_weight_stripped_engine_enable" option is enabled)
const char* trt_engine_cache_prefix{nullptr}; // specify engine cache prefix
+ int trt_engine_hw_compatible{0}; // Enable hardware compatibility. Default 0 = false, nonzero = true
};
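
For reference, a minimal sketch of how the new options might be set from Python (model and folder names are placeholders; the keys match the provider options parsed in onnxruntime_pybind_state.cc further below):

import onnxruntime as ort

trt_options = {
    "trt_engine_cache_enable": "True",
    "trt_weight_stripped_engine_enable": "True",  # build/load weight-stripped engines
    "trt_onnx_model_folder_path": "weights_dir",  # relative folder holding the weight-full ONNX model
    "trt_engine_hw_compatible": "True",           # Ampere+ hardware-compatible engine
}
session = ort.InferenceSession(
    "model.onnx",  # placeholder
    providers=[("TensorrtExecutionProvider", trt_options)],
)
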
diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu
index 360c0aacd9c7a..39ce6aec90e1a 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu
@@ -17,6 +17,7 @@
// Licensed under the MIT License.
#include
+#include
#include
#include
#include
diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
index 5cf1818bbf9e8..7604e5ab2fe15 100644
--- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
@@ -3299,6 +3299,11 @@ void RegisterContribSchemas() {
"(Optional) SDK version used to convert the model.",
AttributeProto::STRING,
OPTIONAL_VALUE)
+ .Attr(
+ "onnx_model_filename",
+ "(Optional) Filename of the original ONNX model.",
+ AttributeProto::STRING,
+ OPTIONAL_VALUE)
.Attr(
"hardware_architecture",
"(Optional) Hardware architecture.",
diff --git a/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc
index 1994d1f5ab0b8..2171ce056e029 100644
--- a/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc
+++ b/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc
@@ -8,8 +8,10 @@
#include "onnx_ctx_model_helper.h"
#include "core/providers/cuda/shared_inc/cuda_call.h"
#include "core/framework/execution_provider.h"
+#include "tensorrt_execution_provider.h"
namespace onnxruntime {
+extern TensorrtLogger& GetTensorrtLogger(bool verbose_log);
/*
* Check whether the graph has the EP context contrib op.
@@ -67,7 +69,8 @@ ONNX_NAMESPACE::ModelProto* CreateCtxModel(const GraphViewer& graph_viewer,
char* engine_data,
size_t size,
const int64_t embed_mode,
- std::string compute_capability,
+ const std::string compute_capability,
+ const std::string onnx_model_path,
const logging::Logger* logger) {
auto model_build = graph_viewer.CreateModel(*logger);
auto& graph_build = model_build->MainGraph();
@@ -88,6 +91,7 @@ ONNX_NAMESPACE::ModelProto* CreateCtxModel(const GraphViewer& graph_viewer,
auto attr_0 = ONNX_NAMESPACE::AttributeProto::Create(); // embed_mode
auto attr_1 = ONNX_NAMESPACE::AttributeProto::Create(); // ep_cache_context
auto attr_2 = ONNX_NAMESPACE::AttributeProto::Create(); // hardware_architecture
+ auto attr_3 = ONNX_NAMESPACE::AttributeProto::Create(); // onnx_model_filename
std::string engine_data_str = "";
attr_0->set_name(EMBED_MODE);
attr_0->set_type(onnx::AttributeProto_AttributeType_INT);
@@ -106,13 +110,17 @@ ONNX_NAMESPACE::ModelProto* CreateCtxModel(const GraphViewer& graph_viewer,
attr_2->set_name(COMPUTE_CAPABILITY);
attr_2->set_type(onnx::AttributeProto_AttributeType_STRING);
attr_2->set_s(compute_capability);
+ attr_3->set_name(ONNX_MODEL_FILENAME);
+ attr_3->set_type(onnx::AttributeProto_AttributeType_STRING);
+ attr_3->set_s(std::filesystem::path(onnx_model_path).filename().string());
auto node_attributes = ONNX_NAMESPACE::NodeAttributes::Create();
- int num_attributes = 3;
+ constexpr int num_attributes = 4;
node_attributes->reserve(num_attributes);
node_attributes->emplace(EMBED_MODE, *attr_0);
node_attributes->emplace(EP_CACHE_CONTEXT, *attr_1);
node_attributes->emplace(COMPUTE_CAPABILITY, *attr_2);
+ node_attributes->emplace(ONNX_MODEL_FILENAME, *attr_3);
// Create EP context node
graph_build.AddNode(EPCONTEXT_OP, EPCONTEXT_OP, "", inputs, outputs, node_attributes.get(), EPCONTEXT_OP_DOMAIN);
@@ -205,7 +213,7 @@ void DumpCtxModel(ONNX_NAMESPACE::ModelProto* model_proto,
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Dumped " + ctx_model_path;
}
-bool IsAbsolutePath(std::string& path_string) {
+bool IsAbsolutePath(const std::string& path_string) {
#ifdef _WIN32
onnxruntime::PathString ort_path_string = onnxruntime::ToPathString(path_string);
auto path = std::filesystem::path(ort_path_string.c_str());
@@ -219,7 +227,7 @@ bool IsAbsolutePath(std::string& path_string) {
}
// Like "../file_path"
-bool IsRelativePathToParentPath(std::string& path_string) {
+bool IsRelativePathToParentPath(const std::string& path_string) {
#ifdef _WIN32
onnxruntime::PathString ort_path_string = onnxruntime::ToPathString(path_string);
auto path = std::filesystem::path(ort_path_string.c_str());
@@ -236,6 +244,28 @@ bool IsRelativePathToParentPath(std::string& path_string) {
#endif
}
+/*
+ * Get the weight-refitted engine cache path from a weight-stripped engine cache path
+ *
+ * Weight-stripped engine:
+ * An engine with its weights stripped out; it is smaller than a regular engine.
+ * The cache name of a weight-stripped engine is TensorrtExecutionProvider_TRTKernel_XXXXX.stripped.engine
+ *
+ * Weight-refitted engine:
+ * An engine whose weights have been refitted; it is simply a regular engine.
+ * The cache name of a weight-refitted engine is TensorrtExecutionProvider_TRTKernel_XXXXX.engine
+ */
+std::string GetWeightRefittedEnginePath(std::string stripped_engine_cache) {
+ std::filesystem::path stripped_engine_cache_path(stripped_engine_cache);
+ std::string refitted_engine_cache_path = stripped_engine_cache_path.stem().stem().string() + ".engine";
+ return refitted_engine_cache_path;
+}
+
+bool IsWeightStrippedEngineCache(std::filesystem::path& engine_cache_path) {
+ // The weight-stripped engine cache has the naming of xxx.stripped.engine
+ return engine_cache_path.stem().extension().string() == ".stripped";
+}
+
Status TensorRTCacheModelHandler::GetEpContextFromGraph(const GraphViewer& graph_viewer) {
if (!ValidateEPCtxNode(graph_viewer)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "It's not a valid EP Context node");
@@ -271,6 +301,22 @@ Status TensorRTCacheModelHandler::GetEpContextFromGraph(const GraphViewer& graph
// The engine cache and context model (current model) should be in the same directory
std::filesystem::path ctx_model_dir(GetPathOrParentPathOfCtxModel(ep_context_model_path_));
auto engine_cache_path = ctx_model_dir.append(cache_path);
+ LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] GetEpContextFromGraph engine_cache_path: " + engine_cache_path.string();
+
+ // If it's a weight-stripped engine cache, it needs to be refitted even though the refit flag is not enabled
+ if (!weight_stripped_engine_refit_) {
+ weight_stripped_engine_refit_ = IsWeightStrippedEngineCache(engine_cache_path);
+ }
+
+ // If the serialized refitted engine is present, use it directly without refitting the engine again
+ if (weight_stripped_engine_refit_) {
+ const std::filesystem::path refitted_engine_cache_path = GetWeightRefittedEnginePath(engine_cache_path.string());
+ if (std::filesystem::exists(refitted_engine_cache_path)) {
+ LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] " + refitted_engine_cache_path.string() + " exists.";
+ engine_cache_path = refitted_engine_cache_path.string();
+ weight_stripped_engine_refit_ = false;
+ }
+ }
if (!std::filesystem::exists(engine_cache_path)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
@@ -290,6 +336,21 @@ Status TensorRTCacheModelHandler::GetEpContextFromGraph(const GraphViewer& graph
"TensorRT EP could not deserialize engine from cache: " + engine_cache_path.string());
}
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + engine_cache_path.string();
+
+ if (weight_stripped_engine_refit_) {
+ const std::string onnx_model_filename = attrs.at(ONNX_MODEL_FILENAME).s();
+ std::string weight_stripped_engine_cache = engine_cache_path.string();
+ auto status = TensorrtExecutionProvider::RefitEngine(onnx_model_filename,
+ onnx_model_folder_path_,
+ weight_stripped_engine_cache,
+ true /* path check for security */,
+ (*trt_engine_).get(),
+ true /* serialize refitted engine to disk */,
+ detailed_build_log_);
+ if (status != Status::OK()) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage());
+ }
+ }
}
return Status::OK();
}
@@ -306,7 +367,13 @@ bool TensorRTCacheModelHandler::ValidateEPCtxNode(const GraphViewer& graph_viewe
// Show the warning if compute capability is not matched
if (attrs.count(COMPUTE_CAPABILITY) > 0) {
std::string model_compute_capability = attrs.at(COMPUTE_CAPABILITY).s();
- if (model_compute_capability != compute_capability_) {
+ // Verify whether the engine was compiled with Ampere+ hardware compatibility enabled
+ if (model_compute_capability == "80+") {
+ LOGS_DEFAULT(WARNING) << "[TensorRT EP] Engine is compatible to all Ampere+ GPU (except Jetson)";
+ if (std::stoi(compute_capability_) < 80) {
+ LOGS_DEFAULT(WARNING) << "[TensorRT EP] However, this GPU doesn't match. The compute capability of the GPU: " << compute_capability_;
+ }
+ } else if (model_compute_capability != compute_capability_) {
LOGS_DEFAULT(WARNING) << "[TensorRT EP] Engine was compiled for a different compatibility level and might not work or perform suboptimal";
LOGS_DEFAULT(WARNING) << "[TensorRT EP] The compute capability of the engine: " << model_compute_capability;
LOGS_DEFAULT(WARNING) << "[TensorRT EP] The compute capability of the GPU: " << compute_capability_;
diff --git a/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h b/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h
index 9f1e5178428e7..f8fefc12c3453 100644
--- a/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h
+++ b/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h
@@ -5,6 +5,7 @@
#include
#include
+#include
#include "core/providers/tensorrt/nv_includes.h"
#include "core/providers/shared_library/provider_api.h"
@@ -15,6 +16,7 @@ static const std::string EPCONTEXT_OP = "EPContext";
static const std::string EMBED_MODE = "embed_mode";
static const std::string EP_CACHE_CONTEXT = "ep_cache_context";
static const std::string COMPUTE_CAPABILITY = "hardware_architecture";
+static const std::string ONNX_MODEL_FILENAME = "onnx_model_filename";
static const std::string EPCONTEXT_OP_DOMAIN = "com.microsoft";
static const std::string EPCONTEXT_WARNING =
"It's suggested to set the ORT graph optimization level to 0 and \
@@ -29,12 +31,13 @@ ONNX_NAMESPACE::ModelProto* CreateCtxModel(const GraphViewer& graph_viewer,
char* engine_data,
size_t size,
const int64_t embed_mode,
- std::string compute_capability,
+ const std::string compute_capability,
+ const std::string onnx_model_path,
const logging::Logger* logger);
std::string GetCtxModelPath(const std::string& ep_context_file_path,
const std::string& original_model_path);
-bool IsAbsolutePath(std::string& path_string);
-bool IsRelativePathToParentPath(std::string& path_string);
+bool IsAbsolutePath(const std::string& path_string);
+bool IsRelativePathToParentPath(const std::string& path_string);
void DumpCtxModel(ONNX_NAMESPACE::ModelProto* model_proto,
const std::string& ctx_model_path);
void UpdateCtxNodeModelEngineContext(ONNX_NAMESPACE::ModelProto* model_proto,
@@ -46,7 +49,17 @@ class TensorRTCacheModelHandler {
TensorRTCacheModelHandler(std::unique_ptr<nvinfer1::ICudaEngine>* trt_engine,
nvinfer1::IRuntime* trt_runtime,
std::string ep_context_model_path,
- std::string compute_capability) : trt_engine_(trt_engine), trt_runtime_(trt_runtime), ep_context_model_path_(ep_context_model_path), compute_capability_(compute_capability) {
+ std::string compute_capability,
+ bool weight_stripped_engine_refit,
+ std::string onnx_model_folder_path,
+ bool detailed_build_log)
+ : trt_engine_(trt_engine),
+ trt_runtime_(trt_runtime),
+ ep_context_model_path_(ep_context_model_path),
+ compute_capability_(compute_capability),
+ weight_stripped_engine_refit_(weight_stripped_engine_refit),
+ onnx_model_folder_path_(onnx_model_folder_path),
+ detailed_build_log_(detailed_build_log) {
}
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(TensorRTCacheModelHandler);
@@ -59,5 +72,8 @@ class TensorRTCacheModelHandler {
nvinfer1::IRuntime* trt_runtime_;
std::string ep_context_model_path_; // If using context model, it implies context model and engine cache is in the same directory
std::string compute_capability_;
+ bool weight_stripped_engine_refit_;
+ std::string onnx_model_folder_path_;
+ bool detailed_build_log_;
}; // TRTCacheModelHandler
} // namespace onnxruntime
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index bfd464c7007ac..45b5ee65b3164 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -71,6 +71,7 @@ bool SetDynamicRange(nvinfer1::INetworkDefinition& network, std::unordered_map<std::string, float>& dynamic_range_map) {
      if (!network.getInput(i)->setDynamicRange(-dynamic_range_iter->second, dynamic_range_iter->second)) {
+ LOGS_DEFAULT(ERROR) << "Failed to set dynamic range for network input " << tensor_name;
return false;
}
}
@@ -84,10 +85,12 @@ bool SetDynamicRange(nvinfer1::INetworkDefinition& network, std::unordered_map<std::string, float>& dynamic_range_map) {
        if (!trt_layer->getOutput(j)->setDynamicRange(-dynamic_range_iter->second, dynamic_range_iter->second)) {
+ LOGS_DEFAULT(ERROR) << "Failed to set dynamic range for tensor " << tensor_name;
return false;
}
} else if (trt_layer->getType() == nvinfer1::LayerType::kCONSTANT) {
nvinfer1::IConstantLayer* const_layer = static_cast<nvinfer1::IConstantLayer*>(trt_layer);
+ const std::string const_layer_name = const_layer->getName();
auto trt_weights = const_layer->getWeights();
double max_weight = std::numeric_limits<double>::min();
for (int64_t k = 0, end = trt_weights.count; k < end; ++k) {
@@ -108,13 +111,19 @@ bool SetDynamicRange(nvinfer1::INetworkDefinition& network, std::unordered_map<std::string, float>& dynamic_range_map) {
              weight = static_cast<const int32_t*>(trt_weights.values)[k];
break;
+#if NV_TENSORRT_MAJOR >= 10
+ case nvinfer1::DataType::kINT64:
+ weight = static_cast<double>(static_cast<const int64_t*>(trt_weights.values)[k]);
+ break;
+#endif // NV_TENSORRT_MAJOR >= 10
default:
- LOGS_DEFAULT(ERROR) << "Found unsupported datatype!";
+ LOGS_DEFAULT(ERROR) << "Found unsupported datatype for layer " << const_layer_name;
return false;
}
max_weight = std::max(max_weight, std::abs(weight));
}
if (!trt_layer->getOutput(j)->setDynamicRange(static_cast<float>(-max_weight), static_cast<float>(max_weight))) {
+ LOGS_DEFAULT(ERROR) << "Failed to set dynamic range for layer " << const_layer_name;
return false;
}
}
@@ -1062,8 +1071,12 @@ Status BindKernelOutput(Ort::KernelContext& ctx,
CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8, int8_t)
CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8, uint8_t)
CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, int32_t)
+#if NV_TENSORRT_MAJOR >= 10
+ CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, int64_t)
+#else
// The allocation buffer holds the int32 output data since TRT doesn't support int64. So, we need to cast the data (int32 -> int64) for ORT kernel output.
CASE_CAST_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, int32_t, int64_t)
+#endif
// The allocation buffer holds the float output data since TRT doesn't support double. So, we need to cast the data (float -> double) for ORT kernel output.
CASE_CAST_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE, float, double)
default: {
@@ -1234,6 +1247,13 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
std::string profile_min_shapes, profile_max_shapes, profile_opt_shapes;
+ // In case the EP context model is dumped, the engine cache has to be enabled
+ auto enable_engine_cache_for_ep_context_model = [this]() {
+ if (dump_ep_context_model_ && ep_context_embed_mode_ == 0) {
+ engine_cache_enable_ = true;
+ }
+ };
+
// Get environment variables
if (info.has_trt_options) {
max_partition_iterations_ = info.max_partition_iterations;
@@ -1251,12 +1271,15 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
}
dump_subgraphs_ = info.dump_subgraphs;
engine_cache_enable_ = info.engine_cache_enable;
+ weight_stripped_engine_enable_ = info.weight_stripped_engine_enable;
+ onnx_model_folder_path_ = info.onnx_model_folder_path;
timing_cache_enable_ = info.timing_cache_enable;
force_timing_cache_match_ = info.force_timing_cache;
detailed_build_log_ = info.detailed_build_log;
dump_ep_context_model_ = info.dump_ep_context_model;
ep_context_file_path_ = info.ep_context_file_path;
ep_context_embed_mode_ = info.ep_context_embed_mode;
+ enable_engine_cache_for_ep_context_model();
if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) {
cache_path_ = info.engine_cache_path;
cache_prefix_ = info.engine_cache_prefix;
@@ -1287,6 +1310,7 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
profile_max_shapes = info.profile_max_shapes;
profile_opt_shapes = info.profile_opt_shapes;
cuda_graph_enable_ = info.cuda_graph_enable;
+ engine_hw_compatible_ = info.engine_hw_compatible;
} else {
try {
const std::string max_partition_iterations_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kMaxPartitionIterations);
@@ -1350,6 +1374,16 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
engine_cache_enable_ = (std::stoi(engine_cache_enable_env) == 0 ? false : true);
}
+ const std::string weight_stripped_engine_enable_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kWeightStrippedEngineEnable);
+ if (!weight_stripped_engine_enable_env.empty()) {
+ weight_stripped_engine_enable_ = std::stoi(weight_stripped_engine_enable_env) != 0;
+ }
+
+ const std::string onnx_model_folder_path_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kOnnxModelFolderPath);
+ if (!onnx_model_folder_path_env.empty()) {
+ onnx_model_folder_path_ = onnx_model_folder_path_env;
+ }
+
const std::string timing_cache_enable_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kTimingCacheEnable);
if (!timing_cache_enable_env.empty()) {
timing_cache_enable_ = (std::stoi(timing_cache_enable_env) == 0 ? false : true);
@@ -1380,6 +1414,8 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
ep_context_embed_mode_ = std::stoi(ep_context_embed_mode_env);
}
+ enable_engine_cache_for_ep_context_model();
+
if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) {
const std::string engine_cache_path = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kEngineCachePath);
cache_path_ = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kCachePath);
@@ -1513,6 +1549,22 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
cache_path_ = GetPathOrParentPathOfCtxModel(ep_context_file_path_).append(cache_path_).string();
}
+ // Hardware compatibility: pre-check on environment
+ if (engine_cache_enable_ && engine_hw_compatible_) {
+#if NV_TENSORRT_MAJOR == 8 && NV_TENSORRT_MINOR > 5 || NV_TENSORRT_MAJOR > 8
+ if (std::stoi(compute_capability_) < 80) {
+ LOGS_DEFAULT(WARNING) << "Engine hardware compatibility cannot be enabled as GPU arch < 80. ";
+ engine_hw_compatible_ = false;
+ } else if (std::stoi(compute_capability_) == 87) {
+ LOGS_DEFAULT(WARNING) << "Engine hardware compatibility cannot be enabled on Jetson Orin. ";
+ engine_hw_compatible_ = false;
+ }
+#else
+ LOGS_DEFAULT(WARNING) << "Engine hardware compatibility cannot be enabled as TRT < 8.6. ";
+ engine_hw_compatible_ = false;
+#endif
+ }
+
if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) {
if (!cache_path_.empty() && !fs::is_directory(cache_path_)) {
if (!fs::create_directory(cache_path_)) {
@@ -1619,6 +1671,8 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
<< ", trt_dla_core: " << dla_core_
<< ", trt_dump_subgraphs: " << dump_subgraphs_
<< ", trt_engine_cache_enable: " << engine_cache_enable_
+ << ", trt_weight_stripped_engine_enable: " << weight_stripped_engine_enable_
+ << ", trt_onnx_model_folder_path: " << onnx_model_folder_path_
<< ", trt_cache_path: " << cache_path_
<< ", trt_global_cache_path: " << global_cache_path_
<< ", trt_engine_decryption_enable: " << engine_decryption_enable_
@@ -1638,7 +1692,8 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
<< ", trt_dump_ep_context_model: " << dump_ep_context_model_
<< ", trt_ep_context_file_path: " << ep_context_file_path_
<< ", trt_ep_context_embed_mode: " << ep_context_embed_mode_
- << ", trt_cache_prefix: " << cache_prefix_;
+ << ", trt_cache_prefix: " << cache_prefix_
+ << ", trt_engine_hw_compatible: " << engine_hw_compatible_;
}
TensorrtExecutionProvider::~TensorrtExecutionProvider() {
@@ -2271,7 +2326,6 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph,
const IKernelLookup& /*kernel_lookup*/) const {
// Construct subgraph capability from node list
std::vector<std::unique_ptr<ComputeCapability>> result;
-
// Get ModelPath
const auto& path_string = graph.ModelPath().ToPathString();
#ifdef _WIN32
@@ -2462,6 +2516,67 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph,
return result;
}
+/**
+ * Refit the weight-stripped engine
+ */
+common::Status TensorrtExecutionProvider::RefitEngine(std::string onnx_model_filename,
+ std::string& onnx_model_folder_path,
+ std::string& weight_stripped_engine_cache_path,
+ bool path_check,
+ nvinfer1::ICudaEngine* trt_engine,
+ bool serialize_refitted_engine,
+ bool detailed_build_log) {
+#if NV_TENSORRT_MAJOR >= 10
+ std::filesystem::path onnx_model_path{onnx_model_folder_path};
+ onnx_model_path.append(onnx_model_filename);
+ if (path_check && IsAbsolutePath(onnx_model_path.string())) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
+ "For security purpose, the ONNX model path should be set with "
+ "a relative path, but it is an absolute path: " +
+ onnx_model_path.string());
+ }
+ if (path_check && IsRelativePathToParentPath(onnx_model_path.string())) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
+ "The ONNX model path has '..'. For security purpose, it's not "
+ "allowed to point outside the directory.");
+ }
+
+ if (!std::filesystem::exists(onnx_model_path)) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
+ "The ONNX model " + onnx_model_path.string() +
+ " does not exist.");
+ }
+
+ // weight-stripped engine refit logic
+ TensorrtLogger& trt_logger = GetTensorrtLogger(detailed_build_log);
+ auto refitter = std::unique_ptr<nvinfer1::IRefitter>(nvinfer1::createInferRefitter(*trt_engine, trt_logger));
+ auto parser_refitter = std::unique_ptr<nvonnxparser::IParserRefitter>(
+ nvonnxparser::createParserRefitter(*refitter, trt_logger));
+ if (!parser_refitter->refitFromFile(onnx_model_path.string().c_str())) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
+ "TensorRT EP's IParserRefitter could not refit deserialized weight-stripped engine with weights contained in: " + onnx_model_path.string());
+ }
+ if (refitter->refitCudaEngine()) {
+ LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Successfully refitted the weight-stripped engine.";
+ } else {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
+ "TensorRT EP's IRefitter could not refit deserialized weight-stripped engine with weights contained in: " + onnx_model_path.string());
+ }
+
+ // serialize the refitted engine to disk
+ if (serialize_refitted_engine) {
+ std::string refitted_engine_cache = GetWeightRefittedEnginePath(weight_stripped_engine_cache_path);
+ nvinfer1::IHostMemory* serialized_engine = trt_engine->serialize();
+ std::ofstream engine_file(refitted_engine_cache, std::ios::binary | std::ios::out);
+ engine_file.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
+ LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialize the refitted engine to " << refitted_engine_cache;
+ }
+ return Status::OK();
+#else
+ return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP's IParserRefitter can only be used on TRT 10.0 onwards.");
+#endif
+}
+
common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused_nodes_and_graphs,
std::vector<NodeComputeInfo>& node_compute_funcs) {
for (auto& fused_node_graph : fused_nodes_and_graphs) {
@@ -2485,7 +2600,11 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector= 10
+ trt_config->setFlag(nvinfer1::BuilderFlag::kSTRIP_PLAN);
+ LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] STRIP_PLAN is enabled";
+ trt_config->setFlag(nvinfer1::BuilderFlag::kREFIT_IDENTICAL);
+ LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] REFIT_IDENTICAL is enabled";
+#else
+ LOGS_DEFAULT(WARNING) << "[TensorRT EP] weight-stripped engines can only be used on TRT 10.0 onwards!";
+#endif
+ }
+
// limit used tactic sources
if (!tactic_sources_.empty()) {
nvinfer1::TacticSources tactics = trt_config->getTacticSources();
@@ -2813,13 +2943,29 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
cache_path = GetCachePath(cache_path_, trt_node_name_with_precision);
}
+ std::string cache_hw_compat = "_sm" + compute_capability_;
+ // Enable hardware compatibility mode if assigned
+ if (engine_cache_enable_ && engine_hw_compatible_) {
+ trt_config->setHardwareCompatibilityLevel(nvinfer1::HardwareCompatibilityLevel::kAMPERE_PLUS);
+ cache_hw_compat = "_sm80+";
+ LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Hardware compatibility is enabled when loading and capturing engine cache.";
+ }
+
// Name the engine cache based on GPU compute capacity and reduce the chance of loading an incompatible cache
// Note: Engine cache generated on a GPU with large memory might not be loadable on a GPU with smaller memory, even if they share the same compute capacity
- const std::string cache_path_prefix = cache_path + "_sm" + compute_capability_;
- const std::string engine_cache_path = cache_path_prefix + ".engine";
+ const std::string cache_path_prefix = cache_path + cache_hw_compat;
+ std::string engine_cache_path = cache_path_prefix + ".engine";
const std::string encrypted_engine_cache_path = engine_cache_path + ".encrypted";
const std::string profile_cache_path = cache_path_prefix + ".profile";
+ // If weight-stripped engine is enabled and refitted engine cache is not present,
+ // TRT EP will use the engine cache with ".stripped.engine" appended to the end.
+ const std::filesystem::path engine_cache_fs_path = engine_cache_path;
+ if (weight_stripped_engine_enable_ && !std::filesystem::exists(engine_cache_fs_path)) {
+ engine_cache_path = cache_path_prefix + ".stripped.engine";
+ weight_stripped_engine_refit_ = true;
+ }
+
// Generate file name for dumping ep context model
if (dump_ep_context_model_ && ctx_model_path_.empty()) {
ctx_model_path_ = GetCtxModelPath(ep_context_file_path_, model_path_);
@@ -2859,6 +3005,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not deserialize engine from cache: " + engine_cache_path);
}
+
} else if (engine_decryption_enable_ && engine_cache_enable_ && std::filesystem::exists(encrypted_engine_cache_path) && !engine_update) {
// Decrypt engine
size_t engine_size = 0;
@@ -2966,19 +3113,36 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
auto cache_file_name = std::filesystem::path(engine_cache_path).filename();
ep_cache_context_attr_ = std::filesystem::path(engine_cache_relative_path_to_context_model_dir).append(cache_file_name.string()).string();
}
-
+ std::string compute_capability_hw_compat = compute_capability_;
+ if (engine_cache_enable_ && engine_hw_compatible_) {
+ compute_capability_hw_compat = "80+";
+ }
std::unique_ptr<ONNX_NAMESPACE::ModelProto> model_proto{CreateCtxModel(graph_body_viewer,
ep_cache_context_attr_,
reinterpret_cast<char*>(serialized_engine->data()),
serialized_engine->size(),
ep_context_embed_mode_,
- compute_capability_,
+ compute_capability_hw_compat,
+ model_path_,
GetLogger())};
DumpCtxModel(model_proto.get(), ctx_model_path_);
}
}
}
+ if (weight_stripped_engine_refit_) {
+ auto status = RefitEngine(model_path_,
+ onnx_model_folder_path_,
+ engine_cache_path,
+ false /* path check for security */,
+ trt_engine.get(),
+ true /* serialize refitted engine to disk */,
+ detailed_build_log_);
+ if (status != Status::OK()) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage());
+ }
+ }
+
// Build context
// Note: Creating an execution context from an engine is thread safe per TRT doc
// https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading
@@ -3039,12 +3203,17 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
auto cache_file_name = std::filesystem::path(engine_cache_path).filename();
ep_cache_context_attr_ = std::filesystem::path(engine_cache_relative_path_to_context_model_dir).append(cache_file_name.string()).string();
}
+ std::string compute_capability_hw_compat = compute_capability_;
+ if (engine_cache_enable_ && engine_hw_compatible_) {
+ compute_capability_hw_compat = "80+";
+ }
model_proto_.reset(CreateCtxModel(graph_body_viewer,
ep_cache_context_attr_,
nullptr,
0,
ep_context_embed_mode_,
- compute_capability_,
+ compute_capability_hw_compat,
+ model_path_,
GetLogger()));
if (ep_context_embed_mode_ == 0) {
DumpCtxModel(model_proto_.get(), ctx_model_path_);
@@ -3065,11 +3234,12 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
&parsers_[context->node_name], &engines_[context->node_name], &contexts_[context->node_name],
&networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name],
input_shape_ranges_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_,
- dla_enable_, dla_core_, &max_workspace_size_, trt_node_name_with_precision, engine_cache_enable_, cache_path_,
- runtime_.get(), profiles_[context->node_name], context_memory_sharing_enable_, &max_ctx_mem_size_,
- dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_, timing_cache_enable_,
- global_cache_path_, force_timing_cache_match_, detailed_build_log_, build_heuristics_enable_, sparsity_enable_,
- builder_optimization_level_, auxiliary_streams_, !tactic_sources_.empty(), tactics, cuda_graph_enable_, cache_prefix_, cache_suffix};
+ dla_enable_, dla_core_, &max_workspace_size_, trt_node_name_with_precision,
+ engine_cache_enable_, cache_path_, runtime_.get(), profiles_[context->node_name],
+ context_memory_sharing_enable_, &max_ctx_mem_size_, dynamic_range_map, engine_decryption_enable_,
+ engine_decryption_, engine_encryption_, timing_cache_enable_, global_cache_path_, force_timing_cache_match_,
+ detailed_build_log_, build_heuristics_enable_, sparsity_enable_, builder_optimization_level_,
+ auxiliary_streams_, !tactic_sources_.empty(), tactics, cuda_graph_enable_, cache_prefix_, cache_suffix, engine_hw_compatible_};
*state = p.release();
return 0;
};
@@ -3132,8 +3302,18 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
} else {
cache_path = GetCachePath(trt_state->engine_cache_path, trt_state->trt_node_name_with_precision);
}
- const std::string cache_path_prefix = cache_path + "_sm" + compute_capability_;
- const std::string engine_cache_path = cache_path_prefix + ".engine";
+
+ // Enable hardware compatibility mode if assigned
+ std::string cache_hw_compat = "_sm" + compute_capability_;
+ if (engine_cache_enable_ && engine_hw_compatible_) {
+ cache_hw_compat = "_sm80+";
+ LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Hardware compatibility is enabled when loading and capturing engine cache.";
+ }
+
+ // Name the engine cache based on GPU compute capacity and reduce the chance of loading an incompatible cache
+ // Note: Engine cache generated on a GPU with large memory might not be loadable on a GPU with smaller memory, even if they share the same compute capacity
+ const std::string cache_path_prefix = cache_path + cache_hw_compat;
+ std::string engine_cache_path = cache_path_prefix + ".engine";
const std::string encrypted_engine_cache_path = engine_cache_path + ".encrypted";
const std::string profile_cache_path = cache_path_prefix + ".profile";
std::string timing_cache_path = "";
@@ -3141,6 +3321,14 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
timing_cache_path = GetTimingCachePath(global_cache_path_, compute_capability_);
}
+ // If weight-stripped engine is enabled and refitted engine cache is not present,
+ // TRT EP will use the engine cache with ".stripped.engine" appended to the end.
+ const std::filesystem::path engine_cache_fs_path = engine_cache_path;
+ if (weight_stripped_engine_enable_ && !std::filesystem::exists(engine_cache_fs_path)) {
+ engine_cache_path = cache_path_prefix + ".stripped.engine";
+ weight_stripped_engine_refit_ = true;
+ }
+
// Load serialized engine
if (trt_state->engine_cache_enable && trt_engine == nullptr) {
std::ifstream engine_file(engine_cache_path, std::ios::binary | std::ios::in);
@@ -3169,6 +3357,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + engine_cache_path;
trt_engine = trt_state->engine->get();
context_update = true;
+
} else if (trt_state->engine_decryption_enable && std::filesystem::exists(encrypted_engine_cache_path) && profile_file) {
shape_ranges = DeserializeProfileV2(profile_file);
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + profile_cache_path;
@@ -3281,6 +3470,16 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
LOGS_DEFAULT(WARNING) << "[TensorRT EP] Auxiliary streams can only be set on TRT 8.6 onwards!";
}
#endif
+ if (weight_stripped_engine_enable_) {
+#if NV_TENSORRT_MAJOR >= 10
+ trt_config->setFlag(nvinfer1::BuilderFlag::kSTRIP_PLAN);
+ LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] STRIP_PLAN is enabled";
+ trt_config->setFlag(nvinfer1::BuilderFlag::kREFIT_IDENTICAL);
+ LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] REFIT_IDENTICAL is enabled";
+#else
+ LOGS_DEFAULT(WARNING) << "[TensorRT EP] weight-stripped engines can only be used on TRT 10.0 onwards!";
+#endif
+ }
// limit used tactic sources
if (trt_state->filter_tactic_sources) {
nvinfer1::TacticSources tactics = trt_config->getTacticSources();
@@ -3304,6 +3503,12 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
}
}
+ // Enable hardware compatibility mode if assigned
+ if (trt_state->engine_hw_compatible) {
+ trt_config->setHardwareCompatibilityLevel(nvinfer1::HardwareCompatibilityLevel::kAMPERE_PLUS);
+ LOGS_DEFAULT(INFO) << "[TensorRT EP] Re-generate engine with hardware compatibility enabled.";
+ }
+
// Build engine
std::unique_ptr<nvinfer1::IHostMemory> serialized_engine;
{
@@ -3375,6 +3580,19 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
DumpCtxModel(model_proto_.get(), ctx_model_path_);
}
context_update = true;
+
+ if (weight_stripped_engine_refit_) {
+ auto status = RefitEngine(model_path_,
+ onnx_model_folder_path_,
+ engine_cache_path,
+ false /* path check for security */,
+ trt_engine,
+ true /* serialize refitted engine to disk */,
+ detailed_build_log_);
+ if (status != Status::OK()) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage());
+ }
+ }
}
if (context_update) {
@@ -3575,7 +3793,13 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con
std::unordered_map<std::string, size_t> output_types;  // TRT engine output name -> ORT output tensor type
// Get engine binary data and deserialize it
- auto trt_cache_model_handler = TensorRTCacheModelHandler(&trt_engine, runtime_.get(), model_path_, compute_capability_);
+ auto trt_cache_model_handler = TensorRTCacheModelHandler(&trt_engine,
+ runtime_.get(),
+ model_path_,
+ compute_capability_,
+ weight_stripped_engine_enable_,
+ onnx_model_folder_path_,
+ detailed_build_log_);
auto status = trt_cache_model_handler.GetEpContextFromGraph(graph_body_viewer);
if (status != Status::OK()) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage());
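
Taken together, the changes above enable a two-step workflow: one session builds a weight-stripped engine and dumps the EP context model, and a later session loads that context model and refits the engine from the original ONNX weights. A hedged sketch (all file and folder names are placeholders):

import onnxruntime as ort

# Step 1: build a weight-stripped engine and dump the EP context model.
build_options = {
    "trt_weight_stripped_engine_enable": "True",
    "trt_dump_ep_context_model": "True",
    "trt_ep_context_file_path": "context_model_dir",  # engine cache is written next to the context model
}
ort.InferenceSession("model.onnx", providers=[("TensorrtExecutionProvider", build_options)])

# Step 2: load the dumped context model; the EP refits the stripped engine using the
# original ONNX weights located via trt_onnx_model_folder_path (relative to the cwd).
run_options = {
    "trt_weight_stripped_engine_enable": "True",
    "trt_onnx_model_folder_path": ".",
}
ort.InferenceSession("context_model_dir/model_ctx.onnx",  # placeholder name of the dumped model
                     providers=[("TensorrtExecutionProvider", run_options)])
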
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
index eabbbdea1c4ac..f4dae57487f51 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
@@ -27,7 +27,9 @@ static const std::string kDLACore = "ORT_TENSORRT_DLA_CORE";
static const std::string kDumpSubgraphs = "ORT_TENSORRT_DUMP_SUBGRAPHS";
static const std::string kEngineCacheEnable = "ORT_TENSORRT_ENGINE_CACHE_ENABLE";
static const std::string kCachePath = "ORT_TENSORRT_CACHE_PATH";
-// As a timing cache can be used across multiple ONNX files it makes sense to have a seperate cache path
+static const std::string kWeightStrippedEngineEnable = "ORT_TENSORRT_WEIGHT_STRIPPED_ENGINE_ENABLE";
+static const std::string kOnnxModelFolderPath = "ORT_TENSORRT_ONNX_MODEL_FOLDER_PATH";
+// As a timing cache can be used across multiple ONNX files it makes sense to have a separate cache path
static const std::string kTimingCachePath = "ORT_TENSORRT_GLOBAL_CACHE_PATH";
static const std::string kDecryptionEnable = "ORT_TENSORRT_ENGINE_DECRYPTION_ENABLE";
static const std::string kDecryptionLibPath = "ORT_TENSORRT_ENGINE_DECRYPTION_LIB_PATH";
@@ -191,6 +193,7 @@ struct TensorrtFuncState {
bool cuda_graph_enable = 0;
std::string cache_prefix;
std::string cache_suffix;
+ bool engine_hw_compatible = false;
};
// Minimum information to construct kernel function state for direct engine load code path
@@ -217,6 +220,7 @@ struct SubGraphContext {
using SubGraphContextMap = std::unordered_map>;
using DDSOutputAllocatorMap = std::unordered_map>;
+std::string GetWeightRefittedEnginePath(std::string engine_cache_path);
// Logical device representation.
class TensorrtExecutionProvider : public IExecutionProvider {
@@ -263,6 +267,17 @@ class TensorrtExecutionProvider : public IExecutionProvider {
bool IsGraphCaptured(int graph_annotation_id) const override;
Status ReplayGraph(int graph_annotation_id) override;
+ /**
+ * Refit the weight-stripped engine
+ */
+ static common::Status RefitEngine(std::string onnx_model_filename,
+ std::string& onnx_model_folder_path,
+ std::string& weight_stripped_engine_cache_path,
+ bool path_check,
+ nvinfer1::ICudaEngine* trt_engine,
+ bool serialize_refitted_engine,
+ bool detailed_build_log);
+
private:
mutable TensorrtExecutionProviderInfo info_;
bool external_stream_ = false;
@@ -280,6 +295,9 @@ class TensorrtExecutionProvider : public IExecutionProvider {
bool int8_use_native_tensorrt_calibration_table_ = false;
bool dump_subgraphs_ = false;
bool engine_cache_enable_ = false;
+ bool weight_stripped_engine_enable_ = false;
+ bool weight_stripped_engine_refit_ = false;
+ std::string onnx_model_folder_path_;
bool build_heuristics_enable_ = false;
bool sparsity_enable_ = false;
int builder_optimization_level_ = 3;
@@ -303,6 +321,7 @@ class TensorrtExecutionProvider : public IExecutionProvider {
bool detailed_build_log_ = false;
bool cuda_graph_enable_ = false;
std::string cache_prefix_;
+ bool engine_hw_compatible_ = false;
// The OrtAllocator object will be get during ep compute time
// and should be kept for the lifetime of TRT EP object.
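
The same toggles are exposed through the environment variables declared above; they are read when the provider is constructed without explicit TRT provider options. A small sketch (folder name is a placeholder):

import os
import onnxruntime as ort

os.environ["ORT_TENSORRT_ENGINE_CACHE_ENABLE"] = "1"
os.environ["ORT_TENSORRT_WEIGHT_STRIPPED_ENGINE_ENABLE"] = "1"
os.environ["ORT_TENSORRT_ONNX_MODEL_FOLDER_PATH"] = "weights_dir"  # placeholder

session = ort.InferenceSession("model.onnx", providers=["TensorrtExecutionProvider"])
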
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc
index cd2087c9d7472..9fe39f5921e1c 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc
@@ -27,6 +27,8 @@ constexpr const char* kDLACore = "trt_dla_core";
constexpr const char* kDumpSubgraphs = "trt_dump_subgraphs";
constexpr const char* kEngineCacheEnable = "trt_engine_cache_enable";
constexpr const char* kEngineCachePath = "trt_engine_cache_path";
+constexpr const char* kWeightStrippedEngineEnable = "trt_weight_stripped_engine_enable";
+constexpr const char* kOnnxModelFolderPath = "trt_onnx_model_folder_path";
constexpr const char* kEngineCachePrefix = "trt_engine_cache_prefix";
constexpr const char* kDecryptionEnable = "trt_engine_decryption_enable";
constexpr const char* kDecryptionLibPath = "trt_engine_decryption_lib_path";
@@ -51,6 +53,8 @@ constexpr const char* kCudaGraphEnable = "trt_cuda_graph_enable";
constexpr const char* kEpContextEmbedMode = "trt_ep_context_embed_mode";
constexpr const char* kEpContextFilePath = "trt_ep_context_file_path";
constexpr const char* kDumpEpContextModel = "trt_dump_ep_context_model";
+constexpr const char* kEngineHwCompatible = "trt_engine_hw_compatible";
+
} // namespace provider_option_names
} // namespace tensorrt
@@ -92,6 +96,8 @@ TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions
.AddAssignmentToReference(tensorrt::provider_option_names::kDumpSubgraphs, info.dump_subgraphs)
.AddAssignmentToReference(tensorrt::provider_option_names::kEngineCacheEnable, info.engine_cache_enable)
.AddAssignmentToReference(tensorrt::provider_option_names::kEngineCachePath, info.engine_cache_path)
+ .AddAssignmentToReference(tensorrt::provider_option_names::kWeightStrippedEngineEnable, info.weight_stripped_engine_enable)
+ .AddAssignmentToReference(tensorrt::provider_option_names::kOnnxModelFolderPath, info.onnx_model_folder_path)
.AddAssignmentToReference(tensorrt::provider_option_names::kEngineCachePrefix, info.engine_cache_prefix)
.AddAssignmentToReference(tensorrt::provider_option_names::kDecryptionEnable, info.engine_decryption_enable)
.AddAssignmentToReference(tensorrt::provider_option_names::kDecryptionLibPath, info.engine_decryption_lib_path)
@@ -115,6 +121,7 @@ TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions
.AddAssignmentToReference(tensorrt::provider_option_names::kDumpEpContextModel, info.dump_ep_context_model)
.AddAssignmentToReference(tensorrt::provider_option_names::kEpContextFilePath, info.ep_context_file_path)
.AddAssignmentToReference(tensorrt::provider_option_names::kEpContextEmbedMode, info.ep_context_embed_mode)
+ .AddAssignmentToReference(tensorrt::provider_option_names::kEngineHwCompatible, info.engine_hw_compatible)
.Parse(options)); // add new provider option here.
info.user_compute_stream = user_compute_stream;
@@ -139,6 +146,8 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const TensorrtE
{tensorrt::provider_option_names::kDumpSubgraphs, MakeStringWithClassicLocale(info.dump_subgraphs)},
{tensorrt::provider_option_names::kEngineCacheEnable, MakeStringWithClassicLocale(info.engine_cache_enable)},
{tensorrt::provider_option_names::kEngineCachePath, MakeStringWithClassicLocale(info.engine_cache_path)},
+ {tensorrt::provider_option_names::kWeightStrippedEngineEnable, MakeStringWithClassicLocale(info.weight_stripped_engine_enable)},
+ {tensorrt::provider_option_names::kOnnxModelFolderPath, MakeStringWithClassicLocale(info.onnx_model_folder_path)},
{tensorrt::provider_option_names::kEngineCachePrefix, MakeStringWithClassicLocale(info.engine_cache_prefix)},
{tensorrt::provider_option_names::kDecryptionEnable, MakeStringWithClassicLocale(info.engine_decryption_enable)},
{tensorrt::provider_option_names::kDecryptionLibPath, MakeStringWithClassicLocale(info.engine_decryption_lib_path)},
@@ -163,6 +172,7 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const TensorrtE
{tensorrt::provider_option_names::kDumpEpContextModel, MakeStringWithClassicLocale(info.dump_ep_context_model)},
{tensorrt::provider_option_names::kEpContextFilePath, MakeStringWithClassicLocale(info.ep_context_file_path)},
{tensorrt::provider_option_names::kEpContextEmbedMode, MakeStringWithClassicLocale(info.ep_context_embed_mode)},
+ {tensorrt::provider_option_names::kEngineHwCompatible, MakeStringWithClassicLocale(info.engine_hw_compatible)},
};
return options;
}
@@ -180,6 +190,7 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const OrtTensor
const std::string kProfilesMaxShapes_ = empty_if_null(info.trt_profile_max_shapes);
const std::string kProfilesOptShapes_ = empty_if_null(info.trt_profile_opt_shapes);
const std::string kEpContextFilePath_ = empty_if_null(info.trt_ep_context_file_path);
+ const std::string kOnnxModelFolderPath_ = empty_if_null(info.trt_onnx_model_folder_path);
const ProviderOptions options{
{tensorrt::provider_option_names::kDeviceId, MakeStringWithClassicLocale(info.device_id)},
@@ -198,6 +209,8 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const OrtTensor
{tensorrt::provider_option_names::kEngineCacheEnable, MakeStringWithClassicLocale(info.trt_engine_cache_enable)},
{tensorrt::provider_option_names::kEngineCachePath, kEngineCachePath_},
{tensorrt::provider_option_names::kEngineCachePrefix, kEngineCachePrefix_},
+ {tensorrt::provider_option_names::kWeightStrippedEngineEnable, MakeStringWithClassicLocale(info.trt_weight_stripped_engine_enable)},
+ {tensorrt::provider_option_names::kOnnxModelFolderPath, kOnnxModelFolderPath_},
{tensorrt::provider_option_names::kDecryptionEnable, MakeStringWithClassicLocale(info.trt_engine_decryption_enable)},
{tensorrt::provider_option_names::kDecryptionLibPath, kDecryptionLibPath_},
{tensorrt::provider_option_names::kForceSequentialEngineBuild, MakeStringWithClassicLocale(info.trt_force_sequential_engine_build)},
@@ -220,6 +233,7 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const OrtTensor
{tensorrt::provider_option_names::kEpContextFilePath, kEpContextFilePath_},
{tensorrt::provider_option_names::kDumpEpContextModel, MakeStringWithClassicLocale(info.trt_dump_ep_context_model)},
{tensorrt::provider_option_names::kEpContextEmbedMode, MakeStringWithClassicLocale(info.trt_ep_context_embed_mode)},
+ {tensorrt::provider_option_names::kEngineHwCompatible, MakeStringWithClassicLocale(info.trt_engine_hw_compatible)},
};
return options;
}
@@ -289,6 +303,8 @@ void TensorrtExecutionProviderInfo::UpdateProviderOptions(void* provider_options
trt_provider_options_v2.trt_dla_core = internal_options.dla_core;
trt_provider_options_v2.trt_dump_subgraphs = internal_options.dump_subgraphs;
trt_provider_options_v2.trt_engine_cache_enable = internal_options.engine_cache_enable;
+ trt_provider_options_v2.trt_weight_stripped_engine_enable = internal_options.weight_stripped_engine_enable;
+ trt_provider_options_v2.trt_onnx_model_folder_path = copy_string_if_needed(internal_options.onnx_model_folder_path);
trt_provider_options_v2.trt_engine_cache_path = copy_string_if_needed(internal_options.engine_cache_path);
trt_provider_options_v2.trt_engine_cache_prefix = copy_string_if_needed(internal_options.engine_cache_prefix);
@@ -319,5 +335,6 @@ void TensorrtExecutionProviderInfo::UpdateProviderOptions(void* provider_options
trt_provider_options_v2.trt_dump_ep_context_model = internal_options.dump_ep_context_model;
trt_provider_options_v2.trt_ep_context_embed_mode = internal_options.ep_context_embed_mode;
trt_provider_options_v2.trt_ep_context_file_path = copy_string_if_needed(internal_options.ep_context_file_path);
+ trt_provider_options_v2.trt_engine_hw_compatible = internal_options.engine_hw_compatible;
}
} // namespace onnxruntime
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h
index 80424b8d6d196..3b859ea2da466 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h
@@ -32,6 +32,8 @@ struct TensorrtExecutionProviderInfo {
bool dump_subgraphs{false};
bool engine_cache_enable{false};
std::string engine_cache_path{""};
+ bool weight_stripped_engine_enable{false};
+ std::string onnx_model_folder_path{""};
bool engine_decryption_enable{false};
std::string engine_decryption_lib_path{""};
bool force_sequential_engine_build{false};
@@ -55,6 +57,7 @@ struct TensorrtExecutionProviderInfo {
std::string ep_context_file_path{""};
int ep_context_embed_mode{0};
std::string engine_cache_prefix{""};
+ bool engine_hw_compatible{false};
static TensorrtExecutionProviderInfo FromProviderOptions(const ProviderOptions& options);
static ProviderOptions ToProviderOptions(const TensorrtExecutionProviderInfo& info);
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc
index 568da57a50956..6430ffab09976 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc
@@ -90,6 +90,8 @@ struct Tensorrt_Provider : Provider {
info.dump_subgraphs = options.trt_dump_subgraphs != 0;
info.engine_cache_enable = options.trt_engine_cache_enable != 0;
info.engine_cache_path = options.trt_engine_cache_path == nullptr ? "" : options.trt_engine_cache_path;
+ info.weight_stripped_engine_enable = options.trt_weight_stripped_engine_enable != 0;
+ info.onnx_model_folder_path = options.trt_onnx_model_folder_path == nullptr ? "" : options.trt_onnx_model_folder_path;
info.engine_decryption_enable = options.trt_engine_decryption_enable != 0;
info.engine_decryption_lib_path = options.trt_engine_decryption_lib_path == nullptr ? "" : options.trt_engine_decryption_lib_path;
info.force_sequential_engine_build = options.trt_force_sequential_engine_build != 0;
@@ -113,6 +115,7 @@ struct Tensorrt_Provider : Provider {
info.ep_context_file_path = options.trt_ep_context_file_path == nullptr ? "" : options.trt_ep_context_file_path;
info.ep_context_embed_mode = options.trt_ep_context_embed_mode;
info.engine_cache_prefix = options.trt_engine_cache_prefix == nullptr ? "" : options.trt_engine_cache_prefix;
+ info.engine_hw_compatible = options.trt_engine_hw_compatible != 0;
return std::make_shared(info);
}
diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc
index 80bd301c201b3..b16da60f7c7c1 100644
--- a/onnxruntime/core/session/provider_bridge_ort.cc
+++ b/onnxruntime/core/session/provider_bridge_ort.cc
@@ -1643,6 +1643,7 @@ OrtTensorRTProviderOptionsV2 OrtTensorRTProviderOptionsToOrtTensorRTProviderOpti
trt_options_converted.trt_ep_context_file_path = "";
trt_options_converted.trt_ep_context_embed_mode = 0;
trt_options_converted.trt_engine_cache_prefix = "";
+ trt_options_converted.trt_engine_hw_compatible = 0;
return trt_options_converted;
}
@@ -2256,6 +2257,7 @@ ORT_API(void, OrtApis::ReleaseTensorRTProviderOptions, _Frees_ptr_opt_ OrtTensor
delete[] ptr->trt_profile_max_shapes;
delete[] ptr->trt_profile_opt_shapes;
delete[] ptr->trt_ep_context_file_path;
+ delete[] ptr->trt_onnx_model_folder_path;
}
std::unique_ptr p(ptr);
diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc
index 7fc6515d3d50a..051e870256012 100644
--- a/onnxruntime/python/onnxruntime_pybind_state.cc
+++ b/onnxruntime/python/onnxruntime_pybind_state.cc
std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
// So we need these std::string variables defined here as they will be kept alive for the lifetime of TRT EP and we can still access them from OrtTensorRTProviderOptionsV2 instance.
// (The reason is string copy is involved, for example params.trt_engine_cache_path = cache_path.c_str() and those std::string variable is referenced by OrtTensorRTProviderOptionsV2 instance
// and TRT EP instance, so it won't be released.)
- std::string calibration_table, cache_path, cache_prefix, timing_cache_path, lib_path, trt_tactic_sources, trt_extra_plugin_lib_paths, min_profile, max_profile, opt_profile, ep_context_file_path;
+ std::string calibration_table, cache_path, cache_prefix, timing_cache_path, lib_path, trt_tactic_sources,
+ trt_extra_plugin_lib_paths, min_profile, max_profile, opt_profile, ep_context_file_path,
+ onnx_model_folder_path;
auto it = provider_options_map.find(type);
if (it != provider_options_map.end()) {
OrtTensorRTProviderOptionsV2 params;
@@ -588,6 +590,21 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
} else {
ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_engine_cache_prefix' should be a string to customize engine cache prefix i.e. 'FRCNN' or 'yolov4'.\n");
}
+ } else if (option.first == "trt_weight_stripped_engine_enable") {
+ if (option.second == "True" || option.second == "true") {
+ params.trt_weight_stripped_engine_enable = true;
+ } else if (option.second == "False" || option.second == "false") {
+ params.trt_weight_stripped_engine_enable = false;
+ } else {
+ ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_weight_stripped_engine_enable' should be 'True' or 'False'. Default value is 'False'.\n");
+ }
+ } else if (option.first == "trt_onnx_model_folder_path") {
+ if (!option.second.empty()) {
+ onnx_model_folder_path = option.second;
+ params.trt_onnx_model_folder_path = onnx_model_folder_path.c_str();
+ } else {
+ ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_onnx_model_folder_path' should be a path string i.e. 'engine_cache'.\n");
+ }
} else if (option.first == "trt_engine_decryption_enable") {
if (option.second == "True" || option.second == "true") {
params.trt_engine_decryption_enable = true;
@@ -750,6 +767,14 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
} else {
ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_ep_context_embed_mode' should be a positive integer number i.e. '1'.\n");
}
+ } else if (option.first == "trt_engine_hw_compatible") {
+ if (option.second == "True" || option.second == "true") {
+ params.trt_engine_hw_compatible = true;
+ } else if (option.second == "False" || option.second == "false") {
+ params.trt_engine_hw_compatible = false;
+ } else {
+ ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_engine_hw_compatible' should be 'True' or 'False'. Default value is 'False'.\n");
+ }
} else {
ORT_THROW("Invalid TensorRT EP option: ", option.first);
}
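[Editor's note] For context, here is a minimal sketch of how the three options parsed in the hunk above could be supplied from the Python API. The model path and option values are hypothetical placeholders; the keys simply mirror the strings handled in onnxruntime_pybind_state.cc, and a build with the TensorRT EP available is assumed.

```python
# Sketch only: passing the newly parsed TensorRT EP options from Python.
# "model.onnx" and the option values are placeholders; the keys mirror the
# strings handled in onnxruntime_pybind_state.cc above.
import onnxruntime as ort

trt_options = {
    "trt_engine_cache_enable": "True",
    "trt_weight_stripped_engine_enable": "True",        # accepted as 'True'/'False'
    "trt_onnx_model_folder_path": "/data/onnx_models",  # folder holding the weight-full ONNX model
    "trt_engine_hw_compatible": "True",                 # hardware-compatible ('_sm80+') engines
}

sess = ort.InferenceSession(
    "model.onnx",
    providers=[("TensorrtExecutionProvider", trt_options), "CUDAExecutionProvider"],
)
```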
diff --git a/onnxruntime/python/tools/tensorrt/perf/parse_mem_concurrency_test.py b/onnxruntime/python/tools/tensorrt/perf/parse_mem_concurrency_test.py
new file mode 100644
index 0000000000000..492de13fb42b5
--- /dev/null
+++ b/onnxruntime/python/tools/tensorrt/perf/parse_mem_concurrency_test.py
@@ -0,0 +1,132 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+import csv
+import datetime
+import os
+import re
+
+import pandas as pd
+from azure.kusto.data import KustoConnectionStringBuilder
+from azure.kusto.ingest import QueuedIngestClient
+from post import get_identifier, parse_arguments, write_table
+
+
+def parse_valgrind_log(input_path, output_path, keywords):
+ is_definitely_lost = False
+ is_ort_trt_related = False
+ buffer = []
+ leak_block = None
+ leak_bytes = None
+ keyword = None
+ results = []
+
+ with open(input_path) as file:
+ lines = file.readlines()
+
+ for line in lines:
+ line = line.strip() # noqa: PLW2901
+ # Remove "==xxxxx==" pattern from the line
+ line = line.split("==")[-1].strip() # noqa: PLW2901
+
+ if "blocks are definitely lost in loss" in line:
+ is_definitely_lost = True
+ # Extract LeakBlock and LeakBytes
+ match = re.search(r"([\d,]+) byte[s]? in ([\d,]+) block[s]?", line)
+ if match:
+ leak_bytes = match.group(1).replace(",", "")
+ leak_block = match.group(2).replace(",", "")
+ continue
+
+ if is_definitely_lost:
+ if line:
+ buffer.append(line)
+ for word in keywords:
+ if word in line:
+ is_ort_trt_related = True
+ keyword = word
+ break
+
+ # End of section
+ if is_definitely_lost and not line:
+ if is_ort_trt_related:
+ results.append((keyword, leak_block, leak_bytes, "\n".join(buffer)))
+ # Reset var
+ is_definitely_lost = False
+ is_ort_trt_related = False
+ buffer = []
+ leak_block = None
+ leak_bytes = None
+ keyword = None
+
+ # Writing results to CSV
+ with open(output_path, "w", newline="") as csvfile:
+ csvwriter = csv.writer(csvfile)
+ csvwriter.writerow(["Keyword", "LeakBlock", "LeakBytes", "ValgrindMessage"])
+ for entry in results:
+ csvwriter.writerow([entry[0], entry[1], entry[2], entry[3]])
+
+
+def parse_concurrency_test_log(input_path, output_path):
+ with open(input_path) as log_file:
+ log_content = log_file.read()
+
+ failed_cases_section = log_content.split("Failed Test Cases:")[1]
+
+ # passed = 1 if no failed test cases
+ if failed_cases_section.strip() == "":
+ passed = 1
+ else:
+ passed = 0
+
+ with open(output_path, "w", newline="") as csv_file:
+ csv_writer = csv.writer(csv_file)
+ csv_writer.writerow(["Passed", "Log"])
+ csv_writer.writerow([passed, log_content])
+
+
+if __name__ == "__main__":
+ args = parse_arguments()
+
+ # connect to database
+ kcsb_ingest = KustoConnectionStringBuilder.with_az_cli_authentication(args.kusto_conn)
+ ingest_client = QueuedIngestClient(kcsb_ingest)
+ identifier = get_identifier(
+ args.commit_datetime, args.commit_hash, args.trt_version, args.branch, args.use_tensorrt_oss_parser
+ )
+ upload_time = datetime.datetime.now(tz=datetime.timezone.utc).replace(microsecond=0)
+
+ try:
+ result_mem_test_path = args.report_folder
+ os.chdir(result_mem_test_path)
+ # Parse mem_test log
+ logs = ["valgrind.log", "concurrency_test.log"]
+ csv_paths = ["mem_test.csv", "concurrency_test.csv"]
+ for log, csv_path in zip(logs, csv_paths):
+ if os.path.exists(log):
+ print(f"{identifier}: Parsing {log}")
+ if log == logs[0]:
+ parse_valgrind_log(log, csv_path, ["TensorrtExecutionProvider", "TensorRT"])
+ else:
+ parse_concurrency_test_log(log, csv_path)
+
+ # Upload to db
+ for csv_path, db_table_name in zip(csv_paths, ["ep_valgrind_record", "ep_concurrencytest_record"]):
+ if os.path.exists(csv_path):
+ table = pd.read_csv(csv_path)
+ write_table(
+ ingest_client,
+ args.database,
+ table,
+ db_table_name,
+ upload_time,
+ identifier,
+ args.branch,
+ args.commit_hash,
+ args.commit_datetime,
+ )
+ print(f"{identifier}: {csv_path} is synced to db")
+
+ except Exception as e:
+ print(str(e))
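[Editor's note] To illustrate how the two helpers added above are meant to be driven outside the `__main__` block, here is a small self-contained sketch. The log contents are fabricated for illustration, and importing the module assumes its dependencies (pandas, azure-kusto-data, azure-kusto-ingest) are installed, since post.py is imported at module level.

```python
# Sketch only: feeding hand-made logs to the parsers from parse_mem_concurrency_test.py.
# The Valgrind and concurrency log contents below are illustrative, not real runs.
from parse_mem_concurrency_test import parse_concurrency_test_log, parse_valgrind_log

with open("valgrind.log", "w") as f:
    f.write(
        "==123== 1,024 bytes in 2 blocks are definitely lost in loss record 1 of 5\n"
        "==123==    at 0x4C2FB0F: malloc\n"
        "==123==    by 0x1234: onnxruntime::TensorrtExecutionProvider::Compile\n"
        "==123==\n"
    )
# Produces mem_test.csv with one row (keyword, leak block/byte counts, Valgrind message).
parse_valgrind_log("valgrind.log", "mem_test.csv", ["TensorrtExecutionProvider", "TensorRT"])

with open("concurrency_test.log", "w") as f:
    f.write("Running concurrency test...\nFailed Test Cases:\n")  # empty section -> Passed = 1
# Produces concurrency_test.csv with a Passed flag and the raw log.
parse_concurrency_test_log("concurrency_test.log", "concurrency_test.csv")
```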
diff --git a/onnxruntime/python/tools/tensorrt/perf/post.py b/onnxruntime/python/tools/tensorrt/perf/post.py
index fe941096e2fe8..9b78743d7e751 100644
--- a/onnxruntime/python/tools/tensorrt/perf/post.py
+++ b/onnxruntime/python/tools/tensorrt/perf/post.py
@@ -3,7 +3,6 @@
# Licensed under the MIT License.
# --------------------------------------------------------------------------
import argparse
-import csv
import datetime
import os
import sys
@@ -421,11 +420,10 @@ def main():
upload_time = datetime.datetime.now(tz=datetime.timezone.utc).replace(microsecond=0)
try:
- # Load EP Perf test results from /result
result_file = args.report_folder
- result_perf_test_path = os.path.join(result_file, "result")
- folders = os.listdir(result_perf_test_path)
- os.chdir(result_perf_test_path)
+
+ folders = os.listdir(result_file)
+ os.chdir(result_file)
tables = [
fail_name,
@@ -448,13 +446,13 @@ def main():
for model_group in folders:
os.chdir(model_group)
csv_filenames = os.listdir()
- for csv_file in csv_filenames:
- table = pd.read_csv(csv_file)
- if session_name in csv_file:
+ for csv in csv_filenames:
+ table = pd.read_csv(csv)
+ if session_name in csv:
table_results[session_name] = pd.concat(
[table_results[session_name], get_session(table, model_group)], ignore_index=True
)
- elif specs_name in csv_file:
+ elif specs_name in csv:
table_results[specs_name] = pd.concat(
[
table_results[specs_name],
@@ -462,12 +460,12 @@ def main():
],
ignore_index=True,
)
- elif fail_name in csv_file:
+ elif fail_name in csv:
table_results[fail_name] = pd.concat(
[table_results[fail_name], get_failures(table, model_group)],
ignore_index=True,
)
- elif latency_name in csv_file:
+ elif latency_name in csv:
table_results[memory_name] = pd.concat(
[table_results[memory_name], get_memory(table, model_group)],
ignore_index=True,
@@ -477,11 +475,11 @@ def main():
[table_results[latency_name], get_latency(table, model_group)],
ignore_index=True,
)
- elif status_name in csv_file:
+ elif status_name in csv:
table_results[status_name] = pd.concat(
[table_results[status_name], get_status(table, model_group)], ignore_index=True
)
- elif op_metrics_name in csv_file:
+ elif op_metrics_name in csv:
table = table.assign(Group=model_group)
table_results[op_metrics_name] = pd.concat(
[table_results[op_metrics_name], table], ignore_index=True
@@ -515,43 +513,6 @@ def main():
args.commit_datetime,
)
- # Load concurrency test results
- result_mem_test_path = os.path.join(result_file, "result_mem_test")
- os.chdir(result_mem_test_path)
- log_path = "concurrency_test.log"
- if os.path.exists(log_path):
- print("Generating concurrency test report")
- with open(log_path) as log_file:
- log_content = log_file.read()
-
- failed_cases_section = log_content.split("Failed Test Cases:")[1]
-
- # passed = 1 if no failed test cases
- if failed_cases_section.strip() == "":
- passed = 1
- else:
- passed = 0
-
- csv_path = "concurrency_test.csv"
- with open(csv_path, "w", newline="") as csv_file:
- csv_writer = csv.writer(csv_file)
- csv_writer.writerow(["Passed", "Log"])
- csv_writer.writerow([passed, log_content])
-
- db_table_name = "ep_concurrencytest_record"
- table = pd.read_csv(csv_path)
- write_table(
- ingest_client,
- args.database,
- table,
- db_table_name,
- upload_time,
- identifier,
- args.branch,
- args.commit_hash,
- args.commit_datetime,
- )
-
except BaseException as e:
print(str(e))
sys.exit(1)
diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc
index 62291762f61b8..175079d8197bf 100644
--- a/onnxruntime/test/perftest/command_args_parser.cc
+++ b/onnxruntime/test/perftest/command_args_parser.cc
@@ -112,6 +112,9 @@ namespace perftest {
"\t [TensorRT only] [trt_engine_cache_enable]: Enable engine caching.\n"
"\t [TensorRT only] [trt_engine_cache_path]: Specify engine cache path.\n"
"\t [TensorRT only] [trt_engine_cache_prefix]: Customize engine cache prefix when trt_engine_cache_enable is true.\n"
+ "\t [TensorRT only] [trt_engine_hw_compatible]: Enable hardware compatibility. Engines ending with '_sm80+' can be re-used across all Ampere+ GPU (a hardware-compatible engine may have lower throughput and/or higher latency than its non-hardware-compatible counterpart).\n"
+ "\t [TensorRT only] [trt_weight_stripped_engine_enable]: Enable weight-stripped engine build.\n"
+ "\t [TensorRT only] [trt_onnx_model_folder_path]: Folder path for the ONNX model with weights.\n"
"\t [TensorRT only] [trt_force_sequential_engine_build]: Force TensorRT engines to be built sequentially.\n"
"\t [TensorRT only] [trt_context_memory_sharing_enable]: Enable TensorRT context memory sharing between subgraphs.\n"
"\t [TensorRT only] [trt_layer_norm_fp32_fallback]: Force Pow + Reduce ops in layer norm to run in FP32 to avoid overflow.\n"
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index 19a95730c565d..0d2ad51482078 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -1548,7 +1548,11 @@ def generate_build_tree(
and not args.build_wasm
):
if is_windows():
- cflags += ["/guard:cf", "/DWIN32", "/D_WINDOWS"]
+ # DLL initialization errors due to an old conda msvcp140.dll are a result of the new MSVC compiler.
+ # See https://developercommunity.visualstudio.com/t/Access-violation-with-std::mutex::lock-a/10664660#T-N10668856
+ # Remove the _DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR definition below
+ # once the conda msvcp140.dll is updated.
+ cflags += ["/guard:cf", "/DWIN32", "/D_WINDOWS", "/D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR"]
if not args.use_gdk:
# Target Windows 10
cflags += [
diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml
index a16647f17280d..7cfff805c3b3c 100644
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml
@@ -1,7 +1,7 @@
parameters:
- name: PostToDashboard
- displayName: Post to Dashboard
+ displayName: Post EP Perf results to Dashboard
type: boolean
default: true
@@ -30,7 +30,7 @@ parameters:
- "partner-models"
- name: MemTest
- displayName: Run Memory Test and Concurrency Test
+ displayName: Run Memory and Concurrency Test
type: boolean
default: true
@@ -147,11 +147,27 @@ jobs:
workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/'
condition: always()
- - task: PublishBuildArtifacts@1
- inputs:
- pathtoPublish: '$(Build.SourcesDirectory)/Artifact'
- artifactName: 'result-$(Build.BuildNumber)'
-
+ - script: 'python3 -m pip install pandas azure-kusto-data[pandas] azure-kusto-ingest[pandas] coloredlogs'
+ displayName: 'Install dashboard dependencies'
+
+ - script: |
+ az --version || {
+ echo "Azure CLI not found, installing..."
+ curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
+ }
+ displayName: 'Check and Install Azure CLI'
+
+ - task: AzureCLI@2
+ displayName: 'Parse Memory & Concurrency Test Records and Sync'
+ inputs:
+ azureSubscription: AIInfraBuildOnnxRuntimeOSS
+ scriptLocation: inlineScript
+ scriptType: bash
+ inlineScript: |
+ short_hash=$(git rev-parse --short HEAD) &&
+ commit_date=$(git log -1 --date=iso-strict --pretty=format:%cd) &&
+ python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/parse_mem_concurrency_test.py -r $(Build.SourcesDirectory)/Artifact/result_mem_test -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser)
+
- ${{ if eq(parameters.PostToDashboard, true) }}:
- script: 'python3 -m pip install pandas azure-kusto-data[pandas] azure-kusto-ingest[pandas] coloredlogs'
@@ -165,7 +181,7 @@ jobs:
displayName: 'Check and Install Azure CLI'
- task: AzureCLI@2
- displayName: 'Post EP Perf Results to Dashboard'
+ displayName: 'Azure CLI Post to Dashboard'
inputs:
azureSubscription: AIInfraBuildOnnxRuntimeOSS
scriptLocation: inlineScript
@@ -173,7 +189,12 @@ jobs:
inlineScript: |
short_hash=$(git rev-parse --short HEAD) &&
commit_date=$(git log -1 --date=iso-strict --pretty=format:%cd) &&
- python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser)
+ python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact/result -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser)
+
+ - task: PublishBuildArtifacts@1
+ inputs:
+ pathtoPublish: '$(Build.SourcesDirectory)/Artifact'
+ artifactName: 'result-$(Build.BuildNumber)'
- template: templates/component-governance-component-detection-steps.yml
parameters :
diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml
index e00425739b711..85722c1cb8d2a 100644
--- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml
@@ -11,7 +11,7 @@ steps:
packageType: upack
feed: '/7424c8e4-5c62-490e-95c4-79446f31017c'
definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0'
- version: 1.0.150
+ version: 1.0.164
downloadPath: $(Build.BinariesDirectory)/deps
# The private ADO project
@@ -22,7 +22,7 @@ steps:
packageType: upack
feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325'
definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a'
- version: 1.0.150
+ version: 1.0.164
downloadPath: $(Build.BinariesDirectory)/deps
# You can add more ADO accounts at here.