[TensorRT EP] Enhance EP context configs in session options and provider options #19154

Merged 26 commits on Jan 21, 2024
include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h
@@ -11,6 +11,8 @@
/// User can only get the instance of OrtTensorRTProviderOptionsV2 via CreateTensorRTProviderOptions.
/// </summary>
struct OrtTensorRTProviderOptionsV2 {
OrtTensorRTProviderOptionsV2& operator=(const OrtTensorRTProviderOptionsV2& other); // copy assignment operator

int device_id{0}; // cuda device id.
int has_user_compute_stream{0}; // indicator of user specified CUDA compute stream.
void* user_compute_stream{nullptr}; // user specified CUDA compute stream.
@@ -46,8 +48,26 @@
const char* trt_profile_max_shapes{nullptr}; // Specify the range of the input shapes to build the engine with
const char* trt_profile_opt_shapes{nullptr}; // Specify the range of the input shapes to build the engine with
int trt_cuda_graph_enable{0}; // Enable CUDA graph in ORT TRT
int trt_dump_ep_context_model{0}; // Dump EP context node model
int trt_ep_context_embed_mode{0}; // Specify EP context embed mode. Default 0 = context is engine cache path, 1 = context is engine binary data
int trt_ep_context_compute_capability_enable{1}; // Add GPU compute capability as an EP context node's attribute
const char* trt_engine_cache_prefix{nullptr}; // specify engine cache prefix

/*
* Please note that there are rules for using the following context-model-related provider options:
*
* 1. In the case of dumping the context model and loading the context model,
* for security reasons, TRT EP doesn't allow the "ep_cache_context" node attribute of the EP context node to be
* an absolute path or a relative path that points outside of the context model directory.
* This means the engine cache needs to be in the same directory as, or a sub-directory of, the context model.
*
* 2. In the case of dumping the context model, the engine cache path will be changed to a path relative to the context model directory.
* For example:
* If "trt_dump_ep_context_model" is enabled and "trt_engine_cache_enable" is enabled,
* if "trt_ep_context_file_path" is "./context_model_dir",
* - if "trt_engine_cache_path" is "" -> the engine cache will be saved to "./context_model_dir"
* - if "trt_engine_cache_path" is "engine_dir" -> the engine cache will be saved to "./context_model_dir/engine_dir"
*
*/
int trt_dump_ep_context_model{0}; // Dump EP context node model
const char* trt_ep_context_file_path{nullptr}; // Specify the file name for dumping the EP context node model. Can be a path, a file name, or a file name with a path.

GitHub Actions / cpplint warning: include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h:69: Lines should be <= 120 characters long [whitespace/line_length] [2]
int trt_ep_context_embed_mode{0}; // Specify EP context embed mode. Default 0 = context is engine cache path, 1 = context is engine binary data

GitHub Actions / cpplint warning: include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h:70: Lines should be <= 120 characters long [whitespace/line_length] [2]

const char* trt_engine_cache_prefix{nullptr}; // specify engine cache prefix
};
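As a rough sketch of how these options fit together, the snippet below configures EP context dumping through the provider-options C API. The key strings mirror the struct fields above, while the exact call sequence (CreateTensorRTProviderOptions / UpdateTensorRTProviderOptions) is assumed from the existing OrtTensorRTProviderOptionsV2 workflow rather than taken from this diff, and error handling is omitted.

// Illustrative sketch, not part of this PR: enable engine caching and EP context
// dumping so that engines land in "./context_model_dir/engine_dir" per the rules above.
#include <onnxruntime_c_api.h>

void ConfigureEpContextDump(OrtSessionOptions* session_options) {
  const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION);

  OrtTensorRTProviderOptionsV2* trt_options = nullptr;
  api->CreateTensorRTProviderOptions(&trt_options);  // status checks omitted in this sketch

  const char* keys[] = {"trt_engine_cache_enable", "trt_dump_ep_context_model",
                        "trt_ep_context_file_path", "trt_engine_cache_path"};
  const char* values[] = {"1", "1", "./context_model_dir", "engine_dir"};
  api->UpdateTensorRTProviderOptions(trt_options, keys, values, 4);

  api->SessionOptionsAppendExecutionProvider_TensorRT_V2(session_options, trt_options);
  api->ReleaseTensorRTProviderOptions(trt_options);
}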
211 changes: 155 additions & 56 deletions onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc
@@ -38,13 +38,6 @@
return main_graph.ModelPath();
}

std::filesystem::path LocateEngineRelativeToPath(std::string engine_cache_path, const onnxruntime::Path& path) {
std::filesystem::path base_path(path.ToPathString());
std::filesystem::path parent_path = base_path.parent_path();
std::filesystem::path engine_path = parent_path.append(engine_cache_path);
return engine_path;
}

/*
* Update ep_cache_context attribute of the EP context node with the given engine binary data
*/
@@ -69,14 +62,13 @@
/*
* Create "EP context node" model where engine information is embedded
*/
ONNX_NAMESPACE::ModelProto* CreateCtxNodeModel(const GraphViewer& graph_viewer,
const std::string engine_cache_path,
char* engine_data,
size_t size,
const int64_t embed_mode,
bool compute_capability_enable,
std::string compute_capability,
const logging::Logger* logger) {
ONNX_NAMESPACE::ModelProto* CreateCtxModel(const GraphViewer& graph_viewer,
const std::string engine_cache_path,
char* engine_data,
size_t size,
const int64_t embed_mode,
std::string compute_capability,
const logging::Logger* logger) {
auto model_build = graph_viewer.CreateModel(*logger);
auto& graph_build = model_build->MainGraph();

@@ -107,21 +99,20 @@
engine_data_str.assign(engine_data, size);
}
attr_1->set_s(engine_data_str);
LOGS_DEFAULT(WARNING) << EPCONTEXT_WARNING;
} else {
attr_1->set_s(engine_cache_path);
}
attr_2->set_name(COMPUTE_CAPABILITY);
attr_2->set_type(onnx::AttributeProto_AttributeType_STRING);
attr_2->set_s(compute_capability);

auto node_attributes = ONNX_NAMESPACE::NodeAttributes::Create();
int num_attributes = compute_capability_enable ? 3 : 2;
int num_attributes = 3;
node_attributes->reserve(num_attributes);
node_attributes->emplace(EMBED_MODE, *attr_0);
node_attributes->emplace(EP_CACHE_CONTEXT, *attr_1);

if (compute_capability_enable) {
attr_2->set_name(COMPUTE_CAPABILITY);
attr_2->set_type(onnx::AttributeProto_AttributeType_STRING);
attr_2->set_s(compute_capability);
node_attributes->emplace(COMPUTE_CAPABILITY, *attr_2);
}
node_attributes->emplace(COMPUTE_CAPABILITY, *attr_2);

// Create EP context node
graph_build.AddNode(EPCONTEXT_OP, EPCONTEXT_OP, "", inputs, outputs, node_attributes.get(), EPCONTEXT_OP_DOMAIN);
@@ -138,14 +129,111 @@
}

/*
* Dump "EP context node" model
* Return the directory where the EP context model is located
*/
std::filesystem::path GetPathOrParentPathOfCtxModel(const std::string& ep_context_file_path) {
if (ep_context_file_path.empty()) {
return std::filesystem::path();
}
std::filesystem::path ctx_path(ep_context_file_path);
if (std::filesystem::is_directory(ep_context_file_path)) {
return ctx_path;
} else {
return ctx_path.parent_path();
}
}
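A small usage sketch of the helper above (the concrete paths are hypothetical, and the results depend on std::filesystem::is_directory seeing a real directory on disk):

std::filesystem::path d1 = GetPathOrParentPathOfCtxModel("/home/user/ctx_dir");                 // -> "/home/user/ctx_dir" (existing directory)
std::filesystem::path d2 = GetPathOrParentPathOfCtxModel("/home/user/ctx_dir/model_ctx.onnx");  // -> "/home/user/ctx_dir" (parent of the file)
std::filesystem::path d3 = GetPathOrParentPathOfCtxModel("");                                   // -> empty path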

/*
* Get "EP context" model path.
*
* Function logic:
* If ep_context_file_path is provided,
* - If ep_context_file_path is a file, return "ep_context_file_path".
* - If ep_context_file_path is a directory, return "ep_context_file_path/original_model_name_ctx.onnx".
* If ep_context_file_path is not provided,
* - Return "original_model_name_ctx.onnx".
*
* TRT EP has rules about the context model path and engine cache path (see tensorrt_execution_provider.cc):
* - If dump_ep_context_model_ and engine_cache_enabled_ are enabled, TRT EP will dump the context model and save the engine cache
* to the same directory provided by ep_context_file_path_. (i.e. engine_cache_path_ = ep_context_file_path_)
*
* Example 1:
* ep_context_file_path = "/home/user/ep_context_model_directory"
* original_model_path = "model.onnx"
* => return "/home/user/ep_context_model_folder/model_ctx.onnx"
*
* Example 2:
* ep_context_file_path = "my_ctx_model.onnx"
* original_model_path = "model.onnx"
* => return "my_ctx_model.onnx"
*
* Example 3:
* ep_context_file_path = "/home/user2/ep_context_model_directory/my_ctx_model.onnx"
* original_model_path = "model.onnx"
* => return "/home/user2/ep_context_model_directory/my_ctx_model.onnx"
*
*/
std::string GetCtxModelPath(const std::string& ep_context_file_path,
const std::string& original_model_path) {
std::string ctx_model_path;

if (!ep_context_file_path.empty() && !std::filesystem::is_directory(ep_context_file_path)) {
ctx_model_path = ep_context_file_path;
} else {
std::filesystem::path model_path = original_model_path;
std::filesystem::path model_name_stem = model_path.stem(); // model_name.onnx -> model_name
std::string ctx_model_name = model_name_stem.string() + "_ctx.onnx";

if (std::filesystem::is_directory(ep_context_file_path)) {
std::filesystem::path model_directory = ep_context_file_path;
ctx_model_path = model_directory.append(ctx_model_name).string();
} else {
ctx_model_path = ctx_model_name;
}
}
return ctx_model_path;
}
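The calls below mirror Examples 1-3 from the comment above; they are illustrative only, and the first result assumes "/home/user/ep_context_model_directory" exists as a directory so that std::filesystem::is_directory reports it as such.

std::string p1 = GetCtxModelPath("/home/user/ep_context_model_directory", "model.onnx");
// p1 == "/home/user/ep_context_model_directory/model_ctx.onnx"

std::string p2 = GetCtxModelPath("my_ctx_model.onnx", "model.onnx");
// p2 == "my_ctx_model.onnx"

std::string p3 = GetCtxModelPath("", "model.onnx");
// p3 == "model_ctx.onnx"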

/*
* Dump "EP context" model
*
*/
void DumpCtxNodeModel(ONNX_NAMESPACE::ModelProto* model_proto,
const std::string engine_cache_path) {
std::fstream dump(engine_cache_path + "_wrapper.onnx", std::ios::out | std::ios::trunc | std::ios::binary);
void DumpCtxModel(ONNX_NAMESPACE::ModelProto* model_proto,
const std::string& ctx_model_path) {
std::fstream dump(ctx_model_path, std::ios::out | std::ios::trunc | std::ios::binary);
model_proto->SerializeToOstream(dump);
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + engine_cache_path + "_wrapper.onnx";
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Dumped " + ctx_model_path;
}
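To show how the two helpers above combine, here is an assumed call-site sketch; graph_viewer, serialized_engine, compute_capability, and logger are placeholders, not names from this diff.

// Build a context model that references the engine cache by relative path
// (embed_mode = 0), then dump it into the context model directory.
std::string ctx_model_path = GetCtxModelPath("./context_model_dir", "model.onnx");
ONNX_NAMESPACE::ModelProto* model_proto =
    CreateCtxModel(graph_viewer,
                   "engine_dir/model.engine",                      // engine cache path, relative to the context model dir
                   static_cast<char*>(serialized_engine->data()),  // engine bytes (only embedded when embed_mode = 1)
                   serialized_engine->size(),
                   /*embed_mode=*/0,
                   compute_capability,                             // e.g. "86"
                   &logger);
DumpCtxModel(model_proto, ctx_model_path);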

bool IsAbsolutePath(std::string& path_string) {
#ifdef _WIN32
onnxruntime::PathString ort_path_string = onnxruntime::ToPathString(path_string);
auto path = std::filesystem::path(ort_path_string.c_str());
return path.is_absolute();
#else
if (!path_string.empty() && path_string[0] == '/') {
return true;
}
return false;
#endif
}

// Like "../file_path"
bool IsRelativePathToParentPath(std::string& path_string) {
#ifdef _WIN32
onnxruntime::PathString ort_path_string = onnxruntime::ToPathString(path_string);
auto path = std::filesystem::path(ort_path_string.c_str());
auto relative_path = path.lexically_normal().make_preferred().wstring();
if (relative_path.find(L"..", 0) != std::string::npos) {
return true;
}
return false;
#else
if (!path_string.empty() && path_string.find("..", 0) != std::string::npos) {
return true;
}
return false;
#endif
}
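A minimal sketch of how the two checks above gate the ep_cache_context value (the paths are hypothetical):

std::string ok     = "engine_dir/model.engine";  // relative and inside the context model dir
std::string abs    = "/tmp/model.engine";        // absolute path
std::string escape = "../model.engine";          // points outside the context model dir

// IsAbsolutePath(ok) == false && IsRelativePathToParentPath(ok) == false  -> accepted
// IsAbsolutePath(abs) == true                                             -> rejected with EP_FAIL
// IsRelativePathToParentPath(escape) == true                              -> rejected with EP_FAIL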

Status TensorRTCacheModelHandler::GetEpContextFromGraph(const GraphViewer& graph_viewer) {
@@ -157,7 +245,7 @@

const int64_t embed_mode = attrs.at(EMBED_MODE).i();
if (embed_mode) {
// Get engine from byte stream
// Get engine from byte stream.
const std::string& context_binary = attrs.at(EP_CACHE_CONTEXT).s();
*(trt_engine_) = std::unique_ptr<nvinfer1::ICudaEngine>(trt_runtime_->deserializeCudaEngine(const_cast<char*>(context_binary.c_str()),
static_cast<size_t>(context_binary.length())));
@@ -167,19 +255,41 @@
"TensorRT EP could not deserialize engine from binary data");
}
} else {
// Get engine from cache file
std::ifstream engine_file(engine_cache_path_.string(), std::ios::binary | std::ios::in);
// Get engine from cache file.
std::string cache_path = attrs.at(EP_CACHE_CONTEXT).s();

// For security purposes, when running a context model, TRT EP won't allow
// the engine cache path to be a relative path like "../file_path" or an absolute path.
// It only allows the engine cache to be in the same directory as, or a sub-directory of, the context model.
if (IsAbsolutePath(cache_path)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "For security purpose, the ep_cache_context attribute should be set with a relative path, but it is an absolute path: " + cache_path);

GitHub Actions / cpplint warning: onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc:265: Lines should be <= 120 characters long [whitespace/line_length] [2]
}
if (IsRelativePathToParentPath(cache_path)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "The file path in ep_cache_context attribute has '..'. For security purpose, it's not allowed to point outside the directory.");

GitHub Actions / cpplint warning: onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc:268: Lines should be <= 120 characters long [whitespace/line_length] [2]
}

// The engine cache and context model (current model) should be in the same directory
std::filesystem::path ctx_model_dir(GetPathOrParentPathOfCtxModel(ep_context_model_path_));
auto engine_cache_path = ctx_model_dir.append(cache_path);

if (!std::filesystem::exists(engine_cache_path)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP can't find engine cache: " + engine_cache_path.string() +
". Please make sure engine cache is in the same directory or sub-directory of context model.");

GitHub Actions / cpplint warning: onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc:278: Lines should be <= 120 characters long [whitespace/line_length] [2]
}

std::ifstream engine_file(engine_cache_path.string(), std::ios::binary | std::ios::in);
engine_file.seekg(0, std::ios::end);
size_t engine_size = engine_file.tellg();
engine_file.seekg(0, std::ios::beg);
std::unique_ptr<char[]> engine_buf{new char[engine_size]};
engine_file.read((char*)engine_buf.get(), engine_size);
*(trt_engine_) = std::unique_ptr<nvinfer1::ICudaEngine>(trt_runtime_->deserializeCudaEngine(engine_buf.get(), engine_size));
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + engine_cache_path_.string();
if (!(*trt_engine_)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not deserialize engine from cache: " + engine_cache_path_.string());
"TensorRT EP could not deserialize engine from cache: " + engine_cache_path.string());
}
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + engine_cache_path.string();
}
return Status::OK();
}
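For completeness, a sketch of consuming a dumped context model through the C++ API follows; the wrapper calls are assumptions based on the standard ONNX Runtime C++ bindings and are not part of this change.

#include <onnxruntime_cxx_api.h>

// Illustrative sketch: loading "./context_model_dir/model_ctx.onnx" goes through
// GetEpContextFromGraph above, which deserializes the engine either from the
// embedded bytes (embed_mode = 1) or from a cache file resolved relative to the
// context model directory (embed_mode = 0).
int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "ep_context_demo");
  Ort::SessionOptions session_options;

  OrtTensorRTProviderOptionsV2* trt_options = nullptr;
  Ort::GetApi().CreateTensorRTProviderOptions(&trt_options);  // status check omitted
  session_options.AppendExecutionProvider_TensorRT_V2(*trt_options);

  Ort::Session session(env, ORT_TSTR("./context_model_dir/model_ctx.onnx"), session_options);

  Ort::GetApi().ReleaseTensorRTProviderOptions(trt_options);
  return 0;
}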
Expand All @@ -193,37 +303,26 @@
auto node = graph_viewer.GetNode(0);
auto& attrs = node->GetAttributes();

// Check hardware_architecture (compute capability) if it's present as an attribute.
// Show a warning if the compute capability doesn't match.
if (attrs.count(COMPUTE_CAPABILITY) > 0) {
std::string model_compute_capability = attrs.at(COMPUTE_CAPABILITY).s();
if (model_compute_capability != compute_capability_) {
LOGS_DEFAULT(ERROR) << "The compute capability of the engine cache doesn't match with the GPU's compute capability";
LOGS_DEFAULT(ERROR) << "The compute capability of the engine cache: " << model_compute_capability;
LOGS_DEFAULT(ERROR) << "The compute capability of the GPU: " << compute_capability_;
return false;
LOGS_DEFAULT(WARNING) << "[TensorRT EP] Engine was compiled for a different compatibility level and might not work or perform suboptimal";

GitHub Actions / cpplint warning: onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc:310: Lines should be <= 120 characters long [whitespace/line_length] [2]
LOGS_DEFAULT(WARNING) << "[TensorRT EP] The compute capability of the engine: " << model_compute_capability;
LOGS_DEFAULT(WARNING) << "[TensorRT EP] The compute capability of the GPU: " << compute_capability_;
}
}

// "embed_mode" attr and "ep_cache_context" attr should be present
if (attrs.count(EMBED_MODE) > 0 && attrs.count(EP_CACHE_CONTEXT) > 0) {
// ep_cache_context: payload of the execution provider context if embed_mode=1, or path to the context file if embed_mode=0
const int64_t embed_mode = attrs.at(EMBED_MODE).i();

// engine cache path
if (embed_mode == 0) {
// First assume engine cache path is relative to model path,
// If not, then assume the engine cache path is an absolute path.
engine_cache_path_ = LocateEngineRelativeToPath(attrs.at(EP_CACHE_CONTEXT).s(), GetModelPath(graph_viewer));
auto default_engine_cache_path_ = engine_cache_path_;
if (!std::filesystem::exists(engine_cache_path_)) {
engine_cache_path_.assign(attrs.at(EP_CACHE_CONTEXT).s());
if (!std::filesystem::exists(engine_cache_path_)) {
LOGS_DEFAULT(ERROR) << "Can't find " << default_engine_cache_path_.string() << " or " << engine_cache_path_.string() << " TensorRT engine";
return false;
}
}
}
assert(attrs.count(EMBED_MODE) > 0);
assert(attrs.count(EP_CACHE_CONTEXT) > 0);

const int64_t embed_mode = attrs.at(EMBED_MODE).i();
if (embed_mode == 1) {
// engine binary data
LOGS_DEFAULT(WARNING) << EPCONTEXT_WARNING;
}

return true;
}
} // namespace onnxruntime
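Finally, the single-node layout that ValidateEPCtxNode expects can be sketched with plain ONNX protobuf setters, reusing the constants referenced above (EPCONTEXT_OP, EPCONTEXT_OP_DOMAIN, EMBED_MODE, EP_CACHE_CONTEXT, COMPUTE_CAPABILITY); the concrete attribute values are illustrative.

// Illustrative sketch, not part of this diff: an EP context node with
// embed_mode = 0 pointing at an engine cache next to the context model.
ONNX_NAMESPACE::NodeProto node;
node.set_op_type(EPCONTEXT_OP);
node.set_domain(EPCONTEXT_OP_DOMAIN);

auto* embed_mode = node.add_attribute();
embed_mode->set_name(EMBED_MODE);
embed_mode->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INT);
embed_mode->set_i(0);  // 0 = ep_cache_context stores an engine cache path

auto* cache_ctx = node.add_attribute();
cache_ctx->set_name(EP_CACHE_CONTEXT);
cache_ctx->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_STRING);
cache_ctx->set_s("engine_dir/model.engine");  // must stay inside the context model directory

auto* capability = node.add_attribute();
capability->set_name(COMPUTE_CAPABILITY);
capability->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_STRING);
capability->set_s("86");  // GPU compute capability; a mismatch now only logs a warning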