[TensorRT EP] Enhance EP context configs in session options and provider options (microsoft#19154)

Several changes:

1. To align with how other EPs set EP context configs in session
options, for example [QNN EP](microsoft#18877), EP context
configs for the TRT EP can be configured through (a usage sketch follows this list):
1. Session Options: `ep.context_enable`, `ep.context_file_path` and
`ep.context_embed_mode`
2. Provider Options: `trt_dump_ep_context_model`,
`trt_ep_context_file_path` and `trt_ep_context_embed_mode`
3. The above settings have a 1:1 mapping, and provider options take
priority over session options.
    
```
    Please note that there are rules for using the following context model related provider options:

     1. In the case of dumping the context model and loading the context model,
        for security reasons, TRT EP doesn't allow the "ep_cache_context" node attribute of the EP context node to be
        an absolute path or a relative path that points outside of the context model directory.
        This means the engine cache needs to be in the same directory as, or a sub-directory of, the context model.

     2. In the case of dumping the context model, the engine cache path will be changed to a path relative to the context model directory.
        For example:
        If "trt_dump_ep_context_model" is enabled and "trt_engine_cache_enable" is enabled,
           if "trt_ep_context_file_path" is "./context_model_dir",
           - if "trt_engine_cache_path" is "" -> the engine cache will be saved to "./context_model_dir"
           - if "trt_engine_cache_path" is "engine_dir" -> the engine cache will be saved to "./context_model_dir/engine_dir"
```

2. The user can control the name of the dumped "EP context" model by using
`trt_ep_context_file_path`; please see GetCtxModelPath() for more
details.

3. Added suggested comments from
microsoft#18217
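
For illustration, here is a minimal sketch of the session-options path using the ONNX Runtime C++ API; `model.onnx` and `./context_model_dir/model_ctx.onnx` are placeholder paths, and the TRT EP is registered with default provider options so the `ep.context_*` entries drive the EP context dumping. An equivalent provider-options sketch follows the `OrtTensorRTProviderOptionsV2` diff below.

```cpp
// Minimal sketch (assumed paths and file names): configure EP context dumping
// via the generic session-option keys this change aligns with.
#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env;
  Ort::SessionOptions so;

  // Generic EP context keys, shared with other EPs such as QNN EP.
  so.AddConfigEntry("ep.context_enable", "1");      // dump an "EP context" model
  so.AddConfigEntry("ep.context_file_path", "./context_model_dir/model_ctx.onnx");
  so.AddConfigEntry("ep.context_embed_mode", "0");  // 0 = store engine cache path, 1 = embed engine binary

  // Register TRT EP with default provider options; since no trt_* EP context
  // provider options are set, the session-option entries above take effect.
  const OrtApi& api = Ort::GetApi();
  OrtTensorRTProviderOptionsV2* trt_options = nullptr;
  Ort::ThrowOnError(api.CreateTensorRTProviderOptions(&trt_options));
  so.AppendExecutionProvider_TensorRT_V2(*trt_options);

  Ort::Session session(env, ORT_TSTR("model.onnx"), so);

  api.ReleaseTensorRTProviderOptions(trt_options);
  return 0;
}
```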
chilo-ms authored Jan 21, 2024
1 parent c8ce839 commit f3402de
Showing 12 changed files with 624 additions and 162 deletions.
@@ -11,6 +11,8 @@
/// User can only get the instance of OrtTensorRTProviderOptionsV2 via CreateTensorRTProviderOptions.
/// </summary>
struct OrtTensorRTProviderOptionsV2 {
OrtTensorRTProviderOptionsV2& operator=(const OrtTensorRTProviderOptionsV2& other); // copy assignment operator

int device_id{0}; // cuda device id.
int has_user_compute_stream{0}; // indicator of user specified CUDA compute stream.
void* user_compute_stream{nullptr}; // user specified CUDA compute stream.
@@ -46,8 +48,26 @@ struct OrtTensorRTProviderOptionsV2 {
const char* trt_profile_max_shapes{nullptr}; // Specify the range of the input shapes to build the engine with
const char* trt_profile_opt_shapes{nullptr}; // Specify the range of the input shapes to build the engine with
int trt_cuda_graph_enable{0}; // Enable CUDA graph in ORT TRT
int trt_dump_ep_context_model{0}; // Dump EP context node model
int trt_ep_context_embed_mode{0}; // Specify EP context embed mode. Default 0 = context is engine cache path, 1 = context is engine binary data
int trt_ep_context_compute_capability_enable{1}; // Add GPU compute capability as an EP context node's attribute
const char* trt_engine_cache_prefix{nullptr}; // specify engine cache prefix

/*
* Please note that there are rules for using the following context model related provider options:
*
* 1. In the case of dumping the context model and loading the context model,
*    for security reasons, TRT EP doesn't allow the "ep_cache_context" node attribute of the EP context node to be
*    an absolute path or a relative path that points outside of the context model directory.
*    This means the engine cache needs to be in the same directory as, or a sub-directory of, the context model.
*
* 2. In the case of dumping the context model, the engine cache path will be changed to a path relative to the context model directory.
* For example:
* If "trt_dump_ep_context_model" is enabled and "trt_engine_cache_enable" is enabled,
* if "trt_ep_context_file_path" is "./context_model_dir",
* - if "trt_engine_cache_path" is "" -> the engine cache will be saved to "./context_model_dir"
* - if "trt_engine_cache_path" is "engine_dir" -> the engine cache will be saved to "./context_model_dir/engine_dir"
*
*/
int trt_dump_ep_context_model{0}; // Dump EP context node model
const char* trt_ep_context_file_path{nullptr};  // Specify the file name to dump the EP context node model. Can be a file name, a file name with path, or a directory path.
int trt_ep_context_embed_mode{0}; // Specify EP context embed mode. Default 0 = context is engine cache path, 1 = context is engine binary data

const char* trt_engine_cache_prefix{nullptr}; // specify engine cache prefix
};
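
For illustration, a minimal sketch of the same configuration through the new provider options, using the C API key/value update path (the string keys mirror the fields above; paths are placeholders). When both are set, these provider options take priority over the `ep.context_*` session config entries.

```cpp
// Minimal sketch (assumed paths): configure EP context dumping via TRT provider options.
#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env;
  Ort::SessionOptions so;

  const OrtApi& api = Ort::GetApi();
  OrtTensorRTProviderOptionsV2* trt_options = nullptr;
  Ort::ThrowOnError(api.CreateTensorRTProviderOptions(&trt_options));

  const char* keys[] = {"trt_engine_cache_enable", "trt_dump_ep_context_model",
                        "trt_ep_context_file_path", "trt_ep_context_embed_mode"};
  const char* values[] = {"1", "1", "./context_model_dir", "0"};
  Ort::ThrowOnError(api.UpdateTensorRTProviderOptions(trt_options, keys, values, 4));

  so.AppendExecutionProvider_TensorRT_V2(*trt_options);
  Ort::Session session(env, ORT_TSTR("model.onnx"), so);

  // With "trt_engine_cache_path" left empty, the engine cache is saved to
  // "./context_model_dir", per the rules in the comment above.
  api.ReleaseTensorRTProviderOptions(trt_options);
  return 0;
}
```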
211 changes: 155 additions & 56 deletions onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc
@@ -38,13 +38,6 @@ const onnxruntime::Path& GetModelPath(const GraphViewer& graph_viewer) {
return main_graph.ModelPath();
}

std::filesystem::path LocateEngineRelativeToPath(std::string engine_cache_path, const onnxruntime::Path& path) {
std::filesystem::path base_path(path.ToPathString());
std::filesystem::path parent_path = base_path.parent_path();
std::filesystem::path engine_path = parent_path.append(engine_cache_path);
return engine_path;
}

/*
* Update ep_cache_context attribute of the EP context node with the given engine binary data
*/
@@ -69,14 +62,13 @@ void UpdateCtxNodeModelEngineContext(ONNX_NAMESPACE::ModelProto* model_proto,
/*
* Create "EP context node" model where engine information is embedded
*/
ONNX_NAMESPACE::ModelProto* CreateCtxNodeModel(const GraphViewer& graph_viewer,
const std::string engine_cache_path,
char* engine_data,
size_t size,
const int64_t embed_mode,
bool compute_capability_enable,
std::string compute_capability,
const logging::Logger* logger) {
ONNX_NAMESPACE::ModelProto* CreateCtxModel(const GraphViewer& graph_viewer,
const std::string engine_cache_path,
char* engine_data,
size_t size,
const int64_t embed_mode,
std::string compute_capability,
const logging::Logger* logger) {
auto model_build = graph_viewer.CreateModel(*logger);
auto& graph_build = model_build->MainGraph();

@@ -107,21 +99,20 @@ ONNX_NAMESPACE::ModelProto* CreateCtxNodeModel(const GraphViewer& graph_viewer,
engine_data_str.assign(engine_data, size);
}
attr_1->set_s(engine_data_str);
LOGS_DEFAULT(WARNING) << EPCONTEXT_WARNING;
} else {
attr_1->set_s(engine_cache_path);
}
attr_2->set_name(COMPUTE_CAPABILITY);
attr_2->set_type(onnx::AttributeProto_AttributeType_STRING);
attr_2->set_s(compute_capability);

auto node_attributes = ONNX_NAMESPACE::NodeAttributes::Create();
int num_attributes = compute_capability_enable ? 3 : 2;
int num_attributes = 3;
node_attributes->reserve(num_attributes);
node_attributes->emplace(EMBED_MODE, *attr_0);
node_attributes->emplace(EP_CACHE_CONTEXT, *attr_1);

if (compute_capability_enable) {
attr_2->set_name(COMPUTE_CAPABILITY);
attr_2->set_type(onnx::AttributeProto_AttributeType_STRING);
attr_2->set_s(compute_capability);
node_attributes->emplace(COMPUTE_CAPABILITY, *attr_2);
}
node_attributes->emplace(COMPUTE_CAPABILITY, *attr_2);

// Create EP context node
graph_build.AddNode(EPCONTEXT_OP, EPCONTEXT_OP, "", inputs, outputs, node_attributes.get(), EPCONTEXT_OP_DOMAIN);
@@ -138,14 +129,111 @@ ONNX_NAMESPACE::ModelProto* CreateCtxNodeModel(const GraphViewer& graph_viewer,
}

/*
* Dump "EP context node" model
* Return the directory where the EP context model is located
*/
std::filesystem::path GetPathOrParentPathOfCtxModel(const std::string& ep_context_file_path) {
if (ep_context_file_path.empty()) {
return std::filesystem::path();
}
std::filesystem::path ctx_path(ep_context_file_path);
if (std::filesystem::is_directory(ep_context_file_path)) {
return ctx_path;
} else {
return ctx_path.parent_path();
}
}

/*
* Get "EP context" model path.
*
* Function logic:
* If ep_context_file_path is provided,
* - If ep_context_file_path is a file, return "ep_context_file_path".
* - If ep_context_file_path is a directory, return "ep_context_file_path/original_model_name_ctx.onnx".
* If ep_context_file_path is not provided,
* - Return "original_model_name_ctx.onnx".
*
* TRT EP has rules about context model path and engine cache path (see tensorrt_execution_provider.cc):
* - If dump_ep_context_model_ and engine_cache_enabled_ are enabled, TRT EP will dump the context model and save the engine cache
* to the same directory provided by ep_context_file_path_. (i.e. engine_cache_path_ = ep_context_file_path_)
*
* Example 1:
* ep_context_file_path = "/home/user/ep_context_model_directory"
* original_model_path = "model.onnx"
* => return "/home/user/ep_context_model_folder/model_ctx.onnx"
*
* Example 2:
* ep_context_file_path = "my_ctx_model.onnx"
* original_model_path = "model.onnx"
* => return "my_ctx_model.onnx"
*
* Example 3:
* ep_context_file_path = "/home/user2/ep_context_model_directory/my_ctx_model.onnx"
* original_model_path = "model.onnx"
* => return "/home/user2/ep_context_model_directory/my_ctx_model.onnx"
*
*/
std::string GetCtxModelPath(const std::string& ep_context_file_path,
const std::string& original_model_path) {
std::string ctx_model_path;

if (!ep_context_file_path.empty() && !std::filesystem::is_directory(ep_context_file_path)) {
ctx_model_path = ep_context_file_path;
} else {
std::filesystem::path model_path = original_model_path;
std::filesystem::path model_name_stem = model_path.stem(); // model_name.onnx -> model_name
std::string ctx_model_name = model_name_stem.string() + "_ctx.onnx";

if (std::filesystem::is_directory(ep_context_file_path)) {
std::filesystem::path model_directory = ep_context_file_path;
ctx_model_path = model_directory.append(ctx_model_name).string();
} else {
ctx_model_path = ctx_model_name;
}
}
return ctx_model_path;
}

/*
* Dump "EP context" model
*
*/
void DumpCtxNodeModel(ONNX_NAMESPACE::ModelProto* model_proto,
const std::string engine_cache_path) {
std::fstream dump(engine_cache_path + "_wrapper.onnx", std::ios::out | std::ios::trunc | std::ios::binary);
void DumpCtxModel(ONNX_NAMESPACE::ModelProto* model_proto,
const std::string& ctx_model_path) {
std::fstream dump(ctx_model_path, std::ios::out | std::ios::trunc | std::ios::binary);
model_proto->SerializeToOstream(dump);
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + engine_cache_path + "_wrapper.onnx";
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Dumped " + ctx_model_path;
}

bool IsAbsolutePath(std::string& path_string) {
#ifdef _WIN32
onnxruntime::PathString ort_path_string = onnxruntime::ToPathString(path_string);
auto path = std::filesystem::path(ort_path_string.c_str());
return path.is_absolute();
#else
if (!path_string.empty() && path_string[0] == '/') {
return true;
}
return false;
#endif
}

// Like "../file_path"
bool IsRelativePathToParentPath(std::string& path_string) {
#ifdef _WIN32
onnxruntime::PathString ort_path_string = onnxruntime::ToPathString(path_string);
auto path = std::filesystem::path(ort_path_string.c_str());
auto relative_path = path.lexically_normal().make_preferred().wstring();
if (relative_path.find(L"..", 0) != std::string::npos) {
return true;
}
return false;
#else
if (!path_string.empty() && path_string.find("..", 0) != std::string::npos) {
return true;
}
return false;
#endif
}

Status TensorRTCacheModelHandler::GetEpContextFromGraph(const GraphViewer& graph_viewer) {
@@ -157,7 +245,7 @@ Status TensorRTCacheModelHandler::GetEpContextFromGraph(const GraphViewer& graph

const int64_t embed_mode = attrs.at(EMBED_MODE).i();
if (embed_mode) {
// Get engine from byte stream
// Get engine from byte stream.
const std::string& context_binary = attrs.at(EP_CACHE_CONTEXT).s();
*(trt_engine_) = std::unique_ptr<nvinfer1::ICudaEngine>(trt_runtime_->deserializeCudaEngine(const_cast<char*>(context_binary.c_str()),
static_cast<size_t>(context_binary.length())));
@@ -167,19 +255,41 @@ Status TensorRTCacheModelHandler::GetEpContextFromGraph(const GraphViewer& graph
"TensorRT EP could not deserialize engine from binary data");
}
} else {
// Get engine from cache file
std::ifstream engine_file(engine_cache_path_.string(), std::ios::binary | std::ios::in);
// Get engine from cache file.
std::string cache_path = attrs.at(EP_CACHE_CONTEXT).s();

// For security purposes, in the case of running the context model, TRT EP won't allow the
// engine cache path to be a relative path like "../file_path" or an absolute path.
// It only allows the engine cache to be in the same directory as, or a sub-directory of, the context model.
if (IsAbsolutePath(cache_path)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "For security purpose, the ep_cache_context attribute should be set with a relative path, but it is an absolute path: " + cache_path);
}
if (IsRelativePathToParentPath(cache_path)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "The file path in ep_cache_context attribute has '..'. For security purpose, it's not allowed to point outside the directory.");
}

// The engine cache and context model (current model) should be in the same directory
std::filesystem::path ctx_model_dir(GetPathOrParentPathOfCtxModel(ep_context_model_path_));
auto engine_cache_path = ctx_model_dir.append(cache_path);

if (!std::filesystem::exists(engine_cache_path)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP can't find engine cache: " + engine_cache_path.string() +
". Please make sure engine cache is in the same directory or sub-directory of context model.");
}

std::ifstream engine_file(engine_cache_path.string(), std::ios::binary | std::ios::in);
engine_file.seekg(0, std::ios::end);
size_t engine_size = engine_file.tellg();
engine_file.seekg(0, std::ios::beg);
std::unique_ptr<char[]> engine_buf{new char[engine_size]};
engine_file.read((char*)engine_buf.get(), engine_size);
*(trt_engine_) = std::unique_ptr<nvinfer1::ICudaEngine>(trt_runtime_->deserializeCudaEngine(engine_buf.get(), engine_size));
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + engine_cache_path_.string();
if (!(*trt_engine_)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not deserialize engine from cache: " + engine_cache_path_.string());
"TensorRT EP could not deserialize engine from cache: " + engine_cache_path.string());
}
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + engine_cache_path.string();
}
return Status::OK();
}
@@ -193,37 +303,26 @@ bool TensorRTCacheModelHandler::ValidateEPCtxNode(const GraphViewer& graph_viewe
auto node = graph_viewer.GetNode(0);
auto& attrs = node->GetAttributes();

// Check hardware_architecture(compute_capability) if it's present as an attribute
// Show the warning if compute capability is not matched
if (attrs.count(COMPUTE_CAPABILITY) > 0) {
std::string model_compute_capability = attrs.at(COMPUTE_CAPABILITY).s();
if (model_compute_capability != compute_capability_) {
LOGS_DEFAULT(ERROR) << "The compute capability of the engine cache doesn't match with the GPU's compute capability";
LOGS_DEFAULT(ERROR) << "The compute capability of the engine cache: " << model_compute_capability;
LOGS_DEFAULT(ERROR) << "The compute capability of the GPU: " << compute_capability_;
return false;
LOGS_DEFAULT(WARNING) << "[TensorRT EP] Engine was compiled for a different compatibility level and might not work or perform suboptimal";
LOGS_DEFAULT(WARNING) << "[TensorRT EP] The compute capability of the engine: " << model_compute_capability;
LOGS_DEFAULT(WARNING) << "[TensorRT EP] The compute capability of the GPU: " << compute_capability_;
}
}

// "embed_mode" attr and "ep_cache_context" attr should be present
if (attrs.count(EMBED_MODE) > 0 && attrs.count(EP_CACHE_CONTEXT) > 0) {
// ep_cache_context: payload of the execution provider context if embed_mode=1, or path to the context file if embed_mode=0
const int64_t embed_mode = attrs.at(EMBED_MODE).i();

// engine cache path
if (embed_mode == 0) {
// First assume engine cache path is relatvie to model path,
// If not, then assume the engine cache path is an absolute path.
engine_cache_path_ = LocateEngineRelativeToPath(attrs.at(EP_CACHE_CONTEXT).s(), GetModelPath(graph_viewer));
auto default_engine_cache_path_ = engine_cache_path_;
if (!std::filesystem::exists(engine_cache_path_)) {
engine_cache_path_.assign(attrs.at(EP_CACHE_CONTEXT).s());
if (!std::filesystem::exists(engine_cache_path_)) {
LOGS_DEFAULT(ERROR) << "Can't find " << default_engine_cache_path_.string() << " or " << engine_cache_path_.string() << " TensorRT engine";
return false;
}
}
}
assert(attrs.count(EMBED_MODE) > 0);
assert(attrs.count(EP_CACHE_CONTEXT) > 0);

const int64_t embed_mode = attrs.at(EMBED_MODE).i();
if (embed_mode == 1) {
// engine binary data
LOGS_DEFAULT(WARNING) << EPCONTEXT_WARNING;
}

return true;
}
} // namespace onnxruntime
