Skip to content

Commit

Permalink
new test control_flow, error: ErrorMessage:Failed to find kernel for …
Browse files Browse the repository at this point in the history
…MemcpyFromHost(1) (node:'Memcpy' ep:'tensorrtEp'). Kernel not found
  • Loading branch information
jslhcl committed Oct 2, 2024
1 parent a407944 commit f871b25
Show file tree
Hide file tree
Showing 5 changed files with 292 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -546,7 +546,7 @@ class TensorrtExecutionProvider : public IExecutionProvider {

/**
* The newly-built graph has not yet being resolved by Graph::Resolve(), so we can't leverage
* Graph::ResolveContext::IsOuterScopeValue(). We have to implement this fuction again.
* Graph::ResolveContext::IsOuterScopeValue(). We have to implement this function again.
*/
bool IsOuterScopeValue(const Graph& graph, const std::string& name) const;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ std::string GetUniqueGraphName(const Graph& graph) {
} // namespace

// The newly-built graph has not yet being resolved by Graph::Resolve(), so we can't leverage
// Graph::ResolveContext::IsInputInitializerOrOutput(). We have to implement this fuction again.
// Graph::ResolveContext::IsInputInitializerOrOutput(). We have to implement this function again.
bool TensorrtExecutionProvider::IsInputInitializerOrOutput(const Graph& graph,
const std::string& name,
bool check_ancestors) const {
Expand Down Expand Up @@ -125,7 +125,7 @@ void TensorrtExecutionProvider::BuildSubGraphContext(const Graph& graph) const {
}
}

// Set outer scope values for subgraphs and add thoes values as top-level graph's inputs if needed.
// Set outer scope values for subgraphs and add those values as top-level graph's inputs if needed.
void TensorrtExecutionProvider::SetGraphOuterScopeValuesAndInputs(Graph& graph_build,
const Graph& graph) const {
// Iterate all the nodes and recurse into inner most subgraph first for both newly built graph and original graph
Expand Down
249 changes: 245 additions & 4 deletions onnxruntime/core/session/onnxruntime_c_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include "core/providers/get_execution_providers.h"
#include "core/session/environment.h"
#include "core/framework/callback.h"
#include "core/framework/murmurhash3.h"
#include "core/framework/tensorprotoutils.h"
#include "core/framework/onnxruntime_typeinfo.h"
#include "core/session/inference_session.h"
Expand Down Expand Up @@ -2594,6 +2595,244 @@ ORT_API_STATUS_IMPL(OrtApis::OrtGraph_DeserializeFromArray, const void* data, si
return nullptr;
}

struct SubGraphContext2 {
std::unordered_set<std::string> output_args;
std::unordered_map<std::string, const NodeArg*> inputs_and_initializers;
std::unordered_map<std::string, const NodeArg*> manually_added_graph_inputs;
};

static std::string GetUniqueGraphName(const Graph& graph) {
HashValue model_hash = 0;
uint32_t hash[4] = {0, 0, 0, 0};

auto hash_str = [&hash](const std::string& str) {
MurmurHash3::x86_128(str.data(), gsl::narrow_cast<int32_t>(str.size()), hash[0], &hash);
};

// Hash all nodes' name
for (int i = 0; i < graph.MaxNodeIndex(); ++i) {
auto node = graph.GetNode(i);
if (node == nullptr) {
continue;
}
hash_str(node->Name());
}

model_hash = hash[0] | (uint64_t(hash[1]) << 32);

return graph.Name() + "_" + std::to_string(model_hash);
}

static bool IsLocalValue(const Graph& graph,
const std::string& name,
const std::unordered_map<std::string, std::unique_ptr<SubGraphContext2>>& subgraph_context_map) {
std::string unique_graph_name = GetUniqueGraphName(graph);
if (subgraph_context_map.find(unique_graph_name) == subgraph_context_map.end()) {
return false;
}
SubGraphContext2* context = subgraph_context_map.at(unique_graph_name).get();
return context->output_args.find(name) != context->output_args.cend() ||
context->inputs_and_initializers.find(name) != context->inputs_and_initializers.cend();
}

static bool IsInputInitializerOrOutput(const Graph& graph,
const std::string& name,
bool check_ancestors,
const std::unordered_map<std::string, std::unique_ptr<SubGraphContext2>>& subgraph_context_map) {
const Graph* parent_graph = nullptr;
return IsLocalValue(graph, name, subgraph_context_map) ||
(check_ancestors && (parent_graph = graph.ParentGraph()) != nullptr &&
IsInputInitializerOrOutput(*parent_graph, name, check_ancestors, subgraph_context_map));
}

static bool IsOuterScopeValue(const Graph& graph,
const std::string& name,
const std::unordered_map<std::string, std::unique_ptr<SubGraphContext2>>& subgraph_context_map) {
const Graph* parent_graph = nullptr;
return (parent_graph = graph.ParentGraph()) != nullptr &&
IsInputInitializerOrOutput(*parent_graph, name, true, subgraph_context_map);
}

static void BuildSubGraphContext(const Graph& graph, std::unordered_map<std::string, std::unique_ptr<SubGraphContext2>>& subgraph_context_map) {
// Iterate all the nodes and recurse into inner most subgraph first
for (int i = 0; i < graph.MaxNodeIndex(); ++i) {
auto node = graph.GetNode(i);
if (node == nullptr) {
continue;
}

auto subgraph_map = node->GetAttributeNameToSubgraphMap();
for (auto& entry : subgraph_map) {
const Graph* subgraph = entry.second;
BuildSubGraphContext(*subgraph, subgraph_context_map);
}
}

std::string unique_graph_name = GetUniqueGraphName(graph);

// Subgraph context has been built before, no need to do it again
if (subgraph_context_map.find(unique_graph_name) != subgraph_context_map.end()) {
return;
}

subgraph_context_map.emplace(unique_graph_name, std::make_unique<SubGraphContext2>());
SubGraphContext2* context = subgraph_context_map.at(unique_graph_name).get();

// Collect all nodes' outputs and nodes' name
for (int i = 0; i < graph.MaxNodeIndex(); ++i) {
auto node = graph.GetNode(i);
if (node == nullptr) {
continue;
}

for (const auto& output : node->OutputDefs()) {
context->output_args.insert(output->Name());
}
}

// Go thru all node's inputs
for (int i = 0; i < graph.MaxNodeIndex(); ++i) {
auto node = graph.GetNode(i);
if (node == nullptr) {
continue;
}

for (const auto& input : node->InputDefs()) {
if (context->output_args.find(input->Name()) != context->output_args.end()) {
continue;
}
// This input arg is not the output of another node so must come from either a graph input or an initializer.
context->inputs_and_initializers[input->Name()] = input;
}
}
}

static void SetGraphOuterScopeValuesAndInputs(Graph& graph_build,
const Graph& graph,
std::unordered_map<std::string, std::unique_ptr<SubGraphContext2>>& subgraph_context_map) {
// Iterate all the nodes and recurse into inner most subgraph first for both newly built graph and original graph
for (int i = 0; i < graph_build.MaxNodeIndex(); ++i) {
auto graph_build_node = graph_build.GetNode(i);
if (graph_build_node == nullptr) {
continue;
}

auto graph_build_map = graph_build_node->GetAttributeNameToMutableSubgraphMap();
std::unordered_map<std::string, gsl::not_null<const Graph*>> subgraph_map;
const Node* graph_node = nullptr;

// Find corresponding original graph node's subgraphs
for (int j = 0; j < graph.MaxNodeIndex(); ++j) {
if (graph.GetNode(j) && graph.GetNode(j)->Name() == graph_build_node->Name()) {
graph_node = graph.GetNode(j);
subgraph_map = graph_node->GetAttributeNameToSubgraphMap();
break;
}
}

for (auto& entry : graph_build_map) {
auto attr_name = entry.first;
Graph* subgraph_build = entry.second;
if (subgraph_map.find(attr_name) != subgraph_map.end()) {
// recurse into subgraph
const Graph* subgraph = subgraph_map.at(attr_name);
SetGraphOuterScopeValuesAndInputs(*subgraph_build, *subgraph, subgraph_context_map);
}
}
}

// Start from the inner most subgraph first and check whether its outer scope values are existed in the
// newly built graph. If not, we need to add those outer scope values as explicit inputs to the top-level
// of newly built graph.
if (graph_build.ParentNode()) {
auto top_level_graph = &graph_build;
while (top_level_graph->MutableParentGraph()) {
top_level_graph = top_level_graph->MutableParentGraph();
}
std::string unique_graph_name = GetUniqueGraphName(*top_level_graph);
if (subgraph_context_map.find(unique_graph_name) == subgraph_context_map.end()) {
return;
}

SubGraphContext2* context = subgraph_context_map.at(unique_graph_name).get();

// Iterate all the implicit inputs to set outer scope value for the newly built subgraph
for (const auto& input : graph.ParentNode()->ImplicitInputDefs()) {
// LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] \t" << input->Name();

// The node arg in parent node's implicit inputs could be used for parent node's other subgraph, for example
// "If" op has two subgraphs. So we need to make sure that the node arg is used in current subgraph only.
// (GetNodeArg searches for specific node arg in all node args in the graph)
if (graph_build.GetNodeArg(input->Name())) {
graph_build.AddOuterScopeNodeArg(input->Name());
// LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] \t" << input->Name() << " is used in this subgraph";

if (context &&
(context->manually_added_graph_inputs.find(input->Name()) != context->manually_added_graph_inputs.end())) {
// LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] \t" << input->Name() << " is already been added as an explicit input to graph";
continue;
}

// Handle the case where this outer scope value is not existed in any outer scope levels of the
// newly built graph (the newly built graph is the subgraph of the original graph). Need to add
// the outer scope value as an explicit input to the top-level of newly built graph.
if (!IsOuterScopeValue(graph_build, input->Name(), subgraph_context_map)) {
const auto& name = input->Name();
auto graph_inputs_including_initializers = top_level_graph->GetInputsIncludingInitializers();
auto added_graph_input = std::find_if(graph_inputs_including_initializers.begin(),
graph_inputs_including_initializers.end(),
[&name](const NodeArg* entry) { return entry->Name() == name; });

if (added_graph_input == graph_inputs_including_initializers.end()) {
if (context) {
auto type_proto = std::make_unique<ONNX_NAMESPACE::TypeProto>();
type_proto->CopyFrom(*(input->TypeAsProto()));
auto& n_input = top_level_graph->GetOrCreateNodeArg(name, type_proto.get());
context->manually_added_graph_inputs[n_input.Name()] = &n_input;
// LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] \t" << n_input.Name() << " is added as an explicit input into the newly built graph";
}
}
}
}
}
}
}

static void SetAllGraphInputs(Graph& graph, std::unordered_map<std::string, std::unique_ptr<SubGraphContext2>>& subgraph_context_map) {
// If ORT TRT doesn't manully set graph input in TensorrtExecutionProvider::SetGraphOuterScopeValuesAndInputs(),
// Graph::Resolve() will help set graph inputs in Graph::SetGraphInputsOutputs(), so no need to set graph inputs here.
std::string unique_graph_name = GetUniqueGraphName(graph);
if (subgraph_context_map.find(unique_graph_name) == subgraph_context_map.end() ||
subgraph_context_map[unique_graph_name].get()->manually_added_graph_inputs.size() == 0) {
return;
}

SubGraphContext2* context = subgraph_context_map[unique_graph_name].get();
std::vector<const NodeArg*> graph_inputs_including_initializers;
std::unordered_set<std::string> graph_inputs_including_initializers_set;

for (const auto& entry : context->inputs_and_initializers) {
graph_inputs_including_initializers.push_back(entry.second);
graph_inputs_including_initializers_set.insert(entry.first);
}

for (const auto& entry : context->manually_added_graph_inputs) {
if (graph_inputs_including_initializers_set.find(entry.first) == graph_inputs_including_initializers_set.end()) {
graph_inputs_including_initializers.push_back(entry.second);
graph_inputs_including_initializers_set.insert(entry.first);
}
}

for (const auto& node_arg : graph.GetInputsIncludingInitializers()) {
if (graph_inputs_including_initializers_set.find(node_arg->Name()) == graph_inputs_including_initializers_set.end()) {
graph_inputs_including_initializers.push_back(node_arg);
graph_inputs_including_initializers_set.insert(node_arg->Name());
}
}

graph.SetInputs(graph_inputs_including_initializers);
}

ORT_API_STATUS_IMPL(OrtApis::OrtGraph_GetSubGraph, const OrtGraphViewer* graph, const int node_num, const size_t* node_indices, _Outptr_ const OrtGraphViewer** subgraph) {
const ::onnxruntime::GraphViewer* graph_viewer = reinterpret_cast<const ::onnxruntime::GraphViewer*>(graph);
// Get parent graph output names
Expand Down Expand Up @@ -2680,11 +2919,13 @@ ORT_API_STATUS_IMPL(OrtApis::OrtGraph_GetSubGraph, const OrtGraphViewer* graph,
// TODO:yang
// Only if the newly built graph has control flow op as well as it has parent node,
// it needs to handle outer scope values before calling graph.Resolve().
// TODO(leca): Is local variable enough? Do we need to make it EP class variable?
std::unordered_map<std::string, std::unique_ptr<SubGraphContext2>> subgraph_context_map;
if (has_control_flow_op && graph_viewer->ParentNode()) {
// LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Handle outer scope values for the subgraph " << graph_build.Name();
// BuildSubGraphContext(graph_build);
// SetGraphOuterScopeValuesAndInputs(graph_build, graph.GetGraph());
// SetAllGraphInputs(graph_build);
BuildSubGraphContext(graph_build, subgraph_context_map);
SetGraphOuterScopeValuesAndInputs(graph_build, graph_viewer->GetGraph(), subgraph_context_map);
SetAllGraphInputs(graph_build, subgraph_context_map);
}

common::Status status = graph_build.Resolve();
Expand Down Expand Up @@ -2878,7 +3119,7 @@ ORT_API(float, OrtApis::OrtNode_GetAttributeFloat, const OrtNode* node, const ch
ORT_API_STATUS_IMPL(OrtApis::OrtNode_GetSubgraphs, const OrtNode* node, _Out_ size_t* len, _Outptr_ const OrtGraphViewer*** subgraphs) {
const ::onnxruntime::Node* n = reinterpret_cast<const ::onnxruntime::Node*>(node);
std::vector<gsl::not_null<const Graph*>> subg = n->GetSubgraphs();
len = new size_t (subg.size());
*len = subg.size();
*subgraphs = new const OrtGraphViewer* [*len];
for (size_t i = 0; i < subg.size(); i++) {
const ::onnxruntime::GraphViewer* graph_viewer = new const ::onnxruntime::GraphViewer(*subg[i]);
Expand Down
39 changes: 39 additions & 0 deletions samples/c_test/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,43 @@ void RunTinyYolov3(OrtEnv* p_env, OrtSessionOptions* so, const char* model) {
for (size_t i = 0; i < 4; i++) std::cout<<output_tensor_data[i]<<" \n";
}

void RunControlFlow(OrtEnv* p_env, OrtSessionOptions* so) {
OrtSession* session = nullptr;
THROW_ON_ERROR(g_ort->CreateSession(p_env, "/home/leca/models/control_flow/control_flow_model.onnx", so, &session));

OrtMemoryInfo* memory_info = nullptr;
THROW_ON_ERROR(g_ort->CreateCpuMemoryInfo(OrtArenaAllocator, OrtMemTypeDefault, &memory_info));

std::vector<OrtValue*> input_tensors(3, nullptr);
const int input_cnt = 2;
float input_data[input_cnt];
for (int i = 0; i < input_cnt; i++) input_data[i] = 1;
const size_t input_len = input_cnt * sizeof(float);
const int64_t input_shape[] = {1, 2};
THROW_ON_ERROR(g_ort->CreateTensorWithDataAsOrtValue(memory_info, input_data, input_len, input_shape, sizeof(input_shape)/sizeof(input_shape[0]), ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &input_tensors[0]));

float input2[2] = {0.36252614855766296, 0.030415434390306473};
const size_t input2_len = 8; // 2 * sizeof(float)
const int64_t input2_shape[] = {1, 2};
THROW_ON_ERROR(g_ort->CreateTensorWithDataAsOrtValue(memory_info, input2, input2_len, input2_shape, sizeof(input2_shape)/sizeof(input2_shape[0]), ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &input_tensors[1]));

float input3 = 0.5945659279823303;
const int64_t input3_shape[] = {1};
THROW_ON_ERROR(g_ort->CreateTensorWithDataAsOrtValue(memory_info, &input3, 4, input3_shape, sizeof(input3_shape)/sizeof(input3_shape[0]), ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &input_tensors[2]));

const char* input_names[] = {"x1.opt", "x2", "x3"};
const char* output_names[] = {"y"};

size_t output_count = sizeof(output_names)/sizeof(output_names[0]);
std::vector<OrtValue*> output_tensors(output_count, nullptr);
THROW_ON_ERROR(g_ort->Run(session, nullptr, input_names, (const OrtValue* const*)input_tensors.data(), sizeof(input_names)/sizeof(input_names[0]), output_names, output_count, output_tensors.data()));

float* output_tensor_data = nullptr;
THROW_ON_ERROR(g_ort->GetTensorMutableData(output_tensors[0], (void**)&output_tensor_data));
// std::cout<<"Result:\n";
// for (size_t i = 0; i < 4; i++) std::cout<<output_tensor_data[i]<<" \n";
}

// ./TestOutTreeEp c/k/t/tc/otc relu/resnet/rcnn
int main(int argc, char *argv[]) {
OrtEnv* p_env = nullptr;
Expand Down Expand Up @@ -261,6 +298,8 @@ int main(int argc, char *argv[]) {
RunFastRcnn(g_ort, p_env, so);
} else if (!strcmp(argv[2], "tyolo") || !strcmp(argv[2], "yolo")) {
RunTinyYolov3(p_env, so, argv[2]);
} else if (!strcmp(argv[2], "cf")) {
RunControlFlow(p_env, so);
}

g_ort->ReleaseEnv(p_env);
Expand Down
10 changes: 5 additions & 5 deletions samples/tensorRTEp/tensorrt_execution_provider.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1083,7 +1083,8 @@ bool TensorrtExecutionProvider::IsSubGraphOfControlFlowOp(const OrtGraphViewer*
// Check whether all the nodes of the graph are assigned to specific ep
bool TensorrtExecutionProvider::AllNodesAssignedToSpecificEP(const OrtGraphViewer* graph, const std::string& provider_type) const {
const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION);
std::vector<size_t> nodes_vector(api->OrtGraph_NumberOfNodes(graph));
const int number_of_ort_nodes = api->OrtGraph_NumberOfNodes(graph);
std::vector<size_t> nodes_vector(number_of_ort_nodes);
std::iota(std::begin(nodes_vector), std::end(nodes_vector), 0);
size_t node_count = 0;
const size_t* nodes_index = nullptr;
Expand All @@ -1093,12 +1094,11 @@ bool TensorrtExecutionProvider::AllNodesAssignedToSpecificEP(const OrtGraphViewe
api->OrtGraph_GetOrtNode(graph, nodes_index[index], &node);
const char* node_ep_type;
api->OrtNode_GetExecutionProviderType(node, &node_ep_type);
if (!strcmp(node_ep_type, provider_type.c_str())) {
if (strcmp(node_ep_type, provider_type.c_str())) {
return false;
}
}
return true;

return number_of_ort_nodes != 0;
}

// Check whether all the nodes of subgraph are supported
Expand Down Expand Up @@ -1430,7 +1430,7 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const char* ep_type, const
break;
}
// Another subgraph of "If" control flow op has been parsed by GetCapability before and all subgraph's nodes assigned to TRT EP.
else if (p->AllNodesAssignedToSpecificEP(subgraphs[i], "TensorrtExecutionProvider")) {
else if (p->AllNodesAssignedToSpecificEP(subgraphs[i], "tensorrtEp")) {
all_subgraphs_are_supported = true;
break;
}
Expand Down

0 comments on commit f871b25

Please sign in to comment.