From f871b25d337d800813151bff2aeb2ef64ae3c6d6 Mon Sep 17 00:00:00 2001 From: Lei Cao Date: Wed, 2 Oct 2024 23:44:03 +0000 Subject: [PATCH] new test control_flow, error: ErrorMessage:Failed to find kernel for MemcpyFromHost(1) (node:'Memcpy' ep:'tensorrtEp'). Kernel not found --- .../tensorrt/tensorrt_execution_provider.h | 2 +- .../tensorrt_execution_provider_helper.cc | 4 +- onnxruntime/core/session/onnxruntime_c_api.cc | 249 +++++++++++++++++- samples/c_test/test.cpp | 39 +++ .../tensorRTEp/tensorrt_execution_provider.cc | 10 +- 5 files changed, 292 insertions(+), 12 deletions(-) diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index b58e86237860c..6090b5a9ec277 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -546,7 +546,7 @@ class TensorrtExecutionProvider : public IExecutionProvider { /** * The newly-built graph has not yet being resolved by Graph::Resolve(), so we can't leverage - * Graph::ResolveContext::IsOuterScopeValue(). We have to implement this fuction again. + * Graph::ResolveContext::IsOuterScopeValue(). We have to implement this function again. */ bool IsOuterScopeValue(const Graph& graph, const std::string& name) const; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc index 92fa101118506..6d68327ca2e21 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc @@ -34,7 +34,7 @@ std::string GetUniqueGraphName(const Graph& graph) { } // namespace // The newly-built graph has not yet being resolved by Graph::Resolve(), so we can't leverage -// Graph::ResolveContext::IsInputInitializerOrOutput(). 
We have to implement this fuction again. +// Graph::ResolveContext::IsInputInitializerOrOutput(). We have to implement this function again. bool TensorrtExecutionProvider::IsInputInitializerOrOutput(const Graph& graph, const std::string& name, bool check_ancestors) const { @@ -125,7 +125,7 @@ void TensorrtExecutionProvider::BuildSubGraphContext(const Graph& graph) const { } } -// Set outer scope values for subgraphs and add thoes values as top-level graph's inputs if needed. +// Set outer scope values for subgraphs and add those values as top-level graph's inputs if needed. void TensorrtExecutionProvider::SetGraphOuterScopeValuesAndInputs(Graph& graph_build, const Graph& graph) const { // Iterate all the nodes and recurse into inner most subgraph first for both newly built graph and original graph diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 7836c8024e76e..d14c0747f4426 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -31,6 +31,7 @@ #include "core/providers/get_execution_providers.h" #include "core/session/environment.h" #include "core/framework/callback.h" +#include "core/framework/murmurhash3.h" #include "core/framework/tensorprotoutils.h" #include "core/framework/onnxruntime_typeinfo.h" #include "core/session/inference_session.h" @@ -2594,6 +2595,244 @@ ORT_API_STATUS_IMPL(OrtApis::OrtGraph_DeserializeFromArray, const void* data, si return nullptr; } +struct SubGraphContext2 { + std::unordered_set output_args; + std::unordered_map inputs_and_initializers; + std::unordered_map manually_added_graph_inputs; +}; + +static std::string GetUniqueGraphName(const Graph& graph) { + HashValue model_hash = 0; + uint32_t hash[4] = {0, 0, 0, 0}; + + auto hash_str = [&hash](const std::string& str) { + MurmurHash3::x86_128(str.data(), gsl::narrow_cast(str.size()), hash[0], &hash); + }; + + // Hash all nodes' name + for (int i = 0; i < 
graph.MaxNodeIndex(); ++i) { + auto node = graph.GetNode(i); + if (node == nullptr) { + continue; + } + hash_str(node->Name()); + } + + model_hash = hash[0] | (uint64_t(hash[1]) << 32); + + return graph.Name() + "_" + std::to_string(model_hash); +} + +static bool IsLocalValue(const Graph& graph, + const std::string& name, + const std::unordered_map>& subgraph_context_map) { + std::string unique_graph_name = GetUniqueGraphName(graph); + if (subgraph_context_map.find(unique_graph_name) == subgraph_context_map.end()) { + return false; + } + SubGraphContext2* context = subgraph_context_map.at(unique_graph_name).get(); + return context->output_args.find(name) != context->output_args.cend() || + context->inputs_and_initializers.find(name) != context->inputs_and_initializers.cend(); +} + +static bool IsInputInitializerOrOutput(const Graph& graph, + const std::string& name, + bool check_ancestors, + const std::unordered_map>& subgraph_context_map) { + const Graph* parent_graph = nullptr; + return IsLocalValue(graph, name, subgraph_context_map) || + (check_ancestors && (parent_graph = graph.ParentGraph()) != nullptr && + IsInputInitializerOrOutput(*parent_graph, name, check_ancestors, subgraph_context_map)); +} + +static bool IsOuterScopeValue(const Graph& graph, + const std::string& name, + const std::unordered_map>& subgraph_context_map) { + const Graph* parent_graph = nullptr; + return (parent_graph = graph.ParentGraph()) != nullptr && + IsInputInitializerOrOutput(*parent_graph, name, true, subgraph_context_map); +} + +static void BuildSubGraphContext(const Graph& graph, std::unordered_map>& subgraph_context_map) { + // Iterate all the nodes and recurse into inner most subgraph first + for (int i = 0; i < graph.MaxNodeIndex(); ++i) { + auto node = graph.GetNode(i); + if (node == nullptr) { + continue; + } + + auto subgraph_map = node->GetAttributeNameToSubgraphMap(); + for (auto& entry : subgraph_map) { + const Graph* subgraph = entry.second; + 
BuildSubGraphContext(*subgraph, subgraph_context_map); + } + } + + std::string unique_graph_name = GetUniqueGraphName(graph); + + // Subgraph context has been built before, no need to do it again + if (subgraph_context_map.find(unique_graph_name) != subgraph_context_map.end()) { + return; + } + + subgraph_context_map.emplace(unique_graph_name, std::make_unique()); + SubGraphContext2* context = subgraph_context_map.at(unique_graph_name).get(); + + // Collect all nodes' outputs and nodes' name + for (int i = 0; i < graph.MaxNodeIndex(); ++i) { + auto node = graph.GetNode(i); + if (node == nullptr) { + continue; + } + + for (const auto& output : node->OutputDefs()) { + context->output_args.insert(output->Name()); + } + } + + // Go thru all node's inputs + for (int i = 0; i < graph.MaxNodeIndex(); ++i) { + auto node = graph.GetNode(i); + if (node == nullptr) { + continue; + } + + for (const auto& input : node->InputDefs()) { + if (context->output_args.find(input->Name()) != context->output_args.end()) { + continue; + } + // This input arg is not the output of another node so must come from either a graph input or an initializer. 
+ context->inputs_and_initializers[input->Name()] = input; + } + } +} + +static void SetGraphOuterScopeValuesAndInputs(Graph& graph_build, + const Graph& graph, + std::unordered_map>& subgraph_context_map) { + // Iterate all the nodes and recurse into inner most subgraph first for both newly built graph and original graph + for (int i = 0; i < graph_build.MaxNodeIndex(); ++i) { + auto graph_build_node = graph_build.GetNode(i); + if (graph_build_node == nullptr) { + continue; + } + + auto graph_build_map = graph_build_node->GetAttributeNameToMutableSubgraphMap(); + std::unordered_map> subgraph_map; + const Node* graph_node = nullptr; + + // Find corresponding original graph node's subgraphs + for (int j = 0; j < graph.MaxNodeIndex(); ++j) { + if (graph.GetNode(j) && graph.GetNode(j)->Name() == graph_build_node->Name()) { + graph_node = graph.GetNode(j); + subgraph_map = graph_node->GetAttributeNameToSubgraphMap(); + break; + } + } + + for (auto& entry : graph_build_map) { + auto attr_name = entry.first; + Graph* subgraph_build = entry.second; + if (subgraph_map.find(attr_name) != subgraph_map.end()) { + // recurse into subgraph + const Graph* subgraph = subgraph_map.at(attr_name); + SetGraphOuterScopeValuesAndInputs(*subgraph_build, *subgraph, subgraph_context_map); + } + } + } + + // Start from the inner most subgraph first and check whether its outer scope values are existed in the + // newly built graph. If not, we need to add those outer scope values as explicit inputs to the top-level + // of newly built graph. 
+ if (graph_build.ParentNode()) { + auto top_level_graph = &graph_build; + while (top_level_graph->MutableParentGraph()) { + top_level_graph = top_level_graph->MutableParentGraph(); + } + std::string unique_graph_name = GetUniqueGraphName(*top_level_graph); + if (subgraph_context_map.find(unique_graph_name) == subgraph_context_map.end()) { + return; + } + + SubGraphContext2* context = subgraph_context_map.at(unique_graph_name).get(); + + // Iterate all the implicit inputs to set outer scope value for the newly built subgraph + for (const auto& input : graph.ParentNode()->ImplicitInputDefs()) { +// LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] \t" << input->Name(); + + // The node arg in parent node's implicit inputs could be used for parent node's other subgraph, for example + // "If" op has two subgraphs. So we need to make sure that the node arg is used in current subgraph only. + // (GetNodeArg searches for specific node arg in all node args in the graph) + if (graph_build.GetNodeArg(input->Name())) { + graph_build.AddOuterScopeNodeArg(input->Name()); +// LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] \t" << input->Name() << " is used in this subgraph"; + + if (context && + (context->manually_added_graph_inputs.find(input->Name()) != context->manually_added_graph_inputs.end())) { +// LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] \t" << input->Name() << " is already been added as an explicit input to graph"; + continue; + } + + // Handle the case where this outer scope value is not existed in any outer scope levels of the + // newly built graph (the newly built graph is the subgraph of the original graph). Need to add + // the outer scope value as an explicit input to the top-level of newly built graph. 
+ if (!IsOuterScopeValue(graph_build, input->Name(), subgraph_context_map)) { + const auto& name = input->Name(); + auto graph_inputs_including_initializers = top_level_graph->GetInputsIncludingInitializers(); + auto added_graph_input = std::find_if(graph_inputs_including_initializers.begin(), + graph_inputs_including_initializers.end(), + [&name](const NodeArg* entry) { return entry->Name() == name; }); + + if (added_graph_input == graph_inputs_including_initializers.end()) { + if (context) { + auto type_proto = std::make_unique(); + type_proto->CopyFrom(*(input->TypeAsProto())); + auto& n_input = top_level_graph->GetOrCreateNodeArg(name, type_proto.get()); + context->manually_added_graph_inputs[n_input.Name()] = &n_input; +// LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] \t" << n_input.Name() << " is added as an explicit input into the newly built graph"; + } + } + } + } + } + } +} + +static void SetAllGraphInputs(Graph& graph, std::unordered_map>& subgraph_context_map) { + // If ORT TRT doesn't manually set graph input in TensorrtExecutionProvider::SetGraphOuterScopeValuesAndInputs(), + // Graph::Resolve() will help set graph inputs in Graph::SetGraphInputsOutputs(), so no need to set graph inputs here. 
+ std::string unique_graph_name = GetUniqueGraphName(graph); + if (subgraph_context_map.find(unique_graph_name) == subgraph_context_map.end() || + subgraph_context_map[unique_graph_name].get()->manually_added_graph_inputs.size() == 0) { + return; + } + + SubGraphContext2* context = subgraph_context_map[unique_graph_name].get(); + std::vector graph_inputs_including_initializers; + std::unordered_set graph_inputs_including_initializers_set; + + for (const auto& entry : context->inputs_and_initializers) { + graph_inputs_including_initializers.push_back(entry.second); + graph_inputs_including_initializers_set.insert(entry.first); + } + + for (const auto& entry : context->manually_added_graph_inputs) { + if (graph_inputs_including_initializers_set.find(entry.first) == graph_inputs_including_initializers_set.end()) { + graph_inputs_including_initializers.push_back(entry.second); + graph_inputs_including_initializers_set.insert(entry.first); + } + } + + for (const auto& node_arg : graph.GetInputsIncludingInitializers()) { + if (graph_inputs_including_initializers_set.find(node_arg->Name()) == graph_inputs_including_initializers_set.end()) { + graph_inputs_including_initializers.push_back(node_arg); + graph_inputs_including_initializers_set.insert(node_arg->Name()); + } + } + + graph.SetInputs(graph_inputs_including_initializers); +} + ORT_API_STATUS_IMPL(OrtApis::OrtGraph_GetSubGraph, const OrtGraphViewer* graph, const int node_num, const size_t* node_indices, _Outptr_ const OrtGraphViewer** subgraph) { const ::onnxruntime::GraphViewer* graph_viewer = reinterpret_cast(graph); // Get parent graph output names @@ -2680,11 +2919,13 @@ ORT_API_STATUS_IMPL(OrtApis::OrtGraph_GetSubGraph, const OrtGraphViewer* graph, // TODO:yang // Only if the newly built graph has control flow op as well as it has parent node, // it needs to handle outer scope values before calling graph.Resolve(). + // TODO(leca): Is local variable enough? Do we need to make it EP class variable? 
+ std::unordered_map> subgraph_context_map; if (has_control_flow_op && graph_viewer->ParentNode()) { // LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Handle outer scope values for the subgraph " << graph_build.Name(); - // BuildSubGraphContext(graph_build); - // SetGraphOuterScopeValuesAndInputs(graph_build, graph.GetGraph()); - // SetAllGraphInputs(graph_build); + BuildSubGraphContext(graph_build, subgraph_context_map); + SetGraphOuterScopeValuesAndInputs(graph_build, graph_viewer->GetGraph(), subgraph_context_map); + SetAllGraphInputs(graph_build, subgraph_context_map); } common::Status status = graph_build.Resolve(); @@ -2878,7 +3119,7 @@ ORT_API(float, OrtApis::OrtNode_GetAttributeFloat, const OrtNode* node, const ch ORT_API_STATUS_IMPL(OrtApis::OrtNode_GetSubgraphs, const OrtNode* node, _Out_ size_t* len, _Outptr_ const OrtGraphViewer*** subgraphs) { const ::onnxruntime::Node* n = reinterpret_cast(node); std::vector> subg = n->GetSubgraphs(); - len = new size_t (subg.size()); + *len = subg.size(); *subgraphs = new const OrtGraphViewer* [*len]; for (size_t i = 0; i < subg.size(); i++) { const ::onnxruntime::GraphViewer* graph_viewer = new const ::onnxruntime::GraphViewer(*subg[i]); diff --git a/samples/c_test/test.cpp b/samples/c_test/test.cpp index 362c329039581..db40463b5975c 100644 --- a/samples/c_test/test.cpp +++ b/samples/c_test/test.cpp @@ -233,6 +233,43 @@ void RunTinyYolov3(OrtEnv* p_env, OrtSessionOptions* so, const char* model) { for (size_t i = 0; i < 4; i++) std::cout<CreateSession(p_env, "/home/leca/models/control_flow/control_flow_model.onnx", so, &session)); + + OrtMemoryInfo* memory_info = nullptr; + THROW_ON_ERROR(g_ort->CreateCpuMemoryInfo(OrtArenaAllocator, OrtMemTypeDefault, &memory_info)); + + std::vector input_tensors(3, nullptr); + const int input_cnt = 2; + float input_data[input_cnt]; + for (int i = 0; i < input_cnt; i++) input_data[i] = 1; + const size_t input_len = input_cnt * sizeof(float); + const int64_t input_shape[] = {1, 2}; + 
THROW_ON_ERROR(g_ort->CreateTensorWithDataAsOrtValue(memory_info, input_data, input_len, input_shape, sizeof(input_shape)/sizeof(input_shape[0]), ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &input_tensors[0])); + + float input2[2] = {0.36252614855766296, 0.030415434390306473}; + const size_t input2_len = 8; // 2 * sizeof(float) + const int64_t input2_shape[] = {1, 2}; + THROW_ON_ERROR(g_ort->CreateTensorWithDataAsOrtValue(memory_info, input2, input2_len, input2_shape, sizeof(input2_shape)/sizeof(input2_shape[0]), ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &input_tensors[1])); + + float input3 = 0.5945659279823303; + const int64_t input3_shape[] = {1}; + THROW_ON_ERROR(g_ort->CreateTensorWithDataAsOrtValue(memory_info, &input3, 4, input3_shape, sizeof(input3_shape)/sizeof(input3_shape[0]), ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &input_tensors[2])); + + const char* input_names[] = {"x1.opt", "x2", "x3"}; + const char* output_names[] = {"y"}; + + size_t output_count = sizeof(output_names)/sizeof(output_names[0]); + std::vector output_tensors(output_count, nullptr); + THROW_ON_ERROR(g_ort->Run(session, nullptr, input_names, (const OrtValue* const*)input_tensors.data(), sizeof(input_names)/sizeof(input_names[0]), output_names, output_count, output_tensors.data())); + + float* output_tensor_data = nullptr; + THROW_ON_ERROR(g_ort->GetTensorMutableData(output_tensors[0], (void**)&output_tensor_data)); +// std::cout<<"Result:\n"; +// for (size_t i = 0; i < 4; i++) std::cout<ReleaseEnv(p_env); diff --git a/samples/tensorRTEp/tensorrt_execution_provider.cc b/samples/tensorRTEp/tensorrt_execution_provider.cc index 9b676a0aee76b..be78f52d6df4c 100644 --- a/samples/tensorRTEp/tensorrt_execution_provider.cc +++ b/samples/tensorRTEp/tensorrt_execution_provider.cc @@ -1083,7 +1083,8 @@ bool TensorrtExecutionProvider::IsSubGraphOfControlFlowOp(const OrtGraphViewer* // Check whether all the nodes of the graph are assigned to specific ep bool 
TensorrtExecutionProvider::AllNodesAssignedToSpecificEP(const OrtGraphViewer* graph, const std::string& provider_type) const { const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION); - std::vector nodes_vector(api->OrtGraph_NumberOfNodes(graph)); + const int number_of_ort_nodes = api->OrtGraph_NumberOfNodes(graph); + std::vector nodes_vector(number_of_ort_nodes); std::iota(std::begin(nodes_vector), std::end(nodes_vector), 0); size_t node_count = 0; const size_t* nodes_index = nullptr; @@ -1093,12 +1094,11 @@ bool TensorrtExecutionProvider::AllNodesAssignedToSpecificEP(const OrtGraphViewe api->OrtGraph_GetOrtNode(graph, nodes_index[index], &node); const char* node_ep_type; api->OrtNode_GetExecutionProviderType(node, &node_ep_type); - if (!strcmp(node_ep_type, provider_type.c_str())) { + if (strcmp(node_ep_type, provider_type.c_str())) { return false; } } - return true; - + return number_of_ort_nodes != 0; } // Check whether all the nodes of subgraph are supported @@ -1430,7 +1430,7 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const char* ep_type, const break; } // Another subgraph of "If" control flow op has been parsed by GetCapability before and all subgraph's nodes assigned to TRT EP. - else if (p->AllNodesAssignedToSpecificEP(subgraphs[i], "TensorrtExecutionProvider")) { + else if (p->AllNodesAssignedToSpecificEP(subgraphs[i], "tensorrtEp")) { all_subgraphs_are_supported = true; break; }