From 039e489f3ff5cc427030dc58e43af78696cf74e7 Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Mon, 4 Mar 2024 14:47:32 -0800
Subject: [PATCH 01/10] Add a new function to fall back more nodes to CPU.

Shape-related nodes don't only start with `Shape` or `Size`. In a
dynamo-captured ONNX model, a shape sub-graph can also start from a graph
input. A new transform is added to fall back all nodes that can be reached
by reverse traversal from a shape-like variable. Some shape-like variables
are listed below.
- all inputs of Range
- 2nd input of Reshape
- 2nd input of Unsqueeze
- 1st input of ConstantOfShape
- 2nd to 5th inputs of Slice

Fix header

Remove unused variable

Versioning shape inputs

Fix
---
 .../core/framework/fallback_cpu_capability.cc | 117 ++++++++++++++++++
 1 file changed, 117 insertions(+)

diff --git a/onnxruntime/core/framework/fallback_cpu_capability.cc b/onnxruntime/core/framework/fallback_cpu_capability.cc
index ef68b88187e08..8af05affe8aec 100644
--- a/onnxruntime/core/framework/fallback_cpu_capability.cc
+++ b/onnxruntime/core/framework/fallback_cpu_capability.cc
@@ -2,9 +2,13 @@
 // Licensed under the MIT License.

 #include "core/framework/fallback_cpu_capability.h"
+#include "core/framework/tensorprotoutils.h"
 #include "core/common/inlined_containers.h"

+#include <cstdlib>
+#include <cstring>
 #include <queue>
+#include <unordered_map>

 #include "onnx/defs/data_type_utils.h"

@@ -39,6 +43,115 @@ static bool IsSmallInitializer(const onnxruntime::GraphViewer& graph, const Node
 }
 }  // namespace

+std::unordered_set<NodeIndex> GetShapeRelatedNodes(const onnxruntime::GraphViewer& viewer) {
+  // Conceptually, this function traverses from shape-consuming nodes
+  // to fall back all their upstream nodes to CPU. Consider a graph
+  //
+  //   shape = onnx::Concat(s0, s1)
+  //   reshaped = onnx::Reshape(x, shape)
+  //
+  // The traversal should stop when
+  //  1. hitting Shape or Size nodes, graph inputs, or graph initializers.
+  //  2. hitting nodes with some large inputs or outputs.
+  LOGS_DEFAULT(INFO) << "Call GetShapeRelatedNodes to identify extra CPU nodes." << std::endl;
+
+  std::unordered_map<std::string, std::unordered_map<int, std::vector<int>>> shape_related_inputs_in_nodes = {
+      // 2nd input of Expand-13 is a shape-related input.
+      {"Expand", {{13 /* since version */, {1} /* shape inputs' indices */}}},
+      // 2nd input (indexed by 1) of Reshape-13, Reshape-14, Reshape-19, and Reshape-21 is a shape-related input.
+      {"Reshape", {{13, {1}}, {14, {1}}, {19, {1}}, {21, {1}}}},
+      // 2nd input of Unsqueeze-13 and Unsqueeze-21 is a shape-related input.
+      {"Unsqueeze", {{13, {1}}, {21, {1}}}},
+      // 1st input of ConstantOfShape is a shape-related input.
+      {"ConstantOfShape", {{9, {0}}, {20, {0}}, {21, {0}}}},
+      // 2nd to 5th inputs of Slice-13 are shape-related inputs.
+      {"Slice", {{13, {1, 2, 3, 4}}}}};
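+  // Reading: shape_related_inputs_in_nodes["Reshape"][14] == {1} means that for
+  // a Reshape node whose since-version is 14, the input at index 1 (its 2nd
+  // input) is consumed as a shape.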
+
+  auto& graph = viewer.GetGraph();
+  // Each shape-producing node produces a tensor consumed
+  // as a shape, axis, size, or indices.
+  // E.g.,
+  //   shape = onnx::Concat(s0, s1)
+  //   reshaped = onnx::Reshape(x, shape)
+  // Then, the shape-producing node is Concat.
+  std::unordered_set<const Node*> shape_producing_nodes;
+  // This loop collects all shape-producing nodes by finding
+  // all nodes that produce tensors specified in shape_related_inputs_in_nodes.
+  // E.g., for the above example, Concat is a shape-producing node because
+  // "Reshape" has a shape-related input at index 1.
+  for (auto& node : graph.Nodes()) {
+    LOGS_DEFAULT(INFO) << "Check if node " << node.Name() << " can be a sink of a shape sub-graph." << std::endl;
+    auto op_type_it = shape_related_inputs_in_nodes.find(node.OpType());
+    if (op_type_it == shape_related_inputs_in_nodes.end()) {
+      // This node doesn't consume a tensor as a shape,
+      // so we won't find any shape-producing node from it.
+      continue;
+    }
+    auto op_type_version_it = op_type_it->second.find(node.SinceVersion());
+    if (op_type_version_it == op_type_it->second.end()) {
+      // This node doesn't consume a tensor as a shape in this version,
+      // so we won't find any shape-producing node from it.
+      continue;
+    }
+
+    // Shape-like inputs' indices in this node.
+    // E.g., for Reshape, it's [1], and for Slice, it's [1, 2, 3, 4].
+    auto& shape_input_indices = op_type_version_it->second;
+    // Now, this `node` is a shape-consuming node as defined by shape_related_inputs_in_nodes.
+    // Let's find the producers of the shape-like tensors consumed by this `node`.
+    // Consider this graph:
+    //   shape = onnx::Concat(s0, s1)
+    //   reshaped = onnx::Reshape(x, shape)
+    // The loop below
+    //  1. checks all of `Reshape`'s inputs, `x` and `shape`,
+    //  2. finds that `shape` is a shape-related variable since Reshape's 2nd input is a shape-related input,
+    //  3. and then records the producer of `shape` (i.e., `Concat`).
+    for (auto& input_index : shape_input_indices) {
+      auto input = node.InputDefs().at(input_index);
+      auto producer_node = graph.GetProducerNode(input->Name());
+      if (producer_node != nullptr && producer_node->OpType() != "Shape" && producer_node->OpType() != "Size") {
+        // Assume shape-computing sub-graphs begin with Shape nodes, Size nodes, or graph inputs.
+        // We should not fall back those nodes' upstream nodes to CPU; otherwise,
+        // it may change
+        //   GPU-tensor-x -> Mul -> GPU-tensor-y -> Shape -> CPU-tensor
+        // to
+        //   CPU-tensor-x -> Mul -> CPU-tensor -> Shape -> CPU-tensor
+        // and slow down the computation.
+
+        // After this for-loop, we will reversely traverse all nodes from every shape-producing node
+        // found here until hitting Shape or Size nodes, graph inputs, or graph initializers.
+        // All nodes on the traversal path will be forced to run on CPU.
+        LOGS_DEFAULT(INFO) << "Found a shape-producing node (i.e., a node that produces a tensor consumed as a shape-like input by other nodes): " << producer_node->Name() << std::endl;
+        shape_producing_nodes.insert(producer_node);
+      }
+    }
+  }
+
+  std::unordered_set<NodeIndex> shape_related_node_indices;
+  for (auto& node : shape_producing_nodes) {
+    LOGS_DEFAULT(INFO) << "Begin the (topologically reversed) traversal from shape-producing node: " << node->Name() << std::endl;
+    std::vector<const Node*> start_nodes = {node};
+
+    auto to_stop = [](const Node* n1, const Node* n2) {
+      LOGS_DEFAULT(INFO) << "Skip the traversal from " << n1->Name() << " to " << n2->Name() << " since " << n2->Name() << " is a Shape or Size node." << std::endl;
+      return n2->OpType() == "Shape" || n2->OpType() == "Size";
+    };
+
+    // Reversely traverse all nodes from the shape-producing node,
+    // forcing every visited node to run on CPU.
+    // Stop the traversal when a Shape or Size node is found.
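+    // E.g., with
+    //   shape = onnx::Concat(s0, s1)
+    //   reshaped = onnx::Reshape(x, shape)
+    // the traversal starts at Concat and walks up through the producers of s0
+    // and s1, while `to_stop` above prunes any edge that enters a Shape or
+    // Size node.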
+    graph.ReverseDFSFrom(
+        start_nodes,
+        [&shape_related_node_indices](const Node* n) {
+          LOGS_DEFAULT(INFO) << "Found an upstream node in the shape sub-graph (fall back to CPU): " << n->Name() << std::endl;
+          shape_related_node_indices.insert(n->Index());
+        },
+        nullptr,
+        NodeCompare(),
+        to_stop);
+  }
+
+  return shape_related_node_indices;
+}
+
 std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
                                                    const IExecutionProvider::IKernelLookup& kernel_lookup,
                                                    gsl::span<const NodeIndex> tentative_nodes) {
@@ -171,6 +284,10 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
     }
   }

+  if (std::strcmp(std::getenv("ORT_AGGRESSIVE_CPU_FALLBACK"), "1") == 0) {
+    auto shape_related_node_indices = GetShapeRelatedNodes(graph);
+    cpu_nodes.insert(shape_related_node_indices.begin(), shape_related_node_indices.end());
+  }
   return cpu_nodes;
 }

From 0dac90238fbf80a5b8a710dca7ce0e4a631e2812 Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Tue, 5 Mar 2024 09:29:41 -0800
Subject: [PATCH 02/10] Fix segfault

std::getenv returns nullptr when ORT_AGGRESSIVE_CPU_FALLBACK is not set,
and passing that nullptr to std::strcmp is undefined behavior, so guard
the lookup behind a null check.
---
 .../core/framework/fallback_cpu_capability.cc | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/core/framework/fallback_cpu_capability.cc b/onnxruntime/core/framework/fallback_cpu_capability.cc
index 8af05affe8aec..dc968736dbe8f 100644
--- a/onnxruntime/core/framework/fallback_cpu_capability.cc
+++ b/onnxruntime/core/framework/fallback_cpu_capability.cc
@@ -152,6 +152,23 @@ std::unordered_set<NodeIndex> GetShapeRelatedNodes(const onnxruntime::GraphViewe
   return shape_related_node_indices;
 }

+bool IsAggressiveCpuFallbackEnabled() {
+#if !defined(_WIN32) && ENABLE_TRAINING
+  // std::getenv is not available on every platform.
+  // Since ORT_AGGRESSIVE_CPU_FALLBACK is experimental,
+  // we only allow it in training builds to avoid build issues on
+  // custom platforms such as Xbox.
+  const char* p_env_var = std::getenv("ORT_AGGRESSIVE_CPU_FALLBACK");
+  if (!p_env_var) {
+    // No such environment variable.
+    return false;
+  }
+  return std::strcmp(p_env_var, "1") == 0;
+#else
+  return false;
+#endif
+}
+
 std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
                                                    const IExecutionProvider::IKernelLookup& kernel_lookup,
                                                    gsl::span<const NodeIndex> tentative_nodes) {
@@ -284,7 +301,7 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
     }
   }

-  if (std::strcmp(std::getenv("ORT_AGGRESSIVE_CPU_FALLBACK"), "1") == 0) {
+  if (IsAggressiveCpuFallbackEnabled()) {
     auto shape_related_node_indices = GetShapeRelatedNodes(graph);
     cpu_nodes.insert(shape_related_node_indices.begin(), shape_related_node_indices.end());
   }

From 520332f760953f69c03b54dfbb142d23c6d42f6c Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Tue, 5 Mar 2024 14:50:12 -0800
Subject: [PATCH 03/10] Add a simple test

Fix typo

Write to a fixed place

Remove unused imports

Run it

Fix

Change test location
---
 ...rttraining_test_aggressive_cpu_fallback.py | 54 +++++++++++++++++++
 .../test/python/orttraining_test_ort_apis.py  | 14 +++++
 2 files changed, 68 insertions(+)
 create mode 100644 orttraining/orttraining/test/python/orttraining_test_aggressive_cpu_fallback.py

diff --git a/orttraining/orttraining/test/python/orttraining_test_aggressive_cpu_fallback.py b/orttraining/orttraining/test/python/orttraining_test_aggressive_cpu_fallback.py
new file mode 100644
index 0000000000000..d6fa503695b89
--- /dev/null
+++ b/orttraining/orttraining/test/python/orttraining_test_aggressive_cpu_fallback.py
@@ -0,0 +1,54 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import unittest
+
+import onnx
+import onnxscript
+from onnxscript.onnx_opset import opset18
+from onnxscript.onnx_types import FLOAT, INT64
+
+import onnxruntime
+
+
+class TestAggressiveCpuFallback(unittest.TestCase):
+    def test_cpu_fallback(self):
+        @onnxscript.script(default_opset=opset18)
+        def foo(x: FLOAT[12], w: FLOAT[6, 2], dim0: INT64[1], dim1: INT64[1]):
+            # This should be computed by CPU but is placed
+            # on CUDA (i.e., all inputs and outputs are GPU tensors).
+            dim2 = dim1 + 1
+            # Same as `dim2 = dim1 + 1`. Another GPU node.
+            dim3 = dim2 - 1
+            # Same as `dim2 = dim1 + 1`. Another GPU node.
+            new_shape = opset18.Concat(dim0, dim3, axis=0)
+            # A memcpy node will be inserted to copy the GPU output
+            # to CPU since Reshape's 2nd input is a CPU tensor
+            # per schema definition.
+            #
+            # Use ORT_AGGRESSIVE_CPU_FALLBACK=1 to
+            # 1. remove the memcpy node.
+            # 2. fall back all computation above this line to CPU.
+            new_x = opset18.Reshape(x, new_shape)
+            y = opset18.MatMul(new_x, w)
+            return y
+
+        model = foo.to_model_proto()
+
+        session_options = onnxruntime.SessionOptions()
+        session_options.optimized_model_filepath = "cpu_fallback_test.onnx"
+        # This call should trigger GetCpuPreferredNodes and then GetShapeRelatedNodes
+        # when environment variable ORT_AGGRESSIVE_CPU_FALLBACK=1 is set.
+        # As a result, no memcpy node should be observed in the optimized graph.
+        #
+        # See comments inside `foo`.
+        onnxruntime.InferenceSession(
+            path_or_bytes=model.SerializeToString(), sess_options=session_options, providers=["CUDAExecutionProvider"]
+        )
+        optimized = onnx.load("cpu_fallback_test.onnx")
+
+        self.assertTrue(all(node.op_type != "MemcpyToHost" for node in optimized.graph.node))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/orttraining/orttraining/test/python/orttraining_test_ort_apis.py b/orttraining/orttraining/test/python/orttraining_test_ort_apis.py
index a3e666dd404f2..524497bdd0b4e 100644
--- a/orttraining/orttraining/test/python/orttraining_test_ort_apis.py
+++ b/orttraining/orttraining/test/python/orttraining_test_ort_apis.py
@@ -42,6 +42,18 @@ def run_onnxblock_tests(cwd, log):
     run_subprocess(command, cwd=cwd, log=log).check_returncode()


+def run_aggressive_cpu_fallback_test(cwd, log):
+    log.debug("Running: Aggressive CPU Fallback")
+
+    command = [
+        "python3",
+        "orttraining_test_aggressive_cpu_fallback.py",
+    ]
+
+    env = {"ORT_AGGRESSIVE_CPU_FALLBACK": "1"}
+    run_subprocess(command, cwd=cwd, log=log, env=env).check_returncode()
+
+
 def main():
     args = parse_arguments()
     cwd = args.cwd
@@ -52,6 +64,8 @@ def main():

     run_training_apis_python_api_tests(cwd, log)

+    run_aggressive_cpu_fallback_test(cwd, log)
+
     return 0


From 05c9a4125924dff9209a3921e3ceaa65617c9bcd Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Wed, 6 Mar 2024 15:45:23 -0800
Subject: [PATCH 04/10] Install onnxscript
---
 .../templates/orttraining-linux-gpu-test-ci-pipeline.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml
index 5dc156e301357..a079b6e900a6a 100644
--- a/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml
@@ -39,6 +39,7 @@ steps:
     timeoutInMinutes: 60

 # Entry point for all ort training api tests
+# TODO: move onnxscript installation to the CI image.
 - script: |
     docker run \
       --gpus all \
       --volume $(Build.SourcesDirectory):/onnxruntime_src \
       --volume $(Build.BinariesDirectory)/${{ parameters.BuildConfig }}:/build \
       ${{ parameters.DockerImageTag }} \
-      bash -c "rm -rf /build/onnxruntime/ && python3 -m pip install /build/dist/onnxruntime*.whl && /build/launch_test.py --cmd_line_with_args 'python orttraining_test_ort_apis.py --cwd /build' --cwd /build" \
+      bash -c "rm -rf /build/onnxruntime/ && python3 -m pip install /build/dist/onnxruntime*.whl && python3 -m pip install onnxscript && /build/launch_test.py --cmd_line_with_args 'python orttraining_test_ort_apis.py --cwd /build' --cwd /build" \
   displayName: 'Run ORT Training APIs Tests'
   condition: succeededOrFailed()
   timeoutInMinutes: 120

From 8f8c8fb0d252c91ee23985b1a1e34751ef757048 Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Thu, 7 Mar 2024 15:36:16 -0800
Subject: [PATCH 05/10] New session option key
---
 .../onnxruntime_session_options_config_keys.h | 45 +++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
index b282438795eb5..740095d5b1876 100644
--- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -256,3 +256,48 @@ static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.context_embed
 // - "0": Gemm FastMath mode is not enabled. [DEFAULT]
 // - "1": Gemm FastMath mode is enabled.
 static const char* const kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16 = "mlas.enable_gemm_fastmath_arm64_bfloat16";
+
+// Optionally identify shape-related sub-graphs by traversing the graph in reverse order,
+// starting from all CPU-consuming inputs (e.g., for Reshape-13, the traversal
+// starts from its 2nd input). Traversal stops when hitting a Size or Shape operator.
+// The identified sub-graphs will be assigned to the CPU EP.
+//
+// See the comments in the model defined with onnxscript in Python below for an example.
+//
+// @onnxscript.script(default_opset=opset18)
+// def foo(x: FLOAT[12], w: FLOAT[6, 2], dim0: INT64[1], dim1: INT64[1]):
+//     # This should be computed by CPU but is placed
+//     # on CUDA (i.e., all inputs and outputs are GPU tensors)
+//     # when this option is not set to 1.
+//     dim2 = dim1 + 1
+//     # Same as `dim2 = dim1 + 1`. Another GPU node
+//     # when this option is not set to 1.
+//     dim3 = dim2 - 1
+//     # Same as `dim2 = dim1 + 1`. Another GPU node
+//     # when this option is not set to 1.
+//     new_shape = opset18.Concat(dim0, dim3, axis=0)
+//
+//     # A memcpy node will be inserted to copy the GPU output
+//     # `new_shape` to CPU since Reshape's 2nd input is a CPU tensor
+//     # per schema definition.
+//     #
+//     # To
+//     # 1. remove the memcpy node, and
+//     # 2. fall back all computation above this line to CPU,
+//     # use the following code in Python:
+//     #   import onnxruntime
+//     #   so = onnxruntime.SessionOptions()
+//     #   so.add_session_config_entry("session.reverse_traverse_cpu_fallback", "1")
+//     #
+//     # Note that x and new_x are still on GPU w/wo
+//     # setting session.reverse_traverse_cpu_fallback.
+//     new_x = opset18.Reshape(x, new_shape)
+//     # A pure GPU node.
+//     y = opset18.MatMul(new_x, w)
+//     return y
+//
+// Option values:
+// - "0": Disable reverse-traversing CPU fallback. [DEFAULT]
+// - "1": Enable reverse-traversing CPU fallback when calling GetCpuPreferredNodes(...).
+//        (i.e., adding nodes found by GetShapeRelatedNodes(...) to the CPU node list internally).
+static const char* const kOrtSessionOptionsReverseTraverseCpuFallback = "session.reverse_traverse_cpu_fallback";

From 526b166f8d21cbd054f3f6cea5533bd6d1c14556 Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Tue, 12 Mar 2024 15:32:18 -0700
Subject: [PATCH 06/10] Make the flag a session option instead of an env var
---
 .../core/framework/execution_provider.h                | 10 ++++++++++
 .../session/onnxruntime_session_options_config_keys.h  |  2 +-
 onnxruntime/core/framework/fallback_cpu_capability.cc  |  5 +++--
 onnxruntime/core/framework/fallback_cpu_capability.h   |  4 +++-
 onnxruntime/core/framework/session_options.h           |  4 ++++
 .../core/providers/cann/cann_execution_provider.cc     |  8 +++++++-
 .../core/providers/cuda/cuda_execution_provider.cc     |  9 ++++++++-
 .../dml/DmlExecutionProvider/src/ExecutionProvider.cpp |  8 +++++++-
 onnxruntime/core/providers/js/js_execution_provider.cc |  8 +++++++-
 .../core/providers/rocm/rocm_execution_provider.cc     |  8 +++++++-
 .../core/providers/shared_library/provider_api.h       |  3 ++-
 .../shared_library/provider_bridge_provider.cc         |  5 +++--
 .../providers/shared_library/provider_interfaces.h     |  5 ++++-
 .../providers/shared_library/provider_wrappedtypes.h   |  7 +++++++
 onnxruntime/core/session/inference_session.cc          |  1 +
 onnxruntime/core/session/provider_bridge_ort.cc        |  6 ++++--
 .../python/orttraining_test_aggressive_cpu_fallback.py |  1 +
 17 files changed, 79 insertions(+), 15 deletions(-)

diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h
index 40ca96a19aef1..85faabaa32bdc 100644
--- a/include/onnxruntime/core/framework/execution_provider.h
+++ b/include/onnxruntime/core/framework/execution_provider.h
@@ -11,6 +11,7 @@
 #include "core/common/logging/logging.h"
 #include "core/common/status.h"
 #include "core/framework/data_transfer.h"
+#include "core/framework/session_options.h"
 #include "core/framework/tensor.h"

 namespace onnxruntime {
@@ -277,6 +278,14 @@ class IExecutionProvider {
     return logger_;
   }

+  void SetSessionOptions(const SessionOptions* session_options) {
+    session_options_ = session_options;
+  }
+
+  const SessionOptions* GetSessionOptions() const {
+    return session_options_;
+  }
+
   virtual std::unique_ptr<profiling::EpProfiler> GetProfiler() {
     return {};
   }
@@ -330,5 +339,6 @@ class IExecutionProvider {

   // It will be set when this object is registered to a session
   const logging::Logger* logger_ = nullptr;
+  const SessionOptions* session_options_ = nullptr;
 };
 }  // namespace onnxruntime
diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
index 740095d5b1876..1e02a02349df5 100644
--- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -300,4 +300,4 @@ static const char* const kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16 = "mlas
 // - "0": Disable reverse-traversing CPU fallback. [DEFAULT]
 // - "1": Enable reverse-traversing CPU fallback when calling GetCpuPreferredNodes(...).
 //        (i.e., adding nodes found by GetShapeRelatedNodes(...) to the CPU node list internally).
-static const char* const kOrtSessionOptionsReverseTraverseCpuFallback = "session.reverse_traverse_cpu_fallback";
+static const char* const kOrtSessionOptionsAggressiveCpuFallback = "session.aggressive_cpu_fallback";
diff --git a/onnxruntime/core/framework/fallback_cpu_capability.cc b/onnxruntime/core/framework/fallback_cpu_capability.cc
index dc968736dbe8f..33489ca44038b 100644
--- a/onnxruntime/core/framework/fallback_cpu_capability.cc
+++ b/onnxruntime/core/framework/fallback_cpu_capability.cc
@@ -171,7 +171,8 @@ bool IsAggressiveCpuFallbackEnabled() {

 std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
                                                    const IExecutionProvider::IKernelLookup& kernel_lookup,
-                                                   gsl::span<const NodeIndex> tentative_nodes) {
+                                                   gsl::span<const NodeIndex> tentative_nodes,
+                                                   const bool aggressive_cpu_fallback) {
   // automatic conversion from const std::vector&
   const auto& ordered_nodes = graph.GetNodesInTopologicalOrder();
   InlinedVector<size_t> node_id_to_order_map(graph.MaxNodeIndex());
@@ -301,7 +302,7 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
     }
   }

-  if (IsAggressiveCpuFallbackEnabled()) {
+  if (aggressive_cpu_fallback) {
     auto shape_related_node_indices = GetShapeRelatedNodes(graph);
     cpu_nodes.insert(shape_related_node_indices.begin(), shape_related_node_indices.end());
   }
diff --git a/onnxruntime/core/framework/fallback_cpu_capability.h b/onnxruntime/core/framework/fallback_cpu_capability.h
index 7c8f91c7dad34..3e2f0d85f0306 100644
--- a/onnxruntime/core/framework/fallback_cpu_capability.h
+++ b/onnxruntime/core/framework/fallback_cpu_capability.h
@@ -16,9 +16,11 @@ namespace onnxruntime {
   @param graph Graph viewer
   @param kernel_lookup The kernel lookup for the target execution provider
   @param tentative_nodes Nodes that are tentative to be placed on the target EP
+  @param aggressive_cpu_fallback This is set by the kOrtSessionOptionsAggressiveCpuFallback option.
 */
 std::unordered_set<NodeIndex> GetCpuPreferredNodes(const GraphViewer& graph,
                                                    const IExecutionProvider::IKernelLookup& kernel_lookup,
-                                                   gsl::span<const NodeIndex> tentative_nodes);
+                                                   gsl::span<const NodeIndex> tentative_nodes,
+                                                   const bool aggressive_cpu_fallback);

 }  // namespace onnxruntime
diff --git a/onnxruntime/core/framework/session_options.h b/onnxruntime/core/framework/session_options.h
index 796a018ac0f68..5685c3a200556 100644
--- a/onnxruntime/core/framework/session_options.h
+++ b/onnxruntime/core/framework/session_options.h
@@ -146,6 +146,10 @@ struct SessionOptions {
   // The configuration keys and value formats are defined in
   // /include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
   ConfigOptions config_options;
+
+  const ConfigOptions& GetConfigOptions() const {
+    return config_options;
+  }
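+
+  // Execution providers can read config entries through
+  // IExecutionProvider::GetSessionOptions(), e.g.
+  //   GetSessionOptions()->config_options.GetConfigOrDefault(
+  //       kOrtSessionOptionsAggressiveCpuFallback, "0") == "1"
+  // as done in the provider changes below.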
 std::unordered_map<std::string, const OrtValue*> initializers_to_share_map;

 // See onnxruntime_c_api.h for detailed documentation.
diff --git a/onnxruntime/core/providers/cann/cann_execution_provider.cc b/onnxruntime/core/providers/cann/cann_execution_provider.cc
index 9a242919665bb..4c0fe1cb20b99 100644
--- a/onnxruntime/core/providers/cann/cann_execution_provider.cc
+++ b/onnxruntime/core/providers/cann/cann_execution_provider.cc
@@ -1296,7 +1296,13 @@ CANNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewe
       candidates.push_back(node.Index());
   }

-  auto cpu_nodes = GetCpuPreferredNodes(graph_viewer, kernel_lookup, candidates);
+  auto p_session_options = GetSessionOptions();
+  bool aggressive_cpu_fallback = false;
+  if (p_session_options) {
+    aggressive_cpu_fallback = p_session_options->config_options.GetConfigOrDefault(
+        kOrtSessionOptionsAggressiveCpuFallback, "0") == "1";
+  }
+  auto cpu_nodes = GetCpuPreferredNodes(graph_viewer, kernel_lookup, candidates, aggressive_cpu_fallback);
   for (auto& node_index : candidates) {
     if (cpu_nodes.count(node_index) > 0)
       continue;
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index 05d9f3b5a1e8f..8ee21d705a765 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -13,6 +13,7 @@
 #include "core/providers/cuda/gpu_data_transfer.h"
 #include "core/providers/cuda/cuda_profiler.h"
 #include "core/session/onnxruntime_run_options_config_keys.h"
+#include "core/session/onnxruntime_session_options_config_keys.h"

 #ifndef USE_CUDA_MINIMAL
 #ifndef DISABLE_CONTRIB_OPS
@@ -2530,7 +2531,13 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
   // For CUDA EP, exclude the subgraph that is preferred to be placed on CPU.
   // These are usually shape-related computation subgraphs.
   // The following logic can be extended to other EPs.
-  auto cpu_nodes = GetCpuPreferredNodes(graph, kernel_lookup, tentative_nodes);
+  auto p_session_options = GetSessionOptions();
+  bool aggressive_cpu_fallback = false;
+  if (p_session_options) {
+    aggressive_cpu_fallback = p_session_options->GetConfigOptions().GetConfigEntry(
+                                  kOrtSessionOptionsAggressiveCpuFallback) == "1";
+  }
+  auto cpu_nodes = GetCpuPreferredNodes(graph, kernel_lookup, tentative_nodes, aggressive_cpu_fallback);
   std::vector<std::unique_ptr<ComputeCapability>> result;
   for (auto& node_index : candidates) {
     if (cpu_nodes.count(node_index) > 0)
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index 6c347ebdca7c1..1953fd048ed3a 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -886,7 +886,13 @@ namespace Dml
         }

         // Get the list of nodes that should stay on the CPU
-        auto cpuPreferredNodes = GetCpuPreferredNodes(graph, kernel_lookup, tentativeNodes);
+        auto p_session_options = GetSessionOptions();
+        bool aggressive_cpu_fallback = false;
+        if (p_session_options) {
+            aggressive_cpu_fallback = p_session_options->config_options.GetConfigOrDefault(
+                kOrtSessionOptionsAggressiveCpuFallback, "0") == "1";
+        }
+        auto cpuPreferredNodes = GetCpuPreferredNodes(graph, kernel_lookup, tentativeNodes, aggressive_cpu_fallback);

         for (size_t nodeIndex : toplogicalOrder)
         {
diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc
index 038423104d92e..c97c5cb4d1871 100644
--- a/onnxruntime/core/providers/js/js_execution_provider.cc
+++ b/onnxruntime/core/providers/js/js_execution_provider.cc
@@ -728,7 +728,13 @@ std::vector<std::unique_ptr<ComputeCapability>> JsExecutionProvider::GetCapabili
     candidates.push_back(node.Index());
     tenative_candidates.push_back(node.Index());
   }
-  auto cpu_nodes = GetCpuPreferredNodes(graph, kernel_lookup, tenative_candidates);
+  auto p_session_options = GetSessionOptions();
+  bool aggressive_cpu_fallback = false;
+  if (p_session_options) {
+    aggressive_cpu_fallback = p_session_options->config_options.GetConfigOrDefault(
+        kOrtSessionOptionsAggressiveCpuFallback, "0") == "1";
+  }
+  auto cpu_nodes = GetCpuPreferredNodes(graph, kernel_lookup, tenative_candidates, aggressive_cpu_fallback);
   std::vector<std::unique_ptr<ComputeCapability>> result;
   for (auto& node_index : candidates) {
     if (cpu_nodes.count(node_index) > 0) {
diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
index 87daaeea969ac..67212f54a0492 100644
--- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
+++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
@@ -2415,7 +2415,13 @@ ROCMExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
   // For ROCM EP, exclude the subgraph that is preferred to be placed on CPU.
   // These are usually shape-related computation subgraphs.
   // The following logic can be extended to other EPs.
-  auto cpu_nodes = GetCpuPreferredNodes(graph, kernel_lookup, candidates);
+  auto p_session_options = GetSessionOptions();
+  bool aggressive_cpu_fallback = false;
+  if (p_session_options) {
+    aggressive_cpu_fallback = p_session_options->config_options.GetConfigOrDefault(
+        kOrtSessionOptionsAggressiveCpuFallback, "0") == "1";
+  }
+  auto cpu_nodes = GetCpuPreferredNodes(graph, kernel_lookup, candidates, aggressive_cpu_fallback);
   std::vector<std::unique_ptr<ComputeCapability>> result;
   for (auto& node_index : candidates) {
     if (cpu_nodes.count(node_index) > 0)
diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h
index 1cebe4a256fd4..ba3b72c5a7fe4 100644
--- a/onnxruntime/core/providers/shared_library/provider_api.h
+++ b/onnxruntime/core/providers/shared_library/provider_api.h
@@ -280,7 +280,8 @@ std::unique_ptr<IDataTransfer> CreateGPUDataTransfer();

 std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
                                                    const IExecutionProvider::IKernelLookup& kernel_lookup,
-                                                   gsl::span<const NodeIndex> tentative_nodes);
+                                                   gsl::span<const NodeIndex> tentative_nodes,
+                                                   const bool aggressive_cpu_fallback);

 std::string GetEnvironmentVar(const std::string& var_name);

diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
index 7b73ab36b3742..8c8e78a8426d1 100644
--- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
+++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
@@ -364,8 +364,9 @@ std::string GetEnvironmentVar(const std::string& var_name) {

 std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
                                                    const IExecutionProvider::IKernelLookup& kernel_lookup,
-                                                   gsl::span<const NodeIndex> tentative_nodes) {
-  return g_host->GetCpuPreferredNodes(graph, kernel_lookup, tentative_nodes);
+                                                   gsl::span<const NodeIndex> tentative_nodes,
+                                                   const bool aggressive_cpu_fallback) {
+  return g_host->GetCpuPreferredNodes(graph, kernel_lookup, tentative_nodes, aggressive_cpu_fallback);
 }

 namespace profiling {
diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h
index 8c8d5b1fd460a..73d1d81a4d6ad 100644
--- a/onnxruntime/core/providers/shared_library/provider_interfaces.h
+++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h
@@ -192,7 +192,8 @@ struct ProviderHost {

   virtual std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
                                                              const IExecutionProvider::IKernelLookup& kernel_lookup,
-                                                             gsl::span<const NodeIndex> tentative_nodes) = 0;
+                                                             gsl::span<const NodeIndex> tentative_nodes,
+                                                             const bool aggressive_cpu_fallback) = 0;

   virtual Status UnpackTensor(const ONNX_NAMESPACE::TensorProto& tensor, const void* raw_data, size_t raw_data_len, /*out*/ bool* p_data, size_t expected_size) = 0;
   virtual Status UnpackTensor(const ONNX_NAMESPACE::TensorProto& tensor, const void* raw_data, size_t raw_data_len, /*out*/ float* p_data, size_t expected_size) = 0;
@@ -913,6 +914,8 @@ struct ProviderHost {
   // SessionState
   virtual const DataTransferManager& SessionState__GetDataTransferMgr(const SessionState* p) = 0;

+  virtual const ConfigOptions& SessionOptions__GetConfigOptions(const SessionOptions* p) = 0;
+
   // Tensor
   virtual std::unique_ptr<Tensor> Tensor__construct(MLDataType p_type, const TensorShape& shape, std::shared_ptr<IAllocator> allocator) = 0;
   virtual std::unique_ptr<Tensor> Tensor__construct(MLDataType p_type, const TensorShape& shape, void* p_data, const OrtMemoryInfo& alloc, ptrdiff_t offset) = 0;
diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
index bdad18c7edec0..a820cf57d9c6c 100644
--- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
+++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
@@ -1075,6 +1075,13 @@ class SessionState {
   PROVIDER_DISALLOW_ALL(SessionState)
 };

+class SessionOptions {
+ public:
+  const ConfigOptions& GetConfigOptions() const { return g_host->SessionOptions__GetConfigOptions(this); }
+
+  PROVIDER_DISALLOW_ALL(SessionOptions)
+};
+
 struct Tensor final {
   static std::unique_ptr<Tensor> CreateDefault() { return g_host->Tensor__construct_default(); }
   static std::unique_ptr<Tensor> Create(MLDataType p_type, const TensorShape& shape, std::shared_ptr<IAllocator> allocator) { return g_host->Tensor__construct(p_type, shape, std::move(allocator)); }
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index ece224ef206fc..ee1f0fe805223 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -763,6 +763,7 @@ common::Status InferenceSession::RegisterExecutionProvider(const std::shared_ptr
   }

   p_exec_provider->SetLogger(session_logger_);
+  p_exec_provider->SetSessionOptions(&session_options_);
   session_profiler_.AddEpProfilers(p_exec_provider->GetProfiler());
   return execution_providers_.Add(provider_type, p_exec_provider);
 }
diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc
index c7cf5963fa10f..b9a613b79b517 100644
--- a/onnxruntime/core/session/provider_bridge_ort.cc
+++ b/onnxruntime/core/session/provider_bridge_ort.cc
@@ -263,8 +263,9 @@ struct ProviderHostImpl : ProviderHost {

   std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
                                                      const IExecutionProvider::IKernelLookup& kernel_lookup,
-                                                     gsl::span<const NodeIndex> tentative_nodes) override {
-    return onnxruntime::GetCpuPreferredNodes(graph, kernel_lookup, tentative_nodes);
+                                                     gsl::span<const NodeIndex> tentative_nodes,
+                                                     const bool aggressive_cpu_fallback) override {
+    return onnxruntime::GetCpuPreferredNodes(graph, kernel_lookup, tentative_nodes, aggressive_cpu_fallback);
   }

   Status UnpackTensor(const ONNX_NAMESPACE::TensorProto& tensor, const void* raw_data, size_t raw_data_len, /*out*/ bool* p_data, size_t expected_size) override { return utils::UnpackTensor(tensor, raw_data, raw_data_len, p_data, expected_size); }
@@ -1153,6 +1154,7 @@ struct ProviderHostImpl : ProviderHost {

   // SessionState (wrapped)
   const DataTransferManager& SessionState__GetDataTransferMgr(const SessionState* p) override { return p->GetDataTransferMgr(); }
+  const ConfigOptions& SessionOptions__GetConfigOptions(const SessionOptions* p) override { return p->GetConfigOptions(); }

   // Tensor (wrapped)
   std::unique_ptr<Tensor> Tensor__construct(MLDataType p_type, const TensorShape& shape, std::shared_ptr<IAllocator> allocator) override {
diff --git a/orttraining/orttraining/test/python/orttraining_test_aggressive_cpu_fallback.py b/orttraining/orttraining/test/python/orttraining_test_aggressive_cpu_fallback.py
index d6fa503695b89..4cac5c1be8083 100644
--- a/orttraining/orttraining/test/python/orttraining_test_aggressive_cpu_fallback.py
+++ b/orttraining/orttraining/test/python/orttraining_test_aggressive_cpu_fallback.py
@@ -37,6 +37,7 @@ def foo(x: FLOAT[12], w: FLOAT[6, 2], dim0: INT64[1], dim1: INT64[1]):

         session_options = onnxruntime.SessionOptions()
         session_options.optimized_model_filepath = "cpu_fallback_test.onnx"
+        session_options.add_session_config_entry("session.aggressive_cpu_fallback", "1")
         # This call should trigger GetCpuPreferredNodes and then GetShapeRelatedNodes
         # when environment variable ORT_AGGRESSIVE_CPU_FALLBACK=1 is set.
         # As a result, no memcpy node should be observed in the optimized graph.

From c087069e683f906e624f5d800e1ef883571e914a Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Tue, 12 Mar 2024 17:28:47 -0700
Subject: [PATCH 07/10] Use InlinedHash* types
---
 .../core/framework/fallback_cpu_capability.cc | 51 +++++++------------
 1 file changed, 18 insertions(+), 33 deletions(-)

diff --git a/onnxruntime/core/framework/fallback_cpu_capability.cc b/onnxruntime/core/framework/fallback_cpu_capability.cc
index 33489ca44038b..21faffbfe5a00 100644
--- a/onnxruntime/core/framework/fallback_cpu_capability.cc
+++ b/onnxruntime/core/framework/fallback_cpu_capability.cc
@@ -8,7 +8,6 @@
 #include <cstdlib>
 #include <cstring>
 #include <queue>
-#include <unordered_map>

 #include "onnx/defs/data_type_utils.h"

@@ -43,17 +42,20 @@ static bool IsSmallInitializer(const onnxruntime::GraphViewer& graph, const Node
 }
 }  // namespace

-std::unordered_set<NodeIndex> GetShapeRelatedNodes(const onnxruntime::GraphViewer& viewer) {
+static InlinedHashSet<NodeIndex> GetShapeRelatedNodes(const onnxruntime::GraphViewer& viewer) {
   // Conceptually, this function traverses from shape-consuming nodes
   // to fall back all their upstream nodes to CPU. Consider a graph
   //
   //   shape = onnx::Concat(s0, s1)
   //   reshaped = onnx::Reshape(x, shape)
   //
   // The traversal should stop when
   //  1. hitting Shape or Size nodes, graph inputs, or graph initializers.
-  //  2. hitting nodes with some large inputs or outputs.
-  LOGS_DEFAULT(INFO) << "Call GetShapeRelatedNodes to identify extra CPU nodes." << std::endl;
+  //  2. [TODO] hitting nodes with some large inputs or outputs. Before that,
+  //     we need shape inference to determine the size of the inputs and outputs.
+  //     Some graph transforms add nodes without shape information, so
+  //     checking shapes would make the algorithm unstable for now.
+  LOGS_DEFAULT(VERBOSE) << "Call GetShapeRelatedNodes to identify extra CPU nodes." << std::endl;

-  std::unordered_map<std::string, std::unordered_map<int, std::vector<int>>> shape_related_inputs_in_nodes = {
+  const static InlinedHashMap<std::string, InlinedHashMap<int, std::vector<int>>> shape_related_inputs_in_nodes = {
       // 2nd input of Expand-13 is a shape-related input.
       {"Expand", {{13 /* since version */, {1} /* shape inputs' indices */}}},
       // 2nd input (indexed by 1) of Reshape-13, Reshape-14, Reshape-19, and Reshape-21 is a shape-related input.
       {"Reshape", {{13, {1}}, {14, {1}}, {19, {1}}, {21, {1}}}},
@@ -72,13 +74,13 @@ std::unordered_set<NodeIndex> GetShapeRelatedNodes(const onnxruntime::GraphViewe
   //   shape = onnx::Concat(s0, s1)
   //   reshaped = onnx::Reshape(x, shape)
   // Then, the shape-producing node is Concat.
-  std::unordered_set<const Node*> shape_producing_nodes;
+  InlinedHashSet<const Node*> shape_producing_nodes;
   // This loop collects all shape-producing nodes by finding
   // all nodes that produce tensors specified in shape_related_inputs_in_nodes.
   // E.g., for the above example, Concat is a shape-producing node because
   // "Reshape" has a shape-related input at index 1.
   for (auto& node : graph.Nodes()) {
-    LOGS_DEFAULT(INFO) << "Check if node " << node.Name() << " can be a sink of a shape sub-graph." << std::endl;
+    LOGS_DEFAULT(VERBOSE) << "Check if node " << node.Name() << " can be a sink of a shape sub-graph." << std::endl;
     auto op_type_it = shape_related_inputs_in_nodes.find(node.OpType());
     if (op_type_it == shape_related_inputs_in_nodes.end()) {
       // This node doesn't consume a tensor as a shape,
@@ -119,19 +121,19 @@ std::unordered_set<NodeIndex> GetShapeRelatedNodes(const onnxruntime::GraphViewe
         // After this for-loop, we will reversely traverse all nodes from every shape-producing node
         // found here until hitting Shape or Size nodes, graph inputs, or graph initializers.
         // All nodes on the traversal path will be forced to run on CPU.
-        LOGS_DEFAULT(INFO) << "Found a shape-producing node (i.e., a node that produces a tensor consumed as a shape-like input by other nodes): " << producer_node->Name() << std::endl;
+        LOGS_DEFAULT(VERBOSE) << "Found a shape-producing node (i.e., a node that produces a tensor consumed as a shape-like input by other nodes): " << producer_node->Name() << std::endl;
         shape_producing_nodes.insert(producer_node);
       }
     }
   }

-  std::unordered_set<NodeIndex> shape_related_node_indices;
+  InlinedHashSet<NodeIndex> shape_related_node_indices;
   for (auto& node : shape_producing_nodes) {
-    LOGS_DEFAULT(INFO) << "Begin the (topologically reversed) traversal from shape-producing node: " << node->Name() << std::endl;
+    LOGS_DEFAULT(VERBOSE) << "Begin the (topologically reversed) traversal from shape-producing node: " << node->Name() << std::endl;
     std::vector<const Node*> start_nodes = {node};

     auto to_stop = [](const Node* n1, const Node* n2) {
-      LOGS_DEFAULT(INFO) << "Skip the traversal from " << n1->Name() << " to " << n2->Name() << " since " << n2->Name() << " is a Shape or Size node." << std::endl;
+      LOGS_DEFAULT(VERBOSE) << "Skip the traversal from " << n1->Name() << " to " << n2->Name() << " since " << n2->Name() << " is a Shape or Size node." << std::endl;
       return n2->OpType() == "Shape" || n2->OpType() == "Size";
     };

@@ -141,7 +143,7 @@ std::unordered_set<NodeIndex> GetShapeRelatedNodes(const onnxruntime::GraphViewe
     graph.ReverseDFSFrom(
         start_nodes,
         [&shape_related_node_indices](const Node* n) {
-          LOGS_DEFAULT(INFO) << "Found an upstream node in the shape sub-graph (fall back to CPU): " << n->Name() << std::endl;
+          LOGS_DEFAULT(VERBOSE) << "Found an upstream node in the shape sub-graph (fall back to CPU): " << n->Name() << std::endl;
           shape_related_node_indices.insert(n->Index());
         },
         nullptr,
@@ -152,23 +154,6 @@ std::unordered_set<NodeIndex> GetShapeRelatedNodes(const onnxruntime::GraphViewe
   return shape_related_node_indices;
 }

-bool IsAggressiveCpuFallbackEnabled() {
-#if !defined(_WIN32) && ENABLE_TRAINING
-  // std::getenv is not available on every platform.
-  // Since ORT_AGGRESSIVE_CPU_FALLBACK is experimental,
-  // we only allow it in training builds to avoid build issues on
-  // custom platforms such as Xbox.
-  const char* p_env_var = std::getenv("ORT_AGGRESSIVE_CPU_FALLBACK");
-  if (!p_env_var) {
-    // No such environment variable.
-    return false;
-  }
-  return std::strcmp(p_env_var, "1") == 0;
-#else
-  return false;
-#endif
-}
-
 std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
                                                    const IExecutionProvider::IKernelLookup& kernel_lookup,
                                                    gsl::span<const NodeIndex> tentative_nodes,
                                                    const bool aggressive_cpu_fallback) {
@@ -214,7 +199,7 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
         auto consumer_nodes = graph.GetConsumerNodes(node_arg.Name());
         for (auto& consumer_node : consumer_nodes) {
           candidates.push(consumer_node->Index());
-          LOGS_DEFAULT(INFO) << "Candidate for fallback CPU execution: " << consumer_node->Name();
+          LOGS_DEFAULT(VERBOSE) << "Candidate for fallback CPU execution: " << consumer_node->Name();
         }
       }
       return Status::OK();
@@ -290,9 +275,9 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe

     if (place_in_cpu) {
       cpu_nodes.insert(cur);
-      LOGS_DEFAULT(INFO) << "ORT optimization- Force fallback to CPU execution for node: " << node->Name()
-                         << " because the CPU execution path is deemed faster than overhead involved with execution on other EPs "
-                         << " capable of executing this node";
+      LOGS_DEFAULT(VERBOSE) << "ORT optimization- Force fallback to CPU execution for node: " << node->Name()
+                            << " because the CPU execution path is deemed faster than overhead involved with execution on other EPs "
+                            << " capable of executing this node";
       for (auto* output : node->OutputDefs()) {
         cpu_output_args.insert(output);
       }

From 9f0ca0ffa777af385c6f7873302c3e8156fcfeb1 Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Tue, 12 Mar 2024 19:16:20 -0700
Subject: [PATCH 08/10] Fix warning

onnxruntime::SessionOptions is defined as a struct, so declaring the
shared-provider wrapper as a class triggers a class/struct tag-mismatch
warning on MSVC.
---
 .../core/providers/shared_library/provider_wrappedtypes.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
index a820cf57d9c6c..9711c24d2595c 100644
--- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
+++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
@@ -1075,7 +1075,7 @@ class SessionState {
   PROVIDER_DISALLOW_ALL(SessionState)
 };

-class SessionOptions {
+struct SessionOptions {
  public:
   const ConfigOptions& GetConfigOptions() const { return g_host->SessionOptions__GetConfigOptions(this); }

From 6db6a4d037cfaa321c79bdb6b96bf1ea787b41b7 Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Tue, 12 Mar 2024 19:18:38 -0700
Subject: [PATCH 09/10] Fix ROCm build

rocm_execution_provider.cc now references kOrtSessionOptionsAggressiveCpuFallback,
so include onnxruntime_session_options_config_keys.h there.
---
 onnxruntime/core/providers/rocm/rocm_execution_provider.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
index 67212f54a0492..73ec446b18ed2 100644
--- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
+++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
@@ -10,6 +10,7 @@
 #include "core/providers/rocm/rocm_fwd.h"
 #include "core/providers/rocm/gpu_data_transfer.h"
 #include "core/providers/rocm/rocm_profiler.h"
+#include "core/session/onnxruntime_session_options_config_keys.h"

 #ifndef DISABLE_CONTRIB_OPS
 #include "contrib_ops/rocm/rocm_contrib_kernels.h"

From 0f9de47fcf0fb6a53c9c82f021f5427c842a03c4 Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Tue, 12 Mar 2024 19:32:54 -0700
Subject: [PATCH 10/10] Exclude in minimal build
---
 .../session/onnxruntime_session_options_config_keys.h  |  2 ++
 onnxruntime/core/framework/fallback_cpu_capability.cc  | 10 ++++++++++
 onnxruntime/core/framework/fallback_cpu_capability.h   |  6 ++++++
 .../core/providers/cann/cann_execution_provider.cc     |  4 ++++
 .../core/providers/cuda/cuda_execution_provider.cc     |  4 ++++
 .../dml/DmlExecutionProvider/src/ExecutionProvider.cpp |  4 ++++
 onnxruntime/core/providers/js/js_execution_provider.cc |  5 +++++
 .../core/providers/rocm/rocm_execution_provider.cc     |  4 ++++
 .../core/providers/shared_library/provider_api.h       |  7 ++++++-
 .../shared_library/provider_bridge_provider.cc         |  9 ++++++++-
 .../providers/shared_library/provider_interfaces.h     |  5 ++++-
 onnxruntime/core/session/provider_bridge_ort.cc        |  9 ++++++++-
 12 files changed, 65 insertions(+), 4 deletions(-)

diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
index 1e02a02349df5..028c71fa35167 100644
--- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -300,4 +300,6 @@ static const char* const kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16 = "mlas
 // - "0": Disable reverse-traversing CPU fallback. [DEFAULT]
 // - "1": Enable reverse-traversing CPU fallback when calling GetCpuPreferredNodes(...).
 //        (i.e., adding nodes found by GetShapeRelatedNodes(...) to the CPU node list internally).
+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
 static const char* const kOrtSessionOptionsAggressiveCpuFallback = "session.aggressive_cpu_fallback";
+#endif
diff --git a/onnxruntime/core/framework/fallback_cpu_capability.cc b/onnxruntime/core/framework/fallback_cpu_capability.cc
index 21faffbfe5a00..dcee730be977f 100644
--- a/onnxruntime/core/framework/fallback_cpu_capability.cc
+++ b/onnxruntime/core/framework/fallback_cpu_capability.cc
@@ -42,6 +42,7 @@ static bool IsSmallInitializer(const onnxruntime::GraphViewer& graph, const Node
 }
 }  // namespace

+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
 static InlinedHashSet<NodeIndex> GetShapeRelatedNodes(const onnxruntime::GraphViewer& viewer) {
   // Conceptually, this function traverses from shape-consuming nodes
   // to fall back all their upstream nodes to CPU. Consider a graph
@@ -153,11 +154,18 @@ static InlinedHashSet<NodeIndex> GetShapeRelatedNodes(const onnxruntime::GraphVi

   return shape_related_node_indices;
 }
+#endif

+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
 std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
                                                    const IExecutionProvider::IKernelLookup& kernel_lookup,
                                                    gsl::span<const NodeIndex> tentative_nodes,
                                                    const bool aggressive_cpu_fallback) {
+#else
+std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
+                                                   const IExecutionProvider::IKernelLookup& kernel_lookup,
+                                                   gsl::span<const NodeIndex> tentative_nodes) {
+#endif
   // automatic conversion from const std::vector&
   const auto& ordered_nodes = graph.GetNodesInTopologicalOrder();
   InlinedVector<size_t> node_id_to_order_map(graph.MaxNodeIndex());
@@ -287,10 +295,12 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
     }
   }

+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
   if (aggressive_cpu_fallback) {
     auto shape_related_node_indices = GetShapeRelatedNodes(graph);
     cpu_nodes.insert(shape_related_node_indices.begin(), shape_related_node_indices.end());
   }
+#endif

   return cpu_nodes;
 }
diff --git a/onnxruntime/core/framework/fallback_cpu_capability.h b/onnxruntime/core/framework/fallback_cpu_capability.h
index 3e2f0d85f0306..c09fd4e7ea0db 100644
--- a/onnxruntime/core/framework/fallback_cpu_capability.h
+++ b/onnxruntime/core/framework/fallback_cpu_capability.h
@@ -18,9 +18,15 @@ namespace onnxruntime {
   @param tentative_nodes Nodes that are tentative to be placed on the target EP
   @param aggressive_cpu_fallback This is set by the kOrtSessionOptionsAggressiveCpuFallback option.
 */
+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
 std::unordered_set<NodeIndex> GetCpuPreferredNodes(const GraphViewer& graph,
                                                    const IExecutionProvider::IKernelLookup& kernel_lookup,
                                                    gsl::span<const NodeIndex> tentative_nodes,
                                                    const bool aggressive_cpu_fallback);
+#else
+std::unordered_set<NodeIndex> GetCpuPreferredNodes(const GraphViewer& graph,
+                                                   const IExecutionProvider::IKernelLookup& kernel_lookup,
+                                                   gsl::span<const NodeIndex> tentative_nodes);
+#endif

 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cann/cann_execution_provider.cc b/onnxruntime/core/providers/cann/cann_execution_provider.cc
index 4c0fe1cb20b99..0a6264be7d600 100644
--- a/onnxruntime/core/providers/cann/cann_execution_provider.cc
+++ b/onnxruntime/core/providers/cann/cann_execution_provider.cc
@@ -1296,6 +1296,7 @@ CANNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewe
       candidates.push_back(node.Index());
   }

+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
   auto p_session_options = GetSessionOptions();
   bool aggressive_cpu_fallback = false;
   if (p_session_options) {
@@ -1303,6 +1304,9 @@ CANNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewe
         kOrtSessionOptionsAggressiveCpuFallback, "0") == "1";
   }
   auto cpu_nodes = GetCpuPreferredNodes(graph_viewer, kernel_lookup, candidates, aggressive_cpu_fallback);
+#else
+  auto cpu_nodes = GetCpuPreferredNodes(graph_viewer, kernel_lookup, candidates);
+#endif
   for (auto& node_index : candidates) {
     if (cpu_nodes.count(node_index) > 0)
       continue;
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index 8ee21d705a765..0e1e8ae23aca7 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -2528,6 +2528,7 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
     }
   }

+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
   // For CUDA EP, exclude the subgraph that is preferred to be placed on CPU.
   // These are usually shape-related computation subgraphs.
   // The following logic can be extended to other EPs.
@@ -2538,6 +2539,9 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
                                   kOrtSessionOptionsAggressiveCpuFallback) == "1";
   }
   auto cpu_nodes = GetCpuPreferredNodes(graph, kernel_lookup, tentative_nodes, aggressive_cpu_fallback);
+#else
+  auto cpu_nodes = GetCpuPreferredNodes(graph, kernel_lookup, tentative_nodes);
+#endif
   std::vector<std::unique_ptr<ComputeCapability>> result;
   for (auto& node_index : candidates) {
     if (cpu_nodes.count(node_index) > 0)
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index 1953fd048ed3a..75f701eecba4c 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -886,6 +886,7 @@ namespace Dml
         }

         // Get the list of nodes that should stay on the CPU
+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
         auto p_session_options = GetSessionOptions();
         bool aggressive_cpu_fallback = false;
         if (p_session_options) {
@@ -893,6 +894,9 @@ namespace Dml
                 kOrtSessionOptionsAggressiveCpuFallback, "0") == "1";
         }
         auto cpuPreferredNodes = GetCpuPreferredNodes(graph, kernel_lookup, tentativeNodes, aggressive_cpu_fallback);
+#else
+        auto cpuPreferredNodes = GetCpuPreferredNodes(graph, kernel_lookup, tentativeNodes);
+#endif

         for (size_t nodeIndex : toplogicalOrder)
         {
diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc
index c97c5cb4d1871..b2424714c9fe8 100644
--- a/onnxruntime/core/providers/js/js_execution_provider.cc
+++ b/onnxruntime/core/providers/js/js_execution_provider.cc
@@ -728,6 +728,7 @@ std::vector<std::unique_ptr<ComputeCapability>> JsExecutionProvider::GetCapabili
     candidates.push_back(node.Index());
     tenative_candidates.push_back(node.Index());
   }
+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
   auto p_session_options = GetSessionOptions();
   bool aggressive_cpu_fallback = false;
   if (p_session_options) {
@@ -735,6 +736,10 @@ std::vector<std::unique_ptr<ComputeCapability>> JsExecutionProvider::GetCapabili
         kOrtSessionOptionsAggressiveCpuFallback, "0") == "1";
   }
   auto cpu_nodes = GetCpuPreferredNodes(graph, kernel_lookup, tenative_candidates, aggressive_cpu_fallback);
+#else
+  auto cpu_nodes = GetCpuPreferredNodes(graph, kernel_lookup, tenative_candidates);
+#endif
+
   std::vector<std::unique_ptr<ComputeCapability>> result;
   for (auto& node_index : candidates) {
     if (cpu_nodes.count(node_index) > 0) {
diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
index 73ec446b18ed2..db8bc1450b700 100644
--- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
+++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
@@ -2413,6 +2413,7 @@ ROCMExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
     }
   }

+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
   // For ROCM EP, exclude the subgraph that is preferred to be placed on CPU.
   // These are usually shape-related computation subgraphs.
   // The following logic can be extended to other EPs.
@@ -2423,6 +2424,9 @@ ROCMExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
         kOrtSessionOptionsAggressiveCpuFallback, "0") == "1";
   }
   auto cpu_nodes = GetCpuPreferredNodes(graph, kernel_lookup, candidates, aggressive_cpu_fallback);
+#else
+  auto cpu_nodes = GetCpuPreferredNodes(graph, kernel_lookup, candidates);
+#endif
   std::vector<std::unique_ptr<ComputeCapability>> result;
   for (auto& node_index : candidates) {
     if (cpu_nodes.count(node_index) > 0)
diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h
index ba3b72c5a7fe4..eff7e4dbd96b4 100644
--- a/onnxruntime/core/providers/shared_library/provider_api.h
+++ b/onnxruntime/core/providers/shared_library/provider_api.h
@@ -278,11 +278,16 @@ std::unique_ptr<IAllocator> CreateROCMPinnedAllocator(const char* name);

 std::unique_ptr<IDataTransfer> CreateGPUDataTransfer();

+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
 std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
                                                    const IExecutionProvider::IKernelLookup& kernel_lookup,
                                                    gsl::span<const NodeIndex> tentative_nodes,
                                                    const bool aggressive_cpu_fallback);
-
+#else
+std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
+                                                   const IExecutionProvider::IKernelLookup& kernel_lookup,
+                                                   gsl::span<const NodeIndex> tentative_nodes);
+#endif
 std::string GetEnvironmentVar(const std::string& var_name);

 namespace profiling {
diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
index 8c8e78a8426d1..6843de7a0f92f 100644
--- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
+++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
@@ -361,13 +361,20 @@ std::unique_ptr<IDataTransfer> CreateGPUDataTransfer() {

 std::string GetEnvironmentVar(const std::string& var_name) {
   return g_host->GetEnvironmentVar(var_name);
 }
-
+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
 std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
                                                    const IExecutionProvider::IKernelLookup& kernel_lookup,
                                                    gsl::span<const NodeIndex> tentative_nodes,
                                                    const bool aggressive_cpu_fallback) {
   return g_host->GetCpuPreferredNodes(graph, kernel_lookup, tentative_nodes, aggressive_cpu_fallback);
 }
+#else
+std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
+                                                   const IExecutionProvider::IKernelLookup& kernel_lookup,
+                                                   gsl::span<const NodeIndex> tentative_nodes) {
+  return g_host->GetCpuPreferredNodes(graph, kernel_lookup, tentative_nodes);
+}
+#endif

 namespace profiling {
diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h
index 73d1d81a4d6ad..5951f99fa8fa0 100644
--- a/onnxruntime/core/providers/shared_library/provider_interfaces.h
+++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h
@@ -189,11 +189,14 @@ struct ProviderHost {
   virtual Status RocmCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) = 0;
   virtual void RocmCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) = 0;
 #endif
-
   virtual std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
                                                              const IExecutionProvider::IKernelLookup& kernel_lookup,
+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
                                                              gsl::span<const NodeIndex> tentative_nodes,
                                                              const bool aggressive_cpu_fallback) = 0;
+#else
+                                                             gsl::span<const NodeIndex> tentative_nodes) = 0;
+#endif
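+  // Note: minimal and extended-minimal builds compile out the aggressive CPU
+  // fallback path, so only the three-argument overload is available there.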
   virtual Status UnpackTensor(const ONNX_NAMESPACE::TensorProto& tensor, const void* raw_data, size_t raw_data_len, /*out*/ bool* p_data, size_t expected_size) = 0;
   virtual Status UnpackTensor(const ONNX_NAMESPACE::TensorProto& tensor, const void* raw_data, size_t raw_data_len, /*out*/ float* p_data, size_t expected_size) = 0;
diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc
index b9a613b79b517..50af2394024b1 100644
--- a/onnxruntime/core/session/provider_bridge_ort.cc
+++ b/onnxruntime/core/session/provider_bridge_ort.cc
@@ -260,13 +260,20 @@ struct ProviderHostImpl : ProviderHost {
   std::string demangle(const char* name) override { return onnxruntime::profiling::demangle(name); }
   std::string demangle(const std::string& name) override { return onnxruntime::profiling::demangle(name); }
-
+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
   std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
                                                      const IExecutionProvider::IKernelLookup& kernel_lookup,
                                                      gsl::span<const NodeIndex> tentative_nodes,
                                                      const bool aggressive_cpu_fallback) override {
     return onnxruntime::GetCpuPreferredNodes(graph, kernel_lookup, tentative_nodes, aggressive_cpu_fallback);
   }
+#else
+  std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
+                                                     const IExecutionProvider::IKernelLookup& kernel_lookup,
+                                                     gsl::span<const NodeIndex> tentative_nodes) override {
+    return onnxruntime::GetCpuPreferredNodes(graph, kernel_lookup, tentative_nodes);
+  }
+#endif
   Status UnpackTensor(const ONNX_NAMESPACE::TensorProto& tensor, const void* raw_data, size_t raw_data_len, /*out*/ bool* p_data, size_t expected_size) override { return utils::UnpackTensor(tensor, raw_data, raw_data_len, p_data, expected_size); }
   Status UnpackTensor(const ONNX_NAMESPACE::TensorProto& tensor, const void* raw_data, size_t raw_data_len, /*out*/ float* p_data, size_t expected_size) override { return utils::UnpackTensor(tensor, raw_data, raw_data_len, p_data, expected_size); }