[Experimental] Add a path to fallback more nodes to CPUs. #19769
Maybe add some TODOs to enhance this for situations where it won't work:
(1) There is no shape "consumer" at all, i.e., the "shape like" output eventually becomes a graph output (a rare corner case, but there are definitely models like this; see the sketch after this list).
(2) Cases where the shape subgraph is split across graph levels: the main graph has some portion of the shape nodes and a subgraph has the rest. In this case the "shape consumer" at the main-graph level will be a subgraph-containing node (If/Loop/Scan), and the shape info may be consumed explicitly (as a graph input to the If/Loop/Scan) or implicitly, i.e., not as an explicit graph input but via some node in the subgraph referencing the main-graph node output(s).
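A minimal sketch of handling case (1), assuming the traversal keeps some per-value bookkeeping; `ValueInfo`, `IsShapeAnchor`, and the `consumed_as_shape` flag are illustrative names, not the onnxruntime API:

```cpp
#include <string>
#include <unordered_set>

// Simplified per-value record; not the real onnxruntime types.
struct ValueInfo {
  std::string name;
  bool consumed_as_shape;  // true when some node consumes this value as a shape
};

// Decide whether a value should anchor the reverse "shape subgraph" traversal.
bool IsShapeAnchor(const ValueInfo& v,
                   const std::unordered_set<std::string>& graph_outputs) {
  if (v.consumed_as_shape) return true;    // normal case: a shape consumer exists
  return graph_outputs.count(v.name) > 0;  // TODO (1): shape-like value has no
                                           // consumer because it is a graph output
}
```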
Instead of a reverse traversal from this pre-specified list of ops (which requires periodic maintenance: updating for new ops added to the ONNX standard, op version revisions, shape input indices changing across op version revisions, etc.), could the reverse traversal start from a provider-assigned node requiring a specific input on CPU? Usually any input a provider node needs on CPU is "shape like", and this information is available in the kernel def of the node. That seems like a more "automated" approach than the pre-cooked list.
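A rough sketch of that kernel-def-driven seeding; `NodeInfo`, `input_required_on_cpu`, and `CollectShapeSubgraph` are simplified stand-ins for the onnxruntime node classes and kernel-def query, not the real API:

```cpp
#include <cstddef>
#include <deque>
#include <unordered_set>
#include <vector>

// Simplified stand-in for an onnxruntime node.
struct NodeInfo {
  std::vector<const NodeInfo*> input_producers;  // producer per input (nullptr for graph inputs/initializers)
  std::vector<bool> input_required_on_cpu;       // per-input flag taken from the kernel def
};

// Seed the reverse traversal from kernel-def-pinned CPU inputs instead of a
// hand-maintained list of shape-consuming ops, then walk producers upstream.
std::unordered_set<const NodeInfo*> CollectShapeSubgraph(
    const std::vector<const NodeInfo*>& provider_nodes) {
  std::unordered_set<const NodeInfo*> shape_nodes;
  std::deque<const NodeInfo*> worklist;

  for (const NodeInfo* n : provider_nodes) {
    for (std::size_t i = 0; i < n->input_producers.size(); ++i) {
      if (n->input_required_on_cpu[i] && n->input_producers[i] != nullptr) {
        worklist.push_back(n->input_producers[i]);  // CPU-pinned input: seed here
      }
    }
  }

  // Standard reverse BFS over producers.
  while (!worklist.empty()) {
    const NodeInfo* n = worklist.front();
    worklist.pop_front();
    if (!shape_nodes.insert(n).second) continue;  // already visited
    for (const NodeInfo* p : n->input_producers) {
      if (p != nullptr) worklist.push_back(p);
    }
  }
  return shape_nodes;
}
```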
I don't know if operator Range is inlined, but it could be considered to consume a shape as well.
InlinedHashSet
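For reference, a minimal sketch of the suggested container swap, assuming onnxruntime's InlinedHashSet alias (the header path below is from memory and may differ):

```cpp
#include <string>
#include "core/common/inlined_containers.h"  // header path from memory; may differ

void Example() {
  // Drop-in, allocation-friendlier replacement for std::unordered_set
  // in hot graph-traversal code.
  onnxruntime::InlinedHashSet<std::string> visited;
  visited.insert("node_arg_name");
}
```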
Should we check if this is an initializer?
What is the difference? From the perspective of finding shape-related nodes, a graph input and an initializer are the same. I am not sure if ORT has different assumptions somewhere.
Are there nodes where a shape is just one of the outputs, but the rest of the computation should be done on the device?
I am not aware of any examples. If you are looking for an op that produces both CPU and GPU outputs, attention could be a case: it may want to pass the forward pass's random seed (an int64 scalar) to the backward pass.
What happens if a shape input is still on CUDA when this algorithm moves a node to CPU?
It will fall back the producer of the shape input, and its upstream nodes, to CPU.
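A minimal sketch of that behavior on a toy node type; `SimpleNode` and `FallbackUpstreamToCpu` are illustrative, as the real pass operates on onnxruntime's Graph/Node classes:

```cpp
#include <deque>
#include <string>
#include <unordered_set>
#include <vector>

// Illustrative node type, not the onnxruntime API.
struct SimpleNode {
  std::string provider;                // e.g. "CUDAExecutionProvider"
  std::vector<SimpleNode*> producers;  // producers of this node's inputs
};

// Once a node falls back to CPU, pull its producers back as well,
// so no shape tensor is left stranded on CUDA.
void FallbackUpstreamToCpu(SimpleNode* start) {
  std::deque<SimpleNode*> worklist{start};
  std::unordered_set<SimpleNode*> seen;
  while (!worklist.empty()) {
    SimpleNode* n = worklist.front();
    worklist.pop_front();
    if (n == nullptr || !seen.insert(n).second) continue;  // skip visited
    n->provider = "CPUExecutionProvider";                  // fall back this node
    for (SimpleNode* p : n->producers) worklist.push_back(p);  // ...and upstream
  }
}
```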