From 039e489f3ff5cc427030dc58e43af78696cf74e7 Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Mon, 4 Mar 2024 14:47:32 -0800
Subject: [PATCH 01/10] Add a new function to fall back more nodes to CPU.

Shape-related nodes don't only start with `Shape` or `Size`. In a
dynamo-captured ONNX model, a shape sub-graph can also start from a graph
input. A new transform is added to fall back all nodes that can be reached
by reverse traversal from a shape-like variable. Some shape-like variables
are listed below.
- all inputs of Range
- 2nd input of Reshape
- 2nd input of Unsqueeze
- 1st input of ConstantOfShape
- 2nd to 5th inputs of Slice

Fix header

Remove unused variable

Versioning shape inputs

Fix
---
 .../core/framework/fallback_cpu_capability.cc | 117 ++++++++++++++++++
 1 file changed, 117 insertions(+)

diff --git a/onnxruntime/core/framework/fallback_cpu_capability.cc b/onnxruntime/core/framework/fallback_cpu_capability.cc
index ef68b88187e08..8af05affe8aec 100644
--- a/onnxruntime/core/framework/fallback_cpu_capability.cc
+++ b/onnxruntime/core/framework/fallback_cpu_capability.cc
@@ -2,9 +2,13 @@
 // Licensed under the MIT License.

 #include "core/framework/fallback_cpu_capability.h"
+#include "core/framework/tensorprotoutils.h"
 #include "core/common/inlined_containers.h"

+#include <cstdlib>
+#include <cstring>
 #include <queue>
+#include <unordered_map>

 #include "onnx/defs/data_type_utils.h"

@@ -39,6 +43,115 @@ static bool IsSmallInitializer(const onnxruntime::GraphViewer& graph, const Node
 }
 }  // namespace

+std::unordered_set<NodeIndex> GetShapeRelatedNodes(const onnxruntime::GraphViewer& viewer) {
+  // Conceptually, this function traverses from shape-consuming nodes
+  // to fall back all their upstream nodes to CPU. Consider a graph
+  //
+  //   shape = onnx::Concat(s0, s1)
+  //   reshaped = onnx::Reshape(x, shape)
+  //
+  // The traversal should stop when
+  //  1. hitting Shape or Size nodes, graph inputs, or graph initializers.
+  //  2. hitting nodes with some large inputs or outputs.
+  LOGS_DEFAULT(INFO) << "Call GetShapeRelatedNodes to identify extra CPU nodes." << std::endl;
+
+  std::unordered_map<std::string, std::unordered_map<int, std::vector<int>>> shape_related_inputs_in_nodes = {
+      // 2nd input of Expand-13 is a shape-related input.
+      {"Expand", {{13 /* since version */, {1} /* shape inputs' indices */}}},
+      // 2nd input (indexed by 1) of Reshape-13, Reshape-14, Reshape-19, and Reshape-21 is a shape-related input.
+      {"Reshape", {{13, {1}}, {14, {1}}, {19, {1}}, {21, {1}}}},
+      // 2nd input of Unsqueeze-13 and Unsqueeze-21 is a shape-related input.
+      {"Unsqueeze", {{13, {1}}, {21, {1}}}},
+      // 1st input of ConstantOfShape is a shape-related input.
+      {"ConstantOfShape", {{9, {0}}, {20, {0}}, {21, {0}}}},
+      // 2nd to 5th inputs of Slice-13 are shape-related inputs.
+      {"Slice", {{13, {1, 2, 3, 4}}}}};
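+  // Reading: shape_related_inputs_in_nodes["Reshape"][14] == {1} means that for
+  // a Reshape node whose since-version is 14, the input at index 1 (its 2nd
+  // input) is consumed as a shape.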
+
+  auto& graph = viewer.GetGraph();
+  // Each shape-producing node produces a tensor consumed
+  // as a shape, axis, size, or indices.
+  // E.g.,
+  //   shape = onnx::Concat(s0, s1)
+  //   reshaped = onnx::Reshape(x, shape)
+  // Then, the shape-producing node is Concat.
+  std::unordered_set<const Node*> shape_producing_nodes;
+  // This loop collects all shape-producing nodes by finding
+  // all nodes that produce tensors specified in shape_related_inputs_in_nodes.
+  // E.g., for the above example, Concat is a shape-producing node because
+  // "Reshape" has a shape-related input at index 1.
+  for (auto& node : graph.Nodes()) {
+    LOGS_DEFAULT(INFO) << "Check if node " << node.Name() << " can be a sink of a shape sub-graph." << std::endl;
+    auto op_type_it = shape_related_inputs_in_nodes.find(node.OpType());
+    if (op_type_it == shape_related_inputs_in_nodes.end()) {
+      // This node doesn't consume a tensor as a shape,
+      // so we won't find any shape-producing node from it.
+      continue;
+    }
+    auto op_type_version_it = op_type_it->second.find(node.SinceVersion());
+    if (op_type_version_it == op_type_it->second.end()) {
+      // This node doesn't consume a tensor as a shape in this version,
+      // so we won't find any shape-producing node from it.
+      continue;
+    }
+
+    // Shape-like inputs' indices in this node.
+    // E.g., for Reshape, it's [1], and for Slice, it's [1, 2, 3, 4].
+    auto& shape_input_indices = op_type_version_it->second;
+    // Now, this `node` is a shape-consuming node as defined by shape_related_inputs_in_nodes.
+    // Let's find the producers of the shape-like tensors consumed by this `node`.
+    // Consider this graph:
+    //   shape = onnx::Concat(s0, s1)
+    //   reshaped = onnx::Reshape(x, shape)
+    // The loop below
+    //  1. checks all of `Reshape`'s inputs, `x` and `shape`,
+    //  2. finds that `shape` is a shape-related variable since Reshape's 2nd input is a shape-related input,
+    //  3. and then records the producer of `shape` (i.e., `Concat`).
+    for (auto& input_index : shape_input_indices) {
+      auto input = node.InputDefs().at(input_index);
+      auto producer_node = graph.GetProducerNode(input->Name());
+      if (producer_node != nullptr && producer_node->OpType() != "Shape" && producer_node->OpType() != "Size") {
+        // Assume shape-computing sub-graphs begin with Shape nodes, Size nodes, or graph inputs.
+        // We should not fall back those nodes' upstream nodes to CPU; otherwise,
+        // it may change
+        //   GPU-tensor-x -> Mul -> GPU-tensor-y -> Shape -> CPU-tensor
+        // to
+        //   CPU-tensor-x -> Mul -> CPU-tensor -> Shape -> CPU-tensor
+        // and slow down the computation.
+
+        // After this for-loop, we will reversely traverse all nodes from every shape-producing node
+        // found here until hitting Shape or Size nodes, graph inputs, or graph initializers.
+        // All nodes on the traversal path will be forced to run on CPU.
+        LOGS_DEFAULT(INFO) << "Found a shape-producing node (i.e., a node that produces a tensor consumed as a shape-like input by other nodes): " << producer_node->Name() << std::endl;
+        shape_producing_nodes.insert(producer_node);
+      }
+    }
+  }
+
+  std::unordered_set<NodeIndex> shape_related_node_indices;
+  for (auto& node : shape_producing_nodes) {
+    LOGS_DEFAULT(INFO) << "Begin the (topologically reversed) traversal from shape-producing node: " << node->Name() << std::endl;
+    std::vector<const Node*> start_nodes = {node};
+
+    auto to_stop = [](const Node* n1, const Node* n2) {
+      LOGS_DEFAULT(INFO) << "Skip the traversal from " << n1->Name() << " to " << n2->Name() << " since " << n2->Name() << " is a Shape or Size node." << std::endl;
+      return n2->OpType() == "Shape" || n2->OpType() == "Size";
+    };
+
+    // Reversely traverse all nodes from the shape-producing node,
+    // forcing every visited node to run on CPU.
+    // Stop the traversal when a Shape or Size node is found.
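+    // E.g., with
+    //   shape = onnx::Concat(s0, s1)
+    //   reshaped = onnx::Reshape(x, shape)
+    // the traversal starts at Concat and walks up through the producers of s0
+    // and s1, while `to_stop` above prunes any edge that enters a Shape or
+    // Size node.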
+    graph.ReverseDFSFrom(
+        start_nodes,
+        [&shape_related_node_indices](const Node* n) {
+          LOGS_DEFAULT(INFO) << "Found an upstream node in the shape sub-graph (fall back to CPU): " << n->Name() << std::endl;
+          shape_related_node_indices.insert(n->Index());
+        },
+        nullptr,
+        NodeCompare(),
+        to_stop);
+  }
+
+  return shape_related_node_indices;
+}
+
 std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
                                                    const IExecutionProvider::IKernelLookup& kernel_lookup,
                                                    gsl::span<const NodeIndex> tentative_nodes) {
@@ -171,6 +284,10 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
     }
   }

+  if (std::strcmp(std::getenv("ORT_AGGRESSIVE_CPU_FALLBACK"), "1") == 0) {
+    auto shape_related_node_indices = GetShapeRelatedNodes(graph);
+    cpu_nodes.insert(shape_related_node_indices.begin(), shape_related_node_indices.end());
+  }
   return cpu_nodes;
 }

From 0dac90238fbf80a5b8a710dca7ce0e4a631e2812 Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Tue, 5 Mar 2024 09:29:41 -0800
Subject: [PATCH 02/10] Fix segfault

std::getenv returns nullptr when ORT_AGGRESSIVE_CPU_FALLBACK is not set,
and passing that nullptr to std::strcmp is undefined behavior, so guard
the lookup behind a null check.
---
 .../core/framework/fallback_cpu_capability.cc | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/core/framework/fallback_cpu_capability.cc b/onnxruntime/core/framework/fallback_cpu_capability.cc
index 8af05affe8aec..dc968736dbe8f 100644
--- a/onnxruntime/core/framework/fallback_cpu_capability.cc
+++ b/onnxruntime/core/framework/fallback_cpu_capability.cc
@@ -152,6 +152,23 @@ std::unordered_set<NodeIndex> GetShapeRelatedNodes(const onnxruntime::GraphViewe
   return shape_related_node_indices;
 }

+bool IsAggressiveCpuFallbackEnabled() {
+#if !defined(_WIN32) && ENABLE_TRAINING
+  // std::getenv is not available on every platform.
+  // Since ORT_AGGRESSIVE_CPU_FALLBACK is experimental,
+  // we only allow it in training builds to avoid build issues on
+  // custom platforms such as Xbox.
+  const char* p_env_var = std::getenv("ORT_AGGRESSIVE_CPU_FALLBACK");
+  if (!p_env_var) {
+    // No such environment variable.
+    return false;
+  }
+  return std::strcmp(p_env_var, "1") == 0;
+#else
+  return false;
+#endif
+}
+
 std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
                                                    const IExecutionProvider::IKernelLookup& kernel_lookup,
                                                    gsl::span<const NodeIndex> tentative_nodes) {
@@ -284,7 +301,7 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
     }
   }

-  if (std::strcmp(std::getenv("ORT_AGGRESSIVE_CPU_FALLBACK"), "1") == 0) {
+  if (IsAggressiveCpuFallbackEnabled()) {
     auto shape_related_node_indices = GetShapeRelatedNodes(graph);
     cpu_nodes.insert(shape_related_node_indices.begin(), shape_related_node_indices.end());
   }

From 520332f760953f69c03b54dfbb142d23c6d42f6c Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Tue, 5 Mar 2024 14:50:12 -0800
Subject: [PATCH 03/10] Add a simple test

Fix typo

Write to a fixed place

Remove unused imports

Run it

Fix

Change test location
---
 ...rttraining_test_aggressive_cpu_fallback.py | 54 +++++++++++++++++++
 .../test/python/orttraining_test_ort_apis.py  | 14 +++++
 2 files changed, 68 insertions(+)
 create mode 100644 orttraining/orttraining/test/python/orttraining_test_aggressive_cpu_fallback.py

diff --git a/orttraining/orttraining/test/python/orttraining_test_aggressive_cpu_fallback.py b/orttraining/orttraining/test/python/orttraining_test_aggressive_cpu_fallback.py
new file mode 100644
index 0000000000000..d6fa503695b89
--- /dev/null
+++ b/orttraining/orttraining/test/python/orttraining_test_aggressive_cpu_fallback.py
@@ -0,0 +1,54 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import unittest
+
+import onnx
+import onnxscript
+from onnxscript.onnx_opset import opset18
+from onnxscript.onnx_types import FLOAT, INT64
+
+import onnxruntime
+
+
+class TestAggressiveCpuFallback(unittest.TestCase):
+    def test_cpu_fallback(self):
+        @onnxscript.script(default_opset=opset18)
+        def foo(x: FLOAT[12], w: FLOAT[6, 2], dim0: INT64[1], dim1: INT64[1]):
+            # This should be computed by CPU but is placed
+            # on CUDA (i.e., all inputs and outputs are GPU tensors).
+            dim2 = dim1 + 1
+            # Same as `dim2 = dim1 + 1`. Another GPU node.
+            dim3 = dim2 - 1
+            # Same as `dim2 = dim1 + 1`. Another GPU node.
+            new_shape = opset18.Concat(dim0, dim3, axis=0)
+            # A memcpy node will be inserted to copy the GPU output
+            # to CPU since Reshape's 2nd input is a CPU tensor
+            # per schema definition.
+            #
+            # Use ORT_AGGRESSIVE_CPU_FALLBACK=1 to
+            # 1. remove the memcpy node.
+            # 2. fall back all computation above this line to CPU.
+            new_x = opset18.Reshape(x, new_shape)
+            y = opset18.MatMul(new_x, w)
+            return y
+
+        model = foo.to_model_proto()
+
+        session_options = onnxruntime.SessionOptions()
+        session_options.optimized_model_filepath = "cpu_fallback_test.onnx"
+        # This call should trigger GetCpuPreferredNodes and then GetShapeRelatedNodes
+        # when environment variable ORT_AGGRESSIVE_CPU_FALLBACK=1 is set.
+        # As a result, no memcpy node should be observed in the optimized graph.
+        #
+        # See comments inside `foo`.
+        onnxruntime.InferenceSession(
+            path_or_bytes=model.SerializeToString(), sess_options=session_options, providers=["CUDAExecutionProvider"]
+        )
+        optimized = onnx.load("cpu_fallback_test.onnx")
+
+        self.assertTrue(all(node.op_type != "MemcpyToHost" for node in optimized.graph.node))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/orttraining/orttraining/test/python/orttraining_test_ort_apis.py b/orttraining/orttraining/test/python/orttraining_test_ort_apis.py
index a3e666dd404f2..524497bdd0b4e 100644
--- a/orttraining/orttraining/test/python/orttraining_test_ort_apis.py
+++ b/orttraining/orttraining/test/python/orttraining_test_ort_apis.py
@@ -42,6 +42,18 @@ def run_onnxblock_tests(cwd, log):
     run_subprocess(command, cwd=cwd, log=log).check_returncode()


+def run_aggressive_cpu_fallback_test(cwd, log):
+    log.debug("Running: Aggressive CPU Fallback")
+
+    command = [
+        "python3",
+        "orttraining_test_aggressive_cpu_fallback.py",
+    ]
+
+    env = {"ORT_AGGRESSIVE_CPU_FALLBACK": "1"}
+    run_subprocess(command, cwd=cwd, log=log, env=env).check_returncode()
+
+
 def main():
     args = parse_arguments()
     cwd = args.cwd
@@ -52,6 +64,8 @@ def main():

     run_training_apis_python_api_tests(cwd, log)

+    run_aggressive_cpu_fallback_test(cwd, log)
+
     return 0


From 05c9a4125924dff9209a3921e3ceaa65617c9bcd Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Wed, 6 Mar 2024 15:45:23 -0800
Subject: [PATCH 04/10] Install onnxscript
---
 .../templates/orttraining-linux-gpu-test-ci-pipeline.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml
index 5dc156e301357..a079b6e900a6a 100644
--- a/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml
@@ -39,6 +39,7 @@ steps:
     timeoutInMinutes: 60

 # Entry point for all ort training api tests
+# TODO: move onnxscript installation to the CI image.
 - script: |
     docker run \
       --gpus all \
       --volume $(Build.SourcesDirectory):/onnxruntime_src \
       --volume $(Build.BinariesDirectory)/${{ parameters.BuildConfig }}:/build \
       ${{ parameters.DockerImageTag }} \
-      bash -c "rm -rf /build/onnxruntime/ && python3 -m pip install /build/dist/onnxruntime*.whl && /build/launch_test.py --cmd_line_with_args 'python orttraining_test_ort_apis.py --cwd /build' --cwd /build" \
+      bash -c "rm -rf /build/onnxruntime/ && python3 -m pip install /build/dist/onnxruntime*.whl && python3 -m pip install onnxscript && /build/launch_test.py --cmd_line_with_args 'python orttraining_test_ort_apis.py --cwd /build' --cwd /build" \
   displayName: 'Run ORT Training APIs Tests'
   condition: succeededOrFailed()
   timeoutInMinutes: 120

From 8f8c8fb0d252c91ee23985b1a1e34751ef757048 Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Thu, 7 Mar 2024 15:36:16 -0800
Subject: [PATCH 05/10] New session option key
---
 .../onnxruntime_session_options_config_keys.h | 45 +++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
index b282438795eb5..740095d5b1876 100644
--- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -256,3 +256,48 @@ static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.context_embed
 // - "0": Gemm FastMath mode is not enabled. [DEFAULT]
 // - "1": Gemm FastMath mode is enabled.
 static const char* const kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16 = "mlas.enable_gemm_fastmath_arm64_bfloat16";
+
+// Optionally identify shape-related sub-graphs by traversing the graph in reverse order,
+// starting from all CPU-consuming inputs (e.g., for Reshape-13, the traversal
+// starts from its 2nd input). Traversal stops when hitting a Size or Shape operator.
+// The identified sub-graphs will be assigned to the CPU EP.
+//
+// See the comments in the model defined with onnxscript in Python below for an example.
+//
+// @onnxscript.script(default_opset=opset18)
+// def foo(x: FLOAT[12], w: FLOAT[6, 2], dim0: INT64[1], dim1: INT64[1]):
+//     # This should be computed by CPU but is placed
+//     # on CUDA (i.e., all inputs and outputs are GPU tensors)
+//     # when this option is not set to 1.
+//     dim2 = dim1 + 1
+//     # Same as `dim2 = dim1 + 1`. Another GPU node
+//     # when this option is not set to 1.
+//     dim3 = dim2 - 1
+//     # Same as `dim2 = dim1 + 1`. Another GPU node
+//     # when this option is not set to 1.
+//     new_shape = opset18.Concat(dim0, dim3, axis=0)
+//
+//     # A memcpy node will be inserted to copy the GPU output
+//     # `new_shape` to CPU since Reshape's 2nd input is a CPU tensor
+//     # per schema definition.
+//     #
+//     # To
+//     # 1. remove the memcpy node, and
+//     # 2. fall back all computation above this line to CPU,
+//     # use the following code in Python:
+//     #   import onnxruntime
+//     #   so = onnxruntime.SessionOptions()
+//     #   so.add_session_config_entry("session.reverse_traverse_cpu_fallback", "1")
+//     #
+//     # Note that x and new_x are still on GPU w/wo
+//     # setting session.reverse_traverse_cpu_fallback.
+//     new_x = opset18.Reshape(x, new_shape)
+//     # A pure GPU node.
+//     y = opset18.MatMul(new_x, w)
+//     return y
+//
+// Option values:
+// - "0": Disable reverse-traversing CPU fallback. [DEFAULT]
+// - "1": Enable reverse-traversing CPU fallback when calling GetCpuPreferredNodes(...).
+//        (i.e., adding nodes found by GetShapeRelatedNodes(...) to the CPU node list internally).
+static const char* const kOrtSessionOptionsReverseTraverseCpuFallback = "session.reverse_traverse_cpu_fallback";

From 526b166f8d21cbd054f3f6cea5533bd6d1c14556 Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Tue, 12 Mar 2024 15:32:18 -0700
Subject: [PATCH 06/10] Make the flag a session option instead of an env var
---
 .../core/framework/execution_provider.h                | 10 ++++++++++
 .../session/onnxruntime_session_options_config_keys.h  |  2 +-
 onnxruntime/core/framework/fallback_cpu_capability.cc  |  5 +++--
 onnxruntime/core/framework/fallback_cpu_capability.h   |  4 +++-
 onnxruntime/core/framework/session_options.h           |  4 ++++
 .../core/providers/cann/cann_execution_provider.cc     |  8 +++++++-
 .../core/providers/cuda/cuda_execution_provider.cc     |  9 ++++++++-
 .../dml/DmlExecutionProvider/src/ExecutionProvider.cpp |  8 +++++++-
 onnxruntime/core/providers/js/js_execution_provider.cc |  8 +++++++-
 .../core/providers/rocm/rocm_execution_provider.cc     |  8 +++++++-
 .../core/providers/shared_library/provider_api.h       |  3 ++-
 .../shared_library/provider_bridge_provider.cc         |  5 +++--
 .../providers/shared_library/provider_interfaces.h     |  5 ++++-
 .../providers/shared_library/provider_wrappedtypes.h   |  7 +++++++
 onnxruntime/core/session/inference_session.cc          |  1 +
 onnxruntime/core/session/provider_bridge_ort.cc        |  6 ++++--
 .../python/orttraining_test_aggressive_cpu_fallback.py |  1 +
 17 files changed, 79 insertions(+), 15 deletions(-)

diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h
index 40ca96a19aef1..85faabaa32bdc 100644
--- a/include/onnxruntime/core/framework/execution_provider.h
+++ b/include/onnxruntime/core/framework/execution_provider.h
@@ -11,6 +11,7 @@
 #include "core/common/logging/logging.h"
 #include "core/common/status.h"
 #include "core/framework/data_transfer.h"
+#include "core/framework/session_options.h"
 #include "core/framework/tensor.h"

 namespace onnxruntime {
@@ -277,6 +278,14 @@ class IExecutionProvider {
     return logger_;
   }

+  void SetSessionOptions(const SessionOptions* session_options) {
+    session_options_ = session_options;
+  }
+
+  const SessionOptions* GetSessionOptions() const {
+    return session_options_;
+  }
+
   virtual std::unique_ptr<profiling::EpProfiler> GetProfiler() {
     return {};
   }
@@ -330,5 +339,6 @@ class IExecutionProvider {

   // It will be set when this object is registered to a session
   const logging::Logger* logger_ = nullptr;
+  const SessionOptions* session_options_ = nullptr;
 };
 }  // namespace onnxruntime
diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
index 740095d5b1876..1e02a02349df5 100644
--- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -300,4 +300,4 @@ static const char* const kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16 = "mlas
 // - "0": Disable reverse-traversing CPU fallback. [DEFAULT]
 // - "1": Enable reverse-traversing CPU fallback when calling GetCpuPreferredNodes(...).
 //        (i.e., adding nodes found by GetShapeRelatedNodes(...) to the CPU node list internally).
-static const char* const kOrtSessionOptionsReverseTraverseCpuFallback = "session.reverse_traverse_cpu_fallback";
+static const char* const kOrtSessionOptionsAggressiveCpuFallback = "session.aggressive_cpu_fallback";
diff --git a/onnxruntime/core/framework/fallback_cpu_capability.cc b/onnxruntime/core/framework/fallback_cpu_capability.cc
index dc968736dbe8f..33489ca44038b 100644
--- a/onnxruntime/core/framework/fallback_cpu_capability.cc
+++ b/onnxruntime/core/framework/fallback_cpu_capability.cc
@@ -171,7 +171,8 @@ bool IsAggressiveCpuFallbackEnabled() {

 std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
                                                    const IExecutionProvider::IKernelLookup& kernel_lookup,
-                                                   gsl::span<const NodeIndex> tentative_nodes) {
+                                                   gsl::span<const NodeIndex> tentative_nodes,
+                                                   const bool aggressive_cpu_fallback) {
   // automatic conversion from const std::vector&
   const auto& ordered_nodes = graph.GetNodesInTopologicalOrder();
   InlinedVector<size_t> node_id_to_order_map(graph.MaxNodeIndex());
@@ -301,7 +302,7 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
     }
   }

-  if (IsAggressiveCpuFallbackEnabled()) {
+  if (aggressive_cpu_fallback) {
     auto shape_related_node_indices = GetShapeRelatedNodes(graph);
     cpu_nodes.insert(shape_related_node_indices.begin(), shape_related_node_indices.end());
   }
diff --git a/onnxruntime/core/framework/fallback_cpu_capability.h b/onnxruntime/core/framework/fallback_cpu_capability.h
index 7c8f91c7dad34..3e2f0d85f0306 100644
--- a/onnxruntime/core/framework/fallback_cpu_capability.h
+++ b/onnxruntime/core/framework/fallback_cpu_capability.h
@@ -16,9 +16,11 @@ namespace onnxruntime {
   @param graph Graph viewer
   @param kernel_lookup The kernel lookup for the target execution provider
   @param tentative_nodes Nodes that are tentative to be placed on the target EP
+  @param aggressive_cpu_fallback This is set by the kOrtSessionOptionsAggressiveCpuFallback option.
 */
 std::unordered_set<NodeIndex> GetCpuPreferredNodes(const GraphViewer& graph,
                                                    const IExecutionProvider::IKernelLookup& kernel_lookup,
-                                                   gsl::span<const NodeIndex> tentative_nodes);
+                                                   gsl::span<const NodeIndex> tentative_nodes,
+                                                   const bool aggressive_cpu_fallback);

 }  // namespace onnxruntime
diff --git a/onnxruntime/core/framework/session_options.h b/onnxruntime/core/framework/session_options.h
index 796a018ac0f68..5685c3a200556 100644
--- a/onnxruntime/core/framework/session_options.h
+++ b/onnxruntime/core/framework/session_options.h
@@ -146,6 +146,10 @@ struct SessionOptions {
   // The configuration keys and value formats are defined in
   // /include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
   ConfigOptions config_options;
+
+  const ConfigOptions& GetConfigOptions() const {
+    return config_options;
+  }
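+
+  // Execution providers can read config entries through
+  // IExecutionProvider::GetSessionOptions(), e.g.
+  //   GetSessionOptions()->config_options.GetConfigOrDefault(
+  //       kOrtSessionOptionsAggressiveCpuFallback, "0") == "1"
+  // as done in the provider changes below.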
 std::unordered_map<std::string, const OrtValue*> initializers_to_share_map;

 // See onnxruntime_c_api.h for detailed documentation.
diff --git a/onnxruntime/core/providers/cann/cann_execution_provider.cc b/onnxruntime/core/providers/cann/cann_execution_provider.cc
index 9a242919665bb..4c0fe1cb20b99 100644
--- a/onnxruntime/core/providers/cann/cann_execution_provider.cc
+++ b/onnxruntime/core/providers/cann/cann_execution_provider.cc
@@ -1296,7 +1296,13 @@ CANNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewe
       candidates.push_back(node.Index());
   }

-  auto cpu_nodes = GetCpuPreferredNodes(graph_viewer, kernel_lookup, candidates);
+  auto p_session_options = GetSessionOptions();
+  bool aggressive_cpu_fallback = false;
+  if (p_session_options) {
+    aggressive_cpu_fallback = p_session_options->config_options.GetConfigOrDefault(
+        kOrtSessionOptionsAggressiveCpuFallback, "0") == "1";
+  }
+  auto cpu_nodes = GetCpuPreferredNodes(graph_viewer, kernel_lookup, candidates, aggressive_cpu_fallback);
   for (auto& node_index : candidates) {
     if (cpu_nodes.count(node_index) > 0)
       continue;
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index 05d9f3b5a1e8f..8ee21d705a765 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -13,6 +13,7 @@
 #include "core/providers/cuda/gpu_data_transfer.h"
 #include "core/providers/cuda/cuda_profiler.h"
 #include "core/session/onnxruntime_run_options_config_keys.h"
+#include "core/session/onnxruntime_session_options_config_keys.h"

 #ifndef USE_CUDA_MINIMAL
 #ifndef DISABLE_CONTRIB_OPS
@@ -2530,7 +2531,13 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
   // For CUDA EP, exclude the subgraph that is preferred to be placed on CPU.
   // These are usually shape-related computation subgraphs.
   // The following logic can be extended to other EPs.
-  auto cpu_nodes = GetCpuPreferredNodes(graph, kernel_lookup, tentative_nodes);
+  auto p_session_options = GetSessionOptions();
+  bool aggressive_cpu_fallback = false;
+  if (p_session_options) {
+    aggressive_cpu_fallback = p_session_options->GetConfigOptions().GetConfigEntry(
+                                  kOrtSessionOptionsAggressiveCpuFallback) == "1";
+  }
+  auto cpu_nodes = GetCpuPreferredNodes(graph, kernel_lookup, tentative_nodes, aggressive_cpu_fallback);
   std::vector<std::unique_ptr<ComputeCapability>> result;
   for (auto& node_index : candidates) {
     if (cpu_nodes.count(node_index) > 0)
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index 6c347ebdca7c1..1953fd048ed3a 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -886,7 +886,13 @@ namespace Dml
         }

         // Get the list of nodes that should stay on the CPU
-        auto cpuPreferredNodes = GetCpuPreferredNodes(graph, kernel_lookup, tentativeNodes);
+        auto p_session_options = GetSessionOptions();
+        bool aggressive_cpu_fallback = false;
+        if (p_session_options) {
+            aggressive_cpu_fallback = p_session_options->config_options.GetConfigOrDefault(
+                kOrtSessionOptionsAggressiveCpuFallback, "0") == "1";
+        }
+        auto cpuPreferredNodes = GetCpuPreferredNodes(graph, kernel_lookup, tentativeNodes, aggressive_cpu_fallback);

         for (size_t nodeIndex : toplogicalOrder)
         {
diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc
index 038423104d92e..c97c5cb4d1871 100644
--- a/onnxruntime/core/providers/js/js_execution_provider.cc
+++ b/onnxruntime/core/providers/js/js_execution_provider.cc
@@ -728,7 +728,13 @@ std::vector<std::unique_ptr<ComputeCapability>> JsExecutionProvider::GetCapabili
     candidates.push_back(node.Index());
     tenative_candidates.push_back(node.Index());
   }
-  auto cpu_nodes = GetCpuPreferredNodes(graph, kernel_lookup, tenative_candidates);
+  auto p_session_options = GetSessionOptions();
+  bool aggressive_cpu_fallback = false;
+  if (p_session_options) {
+    aggressive_cpu_fallback = p_session_options->config_options.GetConfigOrDefault(
+        kOrtSessionOptionsAggressiveCpuFallback, "0") == "1";
+  }
+  auto cpu_nodes = GetCpuPreferredNodes(graph, kernel_lookup, tenative_candidates, aggressive_cpu_fallback);
   std::vector<std::unique_ptr<ComputeCapability>> result;
   for (auto& node_index : candidates) {
     if (cpu_nodes.count(node_index) > 0) {
diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
index 87daaeea969ac..67212f54a0492 100644
--- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
+++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
@@ -2415,7 +2415,13 @@ ROCMExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
   // For ROCM EP, exclude the subgraph that is preferred to be placed on CPU.
   // These are usually shape-related computation subgraphs.
   // The following logic can be extended to other EPs.
-  auto cpu_nodes = GetCpuPreferredNodes(graph, kernel_lookup, candidates);
+  auto p_session_options = GetSessionOptions();
+  bool aggressive_cpu_fallback = false;
+  if (p_session_options) {
+    aggressive_cpu_fallback = p_session_options->config_options.GetConfigOrDefault(
+        kOrtSessionOptionsAggressiveCpuFallback, "0") == "1";
+  }
+  auto cpu_nodes = GetCpuPreferredNodes(graph, kernel_lookup, candidates, aggressive_cpu_fallback);
   std::vector<std::unique_ptr<ComputeCapability>> result;
   for (auto& node_index : candidates) {
     if (cpu_nodes.count(node_index) > 0)
diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h
index 1cebe4a256fd4..ba3b72c5a7fe4 100644
--- a/onnxruntime/core/providers/shared_library/provider_api.h
+++ b/onnxruntime/core/providers/shared_library/provider_api.h
@@ -280,7 +280,8 @@ std::unique_ptr<IDataTransfer> CreateGPUDataTransfer();

 std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
                                                    const IExecutionProvider::IKernelLookup& kernel_lookup,
-                                                   gsl::span<const NodeIndex> tentative_nodes);
+                                                   gsl::span<const NodeIndex> tentative_nodes,
+                                                   const bool aggressive_cpu_fallback);

 std::string GetEnvironmentVar(const std::string& var_name);

diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
index 7b73ab36b3742..8c8e78a8426d1 100644
--- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
+++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
@@ -364,8 +364,9 @@ std::string GetEnvironmentVar(const std::string& var_name) {

 std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
                                                    const IExecutionProvider::IKernelLookup& kernel_lookup,
-                                                   gsl::span<const NodeIndex> tentative_nodes) {
-  return g_host->GetCpuPreferredNodes(graph, kernel_lookup, tentative_nodes);
+                                                   gsl::span<const NodeIndex> tentative_nodes,
+                                                   const bool aggressive_cpu_fallback) {
+  return g_host->GetCpuPreferredNodes(graph, kernel_lookup, tentative_nodes, aggressive_cpu_fallback);
 }

 namespace profiling {
diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h
index 8c8d5b1fd460a..73d1d81a4d6ad 100644
--- a/onnxruntime/core/providers/shared_library/provider_interfaces.h
+++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h
@@ -192,7 +192,8 @@ struct ProviderHost {

   virtual std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
                                                              const IExecutionProvider::IKernelLookup& kernel_lookup,
-                                                             gsl::span<const NodeIndex> tentative_nodes) = 0;
+                                                             gsl::span<const NodeIndex> tentative_nodes,
+                                                             const bool aggressive_cpu_fallback) = 0;

   virtual Status UnpackTensor(const ONNX_NAMESPACE::TensorProto& tensor, const void* raw_data, size_t raw_data_len, /*out*/ bool* p_data, size_t expected_size) = 0;
   virtual Status UnpackTensor(const ONNX_NAMESPACE::TensorProto& tensor, const void* raw_data, size_t raw_data_len, /*out*/ float* p_data, size_t expected_size) = 0;
@@ -913,6 +914,8 @@ struct ProviderHost {
   // SessionState
   virtual const DataTransferManager& SessionState__GetDataTransferMgr(const SessionState* p) = 0;

+  virtual const ConfigOptions& SessionOptions__GetConfigOptions(const SessionOptions* p) = 0;
+
   // Tensor
   virtual std::unique_ptr<Tensor> Tensor__construct(MLDataType p_type, const TensorShape& shape, std::shared_ptr<IAllocator> allocator) = 0;
   virtual std::unique_ptr<Tensor> Tensor__construct(MLDataType p_type, const TensorShape& shape, void* p_data, const OrtMemoryInfo& alloc, ptrdiff_t offset) = 0;
diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
index bdad18c7edec0..a820cf57d9c6c 100644
--- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
+++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
@@ -1075,6 +1075,13 @@ class SessionState {
   PROVIDER_DISALLOW_ALL(SessionState)
 };

+class SessionOptions {
+ public:
+  const ConfigOptions& GetConfigOptions() const { return g_host->SessionOptions__GetConfigOptions(this); }
+
+  PROVIDER_DISALLOW_ALL(SessionOptions)
+};
+
 struct Tensor final {
   static std::unique_ptr<Tensor> CreateDefault() { return g_host->Tensor__construct_default(); }
   static std::unique_ptr<Tensor> Create(MLDataType p_type, const TensorShape& shape, std::shared_ptr<IAllocator> allocator) { return g_host->Tensor__construct(p_type, shape, std::move(allocator)); }
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index ece224ef206fc..ee1f0fe805223 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -763,6 +763,7 @@ common::Status InferenceSession::RegisterExecutionProvider(const std::shared_ptr
   }

   p_exec_provider->SetLogger(session_logger_);
+  p_exec_provider->SetSessionOptions(&session_options_);
   session_profiler_.AddEpProfilers(p_exec_provider->GetProfiler());
   return execution_providers_.Add(provider_type, p_exec_provider);
 }
diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc
index c7cf5963fa10f..b9a613b79b517 100644
--- a/onnxruntime/core/session/provider_bridge_ort.cc
+++ b/onnxruntime/core/session/provider_bridge_ort.cc
@@ -263,8 +263,9 @@ struct ProviderHostImpl : ProviderHost {

   std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
                                                      const IExecutionProvider::IKernelLookup& kernel_lookup,
-                                                     gsl::span<const NodeIndex> tentative_nodes) override {
-    return onnxruntime::GetCpuPreferredNodes(graph, kernel_lookup, tentative_nodes);
+                                                     gsl::span<const NodeIndex> tentative_nodes,
+                                                     const bool aggressive_cpu_fallback) override {
+    return onnxruntime::GetCpuPreferredNodes(graph, kernel_lookup, tentative_nodes, aggressive_cpu_fallback);
   }

   Status UnpackTensor(const ONNX_NAMESPACE::TensorProto& tensor, const void* raw_data, size_t raw_data_len, /*out*/ bool* p_data, size_t expected_size) override { return utils::UnpackTensor(tensor, raw_data, raw_data_len, p_data, expected_size); }
@@ -1153,6 +1154,7 @@ struct ProviderHostImpl : ProviderHost {

   // SessionState (wrapped)
   const DataTransferManager& SessionState__GetDataTransferMgr(const SessionState* p) override { return p->GetDataTransferMgr(); }
+  const ConfigOptions& SessionOptions__GetConfigOptions(const SessionOptions* p) override { return p->GetConfigOptions(); }

   // Tensor (wrapped)
   std::unique_ptr<Tensor> Tensor__construct(MLDataType p_type, const TensorShape& shape, std::shared_ptr<IAllocator> allocator) override {
diff --git a/orttraining/orttraining/test/python/orttraining_test_aggressive_cpu_fallback.py b/orttraining/orttraining/test/python/orttraining_test_aggressive_cpu_fallback.py
index d6fa503695b89..4cac5c1be8083 100644
--- a/orttraining/orttraining/test/python/orttraining_test_aggressive_cpu_fallback.py
+++ b/orttraining/orttraining/test/python/orttraining_test_aggressive_cpu_fallback.py
@@ -37,6 +37,7 @@ def foo(x: FLOAT[12], w: FLOAT[6, 2], dim0: INT64[1], dim1: INT64[1]):

         session_options = onnxruntime.SessionOptions()
         session_options.optimized_model_filepath = "cpu_fallback_test.onnx"
+        session_options.add_session_config_entry("session.aggressive_cpu_fallback", "1")
         # This call should trigger GetCpuPreferredNodes and then GetShapeRelatedNodes
         # when environment variable ORT_AGGRESSIVE_CPU_FALLBACK=1 is set.
         # As a result, no memcpy node should be observed in the optimized graph.

From c087069e683f906e624f5d800e1ef883571e914a Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Tue, 12 Mar 2024 17:28:47 -0700
Subject: [PATCH 07/10] Use InlinedHash* types
---
 .../core/framework/fallback_cpu_capability.cc | 51 +++++++------------
 1 file changed, 18 insertions(+), 33 deletions(-)

diff --git a/onnxruntime/core/framework/fallback_cpu_capability.cc b/onnxruntime/core/framework/fallback_cpu_capability.cc
index 33489ca44038b..21faffbfe5a00 100644
--- a/onnxruntime/core/framework/fallback_cpu_capability.cc
+++ b/onnxruntime/core/framework/fallback_cpu_capability.cc
@@ -8,7 +8,6 @@
 #include <cstdlib>
 #include <cstring>
 #include <queue>
-#include <unordered_map>

 #include "onnx/defs/data_type_utils.h"

@@ -43,17 +42,20 @@ static bool IsSmallInitializer(const onnxruntime::GraphViewer& graph, const Node
 }
 }  // namespace

-std::unordered_set<NodeIndex> GetShapeRelatedNodes(const onnxruntime::GraphViewer& viewer) {
+static InlinedHashSet<NodeIndex> GetShapeRelatedNodes(const onnxruntime::GraphViewer& viewer) {
   // Conceptually, this function traverses from shape-consuming nodes
   // to fall back all their upstream nodes to CPU. Consider a graph
   //
   //   shape = onnx::Concat(s0, s1)
   //   reshaped = onnx::Reshape(x, shape)
   //
   // The traversal should stop when
   //  1. hitting Shape or Size nodes, graph inputs, or graph initializers.
-  //  2. hitting nodes with some large inputs or outputs.
-  LOGS_DEFAULT(INFO) << "Call GetShapeRelatedNodes to identify extra CPU nodes." << std::endl;
+  //  2. [TODO] hitting nodes with some large inputs or outputs. Before that,
+  //     we need shape inference to determine the size of the inputs and outputs.
+  //     Some graph transforms add nodes without shape information, so
+  //     checking shapes would make the algorithm unstable for now.
+  LOGS_DEFAULT(VERBOSE) << "Call GetShapeRelatedNodes to identify extra CPU nodes." << std::endl;

-  std::unordered_map<std::string, std::unordered_map<int, std::vector<int>>> shape_related_inputs_in_nodes = {
+  const static InlinedHashMap<std::string, InlinedHashMap<int, std::vector<int>>> shape_related_inputs_in_nodes = {
       // 2nd input of Expand-13 is a shape-related input.
       {"Expand", {{13 /* since version */, {1} /* shape inputs' indices */}}},
       // 2nd input (indexed by 1) of Reshape-13, Reshape-14, Reshape-19, and Reshape-21 is a shape-related input.
       {"Reshape", {{13, {1}}, {14, {1}}, {19, {1}}, {21, {1}}}},
@@ -72,13 +74,13 @@ std::unordered_set<NodeIndex> GetShapeRelatedNodes(const onnxruntime::GraphViewe
   //   shape = onnx::Concat(s0, s1)
   //   reshaped = onnx::Reshape(x, shape)
   // Then, the shape-producing node is Concat.
-  std::unordered_set<const Node*> shape_producing_nodes;
+  InlinedHashSet<const Node*> shape_producing_nodes;
   // This loop collects all shape-producing nodes by finding
   // all nodes that produce tensors specified in shape_related_inputs_in_nodes.
   // E.g., for the above example, Concat is a shape-producing node because
   // "Reshape" has a shape-related input at index 1.
   for (auto& node : graph.Nodes()) {
-    LOGS_DEFAULT(INFO) << "Check if node " << node.Name() << " can be a sink of a shape sub-graph." << std::endl;
+    LOGS_DEFAULT(VERBOSE) << "Check if node " << node.Name() << " can be a sink of a shape sub-graph." << std::endl;
     auto op_type_it = shape_related_inputs_in_nodes.find(node.OpType());
     if (op_type_it == shape_related_inputs_in_nodes.end()) {
       // This node doesn't consume a tensor as a shape,
@@ -119,19 +121,19 @@ std::unordered_set<NodeIndex> GetShapeRelatedNodes(const onnxruntime::GraphViewe
         // After this for-loop, we will reversely traverse all nodes from every shape-producing node
         // found here until hitting Shape or Size nodes, graph inputs, or graph initializers.
         // All nodes on the traversal path will be forced to run on CPU.
-        LOGS_DEFAULT(INFO) << "Found a shape-producing node (i.e., a node that produces a tensor consumed as a shape-like input by other nodes): " << producer_node->Name() << std::endl;
+        LOGS_DEFAULT(VERBOSE) << "Found a shape-producing node (i.e., a node that produces a tensor consumed as a shape-like input by other nodes): " << producer_node->Name() << std::endl;
         shape_producing_nodes.insert(producer_node);
       }
     }
   }

-  std::unordered_set<NodeIndex> shape_related_node_indices;
+  InlinedHashSet<NodeIndex> shape_related_node_indices;
   for (auto& node : shape_producing_nodes) {
-    LOGS_DEFAULT(INFO) << "Begin the (topologically reversed) traversal from shape-producing node: " << node->Name() << std::endl;
+    LOGS_DEFAULT(VERBOSE) << "Begin the (topologically reversed) traversal from shape-producing node: " << node->Name() << std::endl;
     std::vector<const Node*> start_nodes = {node};

     auto to_stop = [](const Node* n1, const Node* n2) {
-      LOGS_DEFAULT(INFO) << "Skip the traversal from " << n1->Name() << " to " << n2->Name() << " since " << n2->Name() << " is a Shape or Size node." << std::endl;
+      LOGS_DEFAULT(VERBOSE) << "Skip the traversal from " << n1->Name() << " to " << n2->Name() << " since " << n2->Name() << " is a Shape or Size node." << std::endl;
       return n2->OpType() == "Shape" || n2->OpType() == "Size";
     };

@@ -141,7 +143,7 @@ std::unordered_set<NodeIndex> GetShapeRelatedNodes(const onnxruntime::GraphViewe
     graph.ReverseDFSFrom(
         start_nodes,
         [&shape_related_node_indices](const Node* n) {
-          LOGS_DEFAULT(INFO) << "Found an upstream node in the shape sub-graph (fall back to CPU): " << n->Name() << std::endl;
+          LOGS_DEFAULT(VERBOSE) << "Found an upstream node in the shape sub-graph (fall back to CPU): " << n->Name() << std::endl;
           shape_related_node_indices.insert(n->Index());
         },
         nullptr,
@@ -152,23 +154,6 @@ std::unordered_set<NodeIndex> GetShapeRelatedNodes(const onnxruntime::GraphViewe
   return shape_related_node_indices;
 }

-bool IsAggressiveCpuFallbackEnabled() {
-#if !defined(_WIN32) && ENABLE_TRAINING
-  // std::getenv is not available on every platform.
-  // Since ORT_AGGRESSIVE_CPU_FALLBACK is experimental,
-  // we only allow it in training builds to avoid build issues on
-  // custom platforms such as Xbox.
-  const char* p_env_var = std::getenv("ORT_AGGRESSIVE_CPU_FALLBACK");
-  if (!p_env_var) {
-    // No such environment variable.
-    return false;
-  }
-  return std::strcmp(p_env_var, "1") == 0;
-#else
-  return false;
-#endif
-}
-
 std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
                                                    const IExecutionProvider::IKernelLookup& kernel_lookup,
                                                    gsl::span<const NodeIndex> tentative_nodes,
                                                    const bool aggressive_cpu_fallback) {
@@ -214,7 +199,7 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
         auto consumer_nodes = graph.GetConsumerNodes(node_arg.Name());
         for (auto& consumer_node : consumer_nodes) {
           candidates.push(consumer_node->Index());
-          LOGS_DEFAULT(INFO) << "Candidate for fallback CPU execution: " << consumer_node->Name();
+          LOGS_DEFAULT(VERBOSE) << "Candidate for fallback CPU execution: " << consumer_node->Name();
         }
       }
       return Status::OK();
@@ -290,9 +275,9 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe

     if (place_in_cpu) {
       cpu_nodes.insert(cur);
-      LOGS_DEFAULT(INFO) << "ORT optimization- Force fallback to CPU execution for node: " << node->Name()
-                         << " because the CPU execution path is deemed faster than overhead involved with execution on other EPs "
-                         << " capable of executing this node";
+      LOGS_DEFAULT(VERBOSE) << "ORT optimization- Force fallback to CPU execution for node: " << node->Name()
+                            << " because the CPU execution path is deemed faster than overhead involved with execution on other EPs "
+                            << " capable of executing this node";
       for (auto* output : node->OutputDefs()) {
         cpu_output_args.insert(output);
       }

From 9f0ca0ffa777af385c6f7873302c3e8156fcfeb1 Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Tue, 12 Mar 2024 19:16:20 -0700
Subject: [PATCH 08/10] Fix warning

onnxruntime::SessionOptions is defined as a struct, so declaring the
shared-provider wrapper as a class triggers a class/struct tag-mismatch
warning on MSVC.
---
 .../core/providers/shared_library/provider_wrappedtypes.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
index a820cf57d9c6c..9711c24d2595c 100644
--- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
+++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
@@ -1075,7 +1075,7 @@ class SessionState {
   PROVIDER_DISALLOW_ALL(SessionState)
 };

-class SessionOptions {
+struct SessionOptions {
  public:
   const ConfigOptions& GetConfigOptions() const { return g_host->SessionOptions__GetConfigOptions(this); }

From 6db6a4d037cfaa321c79bdb6b96bf1ea787b41b7 Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Tue, 12 Mar 2024 19:18:38 -0700
Subject: [PATCH 09/10] Fix ROCm build

rocm_execution_provider.cc now references kOrtSessionOptionsAggressiveCpuFallback,
so include onnxruntime_session_options_config_keys.h there.
---
 onnxruntime/core/providers/rocm/rocm_execution_provider.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
index 67212f54a0492..73ec446b18ed2 100644
--- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
+++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
@@ -10,6 +10,7 @@
 #include "core/providers/rocm/rocm_fwd.h"
 #include "core/providers/rocm/gpu_data_transfer.h"
 #include "core/providers/rocm/rocm_profiler.h"
+#include "core/session/onnxruntime_session_options_config_keys.h"

 #ifndef DISABLE_CONTRIB_OPS
 #include "contrib_ops/rocm/rocm_contrib_kernels.h"

From 0f9de47fcf0fb6a53c9c82f021f5427c842a03c4 Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Tue, 12 Mar 2024 19:32:54 -0700
Subject: [PATCH 10/10] Exclude in minimal build
---
 .../session/onnxruntime_session_options_config_keys.h  |  2 ++
 onnxruntime/core/framework/fallback_cpu_capability.cc  | 10 ++++++++++
 onnxruntime/core/framework/fallback_cpu_capability.h   |  6 ++++++
 .../core/providers/cann/cann_execution_provider.cc     |  4 ++++
 .../core/providers/cuda/cuda_execution_provider.cc     |  4 ++++
 .../dml/DmlExecutionProvider/src/ExecutionProvider.cpp |  4 ++++
 onnxruntime/core/providers/js/js_execution_provider.cc |  5 +++++
 .../core/providers/rocm/rocm_execution_provider.cc     |  4 ++++
 .../core/providers/shared_library/provider_api.h       |  7 ++++++-
 .../shared_library/provider_bridge_provider.cc         |  9 ++++++++-
 .../providers/shared_library/provider_interfaces.h     |  5 ++++-
 onnxruntime/core/session/provider_bridge_ort.cc        |  9 ++++++++-
 12 files changed, 65 insertions(+), 4 deletions(-)

diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
index 1e02a02349df5..028c71fa35167 100644
--- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -300,4 +300,6 @@ static const char* const kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16 = "mlas
 // - "0": Disable reverse-traversing CPU fallback. [DEFAULT]
 // - "1": Enable reverse-traversing CPU fallback when calling GetCpuPreferredNodes(...).
 //        (i.e., adding nodes found by GetShapeRelatedNodes(...) to the CPU node list internally).
+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
 static const char* const kOrtSessionOptionsAggressiveCpuFallback = "session.aggressive_cpu_fallback";
+#endif
diff --git a/onnxruntime/core/framework/fallback_cpu_capability.cc b/onnxruntime/core/framework/fallback_cpu_capability.cc
index 21faffbfe5a00..dcee730be977f 100644
--- a/onnxruntime/core/framework/fallback_cpu_capability.cc
+++ b/onnxruntime/core/framework/fallback_cpu_capability.cc
@@ -42,6 +42,7 @@ static bool IsSmallInitializer(const onnxruntime::GraphViewer& graph, const Node
 }
 }  // namespace

+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
 static InlinedHashSet<NodeIndex> GetShapeRelatedNodes(const onnxruntime::GraphViewer& viewer) {
   // Conceptually, this function traverses from shape-consuming nodes
   // to fall back all their upstream nodes to CPU. Consider a graph
@@ -153,11 +154,18 @@ static InlinedHashSet<NodeIndex> GetShapeRelatedNodes(const onnxruntime::GraphVi

   return shape_related_node_indices;
 }
+#endif

+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
 std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
                                                    const IExecutionProvider::IKernelLookup& kernel_lookup,
                                                    gsl::span<const NodeIndex> tentative_nodes,
                                                    const bool aggressive_cpu_fallback) {
+#else
+std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
+                                                   const IExecutionProvider::IKernelLookup& kernel_lookup,
+                                                   gsl::span<const NodeIndex> tentative_nodes) {
+#endif
   // automatic conversion from const std::vector&
   const auto& ordered_nodes = graph.GetNodesInTopologicalOrder();
   InlinedVector<size_t> node_id_to_order_map(graph.MaxNodeIndex());
@@ -287,10 +295,12 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
     }
   }

+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
   if (aggressive_cpu_fallback) {
     auto shape_related_node_indices = GetShapeRelatedNodes(graph);
     cpu_nodes.insert(shape_related_node_indices.begin(), shape_related_node_indices.end());
   }
+#endif

   return cpu_nodes;
 }
diff --git a/onnxruntime/core/framework/fallback_cpu_capability.h b/onnxruntime/core/framework/fallback_cpu_capability.h
index 3e2f0d85f0306..c09fd4e7ea0db 100644
--- a/onnxruntime/core/framework/fallback_cpu_capability.h
+++ b/onnxruntime/core/framework/fallback_cpu_capability.h
@@ -18,9 +18,15 @@ namespace onnxruntime {
   @param tentative_nodes Nodes that are tentative to be placed on the target EP
   @param aggressive_cpu_fallback This is set by the kOrtSessionOptionsAggressiveCpuFallback option.
 */
+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
 std::unordered_set<NodeIndex> GetCpuPreferredNodes(const GraphViewer& graph,
                                                    const IExecutionProvider::IKernelLookup& kernel_lookup,
                                                    gsl::span<const NodeIndex> tentative_nodes,
                                                    const bool aggressive_cpu_fallback);
+#else
+std::unordered_set<NodeIndex> GetCpuPreferredNodes(const GraphViewer& graph,
+                                                   const IExecutionProvider::IKernelLookup& kernel_lookup,
+                                                   gsl::span<const NodeIndex> tentative_nodes);
+#endif

 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cann/cann_execution_provider.cc b/onnxruntime/core/providers/cann/cann_execution_provider.cc
index 4c0fe1cb20b99..0a6264be7d600 100644
--- a/onnxruntime/core/providers/cann/cann_execution_provider.cc
+++ b/onnxruntime/core/providers/cann/cann_execution_provider.cc
@@ -1296,6 +1296,7 @@ CANNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewe
       candidates.push_back(node.Index());
   }

+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
   auto p_session_options = GetSessionOptions();
   bool aggressive_cpu_fallback = false;
   if (p_session_options) {
@@ -1303,6 +1304,9 @@ CANNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewe
         kOrtSessionOptionsAggressiveCpuFallback, "0") == "1";
   }
   auto cpu_nodes = GetCpuPreferredNodes(graph_viewer, kernel_lookup, candidates, aggressive_cpu_fallback);
+#else
+  auto cpu_nodes = GetCpuPreferredNodes(graph_viewer, kernel_lookup, candidates);
+#endif
   for (auto& node_index : candidates) {
     if (cpu_nodes.count(node_index) > 0)
       continue;
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index 8ee21d705a765..0e1e8ae23aca7 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -2528,6 +2528,7 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
     }
   }

+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
   // For CUDA EP, exclude the subgraph that is preferred to be placed on CPU.
   // These are usually shape-related computation subgraphs.
   // The following logic can be extended to other EPs.
@@ -2538,6 +2539,9 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
                                   kOrtSessionOptionsAggressiveCpuFallback) == "1";
   }
   auto cpu_nodes = GetCpuPreferredNodes(graph, kernel_lookup, tentative_nodes, aggressive_cpu_fallback);
+#else
+  auto cpu_nodes = GetCpuPreferredNodes(graph, kernel_lookup, tentative_nodes);
+#endif
   std::vector<std::unique_ptr<ComputeCapability>> result;
   for (auto& node_index : candidates) {
     if (cpu_nodes.count(node_index) > 0)
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index 1953fd048ed3a..75f701eecba4c 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -886,6 +886,7 @@ namespace Dml
         }

         // Get the list of nodes that should stay on the CPU
+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
         auto p_session_options = GetSessionOptions();
         bool aggressive_cpu_fallback = false;
         if (p_session_options) {
@@ -893,6 +894,9 @@ namespace Dml
                 kOrtSessionOptionsAggressiveCpuFallback, "0") == "1";
         }
         auto cpuPreferredNodes = GetCpuPreferredNodes(graph, kernel_lookup, tentativeNodes, aggressive_cpu_fallback);
+#else
+        auto cpuPreferredNodes = GetCpuPreferredNodes(graph, kernel_lookup, tentativeNodes);
+#endif

         for (size_t nodeIndex : toplogicalOrder)
         {
diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc
index c97c5cb4d1871..b2424714c9fe8 100644
--- a/onnxruntime/core/providers/js/js_execution_provider.cc
+++ b/onnxruntime/core/providers/js/js_execution_provider.cc
@@ -728,6 +728,7 @@ std::vector<std::unique_ptr<ComputeCapability>> JsExecutionProvider::GetCapabili
     candidates.push_back(node.Index());
     tenative_candidates.push_back(node.Index());
   }
+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
   auto p_session_options = GetSessionOptions();
   bool aggressive_cpu_fallback = false;
   if (p_session_options) {
@@ -735,6 +736,10 @@ std::vector<std::unique_ptr<ComputeCapability>> JsExecutionProvider::GetCapabili
         kOrtSessionOptionsAggressiveCpuFallback, "0") == "1";
   }
   auto cpu_nodes = GetCpuPreferredNodes(graph, kernel_lookup, tenative_candidates, aggressive_cpu_fallback);
+#else
+  auto cpu_nodes = GetCpuPreferredNodes(graph, kernel_lookup, tenative_candidates);
+#endif
+
   std::vector<std::unique_ptr<ComputeCapability>> result;
   for (auto& node_index : candidates) {
     if (cpu_nodes.count(node_index) > 0) {
diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
index 73ec446b18ed2..db8bc1450b700 100644
--- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
+++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
@@ -2413,6 +2413,7 @@ ROCMExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
     }
   }

+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
   // For ROCM EP, exclude the subgraph that is preferred to be placed on CPU.
   // These are usually shape-related computation subgraphs.
   // The following logic can be extended to other EPs.
@@ -2423,6 +2424,9 @@ ROCMExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
         kOrtSessionOptionsAggressiveCpuFallback, "0") == "1";
   }
   auto cpu_nodes = GetCpuPreferredNodes(graph, kernel_lookup, candidates, aggressive_cpu_fallback);
+#else
+  auto cpu_nodes = GetCpuPreferredNodes(graph, kernel_lookup, candidates);
+#endif
   std::vector<std::unique_ptr<ComputeCapability>> result;
   for (auto& node_index : candidates) {
     if (cpu_nodes.count(node_index) > 0)
diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h
index ba3b72c5a7fe4..eff7e4dbd96b4 100644
--- a/onnxruntime/core/providers/shared_library/provider_api.h
+++ b/onnxruntime/core/providers/shared_library/provider_api.h
@@ -278,11 +278,16 @@ std::unique_ptr<IAllocator> CreateROCMPinnedAllocator(const char* name);

 std::unique_ptr<IDataTransfer> CreateGPUDataTransfer();

+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
 std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
                                                    const IExecutionProvider::IKernelLookup& kernel_lookup,
                                                    gsl::span<const NodeIndex> tentative_nodes,
                                                    const bool aggressive_cpu_fallback);
-
+#else
+std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
+                                                   const IExecutionProvider::IKernelLookup& kernel_lookup,
+                                                   gsl::span<const NodeIndex> tentative_nodes);
+#endif
 std::string GetEnvironmentVar(const std::string& var_name);

 namespace profiling {
diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
index 8c8e78a8426d1..6843de7a0f92f 100644
--- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
+++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
@@ -361,13 +361,20 @@ std::unique_ptr<IDataTransfer> CreateGPUDataTransfer() {

 std::string GetEnvironmentVar(const std::string& var_name) {
   return g_host->GetEnvironmentVar(var_name);
 }
-
+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
 std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
                                                    const IExecutionProvider::IKernelLookup& kernel_lookup,
                                                    gsl::span<const NodeIndex> tentative_nodes,
                                                    const bool aggressive_cpu_fallback) {
   return g_host->GetCpuPreferredNodes(graph, kernel_lookup, tentative_nodes, aggressive_cpu_fallback);
 }
+#else
+std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
+                                                   const IExecutionProvider::IKernelLookup& kernel_lookup,
+                                                   gsl::span<const NodeIndex> tentative_nodes) {
+  return g_host->GetCpuPreferredNodes(graph, kernel_lookup, tentative_nodes);
+}
+#endif

 namespace profiling {
diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h
index 73d1d81a4d6ad..5951f99fa8fa0 100644
--- a/onnxruntime/core/providers/shared_library/provider_interfaces.h
+++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h
@@ -189,11 +189,14 @@ struct ProviderHost {
   virtual Status RocmCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) = 0;
   virtual void RocmCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) = 0;
 #endif
-
   virtual std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
                                                              const IExecutionProvider::IKernelLookup& kernel_lookup,
+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
                                                              gsl::span<const NodeIndex> tentative_nodes,
                                                              const bool aggressive_cpu_fallback) = 0;
+#else
+                                                             gsl::span<const NodeIndex> tentative_nodes) = 0;
+#endif
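+  // Note: minimal and extended-minimal builds compile out the aggressive CPU
+  // fallback path, so only the three-argument overload is available there.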
   virtual Status UnpackTensor(const ONNX_NAMESPACE::TensorProto& tensor, const void* raw_data, size_t raw_data_len, /*out*/ bool* p_data, size_t expected_size) = 0;
   virtual Status UnpackTensor(const ONNX_NAMESPACE::TensorProto& tensor, const void* raw_data, size_t raw_data_len, /*out*/ float* p_data, size_t expected_size) = 0;
diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc
index b9a613b79b517..50af2394024b1 100644
--- a/onnxruntime/core/session/provider_bridge_ort.cc
+++ b/onnxruntime/core/session/provider_bridge_ort.cc
@@ -260,13 +260,20 @@ struct ProviderHostImpl : ProviderHost {
   std::string demangle(const char* name) override { return onnxruntime::profiling::demangle(name); }
   std::string demangle(const std::string& name) override { return onnxruntime::profiling::demangle(name); }
-
+#if !defined(ORT_MINIMAL_BUILD) && !defined(ORT_EXTENDED_MINIMAL_BUILD)
   std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
                                                      const IExecutionProvider::IKernelLookup& kernel_lookup,
                                                      gsl::span<const NodeIndex> tentative_nodes,
                                                      const bool aggressive_cpu_fallback) override {
     return onnxruntime::GetCpuPreferredNodes(graph, kernel_lookup, tentative_nodes, aggressive_cpu_fallback);
   }
+#else
+  std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewer& graph,
+                                                     const IExecutionProvider::IKernelLookup& kernel_lookup,
+                                                     gsl::span<const NodeIndex> tentative_nodes) override {
+    return onnxruntime::GetCpuPreferredNodes(graph, kernel_lookup, tentative_nodes);
+  }
+#endif
   Status UnpackTensor(const ONNX_NAMESPACE::TensorProto& tensor, const void* raw_data, size_t raw_data_len, /*out*/ bool* p_data, size_t expected_size) override { return utils::UnpackTensor(tensor, raw_data, raw_data_len, p_data, expected_size); }
   Status UnpackTensor(const ONNX_NAMESPACE::TensorProto& tensor, const void* raw_data, size_t raw_data_len, /*out*/ float* p_data, size_t expected_size) override { return utils::UnpackTensor(tensor, raw_data, raw_data_len, p_data, expected_size); }