From 9140d9b1ff2383ba4f2a7db3b1efa9719f9f2ee4 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Thu, 18 Jul 2024 14:26:26 -0700 Subject: [PATCH 01/15] Update azure-kusto-data and azure-kusto-ingest (#21409) A vulnerability has been found in the Kusto SDK. We need to update it to latest to address a security alert. --- .../github/windows/post_to_dashboard/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/ci_build/github/windows/post_to_dashboard/requirements.txt b/tools/ci_build/github/windows/post_to_dashboard/requirements.txt index b8c00a610b781..6ece3c1f92c4e 100644 --- a/tools/ci_build/github/windows/post_to_dashboard/requirements.txt +++ b/tools/ci_build/github/windows/post_to_dashboard/requirements.txt @@ -1,2 +1,2 @@ -azure-kusto-data[pandas]==3.0.1 -azure-kusto-ingest[pandas]==3.0.1 +azure-kusto-data[pandas]==4.5.1 +azure-kusto-ingest[pandas]==4.5.1 From cc4049af831034cd7d0b78e4b055b8994939e62a Mon Sep 17 00:00:00 2001 From: glen-amd <146770157+glen-amd@users.noreply.github.com> Date: Fri, 19 Jul 2024 08:34:03 -0700 Subject: [PATCH 02/15] Enabled more VitisAI backend compilers (#21411) ### Description Enabled more VitisAI backend compilers --- onnxruntime/core/providers/vitisai/imp/ep_context_utils.cc | 2 +- onnxruntime/core/providers/vitisai/include/ep_context_utils.h | 4 ++-- .../core/providers/vitisai/vitisai_execution_provider.cc | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/vitisai/imp/ep_context_utils.cc b/onnxruntime/core/providers/vitisai/imp/ep_context_utils.cc index ab31aa313cf6d..368c8c0358228 100644 --- a/onnxruntime/core/providers/vitisai/imp/ep_context_utils.cc +++ b/onnxruntime/core/providers/vitisai/imp/ep_context_utils.cc @@ -466,7 +466,7 @@ std::string RetrieveEPContextCache( fs::path ep_ctx_fs_path(ep_ctx_model_loc); // Attr "ep_cache_context" stores a relative path. ep_ctx_fs_path.replace_filename(fs::path(ep_ctx_cache)); - // TODO: Validaion of the file location to make sure security is met. + // TODO: Validation of the file location to make sure security is met. 
if (!fs::exists(ep_ctx_fs_path) || !fs::is_regular_file(ep_ctx_fs_path)) { ORT_THROW("File for EP context cache is missing"); } diff --git a/onnxruntime/core/providers/vitisai/include/ep_context_utils.h b/onnxruntime/core/providers/vitisai/include/ep_context_utils.h index 61a595cf1ae15..26546f422765c 100644 --- a/onnxruntime/core/providers/vitisai/include/ep_context_utils.h +++ b/onnxruntime/core/providers/vitisai/include/ep_context_utils.h @@ -14,8 +14,8 @@ namespace fs = std::filesystem; namespace onnxruntime { constexpr const uint8_t kXCCode = 1; -constexpr const uint8_t kDDCode = 2; -constexpr const uint8_t kVCode = 4; +[[maybe_unused]] constexpr const uint8_t kDDCode = 2; +[[maybe_unused]] constexpr const uint8_t kVCode = 4; static constexpr const char* kEPContextOp = "EPContext"; static constexpr const char* kMainContextAttr = "main_context"; diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc index f45b89649bfcb..036831df7a9cf 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc +++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc @@ -86,7 +86,7 @@ void VitisAIExecutionProvider::PrepareEPContextEnablement( model_path_str_ = ToPathString(GetTopLevelModelPath(graph_viewer).string()); } std::string backend_cache_dir, backend_cache_key; - get_backend_compilation_cache(model_path_str_, graph_viewer, info_, kXCCode, backend_cache_dir, backend_cache_key, backend_cache_data_); + get_backend_compilation_cache(model_path_str_, graph_viewer, info_, kXCCode | kDDCode | kVCode, backend_cache_dir, backend_cache_key, backend_cache_data_); info_["cacheDir"] = backend_cache_dir; info_["cacheKey"] = backend_cache_key; // Create a new model, reusing the graph name, the op-domain-to-opset-version map, From 22d4d82f3c55525510bef785fab6c7c83a21c2e9 Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Fri, 19 Jul 2024 08:36:47 -0700 Subject: [PATCH 03/15] Move ReluQuantFusion to Level2 for CPU EP only (#21329) ### Description Moves the `Relu -> QuantizeLinear` fusion to Level2 optimizations for CPU EP only. 
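For reference, Level2 corresponds to `ORT_ENABLE_EXTENDED` in the public API. A minimal sketch of opting into it from Python (the model path is hypothetical):

```python
import onnxruntime as ort

so = ort.SessionOptions()
# Level1 == ORT_ENABLE_BASIC, Level2 == ORT_ENABLE_EXTENDED, Level3 == ORT_ENABLE_ALL.
# At Level2 or higher, ReluQuantFusion may fold a Relu into the QuantizeLinear that
# follows it, when both nodes are assigned to the CPU EP.
so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED

sess = ort.InferenceSession("qdq_model.onnx", so, providers=["CPUExecutionProvider"])
```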
### Motivation and Context See the related PR for motivation and context: https://github.com/microsoft/onnxruntime/pull/20627 --- .../core/optimizer/graph_transformer_utils.cc | 2 +- .../qdq_transformer/relu_quantizelinear.cc | 4 +- .../test/optimizer/qdq_transformer_test.cc | 51 +++++++++++++++++++ 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc index 4298551aec412..e6feb3e7ddbe2 100644 --- a/onnxruntime/core/optimizer/graph_transformer_utils.cc +++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc @@ -132,12 +132,12 @@ InlinedVector> GenerateRewriteRules( rules.push_back(std::make_unique()); rules.push_back(std::make_unique()); rules.push_back(std::make_unique()); - rules.push_back(std::make_unique()); rules.push_back(std::make_unique()); break; case TransformerLevel::Level2: rules.push_back(std::make_unique()); + rules.push_back(std::make_unique()); rules.push_back(std::make_unique()); break; diff --git a/onnxruntime/core/optimizer/qdq_transformer/relu_quantizelinear.cc b/onnxruntime/core/optimizer/qdq_transformer/relu_quantizelinear.cc index 7417212c570c8..e756ffe78a289 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/relu_quantizelinear.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/relu_quantizelinear.cc @@ -13,13 +13,15 @@ namespace onnxruntime { bool ReluQuantFusion::SatisfyCondition(const Graph& graph, const Node& node, const logging::Logger& /*logger*/) const { if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "Relu", {6, 13, 14}) || + !graph_utils::IsSupportedProvider(node, {kCpuExecutionProvider}) || !optimizer_utils::CheckOutputEdges(graph, node, 1)) { return false; } // if Relu is followed by QuantizeLinear, it can be fused into QuantizeLinear potentially const auto& next_node = *node.OutputNodesBegin(); - if (!QDQ::MatchQNode(next_node)) { + if (!graph_utils::IsSupportedProvider(next_node, {kCpuExecutionProvider}) || + !QDQ::MatchQNode(next_node)) { return false; } diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc index 1c77121ba9df1..1638851daf65a 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc @@ -2763,6 +2763,57 @@ TEST(QDQTransformerTests, Clip) { } } +// Test that the ReluQuantFusion transformer only runs for optimization level >= 2. +TEST(QDQTransformerTests, ReluQuantFusion_Level2Only) { + auto test_case = [&](TransformerLevel opt_level, int8_t zp) { + auto build_test_case = [&](ModelTestBuilder& builder) { + auto* input_arg = builder.MakeInput({1, 2, 2, 2}, + {-4, -3, -2, 0, 1, 2, 3, 4}); + auto* output_arg = builder.MakeOutput(); + + // add DQ + auto* dq_output = builder.MakeIntermediate(); + builder.AddDequantizeLinearNode(input_arg, 1.0f, zp, dq_output); + + // add Relu + auto* relu_output = builder.MakeIntermediate(); + builder.AddNode("Relu", {dq_output}, {relu_output}); + + // add Q + DQ + auto* q_output = builder.MakeIntermediate(); + builder.AddQuantizeLinearNode(relu_output, 1.0f, zp, q_output); + builder.AddDequantizeLinearNode(q_output, 1.0f, zp, output_arg); + }; + + auto check_relu_graph = [&](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + const QDQOpKeys qdq_keys = GetQDQOpKeys(false); + // Only fuse relu into Q if level >= 2 and zero_point == -128 for int8. 
+ // Level1 graph: input -> DQ -> Relu -> Q -> DQ -> output + // Level2+ graph: input -> DQ -> output (QuantReluFusion + QDQFinalCleanupTransformer transformers) + const bool fuse_relu = (zp == -128) && + (opt_level == TransformerLevel::Level2 || opt_level == TransformerLevel::Level3); + EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], fuse_relu ? 0 : 1); + EXPECT_EQ(op_to_count["Relu"], fuse_relu ? 0 : 1); + EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], fuse_relu ? 1 : 2); + }; + + constexpr float epsilon = std::numeric_limits::epsilon(); + + TransformerTester(build_test_case, check_relu_graph, + TransformerLevel::Default, + opt_level, + 18, + epsilon, + epsilon); + }; + + test_case(TransformerLevel::Level1, -128); // Will not fuse Relu into QuantizeLinear due to level1 opt. + test_case(TransformerLevel::Level2, -128); // Will fuse Relu into QuantizeLinear. + test_case(TransformerLevel::Level3, -128); // Will fuse Relu into QuantizeLinear. + test_case(TransformerLevel::Level3, 0); // Will not fuse Relu into QuantizeLinear due to zero-point != -128 +} + TEST(QDQTransformerTests, Concat) { auto test_case = [&](const std::vector>& input_shapes, int64_t axis, From 01df8c787d872d5fffb3b48d70d9a0e3d6323e3c Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Fri, 19 Jul 2024 11:11:30 -0700 Subject: [PATCH 04/15] [js/web] fix vulnerable version of dependencies (#21412) ### Description ``` # npm audit report socket.io 3.0.0 - 4.6.2 Severity: high socket.io has an unhandled 'error' event - https://github.com/advisories/GHSA-25hc-qcg6-38wj Depends on vulnerable versions of engine.io fix available via `npm audit fix` node_modules/socket.io ws 8.0.0 - 8.17.0 Severity: high ws affected by a DoS when handling a request with many HTTP headers - https://github.com/advisories/GHSA-3h5v-q93c-6h6q fix available via `npm audit fix` node_modules/ws engine.io 0.7.8 - 0.7.9 || 6.0.0 - 6.5.4 Depends on vulnerable versions of ws node_modules/engine.io socket.io-adapter 2.5.2 - 2.5.4 Depends on vulnerable versions of ws node_modules/socket.io-adapter 4 high severity vulnerabilities ``` --- js/web/package-lock.json | 126 ++++++++++++++++++++------------------- 1 file changed, 65 insertions(+), 61 deletions(-) diff --git a/js/web/package-lock.json b/js/web/package-lock.json index b802a4e8271a7..3cfc0457c6234 100644 --- a/js/web/package-lock.json +++ b/js/web/package-lock.json @@ -194,9 +194,9 @@ } }, "node_modules/@socket.io/component-emitter": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/@socket.io/component-emitter/-/component-emitter-3.1.0.tgz", - "integrity": "sha512-+9jVqKhRSpsc591z5vX+X5Yyw+he/HCB4iQ/RYxw35CEPaY1gnsNE43nf9n9AaYjAQrTiI/mOwKUKdUs9vf7Xg==", + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/@socket.io/component-emitter/-/component-emitter-3.1.2.tgz", + "integrity": "sha512-9BCxFwvbGg/RsZK9tjXd8s4UcwR0MWeFQ1XEKIQVVvAGJyINdrqKMcTRyLoK8Rse1GjzLV9cwjWV1olXRWEXVA==", "dev": true }, "node_modules/@szmarczak/http-timer": { @@ -236,9 +236,9 @@ "dev": true }, "node_modules/@types/cors": { - "version": "2.8.13", - "resolved": "https://registry.npmjs.org/@types/cors/-/cors-2.8.13.tgz", - "integrity": "sha512-RG8AStHlUiV5ysZQKq97copd2UmVYw3/pRMLefISZ3S1hK104Cwm7iLQ3fTKx+lsUH2CE8FlLaYeEA2LSeqYUA==", + "version": "2.8.17", + "resolved": "https://registry.npmjs.org/@types/cors/-/cors-2.8.17.tgz", + "integrity": "sha512-8CGDvrBj1zgo2qE+oS3pOCyYNqCPryMWY2bGfwA0dcfopWGgxs+78df0Rs3rc9THP4JkOhLsAa+15VdpAqkcUA==", "dev": true, 
"dependencies": { "@types/node": "*" @@ -1086,9 +1086,9 @@ } }, "node_modules/engine.io": { - "version": "6.4.2", - "resolved": "https://registry.npmjs.org/engine.io/-/engine.io-6.4.2.tgz", - "integrity": "sha512-FKn/3oMiJjrOEOeUub2WCox6JhxBXq/Zn3fZOMCBxKnNYtsdKjxhl7yR3fZhM9PV+rdE75SU5SYMc+2PGzo+Tg==", + "version": "6.5.5", + "resolved": "https://registry.npmjs.org/engine.io/-/engine.io-6.5.5.tgz", + "integrity": "sha512-C5Pn8Wk+1vKBoHghJODM63yk8MvrO9EWZUfkAt5HAqIgPE4/8FF0PEGHXtEd40l223+cE5ABWuPzm38PHFXfMA==", "dev": true, "dependencies": { "@types/cookie": "^0.4.1", @@ -1099,17 +1099,17 @@ "cookie": "~0.4.1", "cors": "~2.8.5", "debug": "~4.3.1", - "engine.io-parser": "~5.0.3", - "ws": "~8.11.0" + "engine.io-parser": "~5.2.1", + "ws": "~8.17.1" }, "engines": { - "node": ">=10.0.0" + "node": ">=10.2.0" } }, "node_modules/engine.io-parser": { - "version": "5.0.6", - "resolved": "https://registry.npmjs.org/engine.io-parser/-/engine.io-parser-5.0.6.tgz", - "integrity": "sha512-tjuoZDMAdEhVnSFleYPCtdL2GXwVTGtNjoeJd9IhIG3C1xs9uwxqRNEu5WpnDZCaozwVlK/nuQhpodhXSIMaxw==", + "version": "5.2.3", + "resolved": "https://registry.npmjs.org/engine.io-parser/-/engine.io-parser-5.2.3.tgz", + "integrity": "sha512-HqD3yTBfnBxIrbnM1DoD6Pcq8NECnh8d4As1Qgh0z5Gg3jRRIqijury0CL3ghu/edArpUYiYqQiDUQBIs4np3Q==", "dev": true, "engines": { "node": ">=10.0.0" @@ -3020,35 +3020,37 @@ } }, "node_modules/socket.io": { - "version": "4.6.1", - "resolved": "https://registry.npmjs.org/socket.io/-/socket.io-4.6.1.tgz", - "integrity": "sha512-KMcaAi4l/8+xEjkRICl6ak8ySoxsYG+gG6/XfRCPJPQ/haCRIJBTL4wIl8YCsmtaBovcAXGLOShyVWQ/FG8GZA==", + "version": "4.7.5", + "resolved": "https://registry.npmjs.org/socket.io/-/socket.io-4.7.5.tgz", + "integrity": "sha512-DmeAkF6cwM9jSfmp6Dr/5/mfMwb5Z5qRrSXLpo3Fq5SqyU8CMF15jIN4ZhfSwu35ksM1qmHZDQ/DK5XTccSTvA==", "dev": true, "dependencies": { "accepts": "~1.3.4", "base64id": "~2.0.0", + "cors": "~2.8.5", "debug": "~4.3.2", - "engine.io": "~6.4.1", + "engine.io": "~6.5.2", "socket.io-adapter": "~2.5.2", - "socket.io-parser": "~4.2.1" + "socket.io-parser": "~4.2.4" }, "engines": { - "node": ">=10.0.0" + "node": ">=10.2.0" } }, "node_modules/socket.io-adapter": { - "version": "2.5.2", - "resolved": "https://registry.npmjs.org/socket.io-adapter/-/socket.io-adapter-2.5.2.tgz", - "integrity": "sha512-87C3LO/NOMc+eMcpcxUBebGjkpMDkNBS9tf7KJqcDsmL936EChtVva71Dw2q4tQcuVC+hAUy4an2NO/sYXmwRA==", + "version": "2.5.5", + "resolved": "https://registry.npmjs.org/socket.io-adapter/-/socket.io-adapter-2.5.5.tgz", + "integrity": "sha512-eLDQas5dzPgOWCk9GuuJC2lBqItuhKI4uxGgo9aIV7MYbk2h9Q6uULEh8WBzThoI7l+qU9Ast9fVUmkqPP9wYg==", "dev": true, "dependencies": { - "ws": "~8.11.0" + "debug": "~4.3.4", + "ws": "~8.17.1" } }, "node_modules/socket.io-parser": { - "version": "4.2.3", - "resolved": "https://registry.npmjs.org/socket.io-parser/-/socket.io-parser-4.2.3.tgz", - "integrity": "sha512-JMafRntWVO2DCJimKsRTh/wnqVvO4hrfwOqtO7f+uzwsQMuxO6VwImtYxaQ+ieoyshWOTJyV0fA21lccEXRPpQ==", + "version": "4.2.4", + "resolved": "https://registry.npmjs.org/socket.io-parser/-/socket.io-parser-4.2.4.tgz", + "integrity": "sha512-/GbIKmo8ioc+NIWIhwdecY0ge+qVBSMdgxGygevmdHj24bsfgtCmcUUcQ5ZzcylGFHsN3k4HB4Cgkl96KVnuew==", "dev": true, "dependencies": { "@socket.io/component-emitter": "~3.1.0", @@ -3449,16 +3451,16 @@ "dev": true }, "node_modules/ws": { - "version": "8.11.0", - "resolved": "https://registry.npmjs.org/ws/-/ws-8.11.0.tgz", - "integrity": "sha512-HPG3wQd9sNQoT9xHyNCXoDUa+Xw/VevmY9FoHyQ+g+rrMn4j6FB4np7Z0OhdTgjx6MgQLK7jwSy1YecU1+4Asg==", 
+ "version": "8.17.1", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.17.1.tgz", + "integrity": "sha512-6XQFvXTkbfUOZOKKILFG1PDK2NDQs4azKQl26T0YS5CxqWLgXajbPZ+h4gZekJyRqFU8pvnbAbbs/3TgRPy+GQ==", "dev": true, "engines": { "node": ">=10.0.0" }, "peerDependencies": { "bufferutil": "^4.0.1", - "utf-8-validate": "^5.0.2" + "utf-8-validate": ">=5.0.2" }, "peerDependenciesMeta": { "bufferutil": { @@ -3648,9 +3650,9 @@ "dev": true }, "@socket.io/component-emitter": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/@socket.io/component-emitter/-/component-emitter-3.1.0.tgz", - "integrity": "sha512-+9jVqKhRSpsc591z5vX+X5Yyw+he/HCB4iQ/RYxw35CEPaY1gnsNE43nf9n9AaYjAQrTiI/mOwKUKdUs9vf7Xg==", + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/@socket.io/component-emitter/-/component-emitter-3.1.2.tgz", + "integrity": "sha512-9BCxFwvbGg/RsZK9tjXd8s4UcwR0MWeFQ1XEKIQVVvAGJyINdrqKMcTRyLoK8Rse1GjzLV9cwjWV1olXRWEXVA==", "dev": true }, "@szmarczak/http-timer": { @@ -3687,9 +3689,9 @@ "dev": true }, "@types/cors": { - "version": "2.8.13", - "resolved": "https://registry.npmjs.org/@types/cors/-/cors-2.8.13.tgz", - "integrity": "sha512-RG8AStHlUiV5ysZQKq97copd2UmVYw3/pRMLefISZ3S1hK104Cwm7iLQ3fTKx+lsUH2CE8FlLaYeEA2LSeqYUA==", + "version": "2.8.17", + "resolved": "https://registry.npmjs.org/@types/cors/-/cors-2.8.17.tgz", + "integrity": "sha512-8CGDvrBj1zgo2qE+oS3pOCyYNqCPryMWY2bGfwA0dcfopWGgxs+78df0Rs3rc9THP4JkOhLsAa+15VdpAqkcUA==", "dev": true, "requires": { "@types/node": "*" @@ -4379,9 +4381,9 @@ } }, "engine.io": { - "version": "6.4.2", - "resolved": "https://registry.npmjs.org/engine.io/-/engine.io-6.4.2.tgz", - "integrity": "sha512-FKn/3oMiJjrOEOeUub2WCox6JhxBXq/Zn3fZOMCBxKnNYtsdKjxhl7yR3fZhM9PV+rdE75SU5SYMc+2PGzo+Tg==", + "version": "6.5.5", + "resolved": "https://registry.npmjs.org/engine.io/-/engine.io-6.5.5.tgz", + "integrity": "sha512-C5Pn8Wk+1vKBoHghJODM63yk8MvrO9EWZUfkAt5HAqIgPE4/8FF0PEGHXtEd40l223+cE5ABWuPzm38PHFXfMA==", "dev": true, "requires": { "@types/cookie": "^0.4.1", @@ -4392,14 +4394,14 @@ "cookie": "~0.4.1", "cors": "~2.8.5", "debug": "~4.3.1", - "engine.io-parser": "~5.0.3", - "ws": "~8.11.0" + "engine.io-parser": "~5.2.1", + "ws": "~8.17.1" } }, "engine.io-parser": { - "version": "5.0.6", - "resolved": "https://registry.npmjs.org/engine.io-parser/-/engine.io-parser-5.0.6.tgz", - "integrity": "sha512-tjuoZDMAdEhVnSFleYPCtdL2GXwVTGtNjoeJd9IhIG3C1xs9uwxqRNEu5WpnDZCaozwVlK/nuQhpodhXSIMaxw==", + "version": "5.2.3", + "resolved": "https://registry.npmjs.org/engine.io-parser/-/engine.io-parser-5.2.3.tgz", + "integrity": "sha512-HqD3yTBfnBxIrbnM1DoD6Pcq8NECnh8d4As1Qgh0z5Gg3jRRIqijury0CL3ghu/edArpUYiYqQiDUQBIs4np3Q==", "dev": true }, "ent": { @@ -5862,32 +5864,34 @@ "dev": true }, "socket.io": { - "version": "4.6.1", - "resolved": "https://registry.npmjs.org/socket.io/-/socket.io-4.6.1.tgz", - "integrity": "sha512-KMcaAi4l/8+xEjkRICl6ak8ySoxsYG+gG6/XfRCPJPQ/haCRIJBTL4wIl8YCsmtaBovcAXGLOShyVWQ/FG8GZA==", + "version": "4.7.5", + "resolved": "https://registry.npmjs.org/socket.io/-/socket.io-4.7.5.tgz", + "integrity": "sha512-DmeAkF6cwM9jSfmp6Dr/5/mfMwb5Z5qRrSXLpo3Fq5SqyU8CMF15jIN4ZhfSwu35ksM1qmHZDQ/DK5XTccSTvA==", "dev": true, "requires": { "accepts": "~1.3.4", "base64id": "~2.0.0", + "cors": "~2.8.5", "debug": "~4.3.2", - "engine.io": "~6.4.1", + "engine.io": "~6.5.2", "socket.io-adapter": "~2.5.2", - "socket.io-parser": "~4.2.1" + "socket.io-parser": "~4.2.4" } }, "socket.io-adapter": { - "version": "2.5.2", - "resolved": 
"https://registry.npmjs.org/socket.io-adapter/-/socket.io-adapter-2.5.2.tgz", - "integrity": "sha512-87C3LO/NOMc+eMcpcxUBebGjkpMDkNBS9tf7KJqcDsmL936EChtVva71Dw2q4tQcuVC+hAUy4an2NO/sYXmwRA==", + "version": "2.5.5", + "resolved": "https://registry.npmjs.org/socket.io-adapter/-/socket.io-adapter-2.5.5.tgz", + "integrity": "sha512-eLDQas5dzPgOWCk9GuuJC2lBqItuhKI4uxGgo9aIV7MYbk2h9Q6uULEh8WBzThoI7l+qU9Ast9fVUmkqPP9wYg==", "dev": true, "requires": { - "ws": "~8.11.0" + "debug": "~4.3.4", + "ws": "~8.17.1" } }, "socket.io-parser": { - "version": "4.2.3", - "resolved": "https://registry.npmjs.org/socket.io-parser/-/socket.io-parser-4.2.3.tgz", - "integrity": "sha512-JMafRntWVO2DCJimKsRTh/wnqVvO4hrfwOqtO7f+uzwsQMuxO6VwImtYxaQ+ieoyshWOTJyV0fA21lccEXRPpQ==", + "version": "4.2.4", + "resolved": "https://registry.npmjs.org/socket.io-parser/-/socket.io-parser-4.2.4.tgz", + "integrity": "sha512-/GbIKmo8ioc+NIWIhwdecY0ge+qVBSMdgxGygevmdHj24bsfgtCmcUUcQ5ZzcylGFHsN3k4HB4Cgkl96KVnuew==", "dev": true, "requires": { "@socket.io/component-emitter": "~3.1.0", @@ -6179,9 +6183,9 @@ "dev": true }, "ws": { - "version": "8.11.0", - "resolved": "https://registry.npmjs.org/ws/-/ws-8.11.0.tgz", - "integrity": "sha512-HPG3wQd9sNQoT9xHyNCXoDUa+Xw/VevmY9FoHyQ+g+rrMn4j6FB4np7Z0OhdTgjx6MgQLK7jwSy1YecU1+4Asg==", + "version": "8.17.1", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.17.1.tgz", + "integrity": "sha512-6XQFvXTkbfUOZOKKILFG1PDK2NDQs4azKQl26T0YS5CxqWLgXajbPZ+h4gZekJyRqFU8pvnbAbbs/3TgRPy+GQ==", "dev": true, "requires": {} }, From 6ffaaebb60cd43cf7749e67a9bb54c3bd2cc4efd Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Fri, 19 Jul 2024 13:58:54 -0700 Subject: [PATCH 05/15] [CUDA] Attention kernel provider option (#21344) ### Description * Add a cuda provider option `sdpa_kernel` to choose which attention kernel to run for testing purpose. * Allow dump which attention kernel is used per node. * Reserve a flag for cudnn flash attention which will be added soon. #### CUDA provider option sdpa_kernel Instead of setting environment variable, we also support setting it in provider option. Note that the setting is global per session. That could help performance testing of each kernel. #### Attention Kernel Debug Info Set an environment variable `ORT_ENABLE_ATTENTION_KERNEL_DEBUG_INFO=1`, and ORT will print sdpa kernel used in each node: For example ``` ORT_ENABLE_ATTENTION_KERNEL_DEBUG_INFO=1 ./onnxruntime_test_all --gtest_filter=MultiHeadAttentionTest* ``` It will show debug information of kernel used in testing: ``` [ RUN ] MultiHeadAttentionTest.SelfAttention_Batch2_HeadSize32_NoBias_NoMask_PackedQKV AttentionKernelOptions: FLASH_ATTENTION=0 EFFICIENT_ATTENTION=0 TRT_FUSED_ATTENTION=1 CUDNN_FLASH_ATTENTION=0 TRT_FLASH_ATTENTION=1 TRT_CROSS_ATTENTION=0 TRT_CAUSAL_ATTENTION=0 MATH=1 Operator=MultiHeadAttention Node=node1 DataType=fp16 TRT_FUSED_ATTENTION=1 AttentionKernelOptions: FLASH_ATTENTION=0 EFFICIENT_ATTENTION=1 TRT_FUSED_ATTENTION=0 CUDNN_FLASH_ATTENTION=0 TRT_FLASH_ATTENTION=0 TRT_CROSS_ATTENTION=0 TRT_CAUSAL_ATTENTION=0 MATH=1 Operator=MultiHeadAttention Node=node1 DataType=fp16 EFFICIENT_ATTENTION=1 ``` In this test case, the debug info shows that one session uses trt fused attention and another session use efficient attention. 
--- cmake/onnxruntime_rocm_hipify.cmake | 2 + cmake/onnxruntime_unittests.cmake | 3 +- .../providers/cuda/cuda_provider_options.h | 1 + .../contrib_ops/cpu/bert/attention_common.h | 28 ++- .../contrib_ops/cuda/bert/attention.cc | 53 ++--- onnxruntime/contrib_ops/cuda/bert/attention.h | 4 +- .../cuda/bert/attention_kernel_options.cc | 166 +++++++++++++ .../cuda/bert/attention_kernel_options.h | 67 ++++++ .../cuda/bert/group_query_attention.cc | 30 +-- .../cuda/bert/group_query_attention.h | 2 + .../cuda/bert/multihead_attention.cc | 50 ++-- .../cuda/bert/multihead_attention.h | 3 +- .../contrib_ops/cuda/bert/packed_attention.cc | 33 ++- .../contrib_ops/cuda/bert/packed_attention.h | 9 +- .../cuda/bert/packed_multihead_attention.cc | 40 ++-- .../cuda/bert/packed_multihead_attention.h | 4 +- .../providers/cuda/cuda_execution_provider.h | 17 ++ .../cuda/cuda_execution_provider_info.cc | 4 + .../cuda/cuda_execution_provider_info.h | 4 + onnxruntime/core/providers/cuda/cuda_kernel.h | 6 + .../providers/cuda/cuda_provider_factory.cc | 2 + .../multihead_attention_op_test.cc | 4 +- .../attention_kernel_options_test.cc | 221 ++++++++++++++++++ .../test/python/onnxruntime_test_python.py | 2 + 24 files changed, 645 insertions(+), 110 deletions(-) create mode 100644 onnxruntime/contrib_ops/cuda/bert/attention_kernel_options.cc create mode 100644 onnxruntime/contrib_ops/cuda/bert/attention_kernel_options.h create mode 100644 onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc diff --git a/cmake/onnxruntime_rocm_hipify.cmake b/cmake/onnxruntime_rocm_hipify.cmake index 2966a4624a966..a8c876d30873e 100644 --- a/cmake/onnxruntime_rocm_hipify.cmake +++ b/cmake/onnxruntime_rocm_hipify.cmake @@ -15,6 +15,8 @@ set(contrib_ops_excluded_files "bert/attention_softmax.h" "bert/attention_softmax.cu" "bert/attention_prepare_qkv.cu" + "bert/attention_kernel_options.h" + "bert/attention_kernel_options.cc" "bert/decoder_attention_impl.h" "bert/decoder_attention_impl.cu" "bert/decoder_masked_multihead_attention.h" diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 0159c35d1941b..38ed0b1640192 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -786,8 +786,9 @@ if (onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS) onnxruntime_add_shared_library_module(onnxruntime_providers_cuda_ut ${onnxruntime_test_providers_cuda_ut_src} $) config_cuda_provider_shared_module(onnxruntime_providers_cuda_ut) onnxruntime_add_include_to_target(onnxruntime_providers_cuda_ut GTest::gtest GTest::gmock) + add_dependencies(onnxruntime_providers_cuda_ut onnxruntime_test_utils onnxruntime_common) target_include_directories(onnxruntime_providers_cuda_ut PRIVATE ${ONNXRUNTIME_ROOT}/core/mickey) - target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common) + target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_test_utils onnxruntime_common) if (MSVC) # Cutlass code has an issue with the following: # warning C4100: 'magic': unreferenced formal parameter diff --git a/include/onnxruntime/core/providers/cuda/cuda_provider_options.h b/include/onnxruntime/core/providers/cuda/cuda_provider_options.h index 6d53760ab60b5..01a14de699dc4 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_provider_options.h +++ b/include/onnxruntime/core/providers/cuda/cuda_provider_options.h @@ -38,4 +38,5 @@ struct OrtCUDAProviderOptionsV2 { 
int prefer_nhwc = 0; // make the CUDA EP NHWC preferred int use_ep_level_unified_stream = 0; // flag specifying if ep level stream is used or not int use_tf32 = 1; // use TF32 + int sdpa_kernel = 0; // Scaled Dot Product Attention kernel option }; diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_common.h b/onnxruntime/contrib_ops/cpu/bert/attention_common.h index a5b9c84c63eb9..55292b35e1e38 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention_common.h +++ b/onnxruntime/contrib_ops/cpu/bert/attention_common.h @@ -147,6 +147,23 @@ constexpr const char* kDisableSparseAttentionV1 = "ORT_DISABLE_SPARSE_ATTENTION_ } // namespace sparse_attention namespace attention { + +enum class AttentionBackend : int { + FLASH_ATTENTION = 1, + EFFICIENT_ATTENTION = 2, + TRT_FUSED_ATTENTION = 4, + CUDNN_FLASH_ATTENTION = 8, // reserved for cuDNN flash attention. + MATH = 16, // unfused kernel cannot be disabled right now. + + // The following kernels might be deprecated in the future. + TRT_FLASH_ATTENTION = 32, + TRT_CROSS_ATTENTION = 64, + TRT_CAUSAL_ATTENTION = 128, +}; + +// Environment variable to enable debug information of attention kernel to be printed. Default is 0 (disabled). +constexpr const char* kEnableAttentionKernelDebugInfo = "ORT_ENABLE_ATTENTION_KERNEL_DEBUG_INFO"; + // Environment variable to enable or disable TRT fused self attention kernel. Default is 0 (enabled). constexpr const char* kDisableFusedSelfAttention = "ORT_DISABLE_FUSED_ATTENTION"; @@ -157,6 +174,9 @@ constexpr const char* kDisableFusedCrossAttention = "ORT_DISABLE_FUSED_CROSS_ATT // Note that those causal attention kernels use fp16 accumulation. There is potential accuracy drop using those kernels. constexpr const char* kEnableFusedCausalAttention = "ORT_ENABLE_FUSED_CAUSAL_ATTENTION"; +// Environment variable to enable or disable cuDNN flash attention. +constexpr const char* kEnableCudnnFlashAttention = "ORT_ENABLE_CUDNN_FLASH_ATTENTION"; + // Environment variable to enable or disable TRT flash attention. This applies to both self and causal attention. Default is 0 (enabled). constexpr const char* kDisableTrtFlashAttention = "ORT_DISABLE_TRT_FLASH_ATTENTION"; @@ -166,11 +186,15 @@ constexpr const char* kDisableMemoryEfficientAttention = "ORT_DISABLE_MEMORY_EFF // Environment variable to enable or disable flash attention. Default is 0 (enabled). constexpr const char* kDisableFlashAttention = "ORT_DISABLE_FLASH_ATTENTION"; -// Minimum sequence length to enable memory efficient attention in FP32. -constexpr int kMinSeqLenForMemoryEfficientAttentionFp32 = 256; +// Minimum sequence length to perfer memory efficient attention when data type is float32 +constexpr const char* kMinSeqLenForEfficientAttentionFp32 = "ORT_MIN_SEQ_LEN_EFFICIENT_ATTENTION_FP32"; + +// Default value for minimum sequence length to enable memory efficient attention in FP32. +constexpr int kDefaultMinSeqLenForEfficientAttentionFp32 = 256; // Minimum sequence length to prefer flash attention when input format is packed QKV for MultiHeadAttention constexpr const char* kMinSeqLenForFlashAttentionPackedQKV = "ORT_MIN_SEQ_LEN_FLASH_ATTENTION_PACKED_QKV"; + // Default value for the above setting. 
constexpr int kDefaultMinSeqLenForFlashAttentionPackedQKV = 513; diff --git a/onnxruntime/contrib_ops/cuda/bert/attention.cc b/onnxruntime/contrib_ops/cuda/bert/attention.cc index d9907f09121d0..cacd65313ebcc 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/attention.cc @@ -3,7 +3,6 @@ #include "core/providers/cuda/cuda_common.h" #include "core/providers/cuda/shared_inc/fpgeneric.h" -#include "core/platform/env_var_utils.h" #include "contrib_ops/cuda/bert/attention_impl.h" #include "contrib_ops/cuda/bert/attention.h" #include "contrib_ops/cuda/bert/bert_padding.h" @@ -40,36 +39,17 @@ REGISTER_KERNEL_TYPED(MLFloat16) template Attention::Attention(const OpKernelInfo& info) : CudaKernel(info), AttentionBase(info, false) { - disable_fused_self_attention_ = - sizeof(T) != 2 || - ParseEnvironmentVariableWithDefault(attention::kDisableFusedSelfAttention, false); + kernel_options_ = this->GetAttentionKernelOptions(); - enable_trt_flash_attention_ = - sizeof(T) == 2 && - !ParseEnvironmentVariableWithDefault(attention::kDisableTrtFlashAttention, false); + disable_fused_self_attention_ = sizeof(T) != 2 || !kernel_options_->UseTrtFusedAttention(); - enable_fused_causal_attention_ = - sizeof(T) == 2 && - ParseEnvironmentVariableWithDefault(attention::kEnableFusedCausalAttention, false); + enable_trt_flash_attention_ = sizeof(T) == 2 && kernel_options_->UseTrtFlashAttention(); -#if USE_MEMORY_EFFICIENT_ATTENTION - disable_memory_efficient_attention_ = - ParseEnvironmentVariableWithDefault(attention::kDisableMemoryEfficientAttention, false); -#else - disable_memory_efficient_attention_ = true; -#endif + enable_fused_causal_attention_ = sizeof(T) == 2 && kernel_options_->UseTrtCausalAttention(); -#if USE_FLASH_ATTENTION - disable_flash_attention_ = - sizeof(T) != 2 || - onnxruntime::ParseEnvironmentVariableWithDefault(attention::kDisableFlashAttention, false); - min_seq_len_for_flash_attention_packed_qkv_ = ParseEnvironmentVariableWithDefault( - attention::kMinSeqLenForFlashAttentionPackedQKV, - attention::kDefaultMinSeqLenForFlashAttentionPackedQKV); -#else - disable_flash_attention_ = true; - min_seq_len_for_flash_attention_packed_qkv_ = 0; -#endif + disable_memory_efficient_attention_ = !kernel_options_->UseEfficientAttention(); + + disable_flash_attention_ = sizeof(T) != 2 || !kernel_options_->UseFlashAttention(); } template @@ -134,7 +114,7 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { parameters.num_heads, parameters.num_heads); // When input is packed QKV format, TensorRT kernel might be faster when sequence length <= 512. 
- if (use_flash_attention && parameters.sequence_length < min_seq_len_for_flash_attention_packed_qkv_) { + if (use_flash_attention && parameters.sequence_length < kernel_options_->MinSeqLenForFlashAttentionPackedQkv()) { use_flash_attention = false; } // Allocate buffers @@ -220,7 +200,7 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { nullptr == past && nullptr == present && (nullptr == mask_index || parameters.mask_type == AttentionMaskType::MASK_1D_KEY_SEQ_LEN_START) && - (sizeof(T) == 2 || parameters.sequence_length >= attention::kMinSeqLenForMemoryEfficientAttentionFp32) && + (sizeof(T) == 2 || parameters.sequence_length >= this->kernel_options_->MinSeqLenForEfficientAttentionFp32()) && has_memory_efficient_attention(sm, sizeof(T) == 2, parameters.head_size, parameters.v_head_size); if (use_memory_efficient_attention) { @@ -231,6 +211,20 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { constexpr bool use_memory_efficient_attention = false; #endif + if (kernel_options_->AllowDebugInfo()) { + AttentionKernelDebugInfo debug_info; + debug_info.use_flash_attention = use_flash_attention; + debug_info.use_efficient_attention = use_memory_efficient_attention; + if (fused_runner != nullptr) { + debug_info.SetTrtFusedKernel(is_unidirectional_, enable_trt_flash_attention_, sequence_length); + } + + debug_info.Print("Attention", + this->Node().Name(), + std::is_same::value, + std::is_same::value); + } + cublasHandle_t cublas = GetCublasHandle(context); typedef typename ToCudaType::MappedType CudaT; @@ -268,7 +262,6 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { use_fused_cross_attention, use_memory_efficient_attention); IAllocatorUniquePtr work_space = IAllocator::MakeUniquePtr(allocator, workSpaceSize, false, context->GetComputeStream()); - ; typedef typename ToCudaType::MappedType CudaT; AttentionData data; diff --git a/onnxruntime/contrib_ops/cuda/bert/attention.h b/onnxruntime/contrib_ops/cuda/bert/attention.h index acafb379d713f..0c7d3621f95ef 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention.h +++ b/onnxruntime/contrib_ops/cuda/bert/attention.h @@ -8,6 +8,7 @@ #include "core/providers/cuda/cuda_kernel.h" #include "contrib_ops/cpu/bert/attention_base.h" #include "contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.h" +#include "contrib_ops/cuda/bert/attention_kernel_options.h" namespace onnxruntime { namespace contrib { @@ -27,9 +28,10 @@ class Attention final : public CudaKernel, public AttentionBase { bool enable_trt_flash_attention_; bool enable_fused_causal_attention_; bool disable_memory_efficient_attention_; - int min_seq_len_for_flash_attention_packed_qkv_; mutable std::unique_ptr fused_fp16_runner_; mutable std::once_flag fused_fp16_runner_created_; + + const AttentionKernelOptions* kernel_options_; }; } // namespace cuda diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_kernel_options.cc b/onnxruntime/contrib_ops/cuda/bert/attention_kernel_options.cc new file mode 100644 index 0000000000000..28a095e68131e --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/attention_kernel_options.cc @@ -0,0 +1,166 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "contrib_ops/cuda/bert/attention_kernel_options.h" +#include +#include +#include +#include "contrib_ops/cpu/bert/attention_common.h" +#include "core/providers/shared_library/provider_api.h" +#include "core/platform/env_var_utils.h" +#include "contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.h" + +using namespace onnxruntime::contrib::attention; + +namespace onnxruntime { +void AttentionKernelOptions::Initialize(int value, bool use_build_flag) { + if (value > 0) { + use_flash_attention_ = (value & static_cast(AttentionBackend::FLASH_ATTENTION)) > 0; + use_efficient_attention_ = (value & static_cast(AttentionBackend::EFFICIENT_ATTENTION)) > 0; + use_trt_fused_attention_ = (value & static_cast(AttentionBackend::TRT_FUSED_ATTENTION)) > 0; + use_cudnn_flash_attention_ = (value & static_cast(AttentionBackend::CUDNN_FLASH_ATTENTION)) > 0; + use_unfused_ = (value & static_cast(AttentionBackend::MATH)) > 0; + use_trt_flash_attention_ = (value & static_cast(AttentionBackend::TRT_FLASH_ATTENTION)) > 0; + use_trt_cross_attention_ = (value & static_cast(AttentionBackend::TRT_CROSS_ATTENTION)) > 0; + use_trt_causal_attention_ = (value & static_cast(AttentionBackend::TRT_CAUSAL_ATTENTION)) > 0; + } else { + use_flash_attention_ = !ParseEnvironmentVariableWithDefault(kDisableFlashAttention, false); + use_efficient_attention_ = !ParseEnvironmentVariableWithDefault(kDisableMemoryEfficientAttention, false); + use_trt_fused_attention_ = !ParseEnvironmentVariableWithDefault(kDisableFusedSelfAttention, false); + use_cudnn_flash_attention_ = ParseEnvironmentVariableWithDefault(kEnableCudnnFlashAttention, false); + use_unfused_ = true; + use_trt_flash_attention_ = !ParseEnvironmentVariableWithDefault(kDisableTrtFlashAttention, false); + use_trt_cross_attention_ = !ParseEnvironmentVariableWithDefault(kDisableFusedCrossAttention, false); + use_trt_causal_attention_ = ParseEnvironmentVariableWithDefault(kEnableFusedCausalAttention, false); + } + + enable_kernel_debug_info_ = ParseEnvironmentVariableWithDefault(kEnableAttentionKernelDebugInfo, false); + + // When value is positive, we use 0 as default minimum sequence lengths to align with common usage in testing. + min_seq_len_for_flash_attention_packed_qkv_ = ParseEnvironmentVariableWithDefault( + kMinSeqLenForFlashAttentionPackedQKV, + value > 0 ? 0 : kDefaultMinSeqLenForFlashAttentionPackedQKV); + + min_seq_len_for_efficient_attention_fp32_ = ParseEnvironmentVariableWithDefault( + kMinSeqLenForEfficientAttentionFp32, + value > 0 ? 0 : kDefaultMinSeqLenForEfficientAttentionFp32); + + if (use_build_flag) { + // Some kernels can be disabled at build time. If they are disabled, we should not use them. 
+#ifndef USE_FLASH_ATTENTION + use_flash_attention_ = false; +#endif + +#ifndef USE_MEMORY_EFFICIENT_ATTENTION + use_efficient_attention_ = false; +#endif + } +} + +void AttentionKernelOptions::InitializeOnce( + int sdpa_kernel, bool use_build_flag) { + std::call_once(this->initialize_once_flag_, [&]() { + this->Initialize(sdpa_kernel, use_build_flag); + if (this->enable_kernel_debug_info_) { + this->Print(); + } + }); +} + +void AttentionKernelOptions::Print() const { + std::stringstream sstream; + sstream << "AttentionKernelOptions:"; + sstream << " FLASH_ATTENTION=" << int(use_flash_attention_); + sstream << " EFFICIENT_ATTENTION=" << int(use_efficient_attention_); + sstream << " TRT_FUSED_ATTENTION=" << int(use_trt_fused_attention_); + sstream << " CUDNN_FLASH_ATTENTION=" << int(use_cudnn_flash_attention_); + sstream << " TRT_FLASH_ATTENTION=" << int(use_trt_flash_attention_); + sstream << " TRT_CROSS_ATTENTION=" << int(use_trt_cross_attention_); + sstream << " TRT_CAUSAL_ATTENTION=" << int(use_trt_causal_attention_); + sstream << " MATH=" << int(use_unfused_); + + if (!use_unfused_) { + sstream << std::endl + << "Warning: Unfused kernel cannot be disabled right now. MATH=0 is ignored."; + } + + // Output text in Cyan color to make it easier to spot + std::cout << "\x1B[36m" << sstream.str() << "\x1B[0m" << std::endl; +} + +// Classify the kernel used in TRT fused runner. +void AttentionKernelDebugInfo::SetTrtFusedKernel(bool causal, bool enable_trt_flash_attention, int sequence_length) { + if (causal) { + use_trt_causal_attention = true; + } else if (enable_trt_flash_attention && sequence_length >= contrib::cuda::kMinSequenceLengthFlashAttention) { + use_trt_flash_attention = true; + } else { + use_trt_fused_attention = true; + } +} + +void AttentionKernelDebugInfo::Print(const char* operator_name, + const std::string& node_name, + bool is_float16, + bool is_bfloat16) const { + std::stringstream sstream; + sstream << "Operator=" << operator_name; + + if (node_name.length() > 0) { + sstream << " Node=" << node_name; + } + + if (is_bfloat16) { + sstream << " DataType=bf16"; + } else if (is_float16) { + sstream << " DataType=fp16"; + } else { + sstream << " DataType=fp32"; + } + + if (use_flash_attention.has_value() && use_flash_attention.value()) { + sstream << " FLASH_ATTENTION=" << int(use_flash_attention.value()); + } + + if (use_efficient_attention.has_value() && use_efficient_attention.value()) { + sstream << " EFFICIENT_ATTENTION=" << int(use_efficient_attention.value()); + } + + if (use_trt_fused_attention.has_value() && use_trt_fused_attention.value()) { + sstream << " TRT_FUSED_ATTENTION=" << int(use_trt_fused_attention.value()); + } + + if (use_cudnn_flash_attention.has_value() && use_cudnn_flash_attention.value()) { + sstream << " CUDNN_FLASH_ATTENTION=" << int(use_cudnn_flash_attention.value()); + } + + if (use_trt_flash_attention.has_value() && use_trt_flash_attention.value()) { + sstream << " TRT_FLASH_ATTENTION=" << int(use_trt_flash_attention.value()); + } + + if (use_trt_cross_attention.has_value() && use_trt_cross_attention.value()) { + sstream << " TRT_CROSS_ATTENTION=" << int(use_trt_cross_attention.value()); + } + + if (use_trt_causal_attention.has_value() && use_trt_causal_attention.value()) { + sstream << " TRT_CAUSAL_ATTENTION=" << int(use_trt_causal_attention.value()); + } + + bool use_fused = (use_flash_attention.has_value() && use_flash_attention.value()) || + (use_efficient_attention.has_value() && use_efficient_attention.value()) || + 
(use_trt_fused_attention.has_value() && use_trt_fused_attention.value()) || + (use_cudnn_flash_attention.has_value() && use_cudnn_flash_attention.value()) || + (use_trt_flash_attention.has_value() && use_trt_flash_attention.value()) || + (use_trt_cross_attention.has_value() && use_trt_cross_attention.value()) || + (use_trt_causal_attention.has_value() && use_trt_causal_attention.value()); + + // Fall back to unfused when no fused kernel is enabled. + if (!use_fused) { + sstream << " MATH=1"; + } + + // Output text in Cyan color to make it easier to spot. + std::cout << "\x1B[36m" << sstream.str() << "\x1B[0m" << std::endl; +} + +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_kernel_options.h b/onnxruntime/contrib_ops/cuda/bert/attention_kernel_options.h new file mode 100644 index 0000000000000..bd7df5f490c76 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/attention_kernel_options.h @@ -0,0 +1,67 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include +#include + +namespace onnxruntime { +struct AttentionKernelDebugInfo { + std::optional use_flash_attention = std::nullopt; + std::optional use_efficient_attention = std::nullopt; + std::optional use_trt_fused_attention = std::nullopt; + std::optional use_cudnn_flash_attention = std::nullopt; + std::optional use_trt_flash_attention = std::nullopt; + std::optional use_trt_cross_attention = std::nullopt; + std::optional use_trt_causal_attention = std::nullopt; + void SetTrtFusedKernel(bool causal, bool enable_trt_flash_attention, int sequence_length); + void Print(const char* operator_name, const std::string& node_name, bool is_float16, bool is_bfloat16) const; +}; + +class AttentionKernelOptions { + public: + void InitializeOnce(int sdpa_kernel, bool use_build_flag); + + bool UseFlashAttention() const { return use_flash_attention_; } + bool UseEfficientAttention() const { return use_efficient_attention_; } + bool UseTrtFusedAttention() const { return use_trt_fused_attention_; } + bool UseCudnnFlashAttention() const { return use_cudnn_flash_attention_; } + bool UseUnfusedAttention() const { return use_unfused_; } + bool UseTrtFlashAttention() const { return use_trt_flash_attention_; } + bool UseTrtCrossAttention() const { return use_trt_cross_attention_; } + bool UseTrtCausalAttention() const { return use_trt_causal_attention_; } + + bool AllowDebugInfo() const { return enable_kernel_debug_info_; } + + int MinSeqLenForFlashAttentionPackedQkv() const { return min_seq_len_for_flash_attention_packed_qkv_; } + int MinSeqLenForEfficientAttentionFp32() const { return min_seq_len_for_efficient_attention_fp32_; } + + protected: + void Print() const; + + void Initialize(int value, bool use_build_flag); + + private: + bool use_flash_attention_{true}; + bool use_efficient_attention_{true}; + bool use_trt_fused_attention_{true}; + bool use_cudnn_flash_attention_{false}; + bool use_unfused_{true}; + + bool use_trt_flash_attention_{true}; + bool use_trt_cross_attention_{true}; + + // Causal attention is disabled by default in #14732. 
+ bool use_trt_causal_attention_{false}; + + bool enable_kernel_debug_info_{false}; + + int min_seq_len_for_flash_attention_packed_qkv_{0}; + + int min_seq_len_for_efficient_attention_fp32_{0}; + + std::once_flag initialize_once_flag_; +}; + +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc index 3b6ad238cc826..797f9b0a1ea47 100644 --- a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc @@ -52,20 +52,13 @@ GroupQueryAttention::GroupQueryAttention(const OpKernelInfo& info) rotary_interleaved_ = info.GetAttrOrDefault("rotary_interleaved", 0) == 1; scale_ = info.GetAttrOrDefault("scale", 0.0f); -#if USE_FLASH_ATTENTION - disable_flash_attention_ = sizeof(T) != 2 || - ParseEnvironmentVariableWithDefault(attention::kDisableFlashAttention, false); -#else - disable_flash_attention_ = true; -#endif + kernel_options_ = this->GetAttentionKernelOptions(); + + disable_flash_attention_ = sizeof(T) != 2 || !kernel_options_->UseFlashAttention(); -#if USE_MEMORY_EFFICIENT_ATTENTION // Memory efficient attention only supports float and float16, not bfloat16. - disable_memory_efficient_attention_ = std::is_same::value || - ParseEnvironmentVariableWithDefault(attention::kDisableMemoryEfficientAttention, false); -#else - disable_memory_efficient_attention_ = true; -#endif + disable_memory_efficient_attention_ = std::is_same::value || !kernel_options_->UseEfficientAttention(); + if (!disable_flash_attention_) { zeros_ = this->GetScratchBuffer(kZerosCount, nullptr); } @@ -161,7 +154,7 @@ Status GroupQueryAttention::ComputeInternal(OpKernelContext* context) const { !use_flash_attention && !disable_memory_efficient_attention_ && local_window_size_ == -1 && - (sizeof(T) == 2 || parameters.sequence_length >= attention::kMinSeqLenForMemoryEfficientAttentionFp32) && + (sizeof(T) == 2 || parameters.sequence_length >= this->kernel_options_->MinSeqLenForEfficientAttentionFp32()) && has_memory_efficient_attention(sm, sizeof(T) == 2, parameters.head_size, parameters.head_size); if (!use_flash_attention && !use_memory_efficient_attention && local_window_size_ != -1) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, @@ -201,6 +194,17 @@ Status GroupQueryAttention::ComputeInternal(OpKernelContext* context) const { auto unpacked_qkv_buffer = GetScratchBuffer(0, context->GetComputeStream()); #endif + if (kernel_options_->AllowDebugInfo()) { + AttentionKernelDebugInfo debug_info; + debug_info.use_flash_attention = use_flash_attention; + debug_info.use_efficient_attention = use_memory_efficient_attention; + + debug_info.Print("GroupQueryAttention", + this->Node().Name(), + std::is_same::value, + std::is_same::value); + } + // seqlens_k buffer size_t seqlens_k_bytes = 0; seqlens_k_bytes = sizeof(int) * parameters.batch_size; diff --git a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.h b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.h index 15573ece166fc..4ff5b0a59f021 100644 --- a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.h +++ b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.h @@ -6,6 +6,7 @@ #include #include "core/providers/cuda/cuda_kernel.h" #include "contrib_ops/cuda/bert/group_query_attention_impl.h" +#include "contrib_ops/cuda/bert/attention_kernel_options.h" namespace onnxruntime { namespace contrib { @@ -32,6 +33,7 @@ class GroupQueryAttention final : public CudaKernel { bool 
disable_memory_efficient_attention_; static constexpr int kZerosCount = 256; // In prompt case we create a zero buffer of size 256 for seqlen (assume batch_size <= 256) IAllocatorUniquePtr zeros_; + const AttentionKernelOptions* kernel_options_; }; } // namespace cuda diff --git a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc index ba8b00df07e06..b96140f3897f9 100644 --- a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc @@ -2,7 +2,6 @@ // Licensed under the MIT License. #include "core/providers/cuda/cuda_common.h" -#include "core/platform/env_var_utils.h" #include "contrib_ops/cuda/bert/attention_impl.h" #include "contrib_ops/cuda/bert/multihead_attention.h" #include "contrib_ops/cpu/bert/multihead_attention_helper.h" @@ -47,31 +46,16 @@ MultiHeadAttention::MultiHeadAttention(const OpKernelInfo& info) is_unidirectional_ = info.GetAttrOrDefault("unidirectional", 0) == 1; ORT_ENFORCE(!is_unidirectional_, "Unidirectional MHA does not support CUDA kernel. Consider using Attention or GQA instead."); - disable_fused_self_attention_ = sizeof(T) != 2 || - ParseEnvironmentVariableWithDefault(attention::kDisableFusedSelfAttention, false); + kernel_options_ = this->GetAttentionKernelOptions(); - enable_trt_flash_attention_ = sizeof(T) == 2 && - !ParseEnvironmentVariableWithDefault(attention::kDisableTrtFlashAttention, false); + disable_fused_self_attention_ = sizeof(T) != 2 || !kernel_options_->UseTrtFusedAttention(); + enable_trt_flash_attention_ = sizeof(T) == 2 && kernel_options_->UseTrtFlashAttention(); -#if USE_FLASH_ATTENTION - disable_flash_attention_ = sizeof(T) != 2 || - ParseEnvironmentVariableWithDefault(attention::kDisableFlashAttention, false); - min_seq_len_for_flash_attention_packed_qkv_ = ParseEnvironmentVariableWithDefault( - attention::kMinSeqLenForFlashAttentionPackedQKV, - attention::kDefaultMinSeqLenForFlashAttentionPackedQKV); -#else - disable_flash_attention_ = true; - min_seq_len_for_flash_attention_packed_qkv_ = 0; -#endif + disable_flash_attention_ = sizeof(T) != 2 || !kernel_options_->UseFlashAttention(); -#if USE_MEMORY_EFFICIENT_ATTENTION - disable_memory_efficient_attention_ = ParseEnvironmentVariableWithDefault(attention::kDisableMemoryEfficientAttention, false); -#else - disable_memory_efficient_attention_ = true; -#endif + disable_memory_efficient_attention_ = !kernel_options_->UseEfficientAttention(); - disable_fused_cross_attention_ = sizeof(T) != 2 || - ParseEnvironmentVariableWithDefault(attention::kDisableFusedCrossAttention, false); + disable_fused_cross_attention_ = sizeof(T) != 2 || !kernel_options_->UseTrtCrossAttention(); // Allocate cache buffers constexpr size_t cache_bytes = sizeof(int32_t) * (static_cast(kCumulatedSequenceLengthCacheMaxBatchSize) + 1); @@ -155,7 +139,7 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { parameters.num_heads); // When input is packed QKV format, TensorRT kernel might be faster than flash attention when sequence length <= 512. 
if (use_flash_attention && key == nullptr && value == nullptr && - parameters.sequence_length < min_seq_len_for_flash_attention_packed_qkv_) { + parameters.sequence_length < kernel_options_->MinSeqLenForFlashAttentionPackedQkv()) { use_flash_attention = false; } // Allocate buffers @@ -229,9 +213,10 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { } #if USE_MEMORY_EFFICIENT_ATTENTION + int length_threshold = this->kernel_options_->MinSeqLenForEfficientAttentionFp32(); bool is_long_sequence = sizeof(T) == 2 || // sequence length threshold is 0 for FP16 - parameters.sequence_length >= attention::kMinSeqLenForMemoryEfficientAttentionFp32 || - parameters.kv_sequence_length >= attention::kMinSeqLenForMemoryEfficientAttentionFp32; + parameters.sequence_length >= length_threshold || + parameters.kv_sequence_length >= length_threshold; bool is_good_for_rpb = relative_position_bias != nullptr && parameters.sequence_length % (4 * sizeof(T)) == 0; @@ -249,6 +234,21 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { constexpr bool use_memory_efficient_attention = false; #endif + if (kernel_options_->AllowDebugInfo()) { + AttentionKernelDebugInfo debug_info; + debug_info.use_flash_attention = use_flash_attention; + debug_info.use_trt_cross_attention = fused_cross_attention_kernel != nullptr; + debug_info.use_efficient_attention = use_memory_efficient_attention; + if (fused_fp16_runner_ != nullptr) { + debug_info.SetTrtFusedKernel(is_unidirectional_, enable_trt_flash_attention_, sequence_length); + } + + debug_info.Print("MultiHeadAttention", + this->Node().Name(), + std::is_same::value, + std::is_same::value); + } + // When packed kv or packed qkv is used, there is no needed for add bias transpose thus no qkv workspace. // TODO(tianleiwu): flash attention or memory efficient attention might not need qkv workspace sometime. 
bool no_qkv_workspace = nullptr == value && diff --git a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.h b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.h index 86a32c92ce003..26e38dbad9fd7 100644 --- a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.h +++ b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.h @@ -8,6 +8,7 @@ #include "contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.h" #include "contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/cross_attention/fmha_cross_attention.h" #include "contrib_ops/cuda/bert/attention_impl.h" +#include "contrib_ops/cuda/bert/attention_kernel_options.h" namespace onnxruntime { namespace contrib { @@ -31,12 +32,12 @@ class MultiHeadAttention final : public CudaKernel { bool disable_fused_cross_attention_; bool disable_flash_attention_; bool disable_memory_efficient_attention_; - int min_seq_len_for_flash_attention_packed_qkv_; mutable std::unique_ptr fused_fp16_runner_; mutable std::once_flag fused_fp16_runner_created_; mutable const FusedMultiHeadCrossAttentionKernel* fused_fp16_cross_attention_kernel_; mutable CumulatedSequenceLengthCache cumulated_sequence_length_q_cache_; mutable CumulatedSequenceLengthCache cumulated_sequence_length_kv_cache_; + const AttentionKernelOptions* kernel_options_; }; } // namespace cuda diff --git a/onnxruntime/contrib_ops/cuda/bert/packed_attention.cc b/onnxruntime/contrib_ops/cuda/bert/packed_attention.cc index 0146cce30c7d1..a1149ddbf99f5 100644 --- a/onnxruntime/contrib_ops/cuda/bert/packed_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/packed_attention.cc @@ -33,12 +33,11 @@ REGISTER_KERNEL_TYPED(float) REGISTER_KERNEL_TYPED(MLFloat16) template -TrtFusedAttention::TrtFusedAttention() { - disable_fused_runner_ = sizeof(T) != 2 || - ParseEnvironmentVariableWithDefault(attention::kDisableFusedSelfAttention, false); - - enable_trt_flash_attention_ = sizeof(T) == 2 && - !ParseEnvironmentVariableWithDefault(attention::kDisableTrtFlashAttention, false); +TrtFusedAttention::TrtFusedAttention(const OpKernelInfo& info) + : CudaKernel(info) { + kernel_options_ = this->GetAttentionKernelOptions(); + disable_fused_runner_ = sizeof(T) != 2 || !kernel_options_->UseTrtFusedAttention(); + enable_trt_flash_attention_ = sizeof(T) == 2 && kernel_options_->UseTrtFlashAttention(); } template @@ -86,7 +85,8 @@ template class TrtFusedAttention; template class TrtFusedAttention; template -PackedAttention::PackedAttention(const OpKernelInfo& info) : TrtFusedAttention(), CudaKernel(info) { +PackedAttention::PackedAttention(const OpKernelInfo& info) + : TrtFusedAttention(info) { int64_t num_heads = 0; ORT_ENFORCE(info.GetAttr("num_heads", &num_heads).IsOK() && num_heads > 0); num_heads_ = static_cast(num_heads); @@ -268,7 +268,7 @@ Status PackedAttention::ComputeInternal(OpKernelContext* context) const { const Tensor* relative_position_bias = context->Input(5); PackedAttentionParameters parameters; - parameters.use_tf32 = UseTF32(); + parameters.use_tf32 = this->UseTF32(); ORT_RETURN_IF_ERROR(CheckInputs(input->Shape(), weights->Shape(), bias->Shape(), @@ -295,6 +295,19 @@ Status PackedAttention::ComputeInternal(OpKernelContext* context) const { } #endif + if (this->kernel_options_->AllowDebugInfo()) { + AttentionKernelDebugInfo debug_info; + debug_info.use_efficient_attention = use_memory_efficient_attention; + if (fused_runner != nullptr) { + debug_info.SetTrtFusedKernel(false /*causal*/, this->enable_trt_flash_attention_, parameters.sequence_length); + } + + 
debug_info.Print("PackedAttention", + this->Node().Name(), + std::is_same::value, + std::is_same::value); + } + typedef typename ToCudaType::MappedType CudaT; CudaT one = ToCudaType::FromFloat(1.0f); CudaT zero = ToCudaType::FromFloat(0.0f); @@ -313,7 +326,7 @@ Status PackedAttention::ComputeInternal(OpKernelContext* context) const { cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &one, reinterpret_cast(weights->Data()), n, reinterpret_cast(input->Data()), k, - &zero, reinterpret_cast(gemm_buffer.get()), n, device_prop, UseTF32())); + &zero, reinterpret_cast(gemm_buffer.get()), n, device_prop, this->UseTF32())); constexpr size_t element_size = sizeof(T); constexpr bool no_qkv_workspace = false; // need workspace to add bias @@ -341,7 +354,7 @@ Status PackedAttention::ComputeInternal(OpKernelContext* context) const { data.fused_runner = reinterpret_cast(fused_runner); data.use_memory_efficient_attention = use_memory_efficient_attention; - return QkvToContext(device_prop, cublas, Stream(context), parameters, data); + return QkvToContext(device_prop, cublas, this->Stream(context), parameters, data); } } // namespace cuda diff --git a/onnxruntime/contrib_ops/cuda/bert/packed_attention.h b/onnxruntime/contrib_ops/cuda/bert/packed_attention.h index f00c112fc73d2..67b420764169a 100644 --- a/onnxruntime/contrib_ops/cuda/bert/packed_attention.h +++ b/onnxruntime/contrib_ops/cuda/bert/packed_attention.h @@ -9,6 +9,7 @@ #include "core/providers/cuda/cuda_kernel.h" #include "contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.h" #include "contrib_ops/cpu/bert/attention_common.h" +#include "contrib_ops/cuda/bert/attention_kernel_options.h" namespace onnxruntime { namespace contrib { @@ -17,14 +18,16 @@ namespace cuda { using namespace onnxruntime::cuda; template -class TrtFusedAttention { +class TrtFusedAttention : public CudaKernel { public: - TrtFusedAttention(); + TrtFusedAttention(const OpKernelInfo& info); protected: MHARunner* GetFusedRunner(const cudaDeviceProp& device_prop, const PackedAttentionParameters& parameters) const; protected: + const AttentionKernelOptions* kernel_options_; + bool disable_fused_runner_; bool enable_trt_flash_attention_; mutable std::unique_ptr fused_fp16_runner_; @@ -32,7 +35,7 @@ class TrtFusedAttention { }; template -class PackedAttention final : public TrtFusedAttention, public CudaKernel { +class PackedAttention final : public TrtFusedAttention { public: PackedAttention(const OpKernelInfo& info); Status ComputeInternal(OpKernelContext* context) const override; diff --git a/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention.cc b/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention.cc index 3fbbafc01254e..53e96fc732a33 100644 --- a/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention.cc @@ -35,30 +35,16 @@ REGISTER_KERNEL_TYPED(MLFloat16) template PackedMultiHeadAttention::PackedMultiHeadAttention(const OpKernelInfo& info) - : TrtFusedAttention(), CudaKernel(info) { + : TrtFusedAttention(info) { int64_t num_heads = 0; ORT_ENFORCE(info.GetAttr("num_heads", &num_heads).IsOK() && num_heads > 0); num_heads_ = static_cast(num_heads); scale_ = info.GetAttrOrDefault("scale", 0.0f); -#if USE_FLASH_ATTENTION - disable_flash_attention_ = sizeof(T) != 2 || onnxruntime::ParseEnvironmentVariableWithDefault( - attention::kDisableFlashAttention, false); - min_seq_len_for_flash_attention_packed_qkv_ = ParseEnvironmentVariableWithDefault( - 
attention::kMinSeqLenForFlashAttentionPackedQKV, - attention::kDefaultMinSeqLenForFlashAttentionPackedQKV); -#else - disable_flash_attention_ = true; - min_seq_len_for_flash_attention_packed_qkv_ = 0; -#endif + disable_flash_attention_ = sizeof(T) != 2 || !this->kernel_options_->UseFlashAttention(); -#if USE_MEMORY_EFFICIENT_ATTENTION - disable_memory_efficient_attention_ = onnxruntime::ParseEnvironmentVariableWithDefault( - attention::kDisableMemoryEfficientAttention, false); -#else - disable_memory_efficient_attention_ = true; -#endif + disable_memory_efficient_attention_ = !this->kernel_options_->UseEfficientAttention(); } template @@ -228,7 +214,7 @@ Status PackedMultiHeadAttention::ComputeInternal(OpKernelContext* context) co const Tensor* relative_position_bias = context->Input(6); PackedAttentionParameters parameters; - parameters.use_tf32 = UseTF32(); + parameters.use_tf32 = this->UseTF32(); ORT_RETURN_IF_ERROR(CheckInputs(query->Shape(), key, value, @@ -255,7 +241,7 @@ Status PackedMultiHeadAttention::ComputeInternal(OpKernelContext* context) co // When input is packed QKV format, TensorRT kernel might be faster when sequence length <= 512. if (use_flash_attention && key == nullptr && value == nullptr && - parameters.sequence_length < min_seq_len_for_flash_attention_packed_qkv_) { + parameters.sequence_length < this->kernel_options_->MinSeqLenForFlashAttentionPackedQkv()) { use_flash_attention = false; } } @@ -271,11 +257,25 @@ Status PackedMultiHeadAttention::ComputeInternal(OpKernelContext* context) co bool is_good_for_rpb = !parameters.has_relative_position_bias || parameters.sequence_length % (4 * sizeof(T)) == 0; use_memory_efficient_attention = is_good_for_rpb && - (sizeof(T) == 2 || parameters.sequence_length >= attention::kMinSeqLenForMemoryEfficientAttentionFp32) && + (sizeof(T) == 2 || parameters.sequence_length >= this->kernel_options_->MinSeqLenForEfficientAttentionFp32()) && has_memory_efficient_attention(sm, sizeof(T) == 2, parameters.head_size, parameters.v_head_size); } #endif + if (this->kernel_options_->AllowDebugInfo()) { + AttentionKernelDebugInfo debug_info; + debug_info.use_flash_attention = use_flash_attention; + debug_info.use_efficient_attention = use_memory_efficient_attention; + if (fused_runner != nullptr) { + debug_info.SetTrtFusedKernel(false /*causal*/, this->enable_trt_flash_attention_, parameters.sequence_length); + } + + debug_info.Print("PackedMultiHeadAttention", + this->Node().Name(), + std::is_same::value, + std::is_same::value); + } + typedef typename ToCudaType::MappedType CudaT; cublasHandle_t cublas = this->GetCublasHandle(context); diff --git a/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention.h b/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention.h index e30c603dc30aa..9b52a70fc6181 100644 --- a/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention.h +++ b/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention.h @@ -4,13 +4,14 @@ #pragma once #include "contrib_ops/cuda/bert/packed_attention.h" +#include "contrib_ops/cuda/bert/attention_kernel_options.h" namespace onnxruntime { namespace contrib { namespace cuda { template -class PackedMultiHeadAttention final : public TrtFusedAttention, public CudaKernel { +class PackedMultiHeadAttention final : public TrtFusedAttention { public: PackedMultiHeadAttention(const OpKernelInfo& info); Status ComputeInternal(OpKernelContext* context) const override; @@ -32,7 +33,6 @@ class PackedMultiHeadAttention final : public TrtFusedAttention, public CudaK bool 
disable_memory_efficient_attention_; bool disable_flash_attention_; - int min_seq_len_for_flash_attention_packed_qkv_; }; } // namespace cuda diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.h b/onnxruntime/core/providers/cuda/cuda_execution_provider.h index f53779058a8af..9c8a8712ca51c 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.h +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.h @@ -17,6 +17,10 @@ #include "core/providers/cuda/shared_inc/cuda_call.h" #include "core/providers/cuda/tunable/cuda_tuning_context.h" +#ifndef DISABLE_CONTRIB_OPS +#include "contrib_ops/cuda/bert/attention_kernel_options.h" +#endif + namespace onnxruntime { void RunOnUnload(std::function function); @@ -80,6 +84,14 @@ class CUDAExecutionProvider : public IExecutionProvider { bool IsNHWCPreferred() const { return info_.prefer_nhwc; } bool UseTF32() const { return info_.use_tf32; } +#ifndef DISABLE_CONTRIB_OPS + // Attention kernel options parsed from sdpa_kernel cuda provider option. + const AttentionKernelOptions* GetAttentionKernelOptions() const { + attention_kernel_options_.InitializeOnce(info_.sdpa_kernel, true); + return &attention_kernel_options_; + } +#endif + ProviderOptions GetProviderOptions() const override { return CUDAExecutionProviderInfo::ToProviderOptions(info_); } @@ -110,6 +122,11 @@ class CUDAExecutionProvider : public IExecutionProvider { // the tuning context might be altered when calling into a TunableOp mutable cuda::tunable::CudaTuningContext tuning_context_; +#ifndef DISABLE_CONTRIB_OPS + // Attention kernel options parsed from sdpa_kernel cuda provider option. + mutable AttentionKernelOptions attention_kernel_options_; +#endif + class PerThreadContext final { public: PerThreadContext(OrtDevice::DeviceId device_id, cudaStream_t stream, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy, diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc index c96381e3e68b1..31cf991a34fc9 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc @@ -34,6 +34,7 @@ constexpr const char* kEnableSkipLayerNormStrictMode = "enable_skip_layer_norm_s constexpr const char* kPreferNHWCMode = "prefer_nhwc"; constexpr const char* kUseEPLevelUnifiedStream = "use_ep_level_unified_stream"; constexpr const char* kUseTF32 = "use_tf32"; +constexpr const char* kSdpaKernel = "sdpa_kernel"; } // namespace provider_option_names } // namespace cuda @@ -117,6 +118,7 @@ CUDAExecutionProviderInfo CUDAExecutionProviderInfo::FromProviderOptions(const P .AddAssignmentToReference(cuda::provider_option_names::kPreferNHWCMode, info.prefer_nhwc) .AddAssignmentToReference(cuda::provider_option_names::kUseEPLevelUnifiedStream, info.use_ep_level_unified_stream) .AddAssignmentToReference(cuda::provider_option_names::kUseTF32, info.use_tf32) + .AddAssignmentToReference(cuda::provider_option_names::kSdpaKernel, info.sdpa_kernel) .AddValueParser( cuda::provider_option_names::kTunableOpEnable, [&info](const std::string& value_str) -> Status { @@ -170,6 +172,7 @@ ProviderOptions CUDAExecutionProviderInfo::ToProviderOptions(const CUDAExecution {cuda::provider_option_names::kPreferNHWCMode, MakeStringWithClassicLocale(info.prefer_nhwc)}, {cuda::provider_option_names::kUseEPLevelUnifiedStream, MakeStringWithClassicLocale(info.use_ep_level_unified_stream)}, {cuda::provider_option_names::kUseTF32, 
MakeStringWithClassicLocale(info.use_tf32)}, + {cuda::provider_option_names::kSdpaKernel, MakeStringWithClassicLocale(info.sdpa_kernel)}, }; return options; @@ -192,6 +195,7 @@ ProviderOptions CUDAExecutionProviderInfo::ToProviderOptions(const OrtCUDAProvid {cuda::provider_option_names::kPreferNHWCMode, MakeStringWithClassicLocale(info.prefer_nhwc)}, {cuda::provider_option_names::kUseEPLevelUnifiedStream, MakeStringWithClassicLocale(info.use_ep_level_unified_stream)}, {cuda::provider_option_names::kUseTF32, MakeStringWithClassicLocale(info.use_tf32)}, + {cuda::provider_option_names::kSdpaKernel, MakeStringWithClassicLocale(info.sdpa_kernel)}, }; return options; diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h index 1cac3d1513698..0efad80f743df 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h @@ -79,6 +79,8 @@ struct CUDAExecutionProviderInfo { // By default, enable TF32 to speed up float GEMM/MatMul or cuDNN convolution of float matrices. bool use_tf32{true}; + int sdpa_kernel{0}; + static CUDAExecutionProviderInfo FromProviderOptions(const ProviderOptions& options); static ProviderOptions ToProviderOptions(const CUDAExecutionProviderInfo& info); static ProviderOptions ToProviderOptions(const OrtCUDAProviderOptionsV2& info); @@ -91,6 +93,7 @@ struct std::hash<::onnxruntime::CUDAExecutionProviderInfo> { size_t value{0xbc9f1d34}; // seed // Bits: device_id (16), arena_extend_strategy/cudnn_conv_algo_search (reserved 2), boolean options (1 each) + // Do not exceed 32 bits here otherwise some bits will be lost in x86. size_t data = static_cast(info.device_id) ^ (static_cast(info.arena_extend_strategy) << 16) ^ (static_cast(info.cudnn_conv_algo_search) << 18) ^ @@ -109,6 +112,7 @@ struct std::hash<::onnxruntime::CUDAExecutionProviderInfo> { onnxruntime::HashCombine(info.gpu_mem_limit, value); onnxruntime::HashCombine(info.tunable_op.max_tuning_duration_ms, value); + onnxruntime::HashCombine(info.sdpa_kernel, value); // Memory pointers onnxruntime::HashCombine(reinterpret_cast(info.user_compute_stream), value); diff --git a/onnxruntime/core/providers/cuda/cuda_kernel.h b/onnxruntime/core/providers/cuda/cuda_kernel.h index 288da23f35ec8..9d37a9775872f 100644 --- a/onnxruntime/core/providers/cuda/cuda_kernel.h +++ b/onnxruntime/core/providers/cuda/cuda_kernel.h @@ -94,6 +94,12 @@ class CudaKernel : public OpKernel { return provider_->UseTF32(); } +#ifndef DISABLE_CONTRIB_OPS + const AttentionKernelOptions* GetAttentionKernelOptions() const { + return provider_->GetAttentionKernelOptions(); + } +#endif + tunable::CudaTuningContext* GetTuningContext() const { return static_cast(provider_->GetTuningContext()); } diff --git a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc index 7851da7fa91a3..b1d54e56ded4e 100644 --- a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc +++ b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc @@ -226,6 +226,7 @@ struct CUDA_Provider : Provider { info.enable_skip_layer_norm_strict_mode = params->enable_skip_layer_norm_strict_mode != 0; info.use_ep_level_unified_stream = params->use_ep_level_unified_stream != 0; info.use_tf32 = params->use_tf32 != 0; + info.sdpa_kernel = params->sdpa_kernel; return std::make_shared(info); } @@ -260,6 +261,7 @@ struct CUDA_Provider : Provider { cuda_options.prefer_nhwc = 
internal_options.prefer_nhwc; cuda_options.use_ep_level_unified_stream = internal_options.use_ep_level_unified_stream; cuda_options.use_tf32 = internal_options.use_tf32; + cuda_options.sdpa_kernel = internal_options.sdpa_kernel; } ProviderOptions GetProviderOptions(const void* provider_options) override { diff --git a/onnxruntime/test/contrib_ops/multihead_attention_op_test.cc b/onnxruntime/test/contrib_ops/multihead_attention_op_test.cc index a61e917b41e51..f0255d7ece84e 100644 --- a/onnxruntime/test/contrib_ops/multihead_attention_op_test.cc +++ b/onnxruntime/test/contrib_ops/multihead_attention_op_test.cc @@ -394,8 +394,8 @@ static void RunMultiHeadAttentionTests(AttentionTestData& data, bool disable_cpu } #if USE_MEMORY_EFFICIENT_ATTENTION - if (data.sequence_length >= contrib::attention::kMinSeqLenForMemoryEfficientAttentionFp32 || - data.kv_sequence_length >= contrib::attention::kMinSeqLenForMemoryEfficientAttentionFp32) { + if (data.sequence_length >= contrib::attention::kDefaultMinSeqLenForEfficientAttentionFp32 || + data.kv_sequence_length >= contrib::attention::kDefaultMinSeqLenForEfficientAttentionFp32) { kernel_type = AttentionKernelType::AttentionKernel_CutlassMemoryEfficientAttention; if (!SkipAttentionKernel(data, kernel_type)) { RunMultiHeadAttentionKernel( diff --git a/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc b/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc new file mode 100644 index 0000000000000..b2e986f680763 --- /dev/null +++ b/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc @@ -0,0 +1,221 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#ifndef DISABLE_CONTRIB_OPS + +#include "contrib_ops/cuda/bert/attention_kernel_options.h" +#include "contrib_ops/cpu/bert/attention_common.h" +#include "test/util/include/scoped_env_vars.h" +#include "gtest/gtest.h" + +#include +#include + +using onnxruntime::AttentionKernelOptions; +using onnxruntime::contrib::attention::AttentionBackend; + +namespace onnxruntime { +namespace test { + +TEST(AttentionKernelOptionsTest, NonZeroValue) { + { + AttentionKernelOptions options; + int value = static_cast(AttentionBackend::FLASH_ATTENTION) | static_cast(AttentionBackend::EFFICIENT_ATTENTION); + options.InitializeOnce(value, false); + ASSERT_TRUE(options.UseFlashAttention()); + ASSERT_TRUE(options.UseEfficientAttention()); + ASSERT_FALSE(options.UseTrtFusedAttention()); + ASSERT_FALSE(options.UseCudnnFlashAttention()); + ASSERT_FALSE(options.UseUnfusedAttention()); + ASSERT_FALSE(options.UseTrtFlashAttention()); + ASSERT_FALSE(options.UseTrtCrossAttention()); + ASSERT_FALSE(options.UseTrtCausalAttention()); + EXPECT_EQ(options.MinSeqLenForFlashAttentionPackedQkv(), 0); + EXPECT_EQ(options.MinSeqLenForEfficientAttentionFp32(), 0); + } + + { + AttentionKernelOptions options; + int value = static_cast(AttentionBackend::TRT_FUSED_ATTENTION) | static_cast(AttentionBackend::MATH); + options.InitializeOnce(value, false); + ASSERT_FALSE(options.UseFlashAttention()); + ASSERT_FALSE(options.UseEfficientAttention()); + ASSERT_TRUE(options.UseTrtFusedAttention()); + ASSERT_FALSE(options.UseCudnnFlashAttention()); + ASSERT_TRUE(options.UseUnfusedAttention()); + ASSERT_FALSE(options.UseTrtFlashAttention()); + ASSERT_FALSE(options.UseTrtCrossAttention()); + ASSERT_FALSE(options.UseTrtCausalAttention()); + EXPECT_EQ(options.MinSeqLenForFlashAttentionPackedQkv(), 0); + 
EXPECT_EQ(options.MinSeqLenForEfficientAttentionFp32(), 0); + } + + { + AttentionKernelOptions options; + int value = static_cast(AttentionBackend::CUDNN_FLASH_ATTENTION); + options.InitializeOnce(value, false); + ASSERT_FALSE(options.UseFlashAttention()); + ASSERT_FALSE(options.UseEfficientAttention()); + ASSERT_FALSE(options.UseTrtFusedAttention()); + ASSERT_TRUE(options.UseCudnnFlashAttention()); + ASSERT_FALSE(options.UseUnfusedAttention()); + ASSERT_FALSE(options.UseTrtFlashAttention()); + ASSERT_FALSE(options.UseTrtCrossAttention()); + ASSERT_FALSE(options.UseTrtCausalAttention()); + EXPECT_EQ(options.MinSeqLenForFlashAttentionPackedQkv(), 0); + EXPECT_EQ(options.MinSeqLenForEfficientAttentionFp32(), 0); + } + + { + AttentionKernelOptions options; + int value = static_cast(AttentionBackend::TRT_FLASH_ATTENTION); + options.InitializeOnce(value, false); + ASSERT_FALSE(options.UseFlashAttention()); + ASSERT_FALSE(options.UseEfficientAttention()); + ASSERT_FALSE(options.UseTrtFusedAttention()); + ASSERT_FALSE(options.UseCudnnFlashAttention()); + ASSERT_FALSE(options.UseUnfusedAttention()); + ASSERT_TRUE(options.UseTrtFlashAttention()); + ASSERT_FALSE(options.UseTrtCrossAttention()); + ASSERT_FALSE(options.UseTrtCausalAttention()); + EXPECT_EQ(options.MinSeqLenForFlashAttentionPackedQkv(), 0); + EXPECT_EQ(options.MinSeqLenForEfficientAttentionFp32(), 0); + } + + { + AttentionKernelOptions options; + int value = static_cast(AttentionBackend::TRT_CROSS_ATTENTION) | static_cast(AttentionBackend::TRT_CAUSAL_ATTENTION); + options.InitializeOnce(value, false); + ASSERT_FALSE(options.UseFlashAttention()); + ASSERT_FALSE(options.UseEfficientAttention()); + ASSERT_FALSE(options.UseTrtFusedAttention()); + ASSERT_FALSE(options.UseCudnnFlashAttention()); + ASSERT_FALSE(options.UseUnfusedAttention()); + ASSERT_FALSE(options.UseTrtFlashAttention()); + ASSERT_TRUE(options.UseTrtCrossAttention()); + ASSERT_TRUE(options.UseTrtCausalAttention()); + EXPECT_EQ(options.MinSeqLenForFlashAttentionPackedQkv(), 0); + EXPECT_EQ(options.MinSeqLenForEfficientAttentionFp32(), 0); + } + + // Test environment variables are ignored when option value is non-zero + // Test default min sequence lengths are zeros + { + ScopedEnvironmentVariables scoped_env_vars{ + EnvVarMap{ + {onnxruntime::contrib::attention::kDisableFlashAttention, "0"}, + {onnxruntime::contrib::attention::kDisableTrtFlashAttention, "0"}, + {onnxruntime::contrib::attention::kDisableFusedSelfAttention, "0"}, + {onnxruntime::contrib::attention::kEnableCudnnFlashAttention, "1"}, + {onnxruntime::contrib::attention::kDisableFusedCrossAttention, "0"}, + {onnxruntime::contrib::attention::kDisableMemoryEfficientAttention, "0"}, + {onnxruntime::contrib::attention::kEnableFusedCausalAttention, "1"}, + {onnxruntime::contrib::attention::kEnableFusedCausalAttention, "1"}}}; + AttentionKernelOptions options; + int value = static_cast(AttentionBackend::FLASH_ATTENTION); + options.InitializeOnce(value, false); + ASSERT_TRUE(options.UseFlashAttention()); + ASSERT_FALSE(options.UseEfficientAttention()); + ASSERT_FALSE(options.UseTrtFusedAttention()); + ASSERT_FALSE(options.UseCudnnFlashAttention()); + ASSERT_FALSE(options.UseUnfusedAttention()); + ASSERT_FALSE(options.UseTrtFlashAttention()); + ASSERT_FALSE(options.UseTrtCrossAttention()); + ASSERT_FALSE(options.UseTrtCausalAttention()); + EXPECT_EQ(options.MinSeqLenForFlashAttentionPackedQkv(), 0); + EXPECT_EQ(options.MinSeqLenForEfficientAttentionFp32(), 0); + } + + // Test min sequence lengths can be parsed from 
environment variables when option value is non-zero + { + ScopedEnvironmentVariables scoped_env_vars{ + EnvVarMap{ + {onnxruntime::contrib::attention::kDisableFlashAttention, "1"}, + {onnxruntime::contrib::attention::kDisableTrtFlashAttention, "1"}, + {onnxruntime::contrib::attention::kDisableFusedSelfAttention, "1"}, + {onnxruntime::contrib::attention::kDisableFusedCrossAttention, "1"}, + {onnxruntime::contrib::attention::kDisableMemoryEfficientAttention, "1"}, + {onnxruntime::contrib::attention::kEnableFusedCausalAttention, "0"}, + {onnxruntime::contrib::attention::kEnableFusedCausalAttention, "0"}, + {onnxruntime::contrib::attention::kMinSeqLenForFlashAttentionPackedQKV, "128"}, + {onnxruntime::contrib::attention::kMinSeqLenForEfficientAttentionFp32, "256"}}}; + AttentionKernelOptions options; + int value = static_cast(AttentionBackend::FLASH_ATTENTION); + options.InitializeOnce(value, false); + ASSERT_TRUE(options.UseFlashAttention()); + ASSERT_FALSE(options.UseEfficientAttention()); + ASSERT_FALSE(options.UseTrtFusedAttention()); + ASSERT_FALSE(options.UseCudnnFlashAttention()); + ASSERT_FALSE(options.UseUnfusedAttention()); + ASSERT_FALSE(options.UseTrtFlashAttention()); + ASSERT_FALSE(options.UseTrtCrossAttention()); + ASSERT_FALSE(options.UseTrtCausalAttention()); + EXPECT_EQ(options.MinSeqLenForFlashAttentionPackedQkv(), 128); + EXPECT_EQ(options.MinSeqLenForEfficientAttentionFp32(), 256); + } +} + +// Test all environment variables take effect when option value is 0. +TEST(AttentionKernelOptionsTest, DefaultOptionWithEnvVar) { + constexpr int value = 0; + ScopedEnvironmentVariables scoped_env_vars{ + EnvVarMap{ + {onnxruntime::contrib::attention::kDisableFlashAttention, "0"}, + {onnxruntime::contrib::attention::kDisableTrtFlashAttention, "0"}, + {onnxruntime::contrib::attention::kDisableFusedSelfAttention, "0"}, + {onnxruntime::contrib::attention::kEnableCudnnFlashAttention, "1"}, + {onnxruntime::contrib::attention::kDisableFusedCrossAttention, "0"}, + {onnxruntime::contrib::attention::kDisableMemoryEfficientAttention, "0"}, + {onnxruntime::contrib::attention::kEnableFusedCausalAttention, "1"}, + {onnxruntime::contrib::attention::kEnableFusedCausalAttention, "1"}, + {onnxruntime::contrib::attention::kMinSeqLenForFlashAttentionPackedQKV, "128"}, + {onnxruntime::contrib::attention::kMinSeqLenForEfficientAttentionFp32, "256"}}}; + AttentionKernelOptions options; + options.InitializeOnce(value, false); + ASSERT_TRUE(options.UseFlashAttention()); + ASSERT_TRUE(options.UseEfficientAttention()); + ASSERT_TRUE(options.UseTrtFusedAttention()); + ASSERT_TRUE(options.UseCudnnFlashAttention()); + ASSERT_TRUE(options.UseUnfusedAttention()); + ASSERT_TRUE(options.UseTrtFlashAttention()); + ASSERT_TRUE(options.UseTrtCrossAttention()); + ASSERT_TRUE(options.UseTrtCausalAttention()); + ASSERT_TRUE(options.UseTrtCausalAttention()); + EXPECT_EQ(options.MinSeqLenForFlashAttentionPackedQkv(), 128); + EXPECT_EQ(options.MinSeqLenForEfficientAttentionFp32(), 256); +} + +// Test default min sequence lengths when environment variables are not set. 
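
For reference, a minimal sketch (assuming the public C/C++ API entry points `CreateCUDAProviderOptions`, `UpdateCUDAProviderOptions` and `AppendExecutionProvider_CUDA_V2`) of how the `sdpa_kernel` value exercised by these tests can be supplied by an application; the model path and the chosen value are placeholders:

```cpp
// A minimal sketch (not taken from the patch) of passing the new "sdpa_kernel" CUDA
// provider option through the public API. The model path and the option value are
// placeholders; "0" keeps the default kernel selection.
#include <onnxruntime_cxx_api.h>

Ort::Session CreateCudaSessionWithSdpaKernel(Ort::Env& env, const ORTCHAR_T* model_path) {
  Ort::SessionOptions session_options;

  OrtCUDAProviderOptionsV2* cuda_options = nullptr;
  Ort::ThrowOnError(Ort::GetApi().CreateCUDAProviderOptions(&cuda_options));

  const char* keys[] = {"sdpa_kernel"};
  const char* values[] = {"0"};  // a non-zero value is a bit mask of AttentionBackend flags
  Ort::ThrowOnError(Ort::GetApi().UpdateCUDAProviderOptions(cuda_options, keys, values, 1));

  session_options.AppendExecutionProvider_CUDA_V2(*cuda_options);
  Ort::GetApi().ReleaseCUDAProviderOptions(cuda_options);

  return Ort::Session(env, model_path, session_options);
}
```

When the value is non-zero it is interpreted as a bit mask of `AttentionBackend` flags and the kernel-selection environment variables are ignored, while the two minimum-sequence-length variables are still honoured, as the preceding test cases show.
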
+TEST(AttentionKernelOptionsTest, DefaultMinSeqLens) { + constexpr int value = 0; + ScopedEnvironmentVariables scoped_env_vars{ + EnvVarMap{ + {onnxruntime::contrib::attention::kDisableFlashAttention, "1"}, + {onnxruntime::contrib::attention::kDisableTrtFlashAttention, "1"}, + {onnxruntime::contrib::attention::kDisableFusedSelfAttention, "1"}, + {onnxruntime::contrib::attention::kDisableFusedCrossAttention, "1"}, + {onnxruntime::contrib::attention::kEnableCudnnFlashAttention, "0"}, + {onnxruntime::contrib::attention::kDisableMemoryEfficientAttention, "1"}, + {onnxruntime::contrib::attention::kEnableFusedCausalAttention, "0"}, + {onnxruntime::contrib::attention::kEnableFusedCausalAttention, "0"}}}; + AttentionKernelOptions options; + options.InitializeOnce(value, false); + ASSERT_FALSE(options.UseFlashAttention()); + ASSERT_FALSE(options.UseEfficientAttention()); + ASSERT_FALSE(options.UseTrtFusedAttention()); + ASSERT_FALSE(options.UseCudnnFlashAttention()); + ASSERT_TRUE(options.UseUnfusedAttention()); + ASSERT_FALSE(options.UseTrtFlashAttention()); + ASSERT_FALSE(options.UseTrtCrossAttention()); + ASSERT_FALSE(options.UseTrtCausalAttention()); + ASSERT_FALSE(options.UseTrtCausalAttention()); + EXPECT_EQ(options.MinSeqLenForFlashAttentionPackedQkv(), + onnxruntime::contrib::attention::kDefaultMinSeqLenForFlashAttentionPackedQKV); + EXPECT_EQ(options.MinSeqLenForEfficientAttentionFp32(), + onnxruntime::contrib::attention::kDefaultMinSeqLenForEfficientAttentionFp32); +} + +} // namespace test +} // namespace onnxruntime + +#endif diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index e4814aa7fc033..892e7de8bb6ed 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -446,6 +446,8 @@ def test_get_and_set_option_with_values(option_name, option_values): test_get_and_set_option_with_values("use_tf32", ["1", "0"]) + test_get_and_set_option_with_values("sdpa_kernel", ["0", "1", "2"]) + option["gpu_external_alloc"] = "0" option["gpu_external_free"] = "0" option["gpu_external_empty_cache"] = "0" From 34cd2e8ed8688c42d00adda1d260ac787f76bf29 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Sat, 20 Jul 2024 09:35:05 +1000 Subject: [PATCH 06/15] Add CoreML ML Program Resize (#21370) ### Description Add CoreML ML Program Resize - refactor existing logic to try and simplify and share between NeuralNetwork and MLProgram checks - add handling for some new attributes - antialias and axes - should have been done when setting the CoreML EP max opset to 21 ### Motivation and Context Support priority models --- .../core/providers/coreml/builders/helper.cc | 18 +- .../core/providers/coreml/builders/helper.h | 3 +- .../coreml/builders/impl/base_op_builder.cc | 2 +- .../coreml/builders/impl/base_op_builder.h | 6 + .../coreml/builders/impl/resize_op_builder.cc | 607 +++++++++++++----- .../providers/coreml/builders/model_builder.h | 13 +- .../coreml/coreml_execution_provider.cc | 1 + .../builders/impl/resize_op_builder.cc | 25 +- onnxruntime/core/providers/utils.cc | 16 + onnxruntime/core/providers/utils.h | 5 + .../core/providers/xnnpack/tensor/resize.cc | 21 +- .../providers/cpu/tensor/resize_op_test.cc | 152 ++++- .../apple/coreml_supported_mlprogram_ops.md | 1 + 13 files changed, 671 insertions(+), 199 deletions(-) diff --git a/onnxruntime/core/providers/coreml/builders/helper.cc b/onnxruntime/core/providers/coreml/builders/helper.cc index b8ebbd05a2a20..e1f148fa93e23 100644 
--- a/onnxruntime/core/providers/coreml/builders/helper.cc +++ b/onnxruntime/core/providers/coreml/builders/helper.cc @@ -50,8 +50,8 @@ bool IsNodeSupported(const Node& node, const OpBuilderInputParams& input_params, } } -bool IsInputSupported(const Node& node, const NodeArg& input, - const OpBuilderInputParams& input_params, const logging::Logger& logger) { +bool IsInputSupported(const Node& node, const NodeArg& input, const OpBuilderInputParams& input_params, + const logging::Logger& logger, bool allow_empty_input) { if (!input.Exists()) { // optional input that is not provided return true; @@ -84,16 +84,10 @@ bool IsInputSupported(const Node& node, const NodeArg& input, return false; } - if (dim == 0) { - if (node.OpType() == "Resize" && &input == node.InputDefs()[1]) { - // one special case. Resize 'roi' input was originally a required input but is rarely used. - // ROI is not supported in the CoreML implementation so we will ignore the value, but is often added - // (at least in the unit tests) as an initializer with shape {0}. - } else { - LOGS(logger, WARNING) << "CoreML does not support shapes with dimension values of 0. Input:" << input_name - << ", shape: " << Shape2String(shape); - return false; - } + if (dim == 0 && !allow_empty_input) { + LOGS(logger, WARNING) << "CoreML does not support shapes with dimension values of 0. Input:" << input_name + << ", shape: " << Shape2String(shape); + return false; } } diff --git a/onnxruntime/core/providers/coreml/builders/helper.h b/onnxruntime/core/providers/coreml/builders/helper.h index 300de2dedd122..0acaa0dd8a4a3 100644 --- a/onnxruntime/core/providers/coreml/builders/helper.h +++ b/onnxruntime/core/providers/coreml/builders/helper.h @@ -30,7 +30,8 @@ OpBuilderInputParams MakeOpBuilderParams(const GraphViewer& graph_viewer, const IOpBuilder* GetOpBuilder(const Node& node); bool IsInputSupported(const Node& node, const NodeArg& node_arg, const OpBuilderInputParams& input_params, - const logging::Logger& logger); + const logging::Logger& logger, + bool allow_empty_input = false); bool IsNodeSupported(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger); diff --git a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc index 83a572f4b60fa..2cae85a0a1c8d 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc @@ -74,7 +74,7 @@ bool BaseOpBuilder::IsOpSupported(const Node& node, const OpBuilderInputParams& bool BaseOpBuilder::HasSupportedInputs(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const { for (const auto* input : node.InputDefs()) { - if (!IsInputSupported(node, *input, input_params, logger)) { + if (!IsInputSupported(node, *input, input_params, logger, allow_empty_tensor_as_input_)) { return false; } } diff --git a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h index 4a23640d0f34c..071008520fbdc 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h +++ b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h @@ -28,6 +28,10 @@ class BaseOpBuilder : public IOpBuilder { void AddInitializersToSkip(ModelBuilder& /*model_builder*/, const Node& /*node*/) const override {} protected: + explicit BaseOpBuilder(bool allow_empty_tensor_as_input = false) + : 
allow_empty_tensor_as_input_(allow_empty_tensor_as_input) { + } + // currently we only support float static bool IsInputFloat(const Node& node, size_t idx, const OpBuilderInputParams& input_params, const logging::Logger& logger); @@ -50,6 +54,8 @@ class BaseOpBuilder : public IOpBuilder { virtual Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const = 0; + + const bool allow_empty_tensor_as_input_; // some operators can handle ignoring an empty tensor as input }; } // namespace coreml diff --git a/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc index 3400f09b4056f..65b5c17f2c6a6 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc @@ -1,13 +1,15 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include #include "core/framework/tensorprotoutils.h" #include "core/optimizer/initializer.h" #include "core/providers/common.h" +#include "core/providers/utils.h" #include "core/providers/coreml/builders/helper.h" #include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" #include "core/providers/coreml/builders/model_builder.h" #include "core/providers/coreml/builders/op_builder_factory.h" #include "core/providers/coreml/shape_utils.h" @@ -18,6 +20,11 @@ namespace onnxruntime { namespace coreml { class ResizeOpBuilder : public BaseOpBuilder { + public: + // allow roi and scales potentially being empty inputs that are ignored during processing + ResizeOpBuilder() : BaseOpBuilder(/*allow empty inputs*/ true) {} + + private: void AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const override; Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, @@ -29,196 +36,382 @@ class ResizeOpBuilder : public BaseOpBuilder { // Resize opset 10- is very different than Resize opset 11+, with many key attributes missing // We only support Resize opset 11+ here int GetMinSupportedOpSet(const Node& /* node */) const override { return 11; } + + bool SupportsMLProgram() const override { return true; } }; namespace { -bool GetResizeScales(const InitializedTensorSet& initializers, - const Node& node, std::vector& scales, - const logging::Logger&) { +std::vector GetAxes(const NodeAttrHelper& helper, size_t input_rank) { + auto axes = helper.Get("axes", std::vector{}); + if (axes.empty()) { + axes.resize(input_rank); + std::iota(axes.begin(), axes.end(), 0); + } else { + for (auto& value : axes) { + if (value < 0) { + value = HandleNegativeAxis(value, input_rank); + } + } + } + + return axes; +} + +bool GetValidatedResizeScales(const GraphViewer& graph_viewer, + const Node& node, + const std::vector& input_shape, + const std::vector& axes, + std::vector& scales, + const logging::Logger& logger) { const auto& input_defs = node.InputDefs(); - if (input_defs.size() < 3) + int64_t input_rank = input_shape.size(); + + if (input_shape[input_rank - 2] == -1 || input_shape[input_rank - 1] == -1) { + LOGS(logger, VERBOSE) << "Resize with 'scales' requires the H and W dimensions to have fixed values"; return false; + } - const auto& scales_tensor = *initializers.at(input_defs[2]->Name()); - if (scales_tensor.dims_size() != 1 || scales_tensor.dims()[0] != 4) + const auto* scales_tensor = 
graph_viewer.GetConstantInitializer(input_defs[2]->Name()); + if (!scales_tensor) { + LOGS(logger, VERBOSE) << "Resize 'scales' input must be a constant initializer"; return false; - Initializer unpacked_tensor(scales_tensor); + } + + Initializer unpacked_tensor(*scales_tensor); auto scales_data = unpacked_tensor.DataAsSpan(); - scales = std::vector{scales_data.begin(), scales_data.end()}; + scales.assign(scales_data.begin(), scales_data.end()); + + for (size_t idx = 0, end = axes.size(); idx < end; ++idx) { + auto axis = axes[idx]; + auto scale = scales[idx]; + if (axis < (input_rank - 2) && scale != 1.0f) { + LOGS(logger, VERBOSE) << "Resize only supports resizing the last two axes. Scale of axis " << axis << " is " + << scale; + return false; + } + } + return true; } -bool GetResizeOutputSizes(const InitializedTensorSet& initializers, - const Node& node, std::vector& sizes, - const logging::Logger&) { +bool GetValidatedResizeSizes(const GraphViewer& graph_viewer, + const Node& node, + const std::vector& input_shape, + const std::vector& axes, + std::vector& sizes, const logging::Logger& logger) { const auto& input_defs = node.InputDefs(); - if (input_defs.size() < 4) - return false; + int64_t input_rank = input_shape.size(); - const auto& sizes_tensor = *initializers.at(input_defs[3]->Name()); - if (sizes_tensor.dims_size() != 1 || sizes_tensor.dims()[0] != 4) + const auto* sizes_tensor = graph_viewer.GetConstantInitializer(input_defs[3]->Name()); + if (!sizes_tensor) { + LOGS(logger, VERBOSE) << "Resize 'sizes' input must be a constant initializer"; return false; - Initializer unpacked_tensor(sizes_tensor); + } + + Initializer unpacked_tensor(*sizes_tensor); auto sizes_data = unpacked_tensor.DataAsSpan(); - sizes = std::vector(sizes_data.begin(), sizes_data.end()); + sizes.assign(sizes_data.begin(), sizes_data.end()); + + for (size_t idx = 0, end = axes.size(); idx < end; ++idx) { + auto axis = axes[idx]; + auto cur_size = input_shape[idx]; + auto new_size = sizes[idx]; + if (axis < (input_rank - 2) && cur_size != new_size) { + LOGS(logger, VERBOSE) << "Resize only supports resizing the last two axes. Input rank: " << input_rank + << " Change to size of axis " << axis << " from " << cur_size << " to " << new_size; + return false; + } + } + return true; } } // namespace void ResizeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const { - // We don't really use ROI here, so add it to skipped list if it's an initializer tensor - model_builder.AddInitializerToSkip(node.InputDefs()[1]->Name()); // ROI - model_builder.AddInputToSkip(node.InputDefs()[1]->Name()); // ROI - - // We will still add scales to the skipped list even sizes are present - // since there is no use of it, we will not process it later - model_builder.AddInitializerToSkip(node.InputDefs()[2]->Name()); // scales - model_builder.AddInputToSkip(node.InputDefs()[2]->Name()); // scales - - if (node.InputDefs().size() > 3) { - model_builder.AddInitializerToSkip(node.InputDefs()[3]->Name()); // sizes - model_builder.AddInputToSkip(node.InputDefs()[3]->Name()); // sizes + const auto& input_defs = node.InputDefs(); + + // In Resize-11 both roi and scales were required even if you were using sizes. + // https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Resize-11 + // From Resize-13 on they're all optional. + // + // We don't support roi so would never take a node with meaningful roi input. 
The roi input can however be provided + // and is ignored unless coordinate_transformation_mode is set to 'tf_crop_and_resize'. + // e.g. our unit tests tend to always provide an empty tensor as roi input instead of as a missing optional input. + // Due to this we always call AddInputToSkip on the roi input. + // + // We require the sizes or scales input to be a constant initializers to take the node (i.e. they won't be an input + // to the CoreML model for the partition, so calling AddInputToSkip isn't relevant). + // Individual values from scales and sizes are added directly to the layer, so we won't use the initializer. + // + // That leaves an edge case for Resize-11 where scales could have been provided as an empty input tensor but + // we're using a constant initializer for sizes. In this case AddInputToSkip needs to be called for the scales input. + + model_builder.AddInitializerToSkip(input_defs[1]->Name()); // roi + model_builder.AddInputToSkip(input_defs[1]->Name()); + + if (input_defs[2]->Exists()) { + model_builder.AddInitializerToSkip(input_defs[2]->Name()); // scales + } + + if (input_defs.size() > 3 && input_defs[3]->Exists()) { + model_builder.AddInitializerToSkip(input_defs[3]->Name()); // sizes + + if (node.SinceVersion() < 13) { + model_builder.AddInputToSkip(input_defs[2]->Name()); // skip the unused scales input + } } } -Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, - const Node& node, +Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const { - std::unique_ptr layer = model_builder.CreateNNLayer(node); + const auto input_defs = node.InputDefs(); + const auto output_defs = node.OutputDefs(); + const auto& graph_viewer = model_builder.GetGraphViewer(); + + std::vector input_shape; + ORT_RETURN_IF_NOT(GetShape(*input_defs[0], input_shape, logger), "Error getting input shape"); + size_t input_rank = input_shape.size(); + + // we know we have either a scales or sizes input so this is safe. + // check for sizes first. this handles Resize-11 where scales was a required input but sizes were used if provided. + bool using_sizes = input_defs.size() >= 4 && input_defs[3]->Exists(); + bool using_scales = !using_sizes; - auto* coreml_upsample = layer->mutable_upsample(); NodeAttrHelper helper(node); - const auto mode = helper.Get("mode", "nearest"); - if (mode == "linear") { - coreml_upsample->set_mode(COREML_SPEC::UpsampleLayerParams_InterpolationMode_BILINEAR); - } else { // we already checked the mode must be NN or Bilinear in IsOpSupportedImpl - coreml_upsample->set_mode(COREML_SPEC::UpsampleLayerParams_InterpolationMode_NN); + const auto& mode = helper.Get("mode", "nearest"); + bool is_nearest = mode == "nearest"; + bool is_linear = !is_nearest; + + auto axes = GetAxes(helper, input_rank); + std::vector output_scales; + std::vector output_sizes; + size_t num_scales = 0; + size_t num_sizes = 0; + + if (using_scales) { + ORT_RETURN_IF_NOT(GetValidatedResizeScales(graph_viewer, node, input_shape, axes, output_scales, logger), + "Error getting validated scales"); + num_scales = output_scales.size(); + + // special case linear downsample. + // the CoreML implementation seems to be flaky and gives different outputs on different OS versions. + // use bilinear_resize instead. we check in IsOpSupportedImpl that the downsample input is evenly + // divisible by the output size so there's no rounding involved. 
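
For example, with an input of shape {1, 3, 8, 8} and scales {1, 1, 0.5, 0.5}, the code below switches from scales to sizes and computes output_sizes = {1, 3, 4, 4}, so the ML Program path emits resize_bilinear with target_size_height and target_size_width of 4 rather than upsample_bilinear with fractional scale factors. A downsample such as scale 0.3 on a dimension of 8 never reaches this point, because IsOpSupportedImpl rejects scales whose reciprocal is not an integer factor of the input dimension (see IsScalingByAFactorOfN further down in this patch).
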
+ if (is_linear && (output_scales[num_scales - 1] < 1.f || output_scales[num_scales - 2] < 1.f)) { + using_scales = false; + using_sizes = true; + num_sizes = num_scales; + output_sizes = input_shape; + // only the last two dims have their size changed + output_sizes[input_rank - 2] = static_cast(input_shape[input_rank - 2] * output_scales[num_scales - 2]); + output_sizes[input_rank - 1] = static_cast(input_shape[input_rank - 1] * output_scales[num_scales - 1]); + } + } else { + ORT_RETURN_IF_NOT(GetValidatedResizeSizes(graph_viewer, node, input_shape, axes, output_sizes, logger), + "Error getting validated sizes"); + num_sizes = output_sizes.size(); } - const auto& input_defs = node.InputDefs(); - const auto& initializers(model_builder.GetInitializerTensors()); - - if (input_defs.size() >= 3 && input_defs[2]->Exists()) { // use scales - std::vector scales; - ORT_RETURN_IF_NOT(GetResizeScales(initializers, node, scales, logger), "Error getting resize scales"); - coreml_upsample->add_scalingfactor(static_cast(scales[2])); - coreml_upsample->add_scalingfactor(static_cast(scales[3])); - } else { // we already checked number of inputs in IsOpSupportedImpl - std::vector input_shape; - ORT_RETURN_IF_NOT(GetStaticShape(*input_defs[0], input_shape, logger), "Error getting input shape"); - std::vector output_sizes; - ORT_RETURN_IF_NOT(GetResizeOutputSizes(initializers, node, output_sizes, logger), - "Error getting resize output_sizes"); - coreml_upsample->add_scalingfactor(static_cast(output_sizes[2] / input_shape[2])); - coreml_upsample->add_scalingfactor(static_cast(output_sizes[3] / input_shape[3])); - } +#if defined(COREML_ENABLE_MLPROGRAM) + if (model_builder.CreateMLProgram()) { + using namespace CoreML::Specification::MILSpec; // NOLINT + + std::string_view coreml_op_type; + if (using_scales) { + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.image_resizing.upsample_bilinear + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.image_resizing.upsample_nearest_neighbor + coreml_op_type = is_linear ? "upsample_bilinear" : "upsample_nearest_neighbor"; + } else { + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.image_resizing.resize_bilinear + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.image_resizing.resize_nearest_neighbor + coreml_op_type = is_linear ? 
"resize_bilinear" : "resize_nearest_neighbor"; + } + + std::unique_ptr op = model_builder.CreateOperation(node, coreml_op_type); + AddOperationInput(*op, "x", input_defs[0]->Name()); + + std::string coord_trans_mode = helper.Get("coordinate_transformation_mode", "half_pixel"); + + if (using_scales) { + float scale_height = output_scales[num_scales - 2]; + float scale_width = output_scales[num_scales - 1]; + AddOperationInput(*op, "scale_factor_height", + model_builder.AddScalarConstant(coreml_op_type, "scale_factor_height", scale_height)); + AddOperationInput(*op, "scale_factor_width", + model_builder.AddScalarConstant(coreml_op_type, "scale_factor_width", scale_width)); + + if (is_linear) { + // we only allow these coord modes in the 'is supported' check, + // - half_pixel or pytorch_half_pixel with output size > 1 -> align_corners = false + // - align_corners -> align_corners = true + bool align_corners = coord_trans_mode == "align_corners"; + AddOperationInput(*op, "align_corners", + model_builder.AddScalarConstant(coreml_op_type, "align_corners", align_corners)); + } + } else { + assert(using_sizes); + int64_t target_height = output_sizes[num_sizes - 2]; + int64_t target_width = output_sizes[num_sizes - 1]; + + AddOperationInput(*op, "target_size_height", + model_builder.AddScalarConstant(coreml_op_type, "target_size_height", target_height)); + AddOperationInput(*op, "target_size_width", + model_builder.AddScalarConstant(coreml_op_type, "target_size_width", target_width)); + + if (is_linear) { + // we only allow these coord modes in the 'is supported' check, + // - half_pixel or pytorch_half_pixel with output size > 1 -> UNALIGN_CORNERS + // - align_corners -> STRICT_ALIGN_CORNERS + // - asymmetric -> DEFAULT + std::string sampling_mode_value; + if (coord_trans_mode == "asymmetric") { + sampling_mode_value = "DEFAULT"; + } else if (coord_trans_mode == "align_corners") { + sampling_mode_value = "STRICT_ALIGN_CORNERS"; + } else { + sampling_mode_value = "UNALIGN_CORNERS"; + } + + AddOperationInput(*op, "sampling_mode", + model_builder.AddScalarConstant(coreml_op_type, "sampling_mode", sampling_mode_value)); + } + } - *layer->mutable_input()->Add() = input_defs[0]->Name(); - *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); + AddOperationOutput(*op, *output_defs[0]); + model_builder.AddOperation(std::move(op)); + } else // NOLINT +#endif + { + std::unique_ptr layer = model_builder.CreateNNLayer(node); + + auto* coreml_upsample = layer->mutable_upsample(); + + // we already checked the mode must be NN or Bilinear in IsOpSupportedImpl + if (is_linear) { + coreml_upsample->set_mode(COREML_SPEC::UpsampleLayerParams_InterpolationMode_BILINEAR); + } else { + coreml_upsample->set_mode(COREML_SPEC::UpsampleLayerParams_InterpolationMode_NN); + } + + if (using_scales) { + coreml_upsample->add_scalingfactor(static_cast(output_scales[num_scales - 2])); + coreml_upsample->add_scalingfactor(static_cast(output_scales[num_scales - 1])); + } else { + auto scale_height = output_sizes[num_sizes - 2] / input_shape[input_rank - 2]; + auto scale_width = output_sizes[num_sizes - 1] / input_shape[input_rank - 1]; + coreml_upsample->add_scalingfactor(static_cast(scale_height)); + coreml_upsample->add_scalingfactor(static_cast(scale_width)); + } + + *layer->mutable_input()->Add() = input_defs[0]->Name(); + *layer->mutable_output()->Add() = output_defs[0]->Name(); + + model_builder.AddLayer(std::move(layer)); + } - model_builder.AddLayer(std::move(layer)); return Status::OK(); } bool 
ResizeOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); - const auto& initializers = input_params.graph_viewer.GetAllInitializedTensors(); std::vector input_shape; - if (!GetShape(*input_defs[0], input_shape, logger)) + if (!GetShape(*input_defs[0], input_shape, logger)) { + LOGS(logger, VERBOSE) << "Resize: input shape was not known"; return false; + } - const auto input_size = input_shape.size(); - if (input_size != 4) { - LOGS(logger, VERBOSE) << "Resize only support 4d shape, input is " - << input_size << "d shape"; + // as we allow empty shapes in the checks done by BaseOpBuilder::HasSupportedInputs we explicitly check for an empty + // an empty input here to be consistent. + // this should never happen in a real model though as a dim with value 0 (i.e. no input data) would typically be a + // dynamic dimension where a previous step had no output (e.g. Loop of zero interations, NonZero with no matches, + // NonMaxSupression with no boxes). + if (DoesShapeSpecifyZeroElements(input_shape)) { + LOGS(logger, VERBOSE) << "Resize input shape has with dimension values of 0 which is not supported."; return false; } - { // check attributes - NodeAttrHelper helper(node); - const auto mode = helper.Get("mode", "nearest"); - bool is_linear_resize = mode == "linear"; - bool is_nearest_resize = mode == "nearest"; - if (!is_linear_resize && !is_nearest_resize) { - LOGS(logger, VERBOSE) << "Resize unsupported input mode, " << mode; + const auto input_rank = input_shape.size(); + if (input_params.create_mlprogram) { + if (input_rank < 3 || input_rank > 5) { + LOGS(logger, VERBOSE) << "Resize only supports 3D to 5D input. Got: " << input_rank << "D"; return false; } - - const auto exclude_outside = helper.Get("exclude_outside", 0); - if (exclude_outside != 0) { - LOGS(logger, VERBOSE) << "Resize does not support exclude_outside for now"; + } else { + if (input_rank != 4) { + LOGS(logger, VERBOSE) << "Resize only support 4d shape. 
Got: " << input_rank << "D"; return false; } + } - const auto coord_trans_mode = helper.Get("coordinate_transformation_mode", "half_pixel"); - bool using_asymmetric = coord_trans_mode == "asymmetric"; - if (is_linear_resize) { - // TODO, add support of align_corners and half_pixel - if (!using_asymmetric) { - LOGS(logger, VERBOSE) << "Resize bilinear, unsupported coord_trans_mode, " << coord_trans_mode; - return false; - } - } else { - // nearest neighbor resizing - // For resize using nearest neighbor, we only support coord_trans_mode == "asymmetric" && nearest_mode == "floor" - if (!using_asymmetric) { - LOGS(logger, VERBOSE) << "Resize nearest neighbor, unsupported coord_trans_mode, " << coord_trans_mode; - return false; - } + // check attributes + NodeAttrHelper helper(node); - const auto nearest_mode = helper.Get("nearest_mode", "round_prefer_floor"); - if (nearest_mode != "floor") { - LOGS(logger, VERBOSE) << "Resize nearest neighbor, unsupported nearest_mode, " << nearest_mode; - return false; - } - } + if (helper.Get("antialias", 0) != 0) { + LOGS(logger, VERBOSE) << "Resize does not support antialias"; + return false; } - { // scales and sizes (if present) must be initializers - if (input_defs.size() < 3) { - LOGS(logger, VERBOSE) << "Input scales or sizes of Resize must be known"; - return false; - } + const auto& mode = helper.Get("mode", "nearest"); + bool is_linear = mode == "linear"; + bool is_nearest = mode == "nearest"; + if (!is_linear && !is_nearest) { + LOGS(logger, VERBOSE) << "Resize unsupported input mode: " << mode; + return false; + } - bool using_scales = input_defs.size() >= 3 && input_defs[2]->Exists(); - // scales - if (using_scales && !input_params.graph_viewer.GetConstantInitializer(input_defs[2]->Name())) { - LOGS(logger, VERBOSE) << "scales input of Resize must be a constant initializer"; + if (is_nearest) { + const auto nearest_mode = helper.Get("nearest_mode", "round_prefer_floor"); + if (nearest_mode != "floor") { + LOGS(logger, VERBOSE) << "Resize only supports 'floor' nearest_mode. Got: " << nearest_mode; return false; } + } - // sizes - if (!using_scales && - (input_defs.size() < 4 || - !input_defs[3]->Exists() || - !input_params.graph_viewer.GetConstantInitializer(input_defs[3]->Name()))) { - LOGS(logger, VERBOSE) << "sizes input of Resize must be a constant initializer"; - return false; - } + if (helper.Get("exclude_outside", 0) != 0) { + LOGS(logger, VERBOSE) << "Resize does not support 'exclude_outside'"; + return false; + } - // We want to check if the scales or sizes are not trying to resize on N/C channels here - if (using_scales) { - std::vector scales; - if (!GetResizeScales(initializers, node, scales, logger)) - return false; + const auto keep_aspect_ratio_policy = helper.Get("keep_aspect_ratio_policy", "stretch"); + if (keep_aspect_ratio_policy != "stretch") { + LOGS(logger, VERBOSE) << "Resize only supports keep_aspect_ratio_policy of 'stretch'. Got " + << keep_aspect_ratio_policy; + return false; + } - float scale_n = scales[0]; - float scale_c = scales[1]; - if (scale_n != 1.0f || scale_c != 1.0f) { - LOGS(logger, VERBOSE) << "Scales of N/C channel should be 1" - << "Resize of N/C channels are not supported" - << ", scale_n, " << scale_n << ", scale_c, " << scale_c; - return false; - } + // check for sizes first. this handles Resize-11 where scales was a required input but sizes were used if provided. 
+ bool using_sizes = input_defs.size() >= 4 && input_defs[3]->Exists(); + bool using_scales = !using_sizes && input_defs.size() >= 3 && input_defs[2]->Exists(); - // For now we only support upscale, so the scale_h and scale_w should be an integer >= 1 - // TODO support ResizeBilinear - float scale_h = scales[2]; - float scale_w = scales[3]; + if (!using_scales && !using_sizes) { + LOGS(logger, VERBOSE) << "Resize requires 'scales' or 'sizes' input"; + return false; + } + + // 'axes' is from opset 18 on and allows scales or sizes to have entries for the subset of axes. + // we fill with default values if necessary so that the processing is consistent across all supported opsets. + auto axes = GetAxes(helper, input_rank); + std::vector output_scales; + std::vector output_sizes; + + // make sure scales/sizes are constant initializers, and are only modifying the last two dimensions of the input. + if (using_scales) { + if (!GetValidatedResizeScales(input_params.graph_viewer, node, input_shape, axes, output_scales, logger)) { + return false; + } - // Onnx spec requires scale to be a positive float, so we are not checking that here + size_t num_scales = output_scales.size(); + float scale_h = output_scales[num_scales - 2]; + float scale_w = output_scales[num_scales - 1]; + + // NeuralNetwork supports upsample only with round numbers. + // + // ML Program results seem to match if round numbers are involved. When downsampling the scaling value should be + // 1 / . e.g. if input size is 8, scaling factor could be 1/8, 1/4 or 1/2. + if (scale_h >= 1.f && scale_w >= 1.f) { + // upsample (or no-op with both == 1.f that we won't bother special-casing) if (roundf(scale_h) != scale_h) { LOGS(logger, VERBOSE) << "Resize: scale_h: " << scale_h << " is not a whole number"; return false; @@ -228,33 +421,57 @@ bool ResizeOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPa LOGS(logger, VERBOSE) << "Resize: scale_w: " << scale_w << " is not a whole number"; return false; } - } else { - // we are using sizes - std::vector output_sizes; - if (!GetResizeOutputSizes(initializers, node, output_sizes, logger)) - return false; - - if (!IsStaticShape(input_shape)) { - LOGS(logger, VERBOSE) << "Input shape with dynamic dimensions is not supported."; + } else if (scale_h <= 1.f && scale_w <= 1.f) { + // downsample + if (input_params.create_mlprogram) { + auto h_in = input_shape[input_rank - 2]; + auto w_in = input_shape[input_rank - 1]; + + if (!utils::IsScalingByAFactorOfN(h_in, scale_h)) { + LOGS(logger, VERBOSE) << "Resize: downsampling scale " << scale_h + << " is not a factor of input height: " << h_in; + return false; + } + + if (!utils::IsScalingByAFactorOfN(w_in, scale_w)) { + LOGS(logger, VERBOSE) << "Resize: downsampling scale " << scale_w + << " is not a factor of input width: " << w_in; + return false; + } + + } else { + LOGS(logger, VERBOSE) << "Resize: downsampling is not supported."; return false; } + } else { + LOGS(logger, VERBOSE) << "Resize: scale_h: " << scale_h << " and scale_w: " << scale_w + << " must both be >= 1 or <= 1"; + return false; + } + } else { + assert(using_sizes); + + if (!GetValidatedResizeSizes(input_params.graph_viewer, node, input_shape, axes, output_sizes, logger)) { + return false; + } - auto output_size_n = output_sizes[0]; - auto output_size_c = output_sizes[1]; - if (output_size_n != input_shape[0] || output_size_c != input_shape[1]) { - LOGS(logger, VERBOSE) << "Output sizes of N/C channel should match the input sizes, " - << "Resize of N/C channels are not 
supported" - << ", input_size_n, " << input_shape[0] << ", output_size_n, " << output_size_n - << ". input_size_c, " << input_shape[1] << ", output_size_c, " << output_size_c; + if (input_params.create_mlprogram) { + // no additional requirements + } else { + if (!IsStaticShape(input_shape)) { + // need to convert from sizes to scales when creating the NN layer, so the input H and W are required + LOGS(logger, VERBOSE) << "Resize input shape with dynamic dimensions is not supported."; return false; } - // For now we only support upscale, so the output_size_h and output_size_w should be an integer >= 1 + // For now we only support upsample, so the output_size_h and output_size_w should be an integer >= 1 // TODO support ResizeBilinear - auto output_size_h = output_sizes[2]; - auto output_size_w = output_sizes[3]; - auto input_size_h = input_shape[2]; - auto input_size_w = input_shape[3]; + auto input_size_h = input_shape[input_rank - 2]; + auto input_size_w = input_shape[input_rank - 1]; + + auto num_sizes = output_sizes.size(); // could be smaller than input_rank if axes was used + auto output_size_h = output_sizes[num_sizes - 2]; + auto output_size_w = output_sizes[num_sizes - 1]; // Onnx spec requires output sizes to be a positive integer, so we are not checking that here if (output_size_h % input_size_h != 0) { @@ -271,6 +488,92 @@ bool ResizeOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPa } } + std::string coord_trans_mode = helper.Get("coordinate_transformation_mode", "half_pixel"); + bool using_asymmetric = coord_trans_mode == "asymmetric"; + + if (input_params.create_mlprogram) { + if (is_nearest) { + // Potential CoreML operators we could map to: + // + // image_resizing.upsample_nearest_neighbor + // - mode: nearest + // - coordinate_transformation_mode: asymmetric + // - 'scales' input + // + // image_resizing.resize_nearest_neighbor + // - mode: nearest + // - coordinate_transformation_mode: asymmetric + // - 'sizes' input + if (!using_asymmetric) { + LOGS(logger, VERBOSE) << "Resize with 'mode' of 'nearest' requires 'coordinate_transformation_mode' of " + "'asymmetric' . 
Got: " + << coord_trans_mode; + return false; + } + } else { + assert(is_linear); + // Potential CoreML operators we could map to: + // + // image_resizing.upsample_bilinear + // - mode: linear + // - 'scales' input + // - coordinate_transformation_mode + // - half_pixel -> align_corners = false + // - align_corners -> align_corners = true + // + // image_resizing.resize_bilinear + // - mode: linear + // - 'sizes' input + // - coordinate_transformation_mode -> sampling_mode + // - half_pixel -> UNALIGN_CORNERS + // - align_corners -> STRICT_ALIGN_CORNERS + // - asymmetric -> DEFAULT + // + + // if output size != 1, coordinate_transformation_mode of pytorch_half_pixel is the same as half_pixel + if (coord_trans_mode == "pytorch_half_pixel") { + int64_t h_out{0}, w_out{0}; + if (using_scales) { + size_t num_scales = output_scales.size(); + h_out = std::llround(input_shape[input_rank - 2] * output_scales[num_scales - 2]); + w_out = std::llround(input_shape[input_rank - 1] * output_scales[num_scales - 1]); + } else { + size_t num_sizes = output_sizes.size(); + h_out = output_sizes[num_sizes - 2]; + w_out = output_sizes[num_sizes - 1]; + } + + if (h_out > 1 && w_out > 1) { + coord_trans_mode = "half_pixel"; + } + } + + if (coord_trans_mode == "half_pixel" || + coord_trans_mode == "align_corners" || + (using_sizes && coord_trans_mode == "asymmetric")) { + // supported + + // FWIW we could calculate (if shape inferencing didn't already) the output sizes and convert a node with + // `scales` and co-ord mode of `asymmetric` to having `sizes` input so it's supported. + } else { + LOGS(logger, VERBOSE) << "Resize with 'mode' of 'linear' requires 'coordinate_transformation_mode' of " + "'half_pixel', or 'align_corners', or 'asymmetric' with 'sizes' input. Got: " + << coord_trans_mode; + + return false; + } + } + } else { + // NeuralNetwork checks + if (!using_asymmetric) { + // align_corners and half_pixel could be supported in ResizeBilinear but as NeuralNetwork is deprecated + // there's no known value to adding that. + LOGS(logger, VERBOSE) << "Resize only supports 'asymmetric' coordinate_transformation_mode. Got: " + << coord_trans_mode; + return false; + } + } + return true; } diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.h b/onnxruntime/core/providers/coreml/builders/model_builder.h index 8f85ab2c09e7c..385588dbfdcb8 100644 --- a/onnxruntime/core/providers/coreml/builders/model_builder.h +++ b/onnxruntime/core/providers/coreml/builders/model_builder.h @@ -141,8 +141,17 @@ class ModelBuilder { // so we don't do a copy of the original initializer into the model. void AddInitializerToSkip(const std::string& tensor_name); - // There are some input which will not be used, add it to a list which will not - // be added to CoreML model, since CoreML does not like input unused + /// + /// Skip a non-initializer value, that is not used in the CoreML model, but was an input to a supported node. + /// + /// This is for a rare edge case where a value is an input to a node but is empty/unused, as the + /// CoreML model requires all model inputs to be consumed. + /// + /// + /// The only known use case for this currently is Resize, and that is largely due to how the unit tests are + /// setup rather than something you'd expect to see in a real model. + /// See ResizeOpBuilder::AddInitializersToSkip for more details. 
+ /// void AddInputToSkip(const std::string& input_name); const std::string& GetUniqueName(const std::string& base_name); diff --git a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc index 0ba715cc7c6d9..a92fef81ac395 100644 --- a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc +++ b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc @@ -27,6 +27,7 @@ CoreMLExecutionProvider::CoreMLExecutionProvider(uint32_t coreml_flags) : IExecutionProvider{onnxruntime::kCoreMLExecutionProvider}, coreml_flags_(coreml_flags), coreml_version_(coreml::util::CoreMLVersion()) { + LOGS_DEFAULT(VERBOSE) << "CoreML version: " << coreml_version_; if (coreml_version_ < MINIMUM_COREML_VERSION) { LOGS_DEFAULT(ERROR) << "CoreML EP is not supported on this platform."; } diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc index d75b9cc72ff4b..ef27f6c942f44 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc @@ -9,6 +9,7 @@ #include "core/graph/graph_viewer.h" #include "core/optimizer/initializer.h" #include "core/providers/common.h" +#include "core/providers/utils.h" #include "core/providers/shared/utils/utils.h" #include "core/providers/nnapi/nnapi_builtin/builders/helper.h" #include "core/providers/nnapi/nnapi_builtin/builders/model_builder.h" @@ -251,14 +252,34 @@ bool ResizeOpBuilder::IsOpSupportedImpl(const GraphViewer& graph_viewer, const N const Initializer unpacked_tensor(*scales); auto scales_data = unpacked_tensor.DataAsSpan(); input_is_nchw = scales_data[1] == 1.0F; - float const scale_n = scales_data[0]; - float const scale_c = input_is_nchw ? scales_data[1] : scales_data[3]; + const float scale_n = scales_data[0]; + const float scale_c = input_is_nchw ? scales_data[1] : scales_data[3]; + const float scale_h = input_is_nchw ? scales_data[2] : scales_data[1]; + const float scale_w = input_is_nchw ? scales_data[3] : scales_data[2]; + if (scale_n != 1.0f || scale_c != 1.0f) { LOGS_DEFAULT(VERBOSE) << "Scales of N/C channel should be 1" << "Resize of N/C channels are not supported" << ", scale_n, " << scale_n << ", scale_c, " << scale_c; return false; } + + // if downsampling the input size must be evenly divisible by the output size to match the onnx output + if (scale_h < 1.0f || scale_w < 1.0f) { + // we also require input_shape to be known to check + auto h_in = input_is_nchw ? input_shape[2] : input_shape[1]; + auto w_in = input_is_nchw ? 
input_shape[3] : input_shape[2]; + if (h_in == 0 || w_in == 0) { + LOGS_DEFAULT(VERBOSE) << "Input H and W must be known to downsample with scales"; + return false; + } + + if (!utils::IsScalingByAFactorOfN(h_in, scale_h) || + !utils::IsScalingByAFactorOfN(w_in, scale_w)) { + LOGS_DEFAULT(VERBOSE) << "Input size must be evenly divisible by output size when downsampling"; + return false; + } + } } else { const auto* sizes = graph_viewer.GetConstantInitializer(inputs[3].node_arg.Name()); if (!sizes) { diff --git a/onnxruntime/core/providers/utils.cc b/onnxruntime/core/providers/utils.cc index b2f9d265ca053..747b09e42aa21 100644 --- a/onnxruntime/core/providers/utils.cc +++ b/onnxruntime/core/providers/utils.cc @@ -23,5 +23,21 @@ common::Status OutputOptionalWithoutDataHelper(const ONNX_NAMESPACE::TypeProto& return Status::OK(); } #endif + +bool IsScalingByAFactorOfN(int64_t n, float scale) { + bool is_factor = false; + if (scale > 0.f && scale < 1.f) { + const double factor = 1.0 / scale; + const double factor_rounded = std::round(factor); + constexpr double epsilon = 1.0e-4; // arbitrarily small enough + if (std::abs(factor - factor_rounded) < epsilon) { + // result is integer. check if a factor of n + const int64_t factor_i = static_cast(factor_rounded); + is_factor = n % factor_i == 0; + } + } + + return is_factor; +} } // namespace utils } // namespace onnxruntime diff --git a/onnxruntime/core/providers/utils.h b/onnxruntime/core/providers/utils.h index 8cafdb8c05cc3..9ea8496a02f85 100644 --- a/onnxruntime/core/providers/utils.h +++ b/onnxruntime/core/providers/utils.h @@ -15,5 +15,10 @@ common::Status OutputOptionalWithoutDataHelper(const ONNX_NAMESPACE::TypeProto& OpKernelContext* context, int output_index); #endif +/// +/// Check if the reciprocal of 'scale' is a factor of 'n'. +/// e.g. a scale of 0.5 is 1/2, the reciprocal is 2, and 2 is a factor of any even number. 
+/// +bool IsScalingByAFactorOfN(int64_t n, float scale); } // namespace utils } // namespace onnxruntime diff --git a/onnxruntime/core/providers/xnnpack/tensor/resize.cc b/onnxruntime/core/providers/xnnpack/tensor/resize.cc index 09666c8039402..c752b5f849808 100644 --- a/onnxruntime/core/providers/xnnpack/tensor/resize.cc +++ b/onnxruntime/core/providers/xnnpack/tensor/resize.cc @@ -11,6 +11,7 @@ #include "core/framework/op_kernel.h" #include "core/optimizer/initializer.h" #include "core/providers/xnnpack/xnnpack_init.h" +#include "core/providers/utils.h" namespace onnxruntime { namespace xnnpack { @@ -68,9 +69,27 @@ bool Resize::IsOnnxNodeSupported(const NodeUnit& node_unit, InlinedVector scale(4, 1.0F); if (scale_tensor) { const Initializer scale_val(*scale_tensor, node_unit.ModelPath()); - if (scale_val.DataAsSpan()[1] != 1.0F) { + const auto scales = scale_val.DataAsSpan(); + if (scales[1] != 1.0F) { break; } + + // downsampling output seems to require the output size to be a factor of the input to match ONNX + if (scales[2] < 1.0f || scales[3] < 1.0f) { + // we also require input_shape to be known to check + int64_t h_in = x_shape->dim(2).dim_value(); + int64_t w_in = x_shape->dim(3).dim_value(); + if (h_in < 0 || w_in < 0) { + break; + } + + float scale_h = scales[2]; + float scale_w = scales[3]; + if (!utils::IsScalingByAFactorOfN(h_in, scale_h) || + !utils::IsScalingByAFactorOfN(w_in, scale_w)) { + break; + } + } } if (size_tensor) { diff --git a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc index 496f2213e9d32..111520ef03e26 100644 --- a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc @@ -227,28 +227,33 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_without_e } TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear) { - OpTester test("Resize", 13); - std::vector roi{}; - std::vector scales{1.0f, 1.0f, 0.6f, 0.6f}; + auto run_test = [](bool scales_in_initializer) { + OpTester test("Resize", 13); + std::vector roi{}; + std::vector scales{1.0f, 1.0f, 0.6f, 0.6f}; - test.AddAttribute("mode", "linear"); + test.AddAttribute("mode", "linear"); - constexpr int64_t N = 1, C = 1, H = 2, W = 4; - std::vector X = { - 1.0f, 2.0f, 3.0f, 4.0f, - 5.0f, 6.0f, 7.0f, 8.0f}; + constexpr int64_t N = 1, C = 1, H = 2, W = 4; + std::vector X = { + 1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f}; - test.AddInput("X", {N, C, H, W}, X); - test.AddInput("roi", {0}, roi); - test.AddInput("scales", {4}, scales); + test.AddInput("X", {N, C, H, W}, X); + test.AddInput("roi", {0}, roi); + test.AddInput("scales", {4}, scales, scales_in_initializer); - std::vector Y = {2.66666651f, 4.3333331f}; + std::vector Y = {2.66666651f, 4.3333331f}; - test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - // QNN: result diff - // TRT: Segmentation fault in A100 - std::unordered_set excluded_providers({kQnnExecutionProvider}); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100(excluded_providers)); + test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); + // QNN: result diff + // TRT: Segmentation fault in A100 + std::unordered_set excluded_providers({kQnnExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100(excluded_providers)); + }; + + run_test(false); + run_test(true); } TEST(ResizeOpTest, 
NhwcResizeOpLinearDownSampleTest_4DBilinear) { @@ -327,13 +332,14 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_int8) { // Since NNAPI(TFLite) only using the scale calculate using the input/output size // For the above test (ResizeOpLinearDownSampleTest_4DBilinear) // The output size is [1,1,2,4].*[1,1,0.6,0.6]=[1,1,1,2] -// NNAPI will recaluclate the scales as the output size divided by input size +// NNAPI will recalculate the scales as the output size divided by input size // scales = [1,1,1,2]./[1,1,2,4] = [1,1,0.5,0.5] // See:https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/kernels/internal/reference/reference_ops.h // So the result of the above example will be different than CPU EP -// Add the following 2 tests to test with scales valid to NNAPI +// Add the following 2 tests to test with scales valid to NNAPI. +// CoreML also doesn't handle a scale that doesn't divide the input size evenly. TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear1) { - // To test NNAPI EP, we need the sclaes/sizes to be in initializers + // To test NNAPI EP, we need the scales/sizes to be in initializers auto run_test = [](bool scales_in_initializer) { OpTester test("Resize", 13); std::vector roi{}; @@ -360,8 +366,38 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear1) { run_test(true); } +// Downsize with factor being an odd number (1/3) +TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear1_OddNumber) { + // To test NNAPI EP, we need the scales/sizes to be in initializers + auto run_test = [](bool scales_in_initializer) { + OpTester test("Resize", 13); + std::vector roi{}; + std::vector scales{1.0f, 1.0f, (1.f / 3), (1.f / 3)}; + + test.AddAttribute("mode", "linear"); + + constexpr int64_t N = 1, C = 1, H = 3, W = 6; + std::vector X = { + 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, + 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, + 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f}; + + test.AddInput("X", {N, C, H, W}, X); + test.AddInput("roi", {0}, roi); + test.AddInput("scales", {4}, scales, scales_in_initializer); + + std::vector Y = {8.f, 11.f}; + + test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); + }; + + run_test(false); + run_test(true); +} + TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear1_WithSizes) { - // To test NNAPI EP, we need the sclaes/sizes to be in initializers + // To test NNAPI EP, we need the scales/sizes to be in initializers auto run_test = [](bool scales_and_sizes_in_initializer) { OpTester test("Resize", 13); std::vector roi{}; @@ -389,8 +425,32 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear1_WithSizes) { run_test(true); } +// test handling for opset 11. 
scales input is provided but should be ignored in favor of sizes +TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear1_WithSizesOpset11) { + OpTester test("Resize", 11); + std::vector roi{}; + std::vector scales{}; + constexpr int64_t N = 1, C = 1, H = 2, W = 4; + std::vector sizes{N, C, 1, 2}; + test.AddAttribute("mode", "linear"); + + std::vector X = { + 1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f}; + + test.AddInput("X", {N, C, H, W}, X); + test.AddInput("roi", {0}, roi); + test.AddInput("scales", {0}, scales); + test.AddInput("sizes", {4}, sizes, true); // add as initializer so CoreML EP can take + + std::vector Y = {3.5f, 5.5f}; + + test.AddOutput("Y", sizes, Y); + test.Run(); +} + TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_align_corners) { - // To test NNAPI EP, we need the sclaes/sizes to be in initializers + // To test NNAPI EP, we need the scales/sizes to be in initializers auto run_test = [](bool scales_in_initializer) { OpTester test("Resize", 13); std::vector roi{}; @@ -416,15 +476,51 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_align_corners) { run_test(false); -#ifdef USE_NNAPI - // NNAPI will need the scales as an initializer +#if defined(USE_NNAPI) || defined(USE_COREML) + // NNAPI and CoreML need the scales as an initializer + // Also tensor RT EP will fail if scales is an initializer but will pass if it is not + run_test(true); +#endif +} + +TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_align_corners_sizes) { + // To test NNAPI EP, we need the scales/sizes to be in initializers + auto run_test = [](bool scales_in_initializer) { + OpTester test("Resize", 13); + std::vector roi{}; + std::vector scales{}; + std::vector sizes{1, 1, 1, 2}; + + test.AddAttribute("mode", "linear"); + test.AddAttribute("coordinate_transformation_mode", "align_corners"); + + constexpr int64_t N = 1, C = 1, H = 2, W = 4; + std::vector X = { + 1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f}; + + test.AddInput("X", {N, C, H, W}, X); + test.AddInput("roi", {0}, roi); + test.AddInput("", {0}, scales); + test.AddInput("sizes", {4}, sizes, scales_in_initializer); + + std::vector Y = {1.0f, 4.0f}; + + test.AddOutput("Y", {N, C, 1, 2}, Y); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); + }; + + run_test(false); + +#if defined(USE_NNAPI) || defined(USE_COREML) + // NNAPI and CoreML will need the scales as an initializer // Also tensor RT EP will fail if scales is an initializer but will pass if it is not run_test(true); #endif } TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_align_corners_uint8) { - // To test NNAPI EP, we need the sclaes/sizes to be in initializers + // To test NNAPI EP, we need the scales/sizes to be in initializers auto run_test = [](bool scales_in_initializer) { OpTester test("Resize", 13); std::vector roi{}; @@ -456,7 +552,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_align_corners_uin } TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_align_corners_int8) { - // To test NNAPI EP, we need the sclaes/sizes to be in initializers + // To test NNAPI EP, we need the scales/sizes to be in initializers auto run_test = [](bool scales_in_initializer) { OpTester test("Resize", 13); std::vector roi{}; @@ -622,7 +718,7 @@ TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_asymmetric_scales) { } TEST(ResizeOpTest, NhwcResizeOpLinearUpSampleTest_4DBilinear_asymmetric_uint8) { - // To test NNAPI EP, we need the sclaes/sizes to be in initializers + // To 
test NNAPI EP, we need the scales/sizes to be in initializers auto run_test = [](bool scales_in_initializer) { OpTester test("Resize", 13); std::vector roi{}; @@ -668,7 +764,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearUpSampleTest_4DBilinear_asymmetric_uint8) { } TEST(ResizeOpTest, NhwcResizeOpLinearUpSampleTest_4DBilinear_asymmetric_int8) { - // To test NNAPI EP, we need the sclaes/sizes to be in initializers + // To test NNAPI EP, we need the scales/sizes to be in initializers auto run_test = [](bool scales_in_initializer) { OpTester test("Resize", 13); std::vector roi{}; diff --git a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md index 1bbb933f66ba4..3b3790ba06599 100644 --- a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md +++ b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md @@ -17,6 +17,7 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution |ai.onnx:Pow|Only supports cases when both inputs are fp32.| |ai.onnx:Relu|| |ai.onnx:Reshape|| +|ai.onnx:Resize|See [resize_op_builder.cc](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc) implementation. There are too many permutations to describe the valid combinations.| |ai.onnx:Sub|| |ai.onnx:Sigmoid|| |ai:onnx:Tanh|| From 5bec52203d980d5925a8f806c7e7c922cf694636 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20M=C3=BCller?= <44298237+gedoensmax@users.noreply.github.com> Date: Sat, 20 Jul 2024 06:11:04 +0200 Subject: [PATCH 07/15] [TensorRT] Enable refitting an embedded engine when provided as byte stream (#21357) ### Description This allows refitting an engine using an ONNX file not available on disk. This is important for encrypted ONNX files on disk. --- .../tensorrt/tensorrt_provider_options.h | 7 ++ .../tensorrt/onnx_ctx_model_helper.cc | 24 +++++- .../tensorrt/onnx_ctx_model_helper.h | 6 ++ .../tensorrt/tensorrt_execution_provider.cc | 81 +++++++++++++----- .../tensorrt/tensorrt_execution_provider.h | 7 +- .../tensorrt_execution_provider_info.cc | 19 +++++ .../tensorrt_execution_provider_info.h | 2 + .../tensorrt/tensorrt_provider_factory.cc | 2 + .../core/session/provider_bridge_ort.cc | 4 + .../test/perftest/command_args_parser.cc | 6 +- onnxruntime/test/perftest/ort_test_session.cc | 18 +++- .../test/perftest/test_configuration.h | 1 + .../providers/tensorrt/tensorrt_basic_test.cc | 83 +++++++++++++++++-- 13 files changed, 225 insertions(+), 35 deletions(-) diff --git a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h index d008058821be3..816eaaf9bc71a 100644 --- a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h +++ b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h @@ -16,6 +16,7 @@ struct OrtTensorRTProviderOptionsV2 { int device_id{0}; // cuda device id. int has_user_compute_stream{0}; // indicator of user specified CUDA compute stream. void* user_compute_stream{nullptr}; // user specified CUDA compute stream. + // can be updated using: UpdateTensorRTProviderOptionsWithValue int trt_max_partition_iterations{1000}; // maximum iterations for TensorRT parser to get capability int trt_min_subgraph_size{1}; // minimum size of TensorRT subgraphs size_t trt_max_workspace_size{1 << 30}; // maximum workspace size for TensorRT. 
@@ -78,6 +79,12 @@ struct OrtTensorRTProviderOptionsV2 { const char* trt_onnx_model_folder_path{nullptr}; // Folder path relative to the current working directory for // the ONNX model containing the weights (applicable only when // the "trt_weight_stripped_engine_enable" option is enabled) + const void* trt_onnx_bytestream{nullptr}; // The byte stream of the original ONNX model containing the weights + // (applicable only when the "trt_weight_stripped_engine_enable" + // option is enabled) + // can be updated using: UpdateTensorRTProviderOptionsWithValue + size_t trt_onnx_bytestream_size{0}; // size of the byte stream provided as "trt_onnx_bytestream" + // can be updated using: UpdateTensorRTProviderOptionsWithValue const char* trt_engine_cache_prefix{nullptr}; // specify engine cache prefix int trt_engine_hw_compatible{0}; // Enable hardware compatibility. Default 0 = false, nonzero = true diff --git a/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc index 42788f2960197..ef45d6c85d6a9 100644 --- a/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc @@ -274,6 +274,9 @@ Status TensorRTCacheModelHandler::GetEpContextFromGraph(const GraphViewer& graph auto& attrs = node->GetAttributes(); const int64_t embed_mode = attrs.at(EMBED_MODE).i(); + // Only make path checks if the model is not provided as a byte buffer + bool make_secure_path_checks = !GetModelPath(graph_viewer).empty(); + if (embed_mode) { // Get engine from byte stream. const std::string& context_binary = attrs.at(EP_CACHE_CONTEXT).s(); @@ -284,6 +287,23 @@ Status TensorRTCacheModelHandler::GetEpContextFromGraph(const GraphViewer& graph return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP could not deserialize engine from binary data"); } + + if (weight_stripped_engine_refit_) { + const std::string onnx_model_filename = attrs.at(ONNX_MODEL_FILENAME).s(); + std::string placeholder; + auto status = TensorrtExecutionProvider::RefitEngine(onnx_model_filename, + onnx_model_folder_path_, + placeholder, + make_secure_path_checks, + onnx_model_bytestream_, + onnx_model_bytestream_size_, + (*trt_engine_).get(), + false /* serialize refitted engine to disk */, + detailed_build_log_); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } + } } else { // Get engine from cache file. 
std::string cache_path = attrs.at(EP_CACHE_CONTEXT).s(); @@ -343,7 +363,9 @@ Status TensorRTCacheModelHandler::GetEpContextFromGraph(const GraphViewer& graph auto status = TensorrtExecutionProvider::RefitEngine(onnx_model_filename, onnx_model_folder_path_, weight_stripped_engine_cache, - true /* path check for security */, + make_secure_path_checks, + onnx_model_bytestream_, + onnx_model_bytestream_size_, (*trt_engine_).get(), true /* serialize refitted engine to disk */, detailed_build_log_); diff --git a/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h b/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h index 3be08d043da48..3af0143cbf14e 100644 --- a/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h +++ b/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h @@ -52,6 +52,8 @@ class TensorRTCacheModelHandler { std::string compute_capability, bool weight_stripped_engine_refit, std::string onnx_model_folder_path, + const void* onnx_model_bytestream, + size_t onnx_model_bytestream_size, bool detailed_build_log) : trt_engine_(trt_engine), trt_runtime_(trt_runtime), @@ -59,6 +61,8 @@ class TensorRTCacheModelHandler { compute_capability_(compute_capability), weight_stripped_engine_refit_(weight_stripped_engine_refit), onnx_model_folder_path_(onnx_model_folder_path), + onnx_model_bytestream_(onnx_model_bytestream), + onnx_model_bytestream_size_(onnx_model_bytestream_size), detailed_build_log_(detailed_build_log) { } ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(TensorRTCacheModelHandler); @@ -74,6 +78,8 @@ class TensorRTCacheModelHandler { std::string compute_capability_; bool weight_stripped_engine_refit_; std::string onnx_model_folder_path_; + const void* onnx_model_bytestream_; + size_t onnx_model_bytestream_size_; bool detailed_build_log_; }; // TRTCacheModelHandler } // namespace onnxruntime diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 67cbc8f5d6f13..cdbb7bb2a8094 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -1333,6 +1333,14 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv engine_cache_enable_ = info.engine_cache_enable; weight_stripped_engine_enable_ = info.weight_stripped_engine_enable; onnx_model_folder_path_ = info.onnx_model_folder_path; + onnx_model_bytestream_ = info.onnx_bytestream; + onnx_model_bytestream_size_ = info.onnx_bytestream_size; + if ((onnx_model_bytestream_ != nullptr && onnx_model_bytestream_size_ == 0) || + (onnx_model_bytestream_ == nullptr && onnx_model_bytestream_size_ != 0)) { + ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "When providing either 'trt_onnx_bytestream_size' or " + "'trt_onnx_bytestream' both have to be provided")); + } timing_cache_enable_ = info.timing_cache_enable; force_timing_cache_match_ = info.force_timing_cache; detailed_build_log_ = info.detailed_build_log; @@ -1757,7 +1765,8 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv << ", trt_ep_context_file_path: " << ep_context_file_path_ << ", trt_ep_context_embed_mode: " << ep_context_embed_mode_ << ", trt_cache_prefix: " << cache_prefix_ - << ", trt_engine_hw_compatible: " << engine_hw_compatible_; + << ", trt_engine_hw_compatible: " << engine_hw_compatible_ + << ", trt_onnx_model_bytestream_size_: " << onnx_model_bytestream_size_; } 
TensorrtExecutionProvider::~TensorrtExecutionProvider() { @@ -2597,28 +2606,42 @@ common::Status TensorrtExecutionProvider::RefitEngine(std::string onnx_model_fil std::string& onnx_model_folder_path, std::string& weight_stripped_engine_cath_path, bool path_check, + const void* onnx_model_bytestream, + size_t onnx_model_bytestream_size, nvinfer1::ICudaEngine* trt_engine, bool serialize_refitted_engine, bool detailed_build_log) { #if NV_TENSORRT_MAJOR >= 10 + bool refit_from_file = onnx_model_bytestream == nullptr && onnx_model_bytestream_size == 0; std::filesystem::path onnx_model_path{onnx_model_folder_path}; - onnx_model_path.append(onnx_model_filename); - if (path_check && IsAbsolutePath(onnx_model_path.string())) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "For security purpose, the ONNX model path should be set with " - "a relative path, but it is an absolute path: " + - onnx_model_path.string()); - } - if (path_check && IsRelativePathToParentPath(onnx_model_path.string())) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "The ONNX model path has '..'. For security purpose, it's not " - "allowed to point outside the directory."); - } + if (refit_from_file) { + if (!onnx_model_filename.empty()) { + onnx_model_path.append(onnx_model_filename); + } + if (onnx_model_path.empty()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "The ONNX model was not provided as a path. " + "Please provide an ONNX bytestream to enable refitting the weightless engine."); + } else { + // check if file path to ONNX is legal + if (path_check && IsAbsolutePath(onnx_model_path.string())) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "For security purpose, the ONNX model path should be set with " + "a relative path, but it is an absolute path: " + + onnx_model_path.string()); + } + if (path_check && IsRelativePathToParentPath(onnx_model_path.string())) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "The ONNX model path has '..'. 
For security purpose, it's not " + "allowed to point outside the directory."); - if (!std::filesystem::exists(onnx_model_path)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "The ONNX model " + onnx_model_path.string() + - " does not exist."); + if (!(std::filesystem::exists(onnx_model_path) && std::filesystem::is_regular_file(onnx_model_path))) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "The ONNX model " + onnx_model_path.string() + + " does not exist."); + } + } } // weight-stripped engine refit logic @@ -2626,9 +2649,18 @@ common::Status TensorrtExecutionProvider::RefitEngine(std::string onnx_model_fil auto refitter = std::unique_ptr(nvinfer1::createInferRefitter(*trt_engine, trt_logger)); auto parser_refitter = std::unique_ptr( nvonnxparser::createParserRefitter(*refitter, trt_logger)); - if (!parser_refitter->refitFromFile(onnx_model_path.string().c_str())) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT EP's IParserRefitter could not refit deserialized weight-stripped engine with weights contained in: " + onnx_model_path.string()); + if (refit_from_file) { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Refitting from file on disk: " << onnx_model_path.string(); + if (!parser_refitter->refitFromFile(onnx_model_path.string().c_str())) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP's IParserRefitter could not refit deserialized weight-stripped engine with weights contained in: " + onnx_model_path.string()); + } + } else { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Refitting from byte array"; + if (!parser_refitter->refitFromBytes(onnx_model_bytestream, onnx_model_bytestream_size)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP's IParserRefitter could not refit deserialized weight-stripped engine with weights contained in the provided bytestream"); + } } if (refitter->refitCudaEngine()) { LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Successfully refitted the weight-stripped engine."; @@ -3212,10 +3244,15 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView } if (weight_stripped_engine_refit_) { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Refit engine from main ONNX file after engine build"; + char* onnx = string_buf.data(); + size_t onnx_size = string_buf.size(); auto status = RefitEngine(model_path_, onnx_model_folder_path_, engine_cache_path, false /* path check for security */, + onnx, + onnx_size, trt_engine.get(), true /* serialize refitted engine to disk */, detailed_build_log_); @@ -3685,6 +3722,8 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView onnx_model_folder_path_, engine_cache_path, false /* path check for security */, + onnx_model_bytestream_, + onnx_model_bytestream_size_, trt_engine, true /* serialize refitted engine to disk */, detailed_build_log_); @@ -3910,6 +3949,8 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con compute_capability_, weight_stripped_engine_enable_, onnx_model_folder_path_, + onnx_model_bytestream_, + onnx_model_bytestream_size_, detailed_build_log_); auto status = trt_cache_model_handler.GetEpContextFromGraph(graph_body_viewer); if (status != Status::OK()) { diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index b58e86237860c..3f20314438564 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -274,13 
+274,12 @@ class TensorrtExecutionProvider : public IExecutionProvider { bool IsGraphCaptured(int graph_annotation_id) const override; Status ReplayGraph(int graph_annotation_id) override; - /** - * Refit the weight-stripped engine - */ static common::Status RefitEngine(std::string onnx_model_filename, std::string& onnx_model_folder_path, std::string& weight_stripped_engine_cath_path, bool path_check, + const void* onnx_model_bytestream, + size_t onnx_model_bytestream_size, nvinfer1::ICudaEngine* trt_engine, bool serialize_refitted_engine, bool detailed_build_log); @@ -305,6 +304,8 @@ class TensorrtExecutionProvider : public IExecutionProvider { bool weight_stripped_engine_enable_ = false; bool weight_stripped_engine_refit_ = false; std::string onnx_model_folder_path_; + const void* onnx_model_bytestream_; + size_t onnx_model_bytestream_size_; bool build_heuristics_enable_ = false; bool sparsity_enable_ = false; int builder_optimization_level_ = 3; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc index 9fe39f5921e1c..63b6d35072290 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc @@ -54,6 +54,8 @@ constexpr const char* kEpContextEmbedMode = "trt_ep_context_embed_mode"; constexpr const char* kEpContextFilePath = "trt_ep_context_file_path"; constexpr const char* kDumpEpContextModel = "trt_dump_ep_context_model"; constexpr const char* kEngineHwCompatible = "trt_engine_hw_compatible"; +constexpr const char* kONNXBytestream = "trt_onnx_bytestream"; +constexpr const char* kONNXBytestreamSize = "trt_onnx_bytestream_size"; } // namespace provider_option_names } // namespace tensorrt @@ -61,6 +63,7 @@ constexpr const char* kEngineHwCompatible = "trt_engine_hw_compatible"; TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions(const ProviderOptions& options) { TensorrtExecutionProviderInfo info{}; void* user_compute_stream = nullptr; + void* onnx_bytestream = nullptr; ORT_THROW_IF_ERROR( ProviderOptionsParser{} .AddValueParser( @@ -122,10 +125,20 @@ TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions .AddAssignmentToReference(tensorrt::provider_option_names::kEpContextFilePath, info.ep_context_file_path) .AddAssignmentToReference(tensorrt::provider_option_names::kEpContextEmbedMode, info.ep_context_embed_mode) .AddAssignmentToReference(tensorrt::provider_option_names::kEngineHwCompatible, info.engine_hw_compatible) + .AddValueParser( + tensorrt::provider_option_names::kONNXBytestream, + [&onnx_bytestream](const std::string& value_str) -> Status { + size_t address; + ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(value_str, address)); + onnx_bytestream = reinterpret_cast(address); + return Status::OK(); + }) + .AddAssignmentToReference(tensorrt::provider_option_names::kONNXBytestreamSize, info.onnx_bytestream_size) .Parse(options)); // add new provider option here. 
info.user_compute_stream = user_compute_stream; info.has_user_compute_stream = (user_compute_stream != nullptr); + info.onnx_bytestream = onnx_bytestream; return info; } @@ -173,6 +186,8 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const TensorrtE {tensorrt::provider_option_names::kEpContextFilePath, MakeStringWithClassicLocale(info.ep_context_file_path)}, {tensorrt::provider_option_names::kEpContextEmbedMode, MakeStringWithClassicLocale(info.ep_context_embed_mode)}, {tensorrt::provider_option_names::kEngineHwCompatible, MakeStringWithClassicLocale(info.engine_hw_compatible)}, + {tensorrt::provider_option_names::kONNXBytestream, MakeStringWithClassicLocale(info.onnx_bytestream)}, + {tensorrt::provider_option_names::kONNXBytestreamSize, MakeStringWithClassicLocale(info.onnx_bytestream_size)}, }; return options; } @@ -234,6 +249,8 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const OrtTensor {tensorrt::provider_option_names::kDumpEpContextModel, MakeStringWithClassicLocale(info.trt_dump_ep_context_model)}, {tensorrt::provider_option_names::kEpContextEmbedMode, MakeStringWithClassicLocale(info.trt_ep_context_embed_mode)}, {tensorrt::provider_option_names::kEngineHwCompatible, MakeStringWithClassicLocale(info.trt_engine_hw_compatible)}, + {tensorrt::provider_option_names::kONNXBytestream, MakeStringWithClassicLocale(reinterpret_cast(info.trt_onnx_bytestream))}, + {tensorrt::provider_option_names::kONNXBytestreamSize, MakeStringWithClassicLocale(info.trt_onnx_bytestream_size)}, }; return options; } @@ -336,5 +353,7 @@ void TensorrtExecutionProviderInfo::UpdateProviderOptions(void* provider_options trt_provider_options_v2.trt_ep_context_embed_mode = internal_options.ep_context_embed_mode; trt_provider_options_v2.trt_ep_context_file_path = copy_string_if_needed(internal_options.ep_context_file_path); trt_provider_options_v2.trt_engine_hw_compatible = internal_options.engine_hw_compatible; + trt_provider_options_v2.trt_onnx_bytestream = internal_options.onnx_bytestream; + trt_provider_options_v2.trt_onnx_bytestream_size = internal_options.onnx_bytestream_size; } } // namespace onnxruntime diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h index 3b859ea2da466..50b934fd5fcbc 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h @@ -34,6 +34,8 @@ struct TensorrtExecutionProviderInfo { std::string engine_cache_path{""}; bool weight_stripped_engine_enable{false}; std::string onnx_model_folder_path{""}; + const void* onnx_bytestream{nullptr}; + size_t onnx_bytestream_size{0}; bool engine_decryption_enable{false}; std::string engine_decryption_lib_path{""}; bool force_sequential_engine_build{false}; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc index 6430ffab09976..e242788ff389a 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc @@ -116,6 +116,8 @@ struct Tensorrt_Provider : Provider { info.ep_context_embed_mode = options.trt_ep_context_embed_mode; info.engine_cache_prefix = options.trt_engine_cache_prefix == nullptr ? 
"" : options.trt_engine_cache_prefix; info.engine_hw_compatible = options.trt_engine_hw_compatible != 0; + info.onnx_bytestream = options.trt_onnx_bytestream; + info.onnx_bytestream_size = options.trt_onnx_bytestream_size; return std::make_shared(info); } diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 4f9669a7dcc4c..1d21933e9cba9 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -2465,6 +2465,10 @@ ORT_API_STATUS_IMPL(OrtApis::UpdateTensorRTProviderOptionsWithValue, if (strcmp(key, "user_compute_stream") == 0) { tensorrt_options->has_user_compute_stream = 1; tensorrt_options->user_compute_stream = value; + } else if (strcmp(key, "trt_onnx_bytestream") == 0) { + tensorrt_options->trt_onnx_bytestream = value; + } else if (strcmp(key, "trt_onnx_bytestream_size") == 0) { + tensorrt_options->trt_onnx_bytestream_size = *reinterpret_cast(value); } return nullptr; #else diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index b7c99fa66a1ea..e6d4e0a94abd3 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -143,6 +143,7 @@ namespace perftest { "\t-D [Disable thread spinning]: disable spinning entirely for thread owned by onnxruntime intra-op thread pool.\n" "\t-Z [Force thread to stop spinning between runs]: disallow thread from spinning during runs to reduce cpu usage.\n" "\t-n [Exit after session creation]: allow user to measure session creation time to measure impact of enabling any initialization optimizations.\n" + "\t-l Provide file as binary in memory by using fopen before session creation.\n" "\t-h: help\n"); } #ifdef _WIN32 @@ -205,7 +206,7 @@ static bool ParseSessionConfigs(const std::string& configs_string, /*static*/ bool CommandLineParser::ParseArguments(PerformanceTestConfig& test_config, int argc, ORTCHAR_T* argv[]) { int ch; - while ((ch = getopt(argc, argv, ORT_TSTR("m:e:r:t:p:x:y:c:d:o:u:i:f:F:S:T:C:AMPIDZvhsqzn"))) != -1) { + while ((ch = getopt(argc, argv, ORT_TSTR("m:e:r:t:p:x:y:c:d:o:u:i:f:F:S:T:C:AMPIDZvhsqznl"))) != -1) { switch (ch) { case 'f': { std::basic_string dim_name; @@ -390,6 +391,9 @@ static bool ParseSessionConfigs(const std::string& configs_string, case 'n': test_config.run_config.exit_after_session_creation = true; break; + case 'l': + test_config.model_info.load_via_path = true; + break; case '?': case 'h': default: diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index ff782da35cbe6..92d732fba2a0a 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -5,6 +5,7 @@ #include "ort_test_session.h" #include #include +#include #include #include #include @@ -816,8 +817,21 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. 
\n)"); #endif } - session_ = Ort::Session(env, performance_test_config.model_info.model_file_path.c_str(), session_options); - + if (!performance_test_config.model_info.load_via_path) { + session_ = Ort::Session(env, performance_test_config.model_info.model_file_path.c_str(), session_options); + } else { + std::ifstream file(performance_test_config.model_info.model_file_path.c_str(), + std::ios::binary | std::ios::in | std::ios::ate); + if (file.is_open()) { + const std::streamsize fsize = file.tellg(); + file.seekg(0, std::ios_base::beg); + std::vector model_bytes(narrow(fsize)); + file.read(model_bytes.data(), fsize); + session_ = Ort::Session(env, model_bytes.data(), model_bytes.size(), session_options); + } else { + ORT_THROW("Model file could not be opened.\n"); + } + } size_t output_count = session_.GetOutputCount(); output_names_.resize(output_count); Ort::AllocatorWithDefaultOptions a; diff --git a/onnxruntime/test/perftest/test_configuration.h b/onnxruntime/test/perftest/test_configuration.h index 70a6b12690d5d..209fb55fe93d4 100644 --- a/onnxruntime/test/perftest/test_configuration.h +++ b/onnxruntime/test/perftest/test_configuration.h @@ -29,6 +29,7 @@ struct ModelInfo { std::basic_string model_file_path; std::basic_string input_file_path; std::basic_string result_file_path; + bool load_via_path = false; }; struct MachineConfig { diff --git a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc index 2b5b82d0fc16a..63327a028c6f4 100644 --- a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc +++ b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc @@ -122,6 +122,18 @@ void CreateBaseModel(const PathString& model_name, status = onnxruntime::Model::Save(model, model_name); } +std::vector ReadFileFromDisk(const PathString& path) { + std::fstream file(path.c_str(), std::fstream::binary | std::fstream::in | std::fstream::ate); + std::vector file_bytes; + if (file.is_open()) { + auto fsize = file.tellg(); + file.seekg(0, std::ios_base::beg); + file_bytes.resize(fsize); + file.read(file_bytes.data(), fsize); + } + return file_bytes; +} + bool HasCacheFileWithPrefix(const std::string& prefix, std::string file_dir = "") { std::filesystem::path target_dir; if (file_dir.empty()) { @@ -360,7 +372,8 @@ TEST(TensorrtExecutionProviderTest, TRTModelIdGeneratorUsingModelHashing) { } TEST(TensorrtExecutionProviderTest, EPContextNode) { - PathString model_name = ORT_TSTR("EPContextNode_test.onnx"); + std::string model_name_str = "EPContextNode_test.onnx"; + PathString model_name = ToPathString(model_name_str); std::string graph_name = "EPContextNode_test"; std::string sess_log_id = "EPContextNode_test"; std::vector dims = {1, 3, 2}; @@ -461,11 +474,11 @@ TEST(TensorrtExecutionProviderTest, EPContextNode) { */ InferenceSession session_object3{so, GetEnvironment()}; OrtTensorRTProviderOptionsV2 params3; - model_name = ToPathString(params.trt_ep_context_file_path); + PathString ctx_model_name = ToPathString(params.trt_ep_context_file_path); params3.trt_engine_cache_enable = 1; execution_provider = TensorrtExecutionProviderWithOptions(¶ms3); EXPECT_TRUE(session_object3.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); - status = session_object3.Load(model_name); + status = session_object3.Load(ctx_model_name); ASSERT_TRUE(status.IsOK()); status = session_object3.Initialize(); ASSERT_TRUE(status.IsOK()); @@ -490,10 +503,10 @@ TEST(TensorrtExecutionProviderTest, EPContextNode) { */ InferenceSession 
session_object4{so, GetEnvironment()}; OrtTensorRTProviderOptionsV2 params4; - model_name = ORT_TSTR("./context_model_folder/EPContextNode_test_ctx.onnx"); + ctx_model_name = ToPathString("./context_model_folder/EPContextNode_test_ctx.onnx"); execution_provider = TensorrtExecutionProviderWithOptions(¶ms4); EXPECT_TRUE(session_object4.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); - status = session_object4.Load(model_name); + status = session_object4.Load(ctx_model_name); ASSERT_TRUE(status.IsOK()); status = session_object4.Initialize(); ASSERT_TRUE(status.IsOK()); @@ -514,7 +527,6 @@ TEST(TensorrtExecutionProviderTest, EPContextNode) { params5.trt_dump_ep_context_model = 1; params5.trt_ep_context_embed_mode = 1; params5.trt_ep_context_file_path = "EP_Context_model_2.onnx"; - model_name = ORT_TSTR("EPContextNode_test.onnx"); execution_provider = TensorrtExecutionProviderWithOptions(¶ms5); EXPECT_TRUE(session_object5.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); status = session_object5.Load(model_name); @@ -528,10 +540,10 @@ TEST(TensorrtExecutionProviderTest, EPContextNode) { InferenceSession session_object6{so, GetEnvironment()}; OrtTensorRTProviderOptionsV2 params6; params6.trt_ep_context_embed_mode = 1; - model_name = ToPathString(params5.trt_ep_context_file_path); + ctx_model_name = ToPathString(params5.trt_ep_context_file_path); execution_provider = TensorrtExecutionProviderWithOptions(¶ms6); EXPECT_TRUE(session_object6.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); - status = session_object6.Load(model_name); + status = session_object6.Load(ctx_model_name); ASSERT_TRUE(status.IsOK()); status = session_object6.Initialize(); ASSERT_TRUE(status.IsOK()); @@ -543,6 +555,61 @@ TEST(TensorrtExecutionProviderTest, EPContextNode) { // Y: 1, 3, 3, 2, 2, 2 // Z: 1, 3, 3, 2, 2, 2 RunSession(session_object6, run_options, feeds, output_names, expected_dims_mul_m, expected_values_mul_m); + + /* + * Test case 7: Run context model with ONNX in memory + */ + auto model_bytes = ReadFileFromDisk(model_name); + std::string ctx_model_name_str = "EP_Context_model_weight_stripped.onnx"; + ctx_model_name = ToPathString(ctx_model_name_str); + InferenceSession session_object7{so, GetEnvironment()}; + OrtTensorRTProviderOptionsV2 params7; + params7.trt_dump_ep_context_model = 1; + params7.trt_ep_context_embed_mode = 1; + params7.trt_weight_stripped_engine_enable = 1; + params7.trt_ep_context_file_path = ctx_model_name_str.c_str(); + execution_provider = TensorrtExecutionProviderWithOptions(¶ms7); + EXPECT_TRUE(session_object7.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); + status = session_object7.Load(model_bytes.data(), static_cast(model_bytes.size())); + ASSERT_TRUE(status.IsOK()); + status = session_object7.Initialize(); + std::cerr << status.ErrorMessage(); + ASSERT_TRUE(status.IsOK()); + RunSession(session_object7, run_options, feeds, output_names, expected_dims_mul_m, expected_values_mul_m); + + /* + * Test case 7: Refit weightless context model with ONNX in memory + */ + auto ctx_model_bytes = ReadFileFromDisk(ctx_model_name); + InferenceSession session_object8{so, GetEnvironment()}; + OrtTensorRTProviderOptionsV2 params8; + params8.trt_weight_stripped_engine_enable = 1; + params8.trt_onnx_bytestream = model_bytes.data(); + params8.trt_onnx_bytestream_size = model_bytes.size(); + execution_provider = TensorrtExecutionProviderWithOptions(¶ms8); + 
EXPECT_TRUE(session_object8.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); + status = session_object8.Load(ctx_model_bytes.data(), static_cast(ctx_model_bytes.size())); + std::cerr << status.ErrorMessage(); + ASSERT_TRUE(status.IsOK()); + status = session_object8.Initialize(); + std::cerr << status.ErrorMessage(); + ASSERT_TRUE(status.IsOK()); + RunSession(session_object8, run_options, feeds, output_names, expected_dims_mul_m, expected_values_mul_m); + + /* + * Test case 7: Refit weightless context model with ONNX from disk + */ + InferenceSession session_object9{so, GetEnvironment()}; + OrtTensorRTProviderOptionsV2 params9; + params9.trt_weight_stripped_engine_enable = 1; + params9.trt_onnx_model_folder_path = model_name_str.c_str(); + execution_provider = TensorrtExecutionProviderWithOptions(¶ms9); + EXPECT_TRUE(session_object9.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); + status = session_object9.Load(ctx_model_bytes.data(), static_cast(ctx_model_bytes.size())); + ASSERT_TRUE(status.IsOK()); + status = session_object9.Initialize(); + ASSERT_TRUE(status.IsOK()); + RunSession(session_object9, run_options, feeds, output_names, expected_dims_mul_m, expected_values_mul_m); } TEST(TensorrtExecutionProviderTest, TRTPluginsCustomOpTest) { From 11bf3097360d271e8c0d6f26683a8b16477e6c42 Mon Sep 17 00:00:00 2001 From: Jing Fang <126209182+fajin-corp@users.noreply.github.com> Date: Fri, 19 Jul 2024 22:55:15 -0700 Subject: [PATCH 08/15] add transform part of the dq matmul tool chain (#21374) ### Description This is a partial change from [fajin/qdqmatmulnbitstoolchain](https://github.com/microsoft/onnxruntime/pull/21180). The original PR is blocked by Web CI failures. MatMulNBits is a heavily optimized matmul operation. Currently a MatMul can be converted to MatMulNBits to speed up the model inference. However, MatMulNBits is an ORT only op. To make the graph compatible with ONNX ops and utilize MatMulNBits at the same time, we introduce Q/DQ support for MatMulNBits. To convert MatMul ops in a model to MatMulNBits: 1. use matmul_4bits_quantizer.py to convert MatMul to DQ + MatMul using QDQ mode. 2. In ORT session, DQ + MatMul is fused to MatMulNBits #### Note MatMulNBits assume B weight is uint4. When no zp is provided, zp defaults to 8, which is different from DQ. DQ defaults zp to 0 when no zp provided. And DQ supports int4. Therefore some conversions are introduced during DQ + MatMul --> MatMulNBits step. #### Perf Using QDQ format will increase the model initialization time and memory consumption. With current implement, model init time increased from ~4s to ~9s, and memory consumption increased from ~2.8GB to ~4.8GB. The memory increase is due to 1. in optimizer, after transpose the B weight, a in-memory tensor proto is created using protobuf's arena. 2. in finalize step, when saving initializer and prepacking, ORT arena is used to create buffers for initializers. The memory allocated by arenas cannot be fully deallocated. If disable ORT arena memory allocation, the memory consumptions of both QDQ format and original format are ~2.2GB. The time increase is mainly due to multiple memory copy, but can be further optimized. ### Motivation and Context Please see description for details. 
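For readers who want to try the two-step flow end to end, here is a minimal Python sketch. It is illustrative only: the `MatMul4BitsQuantizer` call assumes the existing matmul_4bits_quantizer.py tool (the QDQ output-format switch lives in the original PR, so the exact constructor arguments and file names below are placeholders), while the session config key is the `kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel` option added later in this change.

```python
# Hedged sketch, not part of this patch. Model paths and quantizer arguments are placeholders.
import onnx
import onnxruntime as ort
from onnxruntime.quantization.matmul_4bits_quantizer import MatMul4BitsQuantizer

# Step 1: rewrite eligible MatMul weights to 4 bits. In QDQ mode (original PR)
# the tool emits DequantizeLinear + MatMul pairs instead of the MatMulNBits op.
model = onnx.load("model.onnx")
quantizer = MatMul4BitsQuantizer(model, block_size=32, is_symmetric=True)
quantizer.process()
onnx.save(quantizer.model.model, "model_qdq_4bit.onnx")  # quantizer.model wraps the ModelProto

# Step 2: at session creation the DQ + MatMul pair is fused into MatMulNBits.
# The accuracy level of the fused node is read from the session config key
# introduced by this patch (default "4").
so = ort.SessionOptions()
so.add_session_config_entry("session.qdq_matmulnbits_accuracy_level", "4")
session = ort.InferenceSession("model_qdq_4bit.onnx", so, providers=["CPUExecutionProvider"])
```

The accuracy-level setting only affects how the fused MatMulNBits kernel computes; the serialized DQ + MatMul graph itself remains standard ONNX.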
--- .../core/optimizer/graph_transformer_utils.h | 7 +- .../onnxruntime_session_options_config_keys.h | 5 + .../core/optimizer/graph_transformer_utils.cc | 26 +- .../selectors_actions/qdq_actions.cc | 173 ++++++- .../selectors_actions/qdq_actions.h | 29 ++ .../qdq_selector_action_transformer.cc | 39 +- .../qdq_selector_action_transformer.h | 6 +- .../selectors_actions/qdq_selectors.cc | 85 ++++ .../selectors_actions/qdq_selectors.h | 15 + .../optimizer/selectors_actions/actions.cc | 4 +- .../optimizer/selectors_actions/actions.h | 7 +- onnxruntime/core/session/inference_session.cc | 14 +- onnxruntime/test/common/random_generator.h | 17 + .../optimizer/graph_transform_test_builder.h | 16 - .../qdq_matmulnbits_transformer_test.cc | 425 ++++++++++++++++++ onnxruntime/test/optimizer/qdq_test_utils.h | 2 +- 16 files changed, 833 insertions(+), 37 deletions(-) create mode 100644 onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc diff --git a/include/onnxruntime/core/optimizer/graph_transformer_utils.h b/include/onnxruntime/core/optimizer/graph_transformer_utils.h index e609745b5e03f..0bb5c7432f0a7 100644 --- a/include/onnxruntime/core/optimizer/graph_transformer_utils.h +++ b/include/onnxruntime/core/optimizer/graph_transformer_utils.h @@ -10,6 +10,7 @@ #include "core/common/inlined_containers.h" #include "core/framework/session_options.h" #include "core/optimizer/graph_transformer.h" +#include "core/platform/threadpool.h" #if !defined(ORT_MINIMAL_BUILD) #include "core/optimizer/rule_based_graph_transformer.h" @@ -49,7 +50,8 @@ InlinedVector> GenerateTransformers( TransformerLevel level, const SessionOptions& session_options, const IExecutionProvider& execution_provider /*required by constant folding*/, - const InlinedHashSet& rules_and_transformers_to_disable = {}); + const InlinedHashSet& rules_and_transformers_to_disable = {}, + concurrency::ThreadPool* intra_op_thread_pool = nullptr); #endif // !defined(ORT_MINIMAL_BUILD) @@ -78,7 +80,8 @@ InlinedVector> GenerateTransformersForMinimalB const SessionOptions& session_options, const SatApplyContextVariant& apply_context, const IExecutionProvider& cpu_execution_provider, - const InlinedHashSet& rules_and_transformers_to_disable = {}); + const InlinedHashSet& rules_and_transformers_to_disable = {}, + concurrency::ThreadPool* intra_op_thread_pool = nullptr); #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h index c32e2a77e8453..17ae649e6f174 100644 --- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h @@ -270,3 +270,8 @@ static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.context_embed // - "0": Gemm FastMath mode is not enabled. [DEFAULT] // - "1": Gemm FastMath mode is enabled. static const char* const kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16 = "mlas.enable_gemm_fastmath_arm64_bfloat16"; + +// When converting DQ + MatMul -> MatMulNBits, the accuracy level of the MatMulNBits is controlled by this option. +// Refer to MatMulNBits op schema for more details. +// If not provided, default is 4. 
+static const char* const kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel = "session.qdq_matmulnbits_accuracy_level"; diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc index e6feb3e7ddbe2..7da65f18ccacb 100644 --- a/onnxruntime/core/optimizer/graph_transformer_utils.cc +++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc @@ -13,6 +13,7 @@ #include "core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h" #include "core/optimizer/selectors_actions/selector_action_transformer_apply_contexts.h" #include "core/session/onnxruntime_session_options_config_keys.h" +#include "core/platform/threadpool.h" #if !defined(ORT_MINIMAL_BUILD) @@ -187,7 +188,8 @@ InlinedVector> GenerateTransformers( TransformerLevel level, const SessionOptions& session_options, const IExecutionProvider& cpu_execution_provider, /*required by constant folding*/ - const InlinedHashSet& rules_and_transformers_to_disable) { + const InlinedHashSet& rules_and_transformers_to_disable, + concurrency::ThreadPool* intra_op_thread_pool) { InlinedVector> transformers; const bool disable_quant_qdq = session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsDisableQuantQDQ, "0") == "1"; @@ -287,6 +289,10 @@ InlinedVector> GenerateTransformers( onnxruntime::kJsExecutionProvider}; const InlinedHashSet cpu_dml_eps = {onnxruntime::kCpuExecutionProvider, onnxruntime::kDmlExecutionProvider}; + const int64_t qdq_matmulnbits_accuracy_level = + ParseStringWithClassicLocale( + session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel, + "4")); #ifdef MLAS_TARGET_AMD64_IX86 const bool avx2_precision_mode = session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsAvx2PrecisionMode, "0") == "1" && MlasPlatformU8S8Overflow(); @@ -300,7 +306,10 @@ InlinedVector> GenerateTransformers( if (!qdq_is_int8_allowed) { transformers.emplace_back(std::make_unique(avx2_precision_mode, cpu_ep)); } - transformers.emplace_back(std::make_unique(qdq_is_int8_allowed)); + transformers.emplace_back(std::make_unique(qdq_is_int8_allowed, + SatApplyContextVariant{}, + qdq_matmulnbits_accuracy_level, + intra_op_thread_pool)); } transformers.emplace_back(std::make_unique(cpu_ep)); @@ -409,7 +418,8 @@ InlinedVector> GenerateTransformersForMinimalB const SessionOptions& session_options, const SatApplyContextVariant& apply_context, const IExecutionProvider& cpu_execution_provider, - const InlinedHashSet& rules_and_transformers_to_disable) { + const InlinedHashSet& rules_and_transformers_to_disable, + concurrency::ThreadPool* intra_op_thread_pool) { InlinedVector> transformers; const bool saving = std::holds_alternative(apply_context); @@ -423,12 +433,18 @@ InlinedVector> GenerateTransformersForMinimalB const bool qdq_is_int8_allowed = session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsQDQIsInt8Allowed, QDQIsInt8Allowed() ? 
"1" : "0") == "1"; - + const int64_t qdq_matmulnbits_accuracy_level = + ParseStringWithClassicLocale( + session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel, + "4")); // runtime optimizations only support CPU EP now const InlinedHashSet cpu_ep = {onnxruntime::kCpuExecutionProvider}; if (!disable_quant_qdq) { - transformers.emplace_back(std::make_unique(qdq_is_int8_allowed, apply_context)); + transformers.emplace_back(std::make_unique(qdq_is_int8_allowed, + apply_context, + qdq_matmulnbits_accuracy_level, + intra_op_thread_pool)); } transformers.emplace_back(std::make_unique(cpu_ep, apply_context)); diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc index 3497ea4c85523..74fecb0427e14 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc @@ -2,10 +2,12 @@ // Licensed under the MIT License. #include "core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h" - #include "core/optimizer/qdq_transformer/qdq_util.h" +#include "core/optimizer/initializer.h" #include "core/graph/node_attr_utils.h" #include "core/framework/tensorprotoutils.h" +#include "core/mlas/inc/mlas_q4.h" + namespace onnxruntime { namespace QDQ { @@ -273,6 +275,175 @@ Status MatMulReplaceWithQLinear::Run(Graph& graph, const NodesToOptimize& select } } +DQMatMulToMatMulNBitsAction::DQMatMulToMatMulNBitsAction(int64_t accuracy_level, + concurrency::ThreadPool* intra_op_thread_pool) + : accuracy_level_{accuracy_level}, + domain_{kMSDomain}, + op_type_{"MatMulNBits"}, + value_moves_{[]() { + NTO::NodeLocation target{NTO::NodeType::kTarget, 0}; + return std::vector{ + MoveAndAppend(target, ArgType::kInput, 0, ArgType::kInput), + MoveAll(target, ArgType::kOutput)}; + }()}, + intra_op_thread_pool_{intra_op_thread_pool} { + ORT_ENFORCE(accuracy_level_ >= 0 && accuracy_level_ <= 4, "MatMulNBits accuracy level must be between 0 and 4"); +} + +NodeAttributes +DQMatMulToMatMulNBitsAction::ExtraAttributes(const RuntimeState& runtime_state) const { + NodeAttributes extra_attributes; + + const auto* dq_node = runtime_state.selected_nodes.Input(0); + auto& attrs = dq_node->GetAttributes(); + const auto* weight_shape = dq_node->InputDefs()[0]->Shape(); + + utils::SetNodeAttribute(utils::MakeAttribute("K", weight_shape->dim(0).dim_value()), extra_attributes); + utils::SetNodeAttribute(utils::MakeAttribute("N", weight_shape->dim(1).dim_value()), extra_attributes); + utils::SetNodeAttribute(utils::MakeAttribute("accuracy_level", accuracy_level_), extra_attributes); + // currently only 4bits is supported. In the future, derive bits from DQ's weight type. + utils::SetNodeAttribute(utils::MakeAttribute("bits", static_cast(4)), extra_attributes); + utils::SetNodeAttribute(utils::MakeAttribute("block_size", attrs.at("block_size").i()), extra_attributes); + + return extra_attributes; +} + +Status DQMatMulToMatMulNBitsAction::ProcessNewNode(Graph& graph, + const NodesToOptimize& selected_nodes, + Node& replacement_node) const { + const auto* dq_node = selected_nodes.Input(0); + const auto* weight_arg = dq_node->InputDefs()[0]; + const auto* scale_arg = dq_node->InputDefs()[1]; + const auto* zp_arg = dq_node->InputDefs().size() > 2 ? 
dq_node->InputDefs()[2] : nullptr; + const auto& attrs = dq_node->GetAttributes(); + + const ONNX_NAMESPACE::TensorProto* weight_tensor_proto = nullptr; + const ONNX_NAMESPACE::TensorProto* scale_tensor_proto = nullptr; + const ONNX_NAMESPACE::TensorProto* zp_tensor_proto = nullptr; + graph.GetInitializedTensor(weight_arg->Name(), weight_tensor_proto); + graph.GetInitializedTensor(scale_arg->Name(), scale_tensor_proto); + if (zp_arg) { + graph.GetInitializedTensor(zp_arg->Name(), zp_tensor_proto); + } + + auto K = weight_arg->Shape()->dim(0).dim_value(); + auto N = weight_arg->Shape()->dim(1).dim_value(); + auto block_size = attrs.at("block_size").i(); + auto quant_num = (K + block_size - 1) / block_size; + auto blob_bytes = (block_size + 1) / 2; + + // Unfortunately iterating the source data is complicated, the data maybe in + // external file, a raw buffer, or a repeated field depending on the data + // type. UnpackTensor() already contains some of these logic and is closest + // to what we need. But it does not handle external data. + Initializer weight_src(*weight_tensor_proto, graph.ModelPath()); + Initializer scale_src(*scale_tensor_proto, graph.ModelPath()); + std::optional zp_src; + Initializer weight_dst(ONNX_NAMESPACE::TensorProto_DataType_UINT8, + graph.GenerateNodeArgName(weight_arg->Name() + "_T"), + std::vector{N, quant_num, blob_bytes}); + Initializer scale_dst(static_cast(scale_src.data_type()), + graph.GenerateNodeArgName(scale_arg->Name() + "_T"), + std::vector{N * quant_num}); + std::optional zp_dst; + + if (zp_tensor_proto) { + zp_src.emplace(*zp_tensor_proto, graph.ModelPath()); + zp_dst.emplace(ONNX_NAMESPACE::TensorProto_DataType_UINT8, + graph.GenerateNodeArgName(zp_arg->Name() + "_T"), + std::vector{N * ((quant_num + 1) / 2)}); + } else if (weight_src.data_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT4) { + zp_dst.emplace(ONNX_NAMESPACE::TensorProto_DataType_UINT8, + graph.GenerateNodeArgName("fused_DQ_MatMul_zero_point_T"), + std::vector{N * ((quant_num + 1) / 2)}); + } + + if (scale_src.data_type() == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) { + if (weight_src.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT4) { + MlasQDQTransposeBlockwiseQuantized( + weight_src.DataAsByteSpan().data(), + scale_src.data(), + zp_src ? zp_src->DataAsByteSpan().data() : nullptr, + weight_dst.data(), + scale_dst.data(), + zp_dst ? zp_dst->data() : nullptr, + true, + static_cast(K), + static_cast(N), + static_cast(block_size), + intra_op_thread_pool_); + } else { + MlasQDQTransposeBlockwiseQuantized( + weight_src.DataAsByteSpan().data(), + scale_src.data(), + zp_src ? zp_src->DataAsByteSpan().data() : nullptr, + weight_dst.data(), + scale_dst.data(), + zp_dst ? zp_dst->data() : nullptr, + true, + static_cast(K), + static_cast(N), + static_cast(block_size), + intra_op_thread_pool_); + } + } else { + if (weight_src.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT4) { + MlasQDQTransposeBlockwiseQuantized( + weight_src.DataAsByteSpan().data(), + scale_src.data(), + zp_src ? zp_src->DataAsByteSpan().data() : nullptr, + weight_dst.data(), + scale_dst.data(), + zp_dst ? zp_dst->data() : nullptr, + true, + static_cast(K), + static_cast(N), + static_cast(block_size), + intra_op_thread_pool_); + + } else { + MlasQDQTransposeBlockwiseQuantized( + weight_src.DataAsByteSpan().data(), + scale_src.data(), + zp_src ? zp_src->DataAsByteSpan().data() : nullptr, + weight_dst.data(), + scale_dst.data(), + zp_dst ? 
zp_dst->data() : nullptr, + true, + static_cast(K), + static_cast(N), + static_cast(block_size), + intra_op_thread_pool_); + } + } + + ONNX_NAMESPACE::TensorProto weight_T_tp; + ONNX_NAMESPACE::TensorProto scale_T_tp; + std::optional zp_T_tp; + + // TODO(fajin): external_data to memory location to avoid arena allocation + // https://github.com/microsoft/onnxruntime/pull/12465 + weight_dst.ToProto(weight_T_tp); + scale_dst.ToProto(scale_T_tp); + if (zp_dst) { + zp_T_tp.emplace(); + zp_dst->ToProto(zp_T_tp.value()); + } + + auto& input_defs = replacement_node.MutableInputDefs(); + input_defs.push_back(&graph_utils::AddInitializer(graph, weight_T_tp)); + replacement_node.MutableInputArgsCount().push_back(1); + input_defs.push_back(&graph_utils::AddInitializer(graph, scale_T_tp)); + replacement_node.MutableInputArgsCount().push_back(1); + + if (zp_T_tp) { + input_defs.push_back(&graph_utils::AddInitializer(graph, zp_T_tp.value())); + replacement_node.MutableInputArgsCount().push_back(1); + } + + return Status::OK(); +} + static std::vector GetGemmMoveInfo(bool does_q_node_exist) { NTO::NodeLocation dq_A{NTO::NodeType::kInput, 0}; NTO::NodeLocation dq_B{NTO::NodeType::kInput, 1}; diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h index 8179a030508a5..47821619db65a 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h @@ -3,7 +3,12 @@ #pragma once +#include +#include +#include + #include "core/optimizer/selectors_actions/actions.h" +#include "core/platform/threadpool.h" namespace onnxruntime { @@ -76,6 +81,30 @@ struct MatMulReplaceWithQLinear : public Action { BinaryReplaceWithQLinear qlinear_matmul_replacer_; }; +// used together with DQMatMulNodeGroupSelector, which does the sanity check +struct DQMatMulToMatMulNBitsAction : public ReplaceWithNew { + DQMatMulToMatMulNBitsAction(int64_t accuracy_level, + concurrency::ThreadPool* intra_op_thread_pool); + + private: + std::string OpType(const RuntimeState&) const override { return op_type_; } + + std::string Domain(const RuntimeState&) const override { return domain_; } + + NodeAttributes ExtraAttributes(const RuntimeState&) const override; + + std::vector ValueMoves(const RuntimeState&) const override { return value_moves_; } + + // transpose initializers, and add to the MatMulNBits inputs + Status ProcessNewNode(Graph&, const NodesToOptimize&, Node&) const override; + + const int64_t accuracy_level_; + const std::string domain_; + const std::string op_type_; + const std::vector value_moves_; + concurrency::ThreadPool* intra_op_thread_pool_; +}; + struct GemmReplaceWithQuant : public Action { GemmReplaceWithQuant(); diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc index 80ead8f8c68d6..17e66a3953b97 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc @@ -228,6 +228,30 @@ void MatMulQDQRules(SelectorActionRegistry& qdq_selector_action_registry, bool i #endif } +void DQMatMulToMatMulNBitsRules(SelectorActionRegistry& qdq_selector_action_registry, + int64_t qdq_matmulnbits_accuracy_level, + 
concurrency::ThreadPool* intra_op_thread_pool) { + // 2 nodes. DQ -> MatMul. DQ is the second input to MatMul. + // DQ's weight is int4/uint4. DQ's scale is float/float16. + // DQ is block-quantized along axis 0, with block_size >= 16 and as 2's power. + const std::string action_name{"DQMatMulToMatMulNBits"}; + + std::unique_ptr action = + std::make_unique(qdq_matmulnbits_accuracy_level, + intra_op_thread_pool); + +#if !defined(ORT_MINIMAL_BUILD) + std::unique_ptr selector = std::make_unique(); + qdq_selector_action_registry.RegisterSelectorAndAction(action_name, + {{"MatMul", {}}}, + std::move(selector), + std::move(action)); + +#else + qdq_selector_action_registry.RegisterAction(action_name, std::move(action)); +#endif +} + void GemmQDQRules(SelectorActionRegistry& qdq_selector_action_registry) { // 3 to 5 nodes. 0=DQ A, 1=DQ B, 2=DQ C(optional), 3=Gemm, 4=Q Y(optional) // Replace with QGemm @@ -271,7 +295,9 @@ void WhereQDQRules(SelectorActionRegistry& qdq_selector_action_registry) { #endif } -SelectorActionRegistry CreateSelectorActionRegistry(bool is_int8_allowed) { +SelectorActionRegistry CreateSelectorActionRegistry(bool is_int8_allowed, + int64_t qdq_matmulnbits_accuracy_level, + concurrency::ThreadPool* intra_op_thread_pool) { SelectorActionRegistry qdq_selector_action_registry; SplitQDQRules(qdq_selector_action_registry); DropQDQNodesRules(qdq_selector_action_registry); @@ -283,17 +309,22 @@ SelectorActionRegistry CreateSelectorActionRegistry(bool is_int8_allowed) { MatMulQDQRules(qdq_selector_action_registry, is_int8_allowed); GemmQDQRules(qdq_selector_action_registry); WhereQDQRules(qdq_selector_action_registry); + DQMatMulToMatMulNBitsRules(qdq_selector_action_registry, + qdq_matmulnbits_accuracy_level, + intra_op_thread_pool); return qdq_selector_action_registry; } } // namespace -QDQSelectorActionTransformer::QDQSelectorActionTransformer( - bool is_int8_allowed, const SatApplyContextVariant& apply_context) +QDQSelectorActionTransformer::QDQSelectorActionTransformer(bool is_int8_allowed, + const SatApplyContextVariant& apply_context, + int64_t qdq_matmulnbits_accuracy_level, + concurrency::ThreadPool* intra_op_thread_pool) : SelectorActionTransformer{ "QDQSelectorActionTransformer", - CreateSelectorActionRegistry(is_int8_allowed), + CreateSelectorActionRegistry(is_int8_allowed, qdq_matmulnbits_accuracy_level, intra_op_thread_pool), apply_context, // this transformer is only compatible with the CPU and DML EP {kCpuExecutionProvider, kDmlExecutionProvider}} { diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h index 1780923f3f273..ba636f76d1900 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h @@ -5,6 +5,7 @@ #include "core/optimizer/selectors_actions/selector_action_transformer.h" #include "core/mlas/inc/mlas.h" +#include "core/platform/threadpool.h" namespace onnxruntime { @@ -21,7 +22,10 @@ Transformer that fuses QDQ and fp32 ops into quantized ops. 
*/ class QDQSelectorActionTransformer : public SelectorActionTransformer { public: - QDQSelectorActionTransformer(bool is_int8_allowed, const SatApplyContextVariant& apply_context = {}); + QDQSelectorActionTransformer(bool is_int8_allowed, + const SatApplyContextVariant& apply_context = {}, + int64_t qdq_matmulnbits_accuracy_level = 4, + concurrency::ThreadPool* intra_op_thread_pool = nullptr); }; } // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc index 09705f61c82ce..6e93445c7c5c7 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc @@ -414,6 +414,91 @@ bool MatMulNodeGroupSelector::Check(const GraphViewer& graph_viewer, } } +bool DQMatMulNodeGroupSelector::Check(const GraphViewer& graph_viewer, + const Node& node, + const std::vector& dq_nodes, + const std::vector& q_nodes) const { + // Should not have any Q nodes + if (!q_nodes.empty()) { + return false; + } + + const auto& graph = graph_viewer.GetGraph(); + + // MatMul has only 1 DQ input and the DQ must have 1 output edge and not be a graph output + if (dq_nodes.size() != 1 || !optimizer_utils::CheckOutputEdges(graph, *dq_nodes[0], 1)) { + return false; + } + + // DQ must be MatMul's the second input + if (node.InputDefs()[1] != dq_nodes[0]->OutputDefs()[0]) { + return false; + } + + // DQ weight/zero points types are int4/uint4, scales/output types are float or float16 + const auto* weight_arg = dq_nodes[0]->InputDefs()[0]; + const auto* scale_arg = dq_nodes[0]->InputDefs()[1]; + const auto* zero_point_arg = dq_nodes[0]->InputDefs().size() == 3 ? dq_nodes[0]->InputDefs()[2] : nullptr; + int32_t dt_weight = weight_arg->TypeAsProto()->tensor_type().elem_type(); + int32_t dt_scales = scale_arg->TypeAsProto()->tensor_type().elem_type(); + if (dt_scales != ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT && + dt_scales != ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16) { + return false; + } + + if (!Is4BitIntType(dt_weight)) { + return false; + } + + // DQ is blockwise quantized along axis 0, and block_size must be 2's power and >= 16 + const auto& dq_attrs = dq_nodes[0]->GetAttributes(); + if (const auto a_iter = dq_attrs.find("axis"); + a_iter == dq_attrs.end() || a_iter->second.i() != 0) { + return false; + } + + const auto a_iter = dq_attrs.find("block_size"); + if (a_iter == dq_attrs.end()) { + return false; + } + + auto block_size = a_iter->second.i(); + if (block_size < 16 || ((block_size - 1) & block_size)) { + return false; + } + + // weight, scale and zero points (if exists) must be constants + const auto* weight_tensor_proto = graph.GetConstantInitializer(weight_arg->Name(), true); + const auto* scale_tensor_proto = graph.GetConstantInitializer(scale_arg->Name(), true); + const auto* zp_tensor_proto = zero_point_arg ? 
graph.GetConstantInitializer(zero_point_arg->Name(), true) : nullptr; + + if (!weight_tensor_proto || !scale_tensor_proto) { + return false; + } + + if (zero_point_arg && !zp_tensor_proto) { + return false; + } + + // weight, scale and zero points (if exists) must have the rank 2 + if (weight_tensor_proto->dims_size() != 2 || + scale_tensor_proto->dims_size() != 2 || + (zp_tensor_proto && zp_tensor_proto->dims_size() != 2)) { + return false; + } + + // check weight, scale and zero points (if exists) shapes + if ((weight_tensor_proto->dims()[0] + block_size - 1) / block_size != scale_tensor_proto->dims()[0] || + weight_tensor_proto->dims()[1] != scale_tensor_proto->dims()[1] || + (zp_tensor_proto && + (zp_tensor_proto->dims()[0] != scale_tensor_proto->dims()[0] || + zp_tensor_proto->dims()[1] != scale_tensor_proto->dims()[1]))) { + return false; + } + + return true; +} + bool GemmNodeGroupSelector::Check(const GraphViewer& graph_viewer, const Node& node, const std::vector& dq_nodes, diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h index 1a2a620acb480..491a15b62cb03 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h @@ -204,6 +204,14 @@ class MatMulNodeGroupSelector : public NodeGroupSelector { bool allow_4bit_; }; +// Convert "1 DQ node for input B -> MatMul" to "MatMulNBits" +class DQMatMulNodeGroupSelector : public NodeGroupSelector { + private: + bool Check(const GraphViewer& graph_viewer, const Node& node, + const std::vector& dq_nodes, + const std::vector& q_nodes) const override; +}; + // Input: DQ nodes for A, B and optional C // Output: optional Q node for Y class GemmNodeGroupSelector : public NodeGroupSelector { @@ -358,6 +366,13 @@ class MatMulSelector : public BaseSelector { allow_16bit, allow_4bit)) {} }; +// Convert "1 DQ node for input B -> MatMul" to "MatMulNBits" +class DQMatMulToMatMulNBitsSelector : public BaseSelector { + public: + explicit DQMatMulToMatMulNBitsSelector(gsl::span compatible_providers = {}) + : BaseSelector(std::make_unique(), compatible_providers) {} +}; + // Input: DQ nodes for A, B and optional C // Output: optional Q node for Y class GemmSelector : public BaseSelector { diff --git a/onnxruntime/core/optimizer/selectors_actions/actions.cc b/onnxruntime/core/optimizer/selectors_actions/actions.cc index c8d5acbf66b78..bb4033afedc49 100644 --- a/onnxruntime/core/optimizer/selectors_actions/actions.cc +++ b/onnxruntime/core/optimizer/selectors_actions/actions.cc @@ -102,12 +102,14 @@ static Status CreateReplacementNode(Graph& graph, Status ReplaceWithNew::Run(Graph& graph, const NodesToOptimize& selected_nodes) const { const RuntimeState runtime_state{graph, selected_nodes}; + Node* replacement{}; ORT_RETURN_IF_ERROR(CreateReplacementNode(graph, selected_nodes, OpType(runtime_state), Domain(runtime_state), ExtraAttributes(runtime_state), ValueMoves(runtime_state), - /* only_update_dest_definitions */ false, nullptr)); + /* only_update_dest_definitions */ false, &replacement)); + ORT_RETURN_IF_ERROR(ProcessNewNode(graph, selected_nodes, *replacement)); return node_remover_.Run(graph, selected_nodes); } diff --git a/onnxruntime/core/optimizer/selectors_actions/actions.h b/onnxruntime/core/optimizer/selectors_actions/actions.h index 9384bfa7027cd..465ae38565b15 100644 --- 
a/onnxruntime/core/optimizer/selectors_actions/actions.h +++ b/onnxruntime/core/optimizer/selectors_actions/actions.h @@ -158,6 +158,12 @@ struct ReplaceWithNew : public Action { // specifies how the inputs and outputs for the replaced nodes are moved to the new node virtual std::vector ValueMoves(const RuntimeState&) const = 0; + // For the changes that cannot be done by simply moving node args around, use this method to make + // additional changes to the new node and the graph. e.g., DQMatMulToMatMulNBitsAction transposes + // the second weight of MatMul ops and create new node args. + // Note: This method is only used in Run(), but not in RunForSave(). + virtual Status ProcessNewNode(Graph&, const NodesToOptimize&, Node&) const { return Status::OK(); } + RemoveNodes node_remover_; }; @@ -187,5 +193,4 @@ struct ReplaceWithNewFixed : public ReplaceWithNew { const NodeAttributes extra_attrs_; const std::vector value_moves_; }; - } // namespace onnxruntime diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index f0eed91d70440..3fd6e84e0e5ce 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -1609,7 +1609,8 @@ Status PartitionOrtFormatModel(onnxruntime::Graph& graph, #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) Status ApplyOrtFormatModelRuntimeOptimizations( onnxruntime::Graph& graph, const logging::Logger& logger, const SessionOptions& session_options, - const InlinedHashSet& optimizers_to_disable, const IExecutionProvider& cpu_ep) { + const InlinedHashSet& optimizers_to_disable, const IExecutionProvider& cpu_ep, + concurrency::ThreadPool* intra_op_thread_pool) { bool modified = false; for (int level = static_cast(TransformerLevel::Level2); @@ -1617,7 +1618,7 @@ Status ApplyOrtFormatModelRuntimeOptimizations( ++level) { const auto transformers = optimizer_utils::GenerateTransformersForMinimalBuild( static_cast(level), session_options, SatRuntimeOptimizationLoadContext{}, cpu_ep, - optimizers_to_disable); + optimizers_to_disable, intra_op_thread_pool); for (const auto& transformer : transformers) { ORT_RETURN_IF_ERROR(transformer->Apply(graph, modified, logger)); @@ -2005,7 +2006,8 @@ common::Status InferenceSession::Initialize() { #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) const auto& cpu_ep = *execution_providers_.Get(onnxruntime::kCpuExecutionProvider); ORT_RETURN_IF_ERROR_SESSIONID_( - ApplyOrtFormatModelRuntimeOptimizations(graph, *session_logger_, session_options_, optimizers_to_disable_, cpu_ep)); + ApplyOrtFormatModelRuntimeOptimizations(graph, *session_logger_, session_options_, optimizers_to_disable_, + cpu_ep, GetIntraOpThreadPoolToUse())); #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) } @@ -3167,7 +3169,8 @@ common::Status InferenceSession::AddPredefinedTransformers( if (use_full_build_optimizations) { return optimizer_utils::GenerateTransformers(level, session_options_, cpu_ep, - optimizers_to_disable_); + optimizers_to_disable_, + GetIntraOpThreadPoolToUse()); } else { const auto sat_context = minimal_build_optimization_handling == @@ -3176,7 +3179,8 @@ common::Status InferenceSession::AddPredefinedTransformers( record_runtime_optimization_produced_op_schema_fn}} : SatApplyContextVariant{SatDirectApplicationContext{}}; return optimizer_utils::GenerateTransformersForMinimalBuild(level, session_options_, sat_context, cpu_ep, - optimizers_to_disable_); + optimizers_to_disable_, + 
GetIntraOpThreadPoolToUse()); } }(); diff --git a/onnxruntime/test/common/random_generator.h b/onnxruntime/test/common/random_generator.h index 9ab4a82463d51..9bc50ce88ef16 100644 --- a/onnxruntime/test/common/random_generator.h +++ b/onnxruntime/test/common/random_generator.h @@ -12,6 +12,7 @@ #include "core/common/common.h" #include "core/common/optional.h" #include "core/common/type_utils.h" +#include "core/framework/int4.h" #include "test/util/include/test_random_seed.h" namespace onnxruntime { @@ -108,6 +109,22 @@ class RandomValueGenerator { return val; } + template + typename std::enable_if< + std::is_same_v || std::is_same_v, + std::vector>::type + Uniform(gsl::span dims, TInt4 min, TInt4 max) { + using UnpackedType = typename TInt4::UnpackedType; + std::vector data_int8 = Uniform(dims, min.GetElem(0), max.GetElem(0)); + std::vector data(TInt4::CalcNumInt4Pairs(data_int8.size())); + for (size_t i = 0; i < data_int8.size(); i++) { + size_t r = i >> 1; + size_t c = i & 0x1; + data[r].SetElem(c, data_int8[i]); + } + return data; + } + // Gaussian distribution for float template typename std::enable_if< diff --git a/onnxruntime/test/optimizer/graph_transform_test_builder.h b/onnxruntime/test/optimizer/graph_transform_test_builder.h index 6214094a26c4f..b9af675afe74d 100644 --- a/onnxruntime/test/optimizer/graph_transform_test_builder.h +++ b/onnxruntime/test/optimizer/graph_transform_test_builder.h @@ -117,22 +117,6 @@ class ModelTestBuilder { return MakeInput(shape, data); } - template - typename std::enable_if< - std::is_same_v || std::is_same_v, - NodeArg*>::type - MakeInputInt4(const std::vector& shape, typename TInt4::UnpackedType min, typename TInt4::UnpackedType max) { - using UnpackedType = typename TInt4::UnpackedType; - std::vector data_int8 = rand_gen_.Uniform(shape, min, max); - std::vector data(TInt4::CalcNumInt4Pairs(data_int8.size())); - for (size_t i = 0; i < data_int8.size(); i++) { - size_t r = i >> 1; - size_t c = i & 0x1; - data[r].SetElem(c, data_int8[i]); - } - return MakeInput(shape, data); - } - template NodeArg* MakeInput(const std::optional>& shape, std::optional input_name = std::nullopt) { diff --git a/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc new file mode 100644 index 0000000000000..3d117794104fa --- /dev/null +++ b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc @@ -0,0 +1,425 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
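For orientation, a worked example of the initializer shapes that DQMatMulToMatMulNBitsAction::ProcessNewNode produces for the {37, 12} int4 weights used by the tests in this file; the formulas are read directly from the qdq_actions.cc change above, and the concrete numbers are illustrative only:

```cpp
// Shapes created when fusing DQ(weight {K=37, N=12}, block_size=16) -> MatMul
// into MatMulNBits (derived from DQMatMulToMatMulNBitsAction::ProcessNewNode):
//   quant_num  = (K + block_size - 1) / block_size = (37 + 15) / 16 = 3
//   blob_bytes = (block_size + 1) / 2              = 8   // 16 packed 4-bit values
//   packed weight : {N, quant_num, blob_bytes}     = {12, 3, 8}
//   scales        : {N * quant_num}                = {36}
//   zero points   : {N * ((quant_num + 1) / 2)}    = {24}  // two 4-bit values per byte
```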
+ +#include + +#include "core/common/span_utils.h" +#include "core/framework/int4.h" +#include "core/graph/node_attr_utils.h" +#include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h" +#include "core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h" +#include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h" +#include "core/session/onnxruntime_session_options_config_keys.h" + +#include "test/compare_ortvalue.h" +#include "test/test_environment.h" +#include "test/framework/test_utils.h" +#include "test/optimizer/qdq_test_utils.h" +#include "test/optimizer/graph_transform_test_builder.h" +#include "test/util/include/asserts.h" +#include "test/util/include/inference_session_wrapper.h" + +#include "gtest/gtest.h" + +#if defined(_MSC_VER) +#pragma warning(disable : 4127) +#endif // #if defined(_MSC_VER) + +struct QDQOpKeys { + const char* quantize_linear; + const char* dequantize_linear; +}; + +constexpr QDQOpKeys GetQDQOpKeys(bool use_contrib_qdq) { + if (use_contrib_qdq) { + return {"com.microsoft.QuantizeLinear", "com.microsoft.DequantizeLinear"}; + } + return {"QuantizeLinear", "DequantizeLinear"}; +} + +namespace onnxruntime { +namespace test { + +#if !defined(DISABLE_CONTRIB_OPS) + +// Input1 Input2 +// | | +// \ DQ +// \ / +// MatMul +// | +// output +template +typename std::enable_if || std::is_same_v, void>::type +RunDQMatMulNotConverted_NonConstDQ(const std::vector& input1_shape, + const std::vector& input2_shape, + const int64_t axis, + const int64_t block_size, + int64_t accuracy_level) { + auto build_test_case = [&](ModelTestBuilder& builder) { + auto* input1_arg = builder.MakeInput(input1_shape, -100.0f, 100.0f); + auto* input2_arg = builder.MakeInput(input2_shape, T(T::min_val, 0), T(T::max_val, 0)); + auto* output_arg = builder.MakeOutput(); + + // add DQ + auto* dq_output = builder.MakeIntermediate(); + NodeAttributes attrs; + utils::SetNodeAttribute(utils::MakeAttribute("axis", axis), attrs); + utils::SetNodeAttribute(utils::MakeAttribute("block_size", block_size), attrs); + + auto scale_shape = std::vector{input2_shape}; + scale_shape[axis] = (scale_shape[axis] + block_size - 1) / block_size; + auto* scale_arg = builder.MakeInitializer(scale_shape, 8.0f, 12.0f); + if constexpr (use_zp) { + auto* zp_arg = builder.MakeInitializer(scale_shape, T(0, 0), T(2, 0)); + builder.AddNode("DequantizeLinear", {input2_arg, scale_arg, zp_arg}, {dq_output}, "", &attrs); + } else { + builder.AddNode("DequantizeLinear", {input2_arg, scale_arg}, {dq_output}, "", &attrs); + } + + builder.AddNode("MatMul", {input1_arg, dq_output}, {output_arg}); + }; + + auto check_graph = [&](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + const QDQOpKeys qdq_keys = GetQDQOpKeys(false); + EXPECT_EQ(op_to_count["MatMul"], 1); + EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 0); + EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 1); + }; + + std::function add_session_options_fn{}; + if (accuracy_level >= 0) { + add_session_options_fn = [accuracy_level](SessionOptions& sess_opts) { + std::ignore = sess_opts.config_options.AddConfigEntry(kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel, + std::to_string(accuracy_level).c_str()); + }; + } + + TransformerTester(build_test_case, + check_graph, + TransformerLevel::Level1, + TransformerLevel::Level2, + 21 /*opset_version*/, + 1e-5 /*per_sample_tolerance*/, + 1e-5 /*relative_per_sample_tolerance*/, + nullptr, + add_session_options_fn); +} + 
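The helpers above set kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel through the internal SessionOptions used by TransformerTester. For reference, a minimal sketch of how an application could set the same option through the public C++ API; the model path is a placeholder, and the level values follow the MatMulNBits accuracy_level attribute (1 = fp32 compute, 4 = int8, the default for this fusion):

```cpp
#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env;
  Ort::SessionOptions session_options;
  // Config key added by this PR; "1" asks MatMulNBits for the most accurate
  // compute path, "4" (the default) allows int8 arithmetic.
  session_options.AddConfigEntry("session.qdq_matmulnbits_accuracy_level", "1");
  Ort::Session session(env, ORT_TSTR("model.onnx"), session_options);
  return 0;
}
```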
+TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_NonConstDQ) { + // DQ contrib op schema is not updated to support blocked quantization + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 1); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 1); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 1); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 1); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 4); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 4); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 4); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 4); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, -1); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, -1); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, -1); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, -1); +} + +// Input2 +// | +// DQ / +// \ / +// MatMul +// | +// output +template +typename std::enable_if || std::is_same_v, void>::type +RunDQMatMulNotConverted_FirstDQInput(const std::vector& weight_shape, + const std::vector& input2_shape, + const int64_t axis, + const int64_t block_size, + int64_t accuracy_level) { + auto build_test_case = [&](ModelTestBuilder& builder) { + auto* weight_arg = builder.MakeInitializer(weight_shape, T(T::min_val, 0), T(T::max_val, 0)); + auto* input2_arg = builder.MakeInput(input2_shape, -100.0f, 100.0f); + auto* output_arg = builder.MakeOutput(); + + // add DQ + auto* dq_output = builder.MakeIntermediate(); + NodeAttributes attrs; + utils::SetNodeAttribute(utils::MakeAttribute("axis", axis), attrs); + utils::SetNodeAttribute(utils::MakeAttribute("block_size", block_size), attrs); + + auto scale_shape = std::vector{weight_shape}; + scale_shape[axis] = (scale_shape[axis] + block_size - 1) / block_size; + auto* scale_arg = builder.MakeInitializer(scale_shape, 8.0f, 12.0f); + if constexpr (use_zp) { + auto* zp_arg = builder.MakeInitializer(scale_shape, T(0, 0), T(2, 0)); + builder.AddNode("DequantizeLinear", {weight_arg, scale_arg, zp_arg}, {dq_output}, "", &attrs); + } else { + builder.AddNode("DequantizeLinear", {weight_arg, scale_arg}, {dq_output}, "", &attrs); + } + + builder.AddNode("MatMul", {dq_output, input2_arg}, {output_arg}); + }; + + auto check_graph = [&](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + const QDQOpKeys qdq_keys = GetQDQOpKeys(false); + EXPECT_EQ(op_to_count["MatMul"], 1); + EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 0); + EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 1); + }; + + std::function add_session_options_fn{}; + if (accuracy_level >= 0) { + add_session_options_fn = [accuracy_level](SessionOptions& sess_opts) { + std::ignore = sess_opts.config_options.AddConfigEntry(kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel, + std::to_string(accuracy_level).c_str()); + }; + } + + TransformerTester(build_test_case, + check_graph, + TransformerLevel::Level1, + TransformerLevel::Level2, + 21 /*opset_version*/, + 1e-5 /*per_sample_tolerance*/, + 1e-5 /*relative_per_sample_tolerance*/, + nullptr, + add_session_options_fn); +} + +TEST(QDQTransformerTests, 
DQMatMulNotConvertedToMatMulNBits_FirstDQInput) { + // DQ contrib op schema is not updated to support blocked quantization + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 1); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 1); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 1); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 1); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 4); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 4); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 4); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 4); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, -1); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, -1); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, -1); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, -1); +} + +// Input1 +// | +// \ DQ +// \ / +// MatMul +// | +// output +template +void RunDQMatMulNotConverted_TypeShapeMismatch(const std::vector& input1_shape, + const std::vector& weight_shape, + const int64_t axis, + const int64_t block_size, + int64_t accuracy_level) { + auto build_test_case = [&](ModelTestBuilder& builder) { + auto* input_arg = builder.MakeInput(input1_shape, -100.0f, 100.0f); + auto* output_arg = builder.MakeOutput(); + NodeArg* weight_arg = nullptr; + + // add DQ + if constexpr (std::is_same_v || std::is_same_v) { + weight_arg = builder.MakeInitializer(weight_shape, T(T::min_val, 0), T(T::max_val, 0)); + } else { + weight_arg = builder.MakeInitializer(weight_shape, + std::numeric_limits::min(), + std::numeric_limits::max()); + } + + auto* dq_output = builder.MakeIntermediate(); + NodeAttributes attrs; + utils::SetNodeAttribute(utils::MakeAttribute("axis", axis), attrs); + utils::SetNodeAttribute(utils::MakeAttribute("block_size", block_size), attrs); + + auto scale_shape = std::vector{weight_shape}; + scale_shape[axis] = (scale_shape[axis] + block_size - 1) / block_size; + auto* scale_arg = builder.MakeInitializer(scale_shape, 8.0f, 12.0f); + if constexpr (use_zp) { + NodeArg* zp_arg; + if constexpr (std::is_same_v || std::is_same_v) { + zp_arg = builder.MakeInitializer(scale_shape, T(0, 0), T(2, 0)); + } else { + zp_arg = builder.MakeInitializer(scale_shape, 0, 2); + } + + builder.AddNode("DequantizeLinear", {weight_arg, scale_arg, zp_arg}, {dq_output}, "", &attrs); + } else { + builder.AddNode("DequantizeLinear", {weight_arg, scale_arg}, {dq_output}, "", &attrs); + } + + builder.AddNode("MatMul", {input_arg, dq_output}, {output_arg}); + }; + + auto check_graph = [&](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + const QDQOpKeys qdq_keys = GetQDQOpKeys(false); + EXPECT_EQ(op_to_count["MatMul"], 1); + EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 0); + EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 1); + }; + + std::function add_session_options_fn{}; + if (accuracy_level >= 0) { + add_session_options_fn = [accuracy_level](SessionOptions& sess_opts) { + std::ignore = sess_opts.config_options.AddConfigEntry(kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel, + 
std::to_string(accuracy_level).c_str()); + }; + } + + TransformerTester(build_test_case, + check_graph, + TransformerLevel::Level1, + TransformerLevel::Level2, + 21 /*opset_version*/, + 1e-5 /*per_sample_tolerance*/, + 1e-5 /*relative_per_sample_tolerance*/, + nullptr, + add_session_options_fn); +} + +TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_TypeMismatch) { + // DQ contrib op schema is not updated to support blocked quantization + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 16, 0); +} + +TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_ShapeMismatch) { + // DQ contrib op schema is not updated to support blocked quantization + // block size too small + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 8, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 8, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 8, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 8, 0); + // block size not 2's power + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 17, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 17, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 17, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 17, 0); + // not axis 0 + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 37}, 1, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 37}, 1, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 37}, 1, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 37}, 1, 16, 0); + // not rank 2 + RunDQMatMulNotConverted_TypeShapeMismatch({2, 12, 37}, {2, 37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({2, 12, 37}, {2, 37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({2, 12, 37}, {2, 37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({2, 12, 37}, {2, 37, 12}, 0, 16, 0); +} + +// Input1 +// | DQ +// \ / +// MatMul +// | DQ +// \ / +// MatMul +// | +// output +template +typename std::enable_if || std::is_same_v, void>::type +RunDQMatMulConverted(const std::vector& input1_shape, + const std::vector& weight1_shape, + const std::vector& weight2_shape, + const int64_t axis, + const int64_t block_size, + int64_t accuracy_level) { + auto build_test_case = [&](ModelTestBuilder& builder) { + auto* input_arg = builder.MakeInput(input1_shape, -100.0f, 100.0f); + auto* output_arg = builder.MakeOutput(); + + // add DQ + NodeAttributes attrs; + utils::SetNodeAttribute(utils::MakeAttribute("axis", axis), attrs); + utils::SetNodeAttribute(utils::MakeAttribute("block_size", block_size), attrs); + auto scale1_shape = std::vector{weight1_shape}; + auto scale2_shape = std::vector{weight2_shape}; + scale1_shape[axis] = (scale1_shape[axis] + block_size - 1) / block_size; + scale2_shape[axis] = (scale2_shape[axis] + block_size - 1) / 
block_size; + + auto* weight1_arg = builder.MakeInitializer(weight1_shape, T(T::min_val, 0), T(T::max_val, 0)); + auto* weight2_arg = builder.MakeInitializer(weight2_shape, T(T::min_val, 0), T(T::max_val, 0)); + auto* dq1_output = builder.MakeIntermediate(); + auto* dq2_output = builder.MakeIntermediate(); + auto* matmul1_output = builder.MakeIntermediate(); + + auto* scales1_arg = builder.MakeInitializer(scale1_shape, 8.0f, 12.0f); + auto* scales2_arg = builder.MakeInitializer(scale2_shape, 8.0f, 12.0f); + if constexpr (use_zp) { + auto* zp1_arg = builder.MakeInitializer(scale1_shape, T(0, 0), T(2, 0)); + auto* zp2_arg = builder.MakeInitializer(scale2_shape, T(0, 0), T(2, 0)); + builder.AddNode("DequantizeLinear", {weight1_arg, scales1_arg, zp1_arg}, {dq1_output}, "", &attrs); + builder.AddNode("DequantizeLinear", {weight2_arg, scales2_arg, zp2_arg}, {dq2_output}, "", &attrs); + } else { + builder.AddNode("DequantizeLinear", {weight1_arg, scales1_arg}, {dq1_output}, "", &attrs); + builder.AddNode("DequantizeLinear", {weight2_arg, scales2_arg}, {dq2_output}, "", &attrs); + } + + builder.AddNode("MatMul", {input_arg, dq1_output}, {matmul1_output}); + builder.AddNode("MatMul", {matmul1_output, dq2_output}, {output_arg}); + }; + + auto check_graph = [&](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + const QDQOpKeys qdq_keys = GetQDQOpKeys(false); + EXPECT_EQ(op_to_count["MatMul"], 0); + EXPECT_EQ(op_to_count["com.microsoft.MatMulNBits"], 2); + EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 0); + EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 0); + }; + + std::function add_session_options_fn{}; + if (accuracy_level >= 0) { + add_session_options_fn = [accuracy_level](SessionOptions& sess_opts) { + std::ignore = sess_opts.config_options.AddConfigEntry(kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel, + std::to_string(accuracy_level).c_str()); + }; + } + + TransformerTester(build_test_case, + check_graph, + TransformerLevel::Level1, + TransformerLevel::Level2, + 21 /*opset_version*/, + 1e-5 /*per_sample_tolerance*/, + 1e-5 /*relative_per_sample_tolerance*/, + nullptr, + add_session_options_fn); +} + +TEST(QDQTransformerTests, DQMatMulConvertedToMatMulNBits) { + // DQ contrib op schema is not updated to support blocked quantization + RunDQMatMulConverted({12, 12}, {12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulConverted({12, 12}, {12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulConverted({12, 12}, {12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulConverted({12, 12}, {12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulConverted({12, 12}, {12, 37}, {37, 12}, 0, 16, 1); + RunDQMatMulConverted({12, 12}, {12, 37}, {37, 12}, 0, 16, 1); + RunDQMatMulConverted({12, 12}, {12, 37}, {37, 12}, 0, 16, 1); + RunDQMatMulConverted({12, 12}, {12, 37}, {37, 12}, 0, 16, 1); +} + +#endif // !defined(DISABLE_CONTRIB_OPS) + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/optimizer/qdq_test_utils.h b/onnxruntime/test/optimizer/qdq_test_utils.h index 862408f31f004..52ac2a2541a79 100644 --- a/onnxruntime/test/optimizer/qdq_test_utils.h +++ b/onnxruntime/test/optimizer/qdq_test_utils.h @@ -517,7 +517,7 @@ GetQDQTestCaseFn BuildQDQSplitTestCase(const std::vector& input_shape, NodeArg* input_arg = nullptr; if constexpr (std::is_same_v || std::is_same_v) { - input_arg = builder.MakeInputInt4(input_shape, InputType::min_val, InputType::max_val); + input_arg = builder.MakeInput(input_shape, InputType(InputType::min_val, 0), InputType(InputType::max_val, 0)); dq_zp = 
InputType(static_cast(InputType::max_val / 2)); q_zp = OutputType(static_cast(OutputType::max_val / 2)); } else { From a6c5e2cd20dd890f416806e0afbb3b5968030f4d Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Mon, 22 Jul 2024 10:41:08 -0700 Subject: [PATCH 09/15] [CUDA] FusedMHARunnerFP16v2 thread-safe (#21420) ### Description - [x] Rewrite FusedMHARunnerFP16v2 to make it thread-safe. - [x] Add multi-threading tests Previously, the kernel parameters params is stored as a member of mha runner, which means that different threads might change the params at the same time and impacts the other threads. For example, if batch_size and seq_len was changed by another thread to larger values in setup(...), buffer overrun might happen in run(...) because a kernel could read/write memory out of range of allocated buffers. In new implementation, I change the api and remove mutable member variables to make it thread safe. Below is summary of change: Before: ``` class FusedMHARunnerFP16v2::mhaImpl { void setup(int seq_len, int batch_size) { // change scalar params } void run(input, output) { // change params for input and output pointers // launch kernel using params } Fused_multihead_attention_params_v2 params; // mutable, not thread-safe } ``` After: ``` class FusedMHARunnerFP16v2::FmhaImpl { void setup(int seq_len, int batch_size, Fused_multihead_attention_params_v2& params) { // change params } void run(params, input, output) { // change params with input and output pointers // launch kernel using params } } ``` ### Motivation and Context https://github.com/microsoft/onnxruntime/issues/18854 https://github.com/microsoft/onnxruntime/issues/21413 --- .../contrib_ops/cuda/bert/attention.cc | 12 +- .../contrib_ops/cuda/bert/attention_impl.cu | 10 +- .../cuda/bert/multihead_attention.cc | 8 +- .../contrib_ops/cuda/bert/packed_attention.cc | 14 +- .../cuda/bert/packed_attention_impl.cu | 7 +- .../bert/packed_multihead_attention_impl.cu | 6 +- .../mha_runner.cu | 230 ++++++------- .../mha_runner.h | 119 +++---- .../test/python/transformers/benchmark_mha.py | 49 ++- .../test/python/transformers/test_mha.py | 313 +++++++++++++++++- 10 files changed, 534 insertions(+), 234 deletions(-) diff --git a/onnxruntime/contrib_ops/cuda/bert/attention.cc b/onnxruntime/contrib_ops/cuda/bert/attention.cc index cacd65313ebcc..3b7f980ba1881 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/attention.cc @@ -149,8 +149,8 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { nullptr == relative_position_bias && parameters.past_sequence_length == 0 && parameters.hidden_size == parameters.v_hidden_size && - FusedMHARunnerFP16v2::is_supported(sm, parameters.head_size, sequence_length, - enable_trt_flash_attention_, true); + FusedMHARunnerFP16v2::IsSupported(sm, parameters.head_size, sequence_length, + enable_trt_flash_attention_, true); if (use_causal_fused_runner) { // Here we assume that num_heads, head_size and is_unidirectional does not change for an Attention node. 
if (nullptr == fused_fp16_runner_.get()) { @@ -171,8 +171,8 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { nullptr == present && nullptr == relative_position_bias && parameters.hidden_size == parameters.v_hidden_size && - FusedMHARunnerFP16v2::is_supported(sm, parameters.head_size, sequence_length, - enable_trt_flash_attention_, false); + FusedMHARunnerFP16v2::IsSupported(sm, parameters.head_size, sequence_length, + enable_trt_flash_attention_, false); if (use_fused_runner) { // Here we assume that num_heads, head_size and is_unidirectional does not change for an Attention node. @@ -184,8 +184,8 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { } // In case some kernel not loaded due to shared memory limit, we need to double check here. - const int S = fused_fp16_runner_->getSFromMaxSeqLen(sequence_length); - if (fused_fp16_runner_->isValid(S)) { + const int normalized_seq_len = fused_fp16_runner_->NormalizeSequenceLength(sequence_length); + if (fused_fp16_runner_->IsValid(normalized_seq_len)) { fused_runner = fused_fp16_runner_.get(); } } diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu index 150079cdf157a..997493acd9cb7 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu @@ -245,12 +245,10 @@ Status FusedTrtSelfAttention( FusedMHARunnerFP16v2* fused_fp16_runner = reinterpret_cast(data.fused_runner); - const int S = causal ? sequence_length : fused_fp16_runner->getSFromMaxSeqLen(sequence_length); + const int s = causal ? sequence_length : fused_fp16_runner->NormalizeSequenceLength(sequence_length); // B = 2 * batch_size when there is padding in input, and B = batch_size when padding is removed. - const int B = (nullptr == data.mask_index ? batch_size : 2 * batch_size); - - fused_fp16_runner->setup(S, B); + const int b = (nullptr == data.mask_index ? 
batch_size : 2 * batch_size); if (!causal) { assert(data.qkv_format == AttentionQkvFormat::QKV_BSN3H); @@ -261,12 +259,12 @@ Status FusedTrtSelfAttention( packed_qkv = data.query; } - fused_fp16_runner->run(packed_qkv, sequence_offset, data.output, stream); + fused_fp16_runner->Run(b, s, packed_qkv, sequence_offset, data.output, stream); DUMP_TENSOR("fused output", data.output, batch_size, sequence_length, parameters.num_heads, parameters.v_head_size); } else { assert(data.qkv_format == AttentionQkvFormat::Q_K_V_BNSH_QKV_BS3NH); - fused_fp16_runner->run(data.gemm_buffer, sequence_offset, data.output, stream); + fused_fp16_runner->Run(b, s, data.gemm_buffer, sequence_offset, data.output, stream); DUMP_TENSOR("fused causal output", data.output, batch_size, sequence_length, parameters.num_heads, parameters.v_head_size); } diff --git a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc index b96140f3897f9..663bd020ddac7 100644 --- a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc @@ -193,8 +193,8 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { (nullptr == key_padding_mask || is_mask_1d_seq_len) && parameters.hidden_size == parameters.v_hidden_size && parameters.sequence_length == parameters.kv_sequence_length && - FusedMHARunnerFP16v2::is_supported(sm, parameters.head_size, sequence_length, - enable_trt_flash_attention_, false); + FusedMHARunnerFP16v2::IsSupported(sm, parameters.head_size, sequence_length, + enable_trt_flash_attention_, false); if (use_fused_runner) { // Here we assume that num_heads and head_size does not change for a MultiHeadAttention node. if (nullptr == fused_fp16_runner_.get()) { @@ -206,8 +206,8 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { } // In case some kernel not loaded due to shared memory limit, we need to double check here. - const int S = fused_fp16_runner_->getSFromMaxSeqLen(sequence_length); - if (fused_fp16_runner_->isValid(S)) { + const int normalized_seq_len = fused_fp16_runner_->NormalizeSequenceLength(sequence_length); + if (fused_fp16_runner_->IsValid(normalized_seq_len)) { fused_runner = fused_fp16_runner_.get(); } } diff --git a/onnxruntime/contrib_ops/cuda/bert/packed_attention.cc b/onnxruntime/contrib_ops/cuda/bert/packed_attention.cc index a1149ddbf99f5..d1c6993d48e62 100644 --- a/onnxruntime/contrib_ops/cuda/bert/packed_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/packed_attention.cc @@ -55,11 +55,11 @@ MHARunner* TrtFusedAttention::GetFusedRunner(const cudaDeviceProp& device_pro // Check whether we can use fused kernel int sm = device_prop.major * 10 + device_prop.minor; - bool is_fMHA_supported = FusedMHARunnerFP16v2::is_supported(sm, - parameters.head_size, - parameters.sequence_length, - enable_trt_flash_attention_, - false /*causal*/); + bool is_fMHA_supported = FusedMHARunnerFP16v2::IsSupported(sm, + parameters.head_size, + parameters.sequence_length, + enable_trt_flash_attention_, + false /*causal*/); if (!is_fMHA_supported) { return fused_runner; @@ -72,8 +72,8 @@ MHARunner* TrtFusedAttention::GetFusedRunner(const cudaDeviceProp& device_pro } // In case some kernel not loaded due to shared memory limit, we need to double check here. 
- const int S = fused_fp16_runner_->getSFromMaxSeqLen(parameters.sequence_length); - if (fused_fp16_runner_->isValid(S)) { + const int normalized_seq_len = fused_fp16_runner_->NormalizeSequenceLength(parameters.sequence_length); + if (fused_fp16_runner_->IsValid(normalized_seq_len)) { fused_runner = fused_fp16_runner_.get(); } diff --git a/onnxruntime/contrib_ops/cuda/bert/packed_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/packed_attention_impl.cu index db9f30c25c013..ac2cb5165a94c 100644 --- a/onnxruntime/contrib_ops/cuda/bert/packed_attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/packed_attention_impl.cu @@ -459,10 +459,9 @@ Status FusedScaledDotProductAttention( parameters.token_count, stream); FusedMHARunnerFP16v2* fused_fp16_runner = reinterpret_cast(fused_runner); - const int S = fused_fp16_runner->getSFromMaxSeqLen(sequence_length); - fused_fp16_runner->setup(S, batch_size); - - fused_fp16_runner->run(data.workspace, data.cumulative_sequence_length, data.output, stream); + const int normalized_seq_len = fused_fp16_runner->NormalizeSequenceLength(sequence_length); + fused_fp16_runner->Run(batch_size, normalized_seq_len, + data.workspace, data.cumulative_sequence_length, data.output, stream); return Status::OK(); } diff --git a/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention_impl.cu index 3e168189be3d5..b4ca0194b08bc 100644 --- a/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention_impl.cu @@ -575,10 +575,8 @@ Status FusedAttentionTrt( } FusedMHARunnerFP16v2* fused_fp16_runner = reinterpret_cast(fused_runner); - const int S = fused_fp16_runner->getSFromMaxSeqLen(sequence_length); - fused_fp16_runner->setup(S, batch_size); - - fused_fp16_runner->run(qkv, data.cumulative_sequence_length, data.output, stream); + const int normalized_seq_len = fused_fp16_runner->NormalizeSequenceLength(sequence_length); + fused_fp16_runner->Run(batch_size, normalized_seq_len, qkv, data.cumulative_sequence_length, data.output, stream); return Status::OK(); } diff --git a/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.cu b/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.cu index 4a4e3eeecf642..8af28e874729a 100644 --- a/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.cu +++ b/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.cu @@ -14,6 +14,10 @@ * limitations under the License. */ +// Modifications: Update interface and implmentation to be thread-safe +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
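To make the intended call pattern of the refactored runner concrete, a small hypothetical caller using the new interface introduced in this diff (the wrapper function itself is illustrative, not part of the change); per-call kernel parameters are now built inside Run(), so multiple threads can safely share one runner:

```cpp
// Illustrative only: expected usage of the thread-safe FusedMHARunnerFP16v2.
// NormalizeSequenceLength/IsValid/Run are the methods introduced below; the
// runner keeps no mutable per-call state, unlike the old setup()/run() pair.
// (Assumes the mha_runner.h declarations included just below.)
void RunFusedMha(FusedMHARunnerFP16v2& runner, int batch_size, int max_seq_len,
                 const void* packed_qkv, const int* cu_seqlens, void* output,
                 cudaStream_t stream) {
  const int s = runner.NormalizeSequenceLength(max_seq_len);
  if (runner.IsValid(s)) {
    runner.Run(batch_size, s, packed_qkv, cu_seqlens, output, stream);
  }
}
```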
+ #include "contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.h" #include "contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/fused_multihead_attention_v2.h" #include "contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/flash_attention/fmha_flash_attention.h" @@ -34,28 +38,28 @@ void set_alpha_fp16(uint32_t& alpha, float norm) { alpha = temp.u32; } -class FusedMHARunnerFP16v2::mhaImpl { +class FusedMHARunnerFP16v2::FmhaImpl { public: - mhaImpl(FusedMHARunnerFP16v2* interface) - : interface(interface), - sm(interface->mSm), - xmmaKernel(getXMMAKernelsV2(DATA_TYPE_FP16, sm)) { + FmhaImpl(FusedMHARunnerFP16v2* interface, int sm) + : interface_(interface), + sm_(sm), + xmma_kernel_(getXMMAKernelsV2(DATA_TYPE_FP16, sm)) { ORT_ENFORCE((sm == kSM_70 || sm == kSM_75 || sm == kSM_80 || sm == kSM_86 || sm == kSM_89), "Unsupported architecture"); - flash_attention_kernel = nullptr; - if (interface->mEnableFlashAttention) { - flash_attention_kernel = get_flash_attention_kernels(DATA_TYPE_FP16, sm); + flash_kernel_ = nullptr; + if (interface_->enable_flash_attention_) { + flash_kernel_ = get_flash_attention_kernels(DATA_TYPE_FP16, sm); } - - params.clear(); } - ~mhaImpl() {} + ~FmhaImpl() {} - void setup(const int seq_len, const int B) { - // For bert and vit, use flash attention when sequence length is larger than the threshold. - use_flash_attention = is_flash_attention(seq_len); + void Setup(Fused_multihead_attention_params_v2& params, + int sequence_length, // normalized sequence length + int batch_size, + bool& use_flash_attention) const { + use_flash_attention = UseFlashAttention(sequence_length); params.force_unroll = use_flash_attention; @@ -67,27 +71,27 @@ class FusedMHARunnerFP16v2::mhaImpl { warps_m = 4; warps_n = 1; } else { - if (sm == 70) { - if (seq_len == 64 || seq_len == 96) { + if (sm_ == 70) { + if (sequence_length == 64 || sequence_length == 96) { warps_m = 2; warps_n = 2; - } else if (seq_len == 128) { + } else if (sequence_length == 128) { warps_m = 1; warps_n = 4; - } else if (seq_len == 256 || seq_len == 384) { + } else if (sequence_length == 256 || sequence_length == 384) { warps_m = 1; warps_n = 8; } else { ORT_ENFORCE(false, "Unsupported sequence length"); } } else { - if (seq_len == 32 || seq_len == 64 || seq_len == 96 || seq_len == 128) { + if (sequence_length == 32 || sequence_length == 64 || sequence_length == 96 || sequence_length == 128) { warps_m = 2; warps_n = 2; - } else if (seq_len == 192 || seq_len == 256) { + } else if (sequence_length == 192 || sequence_length == 256) { warps_m = 1; warps_n = 4; - } else if (seq_len == 384) { + } else if (sequence_length == 384) { warps_m = 1; warps_n = 8; } else { @@ -97,11 +101,11 @@ class FusedMHARunnerFP16v2::mhaImpl { } // The number of threads per CTA. - threads_per_cta = warps_m * warps_n * warps_k * 32; + size_t threads_per_cta = warps_m * warps_n * warps_k * 32; // The number of xmmas in the M dimension. We use one uint32_t per XMMA in the M dimension. 
- xmmas_m = (seq_len + 16 * warps_m - 1) / (16 * warps_m); + size_t xmmas_m = (sequence_length + 16 * warps_m - 1) / (16 * warps_m); - const float scale_bmm1 = interface->mScale; + const float scale_bmm1 = interface_->scale_; const float scale_softmax = 1.f; // Seems to be only required for int8 const float scale_bmm2 = 1.f; @@ -109,20 +113,21 @@ class FusedMHARunnerFP16v2::mhaImpl { set_alpha_fp16(params.scale_softmax, scale_softmax); set_alpha_fp16(params.scale_bmm2, scale_bmm2); - params.b = B; - params.h = interface->mNumHeads; - params.s = seq_len; - params.d = interface->mHeadSize; + params.b = batch_size; + params.h = interface_->num_heads_; + params.s = sequence_length; + params.d = interface_->head_size_; - params.qkv_stride_in_bytes = 3 * interface->mNumHeads * interface->mHeadSize * sizeof(half); + params.qkv_stride_in_bytes = 3 * interface_->num_heads_ * interface_->head_size_ * sizeof(half); params.packed_mask_stride_in_bytes = xmmas_m * threads_per_cta * sizeof(uint32_t); - params.o_stride_in_bytes = interface->mNumHeads * interface->mHeadSize * sizeof(half); - - has_causal_mask = false; + params.o_stride_in_bytes = interface_->num_heads_ * interface_->head_size_ * sizeof(half); } - void setup_causal_masked_fmha(const int seq_len, const int B) { - const float scale_bmm1 = interface->mScale; + void SetupCausal(Fused_multihead_attention_params_v2& params, + int sequence_length, // normalized sequence length + int batch_size, + bool& use_flash_attention) const { + const float scale_bmm1 = interface_->scale_; const float scale_softmax = 1.f; // Seems to be only required for int8 const float scale_bmm2 = 1.f; @@ -130,16 +135,17 @@ class FusedMHARunnerFP16v2::mhaImpl { set_alpha_fp16(params.scale_softmax, scale_softmax); set_alpha_fp16(params.scale_bmm2, scale_bmm2); - params.b = B; - params.h = interface->mNumHeads; - params.s = seq_len; - params.d = interface->mHeadSize; + params.b = batch_size; + params.h = interface_->num_heads_; + params.s = sequence_length; + params.d = interface_->head_size_; + + params.qkv_stride_in_bytes = 3 * interface_->num_heads_ * interface_->head_size_ * sizeof(half); + params.o_stride_in_bytes = interface_->num_heads_ * interface_->head_size_ * sizeof(half); - params.qkv_stride_in_bytes = 3 * interface->mNumHeads * interface->mHeadSize * sizeof(half); - params.o_stride_in_bytes = interface->mNumHeads * interface->mHeadSize * sizeof(half); + use_flash_attention = interface_->enable_flash_attention_; - // fallback to original fmha_v2 when head_size <= 64 and seq_len <- 128 - use_flash_attention = interface->mEnableFlashAttention; + // fallback to original fmha_v2 when head_size <= 64 and sequence_length <= 128 if (params.d <= 64 && params.s <= 128) { use_flash_attention = false; // get max sequence length @@ -152,97 +158,87 @@ class FusedMHARunnerFP16v2::mhaImpl { // set flags params.force_unroll = use_flash_attention; - has_causal_mask = true; } - void run(const void* input, const void* cu_seqlens, void* output, cudaStream_t stream) { + void Run(Fused_multihead_attention_params_v2& params, + const void* input, + const void* cu_seqlens, + void* output, + cudaStream_t stream, + bool use_flash_attention, + bool has_causal_mask) const { params.qkv_ptr = const_cast(input); params.o_ptr = output; params.cu_seqlens = static_cast(const_cast(cu_seqlens)); - if (use_flash_attention && flash_attention_kernel != nullptr && !has_causal_mask) { - flash_attention_kernel->run(params, stream); + if (use_flash_attention && flash_kernel_ != nullptr && !has_causal_mask) 
{ + flash_kernel_->run(params, stream); } else { - xmmaKernel->run(params, stream, use_flash_attention, has_causal_mask); + xmma_kernel_->run(params, stream, use_flash_attention, has_causal_mask); } CUDA_CALL_THROW(cudaPeekAtLastError()); } - bool isValid(int s) const { - if (is_flash_attention(s)) { - return (flash_attention_kernel != nullptr) && flash_attention_kernel->isValid(s); + bool IsValid(int sequence_length) const { + if (UseFlashAttention(sequence_length)) { + return (flash_kernel_ != nullptr) && flash_kernel_->isValid(sequence_length); } - return xmmaKernel->isValid(s); + return xmma_kernel_->isValid(sequence_length); } - int getSFromMaxSeqLen(const int max_seq_len) const { - if (is_flash_attention(max_seq_len)) { + int NormalizeSequenceLength(int max_seq_len) const { + if (UseFlashAttention(max_seq_len)) { return max_seq_len; } - int seq_len = max_seq_len; + int sequence_length = max_seq_len; if (max_seq_len <= 32) { - seq_len = (sm == 70) ? 64 : 32; + sequence_length = (sm_ == 70) ? 64 : 32; } else if (max_seq_len <= 64) { - seq_len = 64; + sequence_length = 64; } else if (max_seq_len <= 96) { - seq_len = 96; + sequence_length = 96; } else if (max_seq_len <= 128) { - seq_len = 128; + sequence_length = 128; } else if (max_seq_len <= 192) { - seq_len = (sm == 70) ? 256 : 192; + sequence_length = (sm_ == 70) ? 256 : 192; } else if (max_seq_len <= 256) { - seq_len = 256; + sequence_length = 256; } else if (max_seq_len <= 384) { - seq_len = 384; + sequence_length = 384; } - return seq_len; + return sequence_length; } protected: - bool is_flash_attention(const int seq_len) const { - ORT_ENFORCE(interface->mHasCausalMask == false); - return interface->mEnableFlashAttention && seq_len >= kMinSequenceLengthFlashAttention; + bool UseFlashAttention(int sequence_length) const { + ORT_ENFORCE(interface_->is_causal_ == false); + return interface_->enable_flash_attention_ && sequence_length >= kMinSequenceLengthFlashAttention; } private: - FusedMHARunnerFP16v2* interface; - Fused_multihead_attention_params_v2 params; - int sm; - const FusedMultiHeadAttentionXMMAKernelV2* xmmaKernel; - const FusedMultiHeadFlashAttentionKernel* flash_attention_kernel; - size_t xmmas_m; - size_t threads_per_cta; - bool use_flash_attention = false; - bool has_causal_mask = false; + FusedMHARunnerFP16v2* interface_; + int sm_; + const FusedMultiHeadAttentionXMMAKernelV2* xmma_kernel_; + const FusedMultiHeadFlashAttentionKernel* flash_kernel_; }; -FusedMHARunnerFP16v2::FusedMHARunnerFP16v2(const int numHeads, - const int headSize, - const int sm, - bool causal_mask, +FusedMHARunnerFP16v2::FusedMHARunnerFP16v2(int num_heads, + int head_size, + int sm, + bool causal, bool enable_flash_attention, - const float scale) - : MHARunner(numHeads, headSize, 2, causal_mask, scale), - mSm(sm), - mEnableFlashAttention(enable_flash_attention), - pimpl(new mhaImpl(this)) { + float scale) + : MHARunner(num_heads, head_size, causal, scale), + enable_flash_attention_(enable_flash_attention), + impl_(new FmhaImpl(this, sm)) { } -void FusedMHARunnerFP16v2::setup(const int seq_len, const int B) { - MHARunner::setup(seq_len, B); - if (mHasCausalMask) { - pimpl->setup_causal_masked_fmha(seq_len, B); - } else { - pimpl->setup(seq_len, B); - } -} - -bool FusedMHARunnerFP16v2::is_supported(int sm, int head_size, int sequence_length, - bool enable_flash_attention, bool causal) { +bool FusedMHARunnerFP16v2::IsSupported(int sm, int head_size, int sequence_length, + bool enable_flash_attention, bool causal) { if (causal) { if (!(sm == kSM_70 
|| sm == kSM_75 || sm == kSM_80 || sm == kSM_86 || sm == kSM_89)) { return false; @@ -284,34 +280,44 @@ bool FusedMHARunnerFP16v2::is_supported(int sm, int head_size, int sequence_leng return sequence_length <= max_sequence_length; } -size_t FusedMHARunnerFP16v2::getWorkspaceSize() const { - return 0; -} +void FusedMHARunnerFP16v2::Run(int batch_size, + int normalized_sequence_length, + const void* input, + const void* cu_seqlens, + void* output, + cudaStream_t stream) const { + Fused_multihead_attention_params_v2 params; + bool use_flash_attention = false; + if (is_causal_) { + impl_->SetupCausal(params, normalized_sequence_length, batch_size, use_flash_attention); + } else { + impl_->Setup(params, normalized_sequence_length, batch_size, use_flash_attention); + } -void FusedMHARunnerFP16v2::run(const void* input, const void* cu_seqlens, void* output, cudaStream_t stream) { - pimpl->run(input, cu_seqlens, output, stream); + impl_->Run(params, input, cu_seqlens, output, stream, use_flash_attention, is_causal_); } -bool FusedMHARunnerFP16v2::isValid(int s) const { - return pimpl->isValid(s); +bool FusedMHARunnerFP16v2::IsValid(int normalized_sequence_length) const { + return impl_->IsValid(normalized_sequence_length); } -int FusedMHARunnerFP16v2::getSFromMaxSeqLen(const int max_seq_len) const { - return pimpl->getSFromMaxSeqLen(max_seq_len); +int FusedMHARunnerFP16v2::NormalizeSequenceLength(int max_seq_len) const { + return impl_->NormalizeSequenceLength(max_seq_len); } -std::unique_ptr FusedMHARunnerFP16v2::Create(const int numHeads, - const int headSize, - const int sm, - bool causal_mask, - bool enable_flash_attention, - const float scale) { +std::unique_ptr FusedMHARunnerFP16v2::Create(int num_heads, + int head_size, + int sm, + bool causal, + bool enable_flash_attention, + const float scale) { #ifdef _MSC_VER - return std::make_unique(numHeads, headSize, sm, causal_mask, enable_flash_attention, scale); + return std::make_unique(num_heads, head_size, sm, causal, enable_flash_attention, scale); #else - // Linux build has error using make_unique: invalid application of ‘sizeof’ to incomplete type ‘onnxruntime::contrib::cuda::FusedMHARunnerFP16v2::mhaImpl + // Linux build has error using make_unique: invalid application of ‘sizeof’ to + // incomplete type ‘onnxruntime::contrib::cuda::FusedMHARunnerFP16v2::FmhaImpl std::unique_ptr runner; - runner.reset(new FusedMHARunnerFP16v2(numHeads, headSize, sm, causal_mask, enable_flash_attention, scale)); + runner.reset(new FusedMHARunnerFP16v2(num_heads, head_size, sm, causal, enable_flash_attention, scale)); return runner; #endif } diff --git a/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.h b/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.h index f7c1dc85361df..82914b07e524c 100644 --- a/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.h +++ b/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.h @@ -14,6 +14,10 @@ * limitations under the License. */ +// Modifications: Update interface and implmentation to be thread-safe +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ #pragma once #include @@ -25,103 +29,70 @@ namespace cuda { constexpr int kMinSequenceLengthFlashAttention = 385; -// Multi-Head Attention runner class MHARunner { public: - MHARunner(const int numHeads, const int headSize, const int wordSize, bool causal_mask, const float scale) - : mS(0), - mB(0), - mOmatSize(0), - mNumMats(0), - mNumHeads(numHeads), - mHeadSize(headSize), - mWordSize(wordSize), - mLdQKV(0), - mStrideQKV(0), - mLdOut(0), - mStrideOut(0), - mScale(scale == 0.0f ? 1.f / sqrtf(static_cast(headSize)) - : scale), - mHasCausalMask(causal_mask) { + MHARunner(int num_heads, int head_size, bool causal, float scale) + : num_heads_(num_heads), + head_size_(head_size), + scale_(scale == 0.0f ? 1.f / sqrtf(static_cast(head_size)) : scale), + is_causal_(causal) { } virtual ~MHARunner() = default; - virtual void setup(const int S, const int B) { - ORT_ENFORCE(S > 0); - ORT_ENFORCE(B > 0); - - mB = B; - mS = S; - - mLdQKV = 3 * B * mNumHeads * mHeadSize; - mStrideQKV = 3 * mHeadSize; - - mLdOut = B * mNumHeads * mHeadSize; - mStrideOut = mHeadSize; - mOmatSize = S * S; - mNumMats = B * mNumHeads; - } - - virtual void run(const void* input, const void* cu_seqlens, void* output, cudaStream_t stream) = 0; - - virtual size_t getWorkspaceSize() const = 0; + virtual int NormalizeSequenceLength(int max_seq_len) const = 0; - virtual bool isValid(int s) const = 0; + virtual bool IsValid(int normalized_sequence_length) const = 0; - virtual int getSFromMaxSeqLen(const int max_seq_len) const = 0; + virtual void Run(int batch_size, + int normalized_sequence_length, + const void* input, + const void* cu_seqlens, + void* output, + cudaStream_t stream) const = 0; protected: - int mS; - int mB; - int mOmatSize; - int mNumMats; - int mNumHeads; - int mHeadSize; - int mWordSize; - int mLdQKV; - int mStrideQKV; - int mLdOut; - int mStrideOut; - - float mScale; - bool mHasCausalMask; + int num_heads_; + int head_size_; + float scale_; + bool is_causal_; }; class FusedMHARunnerFP16v2 : public MHARunner { public: - FusedMHARunnerFP16v2(const int numHeads, - const int headSize, - const int sm, - bool causal_mask, + FusedMHARunnerFP16v2(int num_heads, + int head_size, + int sm, + bool causal, bool enable_flash_attention, - const float scale); - ~FusedMHARunnerFP16v2() = default; // for pimpl - - virtual void setup(const int S, const int B) override; + float scale); - static bool is_supported(int sm, int head_size, int sequence_length, bool enable_flash_attention, bool causal); + ~FusedMHARunnerFP16v2() = default; // for impl_ - void run(const void* input, const void* cu_seqlens, void* output, cudaStream_t stream) override; + static bool IsSupported(int sm, int head_size, int sequence_length, bool enable_flash_attention, bool causal); - size_t getWorkspaceSize() const override; + static std::unique_ptr Create(int num_heads, + int head_size, + int sm, + bool causal, + bool enable_flash_attention, + float scale); - bool isValid(int s) const override; + bool IsValid(int normalized_sequence_length) const override; - int getSFromMaxSeqLen(const int max_seq_len) const override; + int NormalizeSequenceLength(int max_seq_len) const override; - static std::unique_ptr Create(const int numHeads, - const int headSize, - const int sm, - bool causal_mask, - bool enable_flash_attention, - const float scale); + void Run(int batch_size, + int normalized_sequence_length, + const void* input, + const void* cu_seqlens, + void* output, + cudaStream_t stream) const override; private: - int mSm; - bool mEnableFlashAttention; - class 
mhaImpl; - std::unique_ptr pimpl; + bool enable_flash_attention_; + class FmhaImpl; + std::unique_ptr impl_; }; } // namespace cuda diff --git a/onnxruntime/test/python/transformers/benchmark_mha.py b/onnxruntime/test/python/transformers/benchmark_mha.py index 797461bae2efd..111c417479d20 100644 --- a/onnxruntime/test/python/transformers/benchmark_mha.py +++ b/onnxruntime/test/python/transformers/benchmark_mha.py @@ -156,6 +156,49 @@ def shape_dict(self, input_format=None): ) return shapes + def symbolic_shape_dict(self, input_format=None): + input_format = input_format or self.input_format + if input_format == InputFormats.Q_K_V_BSNH_BNSH_BNSH: + # cross attention does not have past state + return { + "query": ("batch_size", "sequence_length", self.num_heads * self.head_size), + "key": ("batch_size", self.num_heads, "sequence_length", self.head_size), + "value": ("batch_size", self.num_heads, "sequence_length", self.head_size), + "output": ("batch_size", "sequence_length", self.num_heads * self.head_size), + } + + if self.use_kv_cache: + shapes = { + "past_key": ("batch_size", self.num_heads, "past_buffer_length", self.head_size), + "past_value": ("batch_size", self.num_heads, "past_buffer_length", self.head_size), + "output": ("batch_size", "sequence_length", self.num_heads * self.head_size), + "present_key": ("batch_size", self.num_heads, "present_buffer_length", self.head_size), + "present_value": ("batch_size", self.num_heads, "present_buffer_length", self.head_size), + } + else: + shapes = { + "output": ("batch_size", "sequence_length", self.num_heads * self.head_size), + } + + if input_format == InputFormats.QKV_BSN3H: + shapes.update({"query": ("batch_size", "sequence_length", self.num_heads, 3, self.head_size)}) + elif input_format == InputFormats.Q_KV_BSNH_BSN2H: + shapes.update( + { + "query": ("batch_size", "sequence_length", self.num_heads * self.head_size), + "key": ("batch_size", "sequence_length", self.num_heads, 2, self.head_size), + } + ) + else: # input_format == InputFormats.Q_K_V_BSNH_BSNH_BSNH + shapes.update( + { + "query": ("batch_size", "sequence_length", self.num_heads * self.head_size), + "key": ("batch_size", "sequence_length", self.num_heads * self.head_size), + "value": ("batch_size", "sequence_length", self.num_heads * self.head_size), + } + ) + return shapes + def random_inputs(self, seed: int = 123): device = self.device dtype = self.dtype @@ -215,7 +258,7 @@ def random_inputs(self, seed: int = 123): def get_input_output_names(self): if self.input_format == InputFormats.Q_K_V_BSNH_BNSH_BNSH: - return ["query", "key"], ["output"] + return ["query", "key", "value"], ["output"] if self.input_format == InputFormats.QKV_BSN3H: inputs, outputs = ["query"], ["output"] @@ -235,7 +278,7 @@ def fill_optional_mha_inputs(input_names): return input_names[:-2] + [""] * (len(inputs) - len(input_names)) + input_names[-2:] -def create_multi_head_attention_onnx_model(config: MultiHeadAttentionConfig): +def create_multi_head_attention_onnx_model(config: MultiHeadAttentionConfig, use_symbolic_shape=False): input_names, output_names = config.get_input_output_names() float_type = TensorProto.FLOAT16 if config.dtype == torch.float16 else TensorProto.FLOAT @@ -252,7 +295,7 @@ def create_multi_head_attention_onnx_model(config: MultiHeadAttentionConfig): ), ] - shape_dict = config.shape_dict() + shape_dict = config.symbolic_shape_dict() if use_symbolic_shape else config.shape_dict() inputs = [ helper.make_tensor_value_info(input_name, float_type, list(shape_dict[input_name])) for 
input_name in input_names diff --git a/onnxruntime/test/python/transformers/test_mha.py b/onnxruntime/test/python/transformers/test_mha.py index 5335e7115ad78..ff473cc2ced92 100644 --- a/onnxruntime/test/python/transformers/test_mha.py +++ b/onnxruntime/test/python/transformers/test_mha.py @@ -7,17 +7,39 @@ Test MultiHeadAttention operator for CUDA and CPU. """ +import concurrent.futures import itertools import unittest -from typing import Optional +from enum import IntEnum +from typing import Dict, List, Optional import numpy import torch -from benchmark_mha import InputFormats, MultiHeadAttentionConfig, OrtMultiHeadAttention +from benchmark_mha import ( + InputFormats, + MultiHeadAttentionConfig, + OrtMultiHeadAttention, + create_multi_head_attention_onnx_model, +) from einops import rearrange from parameterized import parameterized import onnxruntime +from onnxruntime import InferenceSession + + +class SdpaKernel(IntEnum): + """Bit flags for sdpa_kernel CUDA provider option""" + + DEFAULT = 0 + FLASH_ATTENTION = 1 + EFFICIENT_ATTENTION = 2 + TRT_FUSED_ATTENTION = 4 + CUDNN_FLASH_ATTENTION = 8 + MATH = 16 + TRT_FLASH_ATTENTION = 32 + TRT_CROSS_ATTENTION = 64 + TRT_CAUSAL_ATTENTION = 128 def attention_reference( @@ -105,9 +127,16 @@ def mha_with_past_reference( def get_provider_support_info(provider: str, use_kv_cache: bool): if provider == "CUDAExecutionProvider": - formats = [InputFormats.Q_K_V_BSNH_BSNH_BSNH, InputFormats.Q_KV_BSNH_BSN2H, InputFormats.QKV_BSN3H] if not use_kv_cache: - formats.append(InputFormats.Q_K_V_BSNH_BSNH_BSNH) + formats = [ + InputFormats.Q_K_V_BSNH_BSNH_BSNH, + InputFormats.Q_KV_BSNH_BSN2H, + InputFormats.QKV_BSN3H, + InputFormats.Q_K_V_BSNH_BNSH_BNSH, + ] + else: + formats = [InputFormats.Q_K_V_BSNH_BSNH_BSNH] + device_id = torch.cuda.current_device() device = torch.device("cuda", device_id) dtype = torch.float16 @@ -121,15 +150,16 @@ def get_provider_support_info(provider: str, use_kv_cache: bool): return device, dtype, formats -def has_cuda_support(): +def get_compute_capability(): if torch.cuda.is_available() and "CUDAExecutionProvider" in onnxruntime.get_available_providers(): - major, _ = torch.cuda.get_device_capability() - return major >= 6 - return False + major, minor = torch.cuda.get_device_capability() + sm = major * 10 + minor + return sm + return 0 def no_kv_cache_test_cases(provider: str, comprehensive: bool): - if provider == "CUDAExecutionProvider" and not has_cuda_support(): + if provider == "CUDAExecutionProvider" and get_compute_capability() < 60: return yield @@ -192,7 +222,7 @@ def no_kv_cache_test_cases(provider: str, comprehensive: bool): def kv_cache_test_cases(provider: str, comprehensive: bool): - if provider == "CUDAExecutionProvider" and not has_cuda_support(): + if provider == "CUDAExecutionProvider" and get_compute_capability() < 60: return yield @@ -262,6 +292,92 @@ def mha_test_cases(provider: str, comprehensive: bool): ) +def no_kv_cache_multi_thread_test_cases(provider: str, comprehensive: bool): + if provider == "CUDAExecutionProvider" and get_compute_capability() < 60: + return + yield + + batch_sizes = [1, 2] + sequence_lengths = [1, 16, 127, 128, 255, 256, 383, 384, 400] if comprehensive else [1, 64, 128, 256] + heads = [4] + head_sizes = [8, 16, 32, 40, 64, 80, 96, 128, 160, 192, 224, 256] if comprehensive else [32, 64] + + device, dtype, formats = get_provider_support_info(provider, False) + + for format in formats: + for causal in [True, False]: + for num_heads in heads: + for head_size in head_sizes: + configs = [] # 
list of configurations to run in parallel + for batch_size in batch_sizes: + for sequence_length in sequence_lengths: + config = MultiHeadAttentionConfig( + batch_size=batch_size, + sequence_length=sequence_length, + num_heads=num_heads, + head_size=head_size, + causal=causal, + past_sequence_length=0, + kv_sequence_length=sequence_length, + max_cache_sequence_length=None, + provider=provider, + device=device, + dtype=dtype, + use_kv_cache=False, + share_past_present_buffer=False, + input_format=format, + ) + configs.append(config) + yield configs + + +def kv_cache_multi_thread_test_cases(provider: str, comprehensive: bool): + if provider == "CUDAExecutionProvider" and get_compute_capability() < 60: + return + yield + + batch_sizes = [1, 2] + sequence_lengths = [1, 32, 127, 128, 383, 384, 400] if comprehensive else [1, 32, 127, 128] + heads = [4] + head_sizes = [8, 16, 32, 40, 64, 80, 96, 128, 160, 192, 224, 256] if comprehensive else [32, 64] + + sequence_length = 1 + device, dtype, formats = get_provider_support_info(provider, True) + + for format in formats: + for causal in [True, False]: + for num_heads in heads: + for head_size in head_sizes: + configs = [] + for batch_size in batch_sizes: + for past_sequence_length in sequence_lengths: + config = MultiHeadAttentionConfig( + batch_size=batch_size, + sequence_length=sequence_length, + num_heads=num_heads, + head_size=head_size, + causal=causal, + past_sequence_length=past_sequence_length, + kv_sequence_length=sequence_length, + max_cache_sequence_length=None, + provider=provider, + device=device, + dtype=dtype, + use_kv_cache=True, + share_past_present_buffer=False, + input_format=format, + ) + configs.append(config) + yield configs + + +def multi_thread_test_cases(provider: str, comprehensive: bool): + return itertools.chain( + no_kv_cache_multi_thread_test_cases(provider, comprehensive), + kv_cache_multi_thread_test_cases(provider, comprehensive), + ) + + def causal_mask(seqlen_q, seqlen_k, query_padding_mask=None, key_padding_mask=None, device=None): row_idx = rearrange(torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1") col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long) @@ -346,20 +462,189 @@ def parity_check_mha( ) +def parity_check_mha_multi_threading( + test_inputs: List[Dict], + rtol: float = 1e-3, + atol: float = 1e-3, + sdpa_kernel: int = SdpaKernel.DEFAULT, + max_threads: int = 5, + verbose: bool = False, +): + # Use the first config to create a session, which is shared by all configs to run in parallel. + config = test_inputs[0]["config"] + # For now, MHA CUDA kernel does not support causal so skip such test cases. + if config.causal and config.provider == "CUDAExecutionProvider": + return None + # Some kernel does not support certain input format. 
+ if sdpa_kernel not in [ + SdpaKernel.DEFAULT, + SdpaKernel.FLASH_ATTENTION, + SdpaKernel.EFFICIENT_ATTENTION, + ] and config.input_format in [InputFormats.Q_KV_BSNH_BSN2H]: + return None + if verbose: + print(f"create a shared session with {vars(config)}") + onnx_model_str = create_multi_head_attention_onnx_model(config, use_symbolic_shape=True) + if config.provider == "CUDAExecutionProvider": + provider_options = {"arena_extend_strategy": "kSameAsRequested", "sdpa_kernel": int(sdpa_kernel)} + providers = [(config.provider, provider_options), "CPUExecutionProvider"] + else: + providers = ["CPUExecutionProvider"] + ort_session = InferenceSession(onnx_model_str, providers=providers) + + def convert_to_ort_inputs(feed_dict): + ort_inputs = {} + + for k, v in feed_dict.items(): + if isinstance(v, numpy.ndarray): + ort_inputs[k] = v + else: + ort_inputs[k] = v.detach().cpu().numpy() + return ort_inputs + + def check_parity_with_config(i: int): + config = test_inputs[i]["config"] + if verbose: + print(f"Thread {i} with {vars(config)}") + + ort_inputs = test_inputs[i]["ort_inputs"] + + if verbose: + print(f"Thread {i} ort inputs: {ort_inputs}") + ort_outputs = ort_session.run(None, convert_to_ort_inputs(ort_inputs)) + out = numpy.reshape( + ort_outputs[0], (config.batch_size, config.sequence_length, config.num_heads, config.head_size) + ) + + # Create reference inputs + config.input_format = InputFormats.Q_K_V_BSNH_BSNH_BSNH + ref_inputs = test_inputs[i]["ref_inputs"] + if verbose: + print(f"Thread {i} ref inputs: {ref_inputs}") + q = ( + ref_inputs["query"] + .reshape((config.batch_size, config.sequence_length, config.num_heads, config.head_size)) + .transpose(1, 2) + ) + k = ( + ref_inputs["key"] + .reshape((config.batch_size, config.kv_sequence_length, config.num_heads, config.head_size)) + .transpose(1, 2) + ) + v = ( + ref_inputs["value"] + .reshape((config.batch_size, config.kv_sequence_length, config.num_heads, config.head_size)) + .transpose(1, 2) + ) + + mask = None + if config.causal: + mask = causal_mask(config.sequence_length, config.total_sequence_length, device=config.device) + + k_cache = None + v_cache = None + if config.use_kv_cache: + past_k = ref_inputs["past_key"] + past_v = ref_inputs["past_value"] + out_ref, k_cache, v_cache = mha_with_past_reference(config, past_k, past_v, q, k, v, mask=mask) + else: + out_ref = attention_reference(config.head_size, q, k, v, mask=mask) + + try: + numpy.testing.assert_allclose( + out, + out_ref.detach().cpu().numpy(), + rtol=rtol, + atol=atol, + equal_nan=True, + err_msg=f"output not close: {config=}", + ) + + if config.use_kv_cache: + present_key = ort_outputs[1] + numpy.testing.assert_allclose( + k_cache.detach().cpu().numpy(), + present_key, + rtol=rtol, + atol=atol, + equal_nan=True, + err_msg=f"present_key not close: {config=}", + ) + + present_value = ort_outputs[2] + numpy.testing.assert_allclose( + v_cache.detach().cpu().numpy(), + present_value, + rtol=rtol, + atol=atol, + equal_nan=True, + err_msg=f"present_value not close: {config=}", + ) + except AssertionError as e: + print(f"Failed with {vars(config)}: {e}") + return e + + if verbose: + print(f"Passed: {vars(config)}") + return None + + num_threads = min(max_threads, len(test_inputs)) + + with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor: + future_tasks = [executor.submit(check_parity_with_config, i) for i in range(num_threads)] + for future in concurrent.futures.as_completed(future_tasks): + result = future.result() + if result is not None: + 
return result + + return None + + # Do not run too many tests in CI pipeline. Change it to True to run all combinations in dev machine. comprehensive_mode = False class TestMultiHeadAttention(unittest.TestCase): - # TODO: enable tests on CUDAExecutionProvider after fixing the issue. - # @parameterized.expand(mha_test_cases("CUDAExecutionProvider", comprehensive_mode), skip_on_empty=True) - # def test_mha_cuda(self, config): - # parity_check_mha(config) + @parameterized.expand(mha_test_cases("CUDAExecutionProvider", comprehensive_mode), skip_on_empty=True) + def test_mha_cuda(self, config): + parity_check_mha(config) @parameterized.expand(mha_test_cases("CPUExecutionProvider", comprehensive_mode), skip_on_empty=True) def test_mha_cpu(self, config): parity_check_mha(config) + def run_mha_cuda_multi_threading(self, spda_kernel): + for configs in multi_thread_test_cases("CUDAExecutionProvider", comprehensive_mode): + test_inputs = [] + for config in configs: + ort_inputs = config.random_inputs() + + # Create reference inputs + old_format = config.input_format + config.input_format = InputFormats.Q_K_V_BSNH_BSNH_BSNH + ref_inputs = config.random_inputs() + config.input_format = old_format + test_inputs.append({"config": config, "ort_inputs": ort_inputs, "ref_inputs": ref_inputs}) + + exception = parity_check_mha_multi_threading(test_inputs, sdpa_kernel=spda_kernel, max_threads=len(configs)) + assert exception is None, f"{spda_kernel=}, {vars(configs[0])}, {exception}" + + def test_mha_cuda_multi_threading(self): + self.run_mha_cuda_multi_threading(SdpaKernel.DEFAULT) + + def test_mha_cuda_multi_threading_efficient(self): + self.run_mha_cuda_multi_threading(SdpaKernel.EFFICIENT_ATTENTION) + + def test_mha_cuda_multi_threading_trt(self): + sm = get_compute_capability() + if sm in [75, 80, 86, 89]: + self.run_mha_cuda_multi_threading( + SdpaKernel.TRT_FUSED_ATTENTION + | SdpaKernel.TRT_FLASH_ATTENTION + | SdpaKernel.TRT_CROSS_ATTENTION + | SdpaKernel.TRT_CAUSAL_ATTENTION + ) + if __name__ == "__main__": with torch.no_grad(): From 17e9ea62352b71e3d432a66b70e666ade128cac6 Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Tue, 23 Jul 2024 02:56:09 +0800 Subject: [PATCH 10/15] [WebNN EP] Add outputDataType option for the ArgMax/ArgMin ops (#21385) ### Description WebNN spec introduces a new option: `outputDataType` to `argMax` and `argMin` ops, it's default value is `int32`, we should explicitly set it to `int64` for WebNN EP. Spec CR: "Add outputDataType to argmin/argmax" https://github.com/webmachinelearning/webnn/pull/730 --- .../providers/webnn/builders/impl/argmax_min_op_builder.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc index f8b77b6350a76..1330a3e354871 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc @@ -50,6 +50,11 @@ Status ArgMaxMinOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, options.set("axes", axes); options.set("keepDimensions", keep_dims == 1); options.set("selectLastIndex", select_last_index == 1); + // TODO: use WebNN's opSupportLimits API to check the backend's supported output data types. + // If the backend doesn't support int64 output, we should use default int32 output data type + // then do a type casting (int32 -> int64) for the output. 
Refer to the CoreML EP for how to + // support int64 output. + options.set("outputDataType", "int64"); emscripten::val output = emscripten::val::object(); const auto& op_type = node.OpType(); From 4e75605eec985e579ee1e8db9a2bb2fd441a4837 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Mon, 22 Jul 2024 12:39:10 -0700 Subject: [PATCH 11/15] Replace inline pip install with pip install from requirements*.txt (#21106) ### Description Replace inline pip install with pip install from requirements*.txt ### Motivation and Context so that CG can recognize ### Dependency - [x] https://github.com/microsoft/onnxruntime/pull/21085 --- tools/ci_build/build.py | 27 +++++++++---------- .../orttraining-pai-ci-pipeline.yml | 2 +- .../requirements/pybind/requirements.txt | 8 ++++++ .../transformers-test/requirements.txt} | 0 4 files changed, 21 insertions(+), 16 deletions(-) create mode 100644 tools/ci_build/requirements/pybind/requirements.txt rename tools/ci_build/{requirements-transformers-test.txt => requirements/transformers-test/requirements.txt} (100%) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 75fbf5d0851ae..54f7b6c3a8fa7 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -261,9 +261,6 @@ def convert_arg_line_to_args(self, arg_line): "--wheel_name_suffix", help="Suffix to append to created wheel names. This value is currently only used for nightly builds.", ) - parser.add_argument( - "--numpy_version", help="Installs a specific version of numpy before building the python binding." - ) parser.add_argument("--skip-keras-test", action="store_true", help="Skip tests with Keras if keras is installed") # C-Sharp bindings @@ -868,16 +865,6 @@ def update_submodules(source_dir): run_subprocess(["git", "submodule", "update", "--init", "--recursive"], cwd=source_dir) -def install_python_deps(numpy_version=""): - dep_packages = ["setuptools", "wheel", "pytest"] - dep_packages.append(f"numpy=={numpy_version}" if numpy_version else "numpy>=1.16.6") - dep_packages.append("sympy>=1.10") - dep_packages.append("packaging") - dep_packages.append("cerberus") - dep_packages.append("psutil") - run_subprocess([sys.executable, "-m", "pip", "install", *dep_packages]) - - def setup_test_data(source_onnx_model_dir, dest_model_dir_name, build_dir, configs): # create the symlink/shortcut of onnx models dir under build_dir # currently, there're 2 sources of onnx models, one is build in OS image, another is @@ -2146,7 +2133,14 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): numpy_init_version = numpy.__version__ pb_init_version = google.protobuf.__version__ run_subprocess( - [sys.executable, "-m", "pip", "install", "-r", "requirements-transformers-test.txt"], + [ + sys.executable, + "-m", + "pip", + "install", + "-r", + "requirements/transformers-test/requirements.txt", + ], cwd=SCRIPT_DIR, ) run_subprocess([sys.executable, "-m", "pytest", "transformers"], cwd=cwd) @@ -2818,7 +2812,10 @@ def main(): run_subprocess([emsdk_file, "activate", emsdk_version], cwd=emsdk_dir) if args.enable_pybind and is_windows(): - install_python_deps(args.numpy_version) + run_subprocess( + [sys.executable, "-m", "pip", "install", "-r", "requirements/pybind/requirements.txt"], + cwd=SCRIPT_DIR, + ) if args.use_rocm and args.rocm_version is None: args.rocm_version = "" diff --git a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml index 0e1afdcc5b8ca..2c520a25cb39e 100644 --- 
a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml @@ -276,7 +276,7 @@ jobs: onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-test \ /bin/bash -c " set -ex; \ - pip install -r /onnxruntime_src/tools/ci_build/requirements-transformers-test.txt; \ + pip install -r /onnxruntime_src/tools/ci_build/requirements/transformers-test/requirements.txt; \ pytest /onnxruntime_src/onnxruntime/test/python/transformers/test_flash_attn_rocm.py -v -n 4 --reruns 1" workingDirectory: $(Build.SourcesDirectory) displayName: 'Run tranformers tests' diff --git a/tools/ci_build/requirements/pybind/requirements.txt b/tools/ci_build/requirements/pybind/requirements.txt new file mode 100644 index 0000000000000..8f00a25627c21 --- /dev/null +++ b/tools/ci_build/requirements/pybind/requirements.txt @@ -0,0 +1,8 @@ +setuptools +wheel +pytest +numpy>=1.19.0 +sympy>=1.10 +packaging +cerberus +psutil diff --git a/tools/ci_build/requirements-transformers-test.txt b/tools/ci_build/requirements/transformers-test/requirements.txt similarity index 100% rename from tools/ci_build/requirements-transformers-test.txt rename to tools/ci_build/requirements/transformers-test/requirements.txt From 5b9369e93c704f55fd6a4e49934f41dccffe55a5 Mon Sep 17 00:00:00 2001 From: mindest <30493312+mindest@users.noreply.github.com> Date: Tue, 23 Jul 2024 04:37:32 +0800 Subject: [PATCH 12/15] Fix typos according to reviewdog report. (#21335) ### Description Fix typos based on reviewdog report but with some exceptions/corrections. --- .gitattributes | 2 +- ThirdPartyNotices.txt | 2 +- cmake/onnxruntime.cmake | 2 +- .../composable_kernel/Fix_Clang_Build.patch | 2 +- .../default/partials/title.tmpl.partial | 2 +- dockerfiles/README.md | 4 +- .../onnx-inference-byoc-gpu-cpu-aks.ipynb | 4 +- .../platform/EigenNonBlockingThreadPool.h | 2 +- .../core/providers/cuda/cuda_context.h | 12 +- .../core/providers/cuda/cuda_resource.h | 2 +- .../core/providers/rocm/rocm_context.h | 9 +- .../core/providers/rocm/rocm_resource.h | 2 +- .../core/session/onnxruntime_c_api.h | 19 ++-- .../core/session/onnxruntime_lite_custom_op.h | 2 +- java/build.gradle | 2 +- .../main/java/ai/onnxruntime/OnnxRuntime.java | 2 +- .../onnxruntime/providers/package-info.java | 2 +- java/src/test/java/sample/ScoreMNIST.java | 2 +- .../backends/webgl/glsl-coordinate-lib.ts | 2 +- js/web/lib/onnxjs/backends/webgl/ops/pack.ts | 2 +- .../cpu/attnlstm/deep_cpu_attn_lstm.h | 2 +- .../cpu/transformers/sampling_cpu_helper.h | 2 +- onnxruntime/core/codegen/common/common.cc | 2 +- onnxruntime/core/codegen/mti/common.h | 2 +- .../passes/scheduler/schedule_utils.cc | 4 +- .../codegen/passes/scheduler/schedule_utils.h | 4 +- .../passes/scheduler/tvm_schedule_builder.cc | 2 +- .../passes/weight_layout/weight_layout.h | 2 +- onnxruntime/core/common/logging/logging.cc | 2 +- onnxruntime/core/common/status.cc | 2 +- .../core/framework/allocation_planner.cc | 4 +- .../core/framework/allocation_planner.h | 6 +- .../framework/device_stream_collection.cc | 3 +- onnxruntime/core/framework/execution_frame.h | 2 +- .../partial_graph_execution_state.cc | 2 +- .../framework/sequential_execution_plan.h | 6 +- .../core/framework/sequential_executor.cc | 2 +- onnxruntime/core/framework/session_options.h | 2 +- onnxruntime/core/framework/session_state.cc | 4 +- onnxruntime/core/framework/sparse_tensor.cc | 4 +- onnxruntime/core/framework/tensorprotoutils.h | 6 +- onnxruntime/core/framework/utils.h | 2 +- 
.../mickey/gemm/warp/quantb_meta_loader.h | 8 +- onnxruntime/core/mlas/lib/convolve.cpp | 4 +- .../core/optimizer/attention_fusion_helper.h | 4 +- .../free_dim_override_transformer.cc | 4 +- .../core/optimizer/insert_cast_transformer.cc | 6 +- .../onnx_transpose_optimization.cc | 4 +- .../core/providers/acl/nn/batch_norm.cc | 2 +- onnxruntime/core/providers/acl/nn/pool.cc | 2 +- .../providers/armnn/activation/activations.cc | 2 +- onnxruntime/core/providers/armnn/math/gemm.h | 2 +- .../core/providers/armnn/nn/batch_norm.cc | 2 +- onnxruntime/core/providers/armnn/nn/conv.cc | 2 +- onnxruntime/core/providers/armnn/nn/pool.cc | 4 +- .../math/einsum_utils/einsum_auxiliary_ops.cc | 4 +- .../einsum_typed_compute_processor.cc | 2 +- .../cpu/object_detection/roialign.cc | 2 +- .../providers/cpu/sequence/sequence_ops.cc | 2 +- .../core/providers/cpu/tensor/unique.cc | 2 +- .../core/providers/cuda/cuda_allocator.cc | 2 +- .../core/providers/cuda/cuda_stream_handle.cc | 2 +- .../cuda/math/softmax_blockwise_impl.cuh | 2 +- onnxruntime/core/providers/cuda/nn/conv.cc | 4 +- .../cuda/object_detection/roialign_impl.cu | 106 +++++++++--------- .../providers/cuda/reduction/reduction_ops.cc | 2 +- .../core/providers/cuda/tensor/resize_impl.cu | 6 +- .../providers/cuda/tensor/transpose_impl.cu | 4 +- .../src/Operators/DmlOperatorFusedMatMul.cpp | 2 +- .../providers/dnnl/dnnl_execution_provider.cc | 2 +- .../providers/dnnl/dnnl_node_capability.cc | 6 +- .../core/providers/dnnl/subgraph/dnnl_conv.h | 8 +- .../providers/dnnl/subgraph/dnnl_convgrad.cc | 2 +- .../providers/dnnl/subgraph/dnnl_convgrad.h | 4 +- .../dnnl/subgraph/dnnl_dequantizelinear.cc | 2 +- .../providers/dnnl/subgraph/dnnl_matmul.cc | 8 +- .../providers/dnnl/subgraph/dnnl_reduce.cc | 8 +- .../dnnl/subgraph/dnnl_subgraph_primitive.h | 2 +- .../providers/dnnl/subgraph/dnnl_transpose.cc | 5 +- .../providers/migraphx/migraphx_allocator.cc | 2 +- .../migraphx/migraphx_stream_handle.cc | 2 +- .../nnapi_lib/nnapi_implementation.cc | 2 +- .../rknpu/rknpu_execution_provider.cc | 4 +- onnxruntime/core/providers/rocm/nn/conv.cc | 2 +- .../providers/rocm/reduction/reduction_ops.cc | 2 +- .../core/providers/rocm/rocm_allocator.cc | 2 +- .../core/providers/rocm/rocm_stream_handle.cc | 2 +- .../webnn/webnn_execution_provider.cc | 2 +- onnxruntime/core/session/IOBinding.cc | 2 +- onnxruntime/core/session/inference_session.cc | 2 +- onnxruntime/core/session/inference_session.h | 2 +- onnxruntime/core/util/qmath.h | 8 +- .../python/onnxruntime_pybind_schema.cc | 2 +- .../onnxruntime_pybind_sparse_tensor.cc | 4 +- .../python/onnxruntime_pybind_state.cc | 18 +-- .../python/tools/quantization/calibrate.py | 4 +- .../tools/quantization/operators/direct_q8.py | 2 +- .../python/tools/quantization/quant_utils.py | 2 +- .../python/tools/transformers/README.md | 2 +- .../python/tools/transformers/benchmark.py | 2 +- .../transformers/large_model_exporter.py | 2 +- .../models/gpt2/benchmark_gpt2.py | 2 +- .../models/gpt2/convert_to_onnx.py | 2 +- .../transformers/models/gpt2/gpt2_tester.py | 2 +- .../PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb | 2 +- .../tools/transformers/onnx_model_phi.py | 2 +- .../tools/transformers/onnx_model_tnlr.py | 2 +- .../python/tools/transformers/optimizer.py | 2 +- .../tools/transformers/shape_optimizer.py | 2 +- .../contrib_ops/attention_lstm_op_test.cc | 16 +-- .../test/framework/allocation_planner_test.cc | 2 +- .../test/framework/inference_session_test.cc | 2 +- onnxruntime/test/framework/tunable_op_test.cc | 2 +- 
.../test/fuzzing/include/BetaDistribution.h | 4 +- onnxruntime/test/fuzzing/src/test.cpp | 2 +- onnxruntime/test/ir/graph_test.cc | 4 +- .../test/ir/schema_registry_manager_test.cc | 4 +- .../test/mlas/unittest/test_fgemm_fixture.h | 2 +- .../test/mlas/unittest/test_halfgemm.cpp | 2 +- .../test/mlas/unittest/test_pool2d_fixture.h | 2 +- .../test/mlas/unittest/test_pool3d_fixture.h | 2 +- .../test/mlas/unittest/test_qgemm_fixture.h | 2 +- .../test/mlas/unittest/test_sbgemm.cpp | 4 +- .../mlas/unittest/test_symm_qgemm_fixture.h | 2 +- .../optimizer/transpose_optimizer_test.cc | 4 +- onnxruntime/test/perftest/ReadMe.txt | 2 +- onnxruntime/test/perftest/ort_test_session.cc | 2 +- .../platform/android/cxa_demangle_test.cc | 2 +- .../providers/cpu/controlflow/scan_test.cc | 2 +- .../cpu/rnn/deep_cpu_lstm_op_test.cc | 4 +- .../cuda/test_cases/allocator_cuda_test.cc | 2 +- .../matmul_post_op_transform_test.cc | 6 +- .../internal_testing_partitioning_tests.cc | 2 +- .../test/providers/qnn/qnn_ep_context_test.cc | 4 +- .../test/python/onnxruntime_test_python.py | 2 +- .../python/onnxruntime_test_python_mlops.py | 2 +- .../onnxruntime_test_python_sparse_matmul.py | 2 +- .../test_quantize_static_resnet.py | 2 +- .../python/transformers/test_generation.py | 2 +- onnxruntime/test/shared_lib/test_inference.cc | 6 +- .../model_creation_for_testing.ipynb | 2 +- .../self_attention_megatron_basic_test.py | 2 +- ...transpose_optimizer_shared_initializers.py | 2 +- onnxruntime/wasm/api.h | 2 +- .../core/framework/adasum/adasum_interface.h | 2 +- .../core/framework/ortmodule_graph_builder.cc | 2 +- .../orttraining/core/framework/pipeline.cc | 2 +- .../core/graph/mixed_precision_transformer.cc | 2 +- .../core/graph/optimizer_graph_builder.h | 2 +- .../core/graph/pipeline_transformer.cc | 4 +- .../core/graph/training_op_defs.cc | 22 ++-- .../core/optimizer/graph_transformer_config.h | 2 +- .../core/session/training_session.cc | 4 +- orttraining/orttraining/models/bert/main.cc | 16 +-- orttraining/orttraining/models/mnist/main.cc | 16 +-- .../models/runner/training_runner.cc | 2 +- .../python/training/onnxblock/blocks.py | 4 +- .../python/training/ortmodule/__init__.py | 2 +- .../cuda/fused_ops/fused_ops_frontend.cpp | 6 +- .../test/distributed/partition_utils.h | 6 +- .../orttraining/test/graph/bert_toy_fetches.h | 2 +- .../python/orttraining_test_ortmodule_api.py | 2 +- .../test/python/qat_poc_example/qat.py | 2 +- .../training_ops/cuda/cross_entropy_test.cc | 2 +- .../orttraining/training_api/optimizer.cc | 6 +- .../orttraining/training_api/optimizer.h | 2 +- .../cpu/activation/activations_grad.cc | 2 +- .../training_ops/cuda/math/div_grad_impl.cu | 2 +- .../training_ops/cuda/optimizer/lamb.cc | 2 +- .../training_ops/cuda/optimizer/lamb_impl.cu | 2 +- .../tools/scripts/layer_norm_transform.py | 2 +- orttraining/tools/scripts/model_transform.py | 2 +- .../tools/scripts/opset12_model_transform.py | 2 +- rust/onnxruntime-sys/examples/c_api_sample.rs | 4 +- .../src/tensor/ort_output_tensor.rs | 2 +- tools/ci_build/build.py | 4 +- .../azure-pipelines/web-ci-pipeline.yml | 2 +- tools/python/util/android/android.py | 2 +- winml/adapter/winml_adapter_session.cpp | 2 +- ...rosoft.AI.MachineLearning.Experimental.idl | 2 +- winml/api/Windows.AI.MachineLearning.idl | 2 +- winml/lib/Api/LearningModelBinding.cpp | 2 +- winml/lib/Api/impl/NumericData.h | 2 +- .../test/api/LearningModelSessionAPITest.cpp | 2 +- winml/test/common/googleTestMacros.h | 2 +- winml/test/common/taefTestMacros.h | 2 +- winml/test/common/test.h | 6 +- 
winml/test/image/imagetests.cpp | 4 +- winml/test/model/model_tests.cpp | 2 +- 189 files changed, 380 insertions(+), 360 deletions(-) diff --git a/.gitattributes b/.gitattributes index 41eae6dac52f5..8bfd419922d6b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,4 +1,4 @@ -# This sets the default behaviour, overriding core.autocrlf +# This sets the default behavior, overriding core.autocrlf * text=auto # All source files should have unix line-endings in the repository, diff --git a/ThirdPartyNotices.txt b/ThirdPartyNotices.txt index 8ec770da22159..6a11f414361bd 100644 --- a/ThirdPartyNotices.txt +++ b/ThirdPartyNotices.txt @@ -4820,7 +4820,7 @@ SOFTWARE. ---------------------------------------------------------------------------- -This is the MIT/Expat Licence. For more information see: +This is the MIT/Expat License. For more information see: 1. http://www.opensource.org/licenses/mit-license.php diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index 21ae0947f3788..0e89c2f14d34b 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -150,7 +150,7 @@ endif() if(CMAKE_SYSTEM_NAME STREQUAL "Android" AND onnxruntime_MINIMAL_BUILD) # target onnxruntime is a shared library, the dummy __cxa_demangle is only attach to it to avoid - # affecting downstream ort library users with the behaviour of dummy __cxa_demangle. So the dummy + # affecting downstream ort library users with the behavior of dummy __cxa_demangle. So the dummy # __cxa_demangle must not expose to libonnxruntime_common.a. It works as when the linker is # creating the DSO, our dummy __cxa_demangle always comes before libc++abi.a so the # __cxa_demangle in libc++abi.a is discarded, thus, huge binary size reduction. diff --git a/cmake/patches/composable_kernel/Fix_Clang_Build.patch b/cmake/patches/composable_kernel/Fix_Clang_Build.patch index 0352f8eb9bb34..73ece647d82c7 100644 --- a/cmake/patches/composable_kernel/Fix_Clang_Build.patch +++ b/cmake/patches/composable_kernel/Fix_Clang_Build.patch @@ -44,7 +44,7 @@ index c23746e7f..bc326c8b5 100644 find_package(HIP REQUIRED) # Override HIP version in config.h, if necessary. @@ -269,12 +248,6 @@ if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH ) - message(STATUS "CK_HIP_VERSION_PATCH overriden with ${CK_OVERRIDE_HIP_VERSION_PATCH}") + message(STATUS "CK_HIP_VERSION_PATCH overridden with ${CK_OVERRIDE_HIP_VERSION_PATCH}") endif() message(STATUS "Build with HIP ${HIP_VERSION}") -link_libraries(hip::device) diff --git a/csharp/ApiDocs/_exported_templates/default/partials/title.tmpl.partial b/csharp/ApiDocs/_exported_templates/default/partials/title.tmpl.partial index 38c62fe55f603..fd589fd74877c 100644 --- a/csharp/ApiDocs/_exported_templates/default/partials/title.tmpl.partial +++ b/csharp/ApiDocs/_exported_templates/default/partials/title.tmpl.partial @@ -39,7 +39,7 @@ Event {{name.0.value}} Operator {{name.0.value}} {{/inOperator}} {{#inEii}} -Explict Interface Implementation {{name.0.value}} +Explicit Interface Implementation {{name.0.value}} {{/inEii}} {{#inVariable}} Variable {{name.0.value}} diff --git a/dockerfiles/README.md b/dockerfiles/README.md index a2e99d66d4654..008587a01082b 100644 --- a/dockerfiles/README.md +++ b/dockerfiles/README.md @@ -32,7 +32,7 @@ docker run -it onnxruntime-source ``` -The docker file supports both x86_64 and ARM64(aarch64). You may use docker's "--platform" parameter to explictly specify which CPU architecture you want to build. For example: +The docker file supports both x86_64 and ARM64(aarch64). 
You may use docker's "--platform" parameter to explicitly specify which CPU architecture you want to build. For example: ```bash docker build --platform linux/arm64/v8 -f Dockerfile.source @@ -274,7 +274,7 @@ Note: You may add --use_tensorrt and --tensorrt_home options if you wish to use Note: Resulting Docker image will have ONNX Runtime installed in /usr, and ONNX Runtime wheel copied to /onnxruntime directory. Nothing else from ONNX Runtime source tree will be copied/installed to the image. -Note: When running the container you built in Docker, please either use 'nvidia-docker' command instead of 'docker', or use Docker command-line options to make sure NVIDIA runtime will be used and appropiate files mounted from host. Otherwise, CUDA libraries won't be found. You can also [set NVIDIA runtime as default in Docker](https://github.com/dusty-nv/jetson-containers#docker-default-runtime). +Note: When running the container you built in Docker, please either use 'nvidia-docker' command instead of 'docker', or use Docker command-line options to make sure NVIDIA runtime will be used and appropriate files mounted from host. Otherwise, CUDA libraries won't be found. You can also [set NVIDIA runtime as default in Docker](https://github.com/dusty-nv/jetson-containers#docker-default-runtime). ## MIGraphX **Ubuntu 20.04, ROCm6.0, MIGraphX** diff --git a/docs/python/notebooks/onnx-inference-byoc-gpu-cpu-aks.ipynb b/docs/python/notebooks/onnx-inference-byoc-gpu-cpu-aks.ipynb index be34a812c77db..c1278b63a84d3 100644 --- a/docs/python/notebooks/onnx-inference-byoc-gpu-cpu-aks.ipynb +++ b/docs/python/notebooks/onnx-inference-byoc-gpu-cpu-aks.ipynb @@ -64,7 +64,7 @@ "If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, please follow the [Azure ML configuration notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) to set up your environment.\n", "\n", "### Install additional packages needed for this Notebook\n", - "You need to install the popular plotting library matplotlib, the image manipulation library opencv, and the onnx library in the conda environment where Azure Maching Learning SDK is installed.\n", + "You need to install the popular plotting library matplotlib, the image manipulation library opencv, and the onnx library in the conda environment where Azure Machine Learning SDK is installed.\n", "\n", "```\n", "(myenv) $ pip install matplotlib onnx opencv-python\n", @@ -79,7 +79,7 @@ "source": [ "## 1. Obtain a model from the ONNX Model Zoo\n", "\n", - "For more information on the Facial Emotion Recognition (FER+) model, you can explore the notebook explaning how to deploy [FER+ with ONNX Runtime on an ACI Instance](onnx-inference-facial-expression-recognition-deploy.ipynb)." + "For more information on the Facial Emotion Recognition (FER+) model, you can explore the notebook explaining how to deploy [FER+ with ONNX Runtime on an ACI Instance](onnx-inference-facial-expression-recognition-deploy.ipynb)." 
] }, { diff --git a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h index f9b694efb936f..e33007102e198 100644 --- a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h +++ b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h @@ -1129,7 +1129,7 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter // // Ensure that the ThreadPoolParallelSection has sufficient workers to // execute a loop with degree of parallelism n. We track the number - // of workers already avaiable to the parallel section, prior to + // of workers already available to the parallel section, prior to // submitting tasks to the work queues to make up the total. // // Each worker will call in to worker_fn(idx) with a per-worker thread diff --git a/include/onnxruntime/core/providers/cuda/cuda_context.h b/include/onnxruntime/core/providers/cuda/cuda_context.h index 9ada01673d4d9..462b31bb433a5 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_context.h +++ b/include/onnxruntime/core/providers/cuda/cuda_context.h @@ -53,7 +53,8 @@ struct CudaContext : public CustomOpContext { cudnn_conv_use_max_workspace = FetchResource(kernel_ctx, CudaResource::cudnn_conv_use_max_workspace_t); cudnn_conv1d_pad_to_nc1d = FetchResource(kernel_ctx, CudaResource::cudnn_conv1d_pad_to_nc1d_t); - enable_skip_layer_norm_strict_mode = FetchResource(kernel_ctx, CudaResource::enable_skip_layer_norm_strict_mode_t); + enable_skip_layer_norm_strict_mode = FetchResource( + kernel_ctx, CudaResource::enable_skip_layer_norm_strict_mode_t); prefer_nhwc = FetchResource(kernel_ctx, CudaResource::prefer_nhwc_t); use_tf32 = FetchResource(kernel_ctx, CudaResource::use_tf32_t); } @@ -61,13 +62,16 @@ struct CudaContext : public CustomOpContext { template T FetchResource(const OrtKernelContext& kernel_ctx, CudaResource resource_type) { if constexpr (sizeof(T) > sizeof(void*)) { - ORT_CXX_API_THROW("void* is not large enough to hold resource type: " + std::to_string(resource_type), OrtErrorCode::ORT_INVALID_ARGUMENT); + ORT_CXX_API_THROW("void* is not large enough to hold resource type: " + std::to_string(resource_type), + OrtErrorCode::ORT_INVALID_ARGUMENT); } const auto& ort_api = Ort::GetApi(); void* resource = {}; - OrtStatus* status = ort_api.KernelContext_GetResource(&kernel_ctx, ORT_CUDA_RESOUCE_VERSION, resource_type, &resource); + OrtStatus* status = ort_api.KernelContext_GetResource( + &kernel_ctx, ORT_CUDA_RESOURCE_VERSION, resource_type, &resource); if (status) { - ORT_CXX_API_THROW("Failed to fetch cuda ep resource, resouce type: " + std::to_string(resource_type), OrtErrorCode::ORT_RUNTIME_EXCEPTION); + ORT_CXX_API_THROW("Failed to fetch cuda ep resource, resource type: " + std::to_string(resource_type), + OrtErrorCode::ORT_RUNTIME_EXCEPTION); } T t = {}; memcpy(&t, &resource, sizeof(T)); diff --git a/include/onnxruntime/core/providers/cuda/cuda_resource.h b/include/onnxruntime/core/providers/cuda/cuda_resource.h index 00e7dec5727d1..555023c442c01 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_resource.h +++ b/include/onnxruntime/core/providers/cuda/cuda_resource.h @@ -3,7 +3,7 @@ #include "core/providers/resource.h" -#define ORT_CUDA_RESOUCE_VERSION 3 +#define ORT_CUDA_RESOURCE_VERSION 3 enum CudaResource : int { cuda_stream_t = cuda_resource_offset, // 10000 diff --git a/include/onnxruntime/core/providers/rocm/rocm_context.h b/include/onnxruntime/core/providers/rocm/rocm_context.h index 
5f04289a8c6e0..f187e0cbb3a89 100644 --- a/include/onnxruntime/core/providers/rocm/rocm_context.h +++ b/include/onnxruntime/core/providers/rocm/rocm_context.h @@ -23,21 +23,24 @@ struct RocmContext : public CustomOpContext { void* resource = {}; OrtStatus* status = nullptr; - status = ort_api.KernelContext_GetResource(&kernel_ctx, ORT_ROCM_RESOUCE_VERSION, RocmResource::hip_stream_t, &resource); + status = ort_api.KernelContext_GetResource( + &kernel_ctx, ORT_ROCM_RESOURCE_VERSION, RocmResource::hip_stream_t, &resource); if (status) { ORT_CXX_API_THROW("failed to fetch hip stream", OrtErrorCode::ORT_RUNTIME_EXCEPTION); } hip_stream = reinterpret_cast(resource); resource = {}; - status = ort_api.KernelContext_GetResource(&kernel_ctx, ORT_ROCM_RESOUCE_VERSION, RocmResource::miopen_handle_t, &resource); + status = ort_api.KernelContext_GetResource( + &kernel_ctx, ORT_ROCM_RESOURCE_VERSION, RocmResource::miopen_handle_t, &resource); if (status) { ORT_CXX_API_THROW("failed to fetch miopen handle", OrtErrorCode::ORT_RUNTIME_EXCEPTION); } miopen_handle = reinterpret_cast(resource); resource = {}; - status = ort_api.KernelContext_GetResource(&kernel_ctx, ORT_ROCM_RESOUCE_VERSION, RocmResource::rocblas_handle_t, &resource); + status = ort_api.KernelContext_GetResource( + &kernel_ctx, ORT_ROCM_RESOURCE_VERSION, RocmResource::rocblas_handle_t, &resource); if (status) { ORT_CXX_API_THROW("failed to fetch rocblas handle", OrtErrorCode::ORT_RUNTIME_EXCEPTION); } diff --git a/include/onnxruntime/core/providers/rocm/rocm_resource.h b/include/onnxruntime/core/providers/rocm/rocm_resource.h index 53f26c13e93e0..772447a1809d8 100644 --- a/include/onnxruntime/core/providers/rocm/rocm_resource.h +++ b/include/onnxruntime/core/providers/rocm/rocm_resource.h @@ -3,7 +3,7 @@ #include "core/providers/resource.h" -#define ORT_ROCM_RESOUCE_VERSION 1 +#define ORT_ROCM_RESOURCE_VERSION 1 enum RocmResource : int { hip_stream_t = rocm_resource_offset, diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index 5c61963a2f39c..5aafdd149e889 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -473,13 +473,13 @@ typedef struct OrtCUDAProviderOptions { /** \brief Enable TunableOp for using. * Set it to 1/0 to enable/disable TunableOp. Otherwise, it is disabled by default. - * This option can be overriden by environment variable ORT_CUDA_TUNABLE_OP_ENABLE. + * This option can be overridden by environment variable ORT_CUDA_TUNABLE_OP_ENABLE. */ int tunable_op_enable; /** \brief Enable TunableOp for tuning. * Set it to 1/0 to enable/disable TunableOp tuning. Otherwise, it is disabled by default. - * This option can be overriden by environment variable ORT_CUDA_TUNABLE_OP_TUNING_ENABLE. + * This option can be overridden by environment variable ORT_CUDA_TUNABLE_OP_TUNING_ENABLE. */ int tunable_op_tuning_enable; @@ -562,13 +562,13 @@ typedef struct OrtROCMProviderOptions { /** \brief Enable TunableOp for using. * Set it to 1/0 to enable/disable TunableOp. Otherwise, it is disabled by default. - * This option can be overriden by environment variable ORT_ROCM_TUNABLE_OP_ENABLE. + * This option can be overridden by environment variable ORT_ROCM_TUNABLE_OP_ENABLE. */ int tunable_op_enable; /** \brief Enable TunableOp for tuning. * Set it to 1/0 to enable/disable TunableOp tuning. Otherwise, it is disabled by default. 
- * This option can be overriden by environment variable ORT_ROCM_TUNABLE_OP_TUNING_ENABLE. + * This option can be overridden by environment variable ORT_ROCM_TUNABLE_OP_TUNING_ENABLE. */ int tunable_op_tuning_enable; @@ -2798,7 +2798,7 @@ struct OrtApi { * "initial_growth_chunk_size_bytes": (Possible) Size of the second allocation in the arena. * Only relevant if arena strategy is `kNextPowerOfTwo`. Use -1 to allow ORT to choose the default. * "max_power_of_two_extend_bytes": The maximum enxtend size if arena strategy is `kNextPowerOfTwo`. - * It is not an allocation limit, it is only a limit for extention when requested byte is less than the limit. + * It is not an allocation limit, it is only a limit for extension when requested byte is less than the limit. * When requested bytes is more than the limit, allocator will still return as requested. * Use -1 to allow ORT to choose the default 1GB for max_power_of_two_extend_bytes. * Ultimately, the allocation size is determined by the allocation memory request. @@ -4467,13 +4467,14 @@ struct OrtApi { * E.g. a cuda stream or a cublas handle * * \param context - Kernel context - * \param resouce_version - Version of the resource + * \param resource_version - Version of the resource * \param resource_id - Type of resource * \param resource - A pointer to returned resource * * \since Version 1.16. */ - ORT_API2_STATUS(KernelContext_GetResource, _In_ const OrtKernelContext* context, _In_ int resouce_version, _In_ int resource_id, _Outptr_ void** resource); + ORT_API2_STATUS(KernelContext_GetResource, _In_ const OrtKernelContext* context, _In_ int resource_version, + _In_ int resource_id, _Outptr_ void** resource); /** \brief Set user logging function * @@ -4528,10 +4529,10 @@ struct OrtApi { ORT_API2_STATUS(ShapeInferContext_GetAttribute, _In_ const OrtShapeInferContext* context, _In_ const char* attr_name, _Outptr_ const OrtOpAttr** attr); /** - * Set type and shape info of an ouput + * Set type and shape info of an output * * \param[in] context - * \param[in] index The index of the ouput + * \param[in] index The index of the output * \param[out] info Type shape info of the output * * \since Version 1.17. diff --git a/include/onnxruntime/core/session/onnxruntime_lite_custom_op.h b/include/onnxruntime/core/session/onnxruntime_lite_custom_op.h index ee60f25da115e..57a64380faeb0 100644 --- a/include/onnxruntime/core/session/onnxruntime_lite_custom_op.h +++ b/include/onnxruntime/core/session/onnxruntime_lite_custom_op.h @@ -403,7 +403,7 @@ using Variadic = TensorArray; Note: OrtLiteCustomOp inherits from OrtCustomOp to bridge tween a custom func/struct and ort core. The lifetime of an OrtLiteCustomOp instance is managed by customer code, not ort, so: -1. DO NOT cast OrtLiteCustomOp to OrtCustomOp and release since there is no virtual destructor in the hierachy. +1. DO NOT cast OrtLiteCustomOp to OrtCustomOp and release since there is no virtual destructor in the hierarchy. 2. OrtLiteCustomFunc and OrtLiteCustomStruct, as two sub-structs, can be released in form of OrtLiteCustomOp since all members are kept in the OrtLiteCustomOp, hence memory could still be recycled properly. Further, OrtCustomOp is a c struct bearing no v-table, so offspring structs are by design to be of zero virtual functions to maintain cast safety. 
diff --git a/java/build.gradle b/java/build.gradle index 3219b082994ff..8b4d5429b0f70 100644 --- a/java/build.gradle +++ b/java/build.gradle @@ -54,7 +54,7 @@ java { targetCompatibility = JavaVersion.VERSION_1_8 } -// This jar tasks serves as a CMAKE signalling +// This jar tasks serves as a CMAKE signaling // mechanism. The jar will be overwritten by allJar task jar { } diff --git a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java index f552badd4f83e..b80debdde47c4 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java +++ b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java @@ -438,7 +438,7 @@ private static String mapLibraryName(String library) { /** * Extracts the providers array from the C API, converts it into an EnumSet. * - *
<p>Throws IllegalArgumentException if a provider isn't recognised (note this exception should + * <p>
Throws IllegalArgumentException if a provider isn't recognized (note this exception should * only happen during development of ONNX Runtime, if it happens at any other point, file an issue * on GitHub). * diff --git a/java/src/main/java/ai/onnxruntime/providers/package-info.java b/java/src/main/java/ai/onnxruntime/providers/package-info.java index 1f1e70a589f3a..33c24c6139f52 100644 --- a/java/src/main/java/ai/onnxruntime/providers/package-info.java +++ b/java/src/main/java/ai/onnxruntime/providers/package-info.java @@ -3,5 +3,5 @@ * Licensed under the MIT License. */ -/** Classes for controlling the behaviour of ONNX Runtime Execution Providers. */ +/** Classes for controlling the behavior of ONNX Runtime Execution Providers. */ package ai.onnxruntime.providers; diff --git a/java/src/test/java/sample/ScoreMNIST.java b/java/src/test/java/sample/ScoreMNIST.java index 6ecbc5cd56d10..efc7ef9fd6e47 100644 --- a/java/src/test/java/sample/ScoreMNIST.java +++ b/java/src/test/java/sample/ScoreMNIST.java @@ -242,7 +242,7 @@ public static void writeDataSKL(float[][] data, int[] indices, float[] values) { /** * Find the maximum probability and return it's index. * - * @param probabilities The probabilites. + * @param probabilities The probabilities. * @return The index of the max. */ public static int pred(float[] probabilities) { diff --git a/js/web/lib/onnxjs/backends/webgl/glsl-coordinate-lib.ts b/js/web/lib/onnxjs/backends/webgl/glsl-coordinate-lib.ts index 1f2b27c7bdea8..717233182ed8a 100644 --- a/js/web/lib/onnxjs/backends/webgl/glsl-coordinate-lib.ts +++ b/js/web/lib/onnxjs/backends/webgl/glsl-coordinate-lib.ts @@ -1234,7 +1234,7 @@ export class CoordsGlslLib extends GlslLib { } /** - * This is the main function to map from the given texture coordiantes (s,t) + * This is the main function to map from the given texture coordinates (s,t) * to logical indices for the output * There will only be one single variation of this * Also see coordsToOffset and offsetToIndices for input-specific versions diff --git a/js/web/lib/onnxjs/backends/webgl/ops/pack.ts b/js/web/lib/onnxjs/backends/webgl/ops/pack.ts index 42a275a96fb8a..37ef8c8fe2435 100644 --- a/js/web/lib/onnxjs/backends/webgl/ops/pack.ts +++ b/js/web/lib/onnxjs/backends/webgl/ops/pack.ts @@ -85,7 +85,7 @@ function getOutOfBoundsCondition(rank: number, shape: readonly number[], dims: s } /** - * code snippet to sample input texture with output coordiantes + * code snippet to sample input texture with output coordinates */ function getOutput(shape: readonly number[], dims: string[]): string { const rank = shape.length; diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h b/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h index 326b2d8dc4925..bce8fd118e957 100644 --- a/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h +++ b/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h @@ -19,7 +19,7 @@ using onnxruntime::rnn::detail::Direction; using onnxruntime::rnn::detail::MakeDirection; // The class represents DeepCPU implementation of a long short term memory (LSTM) plus a Bahdanau Attention wraper. -// The equivilent python usage could be checked int the corresponding op test directory, attention_lstm_data_gen.py. +// The equivalent python usage could be checked int the corresponding op test directory, attention_lstm_data_gen.py. // Also please note that detail implementation re-used lot of code from current ONNXRuntime LSTM operator, refactor // is needed in future if this is become part of ONNX. 
class DeepCpuAttnLstmOp final : public OpKernel { diff --git a/onnxruntime/contrib_ops/cpu/transformers/sampling_cpu_helper.h b/onnxruntime/contrib_ops/cpu/transformers/sampling_cpu_helper.h index 413ef596cd118..2f41746c1d4e7 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/sampling_cpu_helper.h +++ b/onnxruntime/contrib_ops/cpu/transformers/sampling_cpu_helper.h @@ -152,7 +152,7 @@ Status Sample(AllocatorPtr& allocator, 1, generator, *sampled_idx)); - // TODO: update presense_mask() + // TODO: update presence_mask() #ifdef DEBUG_GENERATION dumper->Print("sampled_idx", *sampled_idx); #endif diff --git a/onnxruntime/core/codegen/common/common.cc b/onnxruntime/core/codegen/common/common.cc index c2ae4ddba584e..818b919e99ef2 100644 --- a/onnxruntime/core/codegen/common/common.cc +++ b/onnxruntime/core/codegen/common/common.cc @@ -159,7 +159,7 @@ std::unique_ptr ToCapacity(const onnxruntime::GraphViewer& gr ORT_THROW_IF_ERROR(node.ForEachWithIndex(node.ImplicitInputDefs(), process_input_fn)); // Handle outouts - // two cases are considerd as outputs + // two cases are considered as outputs // 1. Output NodeArg is not used by any Node // 2. Output NodeArg is used by at least one Node out of this subgraph. // Note a NodeArg can be used by Nodes in and out of the subgraph at the same time. diff --git a/onnxruntime/core/codegen/mti/common.h b/onnxruntime/core/codegen/mti/common.h index 87bce55715ee1..d71e740b9284a 100644 --- a/onnxruntime/core/codegen/mti/common.h +++ b/onnxruntime/core/codegen/mti/common.h @@ -8,7 +8,7 @@ #define MTI_ASSERT(condition) \ if (!(condition)) { \ - std::string error_msg = "Not satsified: " #condition \ + std::string error_msg = "Not satisfied: " #condition \ ": line " + \ std::to_string(__LINE__) + \ " in file " + std::string(__FILE__) + "\n"; \ diff --git a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc index 3595229bbe132..76c2ad509c401 100644 --- a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc +++ b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc @@ -74,7 +74,7 @@ bool ShouldTryVectorization( // Check the schedule of tensor // If it is not scheduled, try to vectorize it. // Note TryVectorization has to use with compute_root. -// Therefore, there is a safty check of tensor's schedule +// Therefore, there is a safety check of tensor's schedule bool TryVectorization( const tvm::Tensor& tensor, int64_t natural_vector_size, @@ -124,7 +124,7 @@ bool TryVectorization( // Check the schedule of tensor // If it is not scheduled, try to add compute_inline on it. // Note TryInlineSchedule cannot be used with compute_root. -// Therefore, there is a safty check of tensor's schedule. +// Therefore, there is a safety check of tensor's schedule. bool TryInlineSchedule( const tvm::Tensor& tensor, ScheduleContext& ctx) { diff --git a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h index 757366b551cf8..4a0781f94d385 100644 --- a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h +++ b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h @@ -34,7 +34,7 @@ bool ShouldTryVectorization( // Check the schedule of tensor // If it is not scheduled, try to vectorize it. // Note TryVectorization has to use with compute_root. 
-// Therefore, there is a safty check of tensor's schedule +// Therefore, there is a safety check of tensor's schedule bool TryVectorization( const tvm::Tensor& tensor, int64_t natural_vector_size, @@ -43,7 +43,7 @@ bool TryVectorization( // Check the schedule of tensor // If it is not scheduled, try to add compute_inline on it. // Note TryInlineSchedule cannot be used with compute_root. -// Therefore, there is a safty check of tensor's schedule. +// Therefore, there is a safety check of tensor's schedule. bool TryInlineSchedule( const tvm::Tensor& tensor, ScheduleContext& ctx); diff --git a/onnxruntime/core/codegen/passes/scheduler/tvm_schedule_builder.cc b/onnxruntime/core/codegen/passes/scheduler/tvm_schedule_builder.cc index 6f0ffa14e8abb..2c8250198fa5f 100644 --- a/onnxruntime/core/codegen/passes/scheduler/tvm_schedule_builder.cc +++ b/onnxruntime/core/codegen/passes/scheduler/tvm_schedule_builder.cc @@ -39,7 +39,7 @@ void TVMScheduleBuilder::DumpAllSchedulers() const { d->ForEach([&stream](const std::string& key, Scheduler* op) { stream << "Key " << key - << ", Creater " << op->Name() << std::endl; + << ", Creator " << op->Name() << std::endl; }); ++count; diff --git a/onnxruntime/core/codegen/passes/weight_layout/weight_layout.h b/onnxruntime/core/codegen/passes/weight_layout/weight_layout.h index af61641a74937..1b45a38e7e24e 100644 --- a/onnxruntime/core/codegen/passes/weight_layout/weight_layout.h +++ b/onnxruntime/core/codegen/passes/weight_layout/weight_layout.h @@ -13,7 +13,7 @@ namespace tvm_codegen { using CoordTransFunc = std::function(const tvm::Array&)>; -// WeightLayout is data layout trasnformer for weight/initializer +// WeightLayout is data layout transformer for weight/initializer class WeightLayout { public: // Static function to return unique string as a key diff --git a/onnxruntime/core/common/logging/logging.cc b/onnxruntime/core/common/logging/logging.cc index ad6f666a2d989..a086c90ea4b14 100644 --- a/onnxruntime/core/common/logging/logging.cc +++ b/onnxruntime/core/common/logging/logging.cc @@ -56,7 +56,7 @@ LoggingManager* LoggingManager::GetDefaultInstance() { return static_cast(DefaultLoggerManagerInstance().load()); } -// GSL_SUPRESS(i.22) is broken. Ignore the warnings for the static local variables that are trivial +// GSL_SUPPRESS(i.22) is broken. Ignore the warnings for the static local variables that are trivial // and should not have any destruction order issues via pragmas instead. // https://developercommunity.visualstudio.com/content/problem/249706/gslsuppress-does-not-work-for-i22-c-core-guideline.html #ifdef _MSC_VER diff --git a/onnxruntime/core/common/status.cc b/onnxruntime/core/common/status.cc index 4ffc7adaac88d..e824a66eaed58 100644 --- a/onnxruntime/core/common/status.cc +++ b/onnxruntime/core/common/status.cc @@ -70,7 +70,7 @@ std::string Status::ToString() const { return result; } -// GSL_SUPRESS(i.22) is broken. Ignore the warnings for the static local variables that are trivial +// GSL_SUPPRESS(i.22) is broken. Ignore the warnings for the static local variables that are trivial // and should not have any destruction order issues via pragmas instead. 
// https://developercommunity.visualstudio.com/content/problem/249706/gslsuppress-does-not-work-for-i22-c-core-guideline.html #ifdef _MSC_VER diff --git a/onnxruntime/core/framework/allocation_planner.cc b/onnxruntime/core/framework/allocation_planner.cc index 7747058f0d0aa..5dca4cf6c165b 100644 --- a/onnxruntime/core/framework/allocation_planner.cc +++ b/onnxruntime/core/framework/allocation_planner.cc @@ -1073,7 +1073,7 @@ class PlannerImpl { #ifdef ORT_ENABLE_STREAM // assume we already have a baseline reuse plan (no memory reuse at all) - // this funciton will optimize the plan by building a reuse plan with stream safety. + // this function will optimize the plan by building a reuse plan with stream safety. Status OptimizeReusePlanForMultiStream() { InlinedHashMap dependent_counter; for (const auto& it : dependence_graph_) { @@ -2012,7 +2012,7 @@ class PlannerImpl { for (auto* output : node->OutputDefs()) { if (output->Exists()) { if (std::find(it->InputDefs().begin(), it->InputDefs().end(), output) != it->InputDefs().end()) { - output_consumed_in_subgraph = false; // output direclty consumed in current graph + output_consumed_in_subgraph = false; // output directly consumed in current graph OrtValueIndex output_arg_idx; ORT_THROW_IF_ERROR(ort_value_name_idx_map_.GetIdx(output->Name(), output_arg_idx)); // there are two cases we need notification: diff --git a/onnxruntime/core/framework/allocation_planner.h b/onnxruntime/core/framework/allocation_planner.h index 10ea5920b8809..aa62f218d9ff6 100644 --- a/onnxruntime/core/framework/allocation_planner.h +++ b/onnxruntime/core/framework/allocation_planner.h @@ -53,7 +53,7 @@ class SequentialPlannerContext : public ISequentialPlannerContext { public: SequentialPlannerContext(ExecutionMode execution_mode, ExecutionOrder execution_order, bool enable_memory_reuse) : execution_mode_(execution_mode), - exection_order_(execution_order), + execution_order_(execution_order), enable_memory_reuse_(enable_memory_reuse) { } @@ -63,13 +63,13 @@ class SequentialPlannerContext : public ISequentialPlannerContext { bool IsParallelExecutionEnabled() const override { return execution_mode_ == ExecutionMode::ORT_PARALLEL; } - ExecutionOrder GetExecutionOrder() const override { return exection_order_; } + ExecutionOrder GetExecutionOrder() const override { return execution_order_; } bool GetEnableMemoryReuse() const override { return enable_memory_reuse_; } private: ExecutionMode execution_mode_ = ExecutionMode::ORT_SEQUENTIAL; - ExecutionOrder exection_order_ = ExecutionOrder::DEFAULT; + ExecutionOrder execution_order_ = ExecutionOrder::DEFAULT; bool enable_memory_reuse_ = true; }; diff --git a/onnxruntime/core/framework/device_stream_collection.cc b/onnxruntime/core/framework/device_stream_collection.cc index 13948289e1c37..8d15e03c2e5ce 100644 --- a/onnxruntime/core/framework/device_stream_collection.cc +++ b/onnxruntime/core/framework/device_stream_collection.cc @@ -93,7 +93,8 @@ class DeviceStreamCollectionImpl { const AllocatorMap& allocators_; bool is_main_graph_ = false; // This is used in ExecutionFrame when memory pattern is enabled, to allocate the peak size memory - // labelled this stream in the current thread, instead of the default stream which will be used in all the threads (thus caused thread safe issue) + // labeled this stream in the current thread, instead of the default stream which will be used in all the threads + // (thus caused thread safe issue) std::unique_ptr root_stream_; OrtDevice root_stream_device_; void 
ReleaseSingleStreamBuffers(); diff --git a/onnxruntime/core/framework/execution_frame.h b/onnxruntime/core/framework/execution_frame.h index 18d210ffd48f7..de571f86f1c77 100644 --- a/onnxruntime/core/framework/execution_frame.h +++ b/onnxruntime/core/framework/execution_frame.h @@ -167,7 +167,7 @@ class ExecutionFrame final : public IExecutionFrame { } // This function try retrieve the inferred shapes for the given NodeArg index. - // If the retrival is sucessful, this function returns true and false otherwise. + // If the retrival is successful, this function returns true and false otherwise. bool TryGetInferredShape(int index, TensorShape& shape) const override; #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE) diff --git a/onnxruntime/core/framework/partial_graph_execution_state.cc b/onnxruntime/core/framework/partial_graph_execution_state.cc index a053634adbe35..ce0572927d94a 100644 --- a/onnxruntime/core/framework/partial_graph_execution_state.cc +++ b/onnxruntime/core/framework/partial_graph_execution_state.cc @@ -50,7 +50,7 @@ PartialGraphExecutionState::~PartialGraphExecutionState() { DeviceStreamCollection* PartialGraphExecutionState::GetDeviceStreamCollection(const SessionState& session_state) { if (device_stream_collection_ == nullptr) { device_stream_collection_ = session_state.AcquireDeviceStreamCollection(); - // the life-time of partial graph execution state is in-consistant with session, + // the life-time of partial graph execution state is inconsistent with session, // so we can't make sure it is safe to return the device stream collection to // session when deconstruct partial graph execution state. // so let's always delete the stream collections. diff --git a/onnxruntime/core/framework/sequential_execution_plan.h b/onnxruntime/core/framework/sequential_execution_plan.h index 62c66bc6f336c..d9472e404c0e4 100644 --- a/onnxruntime/core/framework/sequential_execution_plan.h +++ b/onnxruntime/core/framework/sequential_execution_plan.h @@ -106,7 +106,7 @@ struct SequentialExecutionPlan : public ExecutionPlanBase { // types of steps: // 1. Kernel Launch // 2. Activate notification - // 3. Wait on a notificaiton + // 3. Wait on a notification class ExecutionStep { public: ExecutionStep(NodeIndex node_index) : node_index_(node_index) {} @@ -122,7 +122,7 @@ struct SequentialExecutionPlan : public ExecutionPlanBase { protected: NodeIndex node_index_; }; - // LogicStream is a sequence of execution steps that can be executed independetly. + // LogicStream is a sequence of execution steps that can be executed independently. // The steps within a sequence are executed in order, and happened on the same device. struct LogicStream { std::vector> steps_; @@ -160,7 +160,7 @@ struct SequentialExecutionPlan : public ExecutionPlanBase { std::vector notification_owners; // key: notification index. // value: {stream_idx, step_idx} - // giving a notificaiton, we used this map to figure out what is the downstream steps it need to trigger. + // giving a notification, we used this map to figure out what is the downstream steps it need to trigger. 
InlinedHashMap>> downstream_map; size_t num_barriers{0}; diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc index a374e381a2b0e..aa762ca32fdb4 100644 --- a/onnxruntime/core/framework/sequential_executor.cc +++ b/onnxruntime/core/framework/sequential_executor.cc @@ -442,7 +442,7 @@ onnxruntime::Status ExecuteKernel(StreamExecutionContext& ctx, if (p_kernel->KernelDef().OpName() == "YieldOp") { // Do not execute YieldOp (it is an no-op anyways). // Decrement the reference count of tensors that are not needed beyond this point. - // REVEIW(codemzs): The current model assumes the intermediate tensors that are exported + // REVIEW(codemzs): The current model assumes the intermediate tensors that are exported // as graph outputs are owned by ORT, the risk of caller freeing the tensor or manipulating tensor // memory lingers while the tensor is used downstream after the export. ctx.RecycleNodeInputs(idx); diff --git a/onnxruntime/core/framework/session_options.h b/onnxruntime/core/framework/session_options.h index 46bfc3630303c..8d4db36106f28 100644 --- a/onnxruntime/core/framework/session_options.h +++ b/onnxruntime/core/framework/session_options.h @@ -62,7 +62,7 @@ enum class ExecutionPriority : int { struct FreeDimensionOverride { std::string dim_identifier; - FreeDimensionOverrideType dim_identifer_type; + FreeDimensionOverrideType dim_identifier_type; int64_t dim_value; }; diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc index 42fb7b392283a..a88f36f63639c 100644 --- a/onnxruntime/core/framework/session_state.cc +++ b/onnxruntime/core/framework/session_state.cc @@ -22,9 +22,9 @@ using namespace ::onnxruntime::common; namespace onnxruntime { #ifdef ORT_ENABLE_STREAM -static inline std::string GetWaitKey(const OrtDevice::DeviceType notificaiton_device_type, +static inline std::string GetWaitKey(const OrtDevice::DeviceType notification_device_type, const OrtDevice::DeviceType executor_device_type) { - return std::to_string(notificaiton_device_type) + ":" + std::to_string(executor_device_type); + return std::to_string(notification_device_type) + ":" + std::to_string(executor_device_type); } class StreamCommandHandleRegistryImpl : public IStreamCommandHandleRegistry { diff --git a/onnxruntime/core/framework/sparse_tensor.cc b/onnxruntime/core/framework/sparse_tensor.cc index a3bcea4762d3e..4e40e3dd81ca2 100644 --- a/onnxruntime/core/framework/sparse_tensor.cc +++ b/onnxruntime/core/framework/sparse_tensor.cc @@ -551,7 +551,7 @@ Status SparseTensor::Copy(const IDataTransfer& data_transfer, SparseTensor& dst_ } if (Values().Shape().Size() > 0) { - // This instance may either have a contigious buffer which we can copy in one shot + // This instance may either have a contiguous buffer which we can copy in one shot // or it can point to users buffers, in which case we have to copy each buffer individually // strings can not be memcpyed albeit always on CPU. 
if (p_data_ != nullptr) { @@ -569,7 +569,7 @@ Status SparseTensor::Copy(const IDataTransfer& data_transfer, SparseTensor& dst_ ORT_RETURN_IF_ERROR(data_transfer.CopyTensor(src, dst)); } } else { - // non-contiguos buffer + // non-contiguous buffer if (is_string) { CopyStrings(Values(), result_values); } else { diff --git a/onnxruntime/core/framework/tensorprotoutils.h b/onnxruntime/core/framework/tensorprotoutils.h index aabfc0487f3e0..e5197adcb94ec 100644 --- a/onnxruntime/core/framework/tensorprotoutils.h +++ b/onnxruntime/core/framework/tensorprotoutils.h @@ -151,7 +151,7 @@ common::Status GetExtDataFromTensorProto(const Env& env, const std::filesystem:: // the data location is external. i.e. it does not load the external data. // However if AttributeProto contains SparseTensorProto then it converts the data into dense tensor proto // (including loading external data when applicable). -// model_path is used for contructing full path for external_data +// model_path is used for constructing full path for external_data // tensor_name specifies the name for the new TensorProto TensorProto common::Status ConstantNodeProtoToTensorProto(const ONNX_NAMESPACE::NodeProto& node, const std::filesystem::path& model_path, @@ -165,7 +165,7 @@ common::Status ConstantNodeProtoToTensorProto(const ONNX_NAMESPACE::NodeProto& n // Convert a SparseTensorProto to a dense TensorProto // If the SparseTensorProto contains external data then it loads the data and converts to dense tensor proto // The resulting TensorProto will contain the data as raw data. -// model_path is used for contructing full path for external_data +// model_path is used for constructing full path for external_data common::Status SparseTensorProtoToDenseTensorProto(const ONNX_NAMESPACE::SparseTensorProto& sparse, const std::filesystem::path& model_path, ONNX_NAMESPACE::TensorProto& dense); @@ -174,7 +174,7 @@ common::Status SparseTensorProtoToDenseTensorProto(const ONNX_NAMESPACE::SparseT // Convert a TensorProto to a SparseTensorProto // If the tensorproto contains external data then it loads the data and converts to sparse tensor // The resulting SparseTensorProto will contain the data as raw data -// model_path is used for contructing full path for external_data +// model_path is used for constructing full path for external_data common::Status DenseTensorToSparseTensorProto(const ONNX_NAMESPACE::TensorProto& dense, const std::filesystem::path& model_path, ONNX_NAMESPACE::SparseTensorProto& sparse); diff --git a/onnxruntime/core/framework/utils.h b/onnxruntime/core/framework/utils.h index 17cf9671b70eb..afdb5a2cb27f5 100644 --- a/onnxruntime/core/framework/utils.h +++ b/onnxruntime/core/framework/utils.h @@ -47,7 +47,7 @@ void ConstructStrings(void* p_data, int64_t elements); ///
<summary>
/// Destroy std::string objects in the contiquous chunk of memory -/// by explicitely invoking ~string(); +/// by explicitly invoking ~string(); /// /// /// diff --git a/onnxruntime/core/mickey/gemm/warp/quantb_meta_loader.h b/onnxruntime/core/mickey/gemm/warp/quantb_meta_loader.h index 4a784a1a49109..79c582279f2c8 100644 --- a/onnxruntime/core/mickey/gemm/warp/quantb_meta_loader.h +++ b/onnxruntime/core/mickey/gemm/warp/quantb_meta_loader.h @@ -37,12 +37,12 @@ void weightsMinuEight2Half(uint32_t const &weights, // // For element 0, 1, 4, 5, we have 0x000?000?, set the high bits // to 0x6400, essentially we set the exponent bits to 25, effective - // exp = 25 - 15 = 10, with explicity hight bit, the value is + // exp = 25 - 15 = 10, with explicitly hight bit, the value is // 2^10 + q_w. // // Similarly for element 2, 3, 6, 7, we have 0x00?000?, set the // high bits to 0x5400, essentially we set the exponent bits to 21, - // effective exp = 21 - 15 = 6, with explicity hight bit, the value + // effective exp = 21 - 15 = 6, with explicitly hight bit, the value // is 2^6 + q_w. // // 1.125 instruction per weight, 9 instructions in total. @@ -86,12 +86,12 @@ void weights2Half([[maybe_unused]] uint32_t const &weights, // // For element 0, 1, 4, 5, we have 0x000?000?, set the high bits // to 0x6400, essentially we set the exponent bits to 25, effective - // exp = 25 - 15 = 10, with explicity hight bit, the value is + // exp = 25 - 15 = 10, with explicitly hight bit, the value is // 2^10 + q_w. // // Similarly for element 2, 3, 6, 7, we have 0x00?000?, set the // high bits to 0x5400, essentially we set the exponent bits to 21, - // effective exp = 21 - 15 = 6, with explicity hight bit, the value + // effective exp = 21 - 15 = 6, with explicitly hight bit, the value // is 2^6 + q_w. // // 1.125 instruction per weight, 9 instructions in total. diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp index 5d2c35fbfb406..ec79641559c6b 100644 --- a/onnxruntime/core/mlas/lib/convolve.cpp +++ b/onnxruntime/core/mlas/lib/convolve.cpp @@ -61,7 +61,7 @@ Routine Description: This implementation supports sampling a portion of the convolution patches. This avoids the need to allocate very large buffers to store - all of the convolution patches at once, when the underyling GEMM + all of the convolution patches at once, when the underlying GEMM implementation will already break up the operation into panels. Multiple threads can also be used to process different portions of the image. @@ -267,7 +267,7 @@ Routine Description: This implementation supports sampling a portion of the convolution patches. This avoids the need to allocate very large buffers to store - all of the convolution patches at once, when the underyling GEMM + all of the convolution patches at once, when the underlying GEMM implementation will already break up the operation into panels. Multiple threads can also be used to process different portions of the image. 
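The quantb_meta_loader.h comments corrected above rely on a half-precision bit trick: placing a 4-bit quantized weight in the mantissa of an fp16 value whose exponent field is forced to 25 (or 21 for the nibbles that sit four bits higher) yields exactly 1024 + q (respectively 64 + q), so the integer weight can be recovered with one subtraction. The following standalone check decodes the bit patterns by hand to confirm that arithmetic; it is an illustration of the comment, not code from this patch.

#include <cassert>
#include <cmath>
#include <cstdint>

// Decode a normal IEEE fp16 bit pattern into a double: 2^(e-15) * (1 + m/1024).
double DecodeFp16(uint16_t bits) {
  const int exponent = (bits >> 10) & 0x1F;  // 5-bit exponent field
  const int mantissa = bits & 0x3FF;         // 10-bit mantissa field
  return std::ldexp(1.0 + mantissa / 1024.0, exponent - 15);
}

int main() {
  for (int q = 0; q < 16; ++q) {
    // Nibble in mantissa bits [3:0], exponent field forced to 25 (0x6400): value is 2^10 + q.
    assert(DecodeFp16(static_cast<uint16_t>(0x6400 | q)) == 1024.0 + q);
    // Nibble in mantissa bits [7:4], exponent field forced to 21 (0x5400): value is 2^6 + q.
    assert(DecodeFp16(static_cast<uint16_t>(0x5400 | (q << 4))) == 64.0 + q);
  }
  return 0;
}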
diff --git a/onnxruntime/core/optimizer/attention_fusion_helper.h b/onnxruntime/core/optimizer/attention_fusion_helper.h index ca744adddbeec..267a82b72670c 100644 --- a/onnxruntime/core/optimizer/attention_fusion_helper.h +++ b/onnxruntime/core/optimizer/attention_fusion_helper.h @@ -1118,8 +1118,8 @@ bool CheckNodesInPathV(const Graph& graph, const Node& reshape, const Node& tran head_size = v_reshape_shape[3]; // Check reshape for attention output has shape input (0, 0, -1) or (0, 0, N*H) - // In DistilBert, the reshape after qkv paths can not be fused during reshape fusion, so we do not have the correspondig - // initializer. We need to get the shape information from the input of concat. + // In DistilBert, the reshape after qkv paths can not be fused during reshape fusion, so we do not have the + // corresponding initializer. We need to get the shape information from the input of concat. InlinedVector reshape_shape; if (!optimizer_utils::AppendTensorFromInitializer(graph, *(reshape.InputDefs()[1]), reshape_shape)) { if (CheckDistilBertReshapeShape(graph, reshape, hidden_size, record_node_idx, logger)) { diff --git a/onnxruntime/core/optimizer/free_dim_override_transformer.cc b/onnxruntime/core/optimizer/free_dim_override_transformer.cc index 0d162b5238b18..bce73a0dcec45 100644 --- a/onnxruntime/core/optimizer/free_dim_override_transformer.cc +++ b/onnxruntime/core/optimizer/free_dim_override_transformer.cc @@ -22,9 +22,9 @@ FreeDimensionOverrideTransformer::FreeDimensionOverrideTransformer(gsl::span floating point casts could be optimised but this is left to an explicit cast optimisation pass. + // For instance, certain integral -> floating point casts could be optimized but + // this is left to an explicit cast optimisation pass. // The comparison with "InsertedPrecisionFreeCast_" reflects cast nodes that are inserted by InsertCastTransformer. - // Such casts should not be considered as loss of precision - the inserted upcasts (f16 -> f32) and downcasts (f32 -> f16) are inserted to support kernels when on a CPU EP without F16 support. + // Such casts should not be considered as loss of precision - the inserted upcasts (f16 -> f32) and + // downcasts (f32 -> f16) are inserted to support kernels when on a CPU EP without F16 support. auto src_type_group = GetTypeGroup(src_type); auto dst_type_group = GetTypeGroup(dst_type); if (Unknown == src_type_group || Unknown == dst_type_group) { diff --git a/onnxruntime/core/optimizer/transpose_optimization/onnx_transpose_optimization.cc b/onnxruntime/core/optimizer/transpose_optimization/onnx_transpose_optimization.cc index d4ed9c4e26cc6..bdb6a44bddaaf 100644 --- a/onnxruntime/core/optimizer/transpose_optimization/onnx_transpose_optimization.cc +++ b/onnxruntime/core/optimizer/transpose_optimization/onnx_transpose_optimization.cc @@ -1258,7 +1258,7 @@ static int EstimateTransposeValueCost(const api::GraphRef& graph, std::string_vi std::unique_ptr producer_node = graph.GetNodeProducingOutput(input); if (producer_node != nullptr) { - // this handles cancelling out a Transpose or Squeeze added to a shared initializer that was updated + // this handles canceling out a Transpose or Squeeze added to a shared initializer that was updated // by TransposeInputImpl Case 1 or UnqueezeInput Case 1. 
// - if a shared initializer is not broadcast, we have -> Transpose -> DQ // - if a shared initializer is broadcast, we have -> Transpose -> Squeeze -> DQ and need @@ -1992,7 +1992,7 @@ static bool HandleTile(HandlerArgs& args) { constexpr HandlerInfo tile_handler = {&FirstInput, &HandleTile}; -// Helper to remove cancelling Transpose -> Transpose or +// Helper to remove canceling Transpose -> Transpose or // Transpose -> Reshape nodes. static void RemoveCancelingTransposeNodes(HandlerArgs& args) { // Input to 1st transpose diff --git a/onnxruntime/core/providers/acl/nn/batch_norm.cc b/onnxruntime/core/providers/acl/nn/batch_norm.cc index eb6a10074f1db..be0e57c5c0543 100755 --- a/onnxruntime/core/providers/acl/nn/batch_norm.cc +++ b/onnxruntime/core/providers/acl/nn/batch_norm.cc @@ -118,7 +118,7 @@ Status BatchNorm::Compute(OpKernelContext* context) const { ACLImportMemory(tbatch_norm.b->allocator(), (void*)b_data, B->Shape().Size() * 4); ACLImportMemory(tbatch_norm.scale->allocator(), (void*)scale_data, S->Shape().Size() * 4); - // allocate space for input tensor to accomodate paddings and strides + // allocate space for input tensor to accommodate paddings and strides tbatch_norm.in->allocator()->allocate(); tbatch_norm.layer = std::move(layer); diff --git a/onnxruntime/core/providers/acl/nn/pool.cc b/onnxruntime/core/providers/acl/nn/pool.cc index 8fbcba3ed87a7..01d9bc0302c3a 100644 --- a/onnxruntime/core/providers/acl/nn/pool.cc +++ b/onnxruntime/core/providers/acl/nn/pool.cc @@ -121,7 +121,7 @@ ACLNEPool PoolOperation(onnxruntime::OpKernelContext* context, layer->configure(tpool.in.get(), tpool.out.get(), pool_info); } - // allocate space for input tensor to accomodate paddings and strides + // allocate space for input tensor to accommodate paddings and strides tpool.in->allocator()->allocate(); tpool.layer = std::move(layer); diff --git a/onnxruntime/core/providers/armnn/activation/activations.cc b/onnxruntime/core/providers/armnn/activation/activations.cc index 93017c26271f7..7ab7a14f7e206 100644 --- a/onnxruntime/core/providers/armnn/activation/activations.cc +++ b/onnxruntime/core/providers/armnn/activation/activations.cc @@ -56,7 +56,7 @@ Status Relu::Compute(OpKernelContext* context) const { armnn::TensorInfo outputTensorInfo(outputShape, armnn::DataType::Float32); activation->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); - // Optimise ArmNN network + // Optimize ArmNN network armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*myNetwork, {armnn::Compute::CpuAcc}, Relu::run->GetDeviceSpec()); if (optNet == nullptr) { diff --git a/onnxruntime/core/providers/armnn/math/gemm.h b/onnxruntime/core/providers/armnn/math/gemm.h index 4f77c4afb725a..039a9c3b75adb 100644 --- a/onnxruntime/core/providers/armnn/math/gemm.h +++ b/onnxruntime/core/providers/armnn/math/gemm.h @@ -130,7 +130,7 @@ class Gemm : public onnxruntime::Gemm { armnn::TensorInfo outputTensorInfo(outputShape, armnn::DataType::Float32); fc_armnn->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); - // Optimise ArmNN network + // Optimize ArmNN network armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*myNetwork, {armnn::Compute::CpuAcc}, Gemm::run->GetDeviceSpec()); if (optNet == nullptr) { diff --git a/onnxruntime/core/providers/armnn/nn/batch_norm.cc b/onnxruntime/core/providers/armnn/nn/batch_norm.cc index e9d8e6fb47852..9a7821d81bdb1 100755 --- a/onnxruntime/core/providers/armnn/nn/batch_norm.cc +++ b/onnxruntime/core/providers/armnn/nn/batch_norm.cc @@ -89,7 +89,7 @@ Status 
BatchNorm::Compute(OpKernelContext* context) const { armnn::TensorInfo outputTensorInfo(outputShape, armnn::DataType::Float32); layer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); - // Optimise ArmNN network + // Optimize ArmNN network armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*myNetwork, {armnn::Compute::CpuAcc}, BatchNorm::run->GetDeviceSpec()); if (optNet == nullptr) { diff --git a/onnxruntime/core/providers/armnn/nn/conv.cc b/onnxruntime/core/providers/armnn/nn/conv.cc index 674e927ffc324..db261e67ecd00 100644 --- a/onnxruntime/core/providers/armnn/nn/conv.cc +++ b/onnxruntime/core/providers/armnn/nn/conv.cc @@ -266,7 +266,7 @@ Status Conv::Compute(OpKernelContext* context) const { activation->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); } - // Optimise ArmNN network + // Optimize ArmNN network armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*myNetwork, {armnn::Compute::CpuAcc}, Conv::run->GetDeviceSpec()); if (optNet == nullptr) { diff --git a/onnxruntime/core/providers/armnn/nn/pool.cc b/onnxruntime/core/providers/armnn/nn/pool.cc index c4eeb17779fcb..9d25b4eed2db4 100644 --- a/onnxruntime/core/providers/armnn/nn/pool.cc +++ b/onnxruntime/core/providers/armnn/nn/pool.cc @@ -161,7 +161,7 @@ Status Pool::Compute(OpKernelContext* context) const { armnn::TensorInfo outputTensorInfo(outputShape, armnn::DataType::Float32); pool_armnn->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); - // Optimise ArmNN network + // Optimize ArmNN network armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*myNetwork, {armnn::Compute::CpuAcc}, Pool::run->GetDeviceSpec()); if (optNet == nullptr) { @@ -250,7 +250,7 @@ Status MaxPoolV8::Compute(OpKernelContext* context) const { armnn::TensorInfo outputTensorInfo(outputShape, armnn::DataType::Float32); pool_armnn->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); - // Optimise ArmNN network + // Optimize ArmNN network armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*myNetwork, {armnn::Compute::CpuAcc}, MaxPoolV8::run->GetDeviceSpec()); if (optNet == nullptr) { diff --git a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.cc b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.cc index 91e7955aa9fbe..a602a85fc2737 100644 --- a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.cc +++ b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.cc @@ -290,9 +290,9 @@ std::unique_ptr Transpose(const Tensor& input, const TensorShape& input_ // and it will de-allocate the memory for this intermediate tensor when it goes out of scope std::unique_ptr output = std::make_unique(input.DataType(), output_dims, allocator); - TensorShape overriden_shape(input_shape_override); + TensorShape overridden_shape(input_shape_override); - auto status = device_transpose_func(permutation, input, *output, &overriden_shape, einsum_cuda_assets); + auto status = device_transpose_func(permutation, input, *output, &overridden_shape, einsum_cuda_assets); if (!status.IsOK()) { ORT_THROW(ONNXRUNTIME, FAIL, "Einsum op: Transpose failed: ", status.ErrorMessage()); diff --git a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc index a362bb06220d8..343ed485a150a 100644 --- a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc +++ b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc @@ -209,7 +209,7 @@ std::unique_ptr 
EinsumTypedComputeProcessor::PairwiseOperandProcess(c if (current_left && IsTransposeReshapeForEinsum(left_permutation, current_left->Shape().GetDims(), reshaped_dims)) { - // This can be done because curent_* tensors (if they exist) and output tensors are + // This can be done because current_* tensors (if they exist) and output tensors are // intermediate tensors and cannot be input tensors to the Einsum node itself // (which are immutable). // Covered by ExplicitEinsumAsTensorContractionReshapeLeft. diff --git a/onnxruntime/core/providers/cpu/object_detection/roialign.cc b/onnxruntime/core/providers/cpu/object_detection/roialign.cc index ac1bb111494fd..ead2ccaef002e 100644 --- a/onnxruntime/core/providers/cpu/object_detection/roialign.cc +++ b/onnxruntime/core/providers/cpu/object_detection/roialign.cc @@ -135,7 +135,7 @@ static void PreCalcForBilinearInterpolate(const int64_t height, const int64_t wi T w3 = ly * hx; T w4 = ly * lx; - // save weights and indeces + // save weights and indices PreCalc pc; pc.pos1 = y_low * width + x_low; pc.pos2 = y_low * width + x_high; diff --git a/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc b/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc index 2913f4ac32b6e..7a27b04ece7cf 100644 --- a/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc +++ b/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc @@ -317,7 +317,7 @@ Status SequenceConstruct::Compute(OpKernelContext* context) const { const auto* X = context->Input(input_idx); if (input_idx > 0 && X->DataType() != first_dtype) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Violation of the requirment that all input tensors must have the same data type."); + "Violation of the requirement that all input tensors must have the same data type."); } } diff --git a/onnxruntime/core/providers/cpu/tensor/unique.cc b/onnxruntime/core/providers/cpu/tensor/unique.cc index ab99d87da83fd..92c163a0f08a1 100644 --- a/onnxruntime/core/providers/cpu/tensor/unique.cc +++ b/onnxruntime/core/providers/cpu/tensor/unique.cc @@ -51,7 +51,7 @@ ONNX_OPERATOR_SET_SCHEMA( 1, "indices", "A 1-D INT64 tensor " - "containing indices of 'Y' elements' first occurance in 'X'. " + "containing indices of 'Y' elements' first occurrence in 'X'. " "When 'axis' is provided, it contains indices to subtensors in input 'X' on the 'axis'. " "When 'axis' is not provided, it contains indices to values in the flattened input tensor. ", "tensor(int64)", diff --git a/onnxruntime/core/providers/cuda/cuda_allocator.cc b/onnxruntime/core/providers/cuda/cuda_allocator.cc index 314aa1062f1b0..2189af8e0ee2d 100644 --- a/onnxruntime/core/providers/cuda/cuda_allocator.cc +++ b/onnxruntime/core/providers/cuda/cuda_allocator.cc @@ -60,7 +60,7 @@ void* CUDAExternalAllocator::Alloc(size_t size) { if (size > 0) { p = alloc_(size); - // review(codemzs): ORT_ENFORCE does not seem appropiate. + // review(codemzs): ORT_ENFORCE does not seem appropriate. 
ORT_ENFORCE(p != nullptr); } diff --git a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc index 58e57572131b1..14b75d2383b58 100644 --- a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc +++ b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc @@ -179,7 +179,7 @@ Status CudaStream::CleanUpOnRunEnd() { } void* CudaStream::GetResource(int version, int id) const { - ORT_ENFORCE(version <= ORT_CUDA_RESOUCE_VERSION, "resource version unsupported!"); + ORT_ENFORCE(version <= ORT_CUDA_RESOURCE_VERSION, "resource version unsupported!"); void* resource{}; switch (id) { case CudaResource::cuda_stream_t: diff --git a/onnxruntime/core/providers/cuda/math/softmax_blockwise_impl.cuh b/onnxruntime/core/providers/cuda/math/softmax_blockwise_impl.cuh index 6cb65ea8e739c..8bb87035cdc6d 100644 --- a/onnxruntime/core/providers/cuda/math/softmax_blockwise_impl.cuh +++ b/onnxruntime/core/providers/cuda/math/softmax_blockwise_impl.cuh @@ -30,7 +30,7 @@ dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) { uint64_t max_block_size = std::min(dim_size / ILP, static_cast(max_threads)); // In the vectorized case we want to trade off allowing more of the buffers to be accessed - // in a vectorized way against wanting a larger block size to get better utilisation. + // in a vectorized way against wanting a larger block size to get better utilization. // In general with ILP you can have (ILP-1)/ILP of the buffer accessed vectorised, at the risk // of having a very small block size. We choose to keep >= 1/2 of the buffer vectorised while // allowing a larger block size. diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc index e05786248cbcf..764feadcf4cb3 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.cc +++ b/onnxruntime/core/providers/cuda/nn/conv.cc @@ -15,7 +15,7 @@ namespace onnxruntime { namespace cuda { // Op Set 11 for Conv only update document to clearify default dilations and strides value. -// which are already convered by op set 11 cpu versoin, so simply add declaration. +// which are already convered by op set 11 cpu version, so simply add declaration. #define REGISTER_KERNEL_TYPED(T, DOMAIN, NHWC) \ ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ Conv, \ @@ -269,7 +269,7 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) // especially for EXHAUSTIVE algo search which may result in a better algo selection. // ORTModule uses different algo search options (HEURISTIC, and use max workspace size) compared to // inference build (EXHAUSTIVE, 32M workspace size). We observed better perf when we pad input shape - // [N,C,D] to [N,C,1,D], expecially on A100, and especially for ConvGrad. + // [N,C,D] to [N,C,1,D], especially on A100, and especially for ConvGrad. // PyTorch also pads to [N,C,1,D]. For inference build, we still pad it to [N, C, D, 1] as this seems // to be the sweet spot for all algo search options: EXHAUSTIVE, HEURISTIC, and DEFAULT. // See PR #7348 and #7702 for more context. 
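The conv.cc comment above mentions padding a 1-D convolution problem from [N,C,D] to [N,C,1,D] (or [N,C,D,1] in inference builds) before handing it to cuDNN. As a reminder of what that reshaping amounts to, here is a small sketch of the [N,C,1,D] promotion for the input, weight, stride, and pad descriptors; the struct and function names are illustrative, not the provider's actual code.

#include <cstdint>
#include <vector>

// Rank-4 view of a 1-D convolution problem, with a dummy spatial axis of size 1
// inserted in front of D, i.e. [N,C,D] -> [N,C,1,D].
struct PromotedConv1D {
  std::vector<int64_t> x_dims;   // {N, C, 1, D}
  std::vector<int64_t> w_dims;   // {M, C/group, 1, K}
  std::vector<int64_t> strides;  // one entry per spatial axis
  std::vector<int64_t> pads;     // ONNX order: {begin_h, begin_w, end_h, end_w}
};

PromotedConv1D PromoteToNC1D(const std::vector<int64_t>& x,  // {N, C, D}
                             const std::vector<int64_t>& w,  // {M, C/group, K}
                             int64_t stride, int64_t pad_begin, int64_t pad_end) {
  return PromotedConv1D{{x[0], x[1], 1, x[2]},
                        {w[0], w[1], 1, w[2]},
                        {1, stride},
                        {0, pad_begin, 0, pad_end}};
}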
diff --git a/onnxruntime/core/providers/cuda/object_detection/roialign_impl.cu b/onnxruntime/core/providers/cuda/object_detection/roialign_impl.cu index 537ad0a8b9efe..10053c630ab66 100644 --- a/onnxruntime/core/providers/cuda/object_detection/roialign_impl.cu +++ b/onnxruntime/core/providers/cuda/object_detection/roialign_impl.cu @@ -20,7 +20,7 @@ namespace onnxruntime { namespace cuda { - + template __device__ T bilinear_interpolate( const T* bottom_data, @@ -73,8 +73,8 @@ __device__ T bilinear_interpolate( T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; T val = is_mode_avg - ? (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4) // mode Avg - : max(max(max(w1 * v1, w2 * v2), w3 * v3), w4 * v4); // mode Max + ? (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4) // mode Avg + : max(max(max(w1 * v1, w2 * v2), w3 * v3), w4 * v4); // mode Max return val; } @@ -116,7 +116,7 @@ __global__ void RoIAlignForward( T roi_width = roi_end_w - roi_start_w; T roi_height = roi_end_h - roi_start_h; - if (!half_pixel) { // backward compatiblity + if (!half_pixel) { // backward compatibility // Force malformed ROIs to be 1x1 roi_width = max(roi_width, (T)1.); roi_height = max(roi_height, (T)1.); @@ -129,29 +129,29 @@ __global__ void RoIAlignForward( // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : _Ceil(roi_height / pooled_height); // e.g., = 2 + ? sampling_ratio + : _Ceil(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : _Ceil(roi_width / pooled_width); // We do average (integral) pooling inside a bin - const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 T output_val = 0.; bool max_flag = false; - for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 + for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 { const T y = roi_start_h + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < roi_bin_grid_w; ix++) { const T x = roi_start_w + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); T val = bilinear_interpolate( offset_bottom_data, height, width, y, x, is_mode_avg, index); - + if (is_mode_avg) { output_val += val; } else { @@ -174,24 +174,24 @@ __global__ void RoIAlignForward( template void RoiAlignImpl( - cudaStream_t stream, - const int64_t nthreads, - const T* bottom_data, - const T spatial_scale, - const int64_t channels, - const int64_t height, - const int64_t width, - const int64_t pooled_height, - const int64_t pooled_width, - const int64_t sampling_ratio, - const T* bottom_rois, - int64_t roi_cols, - T* top_data, - const bool is_mode_avg, - const bool half_pixel, - const int64_t* batch_indices_ptr) { - int blocksPerGrid = (int)(ceil(static_cast(nthreads) / GridDim::maxThreadsPerBlock)); - RoIAlignForward<<>>( + cudaStream_t stream, + const int64_t nthreads, + const T* bottom_data, + const T spatial_scale, + const int64_t channels, + const int64_t height, + const int64_t width, + const int64_t pooled_height, + const int64_t pooled_width, + const int64_t sampling_ratio, + const T* bottom_rois, + int64_t roi_cols, + T* top_data, + const bool is_mode_avg, + const bool half_pixel, + const int64_t* batch_indices_ptr) { + int 
blocksPerGrid = (int)(ceil(static_cast(nthreads) / GridDim::maxThreadsPerBlock)); + RoIAlignForward<<>>( nthreads, bottom_data, spatial_scale, @@ -206,30 +206,30 @@ void RoiAlignImpl( top_data, is_mode_avg, half_pixel, - batch_indices_ptr); + batch_indices_ptr); } -#define SPECIALIZED_IMPL(T) \ - template void RoiAlignImpl( \ - cudaStream_t stream, \ - const int64_t nthreads, \ - const T* bottom_data, \ - const T spatial_scale, \ - const int64_t channels, \ - const int64_t height, \ - const int64_t width, \ - const int64_t pooled_height, \ - const int64_t pooled_width, \ - const int64_t sampling_ratio, \ - const T* bottom_rois, \ - int64_t roi_cols, \ - T* top_data, \ - const bool is_mode_avg, \ - const bool half_pixel, \ - const int64_t* batch_indices_ptr); +#define SPECIALIZED_IMPL(T) \ + template void RoiAlignImpl( \ + cudaStream_t stream, \ + const int64_t nthreads, \ + const T* bottom_data, \ + const T spatial_scale, \ + const int64_t channels, \ + const int64_t height, \ + const int64_t width, \ + const int64_t pooled_height, \ + const int64_t pooled_width, \ + const int64_t sampling_ratio, \ + const T* bottom_rois, \ + int64_t roi_cols, \ + T* top_data, \ + const bool is_mode_avg, \ + const bool half_pixel, \ + const int64_t* batch_indices_ptr); SPECIALIZED_IMPL(float) SPECIALIZED_IMPL(double) - -} // namespace cuda -} // namespace onnxruntime + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc index bc78e577c5052..c921339ee6f33 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc @@ -115,7 +115,7 @@ Status ReduceKernel::ReduceKernelShared( CUDNN_RETURN_IF_ERROR(cudnnGetReductionIndicesSize(cudnn_handle, reduce_desc, input_tensor, output_tensor, &indices_bytes)); auto indices_cuda = GetScratchBuffer(indices_bytes, stream); - // need to allocate a separate buffer for ArgMin/ArgMax comparsion output + // need to allocate a separate buffer for ArgMin/ArgMax comparison output auto output_count = output_shape.Size(); if (ReduceTensorIndices == CUDNN_REDUCE_TENSOR_NO_INDICES) { diff --git a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu index e788f24052985..a96d4c82a7fdc 100644 --- a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu @@ -234,15 +234,15 @@ __global__ void _ResizeNearestKernel( int output_index = static_cast(id); int input_index = 0; - int extrapolation_occured = 0; + int extrapolation_occurred = 0; for (int axis = 0; axis < rank; ++axis) { int dim = 0; output_div_pitches[axis].divmod(output_index, dim, output_index); const NearestMappingInfo& mi = dims_mapping[prefix_dim_sum[axis] + dim]; - extrapolation_occured += mi.extrapolate_; + extrapolation_occurred += mi.extrapolate_; input_index += input_strides[axis] * mi.origin_; } - output_data[id] = extrapolation_occured ? extrapolation_value : input_data[input_index]; + output_data[id] = extrapolation_occurred ? 
extrapolation_value : input_data[input_index]; } struct LinearMappingInfo { diff --git a/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu b/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu index 6344845359b32..602514d1c8227 100644 --- a/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu @@ -145,7 +145,7 @@ bool CanDoTranspose4DParallelizeMultipleElementsPerThreadInInnermostDim(const cu (input_dims[3] % num_elements_per_thread) == 0 && input_dims[1] <= prop.maxGridSize[1] && input_dims[0] <= prop.maxGridSize[2]) { - // There are 2 constrains when luanching the kernels + // There are 2 constrains when launching the kernels // 1. block_size_x * block_size_y <= prop.maxThreadsPerBlock // 2. block_size_y * num_block_ext >= input_dims[2] int64_t block_size_x = input_dims[3] / num_elements_per_thread; @@ -261,7 +261,7 @@ bool CanDoTranspose4DParallelizeOneElementPerThread(const cudaDeviceProp& prop, if (input_dims[3] <= prop.maxThreadsPerBlock && input_dims[1] <= prop.maxGridSize[1] && input_dims[0] <= prop.maxGridSize[2]) { - // There are 2 constrains when luanching the kernels + // There are 2 constrains when launching the kernels // 1. block_size_x * block_size_y <= prop.maxThreadsPerBlock // 2. block_size_y * num_block_ext >= input_dims[2] int64_t block_size_x = input_dims[3]; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorFusedMatMul.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorFusedMatMul.cpp index 0bc543c56f7d1..a0c9289a87156 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorFusedMatMul.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorFusedMatMul.cpp @@ -44,7 +44,7 @@ class DmlOperatorFusedMatMul : public DmlOperator // At this point, we have manipulated input/output shapes and strides and // we do not care about actual input shapes present in the model (.onnx file). - // Create the TensorDesc with the manipulated input shapes becuase we don't want incorrect + // Create the TensorDesc with the manipulated input shapes because we don't want incorrect // broadcasting to be happen inside TensorDesc constructor. std::vector> inputIndices = { 0, 1, std::nullopt }; gsl::span inputShapes[2] = {sizesA, sizesB}; diff --git a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc index 3271dab13f675..ffda84921a3ee 100644 --- a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc +++ b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc @@ -344,7 +344,7 @@ Status DnnlExecutionProvider::Compile(const std::vector& fuse auto input_tensor = ctx.GetInput(i); auto tensor_info = input_tensor.GetTensorTypeAndShapeInfo(); auto shape = tensor_info.GetShape(); - // dnnl expectes non-const data + // dnnl expects non-const data void* inputBuffer = const_cast(input_tensor.GetTensorRawData()); inputs.emplace( input_name, diff --git a/onnxruntime/core/providers/dnnl/dnnl_node_capability.cc b/onnxruntime/core/providers/dnnl/dnnl_node_capability.cc index 5db52f29a93cf..01f44e91fd49c 100644 --- a/onnxruntime/core/providers/dnnl/dnnl_node_capability.cc +++ b/onnxruntime/core/providers/dnnl/dnnl_node_capability.cc @@ -431,7 +431,7 @@ bool DnnlMatMulIntegerNodeCapability::IsDimensionSupported(const Node* node, con } } - // if shape nullptr, not enough information to reject it. 
attempt to run it (no gaurantee) + // if shape nullptr, not enough information to reject it. attempt to run it (no guarantee) if (node_inputs[0]->Shape() == nullptr || node_inputs[1]->Shape() == nullptr) { return true; } @@ -465,7 +465,7 @@ bool DnnlSumNodeCapability::Supported(const Node* node, const GraphViewer& graph } // OneDNN version of Sum does not support Numpy style broadcasting. -// If the dimentions of all inputs do not match return false +// If the dimensions of all inputs do not match return false bool DnnlSumNodeCapability::IsDimensionSupported(const Node* node) const { auto node_inputs = node->InputDefs(); // find first non-null shape @@ -615,7 +615,7 @@ bool DnnlReshapeNodeCapability::Supported(const Node* node, const GraphViewer& g } bool DnnlReshapeNodeCapability::IsDimensionSupported(const Node* node) const { auto node_inputs = node->InputDefs(); - // We can not reshape a one dimentional tensor to a scalar output + // We can not reshape a one dimensional tensor to a scalar output if (node_inputs[1]->Shape() != nullptr && node_inputs[1]->Shape()->dim_size() == 1 && node_inputs[1]->Shape()->dim(0).dim_value() == 0) { diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_conv.h b/onnxruntime/core/providers/dnnl/subgraph/dnnl_conv.h index 831b10c3e147f..1af9e503e7816 100644 --- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_conv.h +++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_conv.h @@ -32,9 +32,9 @@ class DnnlConv { private: /* - * Return the infered padding. + * Return the inferred padding. * - * The padding will be based on the specified padding or will infered based on the + * The padding will be based on the specified padding or will inferred based on the * Onnx 'auto_pad' attributes. * * This will return the padding in the format specified in the Onnx specification. 
@@ -47,9 +47,9 @@ class DnnlConv { const dnnl::memory::dims& dilations, const std::vector& kernel_shape, const dnnl::memory::dims& strides); - /* Get the padding left values from the infered pads */ + /* Get the padding left values from the inferred pads */ dnnl::memory::dims GetPaddingLeft(const std::vector& onnx_padding, ConvShape shape); - /* Get the padding right values from the infered pads */ + /* Get the padding right values from the inferred pads */ dnnl::memory::dims GetPaddingRight(const std::vector& onnx_padding, ConvShape shape); /* diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_convgrad.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_convgrad.cc index 21218e24c17d6..e05693f3e5f2e 100644 --- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_convgrad.cc +++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_convgrad.cc @@ -40,7 +40,7 @@ ConvGrad: (According to OnnxRuntime discovered using code inspection and Onnx do Attributes (auto_pad, dilations, group, kernel_shap, pads, and strides) should be the same as the forward pass Conv operator -To acheive Everything specified in the OnnxRuntime ConvGrad we must use both: +To achieve Everything specified in the OnnxRuntime ConvGrad we must use both: 1) dnnl::convolution_backward_data - used to calculate (dX) diff_src 2) dnnl::convolution_backward_weights - used to calculate (dW) diff_weights and (dB) diff_bias */ diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_convgrad.h b/onnxruntime/core/providers/dnnl/subgraph/dnnl_convgrad.h index 3a27788745ef0..c45c85859c25e 100644 --- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_convgrad.h +++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_convgrad.h @@ -39,9 +39,9 @@ class DnnlConvGrad { std::vector GetKernelShape(DnnlNode& node); /* Get the 'pads' attribute */ dnnl::memory::dims GetPads(DnnlNode& node, ConvShape shape); - /* Get the padding left values from the infered pads */ + /* Get the padding left values from the inferred pads */ dnnl::memory::dims GetPaddingLeft(const std::vector& onnx_padding, ConvShape shape); - /* Get the padding right values from the infered pads */ + /* Get the padding right values from the inferred pads */ dnnl::memory::dims GetPaddingRight(const std::vector& onnx_padding, ConvShape shape); /* * Get the 'dilations' attribute. 
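The DnnlConv/DnnlConvGrad declarations above distinguish explicit 'pads' from padding inferred via the ONNX 'auto_pad' attribute. For reference, this is the usual SAME_UPPER/SAME_LOWER inference rule for a single spatial axis, written out as a minimal sketch; it illustrates the ONNX convention the comments refer to, not the oneDNN EP implementation.

#include <algorithm>
#include <cstdint>
#include <utility>

// Infer {pad_begin, pad_end} for one spatial axis under ONNX SAME_* auto_pad:
// the output size is ceil(input / stride), and any odd padding goes to the
// "end" side for SAME_UPPER or the "begin" side for SAME_LOWER.
std::pair<int64_t, int64_t> InferSamePadding(int64_t input, int64_t kernel, int64_t stride,
                                             int64_t dilation, bool same_upper) {
  const int64_t effective_kernel = (kernel - 1) * dilation + 1;
  const int64_t output = (input + stride - 1) / stride;  // ceil(input / stride)
  const int64_t total_pad =
      std::max<int64_t>(0, (output - 1) * stride + effective_kernel - input);
  const int64_t small = total_pad / 2;
  const int64_t large = total_pad - small;
  return same_upper ? std::make_pair(small, large) : std::make_pair(large, small);
}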
diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_dequantizelinear.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_dequantizelinear.cc index 074df058806e5..ac668aad1bb4a 100644 --- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_dequantizelinear.cc +++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_dequantizelinear.cc @@ -68,7 +68,7 @@ void DnnlDequantizeLinear::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& auto dst_md = dnnl::memory::desc(x_md.get_dims(), node.Output(OUT_Y).Type(), dnnl::memory::format_tag::any); dnnl::memory dst_mem; - // If zero point exists and we are NOT dequantizing int32, then substract zp from x and scale + // If zero point exists and we are NOT dequantizing int32, then subtract zp from x and scale if (isZeroPointUseful && (x_mem.get_desc().get_data_type() != dnnl::memory::data_type::s32)) { // Get Zero point auto x_zp_mem = sp.GetMemory(node.Input(IN_X_ZERO_POINT)); diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_matmul.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_matmul.cc index 54528011850be..82a9e9f3ec898 100644 --- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_matmul.cc +++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_matmul.cc @@ -126,7 +126,7 @@ void DnnlMatMul::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { } // The reorder from above will get the memory in the right order. The next few lines will create a memory and memory descriptor - // that will have the correct dimentions and correct memory::format + // that will have the correct dimensions and correct memory::format transposedA_md = dnnl::memory::desc(transposedA_dims, node.Input(IN_A).Type(), sp.GetDnnlFormat(transposedA_dims.size())); transposedA_mem = dnnl::memory(transposedA_md, eng, nullptr); void* handle = intermediateA_mem.get_data_handle(); @@ -146,7 +146,7 @@ void DnnlMatMul::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { } // The reorder from above will get the memory in the right order. The next few lines will create a memory and memory descriptor - // that will have the correct dimentions and correct memory::format + // that will have the correct dimensions and correct memory::format transposedB_md = dnnl::memory::desc(transposedB_dims, node.Input(IN_B).Type(), sp.GetDnnlFormat(transposedB_dims.size())); transposedB_mem = dnnl::memory(transposedB_md, eng, nullptr); void* handle = intermediateB_mem.get_data_handle(); @@ -193,8 +193,8 @@ void DnnlMatMul::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { create a post op binary with possible unsqueezing in order to make sure onednn properly broadcast current limitation 1. is no unsqueeze for matmul output as it is not exposed due to post op fusion - 2. the third input has to be reordered to plain format (eg, no memory format propogation if the third input is internal to subgraph) - 3. adding 1s to front (unsqueeze/expand) in logical dims would possibly fail if physcial layout is not plain format + 2. the third input has to be reordered to plain format (eg, no memory format propagation if the third input is internal to subgraph) + 3. 
adding 1s to front (unsqueeze/expand) in logical dims would possibly fail if physical layout is not plain format */ dnnl::primitive_attr attr; if (has_postop_fusion) { diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_reduce.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_reduce.cc index f49fdd7e9bde1..b19411e61767c 100644 --- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_reduce.cc +++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_reduce.cc @@ -135,16 +135,16 @@ void DnnlReduce::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { * shape reduction. For this reason we have code paths that are taken if the source dimensions and * destination dimensions are equal that will not call the reduction op. * - * "ReduceLogSum" is equivelent to Log(ReduceSum(input)) + * "ReduceLogSum" is equivalent to Log(ReduceSum(input)) * - if the reduction op is called then the eltwise_log post op will added to the reduction primitive. * - if the reduction op is not called then the eltwise_log primitive is added as its own primitive * - NOTE "ReduceLogSum" follows the code flow of "All other reduce ops" with the exception of the added * post op and an extra check if src_dims == dest_dims. - * "ReduceLogSumExp" is equivelent to Log(ReduceSum(Exp(input))) + * "ReduceLogSumExp" is equivalent to Log(ReduceSum(Exp(input))) * - if the reduction op is called then the eltwise_exp primitive is added before the reduction op * the eletwise_log post op will be added to the reduction primitive * - if the reduction op is not called then the input is not modified since Log(Exp(input) == input - * "ReduceSumSquare" is equivelent to ReduceSum(Square(input)) + * "ReduceSumSquare" is equivalent to ReduceSum(Square(input)) * - the eltwise_square primitive is added before the reduction op * - if the source and destination dimensions are not equal the reduction op is called * All other reduce ops @@ -298,7 +298,7 @@ void DnnlReduce::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { dnnl::memory squeeze_mem = dnnl::memory(squeeze_md, dnnl_engine, nullptr); // if the src and dst dims are equal then we will have a valid data handle here. // Otherwise we must get the data handle at runtime using the AddReshape function. - // reading the data handle directy is more efficent if is it possible. + // reading the data handle directly is more efficient if it is possible. if (!src_and_dst_dims_equal) { squeeze_mem.set_data_handle(reduce_dst_mem.get_data_handle()); } else { diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.h b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.h index f97268465e46e..a7e49b54d4507 100644 --- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.h +++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.h @@ -65,7 +65,7 @@ class DnnlSubgraphPrimitive { dnnl::memory::desc GetOutputInfo(std::string name); bool IsScalarOutput(const std::string& name); bool IsDynamic(); - // All Scalar inputs are automatically converterted to a one dimentional tensor when used in OneDNN + // All Scalar inputs are automatically converted to a one dimensional tensor when used in OneDNN // If the input being a scalar affects the operator this function can be used to determine if the // original input from ORT was a scalar.
bool IsScalar(const DnnlTensor& tensor); diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_transpose.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_transpose.cc index 3a7f45c72f27f..b74dbf97a2547 100644 --- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_transpose.cc +++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_transpose.cc @@ -56,7 +56,8 @@ void DnnlTranspose::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { strides_inverse.push_back(strides[ndata_dims - i - 1]); } - // Memory descriptor describes the memory reorder but will not have the correct output dimentions or the correct dnnl::memory::format + // Memory descriptor describes the memory reorder but will not have the correct output dimensions + // or the correct dnnl::memory::format dnnl::memory::desc intermediate_md = dnnl::memory::desc(data_dims, node.Input(IN_DATA).Type(), strides); dnnl::memory intermediate_mem = dnnl::memory(intermediate_md, dnnl_engine); @@ -65,7 +66,7 @@ void DnnlTranspose::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { {DNNL_ARG_TO, intermediate_mem}}); // The reorder from above will get the memory in the right order. The next few lines will create a memory and memory descriptor - // that will have the correct dimentions and correct memory::format + // that will have the correct dimensions and correct memory::format dnnl::memory::desc transposed_md = dnnl::memory::desc(transposed_dims, node.Input(IN_DATA).Type(), sp.GetDnnlFormat(data_dims.size())); dnnl::memory transposed_mem = dnnl::memory(transposed_md, dnnl_engine, nullptr); void* handle = intermediate_mem.get_data_handle(); diff --git a/onnxruntime/core/providers/migraphx/migraphx_allocator.cc b/onnxruntime/core/providers/migraphx/migraphx_allocator.cc index 0693eea056416..c9db31e8744a7 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_allocator.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_allocator.cc @@ -42,7 +42,7 @@ void* MIGraphXExternalAllocator::Alloc(size_t size) { if (size > 0) { p = alloc_(size); - // review(codemzs): ORT_ENFORCE does not seem appropiate. + // review(codemzs): ORT_ENFORCE does not seem appropriate. ORT_ENFORCE(p != nullptr); } diff --git a/onnxruntime/core/providers/migraphx/migraphx_stream_handle.cc b/onnxruntime/core/providers/migraphx/migraphx_stream_handle.cc index 9c5bb4ecf5c97..e8e349af75aba 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_stream_handle.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_stream_handle.cc @@ -123,7 +123,7 @@ Status MIGraphXStream::CleanUpOnRunEnd() { } void* MIGraphXStream::GetResource(int version, int id) const { - ORT_ENFORCE(version <= ORT_ROCM_RESOUCE_VERSION, "resource version unsupported!"); + ORT_ENFORCE(version <= ORT_ROCM_RESOURCE_VERSION, "resource version unsupported!"); void* resource{}; switch (id) { case RocmResource::hip_stream_t: diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_lib/nnapi_implementation.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_lib/nnapi_implementation.cc index cdf1075beb827..91d85efd09c65 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_lib/nnapi_implementation.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_lib/nnapi_implementation.cc @@ -228,7 +228,7 @@ const NnApi LoadNnApi() { nnapi.ASharedMemory_create = getASharedMemory_create(); #else // Mock ASharedMemory_create only if libneuralnetworks.so was successfully - // loaded. This ensures identical behaviour on platforms which use this + // loaded. 
This ensures identical behavior on platforms which use this // implementation, but don't have libneuralnetworks.so library, and // platforms which use nnapi_implementation_disabled.cc stub. if (libneuralnetworks != nullptr) { diff --git a/onnxruntime/core/providers/rknpu/rknpu_execution_provider.cc b/onnxruntime/core/providers/rknpu/rknpu_execution_provider.cc index 64d8f235840bc..44b34f4b4ce6c 100644 --- a/onnxruntime/core/providers/rknpu/rknpu_execution_provider.cc +++ b/onnxruntime/core/providers/rknpu/rknpu_execution_provider.cc @@ -28,7 +28,7 @@ constexpr const char* RKNPU = "Rknpu"; struct RknpuFuncState { std::string uniq_input_shape; - std::unique_ptr exector; + std::unique_ptr exector; ONNX_NAMESPACE::ModelProto model_proto; std::unordered_map input_map; std::unordered_map output_map; @@ -282,7 +282,7 @@ common::Status RknpuExecutionProvider::Compile(const std::vector p = std::make_unique(); rk::nn::Graph* graph = new rk::nn::Graph(); - *p = {"", std::unique_ptr(new rk::nn::Exection(graph)), + *p = {"", std::unique_ptr(new rk::nn::Execution(graph)), model_proto_[context->node_name], input_info_[context->node_name], output_info_[context->node_name], std::vector{}, std::vector{}}; diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc index a2b587a56466f..d7f47d07a8fec 100644 --- a/onnxruntime/core/providers/rocm/nn/conv.cc +++ b/onnxruntime/core/providers/rocm/nn/conv.cc @@ -12,7 +12,7 @@ namespace onnxruntime { namespace rocm { // Op Set 11 for Conv only update document to clearify default dilations and strides value. -// which are already convered by op set 11 cpu versoin, so simply add declaration. +// which are already covered by op set 11 cpu version, so simply add declaration. #define REGISTER_KERNEL_TYPED(T) \ ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ Conv, \ diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc index 820745b22f614..11073ab3584eb 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc @@ -226,7 +226,7 @@ Status ReduceKernel::ReduceKernelShared( MIOPEN_RETURN_IF_ERROR(miopenGetReductionIndicesSize(miopen_handle, reduce_desc, input_tensor, output_tensor, &indices_bytes)); auto indices_rocm = GetScratchBuffer(indices_bytes, stream); - // need to allocate a separate buffer for ArgMin/ArgMax comparsion output + // need to allocate a separate buffer for ArgMin/ArgMax comparison output auto output_count = output_shape.Size(); if (ReduceTensorIndices == MIOPEN_REDUCE_TENSOR_NO_INDICES) { diff --git a/onnxruntime/core/providers/rocm/rocm_allocator.cc b/onnxruntime/core/providers/rocm/rocm_allocator.cc index 8645b791d4b0f..4a11b158c2cce 100644 --- a/onnxruntime/core/providers/rocm/rocm_allocator.cc +++ b/onnxruntime/core/providers/rocm/rocm_allocator.cc @@ -60,7 +60,7 @@ void* ROCMExternalAllocator::Alloc(size_t size) { if (size > 0) { p = alloc_(size); - // review(codemzs): ORT_ENFORCE does not seem appropiate. + // review(codemzs): ORT_ENFORCE does not seem appropriate.
ORT_ENFORCE(p != nullptr); } diff --git a/onnxruntime/core/providers/rocm/rocm_stream_handle.cc b/onnxruntime/core/providers/rocm/rocm_stream_handle.cc index 0c0f64a8bfaf0..ef5689fc9a2d0 100644 --- a/onnxruntime/core/providers/rocm/rocm_stream_handle.cc +++ b/onnxruntime/core/providers/rocm/rocm_stream_handle.cc @@ -140,7 +140,7 @@ Status RocmStream::CleanUpOnRunEnd() { } void* RocmStream::GetResource(int version, int id) const { - ORT_ENFORCE(version <= ORT_ROCM_RESOUCE_VERSION, "resource version unsupported!"); + ORT_ENFORCE(version <= ORT_ROCM_RESOURCE_VERSION, "resource version unsupported!"); void* resource{}; switch (id) { case RocmResource::hip_stream_t: diff --git a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc index e839d6d17b7d9..0da0dfc6dfb26 100644 --- a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc +++ b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc @@ -329,7 +329,7 @@ common::Status WebNNExecutionProvider::Compile(const std::vector InferenceSession::GetOverridableI } } - // returns a list of initializers that can be overriden. + // returns a list of initializers that can be overridden. return std::make_pair(common::Status::OK(), &model_->MainGraph().GetOverridableInitializers()); } diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h index e1cd085d2c271..9662095bf0ed3 100644 --- a/onnxruntime/core/session/inference_session.h +++ b/onnxruntime/core/session/inference_session.h @@ -386,7 +386,7 @@ class InferenceSession { * @param run_options run options. * @param mutable_feeds inputs owned by client code and will be released as long as the feeds be set in session states. * Then the feeds will purely managed in the session states. - * @param fetches outputs produced after the executin of this function. + * @param fetches outputs produced after the execution of this function. * @param state State of the graph needed to resume partial graph run. * @param feeds_fetches_manager Contains feed/fetches name to internal indices mapping and information for device * copy/checks. 
diff --git a/onnxruntime/core/util/qmath.h b/onnxruntime/core/util/qmath.h index c982a7aa2e7e0..1b2180da95058 100644 --- a/onnxruntime/core/util/qmath.h +++ b/onnxruntime/core/util/qmath.h @@ -552,7 +552,7 @@ struct BlockedQuantizeLinear { std::ptrdiff_t N, const std::ptrdiff_t quant_block_size, const std::ptrdiff_t thread_block_size, bool saturate) { ORT_UNUSED_PARAMETER(saturate); - // to avoid a byte being writen from mutiple threads, use 2 * N as thread block + // to avoid a byte being written from multiple threads, use 2 * N as thread block ORT_UNUSED_PARAMETER(thread_block_size); constexpr auto low = static_cast(TOut::min_val); constexpr auto high = static_cast(TOut::max_val); @@ -637,7 +637,7 @@ struct BlockedQuantizeLinear { ORT_UNUSED_PARAMETER(saturate); constexpr auto low = static_cast(TOut::min_val); constexpr auto high = static_cast(TOut::max_val); - // to avoid a byte being writen from mutiple threads, use 2 * K as thread block + // to avoid a byte being written from multiple threads, use 2 * K as thread block auto size_thread_block = 2 * K; auto quant_block_num_K = (K + quant_block_size - 1) / quant_block_size; auto num_thread_block = (M + 1) / 2; @@ -697,7 +697,7 @@ struct BlockedQuantizeLinear { std::ptrdiff_t N, const std::ptrdiff_t quant_block_size, const std::ptrdiff_t thread_block_size, bool saturate) { ORT_UNUSED_PARAMETER(saturate); - // to avoid a byte being writen from mutiple threads, use 2 * N as thread block + // to avoid a byte being written from multiple threads, use 2 * N as thread block ORT_UNUSED_PARAMETER(thread_block_size); constexpr auto low = static_cast(TOut::min_val); constexpr auto high = static_cast(TOut::max_val); @@ -786,7 +786,7 @@ struct BlockedQuantizeLinear { ORT_UNUSED_PARAMETER(saturate); constexpr auto low = static_cast(TOut::min_val); constexpr auto high = static_cast(TOut::max_val); - // to avoid a byte being writen from mutiple threads, use 2 * K as thread block + // to avoid a byte being written from multiple threads, use 2 * K as thread block auto size_thread_block = 2 * K; auto quant_block_num_K = (K + quant_block_size - 1) / quant_block_size; auto num_thread_block = (M + 1) / 2; diff --git a/onnxruntime/python/onnxruntime_pybind_schema.cc b/onnxruntime/python/onnxruntime_pybind_schema.cc index 218b59688b01c..c5757095e2e1e 100644 --- a/onnxruntime/python/onnxruntime_pybind_schema.cc +++ b/onnxruntime/python/onnxruntime_pybind_schema.cc @@ -15,7 +15,7 @@ void addGlobalSchemaFunctions(pybind11::module& m) { "get_all_operator_schema", []() -> const std::vector { return ONNX_NAMESPACE::OpSchemaRegistry::get_all_schemas_with_history(); }, - "Return a vector of OpSchema all registed operators"); + "Return a vector of OpSchema all registered operators"); m.def( "get_all_opkernel_def", []() -> const std::vector { std::vector result; diff --git a/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc b/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc index db0b2e392b29f..7dcead113ac4f 100644 --- a/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc +++ b/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc @@ -41,7 +41,7 @@ struct MakeDType { /// /// The function creates a numpy array that points to -/// data stored within the corresponing tensor. Parent object +/// data stored within the corresponding tensor. Parent object /// holds a reference to the object that owns the data so it /// does not disappear.
/// @@ -396,7 +396,7 @@ void addSparseTensorMethods(pybind11::module& m) { }) // pybind apparently has a bug with returning enums from def_property_readonly or methods // returning a method object instead of the enumeration value - // so we are using def_property and throw on a potential modificaiton + // so we are using def_property and throw on a potential modification .def_property( "format", [](const PySparseTensor* py_tensor) -> OrtSparseFormat { const SparseTensor& tensor = py_tensor->Instance(); diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index e13285c60e69f..d7155b2b6899a 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -152,7 +152,7 @@ void AsyncCallback(void* user_data, OrtValue** outputs, size_t num_outputs, OrtS } else { // acquire GIL to safely: // 1) invoke python callback - // 2) create, manipulate, and destory python objects + // 2) create, manipulate, and destroy python objects py::gil_scoped_acquire acquire; invoke_callback(); } @@ -946,7 +946,7 @@ std::unique_ptr CreateExecutionProviderInstance( provider_options_map); // This variable is never initialized because the APIs by which it should be initialized are deprecated, - // however they still exist are are in-use. Neverthless, it is used to return CUDAAllocator, + // however they still exist and are in-use. Nevertheless, it is used to return CUDAAllocator, // hence we must try to initialize it here if we can since FromProviderOptions might contain // external CUDA allocator. external_allocator_info = info.external_allocator_info; @@ -973,14 +973,17 @@ std::unique_ptr CreateExecutionProviderInstance( const ROCMExecutionProviderInfo info = GetRocmExecutionProviderInfo(rocm_provider_info, provider_options_map); - // This variable is never initialized because the APIs by which is it should be initialized are deprecated, however they still - // exist are are in-use. Neverthless, it is used to return ROCMAllocator, hence we must try to initialize it here if we can - // since FromProviderOptions might contain external ROCM allocator. + // This variable is never initialized because the APIs by which is it should be initialized are deprecated, + // however they still exist and are in-use. Nevertheless, it is used to return ROCMAllocator, hence we must + // try to initialize it here if we can since FromProviderOptions might contain external ROCM allocator. external_allocator_info = info.external_allocator_info; return rocm_provider_info->CreateExecutionProviderFactory(info)->CreateProvider(); } else { if (!Env::Default().GetEnvironmentVar("ROCM_PATH").empty()) { - ORT_THROW("ROCM_PATH is set but ROCM wasn't able to be loaded. Please install the correct version of ROCM and MIOpen as mentioned in the GPU requirements page, make sure they're in the PATH, and that your GPU is supported."); + ORT_THROW( + "ROCM_PATH is set but ROCM wasn't able to be loaded. Please install the correct version " + "of ROCM and MIOpen as mentioned in the GPU requirements page, make sure they're in the PATH, " + "and that your GPU is supported."); } } #endif @@ -1389,7 +1392,8 @@ void addGlobalMethods(py::module& m) { LogDeprecationWarning("set_openvino_device", "OpenVINO execution provider option \"device_type\""); openvino_device_type = device_type; }, - "Set the prefered OpenVINO device type to be used. 
If left unset, the device type selected during build time will be used."); + "Set the preferred OpenVINO device type to be used. If left unset, " + "the device type selected during build time will be used."); // TODO remove deprecated global config m.def( "get_openvino_device", []() -> std::string { diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py index 10492ae419817..65875d09102bd 100644 --- a/onnxruntime/python/tools/quantization/calibrate.py +++ b/onnxruntime/python/tools/quantization/calibrate.py @@ -812,7 +812,7 @@ def collect_absolute_value(self, name_to_arr): hist_edges = hist_edges.astype(data_arr_np.dtype) assert ( data_arr_np.dtype != np.float64 - ), "only float32 or float16 is supported, every constant must be explicetly typed" + ), "only float32 or float16 is supported, every constant must be explicitly typed" self.histogram_dict[tensor] = (hist, hist_edges, min_value, max_value) else: old_histogram = self.histogram_dict[tensor] @@ -834,7 +834,7 @@ def collect_absolute_value(self, name_to_arr): hist[: len(old_hist)] += old_hist assert ( data_arr_np.dtype != np.float64 - ), "only float32 or float16 is supported, every constant must be explicetly typed" + ), "only float32 or float16 is supported, every constant must be explicitly typed" self.histogram_dict[tensor] = (hist, hist_edges, min(old_min, min_value), max(old_max, max_value)) def collect_value(self, name_to_arr): diff --git a/onnxruntime/python/tools/quantization/operators/direct_q8.py b/onnxruntime/python/tools/quantization/operators/direct_q8.py index ae9679ae8ec7a..de610a4c01326 100644 --- a/onnxruntime/python/tools/quantization/operators/direct_q8.py +++ b/onnxruntime/python/tools/quantization/operators/direct_q8.py @@ -13,7 +13,7 @@ def quantize(self): node = self.node if not self.quantizer.force_quantize_no_input_check: - # Keep backward compatiblity + # Keep backward compatibility # Quantize when input[0] is quantized already. Otherwise keep it. quantized_input_value = self.quantizer.find_quantized_value(node.input[0]) if quantized_input_value is None: diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py index e4a9b867b1482..0fdef4ef6f6d3 100644 --- a/onnxruntime/python/tools/quantization/quant_utils.py +++ b/onnxruntime/python/tools/quantization/quant_utils.py @@ -357,7 +357,7 @@ def quantize_data( - when data `type == int8`, from `[-m , m]` -> :math:`[-(2^{b-1}-1), 2^{b-1}-1]` where `m = max(abs(rmin), abs(rmax))` - and add necessary intermediate nodes to trasnform quantized weight to full weight using the equation + and add necessary intermediate nodes to transform quantized weight to full weight using the equation :math:`r = S(q-z)`, where diff --git a/onnxruntime/python/tools/transformers/README.md b/onnxruntime/python/tools/transformers/README.md index 547d1a883c165..4f147219f19f1 100644 --- a/onnxruntime/python/tools/transformers/README.md +++ b/onnxruntime/python/tools/transformers/README.md @@ -29,7 +29,7 @@ Models not in the list may only be partially optimized or not optimized at all. - **hidden_size**: (*default: 768*) BERT-base and BERT-large has 768 and 1024 hidden nodes respectively. - **input_int32**: (*optional*) - Exported model ususally uses int64 tensor as input. If this flag is specified, int32 tensors will be used as input, and it could avoid un-necessary Cast nodes and get better performance. + Exported model usually uses int64 tensor as input. 
If this flag is specified, int32 tensors will be used as input, and it could avoid unnecessary Cast nodes and get better performance. - **float16**: (*optional*) By default, model uses float32 in computation. If this flag is specified, half-precision float will be used. This option is recommended for NVidia GPU with Tensor Core like V100 and T4. For older GPUs, float32 is likely faster. - **use_gpu**: (*optional*) diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index 9baafbbfff0e3..5ec2ab4e50799 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -930,7 +930,7 @@ def main(): if len(results) == 0: if args.batch_sizes != [0]: - logger.warning("No any result avaiable.") + logger.warning("No results available.") return csv_filename = args.detail_csv or f"benchmark_detail_{time_stamp}.csv" diff --git a/onnxruntime/python/tools/transformers/large_model_exporter.py b/onnxruntime/python/tools/transformers/large_model_exporter.py index 2083419087a69..0eaccc0fafcc4 100644 --- a/onnxruntime/python/tools/transformers/large_model_exporter.py +++ b/onnxruntime/python/tools/transformers/large_model_exporter.py @@ -368,7 +368,7 @@ def parse_arguments(): required=False, type=str, default=None, - help=("cache directy of huggingface, by setting this to avoid useless downloading if you have one"), + help=("cache directory of huggingface, by setting this to avoid useless downloading if you have one"), ) parser.add_argument( "--with_past", diff --git a/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py b/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py index 6d6a057574a17..7e786fce30985 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py @@ -193,7 +193,7 @@ def main(args): config = AutoConfig.from_pretrained(args.model_name_or_path, torchscript=args.torchscript, cache_dir=cache_dir) model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir) - # This scirpt does not support float16 for PyTorch. + # This script does not support float16 for PyTorch. 
# if args.float16: # model.half() diff --git a/onnxruntime/python/tools/transformers/models/gpt2/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/gpt2/convert_to_onnx.py index 27e3899c11b7a..0ab26308295a9 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/convert_to_onnx.py @@ -105,7 +105,7 @@ def parse_arguments(argv=None): required=False, type=float, default=0, - help="the aboslute and relative tolerance for parity verification", + help="the absolute and relative tolerance for parity verification", ) parser.add_argument( diff --git a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_tester.py b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_tester.py index f4705bef6a988..6bfcb0368eaaa 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_tester.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_tester.py @@ -137,7 +137,7 @@ def __init__( self.has_position_ids = position_ids is not None self.has_attention_mask = attention_mask is not None - # Emtpy past state for first inference + # Empty past state for first inference self.past = [] past_shape = [ 2, diff --git a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb index 43c31e1ea45ac..7295ae1436c99 100644 --- a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb +++ b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb @@ -1665,7 +1665,7 @@ "### Packing Mode (Effective Transformer)\n", "\n", "When padding ratio is high, it is helpful to use packing mode, also known as [effective transformer](https://github.com/bytedance/effective_transformer).\n", - "This feature requires onnxruntime-gpu verison 1.16 or later. \n", + "This feature requires onnxruntime-gpu version 1.16 or later. \n", "\n", "In below example, average sequence length after removing paddings is 32, the sequence length with paddings is 128. We can see 3x throughput with packing mode (QPS increased from 1617 to 5652)." ] diff --git a/onnxruntime/python/tools/transformers/onnx_model_phi.py b/onnxruntime/python/tools/transformers/onnx_model_phi.py index 05a27ba487f4d..5df765033578b 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_phi.py +++ b/onnxruntime/python/tools/transformers/onnx_model_phi.py @@ -65,7 +65,7 @@ def __call__(self, x): return x -# TODO: move to a seperate file +# TODO: move to a separate file class Fission(Fusion): def __init__( self, diff --git a/onnxruntime/python/tools/transformers/onnx_model_tnlr.py b/onnxruntime/python/tools/transformers/onnx_model_tnlr.py index 98235de6ba6fd..f5a47b19d67fc 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_tnlr.py +++ b/onnxruntime/python/tools/transformers/onnx_model_tnlr.py @@ -17,7 +17,7 @@ class FusionTnlrAttention(FusionAttention): """ Fuse TNLR Attention subgraph into one Attention node. - TNLR Attention has extra addtion after qk nodes and adopts [S, B, NH] as I/O shape. + TNLR Attention has extra addition after qk nodes and adopts [S, B, NH] as I/O shape. 
""" def __init__( diff --git a/onnxruntime/python/tools/transformers/optimizer.py b/onnxruntime/python/tools/transformers/optimizer.py index 5f161674b614e..06264b426d1e5 100644 --- a/onnxruntime/python/tools/transformers/optimizer.py +++ b/onnxruntime/python/tools/transformers/optimizer.py @@ -531,7 +531,7 @@ def _parse_arguments(): "--disable_symbolic_shape_infer", required=False, action="store_true", - help="diable symoblic shape inference", + help="diable symbolic shape inference", ) parser.set_defaults(disable_symbolic_shape_infer=False) diff --git a/onnxruntime/python/tools/transformers/shape_optimizer.py b/onnxruntime/python/tools/transformers/shape_optimizer.py index 503930b23229f..17fd54f19baf2 100644 --- a/onnxruntime/python/tools/transformers/shape_optimizer.py +++ b/onnxruntime/python/tools/transformers/shape_optimizer.py @@ -3,7 +3,7 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- -# This tool is not used directly in bert optimization. It could assist developing the optimization script on the following senarios: +# This tool is not used directly in bert optimization. It could assist developing the optimization script on the following scenarios: # (1) It could simplify graph by removing many sub-graphs related to reshape. # (2) It could reduce extra inputs and outputs to fit other tools. The script compare_bert_results.py or bert_perf_test.py requires 3 inputs. diff --git a/onnxruntime/test/contrib_ops/attention_lstm_op_test.cc b/onnxruntime/test/contrib_ops/attention_lstm_op_test.cc index e78b3528c11a4..0634f545e6f7b 100644 --- a/onnxruntime/test/contrib_ops/attention_lstm_op_test.cc +++ b/onnxruntime/test/contrib_ops/attention_lstm_op_test.cc @@ -266,8 +266,8 @@ static const std::vector s_M_2batch{0.1f, -0.25f, 1.0f, 1.0f, -1.0f, -1.5 0.1f, -0.25f, 0.5f, -0.25f, -1.25f, 0.25f, -1.0f, 1.5f, -1.25f}; // real seq lens for memory -static std::vector s_mem_seq_lenghts{3}; -static const std::vector s_mem_seq_lenghts_2batch{3, 2}; +static std::vector s_mem_seq_lengths{3}; +static const std::vector s_mem_seq_lengths_2batch{3, 2}; // [batch_size=1, input_max_step=3, input_only_depth=3] static std::vector s_X_T_data{ @@ -352,7 +352,7 @@ TEST(AttnLSTMTest, ForwardLstmWithBahdanauAMZeroAttention) { RunAttnLstmTest( X_data, W_data, R_data, Y_data, Y_h_data, Y_c_data, - s_memory_layer_weight, s_query_layer_weight, s_attn_v, s_M_data, &s_mem_seq_lenghts, &zero_attn_layer_weight, + s_memory_layer_weight, s_query_layer_weight, s_attn_v, s_M_data, &s_mem_seq_lengths, &zero_attn_layer_weight, input_only_depth, batch_size, cell_hidden_size, input_max_step, memory_max_step, memory_depth, am_attn_size, aw_attn_size, &B_data, nullptr, nullptr, nullptr, &s_seq_lengths, @@ -389,7 +389,7 @@ TEST(AttnLSTMTest, ForwardLstmWithBahdanauAM) { RunAttnLstmTest( X_data, W_data, R_data, Y_data, Y_h_data, Y_c_data, - s_memory_layer_weight, s_query_layer_weight, s_attn_v, s_M_data, &s_mem_seq_lenghts, &s_attn_layer_weight, + s_memory_layer_weight, s_query_layer_weight, s_attn_v, s_M_data, &s_mem_seq_lengths, &s_attn_layer_weight, input_only_depth, batch_size, cell_hidden_size, input_max_step, memory_max_step, memory_depth, am_attn_size, aw_attn_size, &B_data, nullptr, nullptr, nullptr, &s_seq_lengths, @@ -428,7 +428,7 @@ TEST(AttnLSTMTest, ForwardLstmWithBahdanauAMShortenSeqLength) { RunAttnLstmTest( X_data, W_data, R_data, Y_data, Y_h_data, Y_c_data, - s_memory_layer_weight, s_query_layer_weight, s_attn_v, s_M_data, &s_mem_seq_lenghts, 
&s_attn_layer_weight, + s_memory_layer_weight, s_query_layer_weight, s_attn_v, s_M_data, &s_mem_seq_lengths, &s_attn_layer_weight, input_only_depth, batch_size, cell_hidden_size, input_max_step, memory_max_step, memory_depth, am_attn_size, aw_attn_size, &B_data, nullptr, nullptr, nullptr, &shortenSeqLen, @@ -467,7 +467,7 @@ TEST(AttnLSTMTest, ReverseLstmWithBahdanauAMShortenSeqLength) { RunAttnLstmTest( X_data, W_data, R_data, Y_data, Y_h_data, Y_c_data, - s_memory_layer_weight, s_query_layer_weight, s_attn_v, s_M_data, &s_mem_seq_lenghts, &s_attn_layer_weight, + s_memory_layer_weight, s_query_layer_weight, s_attn_v, s_M_data, &s_mem_seq_lengths, &s_attn_layer_weight, input_only_depth, batch_size, cell_hidden_size, input_max_step, memory_max_step, memory_depth, am_attn_size, aw_attn_size, &B_data, nullptr, nullptr, nullptr, &shortenSeqLen, @@ -521,7 +521,7 @@ TEST(AttnLSTMTest, BidirectionLstmWithBahdanauAMShortenSeqLength) { RunAttnLstmTest( X_data, d_W_data, d_R_data, Y_data, Y_h_data, Y_c_data, - d_memory_layer_weight, d_query_layer_weight, d_attn_v, s_M_data, &s_mem_seq_lenghts, &d_attn_layer_weight, + d_memory_layer_weight, d_query_layer_weight, d_attn_v, s_M_data, &s_mem_seq_lengths, &d_attn_layer_weight, input_only_depth, batch_size, cell_hidden_size, input_max_step, memory_max_step, memory_depth, am_attn_size, aw_attn_size, &d_B_data, nullptr, nullptr, nullptr, &shortenSeqLen, @@ -578,7 +578,7 @@ TEST(AttnLSTMTest, BidirectionLstmWithBahdanauAM2BatchShortenSeqLen) { RunAttnLstmTest( X_data, d_W_data, d_R_data, Y_data, Y_h_data, Y_c_data, - d_memory_layer_weight, d_query_layer_weight, d_attn_v, s_M_2batch, &s_mem_seq_lenghts_2batch, &d_attn_layer_weight, + d_memory_layer_weight, d_query_layer_weight, d_attn_v, s_M_2batch, &s_mem_seq_lengths_2batch, &d_attn_layer_weight, input_only_depth, batch2Size, cell_hidden_size, inputMaxStep4, memory_max_step, memory_depth, am_attn_size, aw_attn_size, &d_B_data, nullptr, nullptr, nullptr, &s_seq_lengths_2batch, diff --git a/onnxruntime/test/framework/allocation_planner_test.cc b/onnxruntime/test/framework/allocation_planner_test.cc index bf15a9d35b56a..26e40b25930c8 100644 --- a/onnxruntime/test/framework/allocation_planner_test.cc +++ b/onnxruntime/test/framework/allocation_planner_test.cc @@ -1288,7 +1288,7 @@ TEST_F(PlannerTest, MultiStream) { CreatePlan({}, false); - EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan.size(), 2) << "2 logic streams for CPU and CUDA seperately"; + EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan.size(), 2) << "2 logic streams for CPU and CUDA separately"; EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan[0]->steps_.size(), 6) << "CPU stream has 6 steps"; EXPECT_NE(strstr(typeid(*GetState().GetExecutionPlan()->execution_plan[0]->steps_[0]).name(), "LaunchKernelStep"), nullptr) << "0th step: LaunchKernelStep for node 1"; EXPECT_NE(strstr(typeid(*GetState().GetExecutionPlan()->execution_plan[0]->steps_[1]).name(), "LaunchKernelStep"), nullptr) << "1st step: LaunchKernelStep for node 2"; diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index 84389c1d9711c..8b230db351edc 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -1400,7 +1400,7 @@ TEST(ExecutionProviderTest, OpKernelInfoCanReadConfigOptions) { so.session_logid = "ExecutionProviderTest.OpKernelInfoCanReadConfigOptions"; // add a config key that if read causes the Fuse op kernel to throw in 
the ctor. this is just to test the value is passed - // through in the simplest way, as the kernel is constructed in InferenceSession::Intialize so we don't need to + // through in the simplest way, as the kernel is constructed in InferenceSession::Initialize so we don't need to // actually run the model. ASSERT_STATUS_OK(so.config_options.AddConfigEntry("ThrowInKernelCtor", "1")); diff --git a/onnxruntime/test/framework/tunable_op_test.cc b/onnxruntime/test/framework/tunable_op_test.cc index 6fe0754db40d3..53aa949647c77 100644 --- a/onnxruntime/test/framework/tunable_op_test.cc +++ b/onnxruntime/test/framework/tunable_op_test.cc @@ -668,7 +668,7 @@ TEST(TuningContext, TunableOpRespectTuningContext) { ASSERT_TRUE(status.IsOK()); ASSERT_EQ(last_run, "FastFull"); - // After TunableOp(...), the result entry is corretly written. + // After TunableOp(...), the result entry is correctly written. ASSERT_EQ(mgr.Lookup(op.Signature()).size(), 1u); ASSERT_EQ(mgr.Lookup(op.Signature(), params.Signature()), tuning::TunableVecAddSelectFast::kFastFullId); } diff --git a/onnxruntime/test/fuzzing/include/BetaDistribution.h b/onnxruntime/test/fuzzing/include/BetaDistribution.h index c5c59922d864c..40e42a598c85a 100644 --- a/onnxruntime/test/fuzzing/include/BetaDistribution.h +++ b/onnxruntime/test/fuzzing/include/BetaDistribution.h @@ -83,7 +83,7 @@ class BetaDistribution { calc_type highest_probability_temp = highest_probability; highest_probability = std::max({highest_probability_temp, distribution(sample)}); - // A new sample number with a higher probabilty has been found + // A new sample number with a higher probability has been found // if (highest_probability > highest_probability_temp) { likely_number = sample; @@ -137,7 +137,7 @@ class BetaDistribution { } } - // Generate the probabilty of having this number + // Generate the probability of having this number // inline calc_type distribution(calc_type randVar) { if (randVar > max() || randVar < min()) { diff --git a/onnxruntime/test/fuzzing/src/test.cpp b/onnxruntime/test/fuzzing/src/test.cpp index 0d51af6b6b0fa..490f7dd4d37a3 100644 --- a/onnxruntime/test/fuzzing/src/test.cpp +++ b/onnxruntime/test/fuzzing/src/test.cpp @@ -365,7 +365,7 @@ int main(int argc, char* argv[]) { std::ifstream ortModelStream(ort_model_file, std::ifstream::in | std::ifstream::binary); ortModelStream.read(model_data.data(), num_bytes); ortModelStream.close(); - // Currently mutations are generated by using XOR of a byte with the preceeding byte at a time. + // Currently mutations are generated by using XOR of a byte with the preceding byte at a time. // Other possible ways may be considered in future, for example swaping two bytes randomly at a time. Logger::testLog << "Starting Test" << Logger::endl; for (size_t& i = run_stats.iteration; i < num_bytes - 1; i++) { diff --git a/onnxruntime/test/ir/graph_test.cc b/onnxruntime/test/ir/graph_test.cc index 5fc036790b765..f6b7bdb1a001c 100644 --- a/onnxruntime/test/ir/graph_test.cc +++ b/onnxruntime/test/ir/graph_test.cc @@ -464,7 +464,7 @@ TEST_F(GraphTest, LocalCustomRegistry) { // Tests the case where function op and function body ops belong to different domains. // Tests that such a model can be loaded successfully, function body initialization is -// successful and domain and verison mapping for each node is successful (by verifying +// successful and domain and version mapping for each node is successful (by verifying // op schema for each of the function body nodes can be found). 
TEST_F(GraphTest, FunctionOpsetImportTest) { std::shared_ptr model; @@ -481,7 +481,7 @@ TEST_F(GraphTest, FunctionOpsetImportTest) { // phase .i.e. Init function body only if none of EPs have a kernel matching the function op // then this check will not hold true and should be removed. - // We delay the funciton instantiate untill partition the graph + // We delay the function instantiate until partition the graph // this check is no longer valid anymore. /*ASSERT_TRUE(!schema->HasFunction() && !schema->HasContextDependentFunction());*/ continue; diff --git a/onnxruntime/test/ir/schema_registry_manager_test.cc b/onnxruntime/test/ir/schema_registry_manager_test.cc index 704c84343173a..52c286d187e53 100644 --- a/onnxruntime/test/ir/schema_registry_manager_test.cc +++ b/onnxruntime/test/ir/schema_registry_manager_test.cc @@ -89,7 +89,7 @@ TEST(SchemaRegistryManager, OpsetRegTest) { // registry2 has:(op1,domain1,version2) ASSERT_TRUE(registry2->GetSchema("Op1", 1, "Domain1") == nullptr); ASSERT_TRUE(registry2->GetSchema("Op1", 2, "Domain1") != nullptr); - // Fail because this registery doesn't have the information of opset3 + // Fail because this registry doesn't have the information of opset3 ASSERT_TRUE(registry2->GetSchema("Op1", 3, "Domain1") == nullptr); std::shared_ptr registry3 = std::make_shared(); @@ -126,7 +126,7 @@ TEST(SchemaRegistryManager, OpsetRegTest) { // Note that "Op5" has SinceVersion equal to 1, but a V1 operator set was already registered // without this operator. This would normally be invalid, and the registry with the missing // operator could trigger the operator lookup to fail. Version 1 is a special case to allow - // for experimental operators, and is accomplished by not reducing the targetted version to + // for experimental operators, and is accomplished by not reducing the targeted version to // zero in OnnxRuntimeOpSchemaRegistry::GetSchemaAndHistory. // TODO - Consider making the registration algorithm robust to this invalid usage in general ASSERT_TRUE(manager.GetSchema("Op5", 5, "Domain1")->since_version() == 1); diff --git a/onnxruntime/test/mlas/unittest/test_fgemm_fixture.h b/onnxruntime/test/mlas/unittest/test_fgemm_fixture.h index 05c6a0098eecb..53b3edafdf84f 100644 --- a/onnxruntime/test/mlas/unittest/test_fgemm_fixture.h +++ b/onnxruntime/test/mlas/unittest/test_fgemm_fixture.h @@ -9,7 +9,7 @@ #include // -// Short Execute() test helper to register each test seperately by all parameters. +// Short Execute() test helper to register each test separately by all parameters. // template class FgemmShortExecuteTest : public MlasTestFixture> { diff --git a/onnxruntime/test/mlas/unittest/test_halfgemm.cpp b/onnxruntime/test/mlas/unittest/test_halfgemm.cpp index 2a478675d09eb..aafdcc14c0028 100644 --- a/onnxruntime/test/mlas/unittest/test_halfgemm.cpp +++ b/onnxruntime/test/mlas/unittest/test_halfgemm.cpp @@ -17,7 +17,7 @@ Module Name: #include "test_halfgemm.h" // -// Short Execute() test helper to register each test seperately by all parameters. +// Short Execute() test helper to register each test separately by all parameters. 
// template class HalfGemmShortExecuteTest : public MlasTestFixture> { diff --git a/onnxruntime/test/mlas/unittest/test_pool2d_fixture.h b/onnxruntime/test/mlas/unittest/test_pool2d_fixture.h index 2ede8c3f0ab11..cb748bbaccce0 100644 --- a/onnxruntime/test/mlas/unittest/test_pool2d_fixture.h +++ b/onnxruntime/test/mlas/unittest/test_pool2d_fixture.h @@ -7,7 +7,7 @@ #include "test_pool2d.h" // -// Short Execute() test helper to register each test seperately by all parameters. +// Short Execute() test helper to register each test separately by all parameters. // template class Pooling2dShortExecuteTest : public MlasTestFixture { diff --git a/onnxruntime/test/mlas/unittest/test_pool3d_fixture.h b/onnxruntime/test/mlas/unittest/test_pool3d_fixture.h index 00f95bb00b9ae..e3d2aebc39cec 100644 --- a/onnxruntime/test/mlas/unittest/test_pool3d_fixture.h +++ b/onnxruntime/test/mlas/unittest/test_pool3d_fixture.h @@ -7,7 +7,7 @@ #include "test_pool3d.h" // -// Short Execute() test helper to register each test seperately by all parameters. +// Short Execute() test helper to register each test separately by all parameters. // template class Pooling3dShortExecuteTest : public MlasTestFixture> { diff --git a/onnxruntime/test/mlas/unittest/test_qgemm_fixture.h b/onnxruntime/test/mlas/unittest/test_qgemm_fixture.h index b2657fbde9afa..40f688a16ecca 100644 --- a/onnxruntime/test/mlas/unittest/test_qgemm_fixture.h +++ b/onnxruntime/test/mlas/unittest/test_qgemm_fixture.h @@ -7,7 +7,7 @@ #include "test_qgemm.h" // -// Short Execute() test helper to register each test seperately by all parameters. +// Short Execute() test helper to register each test separately by all parameters. // template class QgemmShortExecuteTest; diff --git a/onnxruntime/test/mlas/unittest/test_sbgemm.cpp b/onnxruntime/test/mlas/unittest/test_sbgemm.cpp index 941de8f05061f..f85fe97776dc1 100644 --- a/onnxruntime/test/mlas/unittest/test_sbgemm.cpp +++ b/onnxruntime/test/mlas/unittest/test_sbgemm.cpp @@ -20,7 +20,7 @@ Module Name: #include "test_sbgemm.h" // -// Short Execute() test helper to register each test seperately by all parameters. +// Short Execute() test helper to register each test separately by all parameters. // template class SBGemmShortExecuteTest : public MlasTestFixture> { @@ -76,7 +76,7 @@ class SBGemmShortExecuteTest : public MlasTestFixture class SymmQgemmShortExecuteTest; diff --git a/onnxruntime/test/optimizer/transpose_optimizer_test.cc b/onnxruntime/test/optimizer/transpose_optimizer_test.cc index ea2823916798e..5ecbf4967b044 100644 --- a/onnxruntime/test/optimizer/transpose_optimizer_test.cc +++ b/onnxruntime/test/optimizer/transpose_optimizer_test.cc @@ -4883,7 +4883,7 @@ static void CheckSharedInitializerHandling(bool broadcast) { // test we re-use a modified shared initializer wherever possible. model has one initializer that is used by 3 DQ nodes // and one initializer that is used by 2 Add nodes. both cases should be handled with the initializer being -// modified in-place for the first usage, and the Transpose added to the second usage being cancelled out when the +// modified in-place for the first usage, and the Transpose added to the second usage being canceled out when the // original Transpose at the start of the model is pushed down. 
TEST(TransposeOptimizerTests, SharedInitializerHandling) { CheckSharedInitializerHandling(/*broadcast*/ false); @@ -4899,7 +4899,7 @@ TEST(TransposeOptimizerTests, SharedInitializerHandlingBroadcast) { } // Unit test where EstimateTransposeValueCost must look past a DQ -> Squeeze to see the Transponse of a shared -// initializer for the overall cost of pushing the Transpose throught the second Where to be negative. +// initializer for the overall cost of pushing the Transpose through the second Where to be negative. TEST(TransposeOptimizerTests, SharedInitializerHandlingBroadcast2) { auto model_uri = ORT_TSTR("testdata/transpose_optimizer_shared_initializers_broadcast2.onnx"); diff --git a/onnxruntime/test/perftest/ReadMe.txt b/onnxruntime/test/perftest/ReadMe.txt index 4142beefbd034..9c0dbf5d673e7 100644 --- a/onnxruntime/test/perftest/ReadMe.txt +++ b/onnxruntime/test/perftest/ReadMe.txt @@ -10,7 +10,7 @@ Options: -h: help Model path and input data dependency: - Performance test uses the same input structure as onnx_test_runner. It requrires the direcotry trees as below: + Performance test uses the same input structure as onnx_test_runner. It requires the directory trees as below: --ModelName --test_data_set_0 diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index 92d732fba2a0a..0e4f0d0cad3f4 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -941,7 +941,7 @@ bool OnnxRuntimeTestSession::PopulateGeneratedInputTestData(int32_t seed) { auto tensor_info = type_info.GetTensorTypeAndShapeInfo(); std::vector input_node_dim = tensor_info.GetShape(); - // free dimensions are treated as 1 if not overriden + // free dimensions are treated as 1 if not overridden for (int64_t& dim : input_node_dim) { if (dim == -1) { dim = 1; } } diff --git a/onnxruntime/test/platform/android/cxa_demangle_test.cc b/onnxruntime/test/platform/android/cxa_demangle_test.cc index 47f149c4d3a22..dbb050ce623f4 100644 --- a/onnxruntime/test/platform/android/cxa_demangle_test.cc +++ b/onnxruntime/test/platform/android/cxa_demangle_test.cc @@ -27,7 +27,7 @@ TEST(DummyCxaDemangleTest, Alloc) { ASSERT_STREQ(output_buffer, input); std::free(output_buffer); - // verify status can be omited + // verify status can be omitted char* output_buffer2 = __cxa_demangle(input, nullptr, nullptr, nullptr); ASSERT_STREQ(output_buffer2, input); std::free(output_buffer2); diff --git a/onnxruntime/test/providers/cpu/controlflow/scan_test.cc b/onnxruntime/test/providers/cpu/controlflow/scan_test.cc index e5f3956438b7a..6bf2fc63ab165 100644 --- a/onnxruntime/test/providers/cpu/controlflow/scan_test.cc +++ b/onnxruntime/test/providers/cpu/controlflow/scan_test.cc @@ -155,7 +155,7 @@ static common::Status CreateSubgraph(Graph& graph, RunOptions& options, const st graph.AddNode("add", "Add", "Add 1 to the loop state", inputs, outputs); } - // subgraph with multiple inputs and outputs to test variadic behaviour. + // subgraph with multiple inputs and outputs to test variadic behavior. 
// 2 inputs of 2 that are concatenated and then split into 4 outputs of 1 // Concat node diff --git a/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc b/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc index e73a1b492cc05..3b7e93b8f7668 100644 --- a/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc +++ b/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc @@ -284,7 +284,7 @@ TEST(LSTMTest, MixedSequenceLengths) { } // we don't have numpy output for this, but by testing twice and swapping which batch is smaller - // we can largely verify the behaviour by comparing to ForwardSimpleWeightsNoBiasTwoRows output. + // we can largely verify the behavior by comparing to ForwardSimpleWeightsNoBiasTwoRows output. std::vector seq_lengths{1, 2}; std::vector Y_data{ @@ -333,7 +333,7 @@ TEST(LSTMTest, MixedSequenceLengthsReverse) { } // we don't have numpy output for this, but by testing twice and swapping which batch is smaller - // we can largely verify the behaviour by comparing to ReverseSimpleWeightsNoBiasTwoRows output. + // we can largely verify the behavior by comparing to ReverseSimpleWeightsNoBiasTwoRows output. std::vector seq_lengths{1, 2}; std::vector Y_data{ diff --git a/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc b/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc index 27a0696acb599..b413d04fe81e8 100644 --- a/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc @@ -14,7 +14,7 @@ namespace test { TEST(AllocatorTest, CUDAAllocatorTest) { OrtDevice::DeviceId cuda_device_id = 0; - // ensure CUDA device is avaliable. + // ensure CUDA device is available. CUDA_CALL_THROW(cudaSetDevice(cuda_device_id)); AllocatorCreationInfo default_memory_info( diff --git a/onnxruntime/test/providers/dnnl/transformer/matmul_post_op_transform_test.cc b/onnxruntime/test/providers/dnnl/transformer/matmul_post_op_transform_test.cc index a13fa91366aaf..1274efedbeb61 100644 --- a/onnxruntime/test/providers/dnnl/transformer/matmul_post_op_transform_test.cc +++ b/onnxruntime/test/providers/dnnl/transformer/matmul_post_op_transform_test.cc @@ -14,10 +14,10 @@ * The tests validate that if a fusion occures the expected output matches * the output of each graph if they had not be done separatly. * - * Unfortantly there is no hook to actually check that the fussion occured + * Unfortunately there is no hook to actually check that the fusion occurred * other than inspecting debug logs. * - * The 8 tests use patterns that we have seen in actual models durring testing. + * The 8 tests use patterns that we have seen in actual models during testing. * Other tests validate that non-associative ops work as expected. We are able * to fuse the output of matmul divided by another value but we can not fuse * the a value divided by the output of matmul. Similar with Subtraction. @@ -673,7 +673,7 @@ TEST(DnnlMatMulFusion, matmul_div_sub_1) { // in the matmul post op fusion to check that the 32 post op // limit is not exceded. 
// to do this we just run the matmul->[add->mul->sub-div] 9 times -// input params are shared accross multiple ops +// input params are shared across multiple ops class Dnnl_matmul_36_post_ops_PostOpTester : public OpTester { public: explicit Dnnl_matmul_36_post_ops_PostOpTester(int opset_version = 7) diff --git a/onnxruntime/test/providers/internal_testing/internal_testing_partitioning_tests.cc b/onnxruntime/test/providers/internal_testing/internal_testing_partitioning_tests.cc index 8cf7efe14b1c9..d58db5178032d 100644 --- a/onnxruntime/test/providers/internal_testing/internal_testing_partitioning_tests.cc +++ b/onnxruntime/test/providers/internal_testing/internal_testing_partitioning_tests.cc @@ -83,7 +83,7 @@ TEST(InternalTestingEP, TestSortResultsInSinglePartition) { } // mode has Resize op with optional input roi which is just a placeholder. -// partition funtion should skip the placeholder inputs. +// partition function should skip the placeholder inputs. TEST(InternalTestingEP, TestResizeWithOptionalInput) { // Resize op has optional input roi which is just a placeholder const ORTCHAR_T* model_path = ORT_TSTR("testdata/model_resize_empty_optional_input.onnx"); diff --git a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc index 012845f5eb161..a3768cb98f584 100644 --- a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc @@ -654,7 +654,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinary2InputsTest) { // Context binary only contains a single QNN graph, generated context cache model (detached mode) only has 1 EPContext node // Create another Onnx model which also reference to the bin file, // but the node name is not same with the QNN graph name inside the bin file. -// This is to support backward compitable for the models generated before the PR that +// This is to support backward compatibility for the models generated before the PR that // make context generation support multi-partition TEST_F(QnnHTPBackendTests, QnnContextBinaryCache_SingleNodeNameNotMatchGraphNameInCtx) { ProviderOptions provider_options; @@ -732,7 +732,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryCache_SingleNodeNameNotMatchGraphName ASSERT_EQ(std::remove(context_bin.string().c_str()), 0); } -// Model has 2 EPContext nodes, both with main_context=1 and embeded context binary +// Model has 2 EPContext nodes, both with main_context=1 and embedded context binary TEST_F(QnnHTPBackendTests, QnnMultiContextEmbeded) { ProviderOptions provider_options; #if defined(_WIN32) diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index 892e7de8bb6ed..32eac6f7638c1 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -1783,7 +1783,7 @@ def test_multiple_devices(self): return # https://github.com/microsoft/onnxruntime/issues/18432. 
Make sure device Id is properly set - # Scenario 1, 3 sessions created with differnt device Id under IOBinding + # Scenario 1, 3 sessions created with different device Id under IOBinding sessions = [] for i in range(3): sessions.append( diff --git a/onnxruntime/test/python/onnxruntime_test_python_mlops.py b/onnxruntime/test/python/onnxruntime_test_python_mlops.py index 6cdf820c8a0e9..8b6b029c57752 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_mlops.py +++ b/onnxruntime/test/python/onnxruntime_test_python_mlops.py @@ -173,7 +173,7 @@ def test_run_model_mlnet(self): # In memory, the size of each element is fixed and equal to the # longest element. We cannot use bytes because numpy is trimming # every final 0 for strings and bytes before creating the array - # (to save space). It does not have this behaviour for void + # (to save space). It does not have this behavior for void # but as a result, numpy does not know anymore the size # of each element, they all have the same size. c1 = np.array([b"A\0A\0\0", b"B\0B\0\0", b"C\0C\0\0"], np.void).reshape(1, 3) diff --git a/onnxruntime/test/python/onnxruntime_test_python_sparse_matmul.py b/onnxruntime/test/python/onnxruntime_test_python_sparse_matmul.py index 22a09ef565d59..fe64aac54951b 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_sparse_matmul.py +++ b/onnxruntime/test/python/onnxruntime_test_python_sparse_matmul.py @@ -54,7 +54,7 @@ def test_run_sparse_output_only(self): def test_run_contrib_sparse_mat_mul(self): """ - Mutliple sparse COO tensor to dense + Multiple sparse COO tensor to dense """ common_shape = [9, 9] # inputs and oputputs same shape A_values = np.array( # noqa: N806 diff --git a/onnxruntime/test/python/quantization/test_quantize_static_resnet.py b/onnxruntime/test/python/quantization/test_quantize_static_resnet.py index 1efa283af6881..d105f647c813b 100644 --- a/onnxruntime/test/python/quantization/test_quantize_static_resnet.py +++ b/onnxruntime/test/python/quantization/test_quantize_static_resnet.py @@ -87,7 +87,7 @@ def test_quantize_static_resnet(self): # * uint8([128, 128, ..., 127, ...]) if per_channel is True # QLinearConv : zero point of per-channel filter must be same. # That's why the quantization forces a symmetric quantization into INT8. - # zero_point is guaranted to be zero whatever the channel is. + # zero_point is guaranteed to be zero whatever the channel is. with open(qdq_file, "rb") as f: onx = onnx.load(f) diff --git a/onnxruntime/test/python/transformers/test_generation.py b/onnxruntime/test/python/transformers/test_generation.py index 33ec1bd7728fe..88f870e92d558 100644 --- a/onnxruntime/test/python/transformers/test_generation.py +++ b/onnxruntime/test/python/transformers/test_generation.py @@ -47,7 +47,7 @@ def setUp(self): "Test best way to invest", # "The AI community building the future", # "The selloff in tech shares deepened", - # "Abortion rights take centre stage", + # "Abortion rights take center stage", ] self.enable_cuda = torch.cuda.is_available() and "CUDAExecutionProvider" in get_available_providers() self.remove_onnx_files() diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index eacd41e6b9c6d..52491a179c2ce 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -1620,7 +1620,7 @@ TEST(CApiTest, test_custom_op_openvino_wrapper_library) { // It has memory leak. 
The OrtCustomOpDomain created in custom_op_library.cc:RegisterCustomOps function was not freed #if defined(__ANDROID__) TEST(CApiTest, DISABLED_test_custom_op_library) { -// To accomodate a reduced op build pipeline +// To accommodate a reduced op build pipeline #elif defined(REDUCED_OPS_BUILD) && defined(USE_CUDA) TEST(CApiTest, DISABLED_test_custom_op_library) { #else @@ -1674,7 +1674,7 @@ TestInference(*ort_env, CUSTOM_OP_LIBRARY_TEST_MODEL_URI, inputs, "outp // Has memory leak #if defined(__ANDROID__) || defined(ABSL_HAVE_ADDRESS_SANITIZER) TEST(CApiTest, DISABLED_test_custom_op_shape_infer_attr) { -// To accomodate a reduced op build pipeline +// To accommodate a reduced op build pipeline #elif defined(REDUCED_OPS_BUILD) && defined(USE_CUDA) TEST(CApiTest, DISABLED_test_custom_op_shape_infer_attr) { #else @@ -1705,7 +1705,7 @@ TEST(CApiTest, test_custom_op_shape_infer_attr) { // It has memory leak. The OrtCustomOpDomain created in custom_op_library.cc:RegisterCustomOps function was not freed #if defined(__ANDROID__) TEST(CApiTest, test_custom_op_library_copy_variadic) { -// To accomodate a reduced op build pipeline +// To accommodate a reduced op build pipeline #elif defined(REDUCED_OPS_BUILD) && defined(USE_CUDA) TEST(CApiTest, test_custom_op_library_copy_variadic) { #else diff --git a/onnxruntime/test/testdata/transform/model_creation_for_testing.ipynb b/onnxruntime/test/testdata/transform/model_creation_for_testing.ipynb index f8af2d8a9f6e8..e6118e3b53b1d 100644 --- a/onnxruntime/test/testdata/transform/model_creation_for_testing.ipynb +++ b/onnxruntime/test/testdata/transform/model_creation_for_testing.ipynb @@ -309,7 +309,7 @@ " helper.make_node('Slice', ['E', 'startsE', 'endsE', 'axesE', 'stepsE'], ['F']),\n", " # Will be removed.\n", " helper.make_node('Slice', ['F', 'startsF', 'endsF', 'axesF'], ['G']),\n", - " # Will not be removed because of endsG appearing in graph inputs (can be overriden).\n", + " # Will not be removed because of endsG appearing in graph inputs (can be overridden).\n", " helper.make_node('Slice', ['G', 'startsG', 'endsG'], ['H']),\n", " helper.make_node('Max', ['H'], ['I']),\n", " # Will not be removed because node output participates in graph output.\n", diff --git a/onnxruntime/test/testdata/transform/model_parallel/self_attention_megatron_basic_test.py b/onnxruntime/test/testdata/transform/model_parallel/self_attention_megatron_basic_test.py index c57024538f5b2..306ad7d37403a 100644 --- a/onnxruntime/test/testdata/transform/model_parallel/self_attention_megatron_basic_test.py +++ b/onnxruntime/test/testdata/transform/model_parallel/self_attention_megatron_basic_test.py @@ -7,7 +7,7 @@ hidden_per_attention = 2 # Self-attention. -# Handle self-attension. +# Handle self-attention. # MatMul->Add->Split->Reshape->Transpose->MatMul->Div->Mul->Sub->Softmax->Dropout->MatMul->Transpose->Reshape->MatMul->Add # |->Reshape->Transpose->| | # |->Reshape->Transpose------------------------------------------>| diff --git a/onnxruntime/test/testdata/transpose_optimizer_shared_initializers.py b/onnxruntime/test/testdata/transpose_optimizer_shared_initializers.py index d710c796fb0ad..293c5aafe7f0c 100644 --- a/onnxruntime/test/testdata/transpose_optimizer_shared_initializers.py +++ b/onnxruntime/test/testdata/transpose_optimizer_shared_initializers.py @@ -59,7 +59,7 @@ def create_model_with_Where(): # noqa 'Where' is the operator name initializer and other usage. We need to use Where as we require more than 2 inputs. 
The `condition` input will be having a Transpose pushed through it will have a negative cost. The `X` input will have a positive cost which cancels out the negative value. - The `Y` input will be a shared initializer that is braodcast. If we don't find the Transpose to make the cost of it + The `Y` input will be a shared initializer that is broadcast. If we don't find the Transpose to make the cost of it negative we will not push the Transpose though. If we only have 2 inputs, the broadcast initializer will always cost less due to its smaller rank, meaning we don't diff --git a/onnxruntime/wasm/api.h b/onnxruntime/wasm/api.h index 2cd1515d191c8..0730559c4375b 100644 --- a/onnxruntime/wasm/api.h +++ b/onnxruntime/wasm/api.h @@ -3,7 +3,7 @@ // NOTE: This file contains declarations of exported functions as WebAssembly API. // Unlike a normal C-API, the purpose of this API is to make emcc to generate correct exports for the WebAssembly. The -// macro "EMSCRIPTEN_KEEPALIVE" helps the compiler to mark the function as an exported funtion of the WebAssembly +// macro "EMSCRIPTEN_KEEPALIVE" helps the compiler to mark the function as an exported function of the WebAssembly // module. Users are expected to consume those functions from JavaScript side. #pragma once diff --git a/orttraining/orttraining/core/framework/adasum/adasum_interface.h b/orttraining/orttraining/core/framework/adasum/adasum_interface.h index e872da78fdcf5..d7dc62336421c 100644 --- a/orttraining/orttraining/core/framework/adasum/adasum_interface.h +++ b/orttraining/orttraining/core/framework/adasum/adasum_interface.h @@ -138,7 +138,7 @@ class AdasumInterface { // first n-1 levels are skipped. This is useful when the // communication inside the node is implemented using another // reduce-scatter algorithm, e.g. the one in NCCL, which may be - // desireable on some hardware configurations. When + // desirable on some hardware configurations. When // start_level>1, tensor_counts must be set according to the // slices owned by this rank. // communicator: the communicator to reduce with. diff --git a/orttraining/orttraining/core/framework/ortmodule_graph_builder.cc b/orttraining/orttraining/core/framework/ortmodule_graph_builder.cc index e01456ee3d769..593a8be399bd6 100644 --- a/orttraining/orttraining/core/framework/ortmodule_graph_builder.cc +++ b/orttraining/orttraining/core/framework/ortmodule_graph_builder.cc @@ -223,7 +223,7 @@ void OrtModuleGraphBuilder::GetFrontierTensors() { for (const auto& param : graph_info_.initializer_names_to_train) { std::vector consumer_nodes = graph.GetConsumerNodes(param); // Initial support is limited to caching Cast output. This can - // be extended to accomodate more ops whose result depends only + // be extended to accommodate more ops whose result depends only // on the weight tensor which is a WIP. for (const Node* node : consumer_nodes) { if (node != nullptr && node->OpType() == "Cast") { diff --git a/orttraining/orttraining/core/framework/pipeline.cc b/orttraining/orttraining/core/framework/pipeline.cc index 3b0a63bb2a71a..3614637ca0987 100644 --- a/orttraining/orttraining/core/framework/pipeline.cc +++ b/orttraining/orttraining/core/framework/pipeline.cc @@ -193,7 +193,7 @@ std::vector PipelineScheduler::FindForwardComputeTime(const std::vector 0 && t <= forward_time.at(s - 1)) { - // Foward of the s-th stage must happen after forward of (s-1)-th stage. + // Forward of the s-th stage must happen after forward of (s-1)-th stage. // Note that forward_time[s] is the time slot of the s-th stage. 
continue; } diff --git a/orttraining/orttraining/core/graph/mixed_precision_transformer.cc b/orttraining/orttraining/core/graph/mixed_precision_transformer.cc index 1bed983cde64d..a4143e7c817fd 100644 --- a/orttraining/orttraining/core/graph/mixed_precision_transformer.cc +++ b/orttraining/orttraining/core/graph/mixed_precision_transformer.cc @@ -46,7 +46,7 @@ static const std::unordered_map> stage1_fp32_node_ }; // Currently the list here is same as stage1 above due to empty FP32_Nodes. -// It's possibile we will have more FP32 nodes added, this map will also be extended. +// It's possible we will have more FP32 nodes added, this map will also be extended. static const std::unordered_map> stage2_fp32_node_args = { {"Dropout", {1}}, {"DropoutGrad", {2}}, diff --git a/orttraining/orttraining/core/graph/optimizer_graph_builder.h b/orttraining/orttraining/core/graph/optimizer_graph_builder.h index b79bde28c0d9c..d33902379cb5e 100644 --- a/orttraining/orttraining/core/graph/optimizer_graph_builder.h +++ b/orttraining/orttraining/core/graph/optimizer_graph_builder.h @@ -125,7 +125,7 @@ class OptimizerGraphBuilder { GraphAugmenter::GraphDefs& graph_defs, std::unordered_map>& weight_to_opt_mapping); - // This function can be overriden by child classes to have different logic + // This function can be overridden by child classes to have different logic // for building optimizers. virtual Status BuildOptimizerNode( const std::unique_ptr& opt_builder, diff --git a/orttraining/orttraining/core/graph/pipeline_transformer.cc b/orttraining/orttraining/core/graph/pipeline_transformer.cc index a58cca0acd014..f989d53aa85d5 100644 --- a/orttraining/orttraining/core/graph/pipeline_transformer.cc +++ b/orttraining/orttraining/core/graph/pipeline_transformer.cc @@ -446,7 +446,7 @@ void FindPipelineLandmarks( // // The input graph is a pipeline's stage, which contains some Send's and Recv's. // -// For diferent pipeline stages, they have different communication patterns as +// For different pipeline stages, they have different communication patterns as // shown below. // // 1. First stage: @@ -1615,7 +1615,7 @@ Status ApplyPipelinePartitionToMainGraph(Graph& graph, send_nodes, recv_nodes, stage_to_rank)); - // Take care of weights that are shared accross stages. + // Take care of weights that are shared across stages. 
ORT_RETURN_IF_ERROR(HandleSharedInitializer(graph, send_nodes, recv_nodes)); std::set visited_outputs; diff --git a/orttraining/orttraining/core/graph/training_op_defs.cc b/orttraining/orttraining/core/graph/training_op_defs.cc index 20122d378a246..2a8d2de982e79 100644 --- a/orttraining/orttraining/core/graph/training_op_defs.cc +++ b/orttraining/orttraining/core/graph/training_op_defs.cc @@ -1737,7 +1737,7 @@ void RegisterTrainingOpSchemas() { propagateShapeAndTypeFromFirstInput(ctx); }); - // TODO: Depreacate this schema when training support is udpated to opset-12 + // TODO: Depreacate this schema when training support is updated to opset-12 ONNX_CONTRIB_OPERATOR_SCHEMA(GatherND) .SetDomain(kOnnxDomain) .SinceVersion(1) @@ -1820,7 +1820,7 @@ Example 4: .Input(0, "shape", "The shape of source data input of GatherND.", "T1") .Input(1, "indices", "Tensor of rank q >= 1.", "Tind") .Input(2, "update", "The gradient of the output.", "T") - .Output(0, "output", "Tensor graident of the input.", "T") + .Output(0, "output", "Tensor gradient of the input.", "T") .TypeConstraint( "T", {"tensor(float16)", "tensor(float)", "tensor(double)", "tensor(bfloat16)"}, @@ -2493,7 +2493,7 @@ Example 4: .SetSupportLevel(OpSchema::SupportType::EXPERIMENTAL) .SetDoc( "Returns the reduction axes for computing gradients of s0 op s1 with broadcast." - "The ouput axes are deterministic from last to first. " + "The output axes are deterministic from last to first. " "Output is an empty vector when no reduction is necessary for the corresponding input.") .Input(0, "a_shape", "The 1st input shape as Tensor.", "T") .Input(1, "b_shape", "The 2nd input shape as Tensor.", "T") @@ -2530,7 +2530,7 @@ Example 4: ONNX_CONTRIB_OPERATOR_SCHEMA(GistBinarizeDecoder) .SetDomain(kMSDomain) .SinceVersion(1) - .Input(0, "X", "compresssed input", "T1") + .Input(0, "X", "compressed input", "T1") .Output(0, "Y", "uncompressed output", "T") .Attr("to", "The data type to which the elements of the input tensor are cast. " @@ -2568,7 +2568,7 @@ Example 4: ONNX_CONTRIB_OPERATOR_SCHEMA(GistPack1Decoder) .SetDomain(kMSDomain) .SinceVersion(1) - .Input(0, "X", "1 bit compresssed input", "T1") + .Input(0, "X", "1 bit compressed input", "T1") .Output(0, "Y", "uncompressed output", "T") .Attr("to", "The data type to which the elements of the input tensor are cast. " @@ -2606,7 +2606,7 @@ Example 4: ONNX_CONTRIB_OPERATOR_SCHEMA(GistPack8Decoder) .SetDomain(kMSDomain) .SinceVersion(1) - .Input(0, "X", "compresssed input", "T1") + .Input(0, "X", "compressed input", "T1") .Output(0, "Y", "uncompressed output", "T") .Attr("to", "The data type to which the elements of the input tensor are cast. " @@ -2682,7 +2682,7 @@ Example 4: ONNX_CONTRIB_OPERATOR_SCHEMA(GistPackMsfp15Decoder) .SetDomain(kMSDomain) .SinceVersion(1) - .Input(0, "X", "compresssed input", "T1") + .Input(0, "X", "compressed input", "T1") .Output(0, "Y", "uncompressed output", "T") .Attr("to", "The data type to which the elements of the input tensor are cast. " @@ -3191,7 +3191,7 @@ Return true if all elements are true and false otherwise. "Strictly must be one of the types from DataType enum in TensorProto", AttributeProto::INT) .Attr("fuse_outputs", - "If true, fuse all outputs into one continous buffer.", + "If true, fuse all outputs into one continuous buffer.", AttributeProto::INT, static_cast(0)) .TypeConstraint( @@ -3240,7 +3240,7 @@ Return true if all elements are true and false otherwise. 
.Input(1, "scale", "Scale scalar tensor.", "ScaleT") .Output(0, "output", "The scaled output tensor.", "T") .Attr("scale_down", - "If true, the output tensor is input tensor devided by scale, " + "If true, the output tensor is input tensor divided by scale, " "otherwise, it's input tensor multiplied by scale. " "The default value is false.", AttributeProto::INT, @@ -3636,7 +3636,7 @@ Return true if all elements are true and false otherwise. fail_shape_inference("RecordEvent must have at least (num_outputs + 1) inputs."); // note: if num_input > num_output + 1, - // the additional inputs (idx >= num_ouput + 1) are regarded as dependencies + // the additional inputs (idx >= num_output + 1) are regarded as dependencies // which are only used for maintain topological order for (size_t i = 0; i < ctx.getNumOutputs(); ++i) { propagateElemTypeFromInputToOutput(ctx, i + 1, i); @@ -3689,7 +3689,7 @@ Return true if all elements are true and false otherwise. fail_shape_inference("WaitEvent must have at least 1 output."); // note: if num_input > num_output + 1, - // the additional inputs (idx >= num_ouput + 1) are regarded as dependencies + // the additional inputs (idx >= num_output + 1) are regarded as dependencies // which are only used for maintain topological order for (size_t i = 0; i < ctx.getNumOutputs(); ++i) { propagateElemTypeFromInputToOutput(ctx, i + 1, i); diff --git a/orttraining/orttraining/core/optimizer/graph_transformer_config.h b/orttraining/orttraining/core/optimizer/graph_transformer_config.h index c496e36689de1..a2b44689f9ef0 100644 --- a/orttraining/orttraining/core/optimizer/graph_transformer_config.h +++ b/orttraining/orttraining/core/optimizer/graph_transformer_config.h @@ -17,7 +17,7 @@ struct TrainingGraphTransformerConfiguration : public GraphTransformerConfigurat bool attn_dropout_recompute{false}; // Enable recompute of Gelu activation output to save memory bool gelu_recompute{false}; - // Enable recompute of transformer layer ouput to save memory + // Enable recompute of transformer layer output to save memory bool transformer_layer_recompute{false}; // Number of layers to apply recompute int number_recompute_layers{0}; diff --git a/orttraining/orttraining/core/session/training_session.cc b/orttraining/orttraining/core/session/training_session.cc index 1bf08fa55ca88..87a7cbc0375a4 100644 --- a/orttraining/orttraining/core/session/training_session.cc +++ b/orttraining/orttraining/core/session/training_session.cc @@ -1425,7 +1425,7 @@ std::unordered_set TrainingSession::GetTrainableModelInitializers( #if defined(USE_CUDA) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) // Create NCCL's communication plan. In runtime, we will provide details such -// as pointer to sent/recieved data and the size of the data in byte. See how +// as pointer to sent/received data and the size of the data in byte. See how // Send and Recv call SubmitSendAndWait and SubmitRecvAndWait, respectively. void PipelineTrainingSession::LaunchNcclService(const int pipeline_stage_id) { ORT_ENFORCE(pipeline_stage_id >= 0, "Pipeline stage ID cannot be negative."); @@ -1444,7 +1444,7 @@ void PipelineTrainingSession::LaunchNcclService(const int pipeline_stage_id) { // In this time slot, stage "pipeline_stage_id" sendss data to "task.peer_rank". nccl_service.PlanSend(task.peer_rank); } else if (task.type == pipeline::PipelineTask::Type::Recv) { - // In this time slot, stage "pipeline_stage_id" recieves data from "task.peer_rank". 
+ // In this time slot, stage "pipeline_stage_id" receives data from "task.peer_rank". nccl_service.PlanRecv(task.peer_rank); } } diff --git a/orttraining/orttraining/models/bert/main.cc b/orttraining/orttraining/models/bert/main.cc index 33d0d0346a48a..22cdd9351a206 100644 --- a/orttraining/orttraining/models/bert/main.cc +++ b/orttraining/orttraining/models/bert/main.cc @@ -204,12 +204,14 @@ Status ParseArguments(int argc, char* argv[], BertParameters& params, OrtParamet ("data_parallel_size", "Data parallel group size.", cxxopts::value()->default_value("1")) ("horizontal_parallel_size", "Horizontal model parallel group size.", cxxopts::value()->default_value("1")) ("pipeline_parallel_size", "Number of pipeline stages.", cxxopts::value()->default_value("1")) - ("pipeline_stage_paths", "Specify the forward ONNX files for pipeline evaluation.", cxxopts::value>()->default_value("")) - ("cut_group_info", "Specify the cutting info for graph partition (pipeline only). An example of a cut_group_info of " - "size two is: 1393:407-1463/1585/1707,2369:407-2439/2561/2683. Here, the cut info is split by ',', with the first " - "cut_info equal to 1393:407-1463/1585/1707, and second cut_info equal to 2369:407-2439/2561/2683. Each CutEdge is " - "seperated by ':'. If consumer nodes need to be specified, specify them after producer node with a '-' delimiter and " - "separate each consumer node with a '/'. ", cxxopts::value>()->default_value("")) + ("pipeline_stage_paths", "Specify the forward ONNX files for pipeline evaluation.", + cxxopts::value>()->default_value("")) + ("cut_group_info", "Specify the cutting info for graph partition (pipeline only). An example of a cut_group_info " + "of size two is: 1393:407-1463/1585/1707,2369:407-2439/2561/2683. Here, the cut info is split by ',', with the " + "first cut_info equal to 1393:407-1463/1585/1707, and second cut_info equal to 2369:407-2439/2561/2683. Each " + "CutEdge is separated by ':'. If consumer nodes need to be specified, specify them after producer node with a " + "'-' delimiter and separate each consumer node with a '/'. ", + cxxopts::value>()->default_value("")) ("enable_grad_norm_clip", "Specify whether to enable gradient clipping for optimizers.", cxxopts::value()->default_value("true")) ("enable_gelu_approximation", "Specify whether to enable GELU approximation.", @@ -572,7 +574,7 @@ float GetLossValue(const Tensor& loss_tensor) { // use this table mapping to define what to be stored in mapped_dimensions, and ultimately in json structure // Be mindful on the position, if it's invalid or out of bound, the property population process will be -// either incorrect or aborted. Also make sure to substract the index position by 1 to get valid correspondent value +// either incorrect or aborted. 
Also make sure to subtract the index position by 1 to get valid correspondent value // namely, in the graph, sequence is at position 1, but in initial tensor shape vector loaded from training data is at position 0, // batch is not part of the initial tensor shape vector till later // see GetTensorDimensionsFromInputs() in training_util.h and training_runner.cc for more details diff --git a/orttraining/orttraining/models/mnist/main.cc b/orttraining/orttraining/models/mnist/main.cc index a2fc6909a86a6..8aaa6b1ebf7f2 100644 --- a/orttraining/orttraining/models/mnist/main.cc +++ b/orttraining/orttraining/models/mnist/main.cc @@ -51,7 +51,8 @@ Status ParseArguments(int argc, char* argv[], MnistParameters& params) { cxxopts::value()->default_value("mnist_data")) ("log_dir", "The directory to write tensorboard events.", cxxopts::value()->default_value("")) - ("use_profiler", "Collect runtime profile data during this training run.", cxxopts::value()->default_value("false")) + ("use_profiler", "Collect runtime profile data during this training run.", + cxxopts::value()->default_value("false")) ("use_gist", "Whether to use GIST encoding/decoding.") ("gist_op", "Opearator type(s) to which GIST is applied.", cxxopts::value()->default_value("0")) ("gist_compr", "Compression type used for GIST", cxxopts::value()->default_value("GistPack8")) @@ -66,11 +67,12 @@ Status ParseArguments(int argc, char* argv[], MnistParameters& params) { ("data_parallel_size", "Data parallel group size.", cxxopts::value()->default_value("1")) ("horizontal_parallel_size", "Horizontal model parallel group size.", cxxopts::value()->default_value("1")) ("pipeline_parallel_size", "Number of pipeline stages.", cxxopts::value()->default_value("1")) - ("cut_group_info", "Specify the cutting info for graph partition (pipeline only). An example of a cut_group_info of " - "size two is: 1393:407-1463/1585/1707,2369:407-2439/2561/2683. Here, the cut info is split by ',', with the first " - "cut_info equal to 1393:407-1463/1585/1707, and second cut_info equal to 2369:407-2439/2561/2683. Each CutEdge is " - "seperated by ':'. If consumer nodes need to be specified, specify them after producer node with a '-' delimiter and " - "separate each consumer node with a '/'. ", cxxopts::value>()->default_value("")) + ("cut_group_info", "Specify the cutting info for graph partition (pipeline only). An example of a cut_group_info " + "of size two is: 1393:407-1463/1585/1707,2369:407-2439/2561/2683. Here, the cut info is split by ',', with the " + "first cut_info equal to 1393:407-1463/1585/1707, and second cut_info equal to 2369:407-2439/2561/2683. Each " + "CutEdge is separated by ':'. If consumer nodes need to be specified, specify them after producer node with a " + "'-' delimiter and separate each consumer node with a '/'. 
", + cxxopts::value>()->default_value("")) ("evaluation_period", "How many training steps to make before making an evaluation.", cxxopts::value()->default_value("1")); // clang-format on @@ -301,7 +303,7 @@ int main(int argc, char* args[]) { } if (testData->NumSamples() == 0) { - printf("Warning: No data loaded - run cancelled.\n"); + printf("Warning: No data loaded - run canceled.\n"); return -1; } diff --git a/orttraining/orttraining/models/runner/training_runner.cc b/orttraining/orttraining/models/runner/training_runner.cc index 6421f7c81f7fb..dae6f613f4329 100644 --- a/orttraining/orttraining/models/runner/training_runner.cc +++ b/orttraining/orttraining/models/runner/training_runner.cc @@ -1188,7 +1188,7 @@ Status TrainingRunner::Evaluate(TrainingSession& session, IDataLoader& data_load fetch_names, &fetches)); - // Assume that user-specified fetches are avaliable only on the last pipeline stage. + // Assume that user-specified fetches are available only on the last pipeline stage. // When there is no pipeline, all pipeline_context_.pipeline_stage_id should be 0 and // params_.pipeline_parallel_size is 1. Thus, the following condition is always true if there // is no pipeline. diff --git a/orttraining/orttraining/python/training/onnxblock/blocks.py b/orttraining/orttraining/python/training/onnxblock/blocks.py index 80f07c3738a7e..ed68171cc6f9c 100644 --- a/orttraining/orttraining/python/training/onnxblock/blocks.py +++ b/orttraining/orttraining/python/training/onnxblock/blocks.py @@ -403,12 +403,12 @@ def __init__(self, like: str): def build(self, input_name: Optional[str] = None): cloned_input = None with contextlib.suppress(LookupError): - # Supress LookupError because we want to try to get the input from the output if it's not found in the inputs + # Suppress LookupError because we want to try to get the input from the output if it's not found in the inputs cloned_input = copy.deepcopy(_graph_utils.get_input_from_input_name(self.base, self._like)) if cloned_input is None: with contextlib.suppress(LookupError): - # Supress LookupError because we deal with the case where no input or output was found later. + # Suppress LookupError because we deal with the case where no input or output was found later. 
cloned_input = copy.deepcopy(_graph_utils.get_output_from_output_name(self.base, self._like)) if cloned_input is None: diff --git a/orttraining/orttraining/python/training/ortmodule/__init__.py b/orttraining/orttraining/python/training/ortmodule/__init__.py index 20e3493395b3d..4bc470c633437 100644 --- a/orttraining/orttraining/python/training/ortmodule/__init__.py +++ b/orttraining/orttraining/python/training/ortmodule/__init__.py @@ -194,7 +194,7 @@ def export_context(): ), ) -# Initalized ORT's random seed with pytorch's initial seed +# Initialized ORT's random seed with pytorch's initial seed # in case user has set pytorch seed before importing ORTModule set_seed(torch.initial_seed() % sys.maxsize) diff --git a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/fused_ops/fused_ops_frontend.cpp b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/fused_ops/fused_ops_frontend.cpp index 19ba6b17aba02..4e9db732b5385 100644 --- a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/fused_ops/fused_ops_frontend.cpp +++ b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/fused_ops/fused_ops_frontend.cpp @@ -10,7 +10,7 @@ const size_t EMIT_NUM = 4; -// This will avoid the copies when doing implict Python list <==> C++ std::vector<> conversion. +// This will avoid the copies when doing implicit Python list <==> C++ std::vector<> conversion. PYBIND11_MAKE_OPAQUE(std::vector); // This function is adapted from microsoft/DeepSpeed fused_adam_frontend.cpp @@ -150,7 +150,7 @@ void unscale_fp16_grads_into_fp32_grads(std::vector& all_fp16_params if (idx_to_fp32_from_fp16_params.size() > 0) { auto mem_buffer = MemoryBuffer(memory_buffer_size, idx_to_fp32_from_fp16_params.begin()->second); - const size_t emit_threshhold = memory_buffer_size / EMIT_NUM; + const size_t emit_threshold = memory_buffer_size / EMIT_NUM; size_t acc_size = 0; std::vector partial_new_fp32_grads; @@ -167,7 +167,7 @@ void unscale_fp16_grads_into_fp32_grads(std::vector& all_fp16_params partial_new_fp32_grads.emplace_back(idx_to_fp32_from_fp16_params[idx].grad()); partial_fp16_grads_needing_unscale.emplace_back(fp16_grads_needing_unscale[fp32_from_fp16_param_idx]); - if (acc_size > emit_threshhold || fp32_from_fp16_param_idx == idx_to_fp32_from_fp16_params.size() - 1) { + if (acc_size > emit_threshold || fp32_from_fp16_param_idx == idx_to_fp32_from_fp16_params.size() - 1) { if (partial_fp16_grads_needing_unscale.size() > 0) { std::vector> tensor_lists; tensor_lists.emplace_back(partial_fp16_grads_needing_unscale); diff --git a/orttraining/orttraining/test/distributed/partition_utils.h b/orttraining/orttraining/test/distributed/partition_utils.h index 1369b493655b6..c22d0a3eb2f93 100644 --- a/orttraining/orttraining/test/distributed/partition_utils.h +++ b/orttraining/orttraining/test/distributed/partition_utils.h @@ -338,7 +338,7 @@ common::Status SplitGraph(Graph& graph, // but nodeA, nodeB belong to parition0, nodeC belongs to parition1, and nodeD belongs to parition2. // This means we need to cut edge nodeA->nodeC for the first partition and nodeA->nodeD for the second partition. // - // During the first cut, we identify the edge nodeA->nodeC, for this edge, based on the origional node_arg, + // During the first cut, we identify the edge nodeA->nodeC, for this edge, based on the original node_arg, // we create a new node_arg, called updated_node_arg. 
The inserted send node will take the original node_arg // as input and the inserted recv node will take the updated_node_arg as the output. // And we update updated_node_args with updated_node_args[original_node_arg] = updated_node_arg @@ -414,7 +414,7 @@ common::Status SplitGraph(Graph& graph, auto producer_node = graph.GetMutableProducerNode(id.node_arg_name); if (!producer_node) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Cannot find producer node of node_arg with name: ", id.node_arg_name, - ". Wrong cutting infomation."); + ". Wrong cutting information."); } // once we find out the producer node for id.node_arg_name, find which output index that leads @@ -606,7 +606,7 @@ Status CutBasedApplyPipelinePartitionToMainGraph( ORT_RETURN_IF_ERROR(GenerateSubgraph(graph, recv_nodes.back())); } - // Post check to ensure the curent partition is correct and matches with Send/Recv nodes inserted during split. + // Post check to ensure the current partition is correct and matches with Send/Recv nodes inserted during split. Node* send_node{nullptr}; Node* recv_node{nullptr}; for (auto& node : graph.Nodes()) { diff --git a/orttraining/orttraining/test/graph/bert_toy_fetches.h b/orttraining/orttraining/test/graph/bert_toy_fetches.h index 5bfc5da742cd4..71465c142f127 100644 --- a/orttraining/orttraining/test/graph/bert_toy_fetches.h +++ b/orttraining/orttraining/test/graph/bert_toy_fetches.h @@ -8,7 +8,7 @@ namespace onnxruntime { namespace test { -// Avoid this arrary being initialized on stack. +// Avoid this array being initialized on stack. // Limit the number of arguments to compile with clang. constexpr std::array bert_embeddings_position_embeddings_weight_grad = {-0.009673337, 0.015859816, -0.0060598925, 0.0061725015, 0.0686829, 0.031034196, -0.041214723, 0.04238321, -0.045230567, -0.03455956, 0.037526406, 0.019020742, -0.008562718, -0.030574083, -0.012788322, -0.0008712788, -0.041134313, 0.027024698, -0.012437805, 0.059991226, -0.026614683, -0.06257652, -0.020100333, -0.03510955, 0.05741506, 0.068152145, -0.065179504, 0.038520053, 0.019393224, 0.03954512, 0.006873767, -0.084907904, -0.0050477944, 0.0012708178, 0.0030560307, -0.032130327, -0.0144646885, -0.016298112, -0.042901997, 0.07588, 0.01613088, -0.018301323, -0.010611727, 0.005544794, -0.014955264, -0.016850606, 0.022336477, -0.0030460241, -0.014482946, 0.00859436, -0.014712406, 0.03867981, -0.022954227, 0.015440098, -0.005059921, 0.0035975706, 0.01880927, 0.062380753, 0.02279159, 0.0036130734, 0.029864375, -0.022658946, -0.0069784625, -0.06653513, -0.01116233, 0.021000436, -0.028701056, -0.024398895, 0.011476517, 0.032129377, -0.04200533, 0.05585559, 0.027091827, -0.03708192, -0.029153917, 0.014818583, -0.03863439, -0.03299714, 0.026062695, 0.027578063, -0.033457935, 0.023994414, -0.00042527216, 0.020991987, -0.043016825, 0.03330429, -0.0051043453, -0.061040144, 0.02476727, 0.07664442, -0.0109203905, 0.046167813, 0.05265824, -0.009806289, -0.032828216, -0.053807136, -0.018357445, -0.0060726395, 0.012883636, -0.03604291, -0.020931121, -0.017016709, -0.06521842, 0.09689566, 0.010757825, -0.014480298, -0.011673617, 0.014982184, -0.011422393, -0.015741495, 0.021494215, -0.013776923, -0.017716365, 0.02294489, -0.00073889084, 0.036582764, -0.013822639, 0.0075510093, -0.015371518, 0.012141101, 0.009292599, 0.0632079, 0.023068016, -0.0034772623, 0.033849746, -0.009428004, -0.0021826755, -0.07218023, -0.00040298235, 0.008162888, -0.009084097, -0.025772562, 0.01697198, 0.0096272295, -0.05384024, 0.054271728, 0.0061686123, -0.012313863, 
-0.010857888, 0.011092398, -0.017863888, -0.023245087, 0.0147367595, 0.0022649313, -0.0307159, 0.004318953, 0.0035282676, 0.026500994, -0.029873395, 0.0049419748, -0.007642911, -0.02280794, 0.016169535, 0.059451614, 0.015289053, 0.021232026, 0.042667653, -0.0034166733, -0.014750072, -0.05480911, 0.0012827339, -0.00061177486, 0.008855328, -0.014449824, -0.008173137, -0.033359475, -0.06602954, 0.074186556, -0.0031156093, 0.0009635263, -0.0151721025, 0.007254398, 0.015830085, 0.009578684, -0.0053947777, -0.020233134, -0.016644966, 0.002484738, -0.019542504, 0.026349604, -0.017563643, -0.005398605, 0.0013201954, 0.034780584, 0.007976923, 0.054721735, 0.015226502, -0.001414868, 0.030154174, 0.011785319, 0.0033271122, -0.07897424, 0.01796715, -0.00018319988, 0.006205301, -0.019297902, 0.03912447, 0.0022418862, -0.048669476, 0.031012537, -0.0155599145, -0.01757, -0.0011392199, 0.016611777, 0.008555129, -0.017760677, -0.02604977, 0.014489464, -0.041648414, -0.017570462, 0.005586198, 0.03271513, -0.04649407, -0.038035538, 2.2510882e-05, -0.006990753, 0.043797504, 0.0970251, 0.0041649155, 0.020328937, 0.058848612, -0.008414367, -0.026458042, -0.06685481}; static std::unordered_map> BERT_TOY_FETCHES = { diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py index 541473b1561db..6f5b03685e801 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py @@ -264,7 +264,7 @@ class UnusedBeginParameterNet(torch.nn.Module): def __init__(self, input_size, hidden_size1, hidden_size2, num_classes): super().__init__() - # fc1 is an unused initializer (which is in the begining of initializer list) + # fc1 is an unused initializer (which is in the beginning of initializer list) # which will be dropped after export self.fc1 = torch.nn.Linear(input_size, hidden_size1) self.relu = torch.nn.ReLU() diff --git a/orttraining/orttraining/test/python/qat_poc_example/qat.py b/orttraining/orttraining/test/python/qat_poc_example/qat.py index dcc9e116fda7d..4378118b71b9f 100644 --- a/orttraining/orttraining/test/python/qat_poc_example/qat.py +++ b/orttraining/orttraining/test/python/qat_poc_example/qat.py @@ -24,7 +24,7 @@ onnx.save(onnx_model, os.path.join(model_dir, f"{model_name}.onnx")) logging.info( - "Begining Quantization process for model saved at: %s", + "Beginning Quantization process for model saved at: %s", os.path.join(model_dir, f"{model_name}.onnx"), ) logging.info("Skipping model preprocessing step. As QAT requires a un preprocessed model.") diff --git a/orttraining/orttraining/test/training_ops/cuda/cross_entropy_test.cc b/orttraining/orttraining/test/training_ops/cuda/cross_entropy_test.cc index d36f9b307ec70..61bd9c19f3541 100644 --- a/orttraining/orttraining/test/training_ops/cuda/cross_entropy_test.cc +++ b/orttraining/orttraining/test/training_ops/cuda/cross_entropy_test.cc @@ -1036,7 +1036,7 @@ TEST(CrossEntropyTest, SoftmaxCrossEntropyLossInternalGrad_TinySizeTensorFloatIn std::vector index_dims{8}; std::vector weight_dims{2}; std::vector dX_dims{8, 2}; - // Set run_cpu_baseline_seperately = True because CPU kernel did not support multiple type support + // Set run_cpu_baseline_separately = True because CPU kernel did not support multiple type support // for input and output. 
TestSoftmaxCrossEntropyLossInternalGrad(dY_dims, log_prob_dims, index_dims, weight_dims, dX_dims, "mean", -1, 5e-2, false /*has_bias*/); diff --git a/orttraining/orttraining/training_api/optimizer.cc b/orttraining/orttraining/training_api/optimizer.cc index 4647f890729f4..e42752b3a2d55 100644 --- a/orttraining/orttraining/training_api/optimizer.cc +++ b/orttraining/orttraining/training_api/optimizer.cc @@ -205,7 +205,7 @@ Optimizer::Optimizer(const ModelIdentifiers& model_identifiers, // by invoking ConstructOptimizerStateAndInputs(). ORT_THROW_IF_ERROR(ConstructOptimizerStateAndInputs()); } else { - delay_optimizer_state_contruction_ = true; + delay_optimizer_state_construction_ = true; } } else { ORT_THROW_IF_ERROR(LoadStateDict(state_->optimizer_checkpoint_state)); @@ -256,7 +256,7 @@ void Optimizer::Initialize(const ModelIdentifiers& model_identifiers, } Status Optimizer::Step() { - if (delay_optimizer_state_contruction_) { + if (delay_optimizer_state_construction_) { ORT_RETURN_IF_ERROR(ConstructOptimizerStateAndInputs()); } @@ -343,7 +343,7 @@ Status Optimizer::ConstructOptimizerStateAndInputs() { ORT_RETURN_IF_ERROR(GenerateMomentumNamedStates(state_->optimizer_checkpoint_state)); ORT_RETURN_IF_ERROR(ConstructInputs()); - delay_optimizer_state_contruction_ = false; + delay_optimizer_state_construction_ = false; return Status::OK(); } diff --git a/orttraining/orttraining/training_api/optimizer.h b/orttraining/orttraining/training_api/optimizer.h index 5b908acf7c9e3..a0717563a8bd0 100644 --- a/orttraining/orttraining/training_api/optimizer.h +++ b/orttraining/orttraining/training_api/optimizer.h @@ -166,7 +166,7 @@ struct Optimizer { int32_t group_count_{0}; - bool delay_optimizer_state_contruction_{false}; + bool delay_optimizer_state_construction_{false}; }; } // namespace api diff --git a/orttraining/orttraining/training_ops/cpu/activation/activations_grad.cc b/orttraining/orttraining/training_ops/cpu/activation/activations_grad.cc index d3f2f9c7a8767..40497467a31a5 100644 --- a/orttraining/orttraining/training_ops/cpu/activation/activations_grad.cc +++ b/orttraining/orttraining/training_ops/cpu/activation/activations_grad.cc @@ -82,7 +82,7 @@ Status ComputeGeluGradDX(gsl::span dY, gsl::span X, gsl::span< static constexpr T kBeta = static_cast(kGamma * kAlpha * 3.0f); // - // Commented out EIGEN implentation due to EIGEN bug. + // Commented out EIGEN implementation due to EIGEN bug. // On Windows Release build with GPU enabled, kAlpha * EIGEN_X below would produce pure 0 // result, even though neither kAlpha nor EIGEN_X is zero. // Given that CPU kernel is mostly for conformance check, where performance is not of high diff --git a/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.cu b/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.cu index 56520337fe683..a468c756ef74d 100644 --- a/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.cu @@ -10,7 +10,7 @@ namespace onnxruntime { namespace cuda { -// for now this operator classes are no different than a funciton. +// for now this operator classes are no different than a function. // Eventually once multiple binary gradient ops are needed, we will pass // its instance from API instead of direct function call. 
template diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc index 501c48e687e98..1152c98447444 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc +++ b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc @@ -582,7 +582,7 @@ Status LambOptimizer::Compute // Allocate a buffer in byte for reduction API calls. size_t rbs = compute_reduction_buffer_size(max_tensor_size); - // Enlarge reduction buffer to accomodate multi-tensor reduction kernel as well + // Enlarge reduction buffer to accommodate multi-tensor reduction kernel as well constexpr int tensor_group_size = 4; // w, d, w_norm, d_norm constexpr int max_blocks = ChunkGroup::max_block_count; constexpr size_t multitensor_block_reduce_buffer_size = 2 * max_blocks * sizeof(CudaT2); diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/lamb_impl.cu b/orttraining/orttraining/training_ops/cuda/optimizer/lamb_impl.cu index fd55f7c30ff75..f59f5f7dc9c33 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/lamb_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/optimizer/lamb_impl.cu @@ -192,7 +192,7 @@ __device__ __forceinline__ void _LambUpdateRule( T2* w_new, T3* g_new, T_MIXED_PRECISION_FP* w_mixed_precision_new) { - // Confidence coefficeint of this update. + // Confidence coefficient of this update. const T2 ratio = (w_norm != T2(0.0f) && r_norm != T2(0.0f)) ? T2(eta) * _Max(T2(ratio_min), _Min(T2(ratio_max), _Sqrt(w_norm / r_norm))) : T2(eta); // Compute delta using the saved update direction. diff --git a/orttraining/tools/scripts/layer_norm_transform.py b/orttraining/tools/scripts/layer_norm_transform.py index b397d1d26a456..bc6fe0eaf8b29 100644 --- a/orttraining/tools/scripts/layer_norm_transform.py +++ b/orttraining/tools/scripts/layer_norm_transform.py @@ -164,7 +164,7 @@ def main(): vocab_size = 30528 # Create a fake data point. - vocab_size = 30528 # It shoudl match the value from BERT config file. + vocab_size = 30528 # It should match the value from BERT config file. input_ids = np.random.randint(low=0, high=vocab_size, size=(batch, sq_length), dtype=np.int64) segment_ids = np.random.randint(low=0, high=2, size=(batch, sq_length), dtype=np.int64) input_mask = np.ones((batch, sq_length), dtype=np.int64) diff --git a/orttraining/tools/scripts/model_transform.py b/orttraining/tools/scripts/model_transform.py index f0cf53990eac3..2fb1936ff2184 100644 --- a/orttraining/tools/scripts/model_transform.py +++ b/orttraining/tools/scripts/model_transform.py @@ -269,7 +269,7 @@ def process_dropout(model): del model.graph.node[d] -# Also need to set following line differently for differnt verison of bert +# Also need to set following line differently for different version of bert # expand_out.name = '412' def add_expand_shape(model): expand_out = model.graph.value_info.add() diff --git a/orttraining/tools/scripts/opset12_model_transform.py b/orttraining/tools/scripts/opset12_model_transform.py index e8c2263a39c32..790bdc34e1ff7 100644 --- a/orttraining/tools/scripts/opset12_model_transform.py +++ b/orttraining/tools/scripts/opset12_model_transform.py @@ -2,7 +2,7 @@ # Licensed under the MIT License. 
# # This converter is an internal util to upgrade existing bert/gpt-2 models, -# which were previously transformed/optimized from orginal model, to Opset 12 +# which were previously transformed/optimized from original model, to Opset 12 # version as well as replacing deprecated node, i.e., TrainableDropout with # the "Dropout" node matching the Opset 12 Spec. Typically, a model to # be run by this scripts would have "_optimized" substring in its model name, diff --git a/rust/onnxruntime-sys/examples/c_api_sample.rs b/rust/onnxruntime-sys/examples/c_api_sample.rs index e8c9ca8f09a5a..3cfb9d76029a0 100644 --- a/rust/onnxruntime-sys/examples/c_api_sample.rs +++ b/rust/onnxruntime-sys/examples/c_api_sample.rs @@ -31,8 +31,8 @@ fn main() { assert_ne!(g_ort, std::ptr::null_mut()); //************************************************************************* - // initialize enviroment...one enviroment per process - // enviroment maintains thread pools and other state info + // initialize environment...one environment per process + // environment maintains thread pools and other state info let mut env_ptr: *mut OrtEnv = std::ptr::null_mut(); let env_name = std::ffi::CString::new("test").unwrap(); let status = unsafe { diff --git a/rust/onnxruntime/src/tensor/ort_output_tensor.rs b/rust/onnxruntime/src/tensor/ort_output_tensor.rs index 006fbdba6cdb8..83663c0d303f8 100644 --- a/rust/onnxruntime/src/tensor/ort_output_tensor.rs +++ b/rust/onnxruntime/src/tensor/ort_output_tensor.rs @@ -70,7 +70,7 @@ impl Drop for OrtOutputTensor { } } -/// An Ouput tensor with the ptr and the item that will copy from the ptr. +/// An Output tensor with the ptr and the item that will copy from the ptr. #[derive(Debug)] pub struct WithOutputTensor<'a, T> { #[allow(dead_code)] diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 54f7b6c3a8fa7..98d9ba22b7190 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -718,7 +718,7 @@ def convert_arg_line_to_args(self, arg_line): # Code coverage parser.add_argument( - "--code_coverage", action="store_true", help="Generate code coverage when targetting Android (only)." + "--code_coverage", action="store_true", help="Generate code coverage when targeting Android (only)." ) # lazy tensor support. 
@@ -2749,7 +2749,7 @@ def main(): cmake_extra_args += ["-D", "BUILD_AS_ARM64X=ARM64EC"] cmake_extra_args += ["-G", args.cmake_generator] # Cannot test on host build machine for cross-compiled - # builds (Override any user-defined behaviour for test if any) + # builds (Override any user-defined behavior for test if any) if args.test: log.warning( "Cannot test on host build machine for cross-compiled " diff --git a/tools/ci_build/github/azure-pipelines/web-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/web-ci-pipeline.yml index 24809ccfdec1f..036becb7df077 100644 --- a/tools/ci_build/github/azure-pipelines/web-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/web-ci-pipeline.yml @@ -35,7 +35,7 @@ parameters: default: 'nightly (@dev)' variables: - # pipeline should define the following varaibles + # pipeline should define the following variables # ExtraBuildArgs # VersionSuffix diff --git a/tools/python/util/android/android.py b/tools/python/util/android/android.py index fd25d8bc147cd..dd2dcce01bf4a 100644 --- a/tools/python/util/android/android.py +++ b/tools/python/util/android/android.py @@ -30,7 +30,7 @@ def filename(name, windows_extension): sdk_root = Path(sdk_root).resolve(strict=True) return SdkToolPaths( - # do not use sdk_root/tools/emulator as that is superceeded by sdk_root/emulator/emulator + # do not use sdk_root/tools/emulator as that is superseded by sdk_root/emulator/emulator emulator=str((sdk_root / "emulator" / filename("emulator", "exe")).resolve(strict=True)), adb=str((sdk_root / "platform-tools" / filename("adb", "exe")).resolve(strict=True)), sdkmanager=str( diff --git a/winml/adapter/winml_adapter_session.cpp b/winml/adapter/winml_adapter_session.cpp index fa91978b564ba..5e27d8fb9a985 100644 --- a/winml/adapter/winml_adapter_session.cpp +++ b/winml/adapter/winml_adapter_session.cpp @@ -310,7 +310,7 @@ ORT_API_STATUS_IMPL( winrt::Windows::Foundation::Collections::IMap override_map = winrt::single_threaded_map(); for (auto freeDimOverride : session_options.free_dimension_overrides) { - if (freeDimOverride.dim_identifer_type == onnxruntime::FreeDimensionOverrideType::Name) { + if (freeDimOverride.dim_identifier_type == onnxruntime::FreeDimensionOverrideType::Name) { override_map.Insert( winrt::to_hstring(freeDimOverride.dim_identifier), static_cast(freeDimOverride.dim_value) ); diff --git a/winml/api/Microsoft.AI.MachineLearning.Experimental.idl b/winml/api/Microsoft.AI.MachineLearning.Experimental.idl index ad39a1ed7e684..3322c76f6eef2 100644 --- a/winml/api/Microsoft.AI.MachineLearning.Experimental.idl +++ b/winml/api/Microsoft.AI.MachineLearning.Experimental.idl @@ -128,7 +128,7 @@ namespace ROOT_NS.AI.MachineLearning.Experimental { Boolean CloseModelOnJoin { get; set; }; //! The JoinedNodePrefix property specifies whether the nodes of the second model should have a specific prefixed in the joined model. - //! Node names must be unique or empty. By enabling this, the engine can specifiy the prefix, or eliminate it entirely in cases + //! Node names must be unique or empty. By enabling this, the engine can specify the prefix, or eliminate it entirely in cases //! where the model is known to contain no duplicate node names. //! The default value for CloseModelOnJoin is a new random GUID. 
String JoinedNodePrefix { get; set; }; diff --git a/winml/api/Windows.AI.MachineLearning.idl b/winml/api/Windows.AI.MachineLearning.idl index 2b55fa8c7a95c..59c58ba80efca 100644 --- a/winml/api/Windows.AI.MachineLearning.idl +++ b/winml/api/Windows.AI.MachineLearning.idl @@ -9,7 +9,7 @@ import "windows.media.idl"; #ifndef WINDOWSAI_RAZZLE_BUILD // Pull in definition for DualApiPartitionAttribute, because the WinML IDL // does not build in the OS Repo, and needs to access internal definitions for -// various custom attirbute definitions. +// various custom attribute definitions. import "dualapipartitionattribute.idl"; import "windows.graphics.directx.direct3d11.idl"; import "windows.graphics.imaging.idl"; diff --git a/winml/lib/Api/LearningModelBinding.cpp b/winml/lib/Api/LearningModelBinding.cpp index 17440f6f0a561..222fdba986dcb 100644 --- a/winml/lib/Api/LearningModelBinding.cpp +++ b/winml/lib/Api/LearningModelBinding.cpp @@ -30,7 +30,7 @@ static winml::ILearningModelFeatureDescriptor FindValidBinding( uint32_t size; WINML_THROW_IF_FAILED(descriptor_native->GetName(&feature_name, &size)); - // Case insensetive comparison of onnx name in feature descriptor, and passed in name + // Case insensitive comparison of onnx name in feature descriptor, and passed in name if (_wcsicmp(feature_name, name.c_str()) == 0) { return descriptor; } diff --git a/winml/lib/Api/impl/NumericData.h b/winml/lib/Api/impl/NumericData.h index 71c61b3c29f6f..129c7cbf1f294 100644 --- a/winml/lib/Api/impl/NumericData.h +++ b/winml/lib/Api/impl/NumericData.h @@ -15,7 +15,7 @@ class numeric_data : public _winml::idata { size_t num_elements, size_t element_size_in_bytes, wfc::IIterable const& buffers ); - // Privte constructor as this type should be created as a shared_ptr + // Private constructor as this type should be created as a shared_ptr numeric_data(size_t num_elements, size_t element_size_in_bytes, wfc::IIterable const& buffers); gsl::span buffer_at(size_t index); gsl::span combined_buffer(); diff --git a/winml/test/api/LearningModelSessionAPITest.cpp b/winml/test/api/LearningModelSessionAPITest.cpp index d6e70e35e3a6d..587f3e28928ae 100644 --- a/winml/test/api/LearningModelSessionAPITest.cpp +++ b/winml/test/api/LearningModelSessionAPITest.cpp @@ -315,7 +315,7 @@ static void NamedDimensionOverride() { LearningModelDevice device(nullptr); WINML_EXPECT_NO_THROW(device = LearningModelDevice(LearningModelDeviceKind::Cpu)); - // the model input shape. the batch size, n, is overriden to 5 + // the model input shape. 
the batch size, n, is overridden to 5 uint32_t n = 5; int64_t c = 3, h = 720, w = 720; diff --git a/winml/test/common/googleTestMacros.h b/winml/test/common/googleTestMacros.h index 2f493c9b6d6b9..111abd1c3914e 100644 --- a/winml/test/common/googleTestMacros.h +++ b/winml/test/common/googleTestMacros.h @@ -64,7 +64,7 @@ #define INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_CASE_P #endif -#define WINML_SKIP_TEST(message) WINML_SUPRESS_UNREACHABLE_BELOW(GTEST_SKIP() << message) +#define WINML_SKIP_TEST(message) WINML_SUPPRESS_UNREACHABLE_BELOW(GTEST_SKIP() << message) #define WINML_EXPECT_NO_THROW(statement) EXPECT_NO_THROW(statement) #define WINML_EXPECT_TRUE(statement) EXPECT_TRUE(statement) diff --git a/winml/test/common/taefTestMacros.h b/winml/test/common/taefTestMacros.h index 48119ff293fc8..3f6377c0a56b2 100644 --- a/winml/test/common/taefTestMacros.h +++ b/winml/test/common/taefTestMacros.h @@ -48,7 +48,7 @@ using namespace WEX::TestExecution; } #define WINML_SKIP_TEST(message) \ - WINML_SUPRESS_UNREACHABLE_BELOW( \ + WINML_SUPPRESS_UNREACHABLE_BELOW( \ Log::Result(TestResults::Skipped, std::wstring_convert>().from_bytes(message).c_str()); \ return; \ ) diff --git a/winml/test/common/test.h b/winml/test/common/test.h index f5adce2b40602..b7afa5dbb5f21 100644 --- a/winml/test/common/test.h +++ b/winml/test/common/test.h @@ -18,9 +18,9 @@ constexpr bool alwaysTrue() { constexpr bool alwaysFalse() { return false; } -#define WINML_SUPRESS_UNREACHABLE_BELOW(statement) \ - if (alwaysTrue()) { \ - statement; \ +#define WINML_SUPPRESS_UNREACHABLE_BELOW(statement) \ + if (alwaysTrue()) { \ + statement; \ } #ifdef BUILD_TAEF_TEST diff --git a/winml/test/image/imagetests.cpp b/winml/test/image/imagetests.cpp index 2251954c59e4c..b408c0315f94a 100644 --- a/winml/test/image/imagetests.cpp +++ b/winml/test/image/imagetests.cpp @@ -211,12 +211,12 @@ class ImageTests : public ::testing::Test { bool ShouldSkip( const std::wstring& model_file_name, const std::wstring& image_file_name, const InputImageSource input_image_source ) { - // Case that the tensor's shape doesn't match model's shape should be skiped + // Case that the tensor's shape doesn't match model's shape should be skipped if ((L"1080.jpg" == image_file_name || L"kitten_224.png" == image_file_name) && (InputImageSource::FromGPUResource == input_image_source || InputImageSource::FromCPUResource == input_image_source)) { return true; } - // Case that the images's shape doesn't match model's shape which expects free dimension should be skiped. + // Case that the images's shape doesn't match model's shape which expects free dimension should be skipped. 
// Because the fns-candy is not real model that can handle free dimensional input if ((L"1080.jpg" == image_file_name || L"kitten_224.png" == image_file_name) && L"fns-candy_Bgr8_freeDimInput.onnx" == model_file_name) { return true; diff --git a/winml/test/model/model_tests.cpp b/winml/test/model/model_tests.cpp index 27d74d7d6b034..859914014b8bb 100644 --- a/winml/test/model/model_tests.cpp +++ b/winml/test/model/model_tests.cpp @@ -170,7 +170,7 @@ std::string GetTestDataPath() { testDataPath.replace(environmentVariableFetchSuceeded, testDataPathFolderName.length(), testDataPathFolderName); } else { throw std::exception( - "WINML_TEST_DATA_PATH environment variable path needs to be shorter to accomodate the maximum path size of %d\n", + "WINML_TEST_DATA_PATH environment variable path needs to be shorter to accommodate the maximum path size of %d\n", MAX_PATH ); } From 11ad29945125d3f12f5935570fdd1f48bb0285d1 Mon Sep 17 00:00:00 2001 From: Prathik Rao Date: Mon, 22 Jul 2024 16:37:04 -0700 Subject: [PATCH 13/15] Adds ATen fallback for scaled_dot_product_attention (#21107) ### Description Introduces an ATen fallback for `torch.nn.functional.scaled_dot_product_attention`. This operator was introduced in torch 2.0 and, since then, has had many updates including the implementation of memory efficient attention for V100 machines. The current torchscript exporter exports a subgraph for attention which does not provide the same memory savings that PyTorch's memory efficient attention kernel provides. Allowing fallback to the PyTorch ATen op for attention helps mitigate memory spike issues for models leveraging memory efficient attention. ### Motivation and Context Memory issues arose when integrating ONNX Runtime Training with AML Stable Diffusion. --------- Co-authored-by: root --- docs/ORTModule_Training_Guidelines.md | 10 +++ .../core/graph/gradient_builder.cc | 15 +++- .../ortmodule/_custom_gradient_registry.py | 37 ++++++++++ .../ortmodule/_custom_op_symbolic_registry.py | 24 +++++++ .../python/orttraining_test_ortmodule_api.py | 71 +++++++++++++++++++ 5 files changed, 156 insertions(+), 1 deletion(-) diff --git a/docs/ORTModule_Training_Guidelines.md b/docs/ORTModule_Training_Guidelines.md index 8d5472ba30601..c79ba59a07ee9 100644 --- a/docs/ORTModule_Training_Guidelines.md +++ b/docs/ORTModule_Training_Guidelines.md @@ -304,6 +304,16 @@ A classical usage of disabling the deep copy: when the deep copy before module e export ORTMODULE_ENABLE_MEM_EFFICIENT_GRAD_MGMT=0 # Disable ``` +#### ORTMODULE_ATEN_SDPA_FALLBACK + +- **Feature Area**: *ORTMODULE/Optimizations* +- **Description**: By default, this is disabled. This env var enables a pre-export fallback to PyTorch's [_scaled_dot_product_efficient_attention](https://github.com/pytorch/pytorch/blob/c12a4f2e65ad41b739aab1a261e2336b4a79fcfb/aten/src/ATen/native/native_functions.yaml#L14778) ATen kernel when the model calls torch.nn.functional.scaled_dot_product_attention. NOTE: only use this feature if the user model leverages memory efficient attention WITHOUT masking (i.e., attn_mask=None). Use GPU profiling tools such as NVIDIA Nsight Systems to identify whether the user model leverages memory efficient attention.
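For illustration, a minimal sketch of the kind of unmasked attention module that can take this fallback path. It assumes `onnxruntime-training` is installed and a CUDA device with PyTorch >= 2.0 is available; the module name, tensor shapes, and values below are arbitrary placeholders rather than part of any API. The environment variable is enabled or disabled as shown in the shell snippet that follows.

```python
import os

# Enable the fallback before ORTModule is imported/exported
# (same effect as the shell export shown below).
os.environ["ORTMODULE_ATEN_SDPA_FALLBACK"] = "1"

import torch
from onnxruntime.training.ortmodule import ORTModule


class SimpleAttention(torch.nn.Module):
    # Placeholder module for illustration only.
    def forward(self, q, k, v):
        # attn_mask is left as None; masked attention is not covered by this fallback.
        return torch.nn.functional.scaled_dot_product_attention(q, k, v)


model = ORTModule(SimpleAttention().to("cuda"))
q = torch.randn(2, 8, 128, 64, device="cuda", requires_grad=True)
k = torch.randn(2, 8, 128, 64, device="cuda", requires_grad=True)
v = torch.randn(2, 8, 128, 64, device="cuda", requires_grad=True)
model(q, k, v).sum().backward()  # run forward and backward through ORTModule
```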
+ + ```bash + export ORTMODULE_ATEN_SDPA_FALLBACK=1 # ENABLE + unset ORTMODULE_ATEN_SDPA_FALLBACK # DISABLE + ``` + ### 2.2 Memory Optimization Q: *Want to run a bigger batch size?* diff --git a/orttraining/orttraining/core/graph/gradient_builder.cc b/orttraining/orttraining/core/graph/gradient_builder.cc index 22dcf4eb92411..76fe0ee91d4c6 100755 --- a/orttraining/orttraining/core/graph/gradient_builder.cc +++ b/orttraining/orttraining/core/graph/gradient_builder.cc @@ -1794,7 +1794,20 @@ IMPLEMENT_GRADIENT_BUILDER(GetExternalGradient) { } std::vector output_args; - for (const auto& output : node_def.outputs) { + for (size_t output_index = 0; output_index < node_def.outputs.size(); ++output_index) { + // If the input is not used in the forward computation, we don't need it for gradient computation + // Required for ORTMODULE_ATEN_SDPA_FALLBACK + if (static_cast(output_index) >= GetSrcNodeInputSize()) { + continue; + } + + if (!IsGradientRequiredForSrcNodeInput(static_cast(output_index))) { + output_args.emplace_back(ArgDef()); + continue; + } + + const auto& output = node_def.outputs[output_index]; + if (output.find("GI(") == 0) { size_t index = static_cast(std::stoi(output.substr(3, output.length() - 4))); output_args.emplace_back(GI(index)); diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py b/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py index a8590cea22887..97650f509ac88 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py @@ -25,6 +25,7 @@ # 'is_tensor' is optional, if not present, the default is False. import json +import os from onnxruntime.capi import _pybind_state as C @@ -276,3 +277,39 @@ def upsample_nearest3d_gradient(): @register_gradient("org.pytorch.aten", "ATen", "upsample_bicubic2d", "vec") def upsample_bicubic2d_gradient(): return _upsample_gradient("upsample_bicubic2d_backward", 2) + + +ATEN_SDPA_FALLBACK = os.getenv("ORTMODULE_ATEN_SDPA_FALLBACK", None) +if ATEN_SDPA_FALLBACK: + # based on the following internal PyTorch kernel for efficient attention: + # https://github.com/pytorch/pytorch/blob/c12a4f2e65ad41b739aab1a261e2336b4a79fcfb/aten/src/ATen/native/native_functions.yaml#L14784 + @register_gradient("org.pytorch.aten", "ATen", "_scaled_dot_product_efficient_attention", "") + def scaled_dot_product_attention_gradient(): + return [ + ( + "Constant", + [], + ["grad_input_mask"], + {"value": {"value": [1, 1, 1, 0], "dtype": "int", "is_tensor": True}}, + ), + ( + ("ATen", "org.pytorch.aten"), + [ + "GO(0)", + "I(0)", + "I(1)", + "I(2)", + "I(3)", + "O(0)", + "O(1)", + "O(2)", + "O(3)", + "I(5)", + "grad_input_mask", + "I(6)", + "I(7)", + ], + ["GI(0)", "GI(1)", "GI(2)", ""], + {"operator": {"value": "_scaled_dot_product_efficient_attention_backward", "dtype": "string"}}, + ), + ] diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py b/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py index 10e7f60b7da0f..c48968efbb262 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py @@ -3,6 +3,7 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- +import os from typing import Callable import torch @@ -969,3 +970,26 @@ def softmax(g, input, dim, dtype=None): softmax = g.op("Softmax", casted_input, axis_i=dim) return softmax + + +ATEN_SDPA_FALLBACK = os.getenv("ORTMODULE_ATEN_SDPA_FALLBACK", None) +if ATEN_SDPA_FALLBACK: + # based on the following internal PyTorch kernel for efficient attention: + # https://github.com/pytorch/pytorch/blob/c12a4f2e65ad41b739aab1a261e2336b4a79fcfb/aten/src/ATen/native/native_functions.yaml#L14778 + @register_symbolic("scaled_dot_product_attention") + def scaled_dot_product_attention(g, query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None): + dropout_p_f = g.op("Cast", dropout_p, to_i=torch.onnx.TensorProtoDataType.FLOAT) + compute_logsumexp = g.op("Constant", value_t=torch.tensor([1], dtype=torch.bool)) + return g.op( + "org.pytorch.aten::ATen", + query, + key, + value, + attn_mask, + compute_logsumexp, + dropout_p_f, + is_causal, + scale, + operator_s="_scaled_dot_product_efficient_attention", + outputs=4, + )[0] diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py index 6f5b03685e801..fe59c398d7abb 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py @@ -6953,3 +6953,74 @@ def generate_inputs(batch_size, max_seq_length, vocab_size): else: if "ORTMODULE_MEMORY_OPT_LEVEL" in os.environ: del os.environ["ORTMODULE_MEMORY_OPT_LEVEL"] + + +@pytest.mark.skipif( + Version(torch.__version__) < Version("2.3.0"), + reason="torch.nn.attention module was introduced in PyTorch 2.3.0", +) +def test_aten_attention(): + from torch.nn.attention import SDPBackend, sdpa_kernel + + class _NeuralNetAttention(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, q, k, v, attn_mask=None): + with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION): + return torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask) + + def gen_inputs(device, dtype): + return [ + torch.randn(32, 8, 128, 64, dtype=dtype, device=device, requires_grad=True), + torch.randn(32, 8, 128, 64, dtype=dtype, device=device, requires_grad=True), + torch.randn(32, 8, 128, 64, dtype=dtype, device=device, requires_grad=True), + ] + + def run_step(model, inputs, attn_mask=None): + prediction = model(*inputs, attn_mask) + prediction.sum().backward() + return prediction + + device = "cuda" + + os.environ["ORTMODULE_ATEN_SDPA_FALLBACK"] = "1" # TESTING WITHOUT ATTN_MASK + + pt_model = _NeuralNetAttention().to(device) + ort_model = ORTModule(copy.deepcopy(pt_model), DebugOptions(save_onnx=True, onnx_prefix="mem_eff_attn")) + + # reset manual seed to reset the generator + torch.manual_seed(2333) + pt_input = gen_inputs(device=device, dtype=torch.float32) + ort_input = copy.deepcopy(pt_input) + pt_prediction = run_step(pt_model, pt_input) + ort_prediction = run_step(ort_model, ort_input) + + _test_helpers.assert_values_are_close(ort_prediction, pt_prediction) + _test_helpers.assert_values_are_close(ort_input[0].grad, pt_input[0].grad) + _test_helpers.assert_values_are_close(ort_input[1].grad, pt_input[1].grad) + _test_helpers.assert_values_are_close(ort_input[2].grad, pt_input[2].grad) + + execution_mgr = ort_model._torch_module._execution_manager._training_manager + from onnxruntime.training.ortmodule._onnx_models import 
_get_onnx_file_name + + path = os.path.join( + execution_mgr._debug_options.save_onnx_models.path, + _get_onnx_file_name( + execution_mgr._debug_options.save_onnx_models.name_prefix, "execution_model", execution_mgr._export_mode + ), + ) + + onnx_model = onnx.load(path) + onnx_nodes = onnx_model.graph.node + + mem_eff_attn_nodes = 0 + for node in onnx_nodes: + if "ATen" in node.name: + for attr in node.attribute: + if b"_scaled_dot_product_efficient_attention" in attr.s: + mem_eff_attn_nodes += 1 + + assert mem_eff_attn_nodes > 0, "No mem_eff_attn nodes are found" + + del os.environ["ORTMODULE_ATEN_SDPA_FALLBACK"] From dd010edb37c8c7b34ba5d40cdfb3f6ce0a0fa789 Mon Sep 17 00:00:00 2001 From: Sheil Kumar Date: Mon, 22 Jul 2024 16:59:03 -0700 Subject: [PATCH 14/15] Update DirectML from 1.14.1 to 1.15.0 (#21323) Update DirectML from 1.14.1 to 1.15.0 --------- Co-authored-by: Sheil Kumar Co-authored-by: Dwayne Robinson --- .pipelines/nuget_config/x64/packages.config | 2 +- .pipelines/nuget_config/x86/packages.config | 2 +- cmake/external/dml.cmake | 2 +- docs/OperatorKernels.md | 8 ++- .../DmlExecutionProvider/src/ApiTraits.cpp | 8 ++- .../src/External/DirectMLHelpers/ApiHelpers.h | 8 ++- .../src/External/DirectMLHelpers/ApiTraits.h | 57 ++++++++++++++++-- .../External/DirectMLHelpers/DirectMLSchema.h | 58 ++++++++++++++++++ .../DirectMLHelpers/DmlGraphDeserialization.h | 2 +- .../DirectMLHelpers/GeneratedSchemaHelpers.h | 60 ++++++++++++++++++- .../DirectMLHelpers/GeneratedSchemaTypes.h | 55 +++++++++-------- .../src/Operators/DmlOperatorResize.cpp | 21 ++----- .../OperatorAuthorHelper/OperatorHelper.cpp | 11 ++-- packages.config | 2 +- .../nuget/generate_nuspec_for_native_nuget.py | 2 +- 15 files changed, 231 insertions(+), 67 deletions(-) diff --git a/.pipelines/nuget_config/x64/packages.config b/.pipelines/nuget_config/x64/packages.config index 9066e13ee1c8d..7bf8181b1f838 100644 --- a/.pipelines/nuget_config/x64/packages.config +++ b/.pipelines/nuget_config/x64/packages.config @@ -1,6 +1,6 @@  - + diff --git a/.pipelines/nuget_config/x86/packages.config b/.pipelines/nuget_config/x86/packages.config index a8e5b35b28b36..30f7862a11078 100644 --- a/.pipelines/nuget_config/x86/packages.config +++ b/.pipelines/nuget_config/x86/packages.config @@ -1,6 +1,6 @@  - + diff --git a/cmake/external/dml.cmake b/cmake/external/dml.cmake index f74b694471203..54e361ffdb3ae 100644 --- a/cmake/external/dml.cmake +++ b/cmake/external/dml.cmake @@ -41,7 +41,7 @@ if (NOT onnxruntime_USE_CUSTOM_DIRECTML) set(NUGET_CONFIG ${PROJECT_SOURCE_DIR}/../NuGet.config) set(PACKAGES_CONFIG ${PROJECT_SOURCE_DIR}/../packages.config) get_filename_component(PACKAGES_DIR ${CMAKE_CURRENT_BINARY_DIR}/../packages ABSOLUTE) - set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.14.1) + set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.15.0) # Restore nuget packages, which will pull down the DirectML redist package. add_custom_command( diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index df5897529baae..ed944b5a6df79 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -970,6 +970,7 @@ Do not modify directly.* |||12+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||11+|**T** = tensor(float), tensor(float16)| |||6+|**T** = tensor(float), tensor(float16)| +|Col2Im|*in* input:**T**
*in* image_shape:**tensor(int64)**
*in* block_shape:**tensor(int64)**
*out* output:**T**|18+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |Concat|*in* inputs:**T**
*out* concat_result:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||4+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| @@ -1131,7 +1132,8 @@ Do not modify directly.* |PRelu|*in* X:**T**
*in* slope:**T**
*out* Y:**T**|16+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8)| |||9+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8)| |||7+|**T** = tensor(float), tensor(float16)| -|Pad|*in* data:**T**
*in* pads:**tensor(int64)**
*in* constant_value:**T**
*in* axes:**Tind**
*out* output:**T**

or

*in* data:**T**
*in* pads:**tensor(int64)**
*in* constant_value:**T**
*out* output:**T**

or

*in* data:**T**
*out* output:**T**|18+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|Pad|*in* data:**T**
*in* pads:**tensor(int64)**
*in* constant_value:**T**
*in* axes:**Tind**
*out* output:**T**

or

*in* data:**T**
*in* pads:**tensor(int64)**
*in* constant_value:**T**
*out* output:**T**

or

*in* data:**T**
*out* output:**T**|19+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||18+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||2+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| @@ -1199,7 +1201,9 @@ Do not modify directly.* |||14+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||5+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Resize|*in* X:**T**
*in* scales:**tensor(float)**
*out* Y:**T**

or

*in* X:**T1**
*in* roi:**T2**
*in* scales:**tensor(float)**
*in* sizes:**tensor(int64)**
*out* Y:**T1**|13+|**T1** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| +|Resize|*in* X:**T**
*in* scales:**tensor(float)**
*out* Y:**T**

or

*in* X:**T1**
*in* roi:**T2**
*in* scales:**tensor(float)**
*in* sizes:**tensor(int64)**
*out* Y:**T1**|19+|**T1** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| +|||18+|**T1** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| +|||13+|**T1** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| |||11+|**T1** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| |||10+|**T** = tensor(float), tensor(float16)| |ReverseSequence|*in* input:**T**
*in* sequence_lens:**tensor(int64)**
*out* Y:**T**|10+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ApiTraits.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ApiTraits.cpp index ccc2bfd872231..a10ba8099f39a 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ApiTraits.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ApiTraits.cpp @@ -1,4 +1,4 @@ -//--------------------------------------------------------------------------- +//--------------------------------------------------------------------------- // Copyright (c) Microsoft Corporation. All rights reserved. // // This file is automatically generated. Please do not edit it directly. @@ -241,6 +241,7 @@ DML_OPERATOR_TYPE ApiTraits::StringifyHelpers::FromString(std::string_view value {"DML_OPERATOR_ACTIVATION_SWISH", DML_OPERATOR_ACTIVATION_SWISH}, {"DML_OPERATOR_ACTIVATION_HARD_SWISH", DML_OPERATOR_ACTIVATION_HARD_SWISH}, {"DML_OPERATOR_RESAMPLE2", DML_OPERATOR_RESAMPLE2}, + {"DML_OPERATOR_RESAMPLE3", DML_OPERATOR_RESAMPLE3}, {"DML_OPERATOR_RESAMPLE_GRAD1", DML_OPERATOR_RESAMPLE_GRAD1}, {"DML_OPERATOR_DIAGONAL_MATRIX1", DML_OPERATOR_DIAGONAL_MATRIX1}, {"DML_OPERATOR_MULTIHEAD_ATTENTION", DML_OPERATOR_MULTIHEAD_ATTENTION}, @@ -250,6 +251,9 @@ DML_OPERATOR_TYPE ApiTraits::StringifyHelpers::FromString(std::string_view value {"DML_OPERATOR_MULTIHEAD_ATTENTION1", DML_OPERATOR_MULTIHEAD_ATTENTION1}, {"DML_OPERATOR_QUANTIZE", DML_OPERATOR_QUANTIZE}, {"DML_OPERATOR_DEQUANTIZE", DML_OPERATOR_DEQUANTIZE}, + {"DML_OPERATOR_ROI_ALIGN_GRAD", DML_OPERATOR_ROI_ALIGN_GRAD}, + {"DML_OPERATOR_FOLD", DML_OPERATOR_FOLD}, + {"DML_OPERATOR_UNFOLD", DML_OPERATOR_UNFOLD}, }; auto index = StringUtil::MapToIndex(value, mapping); if (!index) @@ -369,6 +373,7 @@ DML_PADDING_MODE ApiTraits::StringifyHelpers::FromString(std::string_view value) {"DML_PADDING_MODE_EDGE", DML_PADDING_MODE_EDGE}, {"DML_PADDING_MODE_REFLECTION", DML_PADDING_MODE_REFLECTION}, {"DML_PADDING_MODE_SYMMETRIC", DML_PADDING_MODE_SYMMETRIC}, + {"DML_PADDING_MODE_WRAP", DML_PADDING_MODE_WRAP}, }; auto index = StringUtil::MapToIndex(value, mapping); if (!index) @@ -454,6 +459,7 @@ DML_FEATURE_LEVEL ApiTraits::StringifyHelpers::FromString(std::string_view value {"DML_FEATURE_LEVEL_6_1", DML_FEATURE_LEVEL_6_1}, {"DML_FEATURE_LEVEL_6_2", DML_FEATURE_LEVEL_6_2}, {"DML_FEATURE_LEVEL_6_3", DML_FEATURE_LEVEL_6_3}, + {"DML_FEATURE_LEVEL_6_4", DML_FEATURE_LEVEL_6_4}, }; auto index = StringUtil::MapToIndex(value, mapping); if (!index) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiHelpers.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiHelpers.h index 9a1c23093f9b9..431a3fdef5a9a 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiHelpers.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiHelpers.h @@ -29,6 +29,9 @@ union ActivationOperatorDescUnion DML_ACTIVATION_THRESHOLDED_RELU_OPERATOR_DESC thresholdedRelu; DML_ACTIVATION_SHRINK_OPERATOR_DESC shrink; DML_ACTIVATION_GELU_OPERATOR_DESC gelu; + DML_ACTIVATION_SWISH_OPERATOR_DESC swish; + DML_ACTIVATION_HARD_SWISH_OPERATOR_DESC hardSwish; + DML_ELEMENT_WISE_CLIP_OPERATOR_DESC clip; }; struct ActivationOperatorDesc @@ -46,7 +49,7 @@ struct ActivationOperatorDesc 
case DML_OPERATOR_ACTIVATION_CELU: return { activationType, &params.celu }; case DML_OPERATOR_ACTIVATION_HARDMAX: return { activationType, &params.hardmax }; case DML_OPERATOR_ACTIVATION_HARDMAX1: return { activationType, &params.hardmax1 }; - case DML_OPERATOR_ACTIVATION_HARD_SIGMOID: return { activationType, &params.sigmoid }; + case DML_OPERATOR_ACTIVATION_HARD_SIGMOID: return { activationType, &params.hardSigmoid }; case DML_OPERATOR_ACTIVATION_IDENTITY: return { activationType, &params.identity }; case DML_OPERATOR_ACTIVATION_LEAKY_RELU: return { activationType, &params.leakyRelu }; case DML_OPERATOR_ACTIVATION_LINEAR: return { activationType, &params.linear }; @@ -66,6 +69,9 @@ struct ActivationOperatorDesc case DML_OPERATOR_ACTIVATION_THRESHOLDED_RELU: return { activationType, &params.thresholdedRelu }; case DML_OPERATOR_ACTIVATION_SHRINK: return { activationType, &params.shrink }; case DML_OPERATOR_ACTIVATION_GELU: return { activationType, &params.gelu }; + case DML_OPERATOR_ACTIVATION_SWISH: return { activationType, &params.swish }; + case DML_OPERATOR_ACTIVATION_HARD_SWISH: return { activationType, &params.hardSwish }; + case DML_OPERATOR_ELEMENT_WISE_CLIP: return { activationType, &params.clip }; default: ORT_THROW_HR(E_INVALIDARG); return { activationType, &params.relu }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h index 6a4354feb2e2e..ccd4d4c76e744 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h @@ -1,4 +1,4 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License.
#pragma once @@ -24,7 +24,7 @@ struct EnumTraits template <> struct EnumTraits { - static constexpr auto ValueCount = 174; + static constexpr auto ValueCount = 178; static constexpr size_t ActivationFunctionCount = 26; }; @@ -62,7 +62,7 @@ struct EnumTraits template <> struct EnumTraits { - static constexpr auto ValueCount = 4; + static constexpr auto ValueCount = 5; }; template <> @@ -86,7 +86,7 @@ struct EnumTraits template <> struct EnumTraits { - static constexpr auto ValueCount = 14; + static constexpr auto ValueCount = 15; }; template <> @@ -1023,6 +1023,12 @@ struct OperatorDescTraits static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_RESAMPLE2; }; +template <> +struct OperatorDescTraits +{ + static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_RESAMPLE3; +}; + template <> struct OperatorDescTraits { @@ -1053,6 +1059,18 @@ struct OperatorDescTraits static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT; }; +template <> +struct OperatorDescTraits +{ + static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_FOLD; +}; + +template <> +struct OperatorDescTraits +{ + static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_UNFOLD; +}; + template <> struct OperatorDescTraits { @@ -2073,6 +2091,12 @@ struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_RESAMPLE2> using DescType = DML_RESAMPLE2_OPERATOR_DESC; }; +template <> +struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_RESAMPLE3> +{ + using DescType = DML_RESAMPLE3_OPERATOR_DESC; +}; + template <> struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_RESAMPLE_GRAD1> { @@ -2103,6 +2127,18 @@ struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_MATRIX_MULTIPLY_INTEGE using DescType = DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC; }; +template <> +struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_FOLD> +{ + using DescType = DML_FOLD_OPERATOR_DESC; +}; + +template <> +struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_UNFOLD> +{ + using DescType = DML_UNFOLD_OPERATOR_DESC; +}; + template <> struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_MEAN_VARIANCE_NORMALIZATION2> { @@ -2575,6 +2611,8 @@ auto OperatorTypeVisitor(DML_OPERATOR_TYPE type, Visitor&& visitor, Ts&&... args return std::invoke(std::forward(visitor), DML_BATCH_NORMALIZATION_TRAINING_OPERATOR_DESC{}, std::forward(args)...); case DML_OPERATOR_RESAMPLE2: return std::invoke(std::forward(visitor), DML_RESAMPLE2_OPERATOR_DESC{}, std::forward(args)...); + case DML_OPERATOR_RESAMPLE3: + return std::invoke(std::forward(visitor), DML_RESAMPLE3_OPERATOR_DESC{}, std::forward(args)...); case DML_OPERATOR_RESAMPLE_GRAD1: return std::invoke(std::forward(visitor), DML_RESAMPLE_GRAD1_OPERATOR_DESC{}, std::forward(args)...); case DML_OPERATOR_DIAGONAL_MATRIX1: @@ -2585,6 +2623,10 @@ auto OperatorTypeVisitor(DML_OPERATOR_TYPE type, Visitor&& visitor, Ts&&... 
args return std::invoke(std::forward(visitor), DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_DESC{}, std::forward(args)...); case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT: return std::invoke(std::forward(visitor), DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC{}, std::forward(args)...); + case DML_OPERATOR_FOLD: + return std::invoke(std::forward(visitor), DML_FOLD_OPERATOR_DESC{}, std::forward(args)...); + case DML_OPERATOR_UNFOLD: + return std::invoke(std::forward(visitor), DML_UNFOLD_OPERATOR_DESC{}, std::forward(args)...); case DML_OPERATOR_MEAN_VARIANCE_NORMALIZATION2: return std::invoke(std::forward(visitor), DML_MEAN_VARIANCE_NORMALIZATION2_OPERATOR_DESC{}, std::forward(args)...); case DML_OPERATOR_MULTIHEAD_ATTENTION1: @@ -2650,7 +2692,6 @@ auto OperatorTypeVisitor(DML_OPERATOR_TYPE type, Visitor&& visitor, Ts&&... args } } - namespace StringifyHelpers { template @@ -2871,6 +2912,7 @@ inline gsl::czstring ToString(DML_OPERATOR_TYPE value) case DML_OPERATOR_ACTIVATION_SWISH: return "DML_OPERATOR_ACTIVATION_SWISH"; case DML_OPERATOR_ACTIVATION_HARD_SWISH: return "DML_OPERATOR_ACTIVATION_HARD_SWISH"; case DML_OPERATOR_RESAMPLE2: return "DML_OPERATOR_RESAMPLE2"; + case DML_OPERATOR_RESAMPLE3: return "DML_OPERATOR_RESAMPLE3"; case DML_OPERATOR_RESAMPLE_GRAD1: return "DML_OPERATOR_RESAMPLE_GRAD1"; case DML_OPERATOR_DIAGONAL_MATRIX1: return "DML_OPERATOR_DIAGONAL_MATRIX1"; case DML_OPERATOR_MULTIHEAD_ATTENTION: return "DML_OPERATOR_MULTIHEAD_ATTENTION"; @@ -2880,6 +2922,9 @@ inline gsl::czstring ToString(DML_OPERATOR_TYPE value) case DML_OPERATOR_MULTIHEAD_ATTENTION1: return "DML_OPERATOR_MULTIHEAD_ATTENTION1"; case DML_OPERATOR_QUANTIZE: return "DML_OPERATOR_QUANTIZE"; case DML_OPERATOR_DEQUANTIZE: return "DML_OPERATOR_DEQUANTIZE"; + case DML_OPERATOR_ROI_ALIGN_GRAD: return "DML_OPERATOR_ROI_ALIGN_GRAD"; + case DML_OPERATOR_FOLD: return "DML_OPERATOR_FOLD"; + case DML_OPERATOR_UNFOLD: return "DML_OPERATOR_UNFOLD"; default: assert(false); return ""; @@ -2971,6 +3016,7 @@ inline gsl::czstring ToString(DML_PADDING_MODE value) case DML_PADDING_MODE_EDGE: return "DML_PADDING_MODE_EDGE"; case DML_PADDING_MODE_REFLECTION: return "DML_PADDING_MODE_REFLECTION"; case DML_PADDING_MODE_SYMMETRIC: return "DML_PADDING_MODE_SYMMETRIC"; + case DML_PADDING_MODE_WRAP: return "DML_PADDING_MODE_WRAP"; default: assert(false); return ""; @@ -3036,6 +3082,7 @@ inline gsl::czstring ToString(DML_FEATURE_LEVEL value) case DML_FEATURE_LEVEL_6_1: return "DML_FEATURE_LEVEL_6_1"; case DML_FEATURE_LEVEL_6_2: return "DML_FEATURE_LEVEL_6_2"; case DML_FEATURE_LEVEL_6_3: return "DML_FEATURE_LEVEL_6_3"; + case DML_FEATURE_LEVEL_6_4: return "DML_FEATURE_LEVEL_6_4"; default: assert(false); return ""; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h index e0ccb2f51f109..14a7383e67897 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h @@ -2306,6 +2306,26 @@ constexpr DML_OPERATOR_SCHEMA DML_RESAMPLE2_OPERATOR_SCHEMA { DML_RESAMPLE2_OPERATOR_SCHEMA_FIELDS, }; +constexpr DML_SCHEMA_FIELD DML_RESAMPLE3_OPERATOR_SCHEMA_FIELDS[9] { + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, 
DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "InterpolationMode", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "RoundingDirection", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "DimensionCount", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_FLOAT_ARRAY, "Scales", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_FLOAT_ARRAY, "InputPixelOffsets", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_FLOAT_ARRAY, "OutputPixelOffsets", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "Antialiased", false }, +}; + +constexpr DML_OPERATOR_SCHEMA DML_RESAMPLE3_OPERATOR_SCHEMA { + "DML_OPERATOR_RESAMPLE3", + DML_OPERATOR_RESAMPLE3, + DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, + 9, + DML_RESAMPLE3_OPERATOR_SCHEMA_FIELDS, +}; + constexpr DML_SCHEMA_FIELD DML_RESAMPLE_GRAD1_OPERATOR_SCHEMA_FIELDS[8] { DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputGradientTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputGradientTensor", false }, @@ -2414,6 +2434,44 @@ constexpr DML_OPERATOR_SCHEMA DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHE DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS, }; +constexpr DML_SCHEMA_FIELD DML_FOLD_OPERATOR_SCHEMA_FIELDS[8] { + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "DimensionCount", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "WindowSizes", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "Strides", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "Dilations", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "StartPadding", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "EndPadding", false }, +}; + +constexpr DML_OPERATOR_SCHEMA DML_FOLD_OPERATOR_SCHEMA { + "DML_OPERATOR_FOLD", + DML_OPERATOR_FOLD, + DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, + 8, + DML_FOLD_OPERATOR_SCHEMA_FIELDS, +}; + +constexpr DML_SCHEMA_FIELD DML_UNFOLD_OPERATOR_SCHEMA_FIELDS[8] { + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "DimensionCount", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "WindowSizes", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "Strides", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "Dilations", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "StartPadding", 
false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "EndPadding", false }, +}; + +constexpr DML_OPERATOR_SCHEMA DML_UNFOLD_OPERATOR_SCHEMA { + "DML_OPERATOR_UNFOLD", + DML_OPERATOR_UNFOLD, + DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, + 8, + DML_UNFOLD_OPERATOR_SCHEMA_FIELDS, +}; + constexpr DML_SCHEMA_FIELD DML_MEAN_VARIANCE_NORMALIZATION2_OPERATOR_SCHEMA_FIELDS[10] { DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ScaleTensor", true }, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDeserialization.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDeserialization.h index 9decf0dce1bb2..203df0b3b8371 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDeserialization.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDeserialization.h @@ -11,4 +11,4 @@ struct NodeIndex DmlSerializedGraphDesc DeserializeDmlGraph( const uint8_t* flatbufferGraphDescBlob, - /*out*/ std::vector>& rawData); \ No newline at end of file + /*out*/ std::vector>& rawData); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h index 298ecd657635e..23b5a491c7d96 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h @@ -1,4 +1,4 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
#pragma once @@ -1422,6 +1422,20 @@ inline std::vector GetFields(const DML_RESAMPLE2_OPERATOR_DESC& d OperatorField(&DML_RESAMPLE2_OPERATOR_SCHEMA.Fields[7], ToOperatorFieldType(static_cast(desc.OutputPixelOffsets), desc.DimensionCount)), }; } +inline std::vector GetFields(const DML_RESAMPLE3_OPERATOR_DESC& desc) +{ + return { + OperatorField(&DML_RESAMPLE3_OPERATOR_SCHEMA.Fields[0], ToOperatorFieldType(static_cast(desc.InputTensor))), + OperatorField(&DML_RESAMPLE3_OPERATOR_SCHEMA.Fields[1], ToOperatorFieldType(static_cast(desc.OutputTensor))), + OperatorField(&DML_RESAMPLE3_OPERATOR_SCHEMA.Fields[2], ToOperatorFieldType(static_cast(desc.InterpolationMode))), + OperatorField(&DML_RESAMPLE3_OPERATOR_SCHEMA.Fields[3], ToOperatorFieldType(static_cast(desc.RoundingDirection))), + OperatorField(&DML_RESAMPLE3_OPERATOR_SCHEMA.Fields[4], ToOperatorFieldType(static_cast(desc.DimensionCount))), + OperatorField(&DML_RESAMPLE3_OPERATOR_SCHEMA.Fields[5], ToOperatorFieldType(static_cast(desc.Scales), desc.DimensionCount)), + OperatorField(&DML_RESAMPLE3_OPERATOR_SCHEMA.Fields[6], ToOperatorFieldType(static_cast(desc.InputPixelOffsets), desc.DimensionCount)), + OperatorField(&DML_RESAMPLE3_OPERATOR_SCHEMA.Fields[7], ToOperatorFieldType(static_cast(desc.OutputPixelOffsets), desc.DimensionCount)), + OperatorField(&DML_RESAMPLE3_OPERATOR_SCHEMA.Fields[8], ToOperatorFieldType(static_cast(desc.Antialiased))), + }; +} inline std::vector GetFields(const DML_RESAMPLE_GRAD1_OPERATOR_DESC& desc) { return { @@ -1500,6 +1514,32 @@ inline std::vector GetFields(const DML_MATRIX_MULTIPLY_INTEGER_TO OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[7], ToOperatorFieldType(static_cast(desc.OutputTensor))), }; } +inline std::vector GetFields(const DML_FOLD_OPERATOR_DESC& desc) +{ + return { + OperatorField(&DML_FOLD_OPERATOR_SCHEMA.Fields[0], ToOperatorFieldType(static_cast(desc.InputTensor))), + OperatorField(&DML_FOLD_OPERATOR_SCHEMA.Fields[1], ToOperatorFieldType(static_cast(desc.OutputTensor))), + OperatorField(&DML_FOLD_OPERATOR_SCHEMA.Fields[2], ToOperatorFieldType(static_cast(desc.DimensionCount))), + OperatorField(&DML_FOLD_OPERATOR_SCHEMA.Fields[3], ToOperatorFieldType(static_cast(desc.WindowSizes), desc.DimensionCount)), + OperatorField(&DML_FOLD_OPERATOR_SCHEMA.Fields[4], ToOperatorFieldType(static_cast(desc.Strides), desc.DimensionCount)), + OperatorField(&DML_FOLD_OPERATOR_SCHEMA.Fields[5], ToOperatorFieldType(static_cast(desc.Dilations), desc.DimensionCount)), + OperatorField(&DML_FOLD_OPERATOR_SCHEMA.Fields[6], ToOperatorFieldType(static_cast(desc.StartPadding), desc.DimensionCount)), + OperatorField(&DML_FOLD_OPERATOR_SCHEMA.Fields[7], ToOperatorFieldType(static_cast(desc.EndPadding), desc.DimensionCount)), + }; +} +inline std::vector GetFields(const DML_UNFOLD_OPERATOR_DESC& desc) +{ + return { + OperatorField(&DML_UNFOLD_OPERATOR_SCHEMA.Fields[0], ToOperatorFieldType(static_cast(desc.InputTensor))), + OperatorField(&DML_UNFOLD_OPERATOR_SCHEMA.Fields[1], ToOperatorFieldType(static_cast(desc.OutputTensor))), + OperatorField(&DML_UNFOLD_OPERATOR_SCHEMA.Fields[2], ToOperatorFieldType(static_cast(desc.DimensionCount))), + OperatorField(&DML_UNFOLD_OPERATOR_SCHEMA.Fields[3], ToOperatorFieldType(static_cast(desc.WindowSizes), desc.DimensionCount)), + OperatorField(&DML_UNFOLD_OPERATOR_SCHEMA.Fields[4], ToOperatorFieldType(static_cast(desc.Strides), desc.DimensionCount)), + OperatorField(&DML_UNFOLD_OPERATOR_SCHEMA.Fields[5], ToOperatorFieldType(static_cast(desc.Dilations), 
desc.DimensionCount)), + OperatorField(&DML_UNFOLD_OPERATOR_SCHEMA.Fields[6], ToOperatorFieldType(static_cast(desc.StartPadding), desc.DimensionCount)), + OperatorField(&DML_UNFOLD_OPERATOR_SCHEMA.Fields[7], ToOperatorFieldType(static_cast(desc.EndPadding), desc.DimensionCount)), + }; +} inline std::vector GetFields(const DML_MEAN_VARIANCE_NORMALIZATION2_OPERATOR_DESC& desc) { return { @@ -1912,11 +1952,14 @@ inline const DML_OPERATOR_SCHEMA& GetSchema(DML_OPERATOR_TYPE operatorType) case DML_OPERATOR_ROI_ALIGN_GRAD: return DML_ROI_ALIGN_GRAD_OPERATOR_SCHEMA; case DML_OPERATOR_BATCH_NORMALIZATION_TRAINING: return DML_BATCH_NORMALIZATION_TRAINING_OPERATOR_SCHEMA; case DML_OPERATOR_RESAMPLE2: return DML_RESAMPLE2_OPERATOR_SCHEMA; + case DML_OPERATOR_RESAMPLE3: return DML_RESAMPLE3_OPERATOR_SCHEMA; case DML_OPERATOR_RESAMPLE_GRAD1: return DML_RESAMPLE_GRAD1_OPERATOR_SCHEMA; case DML_OPERATOR_DIAGONAL_MATRIX1: return DML_DIAGONAL_MATRIX1_OPERATOR_SCHEMA; case DML_OPERATOR_MULTIHEAD_ATTENTION: return DML_MULTIHEAD_ATTENTION_OPERATOR_SCHEMA; case DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING: return DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA; case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT: return DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA; + case DML_OPERATOR_FOLD: return DML_FOLD_OPERATOR_SCHEMA; + case DML_OPERATOR_UNFOLD: return DML_UNFOLD_OPERATOR_SCHEMA; case DML_OPERATOR_MEAN_VARIANCE_NORMALIZATION2: return DML_MEAN_VARIANCE_NORMALIZATION2_OPERATOR_SCHEMA; case DML_OPERATOR_MULTIHEAD_ATTENTION1: return DML_MULTIHEAD_ATTENTION1_OPERATOR_SCHEMA; case DML_OPERATOR_QUANTIZE: return DML_QUANTIZE_OPERATOR_SCHEMA; @@ -2095,11 +2138,14 @@ inline const bool IsValidOperator(DML_OPERATOR_TYPE operatorType) case DML_OPERATOR_ROI_ALIGN_GRAD: case DML_OPERATOR_BATCH_NORMALIZATION_TRAINING: case DML_OPERATOR_RESAMPLE2: + case DML_OPERATOR_RESAMPLE3: case DML_OPERATOR_RESAMPLE_GRAD1: case DML_OPERATOR_DIAGONAL_MATRIX1: case DML_OPERATOR_MULTIHEAD_ATTENTION: case DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING: case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT: + case DML_OPERATOR_FOLD: + case DML_OPERATOR_UNFOLD: case DML_OPERATOR_MEAN_VARIANCE_NORMALIZATION2: case DML_OPERATOR_MULTIHEAD_ATTENTION1: case DML_OPERATOR_QUANTIZE: @@ -2695,6 +2741,10 @@ inline AbstractOperatorDesc ConvertOperatorDesc(const DML_OPERATOR_DESC& opDesc) return AbstractOperatorDesc( &DML_RESAMPLE2_OPERATOR_SCHEMA, GetFields(*static_cast(opDesc.Desc))); + case DML_OPERATOR_RESAMPLE3: + return AbstractOperatorDesc( + &DML_RESAMPLE3_OPERATOR_SCHEMA, + GetFields(*static_cast(opDesc.Desc))); case DML_OPERATOR_RESAMPLE_GRAD1: return AbstractOperatorDesc( &DML_RESAMPLE_GRAD1_OPERATOR_SCHEMA, @@ -2715,6 +2765,14 @@ inline AbstractOperatorDesc ConvertOperatorDesc(const DML_OPERATOR_DESC& opDesc) return AbstractOperatorDesc( &DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA, GetFields(*static_cast(opDesc.Desc))); + case DML_OPERATOR_FOLD: + return AbstractOperatorDesc( + &DML_FOLD_OPERATOR_SCHEMA, + GetFields(*static_cast(opDesc.Desc))); + case DML_OPERATOR_UNFOLD: + return AbstractOperatorDesc( + &DML_UNFOLD_OPERATOR_SCHEMA, + GetFields(*static_cast(opDesc.Desc))); case DML_OPERATOR_MEAN_VARIANCE_NORMALIZATION2: return AbstractOperatorDesc( &DML_MEAN_VARIANCE_NORMALIZATION2_OPERATOR_SCHEMA, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaTypes.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaTypes.h 
index a94bb67b68d36..5ea0d470b20ce 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaTypes.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaTypes.h @@ -1,21 +1,21 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. #pragma once using ApiAttributeVariant = std::variant< - const DML_TENSOR_DESC*, - const DML_OPERATOR_DESC*, - UINT, - UINT64, - INT, - FLOAT, - const UINT*, - const INT*, - const FLOAT*, - const DML_SCALE_BIAS*, - DML_SIZE_2D, - DML_SCALAR_UNION, + const DML_TENSOR_DESC*, + const DML_OPERATOR_DESC*, + UINT, + UINT64, + INT, + FLOAT, + const UINT*, + const INT*, + const FLOAT*, + const DML_SCALE_BIAS*, + DML_SIZE_2D, + DML_SCALAR_UNION, BOOL >; @@ -39,20 +39,20 @@ namespace OperatorFieldTypes } using OperatorFieldVariant = std::variant< - OperatorFieldTypes::TensorDesc, - OperatorFieldTypes::TensorDescArray, - OperatorFieldTypes::FusedActivationOperatorDesc, - OperatorFieldTypes::FusedActivationOperatorDescArray, - OperatorFieldTypes::UInt, - OperatorFieldTypes::UInt64, - OperatorFieldTypes::Int, - OperatorFieldTypes::Float, - OperatorFieldTypes::UIntArray, - OperatorFieldTypes::IntArray, - OperatorFieldTypes::FloatArray, - OperatorFieldTypes::ScaleBias, - OperatorFieldTypes::Size2D, - OperatorFieldTypes::ScalarUnion, + OperatorFieldTypes::TensorDesc, + OperatorFieldTypes::TensorDescArray, + OperatorFieldTypes::FusedActivationOperatorDesc, + OperatorFieldTypes::FusedActivationOperatorDescArray, + OperatorFieldTypes::UInt, + OperatorFieldTypes::UInt64, + OperatorFieldTypes::Int, + OperatorFieldTypes::Float, + OperatorFieldTypes::UIntArray, + OperatorFieldTypes::IntArray, + OperatorFieldTypes::FloatArray, + OperatorFieldTypes::ScaleBias, + OperatorFieldTypes::Size2D, + OperatorFieldTypes::ScalarUnion, OperatorFieldTypes::Bool >; @@ -126,4 +126,3 @@ class OperatorField const DML_SCHEMA_FIELD* m_schema; OperatorFieldVariant m_data; }; - diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorResize.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorResize.cpp index 5256e01f86fb6..d31203308aef7 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorResize.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorResize.cpp @@ -263,11 +263,6 @@ class DmlOperatorResize : public DmlOperator, public ResizeHelper std::string mode = kernelCreationContext.GetOptionalAttribute(AttrName::Mode, "NEAREST"); DML_INTERPOLATION_MODE interpolationMode = Dml::MapStringToInteropolationMode(mode); - -#if DML_TARGET_VERSION >= 0x6400 - const int antialiased = kernelCreationContext.GetOptionalAttribute(AttrName::Antialiased, 0); -#endif - // Map ONNX to DML's mode using offsets and rounding direction. // These offsets are in addition to the coordinate transform offsets. 
DML_AXIS_DIRECTION roundingDirection = DML_AXIS_DIRECTION_DECREASING; @@ -307,12 +302,11 @@ class DmlOperatorResize : public DmlOperator, public ResizeHelper std::vector inputDescs = GetDmlInputDescs(); std::vector outputDescs = GetDmlOutputDescs(); -#if DML_TARGET_VERSION >= 0x6400 + DML_OPERATOR_DESC opDesc = {}; + const int antialiased = kernelCreationContext.GetOptionalAttribute(AttrName::Antialiased, 0); + DML_RESAMPLE3_OPERATOR_DESC operatorDesc = {}; operatorDesc.Antialiased = static_cast(antialiased); -#else - DML_RESAMPLE2_OPERATOR_DESC operatorDesc = {}; -#endif operatorDesc.InputTensor = inputDescs.data(); operatorDesc.OutputTensor = outputDescs.data(); operatorDesc.InterpolationMode = interpolationMode; @@ -321,11 +315,8 @@ class DmlOperatorResize : public DmlOperator, public ResizeHelper operatorDesc.DimensionCount = gsl::narrow_cast(paddedScales.size()); operatorDesc.InputPixelOffsets = inputPixelOffsets.data(); operatorDesc.OutputPixelOffsets = outputPixelOffsets.data(); -#if DML_TARGET_VERSION >= 0x6400 - DML_OPERATOR_DESC opDesc = { DML_OPERATOR_RESAMPLE3, &operatorDesc }; -#else - DML_OPERATOR_DESC opDesc = { DML_OPERATOR_RESAMPLE2, &operatorDesc }; -#endif + opDesc = { DML_OPERATOR_RESAMPLE3, &operatorDesc }; + SetDmlOperatorDesc(opDesc, kernelCreationContext); } }; @@ -368,10 +359,8 @@ void CALLBACK QueryResize(IMLOperatorSupportQueryContextPrivate* context, bool* DML_OP_DEFINE_CREATION_FUNCTION(Resize10, VersionedKernel); DML_OP_DEFINE_CREATION_FUNCTION(Resize11, VersionedKernel); DML_OP_DEFINE_CREATION_FUNCTION(Resize13, VersionedKernel); -#if DML_TARGET_VERSION >= 0x6400 DML_OP_DEFINE_CREATION_FUNCTION(Resize18, VersionedKernel); DML_OP_DEFINE_CREATION_FUNCTION(Resize19, VersionedKernel); -#endif DML_OP_DEFINE_CREATION_FUNCTION(Upsample7, VersionedKernel); DML_OP_DEFINE_CREATION_FUNCTION(Upsample9, VersionedKernel); DML_OP_DEFINE_CREATION_FUNCTION(Upsample10, VersionedKernel); diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp index 3a7cf28ef903e..deed62901dfb0 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp @@ -852,7 +852,7 @@ namespace OperatorHelper { ML_CHECK_VALID_ARGUMENT(outputShape[C] == gsl::narrow_cast(m_outputShapes[0].GetShape()[C]), "Output channel must be equivalent to filter channel."); - } + } for (size_t i = 0; i < m_kernel.spatialDimensionCount; ++i) { @@ -1857,14 +1857,13 @@ namespace OperatorHelper DowncastDimensions(gsl::span(shapeData), /*out*/ m_blockShape); const uint32_t dimCount = gsl::narrow_cast(m_blockShape.size()); - m_dilations = {dimCount, 1}; - m_pads = {dimCount * 2, 0}; - m_strides = {dimCount, 1}; + m_dilations.assign(dimCount, 1); + m_pads.assign(dimCount, 0); + m_strides.assign(dimCount, 1); if (kernelInformation.HasAttribute(AttrName::Dilations, MLOperatorAttributeType::IntArray)) { shapeData = kernelInformation.GetAttributes().GetOptionalAttributeVectorInt32(AttrName::Dilations); - m_dilations.resize(shapeData.size()); DowncastDimensions(gsl::span(shapeData), /*out*/ m_dilations); ML_CHECK_VALID_ARGUMENT(m_dilations.size() == dimCount); } @@ -1872,7 +1871,6 @@ namespace OperatorHelper if (kernelInformation.HasAttribute(AttrName::Pads, MLOperatorAttributeType::IntArray)) { shapeData = kernelInformation.GetAttributes().GetOptionalAttributeVectorInt32(AttrName::Pads); - m_pads.resize(shapeData.size()); 
DowncastDimensions(gsl::span(shapeData), /*out*/ m_pads); ML_CHECK_VALID_ARGUMENT(m_pads.size() == dimCount * 2); } @@ -1880,7 +1878,6 @@ namespace OperatorHelper if (kernelInformation.HasAttribute(AttrName::Strides, MLOperatorAttributeType::IntArray)) { shapeData = kernelInformation.GetAttributes().GetOptionalAttributeVectorInt32(AttrName::Strides); - m_strides.resize(shapeData.size()); DowncastDimensions(gsl::span(shapeData), /*out*/ m_strides); ML_CHECK_VALID_ARGUMENT(m_strides.size() == dimCount); } diff --git a/packages.config b/packages.config index 3f3e4f5298881..f69e5b4f27956 100644 --- a/packages.config +++ b/packages.config @@ -1,6 +1,6 @@  - + diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py index 88d1cebc84f8d..60d1884a9591f 100644 --- a/tools/nuget/generate_nuspec_for_native_nuget.py +++ b/tools/nuget/generate_nuspec_for_native_nuget.py @@ -219,7 +219,7 @@ def add_common_dependencies(xml_text, package_name, version): def generate_dependencies(xml_text, package_name, version): - dml_dependency = '' + dml_dependency = '' if package_name == "Microsoft.AI.MachineLearning": xml_text.append("") From 0f1f3b7705ddc2fe4f371f78a8a8b6a0428a68de Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Tue, 23 Jul 2024 20:21:55 +1000 Subject: [PATCH 15/15] CoreML: ML Program Slice (#21433) ### Description Add support for Slice ### Motivation and Context High priority models. --- .../coreml/builders/impl/builder_utils.cc | 10 +- .../coreml/builders/impl/builder_utils.h | 7 +- .../coreml/builders/impl/slice_op_builder.cc | 130 +++++++++++++----- .../coreml/builders/model_builder.cc | 7 + .../providers/coreml/builders/model_builder.h | 7 + .../providers/cpu/tensor/slice_op.test.cc | 2 +- .../apple/coreml_supported_mlprogram_ops.md | 1 + 7 files changed, 127 insertions(+), 37 deletions(-) diff --git a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc index 2fcf9a1d7d9ba..ebb3f97895f06 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc @@ -164,6 +164,7 @@ void SetTensorTypeInfo(MILSpec::TensorType& tensor_type, MILSpec::DataType data_ void SetTensorTypeInfo(MILSpec::TensorType& tensor_type, MILSpec::DataType data_type, const ONNX_NAMESPACE::TensorShapeProto* shape, bool convert_scalar = false) { tensor_type.set_datatype(data_type); + if (shape) { auto rank = shape->dim_size(); if (convert_scalar && rank == 0) { @@ -313,7 +314,8 @@ void AddOperationInput(MILSpec::Operation& op, std::string_view input_name, std: (*op.mutable_inputs())[input_name] = std::move(arg); } -void AddOperationOutput(COREML_SPEC::MILSpec::Operation& op, const NodeArg& output) { +void AddOperationOutput(COREML_SPEC::MILSpec::Operation& op, const NodeArg& output, + std::optional override_element_type) { auto& outputs = *op.mutable_outputs(); auto& output_arg = *outputs.Add(); output_arg.set_name(output.Name()); @@ -321,8 +323,10 @@ void AddOperationOutput(COREML_SPEC::MILSpec::Operation& op, const NodeArg& outp MILSpec::ValueType& value = *output_arg.mutable_type(); MILSpec::TensorType& tensor_type = *value.mutable_tensortype(); - SetTensorTypeInfo(tensor_type, OnnxDataTypeToMILSpec(output.TypeAsProto()->tensor_type().elem_type()), - output.Shape(), /*convert_scalar*/ true); + auto elem_type = override_element_type ? 
*override_element_type + : output.TypeAsProto()->tensor_type().elem_type(); + + SetTensorTypeInfo(tensor_type, OnnxDataTypeToMILSpec(elem_type), output.Shape(), /*convert_scalar*/ true); } void AddPadTypeAndPads(COREML_SPEC::MILSpec::Operation& op, ModelBuilder& model_builder, std::string_view op_type, diff --git a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h index 97fb83b6dc482..f012e6af0d718 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h +++ b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h @@ -134,7 +134,12 @@ void AddOperationInput(COREML_SPEC::MILSpec::Operation& op, /// /// Operation to update. /// NodeArg with details of output to add. -void AddOperationOutput(COREML_SPEC::MILSpec::Operation& op, const NodeArg& output); +/// +/// Override the element type. Only set to handle cases where we believe the data at runtime will be int32 but +/// the original ONNX node has type int64. +/// +void AddOperationOutput(COREML_SPEC::MILSpec::Operation& op, const NodeArg& output, + std::optional override_element_type = std::nullopt); /// /// Add pad_type and pad values. diff --git a/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc index 39bfbfe5bba1f..51fc3f2c11c73 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc @@ -4,6 +4,7 @@ #include "core/optimizer/initializer.h" #include "core/providers/coreml/builders/helper.h" #include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" #include "core/providers/coreml/builders/model_builder.h" #include "core/providers/coreml/builders/op_builder_factory.h" #include "core/providers/coreml/shape_utils.h" @@ -28,12 +29,14 @@ class SliceOpBuilder : public BaseOpBuilder { bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& builder_params, const logging::Logger& logger) const override; + + bool SupportsMLProgram() const override { return true; } }; namespace { -Status PrepareSliceComputeMetadataFromConstantInitializers(const Node& slice_node, - const GraphViewer& graph_viewer, - SliceOp::PrepareForComputeMetadata& compute_metadata) { +Status PrepareSliceComputeMetadata(const Node& slice_node, + const GraphViewer& graph_viewer, + SliceOp::PrepareForComputeMetadata& compute_metadata) { // TODO largely copied from nnapi::SliceOpBuilder::AddToModelBuilderImpl. put it somewhere where it can be reused? 
const auto input_defs = slice_node.InputDefs(); @@ -114,55 +117,113 @@ void SliceOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const No Status SliceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const { + const auto& input_defs = node.InputDefs(); + const auto& output_defs = node.OutputDefs(); + std::vector data_shape; ORT_RETURN_IF_NOT(GetStaticShape(*node.InputDefs()[0], data_shape, logger), "Failed to get input shape."); + auto rank = data_shape.size(); SliceOp::PrepareForComputeMetadata compute_metadata{data_shape}; - ORT_RETURN_IF_ERROR(PrepareSliceComputeMetadataFromConstantInitializers(node, model_builder.GetGraphViewer(), - compute_metadata)); + ORT_RETURN_IF_ERROR(PrepareSliceComputeMetadata(node, model_builder.GetGraphViewer(), compute_metadata)); + +#if defined(COREML_ENABLE_MLPROGRAM) + if (model_builder.CreateMLProgram()) { + using namespace CoreML::Specification::MILSpec; // NOLINT + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.tensor_transformation.slice_by_index + + const InlinedVector begin_mask_values(rank, false); + InlinedVector end_mask_values(rank, false); + + // Special case - stepping backwards up to and including the first index in the dimension. + // In ONNX Slice, we use end <= -(rank + 1) to represent this. In CoreML, setting endids like that doesn't work, + // so use endmasks to specify the rest of the dimension instead. + for (size_t i = 0; i < rank; ++i) { + if (compute_metadata.steps_[i] < 0 && compute_metadata.ends_[i] == -1) { + end_mask_values[i] = true; + } + } - auto layer = model_builder.CreateNNLayer(node); - *layer->mutable_input()->Add() = node.InputDefs()[0]->Name(); - *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); - auto* slice_static = layer->mutable_slicestatic(); + // Only int32 and float are supported by CoreML slice_by_index. + // We convert any int64 model input to int32 when running the CoreML model for the partition. + // Any other integer data created at runtime is the output from CoreML operations, and should int32 not int64. + // Based on that, we assume that the actual input when running will be int32, so we override the output data + // type to reflect this. + // If we were to leave it as TensorProto_DataType_INT64 the CoreML model would be invalid. + std::optional output_datatype; - for (size_t i = 0; i < compute_metadata.starts_.size(); ++i) { - const auto step = compute_metadata.steps_[i], - start = compute_metadata.starts_[i], - end = compute_metadata.ends_[i]; + int32_t input_type; + ORT_RETURN_IF_NOT(GetType(*node.InputDefs()[0], input_type, logger), "Failed to get input type"); - slice_static->add_beginids(start); - slice_static->add_beginmasks(false); + if (input_type == ONNX_NAMESPACE::TensorProto_DataType_INT64) { + output_datatype = ONNX_NAMESPACE::TensorProto_DataType_INT32; + } - if (step < 0 && end == -1) { - // Special case - stepping backwards up to and including the first index in the dimension. - // In ONNX Slice, we use end <= -(rank + 1) to represent this. In CoreML, setting endids like that doesn't work, - // so use endmasks to specify the rest of the dimension instead. 
- slice_static->add_endids(-1); // ignored - slice_static->add_endmasks(true); - } else { - slice_static->add_endids(end); - slice_static->add_endmasks(false); + auto op = model_builder.CreateOperation(node, "slice_by_index"); + + auto begin = model_builder.AddConstant(op->type(), "begin", AsSpan(compute_metadata.starts_)); + auto end = model_builder.AddConstant(op->type(), "end", AsSpan(compute_metadata.ends_)); + auto stride = model_builder.AddConstant(op->type(), "stride", AsSpan(compute_metadata.steps_)); + auto begin_mask = model_builder.AddConstant(op->type(), "begin_mask", AsSpan(begin_mask_values)); + auto end_mask = model_builder.AddConstant(op->type(), "end_mask", AsSpan(end_mask_values)); + + AddOperationInput(*op, "x", input_defs[0]->Name()); + AddOperationInput(*op, "begin", begin); + AddOperationInput(*op, "end", end); + AddOperationInput(*op, "stride", stride); + AddOperationInput(*op, "begin_mask", begin_mask); + AddOperationInput(*op, "end_mask", end_mask); + + AddOperationOutput(*op, *output_defs[0], output_datatype); + + model_builder.AddOperation(std::move(op)); + + } else // NOLINT +#endif // defined(COREML_ENABLE_MLPROGRAM) + { + auto layer = model_builder.CreateNNLayer(node); + *layer->mutable_input()->Add() = input_defs[0]->Name(); + *layer->mutable_output()->Add() = output_defs[0]->Name(); + auto* slice_static = layer->mutable_slicestatic(); + + for (size_t i = 0; i < rank; ++i) { + const auto step = compute_metadata.steps_[i], + start = compute_metadata.starts_[i], + end = compute_metadata.ends_[i]; + + slice_static->add_beginids(start); + slice_static->add_beginmasks(false); + + if (step < 0 && end == -1) { + // Special case - stepping backwards up to and including the first index in the dimension. + // In ONNX Slice, we use end <= -(rank + 1) to represent this. In CoreML, setting endids like that doesn't work, + // so use endmasks to specify the rest of the dimension instead. 
+        slice_static->add_endids(-1);  // ignored
+        slice_static->add_endmasks(true);
+      } else {
+        slice_static->add_endids(end);
+        slice_static->add_endmasks(false);
+      }
+
+      slice_static->add_strides(step);
     }
 
-    slice_static->add_strides(step);
+    model_builder.AddLayer(std::move(layer));
   }
 
-  model_builder.AddLayer(std::move(layer));
   return Status::OK();
 }
 
 bool SliceOpBuilder::HasSupportedInputsImpl(const Node& node, const OpBuilderInputParams& /*input_params*/,
                                             const logging::Logger& logger) const {
   int32_t input_type;
-  if (!GetType(*node.InputDefs()[0], input_type, logger))
+  if (!GetType(*node.InputDefs()[0], input_type, logger)) {
     return false;
+  }
 
   if (input_type != ONNX_NAMESPACE::TensorProto_DataType_FLOAT &&
       input_type != ONNX_NAMESPACE::TensorProto_DataType_INT64) {
-    LOGS(logger, VERBOSE) << "[" << node.OpType()
-                          << "] Input type: [" << input_type
-                          << "] is not supported for now";
+    LOGS(logger, VERBOSE) << "[" << node.OpType() << "] Input type: [" << input_type << "] is not supported";
     return false;
   }
 
@@ -197,9 +258,14 @@ bool SliceOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPar
   }
 
   SliceOp::PrepareForComputeMetadata compute_metadata{data_shape};
-  ORT_THROW_IF_ERROR(PrepareSliceComputeMetadataFromConstantInitializers(node, builder_params.graph_viewer,
-                                                                         compute_metadata));
+  auto status = PrepareSliceComputeMetadata(node, builder_params.graph_viewer, compute_metadata);
+  if (status != Status::OK()) {
+    LOGS(logger, VERBOSE) << "PrepareSliceComputeMetadata failed: " << status.ErrorMessage();
+    return false;
+  }
+
   if (!ValidateSliceComputeMetadataForCoreML(compute_metadata, logger)) {
+    // error logged in ValidateSliceComputeMetadataForCoreML
     return false;
   }
 
diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.cc b/onnxruntime/core/providers/coreml/builders/model_builder.cc
index eec0fcce51dbc..9668bfcd09adf 100644
--- a/onnxruntime/core/providers/coreml/builders/model_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/model_builder.cc
@@ -839,6 +839,13 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i
   if (is_input) {
     // the model inputs need to be wired up as args to the 'main' function.
     auto tensor_value_type = CreateNamedTensorValueType(node_arg, /*convert_scalar*/ true);
+
+    // we need to convert int64 to int32 here as well
+    if (data_type == ONNX_NAMESPACE::TensorProto_DataType_INT64) {
+      tensor_value_type.mutable_type()->mutable_tensortype()->set_datatype(
+          OnnxDataTypeToMILSpec(ONNX_NAMESPACE::TensorProto_DataType_INT32));
+    }
+
     tensor_value_type.set_name(name);
 
     mlprogram_main_fn_->mutable_inputs()->Add(std::move(tensor_value_type));
diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.h b/onnxruntime/core/providers/coreml/builders/model_builder.h
index 385588dbfdcb8..bb791fb902908 100644
--- a/onnxruntime/core/providers/coreml/builders/model_builder.h
+++ b/onnxruntime/core/providers/coreml/builders/model_builder.h
@@ -121,6 +121,13 @@ class ModelBuilder {
     return AddConstant(op_type, value_type, AsSpan(value), shape);
   }
 
+  // helper to convert a span of non-const data to const
+  template <typename T>
+  std::string_view AddConstant(std::string_view op_type, std::string_view value_type, gsl::span<T> value,
+                               std::optional<gsl::span<const int64_t>> shape = std::nullopt) {
+    return AddConstant(op_type, value_type, gsl::span<const T>(value), shape);
+  }
+
   /// <summary>
   /// Add a scalar value as a 'const' operation. See AddConstant for details.
   /// </summary>
diff --git a/onnxruntime/test/providers/cpu/tensor/slice_op.test.cc b/onnxruntime/test/providers/cpu/tensor/slice_op.test.cc
index af54ae96ef86b..83b308b57f26b 100644
--- a/onnxruntime/test/providers/cpu/tensor/slice_op.test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/slice_op.test.cc
@@ -90,7 +90,7 @@ void RunSliceTest(const std::vector<int64_t>& input_dims,
   run_test(false);
 
-  // NNAPI EP requires the starts/ends/axes/steps be initializers
+  // EPs like NNAPI and CoreML require the starts/ends/axes/steps to be initializers
   run_test(true);
 }
 
diff --git a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md
index 3b3790ba06599..c33184686c932 100644
--- a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md
+++ b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md
@@ -18,6 +18,7 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution
 |ai.onnx:Relu||
 |ai.onnx:Reshape||
 |ai.onnx:Resize|See [resize_op_builder.cc](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc) implementation. There are too many permutations to describe the valid combinations.|
+|ai.onnx:Slice|starts/ends/axes/steps must be constant initializers.|
 |ai.onnx:Sub||
 |ai.onnx:Sigmoid||
 |ai:onnx:Tanh||
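
Editor's note (not part of the patch): the core of the new ML Program path above is the mapping from the normalized ONNX Slice parameters onto CoreML `slice_by_index` begin/end/stride values plus masks. The sketch below restates that rule in isolation; `SliceByIndexArgs` and `MapSliceToSliceByIndex` are made-up names for illustration, and the inputs are assumed to be the normalized values that `SliceOp::PrepareForComputeMetadata` produces.

```cpp
// Illustrative only - mirrors the mapping applied in slice_op_builder.cc.
#include <cstddef>
#include <cstdint>
#include <vector>

struct SliceByIndexArgs {
  std::vector<int64_t> begin, end, stride;
  std::vector<bool> begin_mask, end_mask;
};

// starts/ends/steps: one normalized entry per input dimension.
SliceByIndexArgs MapSliceToSliceByIndex(const std::vector<int64_t>& starts,
                                        const std::vector<int64_t>& ends,
                                        const std::vector<int64_t>& steps) {
  const size_t rank = starts.size();
  SliceByIndexArgs args{starts, ends, steps,
                        std::vector<bool>(rank, false), std::vector<bool>(rank, false)};
  for (size_t i = 0; i < rank; ++i) {
    // A negative step whose normalized end is -1 means "slice backwards through index 0".
    // CoreML cannot express that via the end id, so the end mask is set and the end id is ignored.
    if (steps[i] < 0 && ends[i] == -1) {
      args.end_mask[i] = true;
    }
  }
  return args;
}
```

For example, an ONNX Slice with starts=[3], steps=[-1] and a very negative end over a length-4 axis normalizes to end=-1; the mapping turns that into begin=3, stride=-1, end_mask=true, which selects indices 3, 2, 1, 0.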
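Related background for the int64 handling in both slice_op_builder.cc and model_builder.cc: int64 ONNX inputs are declared as int32 in the generated ML Program and converted when the partition runs, and the slice builder overrides its output type to match. A minimal sketch of that narrowing rule, under the assumption that only the int64-to-int32 case needs special treatment (`NarrowForMLProgram` is a hypothetical helper; the real code goes through `OnnxDataTypeToMILSpec`):

```cpp
// Illustrative only - enum values match ONNX TensorProto::DataType.
#include <cstdint>

constexpr int32_t kOnnxFloat = 1;  // ONNX_NAMESPACE::TensorProto_DataType_FLOAT
constexpr int32_t kOnnxInt32 = 6;  // ONNX_NAMESPACE::TensorProto_DataType_INT32
constexpr int32_t kOnnxInt64 = 7;  // ONNX_NAMESPACE::TensorProto_DataType_INT64

// int64 is narrowed to int32 for the ML Program; everything else passes through unchanged.
constexpr int32_t NarrowForMLProgram(int32_t onnx_data_type) {
  return onnx_data_type == kOnnxInt64 ? kOnnxInt32 : onnx_data_type;
}

static_assert(NarrowForMLProgram(kOnnxInt64) == kOnnxInt32, "int64 inputs become int32");
static_assert(NarrowForMLProgram(kOnnxFloat) == kOnnxFloat, "float is unchanged");
```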