diff --git a/.config/1espt/PipelineAutobaseliningConfig.yml b/.config/1espt/PipelineAutobaseliningConfig.yml
index daa9b73d5971a..183d52d5c1d44 100644
--- a/.config/1espt/PipelineAutobaseliningConfig.yml
+++ b/.config/1espt/PipelineAutobaseliningConfig.yml
@@ -5,15 +5,16 @@ pipelines:
retail:
source:
credscan:
- lastModifiedDate: 2024-10-24
+ lastModifiedDate: 2024-10-25
policheck:
- lastModifiedDate: 2024-10-24
+ lastModifiedDate: 2024-10-25
eslint:
- lastModifiedDate: 2024-10-24
+ lastModifiedDate: 2024-10-25
psscriptanalyzer:
- lastModifiedDate: 2024-10-24
+ lastModifiedDate: 2024-10-25
armory:
- lastModifiedDate: 2024-10-24
+ lastModifiedDate: 2024-10-25
+ usedNonDefaultBranch: true
1299:
retail:
source:
@@ -25,6 +26,8 @@ pipelines:
lastModifiedDate: 2024-10-25
armory:
lastModifiedDate: 2024-10-25
+ policheck:
+ lastModifiedDate: 2024-10-29
binary:
credscan:
lastModifiedDate: 2024-10-25
@@ -32,3 +35,43 @@ pipelines:
lastModifiedDate: 2024-10-25
spotbugs:
lastModifiedDate: 2024-10-25
+ 1625:
+ retail:
+ source:
+ credscan:
+ lastModifiedDate: 2024-11-05
+ policheck:
+ lastModifiedDate: 2024-11-05
+ eslint:
+ lastModifiedDate: 2024-11-05
+ psscriptanalyzer:
+ lastModifiedDate: 2024-11-05
+ armory:
+ lastModifiedDate: 2024-11-05
+ binary:
+ credscan:
+ lastModifiedDate: 2024-11-13
+ binskim:
+ lastModifiedDate: 2024-11-13
+ spotbugs:
+ lastModifiedDate: 2024-11-13
+ 1626:
+ retail:
+ source:
+ credscan:
+ lastModifiedDate: 2024-11-13
+ policheck:
+ lastModifiedDate: 2024-11-13
+ eslint:
+ lastModifiedDate: 2024-11-13
+ psscriptanalyzer:
+ lastModifiedDate: 2024-11-13
+ armory:
+ lastModifiedDate: 2024-11-13
+ binary:
+ credscan:
+ lastModifiedDate: 2024-11-13
+ binskim:
+ lastModifiedDate: 2024-11-13
+ spotbugs:
+ lastModifiedDate: 2024-11-13
diff --git a/.config/guardian/.gdnbaselines b/.config/guardian/.gdnbaselines
new file mode 100644
index 0000000000000..a7ee2a4b69dda
--- /dev/null
+++ b/.config/guardian/.gdnbaselines
@@ -0,0 +1,43 @@
+{
+ "properties": {
+ "helpUri": "https://eng.ms/docs/microsoft-security/security/azure-security/cloudai-security-fundamentals-engineering/security-integration/guardian-wiki/microsoft-guardian/general/baselines"
+ },
+ "version": "1.0.0",
+ "baselines": {
+ "default": {
+ "name": "default",
+ "createdDate": "2024-11-13 00:40:35Z",
+ "lastUpdatedDate": "2024-11-13 00:40:35Z"
+ }
+ },
+ "results": {
+ "48f03e2797fc40ecea50f878a0268947c7e13db1b2fa51aa3981246844fc4c68": {
+ "signature": "48f03e2797fc40ecea50f878a0268947c7e13db1b2fa51aa3981246844fc4c68",
+ "alternativeSignatures": [],
+ "target": "ScanTelemetry_20241113003616898.json",
+ "line": 1,
+ "memberOf": [
+ "default"
+ ],
+ "tool": "credscan",
+ "ruleId": "CSCAN-AZURE0130",
+ "createdDate": "2024-11-13 00:40:35Z",
+ "expirationDate": "2025-05-02 01:29:47Z",
+ "justification": "This error is baselined with an expiration date of 180 days from 2024-11-13 01:29:47Z"
+ },
+ "9cb6eddb3f3e886ad06cae65f5886412ff0c5fb0b96d4e943e4efa237be617b1": {
+ "signature": "9cb6eddb3f3e886ad06cae65f5886412ff0c5fb0b96d4e943e4efa237be617b1",
+ "alternativeSignatures": [],
+ "target": "ScanTelemetry_20241113111547065.json",
+ "line": 1,
+ "memberOf": [
+ "default"
+ ],
+ "tool": "credscan",
+ "ruleId": "CSCAN-AZURE0130",
+ "createdDate": "2024-11-13 11:20:17Z",
+ "expirationDate": "2025-05-02 11:55:15Z",
+ "justification": "This error is baselined with an expiration date of 180 days from 2024-11-13 11:55:15Z"
+ }
+ }
+}
\ No newline at end of file
diff --git a/.github/workflows/publish-csharp-apidocs.yml b/.github/workflows/publish-csharp-apidocs.yml
index c704adb263db4..7cca0969a168b 100644
--- a/.github/workflows/publish-csharp-apidocs.yml
+++ b/.github/workflows/publish-csharp-apidocs.yml
@@ -20,18 +20,17 @@ permissions:
jobs:
build:
- runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
+ runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"]
env:
DOCFXVERSION: 2.62.2
steps:
- uses: actions/checkout@v4
- - name: Setup .NET
- uses: actions/setup-dotnet@v4
- with:
- dotnet-version: 8.0.x
- name: Install DocFX
run: |
dotnet tool update -g docfx
+ - name: Update PATH
+ run: |
+ Add-Content -Value "$env:USERPROFILE\.dotnet\tools" -Encoding utf8 -Path $env:GITHUB_PATH
# NOTE: We need to restore Microsoft.ML.OnnxRuntime.csproj manually to set IncludeMobileTargets=false
# docfx doesn't seem to be able to do that properly resulting in build errors
- name: Restore dependencies
@@ -50,10 +49,12 @@ jobs:
- name: Log source commit
run: git rev-parse --short HEAD > csharp/ApiDocs/csharp/source-version.txt
- name: Move C# docs into site
+ shell: pwsh
run: |
- mkdir -p _site/docs/api
- rm -rf _site/docs/api/csharp
- mv csharp/ApiDocs/csharp _site/docs/api/csharp
+ New-Item -Path _site/docs/api -Force -ItemType "Directory" | Out-Null
+ $OutputDirectory="_site/docs/api/csharp"
+ if (Test-Path $OutputDirectory) { Remove-Item -Recurse -Force $OutputDirectory }
+ Move-Item -Path csharp\ApiDocs\csharp -Destination $OutputDirectory
- name: Upload docs artifact
uses: actions/upload-artifact@v4
with:
diff --git a/CPPLINT.cfg b/CPPLINT.cfg
new file mode 100644
index 0000000000000..12c1c7be0d773
--- /dev/null
+++ b/CPPLINT.cfg
@@ -0,0 +1 @@
+filter=-whitespace
diff --git a/README.md b/README.md
index 8452e26a58d4d..f1817282b61a0 100644
--- a/README.md
+++ b/README.md
@@ -24,8 +24,8 @@
|System|Inference|Training|
|---|---|---|
-|Windows|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20CPU%20CI%20Pipeline?label=Windows+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=9)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20GPU%20CI%20Pipeline?label=Windows+GPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=10)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20GPU%20TensorRT%20CI%20Pipeline?label=Windows+GPU+TensorRT)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=47)||
-|Linux|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20CPU%20CI%20Pipeline?label=Linux+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=11)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20CPU%20Minimal%20Build%20E2E%20CI%20Pipeline?label=Linux+CPU+Minimal+Build)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=64)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20GPU%20CI%20Pipeline?label=Linux+GPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=12)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20GPU%20TensorRT%20CI%20Pipeline?label=Linux+GPU+TensorRT)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=45)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20OpenVINO%20CI%20Pipeline?label=Linux+OpenVINO)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=55)|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining-linux-ci-pipeline?label=Linux+CPU+Training)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=86)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining-linux-gpu-ci-pipeline?label=Linux+GPU+Training)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=84)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining/orttraining-ortmodule-distributed?label=Training+Distributed)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=148)|
+|Windows|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20CPU%20CI%20Pipeline?label=Windows+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=9)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20GPU%20CUDA%20CI%20Pipeline?label=Windows+GPU+CUDA)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=218)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20GPU%20TensorRT%20CI%20Pipeline?label=Windows+GPU+TensorRT)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=47)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20GPU%20WebGPU%20CI%20Pipeline?label=Windows+GPU+WebGPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=228)||
+|Linux|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20CPU%20CI%20Pipeline?label=Linux+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=11)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20CPU%20Minimal%20Build%20E2E%20CI%20Pipeline?label=Linux+CPU+Minimal+Build)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=64)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20GPU%20CI%20Pipeline?label=Linux+GPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=12)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20GPU%20TensorRT%20CI%20Pipeline?label=Linux+GPU+TensorRT)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=45)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20OpenVINO%20CI%20Pipeline?label=Linux+OpenVINO)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=55)|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining-linux-ci-pipeline?label=Linux+CPU+Training)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=86)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining-linux-gpu-ci-pipeline?label=Linux+GPU+Training)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=84)|
|Mac|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/MacOS%20CI%20Pipeline?label=MacOS+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=13)||
|Android|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Android%20CI%20Pipeline?label=Android)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=53)||
|iOS|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/iOS%20CI%20Pipeline?label=iOS)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=134)||
diff --git a/ThirdPartyNotices.txt b/ThirdPartyNotices.txt
index 20142e734dfac..26084ab42ec1c 100644
--- a/ThirdPartyNotices.txt
+++ b/ThirdPartyNotices.txt
@@ -2108,261 +2108,6 @@ SOFTWARE.
_____
-TVM Open Deep Learning Compiler Stack
-
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "{}"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright {yyyy} {name of copyright owner}
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
-CONTRIBUTORS
-
-TVM Contributors
-================
-TVM adopts the Apache style model and governs by merit. We believe that it is important to create an inclusive community where everyone can use,
-contribute to, and influence the direction of the project. We actively invite contributors who have earned the merit to be part of the development community.
-
-See the [community structure document](http://docs.tvm.ai/contribute/community.html) for the explanation of community structure and contribution guidelines.
-
-## Committers
-- [Tianqi Chen](https://github.com/tqchen) (PMC)
-- [Thierry Moreau](http://homes.cs.washington.edu/~moreau/)
-- [Ziheng Jiang](https://github.com/ZihengJiang)
-- [Haichen Shen](http://homes.cs.washington.edu/~haichen/)
-- [Yizhi Liu](https://github.com/yzhliu)
-
-## Code Owners
-- [Aditya Atluri](https://github.com/adityaatluri) ROCM
-- [Leyuan Wang](https://github.com/Laurawly) TOPI
-- [Yuwei Hu](https://github.com/Huyuwei) TOPI
-- [Zhixun Tan](https://github.com/phisiart) OpenGL/WebGL backend
-- [Nick Hynes](https://github.com/nhynes) SGX and secured computing
-- [Lianmin Zheng](https://github.com/merrymercy) AutoTVM
-
-## Reviewers
-- [Zhi Chen](https://github.com/zhiics)
-- [Xiaoqiang Dan](https://github.com/xqdan)
-- [Liangfu Chen](https://github.com/liangfu)
-- [Masahiro Masuda](https://github.com/masahi)
-- [Kazutaka Morita](https://github.com/kazum)
-- [Tatsuya Nishiyama](https://github.com/nishi-t)
-- [Pariksheet Pinjari](https://github.com/PariksheetPinjari909)
-- [Jared Roesch](https://github.com/jroesch)
-- [Siva](https://github.com/srkreddy1238)
-- [Siju Samuel](https://github.com/siju-samuel)
-- [Alex Weaver](https://github.com/alex-weaver)
-- [Yao Wang](https://github.com/kevinthesun)
-- [Jian Weng](https://github.com/were)
-- [Eddie Yan](https://github.com/eqy)
-- [Joshua Z. Zhang](https://github.com/zhreshold)
-
-## List of Contributors
-- [Full List of Contributors](https://github.com/dmlc/tvm/graphs/contributors)
- - To contributors: please add your name to the list.
-- [Qiao Zhang](https://github.com/zhangqiaorjc)
-- [Haolong Zhang](https://github.com/haolongzhangm)
-- [Cody Hao Yu](https://github.com/comaniac)
-- [Chris Nuernberger](https://github.com/cnuernber)
-
-_____
-
FreeBSD: getopt.c file
Copyright (c) 1987, 1993, 1994
diff --git a/cgmanifests/cgmanifest.json b/cgmanifests/cgmanifest.json
index 1432193ac9080..46349f43923e2 100644
--- a/cgmanifests/cgmanifest.json
+++ b/cgmanifests/cgmanifest.json
@@ -1,578 +1,508 @@
{
- "$schema": "https://json.schemastore.org/component-detection-manifest.json",
- "Registrations": [
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "215105818dfde3174fe799600bb0f3cae233d0bf",
- "repositoryUrl": "https://github.com/abseil/abseil-cpp.git"
- }
- }
- },
- {
- "component": {
- "Type": "maven",
- "maven": {
- "GroupId": "org.junit.platform",
- "ArtifactId": "junit-platform-console-standalone",
- "Version": "1.6.2"
- },
- "DevelopmentDependency": true
- }
- },
- {
- "component": {
- "Type": "maven",
- "maven": {
- "GroupId": "com.google.protobuf",
- "ArtifactId": "protobuf-java",
- "Version": "3.21.7"
- },
- "DevelopmentDependency": true
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "2379917985919ed3918dc12cad47f469f245be7a",
- "repositoryUrl": "https://github.com/apache/tvm.git"
- },
- "comments": "needed for TVM EP"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "cabe04d6d6b05356fa8f9741704924788f0dd762",
- "repositoryUrl": "https://github.com/agauniyal/rang.git"
- },
- "comments": "dependency from tvm"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "a3bcc6981d5dad3afb212689e2c7853d1b1ee45d",
- "repositoryUrl": "https://github.com/NVIDIA/cutlass.git"
- },
- "comments": "dependency from tvm"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "08f7c7e69f8ea61a0c4151359bc8023be8e9217b",
- "repositoryUrl": "https://github.com/tlc-pack/libbacktrace.git"
- },
- "comments": "dependency from tvm"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "36a91576edf633479c78649e050f18dd2ddc8103",
- "repositoryUrl": "https://github.com/apache/incubator-tvm-vta.git"
- },
- "comments": "dependency from tvm"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "111c9be5188f7350c2eac9ddaedd8cca3d7bf394",
- "repositoryUrl": "https://github.com/kazuho/picojson.git"
- },
- "comments": "dependency from tvm"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "b5e4186d7ab63458e79084842dced166be2ca5b5",
- "repositoryUrl": "https://github.com/lammertb/libcrc.git"
- },
- "comments": "dependency from tvm"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "e4a4c02764d37c9c3db0d64c4996651a3ef9513c",
- "repositoryUrl": "https://github.com/dmlc/HalideIR.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "bee4d1dd8dc1ee4a1fd8fa6a96476c2f8b7492a3",
- "repositoryUrl": "https://github.com/dmlc/dlpack.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "4d49691f1a9d944c3b0aa5e63f1db3cad1f941f8",
- "repositoryUrl": "https://github.com/dmlc/dmlc-core.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "7de7e5d02bf687f971e7668963649728356e0c20",
- "repositoryUrl": "https://github.com/intel/mkl-dnn.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "d860915b0198ddb96f93e9e97a789af156544dc6",
- "repositoryUrl": "https://github.com/tensorflow/tensorflow.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "eddf9023206dc40974c26f589ee2ad63a4227a1e",
- "repositoryUrl": "https://github.com/glennrp/libpng.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "217f52fb121ef92491e5d5f71394b07ce4ead1d0",
- "repositoryUrl": "https://github.com/KjellKod/g3log.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "50893291621658f355bc5b4d450a8d06a563053d",
- "repositoryUrl": "https://github.com/madler/zlib.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "d264a2603493fecda607c1d1cda87fedba77d36b",
- "repositoryUrl": "https://github.com/Microsoft/CNTK.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "971e2e89d08deeae0139d3011d15646fdac13c92",
- "repositoryUrl": "https://github.com/numpy/numpy.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "90537289a04ef5d572496240e2ac3a881be518d2",
- "repositoryUrl": "https://github.com/pytorch/pytorch.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "b31f58de6fa8bbda5353b3c77d9be4914399724d",
- "repositoryUrl": "https://github.com/pytorch/pytorch.git"
- },
- "comments": "pytorch 1.6 used by onnxruntime training image"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "7389dbac82d362f296dc2746f10e43ffa1615660",
- "repositoryUrl": "https://github.com/scikit-learn/scikit-learn.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "eeebdab16155d34ff8f5f42137da7df4d1c7eab0",
- "repositoryUrl": "https://github.com/BVLC/caffe.git"
- }
- }
- },
- {
- "component": {
- "Type": "other",
- "Other": {
- "Name": "LLVM",
- "Version": "9.0.0",
- "DownloadUrl": "https://releases.llvm.org/9.0.0/llvm-9.0.0.src.tar.xz"
- }
- }
- },
- {
- "component": {
- "Type": "other",
- "Other": {
- "Name": "FreeBSD GetOpt",
- "Version": "12.0.0",
- "DownloadUrl": "https://svnweb.freebsd.org/base/release/12.0.0/lib/libc/stdlib/getopt.c?revision=341707&view=co"
- }
- }
- },
- {
- "component": {
- "Type": "other",
- "Other": {
- "Name": "Boost",
- "Version": "1.69.0",
- "DownloadUrl": "https://boostorg.jfrog.io/artifactory/main/release/1.69.0/source/boost_1_69_0.tar.bz2"
- }
- }
- },
- {
- "component": {
- "git": {
- "commitHash": "02a2a458ac15912d7d87cc1171e811b0c5219ece",
- "repositoryUrl": "https://github.com/grpc/grpc"
- },
- "type": "git"
- }
- },
- {
- "component": {
- "git": {
- "commitHash": "b29b21a81b32ec273f118f589f46d56ad3332420",
- "repositoryUrl": "https://github.com/google/boringssl.git"
- },
- "type": "git"
- }
- },
- {
- "component": {
- "git": {
- "commitHash": "3be1924221e1326df520f8498d704a5c4c8d0cce",
- "repositoryUrl": "https://github.com/c-ares/c-ares.git"
- },
- "type": "git"
- }
- },
- {
- "component": {
- "git": {
- "commitHash": "6599cac0965be8e5a835ab7a5684bbef033d5ad0",
- "repositoryUrl": "https://github.com/llvm-mirror/libcxx.git"
- },
- "type": "git"
- }
- },
- {
- "component": {
- "git": {
- "commitHash": "9245d481eb3e890f708ff2d7dadf2a10c04748ba",
- "repositoryUrl": "https://github.com/llvm-mirror/libcxxabi.git"
- },
- "type": "git"
- }
- },
- {
- "component": {
- "git": {
- "commitHash": "9ce4a77f61c134bbed28bfd5be5cd7dc0e80f5e3",
- "repositoryUrl": "https://github.com/google/upb.git"
- },
- "type": "git"
- }
- },
- {
- "component": {
- "type": "other",
- "Other": {
- "Name": "Go",
- "Version": "1.12.6",
- "DownloadUrl": "https://dl.google.com/go/go1.12.6.linux-amd64.tar.gz"
- }
- }
- },
- {
- "component": {
- "Type": "other",
- "Other": {
- "Name": "OpenMPI",
- "Version": "4.0.0",
- "DownloadUrl": "https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.0.tar.gz"
- }
- }
- },
- {
- "component": {
- "Type": "other",
- "Other": {
- "Name": "OpenMPI",
- "Version": "4.0.4",
- "DownloadUrl": "https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.4.tar.gz"
- },
- "comments": "openmpi 4.0.4 used by onnxruntime training image"
- }
- },
- {
- "component": {
- "Type": "git",
- "git": {
- "commitHash": "7db3f9c741d3dfd8dda14ffb537ed251280d2025",
- "repositoryUrl": "https://github.com/mpi4py/mpi4py"
- },
- "comments": "mpi4py 3.0.3 used by onnxruntime training image"
- }
- },
- {
- "component": {
- "Type": "other",
- "Other": {
- "Name": "NCCL",
- "Version": "2.4.8",
- "DownloadUrl": "https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "67afac65ce64fd4dce1494f43e565e8fe34bdffb",
- "repositoryUrl": "https://android.googlesource.com/platform/frameworks/ml"
- },
- "comments": "used by onnxruntime"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "c30b7da2301202da5f9f0529966944f110e5d6e7",
- "repositoryUrl": "https://github.com/openucx/ucx"
- },
- "comments": "middleware between IB verbs and OpenMPI used by onnxruntime training image"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "63d1e08e64e7e09408eb63cd8dd7c65ad766f277",
- "repositoryUrl": "https://github.com/nodejs/node"
- },
- "comments": "For Nodejs binding"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "aead4d751c2101e23336aa73f2380df83e7a13f3",
- "repositoryUrl": "https://github.com/pypa/manylinux"
- },
- "comments": "For building our CI build docker image"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "c974557598645360fbabac71352b083117e3cc17",
- "repositoryUrl": "https://gitlab.kitware.com/cmake/cmake"
- },
- "comments": "CMake 3.24.3. For building our CI build docker image"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "1e5d33e9b9b8631b36f061103a30208b206fd03a",
- "repositoryUrl": "https://github.com/python/cpython"
- },
- "comments": "Python 3.9.1"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "6503f05dd59e26a9986bdea097b3da9b3546f45b",
- "repositoryUrl": "https://github.com/python/cpython"
- },
- "comments": "Python 3.8.7"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "13c94747c74437e594b7fc242ff7da668e81887c",
- "repositoryUrl": "https://github.com/python/cpython"
- },
- "comments": "Python 3.7.9"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "c0a9afe2ac1820409e6173bd1893ebee2cf50270",
- "repositoryUrl": "https://github.com/python/cpython"
- },
- "comments": "Python 3.6.12"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "426b022776672fdf3d71ddd98d89af341c88080f",
- "repositoryUrl": "https://github.com/python/cpython"
- },
- "comments": "Python 3.5.10"
- }
- },
- {
- "component": {
- "type": "pip",
- "pip": {
- "Name": "transformers",
- "Version": "4.38.0"
- },
- "comments": "Installed in the training docker image"
- }
- },
- {
- "component": {
- "type": "pip",
- "pip": {
- "Name": "msgpack",
- "Version": "1.0.0"
- },
- "comments": "Installed in the training docker image"
- }
- },
- {
- "component": {
- "type": "pip",
- "pip": {
- "Name": "tensorboardX",
- "Version": "1.8"
- },
- "comments": "Installed in the training docker image"
- }
- },
- {
- "component": {
- "type": "pip",
- "pip": {
- "Name": "tensorboard",
- "Version": "2.3.0"
- },
- "comments": "Installed in the training docker image"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "92cf3702fcfaadc84eb7bef59825a23e0cd84f56",
- "repositoryUrl": "https://github.com/aappleby/smhasher"
- },
- "comments": "MurmurHash3"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "b89da3c5a0aa18fb2c6163ad9984f81ab65b22e3",
- "repositoryUrl": "https://github.com/mestevens/gtest-ios-framework"
- },
- "comments": "gtest-ios-framework"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "277508879878e0a5b5b43599b1bea11f66eb3c6c",
- "repositoryUrl": "https://github.com/dmlc/dlpack.git"
- },
- "comments": "dlpack"
- }
- },
- {
- "component": {
- "Type": "other",
- "Other": {
- "Name": "SQLite3",
- "Version": "3.22.0",
- "DownloadUrl": "http://security.ubuntu.com/ubuntu/pool/main/s/sqlite3/libsqlite3-dev_3.22.0-1ubuntu0.4_amd64.deb"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "9d0ef119d9fcb9139f831adc224857b791c81140",
- "repositoryUrl": "https://github.com/dlfcn-win32/dlfcn-win32.git"
- },
- "comments": "dlfcn-win32"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "6812205f18ca4ef54372e87e1a13ce4a859434df",
- "repositoryUrl": "https://github.com/python-pillow/Pillow.git"
- },
- "comments": "python-pillow. Implementation logic for anti-aliasing copied by Resize CPU kernel."
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "e7248b26a1ed53fa030c5c459f7ea095dfd276ac",
- "repositoryUrl": "https://gitlab.com/libeigen/eigen.git"
- }
- }
- }
- ],
- "Version": 1
+ "$schema": "https://json.schemastore.org/component-detection-manifest.json",
+ "Registrations": [
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "215105818dfde3174fe799600bb0f3cae233d0bf",
+ "repositoryUrl": "https://github.com/abseil/abseil-cpp.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "Type": "maven",
+ "maven": {
+ "GroupId": "org.junit.platform",
+ "ArtifactId": "junit-platform-console-standalone",
+ "Version": "1.6.2"
+ },
+ "DevelopmentDependency": true
+ }
+ },
+ {
+ "component": {
+ "Type": "maven",
+ "maven": {
+ "GroupId": "com.google.protobuf",
+ "ArtifactId": "protobuf-java",
+ "Version": "3.21.7"
+ },
+ "DevelopmentDependency": true
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "e4a4c02764d37c9c3db0d64c4996651a3ef9513c",
+ "repositoryUrl": "https://github.com/dmlc/HalideIR.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "bee4d1dd8dc1ee4a1fd8fa6a96476c2f8b7492a3",
+ "repositoryUrl": "https://github.com/dmlc/dlpack.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "4d49691f1a9d944c3b0aa5e63f1db3cad1f941f8",
+ "repositoryUrl": "https://github.com/dmlc/dmlc-core.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "7de7e5d02bf687f971e7668963649728356e0c20",
+ "repositoryUrl": "https://github.com/intel/mkl-dnn.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "d860915b0198ddb96f93e9e97a789af156544dc6",
+ "repositoryUrl": "https://github.com/tensorflow/tensorflow.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "eddf9023206dc40974c26f589ee2ad63a4227a1e",
+ "repositoryUrl": "https://github.com/glennrp/libpng.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "217f52fb121ef92491e5d5f71394b07ce4ead1d0",
+ "repositoryUrl": "https://github.com/KjellKod/g3log.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "50893291621658f355bc5b4d450a8d06a563053d",
+ "repositoryUrl": "https://github.com/madler/zlib.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "d264a2603493fecda607c1d1cda87fedba77d36b",
+ "repositoryUrl": "https://github.com/Microsoft/CNTK.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "971e2e89d08deeae0139d3011d15646fdac13c92",
+ "repositoryUrl": "https://github.com/numpy/numpy.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "90537289a04ef5d572496240e2ac3a881be518d2",
+ "repositoryUrl": "https://github.com/pytorch/pytorch.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "b31f58de6fa8bbda5353b3c77d9be4914399724d",
+ "repositoryUrl": "https://github.com/pytorch/pytorch.git"
+ },
+ "comments": "pytorch 1.6 used by onnxruntime training image"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "7389dbac82d362f296dc2746f10e43ffa1615660",
+ "repositoryUrl": "https://github.com/scikit-learn/scikit-learn.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "eeebdab16155d34ff8f5f42137da7df4d1c7eab0",
+ "repositoryUrl": "https://github.com/BVLC/caffe.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "Type": "other",
+ "Other": {
+ "Name": "LLVM",
+ "Version": "9.0.0",
+ "DownloadUrl": "https://releases.llvm.org/9.0.0/llvm-9.0.0.src.tar.xz"
+ }
+ }
+ },
+ {
+ "component": {
+ "Type": "other",
+ "Other": {
+ "Name": "FreeBSD GetOpt",
+ "Version": "12.0.0",
+ "DownloadUrl": "https://svnweb.freebsd.org/base/release/12.0.0/lib/libc/stdlib/getopt.c?revision=341707&view=co"
+ }
+ }
+ },
+ {
+ "component": {
+ "Type": "other",
+ "Other": {
+ "Name": "Boost",
+ "Version": "1.69.0",
+ "DownloadUrl": "https://boostorg.jfrog.io/artifactory/main/release/1.69.0/source/boost_1_69_0.tar.bz2"
+ }
+ }
+ },
+ {
+ "component": {
+ "git": {
+ "commitHash": "02a2a458ac15912d7d87cc1171e811b0c5219ece",
+ "repositoryUrl": "https://github.com/grpc/grpc"
+ },
+ "type": "git"
+ }
+ },
+ {
+ "component": {
+ "git": {
+ "commitHash": "b29b21a81b32ec273f118f589f46d56ad3332420",
+ "repositoryUrl": "https://github.com/google/boringssl.git"
+ },
+ "type": "git"
+ }
+ },
+ {
+ "component": {
+ "git": {
+ "commitHash": "3be1924221e1326df520f8498d704a5c4c8d0cce",
+ "repositoryUrl": "https://github.com/c-ares/c-ares.git"
+ },
+ "type": "git"
+ }
+ },
+ {
+ "component": {
+ "git": {
+ "commitHash": "6599cac0965be8e5a835ab7a5684bbef033d5ad0",
+ "repositoryUrl": "https://github.com/llvm-mirror/libcxx.git"
+ },
+ "type": "git"
+ }
+ },
+ {
+ "component": {
+ "git": {
+ "commitHash": "9245d481eb3e890f708ff2d7dadf2a10c04748ba",
+ "repositoryUrl": "https://github.com/llvm-mirror/libcxxabi.git"
+ },
+ "type": "git"
+ }
+ },
+ {
+ "component": {
+ "git": {
+ "commitHash": "9ce4a77f61c134bbed28bfd5be5cd7dc0e80f5e3",
+ "repositoryUrl": "https://github.com/google/upb.git"
+ },
+ "type": "git"
+ }
+ },
+ {
+ "component": {
+ "type": "other",
+ "Other": {
+ "Name": "Go",
+ "Version": "1.12.6",
+ "DownloadUrl": "https://dl.google.com/go/go1.12.6.linux-amd64.tar.gz"
+ }
+ }
+ },
+ {
+ "component": {
+ "Type": "other",
+ "Other": {
+ "Name": "OpenMPI",
+ "Version": "4.0.0",
+ "DownloadUrl": "https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.0.tar.gz"
+ }
+ }
+ },
+ {
+ "component": {
+ "Type": "other",
+ "Other": {
+ "Name": "OpenMPI",
+ "Version": "4.0.4",
+ "DownloadUrl": "https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.4.tar.gz"
+ },
+ "comments": "openmpi 4.0.4 used by onnxruntime training image"
+ }
+ },
+ {
+ "component": {
+ "Type": "git",
+ "git": {
+ "commitHash": "7db3f9c741d3dfd8dda14ffb537ed251280d2025",
+ "repositoryUrl": "https://github.com/mpi4py/mpi4py"
+ },
+ "comments": "mpi4py 3.0.3 used by onnxruntime training image"
+ }
+ },
+ {
+ "component": {
+ "Type": "other",
+ "Other": {
+ "Name": "NCCL",
+ "Version": "2.4.8",
+ "DownloadUrl": "https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "67afac65ce64fd4dce1494f43e565e8fe34bdffb",
+ "repositoryUrl": "https://android.googlesource.com/platform/frameworks/ml"
+ },
+ "comments": "used by onnxruntime"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "c30b7da2301202da5f9f0529966944f110e5d6e7",
+ "repositoryUrl": "https://github.com/openucx/ucx"
+ },
+ "comments": "middleware between IB verbs and OpenMPI used by onnxruntime training image"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "63d1e08e64e7e09408eb63cd8dd7c65ad766f277",
+ "repositoryUrl": "https://github.com/nodejs/node"
+ },
+ "comments": "For Nodejs binding"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "aead4d751c2101e23336aa73f2380df83e7a13f3",
+ "repositoryUrl": "https://github.com/pypa/manylinux"
+ },
+ "comments": "For building our CI build docker image"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "c974557598645360fbabac71352b083117e3cc17",
+ "repositoryUrl": "https://gitlab.kitware.com/cmake/cmake"
+ },
+ "comments": "CMake 3.24.3. For building our CI build docker image"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "1e5d33e9b9b8631b36f061103a30208b206fd03a",
+ "repositoryUrl": "https://github.com/python/cpython"
+ },
+ "comments": "Python 3.9.1"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "6503f05dd59e26a9986bdea097b3da9b3546f45b",
+ "repositoryUrl": "https://github.com/python/cpython"
+ },
+ "comments": "Python 3.8.7"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "13c94747c74437e594b7fc242ff7da668e81887c",
+ "repositoryUrl": "https://github.com/python/cpython"
+ },
+ "comments": "Python 3.7.9"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "c0a9afe2ac1820409e6173bd1893ebee2cf50270",
+ "repositoryUrl": "https://github.com/python/cpython"
+ },
+ "comments": "Python 3.6.12"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "426b022776672fdf3d71ddd98d89af341c88080f",
+ "repositoryUrl": "https://github.com/python/cpython"
+ },
+ "comments": "Python 3.5.10"
+ }
+ },
+ {
+ "component": {
+ "type": "pip",
+ "pip": {
+ "Name": "transformers",
+ "Version": "4.38.0"
+ },
+ "comments": "Installed in the training docker image"
+ }
+ },
+ {
+ "component": {
+ "type": "pip",
+ "pip": {
+ "Name": "msgpack",
+ "Version": "1.0.0"
+ },
+ "comments": "Installed in the training docker image"
+ }
+ },
+ {
+ "component": {
+ "type": "pip",
+ "pip": {
+ "Name": "tensorboardX",
+ "Version": "1.8"
+ },
+ "comments": "Installed in the training docker image"
+ }
+ },
+ {
+ "component": {
+ "type": "pip",
+ "pip": {
+ "Name": "tensorboard",
+ "Version": "2.3.0"
+ },
+ "comments": "Installed in the training docker image"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "92cf3702fcfaadc84eb7bef59825a23e0cd84f56",
+ "repositoryUrl": "https://github.com/aappleby/smhasher"
+ },
+ "comments": "MurmurHash3"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "b89da3c5a0aa18fb2c6163ad9984f81ab65b22e3",
+ "repositoryUrl": "https://github.com/mestevens/gtest-ios-framework"
+ },
+ "comments": "gtest-ios-framework"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "277508879878e0a5b5b43599b1bea11f66eb3c6c",
+ "repositoryUrl": "https://github.com/dmlc/dlpack.git"
+ },
+ "comments": "dlpack"
+ }
+ },
+ {
+ "component": {
+ "Type": "other",
+ "Other": {
+ "Name": "SQLite3",
+ "Version": "3.22.0",
+ "DownloadUrl": "http://security.ubuntu.com/ubuntu/pool/main/s/sqlite3/libsqlite3-dev_3.22.0-1ubuntu0.4_amd64.deb"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "9d0ef119d9fcb9139f831adc224857b791c81140",
+ "repositoryUrl": "https://github.com/dlfcn-win32/dlfcn-win32.git"
+ },
+ "comments": "dlfcn-win32"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "6812205f18ca4ef54372e87e1a13ce4a859434df",
+ "repositoryUrl": "https://github.com/python-pillow/Pillow.git"
+ },
+ "comments": "python-pillow. Implementation logic for anti-aliasing copied by Resize CPU kernel."
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "e7248b26a1ed53fa030c5c459f7ea095dfd276ac",
+ "repositoryUrl": "https://gitlab.com/libeigen/eigen.git"
+ }
+ }
+ }
+ ],
+ "Version": 1
}
diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json
index df27fa5ab1b95..07dff50f9a3bd 100644
--- a/cgmanifests/generated/cgmanifest.json
+++ b/cgmanifests/generated/cgmanifest.json
@@ -346,7 +346,7 @@
"component": {
"type": "git",
"git": {
- "commitHash": "511eb80847afe6bded34ec491a38d5d78ba2d604",
+ "commitHash": "12a3b24c456cebd9fd11f23ac0164f78129b00c6",
"repositoryUrl": "https://github.com/google/dawn.git"
},
"comments": "dawn"
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 31ebf58b03152..7710ab2f4cac7 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -102,7 +102,6 @@ option(onnxruntime_BUILD_CSHARP "Build C# library" OFF)
option(onnxruntime_BUILD_OBJC "Build Objective-C library" OFF)
option(onnxruntime_USE_PREINSTALLED_EIGEN "Use pre-installed EIGEN. Need to provide eigen_SOURCE_PATH if turn this on." OFF)
option(onnxruntime_BUILD_BENCHMARKS "Build ONNXRuntime micro-benchmarks" OFF)
-option(onnxruntime_USE_LLVM "Build TVM with LLVM" OFF)
option(onnxruntime_USE_VSINPU "Build with VSINPU support" OFF)
cmake_dependent_option(onnxruntime_USE_FLASH_ATTENTION "Build flash attention kernel for scaled dot product attention" ON "onnxruntime_USE_CUDA" OFF)
@@ -145,14 +144,11 @@ option(onnxruntime_USE_TELEMETRY "Build with Telemetry" OFF)
cmake_dependent_option(onnxruntime_USE_MIMALLOC "Override new/delete and arena allocator with mimalloc" OFF "WIN32;NOT onnxruntime_USE_CUDA;NOT onnxruntime_USE_OPENVINO" OFF)
option(onnxruntime_USE_CANN "Build with CANN support" OFF)
option(onnxruntime_USE_ROCM "Build with AMD GPU support" OFF)
-option(onnxruntime_USE_TVM "Build with TVM support" OFF)
-option(onnxruntime_TVM_CUDA_RUNTIME "Build TVM with CUDA support" OFF)
-option(onnxruntime_TVM_USE_LLVM "Build TVM with LLVM. Set customized path to llvm-config.exe here if need" OFF)
-option(onnxruntime_TVM_USE_HASH "Build ipp-crypto library for support hash algorithm. It is defined for TVM only")
option(onnxruntime_USE_XNNPACK "Build with XNNPACK support. Provides an alternative math library on ARM, WebAssembly and x86." OFF)
option(onnxruntime_USE_WEBNN "Build with WebNN support. Enable hardware acceleration in web browsers." OFF)
option(onnxruntime_USE_WEBGPU "Build with WebGPU support. Enable WebGPU via C/C++ interface." OFF)
option(onnxruntime_USE_EXTERNAL_DAWN "Build with treating Dawn as external dependency. Will not link Dawn at build time." OFF)
+option(onnxruntime_CUSTOM_DAWN_SRC_PATH "Path to custom Dawn src dir.")
# Options related to reducing the binary size produced by the build
# XNNPACK EP requires the internal NHWC contrib ops to be available, so this option must be OFF when onnxruntime_USE_XNNPACK is ON
@@ -257,6 +253,7 @@ cmake_dependent_option(MSVC_Z7_OVERRIDE "replacing /Zi and /ZI with /Z7 when usi
option(onnxruntime_USE_AZURE "Build with azure inferencing support" OFF)
option(onnxruntime_USE_LOCK_FREE_QUEUE "Build with lock-free task queue for threadpool." OFF)
+option(onnxruntime_FORCE_GENERIC_ALGORITHMS "Disable optimized arch-specific algorithms. Use only for testing and debugging generic algorithms." OFF)
# ENABLE_TRAINING includes all training functionality
# The following 2 entry points
@@ -906,11 +903,6 @@ if (onnxruntime_USE_SNPE)
list(APPEND ONNXRUNTIME_PROVIDER_NAMES snpe)
list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_SNPE=1)
endif()
-if (onnxruntime_USE_TVM)
- list(APPEND ORT_PROVIDER_FLAGS -DUSE_TVM=1)
- list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_TVM=1)
- list(APPEND ONNXRUNTIME_PROVIDER_NAMES tvm)
-endif()
if (onnxruntime_USE_WINML)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_WINML=1)
list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_WINML=1)
@@ -981,6 +973,10 @@ if (onnxruntime_USE_LOCK_FREE_QUEUE)
add_compile_definitions(USE_LOCK_FREE_QUEUE)
endif()
+if (onnxruntime_FORCE_GENERIC_ALGORITHMS)
+ add_compile_definitions(FORCE_GENERIC_ALGORITHMS)
+endif()
+
if (onnxruntime_ENABLE_LAZY_TENSOR)
# To support LazyTensor, ORT needs to call Python function from C/C++.
# so onnxruntime_ENABLE_PYTHON is required.
@@ -1313,50 +1309,6 @@ if (onnxruntime_USE_DNNL)
add_compile_definitions(DNNL_OPENMP)
endif()
-# TVM EP
-if (onnxruntime_USE_TVM)
- if (NOT TARGET tvm)
- message(STATUS "Include TVM(*).")
- include(tvm)
- endif()
-
- # ipp-crypto
- if (onnxruntime_TVM_USE_HASH)
- message(STATUS "Include ipp-crypto(*).")
- include(ipp-crypto)
- endif()
-
- # TVM
- if (onnxruntime_TVM_USE_LLVM)
- set(USE_LLVM "${onnxruntime_TVM_USE_LLVM}" CACHE STRING "Path to LLVM for correct TVM build")
- elseif(onnxruntime_USE_LLVM)
- set(USE_LLVM ON CACHE BOOL "Only defined for TVM")
- endif()
-
- if (onnxruntime_TVM_CUDA_RUNTIME)
- set(USE_CUDA ON CACHE BOOL "Only defined for TVM" FORCE)
- endif()
-
- # TODO(vvchernov): customized tvm logger is hidden due to the issue on TVM side (https://github.com/apache/tvm/issues/10139)
- # add_compile_definitions(TVM_LOG_CUSTOMIZE=1)
- # add_library(tvm_custom_logger STATIC ${ONNXRUNTIME_ROOT}/core/providers/tvm/custom_logging.cc)
-
- set(USE_OPENMP gnu CACHE STRING "Only defined for TVM")
- add_subdirectory(${tvm_SOURCE_DIR} ${tvm_BINARY_DIR} EXCLUDE_FROM_ALL)
-
- set_target_properties(tvm PROPERTIES FOLDER ${tvm_SOURCE_DIR})
- # target_link_libraries(tvm PUBLIC tvm_custom_logger)
-
- set(TVM_INCLUDES ${tvm_SOURCE_DIR}/include
- ${tvm_SOURCE_DIR}/3rdparty/dmlc-core/include
- ${tvm_SOURCE_DIR}/3rdparty/dlpack/include
- $<TARGET_PROPERTY:tvm,INTERFACE_INCLUDE_DIRECTORIES>)
-
- set(onnxruntime_tvm_libs onnxruntime_providers_tvm)
- list(APPEND onnxruntime_EXTERNAL_LIBRARIES tvm)
- list(APPEND onnxruntime_EXTERNAL_DEPENDENCIES tvm)
-endif()
-
# onnxruntime-extensions
if (onnxruntime_USE_EXTENSIONS)
include(extensions)
diff --git a/cmake/deps.txt b/cmake/deps.txt
index 9cf92bf417fcb..21f9ee1701c46 100644
--- a/cmake/deps.txt
+++ b/cmake/deps.txt
@@ -58,5 +58,5 @@ extensions;https://github.com/microsoft/onnxruntime-extensions/archive/94142d839
composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/204da9c522cebec5220bba52cd3542ebcaf99e7a.zip;1827348efd47831c13074245274d41b7cae8a557
directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.7.0.zip;d0753d8d5b39947ca0729d7773cb84653a129eb1
-dawn;https://github.com/google/dawn/archive/511eb80847afe6bded34ec491a38d5d78ba2d604.zip;c493f5aca5586f6634e25d0121c85df71189fb99
+dawn;https://github.com/google/dawn/archive/12a3b24c456cebd9fd11f23ac0164f78129b00c6.zip;ad428f6dc16f1336d584f7bad5714e1097dafc43
kleidiai;https://gitlab.arm.com/kleidi/kleidiai/-/archive/v0.2.0/kleidiai-v0.2.0.zip;B1E3173992FD91F20DB904AB77D6E901778C2681
diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
index d9e833a2d8cd4..ee7abcbad025c 100644
--- a/cmake/external/onnxruntime_external_deps.cmake
+++ b/cmake/external/onnxruntime_external_deps.cmake
@@ -615,12 +615,25 @@ if (onnxruntime_USE_COREML)
endif()
if (onnxruntime_USE_WEBGPU)
- FetchContent_Declare(
- dawn
- URL ${DEP_URL_dawn}
- URL_HASH SHA1=${DEP_SHA1_dawn}
- PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/dawn.patch
- )
+ if (onnxruntime_CUSTOM_DAWN_SRC_PATH)
+ # use the custom dawn source path if provided
+ #
+ # specified as:
+ # build.py --use_webgpu --cmake_extra_defines "onnxruntime_CUSTOM_DAWN_SRC_PATH=<path_to_dawn_src>"
+ FetchContent_Declare(
+ dawn
+ SOURCE_DIR ${onnxruntime_CUSTOM_DAWN_SRC_PATH}
+ )
+ else()
+ FetchContent_Declare(
+ dawn
+ URL ${DEP_URL_dawn}
+ URL_HASH SHA1=${DEP_SHA1_dawn}
+ # All previous patches are merged into the upstream dawn project. We don't need to apply any patches right now.
+ # if we need to apply patches in the future, we can uncomment the following line.
+ # PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/dawn.patch
+ )
+ endif()
# use dawn::dawn_native and dawn::dawn_proc instead of the monolithic dawn::webgpu_dawn to minimize binary size
set(DAWN_BUILD_MONOLITHIC_LIBRARY OFF CACHE BOOL "" FORCE)
diff --git a/cmake/external/tvm.cmake b/cmake/external/tvm.cmake
deleted file mode 100644
index 93049c8b85853..0000000000000
--- a/cmake/external/tvm.cmake
+++ /dev/null
@@ -1,24 +0,0 @@
-if (onnxruntime_USE_TVM)
- message(STATUS "onnxruntime_USE_TVM: Fetch tvm for TVM EP")
-
- FetchContent_Declare(
- tvm
- GIT_REPOSITORY https://github.com/apache/tvm.git
- GIT_TAG 2379917985919ed3918dc12cad47f469f245be7a
- )
-
- FetchContent_GetProperties(tvm)
- if(NOT tvm_POPULATED)
- FetchContent_Populate(tvm)
- if (WIN32)
- execute_process(
- COMMAND ${CMAKE_COMMAND} -E create_symlink ${tvm_BINARY_DIR}/${CMAKE_BUILD_TYPE} ${tvm_SOURCE_DIR}/build
- )
- else()
- file(CREATE_LINK ${tvm_BINARY_DIR} ${tvm_SOURCE_DIR}/build SYMBOLIC)
- endif()
- endif()
-
- set(tvm_INCLUDE_DIRS ${tvm_SOURCE_DIR}/include)
-
-endif()
diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake
index 1278bb7dc9e7e..732c0511d400f 100644
--- a/cmake/onnxruntime.cmake
+++ b/cmake/onnxruntime.cmake
@@ -123,7 +123,11 @@ else()
onnxruntime_add_shared_library(onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/generated_source.c )
endif()
if(NOT APPLE)
- target_link_options(onnxruntime PRIVATE "LINKER:-rpath=\$ORIGIN")
+ include(CheckLinkerFlag)
+ check_linker_flag(CXX "LINKER:-rpath=\$ORIGIN" LINKER_SUPPORT_RPATH)
+ if(LINKER_SUPPORT_RPATH)
+ target_link_options(onnxruntime PRIVATE "LINKER:-rpath=\$ORIGIN")
+ endif()
endif()
endif()
@@ -206,7 +210,6 @@ set(onnxruntime_INTERNAL_LIBRARIES
${PROVIDERS_NNAPI}
${PROVIDERS_QNN}
${PROVIDERS_SNPE}
- ${PROVIDERS_TVM}
${PROVIDERS_RKNPU}
${PROVIDERS_VSINPU}
${PROVIDERS_XNNPACK}
@@ -217,7 +220,6 @@ set(onnxruntime_INTERNAL_LIBRARIES
${onnxruntime_winml}
onnxruntime_optimizer
onnxruntime_providers
- ${onnxruntime_tvm_libs}
onnxruntime_lora
onnxruntime_framework
onnxruntime_graph
diff --git a/cmake/onnxruntime_codegen_tvm.cmake b/cmake/onnxruntime_codegen_tvm.cmake
deleted file mode 100644
index 7b50d8f8603ae..0000000000000
--- a/cmake/onnxruntime_codegen_tvm.cmake
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-
-file(GLOB_RECURSE onnxruntime_codegen_common_srcs
- "${ONNXRUNTIME_ROOT}/core/codegen/common/*.h"
- "${ONNXRUNTIME_ROOT}/core/codegen/common/*.cc"
-)
-
-file(GLOB_RECURSE onnxruntime_codegen_tvm_srcs CONFIGURE_DEPENDS
- "${ONNXRUNTIME_ROOT}/core/codegen/mti/*.h"
- "${ONNXRUNTIME_ROOT}/core/codegen/mti/*.cc"
- "${ONNXRUNTIME_ROOT}/core/codegen/passes/*.h"
- "${ONNXRUNTIME_ROOT}/core/codegen/passes/*.cc"
-)
-
-source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_codegen_common_srcs} ${onnxruntime_codegen_tvm_srcs})
-
-#onnxruntime_codegen_tvm depends on onnxruntime framework
-onnxruntime_add_static_library(onnxruntime_codegen_tvm ${onnxruntime_codegen_common_srcs} ${onnxruntime_codegen_tvm_srcs})
-set_target_properties(onnxruntime_codegen_tvm PROPERTIES FOLDER "ONNXRuntime")
-target_include_directories(onnxruntime_codegen_tvm PRIVATE ${ONNXRUNTIME_ROOT} ${TVM_INCLUDES} ${MKLML_INCLUDE_DIR} ${eigen_INCLUDE_DIRS})
-onnxruntime_add_include_to_target(onnxruntime_codegen_tvm onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers::flatbuffers safeint_interface Boost::mp11)
-target_compile_options(onnxruntime_codegen_tvm PRIVATE ${DISABLED_WARNINGS_FOR_TVM})
-# need onnx to build to create headers that this project includes
-add_dependencies(onnxruntime_codegen_tvm ${onnxruntime_EXTERNAL_DEPENDENCIES})
diff --git a/cmake/onnxruntime_csharp.cmake b/cmake/onnxruntime_csharp.cmake
index 22c993d07f7f9..39533429e181c 100644
--- a/cmake/onnxruntime_csharp.cmake
+++ b/cmake/onnxruntime_csharp.cmake
@@ -30,10 +30,6 @@ if (onnxruntime_USE_NNAPI_BUILTIN)
STRING(APPEND CSHARP_PREPROCESSOR_DEFINES "USE_NNAPI;")
endif()
-if (onnxruntime_USE_TVM)
- STRING(APPEND CSHARP_PREPROCESSOR_DEFINES "USE_TVM,")
-endif()
-
if (onnxruntime_USE_OPENVINO)
STRING(APPEND CSHARP_PREPROCESSOR_DEFINES "USE_OPENVINO;")
endif()
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
index 20bb1fb772189..5124262ec0004 100644
--- a/cmake/onnxruntime_mlas.cmake
+++ b/cmake/onnxruntime_mlas.cmake
@@ -36,11 +36,13 @@ onnxruntime_add_static_library(onnxruntime_mlas
${MLAS_SRC_DIR}/qpostprocessor.cpp
${MLAS_SRC_DIR}/qlgavgpool.cpp
${MLAS_SRC_DIR}/qdwconv_kernelsize.cpp
- ${MLAS_SRC_DIR}/sqnbitgemm.h
- ${MLAS_SRC_DIR}/sqnbitgemm.cpp
+ ${MLAS_SRC_DIR}/qnbitgemm.h
+ ${MLAS_SRC_DIR}/qnbitgemm.cpp
${MLAS_SRC_DIR}/sqnbitgemm_q8_block.h
${MLAS_SRC_DIR}/flashattn.cpp
${MLAS_SRC_DIR}/cast.cpp
+ ${MLAS_SRC_DIR}/rotary_embedding.h
+ ${MLAS_SRC_DIR}/rotary_embedding.cpp
)
target_sources(onnxruntime_mlas PRIVATE
@@ -84,11 +86,15 @@ function(setup_mlas_source_for_windows)
${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp
${MLAS_SRC_DIR}/qgemm_kernel_udot.cpp
${MLAS_SRC_DIR}/qgemm_kernel_sdot.cpp
- ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.h
- ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.cpp
+ ${MLAS_SRC_DIR}/qnbitgemm_kernel_neon.h
+ ${MLAS_SRC_DIR}/qnbitgemm_kernel_neon.cpp
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_fp32.cpp
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp
- ${MLAS_SRC_DIR}/fp16_neon_common.cpp
+ ${MLAS_SRC_DIR}/cast_kernel_neon.cpp
+ ${MLAS_SRC_DIR}/hqnbitgemm_kernel_neon_fp16.cpp
+ ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon.h
+ ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon.cpp
+ ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon_fp16.cpp
)
set(mlas_platform_preprocess_srcs
@@ -362,10 +368,12 @@ else()
${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp
${MLAS_SRC_DIR}/qgemm_kernel_udot.cpp
${MLAS_SRC_DIR}/qgemm_kernel_sdot.cpp
- ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.h
- ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.cpp
+ ${MLAS_SRC_DIR}/qnbitgemm_kernel_neon.h
+ ${MLAS_SRC_DIR}/qnbitgemm_kernel_neon.cpp
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_fp32.cpp
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp
+ ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon.h
+ ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon.cpp
)
set_source_files_properties(${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp
PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+dotprod")
@@ -383,7 +391,9 @@ else()
${MLAS_SRC_DIR}/qgemm_kernel_smmla.cpp
${MLAS_SRC_DIR}/qgemm_kernel_ummla.cpp
${MLAS_SRC_DIR}/sbgemm_kernel_neon.cpp
- ${MLAS_SRC_DIR}/fp16_neon_common.cpp
+ ${MLAS_SRC_DIR}/cast_kernel_neon.cpp
+ ${MLAS_SRC_DIR}/hqnbitgemm_kernel_neon_fp16.cpp
+ ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon_fp16.cpp
)
set_source_files_properties(${MLAS_SRC_DIR}/aarch64/HalfGemmKernelNeon.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
set_source_files_properties(${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelSmmla.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+i8mm ")
@@ -393,7 +403,9 @@ else()
set_source_files_properties(${MLAS_SRC_DIR}/dwconv.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
set_source_files_properties(${MLAS_SRC_DIR}/pooling_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
set_source_files_properties(${MLAS_SRC_DIR}/sbgemm_kernel_neon.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+bf16 ")
- set_source_files_properties(${MLAS_SRC_DIR}/fp16_neon_common.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
+ set_source_files_properties(${MLAS_SRC_DIR}/cast_kernel_neon.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
+ set_source_files_properties(${MLAS_SRC_DIR}/hqnbitgemm_kernel_neon_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
+ set_source_files_properties(${MLAS_SRC_DIR}/rotary_embedding_kernel_neon_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
endif()
if(ONNXRUNTIME_MLAS_MULTI_ARCH)
@@ -453,7 +465,6 @@ else()
bool HasP10 = ((hwcap2 & PPC_FEATURE2_MMA) && (hwcap2 & PPC_FEATURE2_ARCH_3_1));
return 0;
}
- }
#endif"
HAS_P10_RUNTIME
)
@@ -677,6 +688,13 @@ endif()
if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH AND MLAS_SOURCE_IS_NOT_SET)
file(GLOB_RECURSE mlas_platform_srcs
"${MLAS_SRC_DIR}/scalar/*.cpp")
+ elseif (onnxruntime_FORCE_GENERIC_ALGORITHMS)
+ file(GLOB_RECURSE mlas_platform_srcs_generic
+ "${MLAS_SRC_DIR}/scalar/*.cpp")
+ set(mlas_platform_srcs
+ ${mlas_platform_srcs}
+ ${mlas_platform_srcs_generic}
+ )
endif()
target_sources(onnxruntime_mlas PRIVATE ${mlas_platform_srcs})
endif()
diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake
index 9666877cdc206..582491de9503d 100644
--- a/cmake/onnxruntime_providers.cmake
+++ b/cmake/onnxruntime_providers.cmake
@@ -101,9 +101,6 @@ endif()
if(onnxruntime_USE_ROCM)
set(PROVIDERS_ROCM onnxruntime_providers_rocm)
endif()
-if (onnxruntime_USE_TVM)
- set(PROVIDERS_TVM onnxruntime_providers_tvm)
-endif()
if (onnxruntime_USE_XNNPACK)
set(PROVIDERS_XNNPACK onnxruntime_providers_xnnpack)
endif()
@@ -194,10 +191,6 @@ if (onnxruntime_USE_ROCM)
include(onnxruntime_providers_rocm.cmake)
endif()
-if (onnxruntime_USE_TVM)
- include(onnxruntime_providers_tvm.cmake)
-endif()
-
if (onnxruntime_USE_VSINPU)
include(onnxruntime_providers_vsinpu.cmake)
endif()
diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
index 39ad530146b33..4f86717026118 100644
--- a/cmake/onnxruntime_providers_cuda.cmake
+++ b/cmake/onnxruntime_providers_cuda.cmake
@@ -224,8 +224,7 @@
include(cutlass)
target_include_directories(${target} PRIVATE ${cutlass_SOURCE_DIR}/include ${cutlass_SOURCE_DIR}/examples ${cutlass_SOURCE_DIR}/tools/util/include)
- target_include_directories(${target} PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${TVM_INCLUDES}
- PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
+ target_include_directories(${target} PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
# ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found
set_target_properties(${target} PROPERTIES LINKER_LANGUAGE CUDA)
set_target_properties(${target} PROPERTIES FOLDER "ONNXRuntime")
diff --git a/cmake/onnxruntime_providers_openvino.cmake b/cmake/onnxruntime_providers_openvino.cmake
index 5dcee285a5b13..e500957f864f8 100644
--- a/cmake/onnxruntime_providers_openvino.cmake
+++ b/cmake/onnxruntime_providers_openvino.cmake
@@ -11,22 +11,22 @@
"${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc"
)
- if (WIN32)
- set(CMAKE_MAP_IMPORTED_CONFIG_RELWITHDEBINFO Release)
- endif()
-
# Header paths
find_package(OpenVINO REQUIRED COMPONENTS Runtime ONNX)
- if(OpenVINO_VERSION VERSION_LESS 2024.0)
- message(FATAL_ERROR "OpenVINO 2024.0 and newer are supported. Please, use latest OpenVINO release")
+ if(OpenVINO_VERSION VERSION_LESS 2024.3)
+ message(FATAL_ERROR "OpenVINO 2024.3 and newer are supported. Please, use latest OpenVINO release")
endif()
if(OpenVINO_VERSION VERSION_GREATER_EQUAL 2024.4)
add_definitions(-DUSE_OVEP_NPU_MEMORY=1)
endif()
- if (WIN32)
- unset(CMAKE_MAP_IMPORTED_CONFIG_RELWITHDEBINFO)
+ # If building RelWithDebInfo and the OV package does not have that configuration, map it to Release
+ get_target_property(ov_rt_implib_rwdi openvino::runtime IMPORTED_IMPLIB_RELWITHDEBINFO)
+ if ((CMAKE_BUILD_TYPE STREQUAL RelWithDebInfo) AND NOT ov_rt_implib_rwdi)
+ set_target_properties(openvino::runtime PROPERTIES
+ MAP_IMPORTED_CONFIG_RELWITHDEBINFO Release
+ )
endif()
list(APPEND OPENVINO_LIB_LIST openvino::frontend::onnx openvino::runtime ${PYTHON_LIBRARIES})
@@ -82,3 +82,8 @@
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()
+
+set_target_properties(onnxruntime_providers_openvino PROPERTIES
+ MAP_IMPORTED_CONFIG_RELEASE RelWithDebInfo
+ MAP_IMPORTED_CONFIG_DEBUG RelWithDebInfo
+ )
\ No newline at end of file
diff --git a/cmake/onnxruntime_providers_tvm.cmake b/cmake/onnxruntime_providers_tvm.cmake
deleted file mode 100644
index 8fd50c70dd5d7..0000000000000
--- a/cmake/onnxruntime_providers_tvm.cmake
+++ /dev/null
@@ -1,64 +0,0 @@
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-
- add_definitions(-DUSE_TVM=1)
- if (onnxruntime_TVM_USE_HASH)
- add_definitions(-DUSE_TVM_HASH=1)
- endif()
-
- if (onnxruntime_TVM_USE_HASH)
- file (GLOB_RECURSE onnxruntime_providers_tvm_cc_srcs CONFIGURE_DEPENDS
- "${ONNXRUNTIME_ROOT}/core/providers/tvm/*.h"
- "${ONNXRUNTIME_ROOT}/core/providers/tvm/*.cc"
- )
- else()
- file (GLOB onnxruntime_providers_tvm_cc_srcs CONFIGURE_DEPENDS
- "${ONNXRUNTIME_ROOT}/core/providers/tvm/*.h"
- "${ONNXRUNTIME_ROOT}/core/providers/tvm/*.cc"
- )
- endif()
-
- source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_tvm_cc_srcs})
- onnxruntime_add_static_library(onnxruntime_providers_tvm ${onnxruntime_providers_tvm_cc_srcs})
-
- if ( CMAKE_COMPILER_IS_GNUCC )
- target_compile_options(onnxruntime_providers_tvm PRIVATE -Wno-unused-parameter -Wno-missing-field-initializers)
- endif()
-
- target_include_directories(onnxruntime_providers_tvm PRIVATE
- ${TVM_INCLUDES}
- ${PYTHON_INCLUDE_DIRS})
- onnxruntime_add_include_to_target(onnxruntime_providers_tvm onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface)
-
- add_dependencies(onnxruntime_providers_tvm ${onnxruntime_EXTERNAL_DEPENDENCIES})
-
- if (onnxruntime_TVM_USE_HASH)
- add_dependencies(onnxruntime_providers_tvm ippcp_s)
- target_include_directories(onnxruntime_providers_tvm PRIVATE ${IPP_CRYPTO_INCLUDE_DIR})
- target_link_libraries(onnxruntime_providers_tvm PRIVATE ippcp_s)
- endif()
-
- set_target_properties(onnxruntime_providers_tvm PROPERTIES FOLDER "ONNXRuntime")
- set_target_properties(onnxruntime_providers_tvm PROPERTIES LINKER_LANGUAGE CXX)
-
- if (WIN32 AND MSVC)
- # wd4100: identifier' : unreferenced formal parameter
- # wd4127: conditional expression is constant
- # wd4244: conversion from 'int' to 'char', possible loss of data
- # TODO: 4244 should not be disabled
- target_compile_options(onnxruntime_providers_tvm PRIVATE "/wd4100" "/wd4127" "/wd4244")
- else()
- target_compile_options(onnxruntime_providers_tvm PRIVATE "-Wno-error=type-limits")
- endif()
- target_compile_definitions(onnxruntime_providers_tvm PUBLIC DMLC_USE_LOGGING_LIBRARY=)
-
- install(FILES ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/tvm/tvm_provider_factory.h
- DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/)
-
- if (NOT onnxruntime_BUILD_SHARED_LIB)
- install(TARGETS onnxruntime_providers_tvm
- ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
- LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
- RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
- FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
- endif()
\ No newline at end of file
diff --git a/cmake/onnxruntime_providers_vitisai.cmake b/cmake/onnxruntime_providers_vitisai.cmake
index 764cde9491da8..561a323533f48 100644
--- a/cmake/onnxruntime_providers_vitisai.cmake
+++ b/cmake/onnxruntime_providers_vitisai.cmake
@@ -12,6 +12,7 @@
file(GLOB onnxruntime_providers_vitisai_cc_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/core/providers/vitisai/*.cc"
"${ONNXRUNTIME_ROOT}/core/providers/vitisai/*.h"
+ "${ONNXRUNTIME_ROOT}/core/providers/vitisai/include/vaip/*.h"
"${ONNXRUNTIME_ROOT}/core/providers/vitisai/imp/*.cc"
"${ONNXRUNTIME_ROOT}/core/providers/vitisai/imp/*.h"
"${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h"
diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake
index 7239b245a7245..5a87252b08573 100644
--- a/cmake/onnxruntime_python.cmake
+++ b/cmake/onnxruntime_python.cmake
@@ -169,8 +169,8 @@ endif()
target_link_libraries(onnxruntime_pybind11_state PRIVATE
onnxruntime_session
${onnxruntime_libs}
- ${PROVIDERS_TVM}
${PROVIDERS_NNAPI}
+ ${PROVIDERS_VSINPU}
${PROVIDERS_XNNPACK}
${PROVIDERS_COREML}
${PROVIDERS_RKNPU}
@@ -184,7 +184,6 @@ target_link_libraries(onnxruntime_pybind11_state PRIVATE
onnxruntime_optimizer
onnxruntime_providers
onnxruntime_util
- ${onnxruntime_tvm_libs}
onnxruntime_lora
onnxruntime_framework
onnxruntime_util
@@ -965,37 +964,6 @@ if (onnxruntime_USE_ROCM)
)
endif()
-if (onnxruntime_USE_TVM)
- file(GLOB onnxruntime_python_providers_tvm_srcs CONFIGURE_DEPENDS
- "${ONNXRUNTIME_ROOT}/python/providers/tvm/*.py"
- )
- add_custom_command(
- TARGET onnxruntime_pybind11_state POST_BUILD
- COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/providers
- COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/providers/tvm
- COMMAND ${CMAKE_COMMAND} -E copy
- ${onnxruntime_python_providers_tvm_srcs}
- $/onnxruntime/providers/tvm
- COMMAND ${CMAKE_COMMAND} -E copy
- $
- $/onnxruntime/capi/
- )
-
- add_custom_command(
- TARGET onnxruntime_pybind11_state POST_BUILD
- WORKING_DIRECTORY ${tvm_SOURCE_DIR}/python
- COMMAND ${Python_EXECUTABLE} setup.py bdist_wheel
- )
-
- add_custom_command(
- TARGET onnxruntime_pybind11_state POST_BUILD
- COMMAND ${Python_EXECUTABLE}
- $/onnxruntime/providers/tvm/extend_python_file.py
- --target_file $/onnxruntime/capi/_ld_preload.py
- )
-
-endif()
-
if (onnxruntime_USE_DML)
if (NOT onnxruntime_USE_CUSTOM_DIRECTML)
set(dml_shared_lib_path ${DML_PACKAGE_DIR}/bin/${onnxruntime_target_platform}-win/${DML_SHARED_LIB})
@@ -1051,4 +1019,13 @@ if (onnxruntime_USE_QNN)
endif()
endif()
+if (onnxruntime_USE_VSINPU)
+ add_custom_command(
+ TARGET onnxruntime_pybind11_state POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy
+ $
+ $/onnxruntime/capi/
+ )
+endif()
+
endif()
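After a local build that enables one of these providers, a quick way to confirm what actually landed in the Python wheel is to query the runtime. A minimal check (not specific to VSINPU; assumes the locally built wheel is the one installed in the environment):

```python
# Minimal sanity check after installing a locally built wheel:
# list the execution providers compiled into this onnxruntime package.
import onnxruntime as ort

print(ort.__version__)
print(ort.get_available_providers())
```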
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
index 561f65a33b89c..e822f0a3655fc 100644
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@@ -9,9 +9,6 @@ set(TEST_INC_DIR ${ONNXRUNTIME_ROOT})
if (onnxruntime_ENABLE_TRAINING)
list(APPEND TEST_INC_DIR ${ORTTRAINING_ROOT})
endif()
-if (onnxruntime_USE_TVM)
- list(APPEND TEST_INC_DIR ${TVM_INCLUDES})
-endif()
set(disabled_warnings)
function(AddTest)
@@ -67,7 +64,10 @@ function(AddTest)
if(onnxruntime_USE_CUDA)
#XXX: we should not need to do this. onnxruntime_test_all.exe should not have direct dependency on CUDA DLLs,
# otherwise it will impact when CUDA DLLs can be unloaded.
- target_link_libraries(${_UT_TARGET} PRIVATE CUDA::cudart cudnn_frontend)
+ target_link_libraries(${_UT_TARGET} PRIVATE CUDA::cudart)
+ if(NOT onnxruntime_CUDA_MINIMAL)
+ target_link_libraries(${_UT_TARGET} PRIVATE cudnn_frontend)
+ endif()
endif()
target_link_libraries(${_UT_TARGET} PRIVATE ${_UT_LIBS} GTest::gtest GTest::gmock ${onnxruntime_EXTERNAL_LIBRARIES})
endif()
@@ -111,7 +111,6 @@ function(AddTest)
endif()
target_compile_options(${_UT_TARGET} PRIVATE ${disabled_warnings})
else()
- target_compile_options(${_UT_TARGET} PRIVATE ${DISABLED_WARNINGS_FOR_TVM})
target_compile_options(${_UT_TARGET} PRIVATE "$<$:SHELL:--compiler-options -Wno-error=sign-compare>"
"$<$>:-Wno-error=sign-compare>")
if (${HAS_NOERROR})
@@ -641,13 +640,11 @@ set(ONNXRUNTIME_TEST_LIBS
${PROVIDERS_ACL}
${PROVIDERS_ARMNN}
${PROVIDERS_COREML}
- # ${PROVIDERS_TVM}
${PROVIDERS_XNNPACK}
${PROVIDERS_AZURE}
onnxruntime_optimizer
onnxruntime_providers
onnxruntime_util
- ${onnxruntime_tvm_libs}
onnxruntime_lora
onnxruntime_framework
onnxruntime_util
@@ -749,12 +746,6 @@ if(onnxruntime_USE_AZURE)
list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_azure)
endif()
-if(WIN32)
- if (onnxruntime_USE_TVM)
- list(APPEND disabled_warnings ${DISABLED_WARNINGS_FOR_TVM})
- endif()
-endif()
-
file(GLOB onnxruntime_test_framework_src CONFIGURE_DEPENDS
${onnxruntime_test_framework_src_patterns}
)
@@ -855,9 +846,6 @@ if (onnxruntime_ENABLE_TRAINING_APIS)
list(APPEND all_tests ${onnxruntime_test_training_api_src})
endif()
-if (onnxruntime_USE_TVM)
- list(APPEND all_tests ${onnxruntime_test_tvm_src})
-endif()
if (onnxruntime_USE_OPENVINO)
list(APPEND all_tests ${onnxruntime_test_openvino_src})
@@ -1089,15 +1077,6 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
COMMAND ${CMAKE_COMMAND} -E copy ${DNNL_DLL_PATH} $
)
endif()
- if(WIN32)
- if (onnxruntime_USE_TVM)
- add_custom_command(
- TARGET ${test_data_target} POST_BUILD
- COMMAND ${CMAKE_COMMAND} -E copy $ $
- )
- endif()
- endif()
-
if(WIN32)
set(wide_get_opt_src_dir ${TEST_SRC_DIR}/win_getopt/wide)
onnxruntime_add_static_library(win_getopt_wide ${wide_get_opt_src_dir}/getopt.cc ${wide_get_opt_src_dir}/include/getopt.h)
@@ -1139,12 +1118,6 @@ if (NOT IOS)
endif()
set_target_properties(onnx_test_runner PROPERTIES FOLDER "ONNXRuntimeTest")
- if (onnxruntime_USE_TVM)
- if (WIN32)
- target_link_options(onnx_test_runner PRIVATE "/STACK:4000000")
- endif()
- endif()
-
install(TARGETS onnx_test_runner
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
@@ -1298,11 +1271,6 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
endif()
set_target_properties(onnxruntime_perf_test PROPERTIES FOLDER "ONNXRuntimeTest")
- if (onnxruntime_USE_TVM)
- if (WIN32)
- target_link_options(onnxruntime_perf_test PRIVATE "/STACK:4000000")
- endif()
- endif()
endif()
diff --git a/cmake/patches/dawn/dawn.patch b/cmake/patches/dawn/dawn.patch
deleted file mode 100644
index 7a2a01d55be46..0000000000000
--- a/cmake/patches/dawn/dawn.patch
+++ /dev/null
@@ -1,81 +0,0 @@
-diff --git a/src/dawn/native/CMakeLists.txt b/src/dawn/native/CMakeLists.txt
-index 9c0bd6fa4e..bf8a57aeac 100644
---- a/src/dawn/native/CMakeLists.txt
-+++ b/src/dawn/native/CMakeLists.txt
-@@ -857,6 +857,11 @@ if (DAWN_ENABLE_SWIFTSHADER)
- target_compile_definitions(dawn_native PRIVATE "DAWN_ENABLE_SWIFTSHADER")
- endif()
-
-+if (IOS)
-+ target_compile_options(dawn_native_objects PRIVATE -fno-objc-arc)
-+ target_compile_options(dawn_native PRIVATE -fno-objc-arc)
-+endif()
-+
- if (DAWN_BUILD_MONOLITHIC_LIBRARY)
- ###############################################################################
- # Do the 'complete_lib' build.
-diff --git a/src/dawn/native/Surface_metal.mm b/src/dawn/native/Surface_metal.mm
-index ce55acbd43..2cfd363479 100644
---- a/src/dawn/native/Surface_metal.mm
-+++ b/src/dawn/native/Surface_metal.mm
-@@ -33,10 +33,18 @@
-
- #import
-
-+#include "dawn/common/Platform.h"
-+
- namespace dawn::native {
-
- bool InheritsFromCAMetalLayer(void* obj) {
-- id object = static_cast(obj);
-+ id object =
-+#if DAWN_PLATFORM_IS(IOS)
-+ (__bridge id)obj;
-+#else // DAWN_PLATFORM_IS(IOS)
-+ static_cast(obj);
-+#endif // DAWN_PLATFORM_IS(IOS)
-+
- return [object isKindOfClass:[CAMetalLayer class]];
- }
-
-diff --git a/src/dawn/native/metal/SharedFenceMTL.mm b/src/dawn/native/metal/SharedFenceMTL.mm
-index bde8bfea07..8906185d6f 100644
---- a/src/dawn/native/metal/SharedFenceMTL.mm
-+++ b/src/dawn/native/metal/SharedFenceMTL.mm
-@@ -25,6 +25,8 @@
- // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-+#include "dawn/common/Platform.h"
-+
- #include "dawn/native/metal/SharedFenceMTL.h"
-
- #include "dawn/native/ChainUtils.h"
-@@ -39,8 +41,13 @@ ResultOrError[> SharedFence::Create(
- const SharedFenceMTLSharedEventDescriptor* descriptor) {
- DAWN_INVALID_IF(descriptor->sharedEvent == nullptr, "MTLSharedEvent is missing.");
- if (@available(macOS 10.14, iOS 12.0, *)) {
-- return AcquireRef(new SharedFence(
-- device, label, static_cast>(descriptor->sharedEvent)));
-+ return AcquireRef(new SharedFence(device, label,
-+#if DAWN_PLATFORM_IS(IOS)
-+ (__bridge id)(descriptor->sharedEvent)
-+#else // DAWN_PLATFORM_IS(IOS)
-+ static_cast>(descriptor->sharedEvent)
-+#endif // DAWN_PLATFORM_IS(IOS)
-+ ));
- } else {
- return DAWN_INTERNAL_ERROR("MTLSharedEvent not supported.");
- }
-diff --git a/src/tint/api/BUILD.cmake b/src/tint/api/BUILD.cmake
-index 0037d83276..6372c4ee77 100644
---- a/src/tint/api/BUILD.cmake
-+++ b/src/tint/api/BUILD.cmake
-@@ -57,6 +57,7 @@ tint_target_add_dependencies(tint_api lib
- tint_lang_wgsl_ast_transform
- tint_lang_wgsl_common
- tint_lang_wgsl_features
-+ tint_lang_wgsl_inspector
- tint_lang_wgsl_program
- tint_lang_wgsl_sem
- tint_lang_wgsl_writer_ir_to_program
diff --git a/csharp/ApiDocs/docfx.json b/csharp/ApiDocs/docfx.json
index 0671d4aeb7d95..88a3283ad76e8 100644
--- a/csharp/ApiDocs/docfx.json
+++ b/csharp/ApiDocs/docfx.json
@@ -14,7 +14,7 @@
"disableDefaultFilter": false,
"noRestore": true,
"properties": {
- "AllowUnsafeBlocks": true,
+ "AllowUnsafeBlocks": "true",
"TargetFramework": "net8.0",
"Nullable": "enable",
"LangVersion": "8.0",
diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs
index be157a0419fc0..d628b065ceaa7 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs
@@ -1142,9 +1142,6 @@ IntPtr[] outputValues /* An array of output value pointers. Array must be alloca
[DllImport(NativeLib.DllName, CharSet = CharSet.Ansi)]
public static extern IntPtr /*(OrtStatus*)*/ OrtSessionOptionsAppendExecutionProvider_MIGraphX(IntPtr /*(OrtSessionOptions*)*/ options, int device_id);
-
- [DllImport(NativeLib.DllName, CharSet = CharSet.Ansi)]
- public static extern IntPtr /*(OrtStatus*)*/ OrtSessionOptionsAppendExecutionProvider_Tvm(IntPtr /*(OrtSessionOptions*) */ options, byte[] /*(char char*)*/ settings);
#endif
///
/// Append a TensorRT EP instance (configured based on given provider options) to the native OrtSessionOptions instance
@@ -1272,7 +1269,7 @@ IntPtr[] outputValues /* An array of output value pointers. Array must be alloca
///
/// Append an execution provider instance to the native OrtSessionOptions instance.
///
- /// 'SNPE' and 'XNNPACK' are currently supported as providerName values.
+ /// 'SNPE', 'XNNPACK' and 'CoreML' are currently supported as providerName values.
///
/// The number of providerOptionsKeys must match the number of providerOptionsValues and equal numKeys.
///
diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs
index 3acd84b3016de..bd450451a1265 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs
@@ -146,27 +146,6 @@ public static SessionOptions MakeSessionOptionWithTensorrtProvider(OrtTensorRTPr
}
}
- ///
- /// A helper method to construct a SessionOptions object for TVM execution.
- /// Use only if you have the onnxruntime package specific to this Execution Provider.
- ///
- /// settings string, comprises of comma separated key:value pairs. default is empty
- /// A SessionsOptions() object configured for execution with TVM
- public static SessionOptions MakeSessionOptionWithTvmProvider(String settings = "")
- {
- SessionOptions options = new SessionOptions();
- try
- {
- options.AppendExecutionProvider_Tvm(settings);
- return options;
- }
- catch (Exception)
- {
- options.Dispose();
- throw;
- }
- }
-
///
/// A helper method to construct a SessionOptions object for ROCM execution.
/// Use only if ROCM is installed and you have the onnxruntime package specific to this Execution Provider.
@@ -397,20 +376,6 @@ public void AppendExecutionProvider_CoreML(CoreMLFlags coremlFlags = CoreMLFlags
#endif
}
- ///
- /// Use only if you have the onnxruntime package specific to this Execution Provider.
- ///
- /// string with TVM specific settings
- public void AppendExecutionProvider_Tvm(string settings = "")
- {
-#if __MOBILE__
- throw new NotSupportedException("The TVM Execution Provider is not supported in this build");
-#else
- var utf8 = NativeOnnxValueHelper.StringToZeroTerminatedUtf8(settings);
- NativeApiStatus.VerifySuccess(NativeMethods.OrtSessionOptionsAppendExecutionProvider_Tvm(handle, utf8));
-#endif
- }
-
private class ExecutionProviderAppender
{
private byte[] _utf8ProviderName;
@@ -430,16 +395,10 @@ public IntPtr Appender(IntPtr handle, IntPtr[] optKeys, IntPtr[] optValues, UInt
///
/// Append QNN, SNPE or XNNPACK execution provider
///
- /// Execution provider to add. 'QNN', 'SNPE' or 'XNNPACK' are currently supported.
+ /// Execution provider to add. 'QNN', 'SNPE', 'XNNPACK', 'CoreML' and 'AZURE' are currently supported.
/// Optional key/value pairs to specify execution provider options.
public void AppendExecutionProvider(string providerName, Dictionary providerOptions = null)
{
- if (providerName != "SNPE" && providerName != "XNNPACK" && providerName != "QNN" && providerName != "AZURE")
- {
- throw new NotSupportedException(
- "Only QNN, SNPE, XNNPACK and AZURE execution providers can be enabled by this method.");
- }
-
if (providerOptions == null)
{
providerOptions = new Dictionary();
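With the name whitelist removed, `AppendExecutionProvider` now forwards any provider name to the native API. For readers who work from the Python bindings instead, a rough equivalent of selecting CoreML with a CPU fallback is sketched below; the model path is a placeholder, and CoreML is only present in packages built with it enabled.

```python
# Sketch: selecting CoreML with a CPU fallback from the Python API.
# Provider availability depends on how the installed package was built.
import onnxruntime as ort

session = ort.InferenceSession(
    "model.onnx",  # placeholder model path
    providers=["CoreMLExecutionProvider", "CPUExecutionProvider"],
)
print(session.get_providers())  # shows which requested providers were applied
```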
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs
index aa0e6ee62248a..17738da515134 100644
--- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs
@@ -146,10 +146,6 @@ public void TestSessionOptions()
opt.AppendExecutionProvider_Nnapi(0);
#endif
-#if USE_TVM
- opt.AppendExecutionProvider_Tvm("Vulkan -device=amd_apu");
-#endif
-
#if USE_OPENVINO
opt.AppendExecutionProvider_OpenVINO();
#endif
@@ -179,6 +175,12 @@ public void TestSessionOptions()
ex = Assert.Throws(() => { opt.AppendExecutionProvider("QNN"); });
Assert.Contains("QNN execution provider is not supported in this build", ex.Message);
#endif
+#if USE_COREML
+ opt.AppendExecutionProvider("CoreML");
+#else
+ ex = Assert.Throws(() => { opt.AppendExecutionProvider("CoreML"); });
+ Assert.Contains("CoreML execution provider is not supported in this build", ex.Message);
+#endif
opt.AppendExecutionProvider_CPU(1);
}
@@ -2041,7 +2043,7 @@ public SkipNonPackageTests()
}
// Test hangs on mobile.
-#if !(ANDROID || IOS)
+#if !(ANDROID || IOS)
[Fact(DisplayName = "TestModelRunAsyncTask")]
private async Task TestModelRunAsyncTask()
{
diff --git a/dockerfiles/Dockerfile.cuda b/dockerfiles/Dockerfile.cuda
index ce4560e9b0c7c..40f11dca623a7 100644
--- a/dockerfiles/Dockerfile.cuda
+++ b/dockerfiles/Dockerfile.cuda
@@ -48,7 +48,7 @@ RUN cd /code \
&& python3 -m venv /code/env \
&& . /code/env/bin/activate \
&& pip install --upgrade psutil setuptools wheel packaging \
- && pip install -r tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/scripts/requirements.txt \
+ && pip install -r /code/tools/ci_build/github/linux/python/requirements.txt \
&& python /code/tools/ci_build/build.py --build_dir /code/build/Linux \
--allow_running_as_root --skip_submodule_sync \
--use_cuda --cuda_home /usr/local/cuda \
diff --git a/dockerfiles/Dockerfile.migraphx b/dockerfiles/Dockerfile.migraphx
index c5d998d503899..876a07e4ffaf6 100644
--- a/dockerfiles/Dockerfile.migraphx
+++ b/dockerfiles/Dockerfile.migraphx
@@ -10,7 +10,7 @@ FROM rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0
ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime
ARG ONNXRUNTIME_BRANCH=main
-ENV PATH /code/cmake-3.27.3-linux-x86_64/bin:${PATH}
+ENV PATH=/code/cmake-3.27.3-linux-x86_64/bin:${PATH}
RUN apt-get update &&\
apt-get install -y migraphx
diff --git a/dockerfiles/Dockerfile.openvino b/dockerfiles/Dockerfile.openvino
index 39e75a68a369f..d1ebdae3cbdd6 100644
--- a/dockerfiles/Dockerfile.openvino
+++ b/dockerfiles/Dockerfile.openvino
@@ -11,7 +11,7 @@ FROM openvino/ubuntu22_runtime:${OPENVINO_VERSION} AS builder
ENV WORKDIR_PATH=/home/openvino
WORKDIR $WORKDIR_PATH
-ENV DEBIAN_FRONTEND noninteractive
+ENV DEBIAN_FRONTEND=noninteractive
ARG DEVICE=CPU
ARG ONNXRUNTIME_REPO=https://github.com/microsoft/onnxruntime.git
@@ -41,7 +41,7 @@ RUN tar cvf GPL_sources.tar.gz /sources
# Deploy stage
FROM openvino/ubuntu22_runtime:${OPENVINO_VERSION}
-ENV DEBIAN_FRONTEND noninteractive
+ENV DEBIAN_FRONTEND=noninteractive
USER root
COPY --from=builder /home/openvino/onnxruntime/build/Linux/Release/dist/*.whl ./
COPY --from=builder /GPL_sources.tar.gz ./
@@ -50,7 +50,7 @@ ARG BUILD_UID=1001
ARG BUILD_USER=onnxruntimedev
RUN adduser --uid $BUILD_UID $BUILD_USER
RUN usermod -a -G video,users ${BUILD_USER}
-ENV WORKDIR_PATH /home/${BUILD_USER}
+ENV WORKDIR_PATH=/home/${BUILD_USER}
WORKDIR ${WORKDIR_PATH}
USER ${BUILD_USER}
diff --git a/dockerfiles/Dockerfile.rocm b/dockerfiles/Dockerfile.rocm
index bef8d7a5f47d2..aca8c3feaff71 100644
--- a/dockerfiles/Dockerfile.rocm
+++ b/dockerfiles/Dockerfile.rocm
@@ -12,7 +12,7 @@ ARG ONNXRUNTIME_BRANCH=main
WORKDIR /code
-ENV PATH /code/cmake-3.27.3-linux-x86_64/bin:${PATH}
+ENV PATH=/code/cmake-3.27.3-linux-x86_64/bin:${PATH}
# Prepare onnxruntime repository & build onnxruntime
RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\
diff --git a/dockerfiles/Dockerfile.tensorrt b/dockerfiles/Dockerfile.tensorrt
index ef51d41c5ff1b..24947df6308a6 100644
--- a/dockerfiles/Dockerfile.tensorrt
+++ b/dockerfiles/Dockerfile.tensorrt
@@ -17,7 +17,7 @@ RUN apt-get update &&\
RUN unattended-upgrade
WORKDIR /code
-ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/code/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH}
+ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/code/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH}
# Prepare onnxruntime repository & build onnxruntime with TensorRT
RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\
diff --git a/dockerfiles/Dockerfile.vitisai b/dockerfiles/Dockerfile.vitisai
index e11ab70a61332..c6226155e01e3 100644
--- a/dockerfiles/Dockerfile.vitisai
+++ b/dockerfiles/Dockerfile.vitisai
@@ -22,8 +22,8 @@ RUN apt-get update && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
-ENV PATH /code/cmake-3.27.3-linux-x86_64/bin:$PATH
-ENV LD_LIBRARY_PATH /opt/xilinx/xrt/lib:$LD_LIBRARY_PATH
+ENV PATH=/code/cmake-3.27.3-linux-x86_64/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/xilinx/xrt/lib:$LD_LIBRARY_PATH
WORKDIR /code
RUN . $VAI_ROOT/conda/etc/profile.d/conda.sh &&\
diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md
index b87532debe4bc..6ea3f93cdea12 100644
--- a/docs/ContribOperators.md
+++ b/docs/ContribOperators.md
@@ -1596,6 +1596,8 @@ This version of the operator has been available since version 1 of the 'com.micr
(Optional) Hardware architecture.
main_context : int
Usually each EPContext node is associated with a single graph partition. But in some cases, such as QNN, a single EPContext node contains all partitions. In that case, the node with ep_cache_context should set main_context=1; other nodes set main_context=0 and skip ep_cache_context. The path is relative to this ONNX file. Default is 1.
+max_size : int
Max size in the context. Usage depends on the EP.
notes : string
(Optional) Some notes for the model
onnx_model_filename : string
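To make the new attribute above concrete, here is a hedged Python sketch of an EPContext node that carries `max_size`, built with `onnx.helper`. All values are made up; the companion attributes shown (embed_mode, ep_cache_context, source) are assumed from the existing EPContext schema and are not part of this change.

```python
# Illustrative sketch of an EPContext node including the new max_size attribute.
# Values are placeholders; embed_mode/ep_cache_context/source are assumed to be
# part of the existing EPContext schema and are not introduced by this diff.
from onnx import helper

ep_context_node = helper.make_node(
    "EPContext",
    inputs=["input"],
    outputs=["output"],
    domain="com.microsoft",
    main_context=1,                    # this node owns the cached context
    embed_mode=0,                      # ep_cache_context is a path, not an embedded blob
    ep_cache_context="model_ctx.bin",  # path relative to the ONNX file (placeholder)
    source="QNN",                      # EP that produced the cached context (placeholder)
    max_size=4096,                     # new attribute: max size in the context, EP-specific
)
print(ep_context_node)
```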
diff --git a/docs/How_To_Update_ONNX_Dev_Notes.md b/docs/How_To_Update_ONNX_Dev_Notes.md
index 4d8a286bde66e..199e6671f6a1a 100644
--- a/docs/How_To_Update_ONNX_Dev_Notes.md
+++ b/docs/How_To_Update_ONNX_Dev_Notes.md
@@ -21,7 +21,7 @@ This file should be generated. See [cgmanifests/README](/cgmanifests/README.md)
- [onnxruntime/test/python/requirements.txt](/onnxruntime/test/python/requirements.txt)
- [tools/ci_build/github/linux/docker/scripts/requirements.txt](/tools/ci_build/github/linux/docker/scripts/requirements.txt)
- [tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt](/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt)
-- [tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/scripts/requirements.txt](/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/scripts/requirements.txt)
+- [tools/ci_build/github/linux/python/requirements.txt](/tools/ci_build/github/linux/python/requirements.txt)
- Run `git grep -rn "onnx==1" .` to find other locations and update this document if necessary.
1. If there is any change to `cmake/external/onnx/onnx/*.in.proto`, you need to regenerate OnnxMl.cs.
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index e23a52757dedb..eeb8ebb3ccefe 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -453,6 +453,7 @@ Do not modify directly.*
|SVMClassifier|*in* X:**T1**]
*out* Y:**T2**
*out* Z:**tensor(float)**|1+|**T1** = tensor(double), tensor(float), tensor(int32), tensor(int64)
**T2** = tensor(int64), tensor(string)|
|SVMRegressor|*in* X:**T**
*out* Y:**tensor(float)**|1+|**T** = tensor(float)|
|Scaler|*in* X:**T**
*out* Y:**tensor(float)**|1+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)|
+|TreeEnsemble|*in* X:**T**
*out* Y:**T**|5+|**T** = tensor(double), tensor(float)|
|TreeEnsembleClassifier|*in* X:**T1**
*out* Y:**T2**
*out* Z:**tensor(float)**|3+|**T1** = tensor(double), tensor(float), tensor(int32), tensor(int64)
**T2** = tensor(int64), tensor(string)|
|||[1, 2]|**T1** = tensor(double), tensor(float), tensor(int32), tensor(int64)
**T2** = tensor(int64), tensor(string)|
|TreeEnsembleRegressor|*in* X:**T**
*out* Y:**tensor(float)**|3+|**T** = tensor(double), tensor(float)|
@@ -1086,11 +1087,13 @@ Do not modify directly.*
|GreaterOrEqual|*in* A:**T**
*in* B:**T**
*out* C:**T1**|16+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)|
|||12+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(bool)|
|GridSample|*in* X:**T1**
*in* grid:**T2**
*out* Y:**T1**|16+|**T1** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T2** = tensor(float), tensor(float16)|
+|GroupNorm||21+|**M** = tensor(float), tensor(float16)
**T** = tensor(float), tensor(float16)|
|HardSigmoid|*in* X:**T**
*out* Y:**T**|6+|**T** = tensor(float), tensor(float16)|
|Hardmax|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(float), tensor(float16)|
|||11+|**T** = tensor(float), tensor(float16)|
|||1+|**T** = tensor(float), tensor(float16)|
-|Identity|*in* input:**T**
*out* output:**T**
or
*in* input:**V**
*out* output:**V**|19+|**V** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Identity|*in* input:**T**
*out* output:**T**
or
*in* input:**V**
*out* output:**V**|21+|**V** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||19+|**V** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||16+|**V** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||14+|**V** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
@@ -1189,7 +1192,8 @@ Do not modify directly.*
|||12+|**T** = tensor(float), tensor(float16), tensor(int32)
**T1** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint8)|
|||7+|**T** = tensor(float), tensor(float16)|
|QLinearConv|*in* x:**T1**
*in* x_scale:**tensor(float)**
*in* x_zero_point:**T1**
*in* w:**T2**
*in* w_scale:**tensor(float)**
*in* w_zero_point:**T2**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T3**
*in* B:**T4**
*out* y:**T3**|10+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(int8), tensor(uint8)
**T4** = tensor(int32)|
-|QLinearMatMul|*in* a:**T1**
*in* a_scale:**TS**
*in* a_zero_point:**T1**
*in* b:**T2**
*in* b_scale:**TS**
*in* b_zero_point:**T2**
*in* y_scale:**TS**
*in* y_zero_point:**T3**
*out* y:**T3**
or
*in* a:**T1**
*in* a_scale:**tensor(float)**
*in* a_zero_point:**T1**
*in* b:**T2**
*in* b_scale:**tensor(float)**
*in* b_zero_point:**T2**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T3**
*out* y:**T3**|10+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(int8), tensor(uint8)|
+|QLinearMatMul|*in* a:**T1**
*in* a_scale:**TS**
*in* a_zero_point:**T1**
*in* b:**T2**
*in* b_scale:**TS**
*in* b_zero_point:**T2**
*in* y_scale:**TS**
*in* y_zero_point:**T3**
*out* y:**T3**
or
*in* a:**T1**
*in* a_scale:**tensor(float)**
*in* a_zero_point:**T1**
*in* b:**T2**
*in* b_scale:**tensor(float)**
*in* b_zero_point:**T2**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T3**
*out* y:**T3**|21+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(int8), tensor(uint8)|
+|||10+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(int8), tensor(uint8)|
|QuantizeLinear|*in* x:**T1**
*in* y_scale:**T1**
*in* y_zero_point:**T2**
*out* y:**T2**
or
*in* x:**T1**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T2**
*out* y:**T2**|21+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(int4), tensor(int8), tensor(uint4), tensor(uint8)|
|||19+|**T1** = tensor(float), tensor(float16), tensor(int32)
**T2** = tensor(int8), tensor(uint8)|
|||13+|**T1** = tensor(float), tensor(int32)
**T2** = tensor(int8), tensor(uint8)|
diff --git a/docs/TVM_EP.md b/docs/TVM_EP.md
deleted file mode 100644
index df59d5c05855c..0000000000000
--- a/docs/TVM_EP.md
+++ /dev/null
@@ -1,319 +0,0 @@
-# TVM Execution Provider
-
-## Contents
-
-- [Introduction](#introduction)
-- [Build](#build-onnx-runtime-with-the-tvm-execution-provider)
- - [Linux](#linux)
- - [Windows](#windows)
-- [Configuration options](#configuration-options)
-- [Performance Tuning](#performance-tuning)
- - [Using precompiled model](#using-precompiled-model)
-- [Samples](#samples)
-- [Known issues](#known-issues)
-
-
-## Introduction
-
-TVM is an execution provider for ONNX Runtime that is built on top of Apache TVM. It enables ONNX Runtime users to leverage Apache TVM model optimizations.
-TVM EP is currently in "Preview". It's been tested to work on a handful of models on Linux or Windows, but not on MacOS.
-
-## Build ONNX Runtime with the TVM Execution Provider
-
-### **Linux**
-Install the minimal prerequisites on Ubuntu/Debian-like Linux operating systems:
-```bash
-apt-get install -y python3 python3-dev python3-pip python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev llvm-12
-pip3 install numpy decorator attrs nasm
-```
-Note: since ONNX Runtime with the TVM EP is built with the Intel ipp-crypto library, there are additional requirements. The gcc (and g++) compiler version should be 8.2 or higher, and the nasm version should be 2.14.02 or higher. Problems with older nasm versions are described [here](https://github.com/intel/ipp-crypto/issues/9) and [here](https://bugzilla.nasm.us/show_bug.cgi?id=3392205). For Ubuntu LTS 18, `apt-get install nasm` is not enough because it provides version 2.13.02; see the instructions for installing from source [here](https://stackoverflow.com/questions/36144930/steps-to-install-nasm-offline-on-ubuntu).
-
-Also, the current implementation has `NVidia GPU` support for TVM EP. For now, you can use only `NVidia GPU` with CUDA Toolkit support.
-To do this, make sure you have installed the NVidia driver and CUDA Toolkit.
-More detailed instructions can be found on the [official page](https://developer.nvidia.com/cuda-toolkit).
-
-Clone this repo.
-In order to build ONNXRT you will need to have CMake 3.18 or higher. In Ubuntu 20.04 you can use the following commands to install the latest version of CMake:
-
-```bash
-sudo apt-get update
-sudo apt-get install gpg wget
-
-wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
-
-echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ focal main' | sudo tee /etc/apt/sources.list.d/kitware.list >/dev/null
-sudo apt-get update
-
-sudo rm /usr/share/keyrings/kitware-archive-keyring.gpg
-sudo apt-get install kitware-archive-keyring
-
-sudo apt-get install cmake
-```
-
-Build ONNX Runtime (TVM x86):
-```bash
-./build.sh --config Release --enable_pybind --build_wheel --parallel --skip_tests --skip_onnx_tests --use_tvm
-```
-
-Build ONNX Runtime (TVM with CUDA support):
-```bash
-./build.sh --config Release --enable_pybind --build_wheel --parallel --skip_tests --skip_onnx_tests --use_tvm --tvm_cuda_runtime
-```
-
-This command builds both `TVM` and `onnxruntime-tvm`. It creates two wheels, one for each project.
-Build the python API for ONNX Runtime instead of using the standard package. Instructions for this are given below.
-
-Package for TVM:
-```bash
-cd
-python3 -m pip uninstall tvm -y
-whl_path=$(find ./build//Release/_deps/tvm-src/python/dist -name "*.whl")
-python3 -m pip install $whl_path
-```
-
-Package for TVM EP:
-```bash
-cd
-python3 -m pip uninstall onnxruntime onnxruntime-tvm -y
-whl_path=$(find ./build//Release/dist -name "*.whl")
-python3 -m pip install $whl_path
-```
-
-Alternatively, you can set `PYTHONPATH` to tell python where to find the ONNXRT library and the TVM library.
-```bash
-export PYTHONPATH=/build//Release:${PYTHONPATH}
-export PYTHONPATH=/build//Release/_deps/tvm-src/python:${PYTHONPATH}
-```
-
-### **Windows**
-Install the minimal prerequisites on Windows: Git, CMake, Visual Studio, Python, LLVM
-- Git: Download Git for Windows from [here](https://git-scm.com/download/win) and install it. Please make sure that the git.exe path is included in the environment variable. By default, it should be added. To check git after the installation use `git --version` in command line (cmd).
-- CMake: use [the link](https://cmake.org/download/) to download and install CMake. msi-file is recommended for it. To verify CMake installation use `cmake --version` in cmd.
-- Visual Studio: Download from [here](https://visualstudio.microsoft.com/ru/downloads/) and install Visual Studio 20** Community & Visual Studio Build Tools respectively. It is recommended not to change the default installation path. Choose the "Desktop development with C++" workload and make sure that both options of “MSVC [contemporary version] C++ build tools” and “Windows 10 SDK” are selected.
-- Python: Download Python 3.* from [here](https://www.python.org/downloads/windows/) and install it. Make sure the “Add Python to PATH” option is checked so that the installer adds the Python directory to the environment variable directly. To check Python after the installation, use `python` from cmd. The expected output is similar to the following:
-```cmd
-Python 3.10.5 (tags/v3.10.5:f377153, Jun 6 2022, 16:14:13) [MSC v.1929 64 bit (AMD64)] on win32
-Type "help", "copyright", "credits" or "license" for more information.
->>>
-```
-Use `quit()` to exit from python interface.
-- LLVM: the compiler is not necessary for pure ONNX Runtime installation but it is needed for TVM EP by default.
-```cmd
-git clone --depth 1 --branch release/11.x https://github.com/llvm/llvm-project.git
-cmake -S llvm -B build -DLLVM_ENABLE_PROJECTS="clang;libcxx;libcxxabi" -DLLVM_TARGETS_TO_BUILD=X86 -Thost=x64 -DCMAKE_BUILD_TYPE=Release -G "Visual Studio 17 2022"
-cmake --build ./build --config Release
-```
-- Dependencies of ipp-crypto:
-1. Install the asm compiler (nasm) on Windows with:
-```cmd
-winget install nasm -i
-```
-
-Add it to PATH (instructions for the Windows GUI can be found [here](https://www.computerhope.com/issues/ch000549.htm#dospath)) or via cmd:
-```cmd
-set PATH="%PATH%;C:\Program Files\NASM"
-```
-
-or
-```cmd
-setx PATH "%PATH%;C:\Program Files\NASM"
-```
-
-Check with `nasm --version` in the command prompt.
-
-2. Install OpenSSL on Windows using the msi-file from [here](https://slproweb.com/products/Win32OpenSSL.html)
-Add the path to the directory containing the executable (e.g. "C:\Program Files\OpenSSL-Win64\bin") to PATH (see instructions above).
-
-Check with `openssl version` in the command prompt.
-
-3. A correct build of ipp-crypto requires specific environment variables for a supported MSVC compiler. The long way to adjust the environment is to follow the instructions [here](https://docs.microsoft.com/en-us/cpp/build/building-on-the-command-line?view=msvc-170&viewFallbackFrom=vs-2017). The quick way is to use the VS Developer command prompt, where the environment has already been adjusted, or to add the required paths to a standard Windows command prompt:
-```cmd
-set INCLUDE=C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.32.31326\include;C:\Program Files (x86)\Windows Kits\10\include\10.0.22621.0\ucrt
-```
-
-Take into account that the MSVC and Kit versions are specific to the Visual Studio installation on the machine; the values specified here are only an example.
-
-
-
-To use an NVIDIA GPU (optional), CUDA and cuDNN should be installed.
-- CUDA: Install CUDA by the [link](https://developer.nvidia.com/cuda-11.0-download-archive).
-- cuDNN: download cuDNN installer from [here](https://developer.nvidia.com/rdp/cudnn-archive). Choose v8.* for corresponding CUDA v11.*, unzip it, and move cuDNN files as following:
-1. [unzipped dir]\bin\ → C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin
-2. [unzipped dir]\include\ → C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\include
-3. [unzipped dir]\lib\ → C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\lib
-
-To verify the CUDA installation use `nvcc --version` in cmd.
-
-
-
-#### **Build ONNX Runtime with TVM Execution Provider from source (Python):**
-- Use command line and clone sources from github:
-```cmd
-git clone --recursive https://github.com/Microsoft/onnxruntime
-cd onnxruntime
-```
-- CPU build:
-```
-build.bat --config Release --enable_pybind --build_wheel --skip_tests --parallel --use_tvm --skip_onnx_tests --cmake_generator "Visual Studio 17 2022" --llvm_config /build/Release/bin/llvm-config.exe
-```
-- GPU build:
-```
-build.bat --config Release --enable_pybind --build_wheel --skip_tests --parallel --use_tvm --skip_onnx_tests --cmake_generator "Visual Studio 17 2022" --llvm_config /build/Release/bin/llvm-config.exe --use_cuda --cudnn_home “C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.*” --cuda_home “C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.*”
-```
-In both cases (CPU, GPU) the following cmake generators are available: "Visual Studio 17 2022" and "Ninja". The handshake mechanism can also be switched on with the `--use_tvm_hash` flag; in that case the ipp-crypto library is built with its dependencies, see details above.
-- Install the python wheel package for ONNX Runtime:
-The default path to the package is `/build/Windows/Release/Release/dist`. Note that it differs from the package path on Linux. Before installation, check the names of the wheel packages and use the corresponding one. It can look like the following:
-```cmd
-python -m pip install .\onnxruntime\build\Windows\Release\Release\dist\onnxruntime_tvm-1.6.0-cp38-cp38-win_amd64.whl
-```
-- Install the python wheel package for TVM, since its python API is used inside the TVM EP:
-It can look like the following:
-```cmd
-python -m pip install .\onnxruntime\build\Windows\Release\_deps\tvm-src\python\dist\tvm-0.9.dev1728+g3425ed846-cp39-cp39-win_amd64.whl
-```
-- Verify the result with a python script. Note: for a correct result, python should not be launched from a directory containing an 'onnxruntime' directory:
-```python
-import onnxruntime
-print(onnxruntime.__version__)
-print(onnxruntime.get_device())
-print(onnxruntime.get_available_providers())
-```
-- Uninstall procedure:
-```cmd
-pip uninstall onnxruntime-tvm
-```
-
-#### **Build ONNX Runtime with TVM Execution Provider from source (C#):**
-- Use command line and clone sources from github:
-```cmd
-git clone --recursive https://github.com/Microsoft/onnxruntime
-cd onnxruntime
-```
-- CPU build:
-
-Make sure you download [nuget.exe](https://docs.microsoft.com/en-us/nuget/install-nuget-client-tools#nugetexe-cli) and add path to it into `PATH` env.
-```
-build.bat --config Release --build_nuget --skip_tests --parallel --use_tvm --skip_onnx_tests --cmake_generator "Visual Studio 17 2022" --llvm_config llvm-config.exe
-```
-- Install C# nuget package for TVM EP. Default path to the package is `\build\Windows\Release\Release`.
-
-
-## Configuration options
-The TVM Execution Provider can be configured with the following provider options:
-1. Python
-```python
-po = [dict(executor=tvm_executor_type,
- so_folder=folder_with_pretuned_files,
- check_hash=check_hash,
- hash_file_path=hash_file_path,
- target=client_target,
- target_host=client_target_host,
- opt_level=client_opt_level,
- freeze_weights=freeze,
- to_nhwc=layout_transform,
- tuning_type=tvm_optimizer_type,
- tuning_file_path=client_tuning_logfile,
- input_names = input_names_str,
- input_shapes = input_shapes_str)]
-tvm_session = onnxruntime.InferenceSession(model_path, providers=["TvmExecutionProvider"], provider_options=po)
-```
-
-2. C#
-
-Currently, only precompiled models are supported in C# (see the related section below).
-
-```CSharp
-SessionOptions session_options = new SessionOptions{};
-string tvm_ep_options =
- $"executor: {tvm_executor_type}, " +
- $"so_folder: {folder_with_pretuned_files}, " +
- $"check_hash: {check_hash}, " +
- $"hash_file_path: {hash_file_path}, " +
- $"target: {client_target}, " +
- $"target_host: {client_target_host}, " +
- $"opt_level: {client_opt_level}, " +
- $"freeze_weights: {freeze}, " +
- $"to_nhwc: {layout_transform}, " +
- $"tuning_type: {tvm_optimizer_type}, " +
- $"tuning_file_path: {client_tuning_logfile}, " +
- $"input_names: {input_names_str}, " +
- $"input_shapes: {input_shapes_str}";
-
-session_options.AppendExecutionProvider_Tvm(tvm_ep_options);
-using var tvm_session = new InferenceSession(modelFilePath, session_options);
-```
-
-
-- `executor` is the executor type used by TVM. There is a choice between two types: GraphExecutor and VirtualMachine, which correspond to the "graph" and "vm" tags. VirtualMachine is used by default.
-- `so_folder` is the path to a folder with the set of files (.ro-, .so/.dll-files and weights) obtained after model tuning. These files are used for executor compilation instead of the onnx-model, but the latter is still needed for ONNX Runtime.
-- `check_hash` means that it is necessary to perform a HASH check for the model obtained in the `so_folder` parameter. It is `False` by default.
-- `hash_file_path` is the path to a file that contains the pre-computed HASH for the ONNX model whose tuning result is located in the path passed via the `so_folder` parameter.
- If an empty string is passed as this value, the file will be searched for in the folder passed in the `so_folder` parameter.
-- `target` and `target_host` are strings like in TVM (e.g. "llvm --mcpu=avx2"). When using accelerators, target may be something like `cuda` while target_host may be `llvm -mtriple=x86_64-linux-gnu`
-- `opt_level` is the TVM optimization level. It is 3 by default.
-- `freeze_weights` means that all model weights are kept at the compilation stage; otherwise they are downloaded on each inference. True is the recommended value for the best performance. It is true by default.
-- `to_nhwc` switches on special model transformations, particularly of the data layout, matching those used by Octomizer. It allows tuning logs obtained from Octomizer to be used correctly. It is false by default.
-- `tuning_type` defines the type of TVM tuning logs being used, and can be set to either `AutoTVM` (1st gen auto tuning logs) or `Ansor` (2nd gen auto tuning logs). By default this option is set to `AutoTVM`.
-- `tuning_file_path` is path to AutoTVM or Ansor tuning file which gives specifications for given model and target for the best performance. (See below for more details).
-
-TVM supports models with fixed graph only. If your model has unknown dimensions in input shapes (excluding batch size) you must provide the shape using the `input_names` and `input_shapes` provider options. Below is an example of what must be passed to `provider_options`:
-```python
-input_names = "input_1 input_2"
-input_shapes = "[1 3 224 224] [1 2]"
-```
-
-## Performance Tuning
-TVM optimizes machine learning models through an automated tuning process that produces model variants specific to targeted hardware architectures. This process also generates 'tuning logs' that the TVM EP relies on to maximize model performance. These logs can be acquired for your model by either using TVM as described here:
-
-AutoTVM:
-https://tvm.apache.org/docs/how_to/tune_with_autotvm/index.html
-
-Ansor (Autoscheduling):
-https://tvm.apache.org/docs/how_to/tune_with_autoscheduler/index.html
-
-or by using logs generated through the OctoML platform (https://onnx.octoml.ai) using instructions [here](https://help.octoml.ai/en/articles/5814452-using-octoml-platform-logs-with-onnx-rt-tvm-ep)
-
-Using the TVM EP with TVM tuning logs also requires users to turn off ONNX Runtime preprocessing. To do this, the following `SessionOptions()` can be used:
-```
-so = onnxruntime.SessionOptions()
-so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
-
-tvm_session = onnxruntime.InferenceSession(model_path, sess_options=so, providers=["TvmExecutionProvider"], provider_options=po)
-```
-
-### **Using precompiled model**
-It is also possible to use a precompiled model.
-
-The compiled model can be obtained using the [OctoML platform](https://onnx.octoml.ai)
-or compiled directly (see **Support precompiled model** section in
-[Sample notebook for ResNet50 inference with TVM EP](https://github.com/microsoft/onnxruntime/blob/main/docs/python/notebooks/onnxruntime-tvm-tutorial.ipynb)
-for more information on model compilation).
-
-In order to use the precompiled model, you only need to pass two options:
-* **executor** - `vm` (`VirtualMachine`) must be used as a value
-(this functionality is not supported for `GraphExecutor`);
-* **so_folder** - as a value, you must pass the path to the directory where
-the files of the precompiled model are located.
-* **check_hash** - (optional) if you want to check hash, you must pass `True` as the value.
-* **hash_file_path** - (optional) by default, the file containing the hash for the tuned model will be searched for in the directory passed in the `so_folder` parameter.
- If you want to specify a different location, you must pass the path to the file that contains the desired hash as the value.
-
-You can read more about these options in section [Configuration options](#configuration-options) above.
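As referenced above, a minimal sketch of what these options might look like together (the folder and file paths are placeholders):
```python
import onnxruntime

po = [dict(
    executor="vm",                        # required: precompiled models need the VirtualMachine executor
    so_folder="/path/to/precompiled",     # required: directory containing the precompiled model files (placeholder)
    check_hash=True,                      # optional: verify the hash of the tuned model
    hash_file_path="/path/to/hash.file",  # optional: explicit location of the hash file (placeholder)
)]

tvm_session = onnxruntime.InferenceSession(
    "model.onnx",  # placeholder path
    providers=["TvmExecutionProvider"],
    provider_options=po,
)
```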
-
-
-## Samples
-- [Sample notebook for ResNet50 inference with TVM EP](https://github.com/microsoft/onnxruntime/blob/main/docs/python/notebooks/onnxruntime-tvm-tutorial.ipynb)
-
-## Known issues
-- At the moment, the TVM EP has only been verified on UNIX/Linux and Windows systems.
-- Some compatibility issues have been found between ONNX and Google protobuf. `AttributeError: module 'google.protobuf.internal.containers' has no attribute 'MutableMapping'`. This usually occurs during `import onnx` in any Python script for protobuf version >= 3.19.0 and ONNX version <= 1.8.1. To resolve the issue, Google protobuf and ONNX can be reinstalled separately or together using:
-```
-pip3 uninstall onnx -y
-pip3 install onnx==1.10.1
-pip3 uninstall protobuf -y
-pip3 install protobuf==3.19.1
-```
-
-The following pairs of protobuf and ONNX versions have been found to be compatible:
-- 3.17.3 and 1.8.0
-- 3.19.1 and 1.10.1
diff --git a/docs/python/notebooks/onnxruntime-tvm-tutorial.ipynb b/docs/python/notebooks/onnxruntime-tvm-tutorial.ipynb
deleted file mode 100644
index 830495bdfb98d..0000000000000
--- a/docs/python/notebooks/onnxruntime-tvm-tutorial.ipynb
+++ /dev/null
@@ -1,657 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "72476497",
- "metadata": {},
- "source": [
- "# ONNX Runtime: Tutorial for TVM execution provider\n",
- "\n",
- "This notebook shows a simple example for model inference with TVM EP.\n",
- "\n",
- "\n",
- "#### Tutorial Roadmap:\n",
- "1. Prerequistes\n",
- "2. Accuracy check for TVM EP\n",
- "3. Configuration options\n",
- "4. Support precompiled model"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9345cbab",
- "metadata": {},
- "source": [
- "## 1. Prerequistes\n",
- "\n",
- "Make sure that you have installed all the necessary dependencies described in the corresponding paragraph of the documentation.\n",
- "\n",
- "Also, make sure you have the `tvm` and `onnxruntime-tvm` packages in your pip environment. \n",
- "\n",
- "If you are using `PYTHONPATH` variable expansion, make sure it contains the following paths: `/onnxruntime/cmake/external/tvm_update/python` and `/onnxruntime/build/Linux/Release`."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "da4ca21f",
- "metadata": {},
- "source": [
- "### Common import\n",
- "\n",
- "These packages can be delivered from standard `pip`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "0f072875",
- "metadata": {},
- "outputs": [],
- "source": [
- "import os\n",
- "import onnx\n",
- "import tempfile\n",
- "import numpy as np\n",
- "from typing import List, AnyStr\n",
- "from onnx import ModelProto, helper, checker, mapping"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "118670aa",
- "metadata": {},
- "source": [
- "### Specialized import\n",
- "\n",
- "It is better to collect these packages from source code in order to clearly understand what is available to you right now."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "a5502966",
- "metadata": {},
- "outputs": [],
- "source": [
- "import onnxruntime\n",
- "\n",
- "import tvm\n",
- "import tvm.relay\n",
- "import tvm.testing\n",
- "import tvm.runtime\n",
- "import tvm.runtime.vm\n",
- "import tvm.relay.backend.vm\n",
- "import tvm.contrib.download"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b7313183",
- "metadata": {},
- "source": [
- "### Helper functions for working with ONNX ModelProto\n",
- "\n",
- "This set of helper functions allows you to recognize the meta information of the models. This information is needed for more versatile processing of ONNX models."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "7d0a36e8",
- "metadata": {},
- "outputs": [],
- "source": [
- "def get_onnx_input_names(model: ModelProto) -> List[AnyStr]:\n",
- " inputs = [node.name for node in model.graph.input]\n",
- " initializer = [node.name for node in model.graph.initializer]\n",
- " inputs = list(set(inputs) - set(initializer))\n",
- " return sorted(inputs)\n",
- "\n",
- "\n",
- "def get_onnx_output_names(model: ModelProto) -> List[AnyStr]:\n",
- " return [node.name for node in model.graph.output]\n",
- "\n",
- "\n",
- "def get_onnx_input_types(model: ModelProto) -> List[np.dtype]:\n",
- " input_names = get_onnx_input_names(model)\n",
- " return [\n",
- " mapping.TENSOR_TYPE_TO_NP_TYPE[node.type.tensor_type.elem_type]\n",
- " for node in sorted(model.graph.input, key=lambda node: node.name) if node.name in input_names\n",
- " ]\n",
- "\n",
- "\n",
- "def get_onnx_input_shapes(model: ModelProto) -> List[List[int]]:\n",
- " input_names = get_onnx_input_names(model)\n",
- " return [\n",
- " [dv.dim_value for dv in node.type.tensor_type.shape.dim]\n",
- " for node in sorted(model.graph.input, key=lambda node: node.name) if node.name in input_names\n",
- " ]\n",
- "\n",
- "\n",
- "def get_random_model_inputs(model: ModelProto) -> List[np.ndarray]:\n",
- " input_shapes = get_onnx_input_shapes(model)\n",
- " input_types = get_onnx_input_types(model)\n",
- " assert len(input_types) == len(input_shapes)\n",
- " inputs = [np.random.uniform(size=shape).astype(dtype) for shape, dtype in zip(input_shapes, input_types)]\n",
- " return inputs"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f0de1682",
- "metadata": {},
- "source": [
- "### Wrapper helper functions for Inference\n",
- "\n",
- "Wrapper helper functions for running model inference using ONNX Runtime EP."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "258ce9e9",
- "metadata": {},
- "outputs": [],
- "source": [
- "def get_onnxruntime_output(model: ModelProto, inputs: List, provider_name: AnyStr) -> np.ndarray:\n",
- " output_names = get_onnx_output_names(model)\n",
- " input_names = get_onnx_input_names(model)\n",
- " assert len(input_names) == len(inputs)\n",
- " input_dict = {input_name: input_value for input_name, input_value in zip(input_names, inputs)}\n",
- "\n",
- " inference_session = onnxruntime.InferenceSession(model.SerializeToString(), providers=[provider_name])\n",
- " output = inference_session.run(output_names, input_dict)\n",
- "\n",
- " # Unpack output if there's only a single value.\n",
- " if len(output) == 1:\n",
- " output = output[0]\n",
- " return output\n",
- "\n",
- "\n",
- "def get_cpu_onnxruntime_output(model: ModelProto, inputs: List) -> np.ndarray:\n",
- " return get_onnxruntime_output(model, inputs, \"CPUExecutionProvider\")\n",
- "\n",
- "\n",
- "def get_tvm_onnxruntime_output(model: ModelProto, inputs: List) -> np.ndarray:\n",
- " return get_onnxruntime_output(model, inputs, \"TvmExecutionProvider\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "cc17d3b2",
- "metadata": {},
- "source": [
- "### Helper function for checking accuracy\n",
- "\n",
- "This function uses the TVM API to compare two output tensors. The tensor obtained using the `CPUExecutionProvider` is used as a reference.\n",
- "\n",
- "If a mismatch is found between tensors, an appropriate exception will be thrown."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "4e598907",
- "metadata": {},
- "outputs": [],
- "source": [
- "def verify_outputs(\n",
- " lhs: List[np.ndarray],\n",
- " rhs: List[np.ndarray],\n",
- " rtol: float = 5e-5,\n",
- " atol: float = 5e-5\n",
- ") -> None:\n",
- " for lhs_tensor, rhs_tensor in zip(lhs, rhs):\n",
- " tvm.testing.assert_allclose(lhs_tensor, rhs_tensor, rtol=rtol, atol=atol)\n",
- " assert lhs_tensor.dtype == rhs_tensor.dtype\n",
- " print(\"Same output, congratulations!\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "f33a372b",
- "metadata": {},
- "outputs": [],
- "source": [
- "def verify_with_ort_with_inputs(\n",
- " model,\n",
- " inputs,\n",
- " out_shape=None,\n",
- " opset=None,\n",
- " freeze_params=False,\n",
- " dtype=\"float32\",\n",
- " rtol=1e-5,\n",
- " atol=1e-5,\n",
- " opt_level=1,\n",
- "):\n",
- " if opset is not None:\n",
- " model.opset_import[0].version = opset\n",
- "\n",
- " ort_out = get_cpu_onnxruntime_output(model, inputs)\n",
- " tvm_out = get_tvm_onnxruntime_output(model, inputs)\n",
- " verify_outputs(ort_out, tvm_out, rtol, atol)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8c62b01a",
- "metadata": {},
- "source": [
- "### Helper functions for download models\n",
- "\n",
- "These functions use the TVM API to download models from the ONNX Model Zoo."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "324c00e7",
- "metadata": {},
- "outputs": [],
- "source": [
- "BASE_MODEL_URL = \"https://github.com/onnx/models/raw/master/\"\n",
- "MODEL_URL_COLLECTION = {\n",
- " \"ResNet50-v1\": \"vision/classification/resnet/model/resnet50-v1-7.onnx\",\n",
- " \"ResNet50-v2\": \"vision/classification/resnet/model/resnet50-v2-7.onnx\",\n",
- " \"SqueezeNet-v1.1\": \"vision/classification/squeezenet/model/squeezenet1.1-7.onnx\",\n",
- " \"SqueezeNet-v1.0\": \"vision/classification/squeezenet/model/squeezenet1.0-7.onnx\",\n",
- " \"Inception-v1\": \"vision/classification/inception_and_googlenet/inception_v1/model/inception-v1-7.onnx\",\n",
- " \"Inception-v2\": \"vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-7.onnx\",\n",
- "}\n",
- "\n",
- "\n",
- "def get_model_url(model_name):\n",
- " return BASE_MODEL_URL + MODEL_URL_COLLECTION[model_name]\n",
- "\n",
- "\n",
- "def get_name_from_url(url):\n",
- " return url[url.rfind(\"/\") + 1 :].strip()\n",
- "\n",
- "\n",
- "def find_of_download(model_name):\n",
- " model_url = get_model_url(model_name)\n",
- " model_file_name = get_name_from_url(model_url)\n",
- " return tvm.contrib.download.download_testdata(model_url, model_file_name, module=\"models\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "90fb7c5c",
- "metadata": {},
- "source": [
- "## 2. Accuracy check for TVM EP \n",
- "\n",
- "This section will check the accuracy. The check will be to compare the output tensors for `CPUExecutionProvider` and `TvmExecutionProvider`. See the description of `verify_with_ort_with_inputs` function used above.\n",
- "\n",
- "\n",
- "### Check for simple architectures"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "c739ed5c",
- "metadata": {},
- "outputs": [],
- "source": [
- "def get_two_input_model(op_name: AnyStr) -> ModelProto:\n",
- " dtype = \"float32\"\n",
- " in_shape = [1, 2, 3, 3]\n",
- " in_type = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(dtype)]\n",
- " out_shape = in_shape\n",
- " out_type = in_type\n",
- "\n",
- " layer = helper.make_node(op_name, [\"in1\", \"in2\"], [\"out\"])\n",
- " graph = helper.make_graph(\n",
- " [layer],\n",
- " \"two_input_test\",\n",
- " inputs=[\n",
- " helper.make_tensor_value_info(\"in1\", in_type, in_shape),\n",
- " helper.make_tensor_value_info(\"in2\", in_type, in_shape),\n",
- " ],\n",
- " outputs=[\n",
- " helper.make_tensor_value_info(\n",
- " \"out\", out_type, out_shape\n",
- " )\n",
- " ],\n",
- " )\n",
- " model = helper.make_model(graph, producer_name=\"two_input_test\")\n",
- " checker.check_model(model, full_check=True)\n",
- " return model"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "7048ee6d",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Same output, congratulations!\n",
- "****************** Success! ******************\n"
- ]
- }
- ],
- "source": [
- "onnx_model = get_two_input_model(\"Add\")\n",
- "inputs = get_random_model_inputs(onnx_model)\n",
- "verify_with_ort_with_inputs(onnx_model, inputs)\n",
- "print(\"****************** Success! ******************\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "52c880f4",
- "metadata": {},
- "source": [
- "### Check for DNN architectures "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "id": "f5d465dc",
- "metadata": {},
- "outputs": [],
- "source": [
- "def get_onnx_model(model_name):\n",
- " model_path = find_of_download(model_name)\n",
- " onnx_model = onnx.load(model_path)\n",
- " return onnx_model"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "68daac7e",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details.\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Same output, congratulations!\n",
- "****************** Success! ******************\n"
- ]
- }
- ],
- "source": [
- "model_name = \"ResNet50-v1\"\n",
- "\n",
- "onnx_model = get_onnx_model(model_name)\n",
- "inputs = get_random_model_inputs(onnx_model)\n",
- "verify_with_ort_with_inputs(onnx_model, inputs)\n",
- "print(\"****************** Success! ******************\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e27f64a2",
- "metadata": {},
- "source": [
- "## 3. Configuration options\n",
- "\n",
- "This section shows how you can configure TVM EP using custom options. For more details on the options used, see the corresponding section of the documentation."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "a053f59f",
- "metadata": {},
- "outputs": [],
- "source": [
- "provider_name = \"TvmExecutionProvider\"\n",
- "provider_options = dict(\n",
- " target=\"llvm -mtriple=x86_64-linux-gnu\",\n",
- " target_host=\"llvm -mtriple=x86_64-linux-gnu\",\n",
- " opt_level=3,\n",
- " freeze_weights=True,\n",
- " tuning_file_path=\"\",\n",
- " tuning_type=\"Ansor\",\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "3f6e6f01",
- "metadata": {},
- "outputs": [],
- "source": [
- "model_name = \"ResNet50-v1\"\n",
- "onnx_model = get_onnx_model(model_name)\n",
- "input_dict = {\n",
- " input_name: input_value for input_name, input_value in zip(\n",
- " get_onnx_input_names(onnx_model),\n",
- " get_random_model_inputs(onnx_model),\n",
- " )\n",
- "}\n",
- "output_names = get_onnx_output_names(onnx_model)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "id": "85ab83f2",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "****************** Output shape: (1, 1000) ******************\n"
- ]
- }
- ],
- "source": [
- "tvm_session = onnxruntime.InferenceSession(\n",
- " onnx_model.SerializeToString(),\n",
- " providers=[provider_name],\n",
- " provider_options=[provider_options],\n",
- ")\n",
- "output = tvm_session.run(output_names, input_dict)[0]\n",
- "print(f\"****************** Output shape: {output.shape} ******************\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b704374b",
- "metadata": {},
- "source": [
- "## 4. Support precompiled model\n",
- "\n",
- "Wrapper functions that allow you to compile the model and save it in the desired format."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "id": "8150942b",
- "metadata": {},
- "outputs": [],
- "source": [
- "def compile_virtual_machine(model: onnx.ModelProto, target_str: AnyStr) -> tvm.runtime.vm.Executable:\n",
- " ir_mod, params = tvm.relay.frontend.from_onnx(\n",
- " model,\n",
- " opset=model.opset_import[0].version,\n",
- " freeze_params=True,\n",
- " )\n",
- " target = tvm.target.Target(target=target_str, host=target_str)\n",
- " return tvm.relay.backend.vm.compile(ir_mod, target)\n",
- "\n",
- "\n",
- "def serialize_virtual_machine(vm_exec: tvm.runtime.vm.Executable) -> AnyStr:\n",
- " temp_directory = tempfile.mkdtemp()\n",
- " path_consts = os.path.join(temp_directory, \"consts\")\n",
- " vm_exec.move_late_bound_consts(path_consts, byte_limit=256)\n",
- " lib_path = os.path.join(temp_directory, f\"model.so\")\n",
- " code_path = os.path.join(temp_directory, f\"model.ro\")\n",
- " code, lib = vm_exec.save()\n",
- " lib.export_library(lib_path)\n",
- " with open(code_path, \"wb\") as fo:\n",
- " fo.write(code)\n",
- " return temp_directory"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9cbb987e",
- "metadata": {},
- "source": [
- "Preparation of the ONNX model."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "id": "febb9d72",
- "metadata": {},
- "outputs": [],
- "source": [
- "model_name = \"ResNet50-v1\"\n",
- "onnx_model = get_onnx_model(model_name)\n",
- "input_dict = {\n",
- " input_name: input_value for input_name, input_value in zip(\n",
- " get_onnx_input_names(onnx_model),\n",
- " get_random_model_inputs(onnx_model),\n",
- " )\n",
- "}\n",
- "output_names = get_onnx_output_names(onnx_model)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b05b251a",
- "metadata": {},
- "source": [
- "Compiling the ONNX model using `VirtualMachine` (TVM)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "id": "b4b999ee",
- "metadata": {},
- "outputs": [],
- "source": [
- "compiled_vm_exec = compile_virtual_machine(onnx_model, target_str=\"llvm\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "e3408c15",
- "metadata": {},
- "outputs": [],
- "source": [
- "so_folder = serialize_virtual_machine(compiled_vm_exec)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "311405e8",
- "metadata": {},
- "source": [
- "Preparing `ProviderOptions` and launching `TVM EP` inference.\n",
- "\n",
- "In order to use the precompiled model, you only need to pass two options:\n",
- "* **executor** - `vm` (`VirtualMachine`) must be used as a value (this functionality is not supported for `GraphExecutor`);\n",
- "* **so_folder** - as a value, you must pass the path to the directory where the files of the precompiled model are located."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "id": "8927293c",
- "metadata": {},
- "outputs": [],
- "source": [
- "provider_name = \"TvmExecutionProvider\"\n",
- "provider_options = dict(\n",
- " executor=\"vm\",\n",
- " so_folder=so_folder,\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "id": "d7532863",
- "metadata": {},
- "outputs": [],
- "source": [
- "tvm_session = onnxruntime.InferenceSession(\n",
- " onnx_model.SerializeToString(),\n",
- " providers=[provider_name],\n",
- " provider_options=[provider_options],\n",
- ")\n",
- "tvm_output = tvm_session.run(output_names, input_dict)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "1c0b983e",
- "metadata": {},
- "source": [
- "Let's make sure that the output values match those that can be obtained through `CPUExecutionProvider`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "id": "c3de2299",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Same output, congratulations!\n"
- ]
- }
- ],
- "source": [
- "verify_outputs(\n",
- " tvm_output[0],\n",
- " get_cpu_onnxruntime_output(\n",
- " onnx_model,\n",
- " input_dict.values()\n",
- " ),\n",
- ")"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.10"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/include/onnxruntime/core/framework/kernel_registry.h b/include/onnxruntime/core/framework/kernel_registry.h
index 7b3d04ee66d9e..aaf533135429c 100644
--- a/include/onnxruntime/core/framework/kernel_registry.h
+++ b/include/onnxruntime/core/framework/kernel_registry.h
@@ -8,6 +8,9 @@
#include "core/framework/op_kernel.h"
namespace onnxruntime {
+namespace logging {
+class Logger;
+}
using KernelCreateMap = std::multimap<std::string, KernelCreateInfo>;
using KernelDefHashes = std::vector<std::pair<std::string, HashValue>>;
@@ -33,6 +36,7 @@ class KernelRegistry {
// Kernel matching uses the types from the node and the kernel_type_str_resolver.
Status TryFindKernel(const Node& node, ProviderType exec_provider,
const IKernelTypeStrResolver& kernel_type_str_resolver,
+ const logging::Logger& logger,
const KernelCreateInfo** out) const;
// map of type constraint name to required type
@@ -42,6 +46,7 @@ class KernelRegistry {
// Kernel matching uses the explicit type constraint name to required type map in type_constraints.
Status TryFindKernel(const Node& node, ProviderType exec_provider,
const TypeConstraintMap& type_constraints,
+ const logging::Logger& logger,
const KernelCreateInfo** out) const;
/**
@@ -61,13 +66,15 @@ class KernelRegistry {
std::string_view domain,
int version,
const KernelRegistry::TypeConstraintMap& type_constraints,
+ const logging::Logger& logger,
const KernelCreateInfo** out) const;
static bool HasImplementationOf(const KernelRegistry& r, const Node& node,
ProviderType exec_provider,
- const IKernelTypeStrResolver& kernel_type_str_resolver) {
+ const IKernelTypeStrResolver& kernel_type_str_resolver,
+ const logging::Logger& logger) {
const KernelCreateInfo* info;
- Status st = r.TryFindKernel(node, exec_provider, kernel_type_str_resolver, &info);
+ Status st = r.TryFindKernel(node, exec_provider, kernel_type_str_resolver, logger, &info);
return st.IsOK();
}
@@ -83,6 +90,7 @@ class KernelRegistry {
Status TryFindKernelImpl(const Node& node, ProviderType exec_provider,
const IKernelTypeStrResolver* kernel_type_str_resolver,
const TypeConstraintMap* type_constraints,
+ const logging::Logger& logger,
const KernelCreateInfo** out) const;
// Check whether the types of inputs/outputs of the given node match the extra
diff --git a/include/onnxruntime/core/framework/op_kernel.h b/include/onnxruntime/core/framework/op_kernel.h
index a17da2a19bb99..07625c38d8474 100644
--- a/include/onnxruntime/core/framework/op_kernel.h
+++ b/include/onnxruntime/core/framework/op_kernel.h
@@ -79,7 +79,6 @@ class OpKernel {
// the allocator tied to the session if the kernel owns the pre-packed buffer or an
// allocator shared between sessions if the pre-packed buffer is to be shared across sessions
// (i.e.) the kernel does not own the buffer.
- // @param save_prepacked_initializers: Set it to true if intend to save prepacked initializers to external data file.
// @param is_packed: Set it to true if the kernel packed the tensor or to false
// The kernel is responsible for keeping the packed data and related metadata if is_packed is true,
// and the original initialized constant tensor will be released and not accessible anymore in
@@ -89,7 +88,6 @@ class OpKernel {
virtual Status
PrePack(const Tensor& /*tensor*/, int /*input_idx*/, AllocatorPtr /*alloc*/,
- bool, /*save_prepacked_initializers*/
/*out*/ bool& is_packed, /*out*/ PrePackedWeights* /*prepacked_weights*/) {
is_packed = false;
return Status::OK();
@@ -131,26 +129,6 @@ class OpKernel {
return Status::OK();
}
- // Override this function to get pre-packed tensors from this kernel.
- // Only useful for models run on PC with CPU so ORT could load prepacked weights directly from
- // ONNX data file with mmap and no need to do prepacking on fly to save a lot of heap memory.
- // @param input_idx : The index of input we prepacked before and intend to get packed tensor back.
- // Please refer to matmul_nbits kernel for a complete example.
- virtual std::optional<Tensor> GetPrePackTensor(int /*input_idx*/) {
- return std::nullopt;
- }
-
- // Override this function to set pre-packed tensors to this kernel and restore prepacked weight buffer.
- // Only useful for models run on PC with CPU so ORT could load prepacked weights directly from
- // ONNX data file with mmap and no need to do prepacking on fly to save a lot of heap memory.
- // Please refer to matmul_nbits kernel for a complete example.
- // @param input_idx : The input index of the tensor in this kernel.
- // @param pre_packed_tensor: The prepacked tensor read from onnx data file and use the prepacked tensor
- // to restore prepacked weight buffer.
- virtual Status SetPrePackTensor(int /*input_idx*/, const Tensor& /*pre_packed_tensor*/) {
- return Status::OK();
- }
-
const OrtDevice GetDevice(OrtMemType mem_type) const;
const OpKernelInfo& Info() const {
return *op_kernel_info_;
diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h
index 69af3c93d7a07..eb9581e8018d1 100644
--- a/include/onnxruntime/core/graph/graph.h
+++ b/include/onnxruntime/core/graph/graph.h
@@ -1148,11 +1148,6 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
void FinalizeFuseSubGraph(const IndexedSubGraph& sub_graph, Node& fused_node);
#endif
- // Since one constant initializer could be used by different kernels
- // and prepacked differently, use an unordered_map to store prepacked
- // initializer in format of <[initializer_name], <[node_name], [prepacked_initializer]>>
- typedef std::unordered_map> PrePackedTensorProtoToSave;
-
#if !defined(ORT_MINIMAL_BUILD)
/** Gets the GraphProto representation of this Graph. */
const ONNX_NAMESPACE::GraphProto& ToGraphProto();
@@ -1187,26 +1182,18 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
@param initializer_size_threshold initializers larger or equal to this threshold (in bytes) are saved
in the external file. Initializer smaller than this threshold are included in the onnx file.
@param align_info offset alignment info.
- @param save_prepacked_constant_initializers whether to save prepacked initializer into external data file.
- If set false to this boolean, prepacked initializer will not be saved into onnxruntime data file,
- we keep constant initializer as it is.
- @param pre_packed_initializers struct used to store all the prepacked initializers.
@returns GraphProto serialization of the graph.
*/
ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
const std::filesystem::path& model_file_path,
size_t initializer_size_threshold,
- const OffsetAlignmentInfo& align_info,
- bool save_prepacked_constant_initializers,
- PrePackedTensorProtoToSave& pre_packed_initializers) const;
+ const OffsetAlignmentInfo& align_info) const;
ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
const std::filesystem::path& model_file_path,
size_t initializer_size_threshold) const {
OffsetAlignmentInfo default_options;
- PrePackedTensorProtoToSave pre_packed_initializers;
- return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options,
- false, pre_packed_initializers);
+ return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options);
}
/** Gets the ISchemaRegistry instances being used with this Graph. */
@@ -1521,18 +1508,6 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
private:
void InitializeStateFromModelFileGraphProto();
- // Private method used to setup external initializer properly during model save,
- // this external initializer could be the original initializer or a prepacked initializer.
- static void SetUpExternalInitializer(const Graph::OffsetAlignmentInfo& align_info,
- size_t tensor_bytes_size,
- int64_t& external_offset,
- std::ofstream& external_stream,
- gsl::span raw_data,
- ONNX_NAMESPACE::TensorProto& output_proto,
- const std::filesystem::path& external_file_path,
- const ONNX_NAMESPACE::TensorProto& initializer,
- bool is_prepacked);
-
// Add node with specified .
Node& AddNode(const ONNX_NAMESPACE::NodeProto& node_proto,
const ArgNameToTypeMap& name_to_type);
diff --git a/include/onnxruntime/core/optimizer/graph_transformer_utils.h b/include/onnxruntime/core/optimizer/graph_transformer_utils.h
index 6cff153c336f0..31b0f22340510 100644
--- a/include/onnxruntime/core/optimizer/graph_transformer_utils.h
+++ b/include/onnxruntime/core/optimizer/graph_transformer_utils.h
@@ -53,6 +53,7 @@ InlinedVector> GenerateTransformers(
TransformerLevel level,
const SessionOptions& session_options,
const IExecutionProvider& execution_provider /*required by constant folding*/,
+ const logging::Logger& logger,
const InlinedHashSet& rules_and_transformers_to_disable = {},
concurrency::ThreadPool* intra_op_thread_pool = nullptr,
std::unordered_map>* p_buffered_tensors = nullptr);
@@ -84,6 +85,7 @@ InlinedVector> GenerateTransformersForMinimalB
const SessionOptions& session_options,
const SatApplyContextVariant& apply_context,
const IExecutionProvider& cpu_execution_provider,
+ const logging::Logger& logger,
const InlinedHashSet& rules_and_transformers_to_disable = {},
concurrency::ThreadPool* intra_op_thread_pool = nullptr,
std::unordered_map>* p_buffered_tensors = nullptr);
diff --git a/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h b/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h
index 98fa9e09f1ba8..d035fd34bd072 100644
--- a/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h
+++ b/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h
@@ -41,6 +41,27 @@ enum COREMLFlags {
COREML_FLAG_LAST = COREML_FLAG_USE_CPU_AND_GPU,
};
+// MLComputeUnits can be one of the following values:
+// 'MLComputeUnitsCPUAndNeuralEngine|MLComputeUnitsCPUAndGPU|MLComputeUnitsCPUOnly|MLComputeUnitsAll'
+// these values are intended to be used with Ort::SessionOptions::AppendExecutionProvider (C++ API)
+// and SessionOptionsAppendExecutionProvider (C API). For the old API, use COREMLFlags instead.
+static const char* const kCoremlProviderOption_MLComputeUnits = "MLComputeUnits";
+static const char* const kCoremlProviderOption_ModelFormat = "ModelFormat";
+// same as COREML_FLAG_ONLY_ALLOW_STATIC_INPUT_SHAPES
+static const char* const kCoremlProviderOption_RequireStaticInputShapes = "RequireStaticInputShapes";
+static const char* const kCoremlProviderOption_EnableOnSubgraphs = "EnableOnSubgraphs";
+// provided by https://developer.apple.com/documentation/coreml/mloptimizationhints-swift.struct/specializationstrategy-swift.property
+// Core ML segments the model’s compute graph and specializes each segment for the target compute device.
+// This process can affect the model loading time and the prediction latency.
+// Use this option to tailor the specialization strategy for your model.
+static const char* const kCoremlProviderOption_SpecializationStrategy = "SpecializationStrategy";
+// Profile the Core ML MLComputePlan.
+// This logs the hardware each operator is dispatched to and the estimated execution time.
+// Intended for developer usage but provides useful diagnostic information if performance is not as expected.
+static const char* const kCoremlProviderOption_ProfileComputePlan = "ProfileComputePlan";
+// please refer to https://developer.apple.com/documentation/coreml/mlmodelconfiguration/allowlowprecisionaccumulationongpu
+static const char* const kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU = "AllowLowPrecisionAccumulationOnGPU";
+
#ifdef __cplusplus
extern "C" {
#endif
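The CoreML provider option keys added above are plain string key/value pairs. As a rough illustration only (assuming they are routed through the generic provider-options mechanism the comments mention, and using the `MLComputeUnits` value spelled out in the comment; the model path is a placeholder), they might be passed from Python like this:
```python
import onnxruntime

coreml_options = {
    "MLComputeUnits": "MLComputeUnitsAll",  # one of the values listed in the comment above
    "ProfileComputePlan": "1",              # assumed "0"/"1" string flag for compute-plan profiling
    "RequireStaticInputShapes": "0",        # assumed "0"/"1" string flag, mirroring the CoreML flag
}

session = onnxruntime.InferenceSession(
    "model.onnx",  # placeholder path
    providers=["CoreMLExecutionProvider"],
    provider_options=[coreml_options],
)
```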
diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
index b0c5d2329c428..a35d975ac8f1b 100644
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -626,8 +626,13 @@ typedef struct OrtMIGraphXProviderOptions {
} OrtMIGraphXProviderOptions;
/** \brief OpenVINO Provider Options
- *
- * \see OrtApi::SessionOptionsAppendExecutionProvider_OpenVINO
+ * \brief This struct is frozen since ORT 1.13.0. It is maintained as part of the legacy API for compatibility.
+ * \brief For the latest OpenVINO provider options, switch to the ProviderOptions map.
+ * \brief Latest OpenVINO Provider Options are listed in the
+ * \htmlonly
+ * onnxruntime document.
+ * \endhtmlonly
+ * \see OrtApi::SessionOptionsAppendExecutionProvider()
*/
typedef struct OrtOpenVINOProviderOptions {
#ifdef __cplusplus
@@ -645,7 +650,7 @@ typedef struct OrtOpenVINOProviderOptions {
* Valid settings are one of: "CPU_FP32", "CPU_FP16", "GPU_FP32", "GPU_FP16"
*/
const char* device_type;
- unsigned char enable_npu_fast_compile;
+ unsigned char enable_npu_fast_compile; ///< 0 = disabled, nonzero = enabled
const char* device_id;
size_t num_of_threads; ///< 0 = Use default number of threads
const char* cache_dir; // path is set to empty by default
@@ -3662,6 +3667,9 @@ struct OrtApi {
* execution provider (typically CPU EP).
* - "0": Default. Disabled. QNN EP will handle quantization and dequantization of graph I/O.
* - "1": Enabled.
+ * "enable_htp_spill_fill_buffer": Enable HTP spill fill buffer setting. The flag is used while generating context binary.
+ * - "0": Default. Disabled.
+ * - "1": Enabled.
*
* SNPE supported keys:
* "runtime": SNPE runtime engine, options: "CPU", "CPU_FLOAT32", "GPU", "GPU_FLOAT32_16_HYBRID", "GPU_FLOAT16",
@@ -4607,6 +4615,8 @@ struct OrtApi {
* \param[in] num_keys
*
* \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.17.
*/
ORT_API2_STATUS(SessionOptionsAppendExecutionProvider_OpenVINO_V2,
_In_ OrtSessionOptions* options,
@@ -4624,6 +4634,8 @@ struct OrtApi {
* \param[in] num_keys
*
* \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.18.
*/
ORT_API2_STATUS(SessionOptionsAppendExecutionProvider_VitisAI,
_In_ OrtSessionOptions* options,
@@ -4637,7 +4649,10 @@ struct OrtApi {
* \param[in] mem_info OrtMemoryInfo instance
* \param[in] count_or_bytes How many bytes is this scratch buffer
* \param[out] out A pointer to the scrach buffer
+ *
* \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.18.
*/
ORT_API2_STATUS(KernelContext_GetScratchBuffer, _In_ const OrtKernelContext* context, _In_ const OrtMemoryInfo* mem_info, _In_ size_t count_or_bytes, _Outptr_ void** out);
@@ -4648,6 +4663,8 @@ struct OrtApi {
* \param[out] out A pointer to OrtAllocator
*
* \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.18.
*/
ORT_API2_STATUS(KernelInfoGetAllocator, _In_ const OrtKernelInfo* info, _In_ OrtMemType mem_type, _Outptr_ OrtAllocator** out);
@@ -4669,6 +4686,8 @@ struct OrtApi {
* \param[in] num_external_initializer_files Number of external files
*
* \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.18.
*/
ORT_API2_STATUS(AddExternalInitializersFromFilesInMemory, _In_ OrtSessionOptions* options,
_In_reads_(num_external_initializer_files) const ORTCHAR_T* const* external_initializer_file_names,
@@ -4691,6 +4710,8 @@ struct OrtApi {
* OrtApi::ReleaseLoraAdapter.
*
* \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.20.
*/
ORT_API2_STATUS(CreateLoraAdapter, const ORTCHAR_T* adapter_file_path, _In_ OrtAllocator* allocator,
_Outptr_ OrtLoraAdapter** out);
@@ -4709,6 +4730,8 @@ struct OrtApi {
* OrtApi::ReleaseLoraAdapter.
*
* \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.20.
*/
ORT_API2_STATUS(CreateLoraAdapterFromArray, _In_ const void* bytes, size_t num_bytes, _In_ OrtAllocator* allocator,
_Outptr_ OrtLoraAdapter** out);
@@ -4730,6 +4753,8 @@ struct OrtApi {
* \param[in] adapter OrtLoraAdapter instance
*
* \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.20.
*/
ORT_API2_STATUS(RunOptionsAddActiveLoraAdapter, _Inout_ OrtRunOptions* options, _In_ const OrtLoraAdapter* adapter);
@@ -4748,6 +4773,8 @@ struct OrtApi {
* \param[in] kv_len Number of elements in the keys and values arrays
*
* \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.20.
*/
ORT_API2_STATUS(SetEpDynamicOptions, _Inout_ OrtSession* sess, _In_reads_(kv_len) const char* const* keys,
_In_reads_(kv_len) const char* const* values, _In_ size_t kv_len);
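One of the QNN keys documented in this header hunk is `enable_htp_spill_fill_buffer`. As a hedged sketch only (assuming the key is accepted through the generic provider-options path; the model path is a placeholder, and `libQnnHtp.so` is the HTP backend library also used by the Android test later in this diff):
```python
import onnxruntime

qnn_options = {
    "backend_path": "libQnnHtp.so",       # QNN HTP backend library
    "enable_htp_spill_fill_buffer": "1",  # "1" enables the HTP spill-fill buffer while generating a context binary
}

session = onnxruntime.InferenceSession(
    "model.onnx",  # placeholder path
    providers=["QNNExecutionProvider"],
    provider_options=[qnn_options],
)
```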
diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
index 086919913cbea..6a01602e634f8 100644
--- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -246,12 +246,6 @@ static const char* const kOrtSessionOptionsDisableCPUEPFallback = "session.disab
static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFileName =
"session.optimized_model_external_initializers_file_name";
-// Use this config when save prepacked constant initializers to onnx external data file.
-// Default is not save prepacked initializers to onnx data file.
-// Sample usage: sess_options.add_session_config_entry('session.save_prepacked_constant_initializers', "1")
-static const char* const kOrtSessionOptionsSavePrePackedConstantInitializers =
- "session.save_prepacked_constant_initializers";
-
// Use this config to control the minimum size of the initializer when externalizing it during serialization
static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes =
"session.optimized_model_external_initializers_min_size_in_bytes";
diff --git a/java/build-android.gradle b/java/build-android.gradle
index d5839f9f27869..9c4275b74f626 100644
--- a/java/build-android.gradle
+++ b/java/build-android.gradle
@@ -82,7 +82,7 @@ allprojects {
}
android {
- compileSdkVersion 32
+ compileSdkVersion 34
defaultConfig {
minSdkVersion minSdkVer
@@ -108,8 +108,8 @@ android {
}
compileOptions {
- sourceCompatibility = JavaVersion.VERSION_1_8
- targetCompatibility = JavaVersion.VERSION_1_8
+ sourceCompatibility = JavaVersion.VERSION_17
+ targetCompatibility = JavaVersion.VERSION_17
}
sourceSets {
diff --git a/java/build.gradle b/java/build.gradle
index 34ac93cce6f4e..845121dd17a48 100644
--- a/java/build.gradle
+++ b/java/build.gradle
@@ -50,8 +50,8 @@ mavenSettings {
}
java {
- sourceCompatibility = JavaVersion.VERSION_1_8
- targetCompatibility = JavaVersion.VERSION_1_8
+ sourceCompatibility = JavaVersion.VERSION_17
+ targetCompatibility = JavaVersion.VERSION_17
}
// This jar tasks serves as a CMAKE signaling
diff --git a/java/gradle/wrapper/gradle-wrapper.properties b/java/gradle/wrapper/gradle-wrapper.properties
index 4baf5a11d45a3..381baa9cef1ec 100644
--- a/java/gradle/wrapper/gradle-wrapper.properties
+++ b/java/gradle/wrapper/gradle-wrapper.properties
@@ -1,7 +1,7 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
-distributionSha256Sum=9631d53cf3e74bfa726893aee1f8994fee4e060c401335946dba2156f440f24c
-distributionUrl=https\://services.gradle.org/distributions/gradle-8.6-bin.zip
+distributionSha256Sum=544c35d6bd849ae8a5ed0bcea39ba677dc40f49df7d1835561582da2009b961d
+distributionUrl=https\://services.gradle.org/distributions/gradle-8.7-bin.zip
networkTimeout=10000
validateDistributionUrl=true
zipStoreBase=GRADLE_USER_HOME
diff --git a/java/gradlew.bat b/java/gradlew.bat
index 93e3f59f135dd..25da30dbdeee9 100644
--- a/java/gradlew.bat
+++ b/java/gradlew.bat
@@ -43,11 +43,11 @@ set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if %ERRORLEVEL% equ 0 goto execute
-echo.
-echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
-echo.
-echo Please set the JAVA_HOME variable in your environment to match the
-echo location of your Java installation.
+echo. 1>&2
+echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 1>&2
+echo. 1>&2
+echo Please set the JAVA_HOME variable in your environment to match the 1>&2
+echo location of your Java installation. 1>&2
goto fail
@@ -57,11 +57,11 @@ set JAVA_EXE=%JAVA_HOME%/bin/java.exe
if exist "%JAVA_EXE%" goto execute
-echo.
-echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
-echo.
-echo Please set the JAVA_HOME variable in your environment to match the
-echo location of your Java installation.
+echo. 1>&2
+echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 1>&2
+echo. 1>&2
+echo Please set the JAVA_HOME variable in your environment to match the 1>&2
+echo location of your Java installation. 1>&2
goto fail
diff --git a/java/src/main/java/ai/onnxruntime/OrtSession.java b/java/src/main/java/ai/onnxruntime/OrtSession.java
index 7280f3c88e2e8..32dc9d9f84aaa 100644
--- a/java/src/main/java/ai/onnxruntime/OrtSession.java
+++ b/java/src/main/java/ai/onnxruntime/OrtSession.java
@@ -1323,6 +1323,18 @@ public void addQnn(Map providerOptions) throws OrtException {
addExecutionProvider(qnnProviderName, providerOptions);
}
+ /**
+ * Adds CoreML as an execution backend.
+ *
+ * @param providerOptions Configuration options for the CoreML backend. Refer to the CoreML
+ * execution provider's documentation.
+ * @throws OrtException If there was an error in native code.
+ */
+ public void addCoreML(Map<String, String> providerOptions) throws OrtException {
+ String CoreMLProviderName = "CoreML";
+ addExecutionProvider(CoreMLProviderName, providerOptions);
+ }
+
private native void setExecutionMode(long apiHandle, long nativeHandle, int mode)
throws OrtException;
diff --git a/java/src/test/android/README.md b/java/src/test/android/README.md
index b84021669c9fe..b086be3dc904c 100644
--- a/java/src/test/android/README.md
+++ b/java/src/test/android/README.md
@@ -29,6 +29,11 @@ Use the android's [build instructions](https://onnxruntime.ai/docs/build/android
Please note that you may need to set the `--android_abi=x86_64` (the default option is `arm64-v8a`). This is because android instrumentation test is run on an android emulator which requires an abi of `x86_64`.
+#### QNN Builds
+We use two AndroidManifest.xml files to manage different runtime requirements for QNN support. In the [build configuration](app/build.gradle), we specify which manifest file to use based on the qnnVersion.
+In the [QNN manifest](app/src/main/AndroidManifestQnn.xml), we include the declaration for libcdsprpc.so, which is required for devices using QNN and Qualcomm DSP capabilities.
+For QNN builds, it is also necessary to set the `ADSP_LIBRARY_PATH` environment variable to the [native library directory](https://developer.android.com/reference/android/content/pm/ApplicationInfo#nativeLibraryDir) depending on the device. This ensures that any native libraries downloaded as dependencies, such as the QNN libraries, are found by the application. This is conditionally added by using the BuildConfig field IS_QNN_BUILD set in the build.gradle file.
+
#### Build Output
The build will generate two apks which is required to run the test application in `$YOUR_BUILD_DIR/java/androidtest/android/app/build/outputs/apk`:
diff --git a/java/src/test/android/app/build.gradle b/java/src/test/android/app/build.gradle
index 381de06cc09de..baf18e714d25c 100644
--- a/java/src/test/android/app/build.gradle
+++ b/java/src/test/android/app/build.gradle
@@ -4,18 +4,27 @@ plugins {
}
def minSdkVer = System.properties.get("minSdkVer")?:24
+def qnnVersion = System.properties['qnnVersion']
android {
- compileSdkVersion 32
+ compileSdkVersion 34
defaultConfig {
applicationId "ai.onnxruntime.example.javavalidator"
minSdkVersion minSdkVer
- targetSdkVersion 32
+ targetSdkVersion 34
versionCode 1
versionName "1.0"
testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner"
+
+ // Add BuildConfig field for qnnVersion
+ if (qnnVersion != null) {
+ buildConfigField "boolean", "IS_QNN_BUILD", "true"
+ }
+ else {
+ buildConfigField "boolean", "IS_QNN_BUILD", "false"
+ }
}
buildTypes {
@@ -25,11 +34,29 @@ android {
}
}
compileOptions {
- sourceCompatibility JavaVersion.VERSION_1_8
- targetCompatibility JavaVersion.VERSION_1_8
+ sourceCompatibility JavaVersion.VERSION_17
+ targetCompatibility JavaVersion.VERSION_17
}
kotlinOptions {
- jvmTarget = '1.8'
+ jvmTarget = '17'
+ }
+ // Conditional packagingOptions for QNN builds only
+ if (qnnVersion != null) {
+ packagingOptions {
+ jniLibs {
+ useLegacyPackaging = true
+ }
+ // Dsp is used in older QC devices and not supported by ORT
+ // Gpu support isn't the target, we just want Npu support (Htp)
+ exclude 'lib/arm64-v8a/libQnnGpu.so'
+ exclude 'lib/arm64-v8a/libQnnDsp*.so'
+ }
+
+ sourceSets {
+ main {
+ manifest.srcFile 'src/main/AndroidManifestQnn.xml' // Use QNN manifest
+ }
+ }
}
namespace 'ai.onnxruntime.example.javavalidator'
}
@@ -42,11 +69,20 @@ dependencies {
implementation 'com.google.android.material:material:1.3.0'
implementation 'androidx.constraintlayout:constraintlayout:2.0.4'
testImplementation 'junit:junit:4.+'
- androidTestImplementation 'androidx.test.ext:junit:1.1.3'
- androidTestImplementation 'androidx.test.espresso:espresso-core:3.4.0'
- implementation(name: "onnxruntime-android", ext: "aar")
+ androidTestImplementation "androidx.test.ext:junit:1.1.5"
+ androidTestImplementation "androidx.test.espresso:espresso-core:3.5.0"
- androidTestImplementation 'androidx.test:runner:1.4.0'
- androidTestImplementation 'androidx.test:rules:1.4.0'
+ androidTestImplementation "androidx.test:runner:1.5.2"
+ androidTestImplementation "androidx.test:rules:1.5.0"
androidTestImplementation 'com.microsoft.appcenter:espresso-test-extension:1.4'
+
+ // dependencies for onnxruntime-android-qnn
+ if (qnnVersion != null) {
+ implementation(name: "onnxruntime-android-qnn", ext: "aar")
+ implementation "com.qualcomm.qti:qnn-runtime:$qnnVersion"
+ }
+ else {
+ implementation(name: "onnxruntime-android", ext: "aar")
+ }
+
}
diff --git a/java/src/test/android/app/src/androidTest/java/ai/onnxruntime/example/javavalidator/SimpleTest.kt b/java/src/test/android/app/src/androidTest/java/ai/onnxruntime/example/javavalidator/SimpleTest.kt
index 166803ae263a5..5e6bee6cac9f4 100644
--- a/java/src/test/android/app/src/androidTest/java/ai/onnxruntime/example/javavalidator/SimpleTest.kt
+++ b/java/src/test/android/app/src/androidTest/java/ai/onnxruntime/example/javavalidator/SimpleTest.kt
@@ -38,13 +38,18 @@ class SimpleTest {
@Test
fun runSigmoidModelTest() {
for (intraOpNumThreads in 1..4) {
- runSigmoidModelTestImpl(intraOpNumThreads)
+ runSigmoidModelTestImpl(intraOpNumThreads, OrtProvider.CPU)
}
}
@Test
fun runSigmoidModelTestNNAPI() {
- runSigmoidModelTestImpl(1, true)
+ runSigmoidModelTestImpl(1, OrtProvider.NNAPI)
+ }
+
+ @Test
+ fun runSigmoidModelTestQNN() {
+ runSigmoidModelTestImpl(1, OrtProvider.QNN)
}
@Throws(IOException::class)
@@ -54,22 +59,49 @@ class SimpleTest {
}
@Throws(OrtException::class, IOException::class)
- fun runSigmoidModelTestImpl(intraOpNumThreads: Int, useNNAPI: Boolean = false) {
- reportHelper.label("Start Running Test with intraOpNumThreads=$intraOpNumThreads, useNNAPI=$useNNAPI")
+ fun runSigmoidModelTestImpl(intraOpNumThreads: Int, executionProvider: OrtProvider) {
+ reportHelper.label("Start Running Test with intraOpNumThreads=$intraOpNumThreads, executionProvider=$executionProvider")
Log.println(Log.INFO, TAG, "Testing with intraOpNumThreads=$intraOpNumThreads")
- Log.println(Log.INFO, TAG, "Testing with useNNAPI=$useNNAPI")
+ Log.println(Log.INFO, TAG, "Testing with executionProvider=$executionProvider")
+
val env = OrtEnvironment.getEnvironment(OrtLoggingLevel.ORT_LOGGING_LEVEL_VERBOSE)
env.use {
val opts = SessionOptions()
opts.setIntraOpNumThreads(intraOpNumThreads)
- if (useNNAPI) {
- if (OrtEnvironment.getAvailableProviders().contains(OrtProvider.NNAPI)) {
- opts.addNnapi()
- } else {
- Log.println(Log.INFO, TAG, "NO NNAPI EP available, skip the test")
- return
+
+ when (executionProvider) {
+
+ OrtProvider.NNAPI -> {
+ if (OrtEnvironment.getAvailableProviders().contains(OrtProvider.NNAPI)) {
+ opts.addNnapi()
+ } else {
+ Log.println(Log.INFO, TAG, "NO NNAPI EP available, skip the test")
+ return
+ }
+ }
+
+ OrtProvider.QNN -> {
+ if (OrtEnvironment.getAvailableProviders().contains(OrtProvider.QNN)) {
+ // Since this is running in an Android environment, we use the .so library
+ val qnnLibrary = "libQnnHtp.so"
+ val providerOptions = Collections.singletonMap("backend_path", qnnLibrary)
+ opts.addQnn(providerOptions)
+ } else {
+ Log.println(Log.INFO, TAG, "NO QNN EP available, skip the test")
+ return
+ }
+ }
+
+ OrtProvider.CPU -> {
+ // No additional configuration is needed for CPU
+ }
+
+ else -> {
+ // Non exhaustive when statements on enum will be prohibited in future Gradle versions
+ Log.println(Log.INFO, TAG, "Skipping test as OrtProvider is not implemented")
}
}
+
opts.use {
val session = env.createSession(readModel("sigmoid.ort"), opts)
session.use {
@@ -92,13 +124,15 @@ class SimpleTest {
output.use {
@Suppress("UNCHECKED_CAST")
val rawOutput = output[0].value as Array>
+ // QNN EP will run the Sigmoid float32 op with fp16 precision
+ val precision = if (executionProvider == OrtProvider.QNN) 1e-3 else 1e-6
for (i in 0..2) {
for (j in 0..3) {
for (k in 0..4) {
Assert.assertEquals(
rawOutput[i][j][k],
expected[i][j][k],
- 1e-6.toFloat()
+ precision.toFloat()
)
}
}
diff --git a/java/src/test/android/app/src/main/AndroidManifest.xml b/java/src/test/android/app/src/main/AndroidManifest.xml
index 2938b7e8bf409..08a612ed79fd6 100644
--- a/java/src/test/android/app/src/main/AndroidManifest.xml
+++ b/java/src/test/android/app/src/main/AndroidManifest.xml
@@ -17,4 +17,4 @@
-
\ No newline at end of file
+
diff --git a/java/src/test/android/app/src/main/AndroidManifestQnn.xml b/java/src/test/android/app/src/main/AndroidManifestQnn.xml
new file mode 100644
index 0000000000000..c9416523a9c91
--- /dev/null
+++ b/java/src/test/android/app/src/main/AndroidManifestQnn.xml
@@ -0,0 +1,23 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/java/src/test/android/app/src/main/java/ai/onnxruntime/example/javavalidator/MainActivity.kt b/java/src/test/android/app/src/main/java/ai/onnxruntime/example/javavalidator/MainActivity.kt
index 62e23c4b9b862..3b3a2d057b16e 100644
--- a/java/src/test/android/app/src/main/java/ai/onnxruntime/example/javavalidator/MainActivity.kt
+++ b/java/src/test/android/app/src/main/java/ai/onnxruntime/example/javavalidator/MainActivity.kt
@@ -1,11 +1,19 @@
package ai.onnxruntime.example.javavalidator
import android.os.Bundle
+import android.system.Os
import androidx.appcompat.app.AppCompatActivity
/*Empty activity app mainly used for testing*/
class MainActivity : AppCompatActivity() {
override fun onCreate(savedInstanceState: Bundle?) {
+ if (BuildConfig.IS_QNN_BUILD) {
+ val adspLibraryPath = applicationContext.applicationInfo.nativeLibraryDir
+ // set the path variable to the native library directory
+ // so that any native libraries downloaded as dependencies
+ // (like qnn libs) are found
+ Os.setenv("ADSP_LIBRARY_PATH", adspLibraryPath, true)
+ }
super.onCreate(savedInstanceState)
}
-}
\ No newline at end of file
+}
diff --git a/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java b/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java
index 57c4eb3577fd0..fa0b6fd0ef9d9 100644
--- a/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java
+++ b/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java
@@ -27,6 +27,7 @@
import java.util.HashMap;
import java.util.Map;
import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.condition.DisabledIfSystemProperty;
import org.junit.jupiter.api.condition.EnabledIfSystemProperty;
public class ProviderOptionsTest {
@@ -34,6 +35,7 @@ public class ProviderOptionsTest {
@Test
@EnabledIfSystemProperty(named = "USE_CUDA", matches = "1")
+ @DisabledIfSystemProperty(named = "NO_CUDA_TEST", matches = "1")
public void testCUDAOptions() throws OrtException {
// Test standard options
OrtCUDAProviderOptions cudaOpts = new OrtCUDAProviderOptions(0);
@@ -61,6 +63,7 @@ public void testCUDAOptions() throws OrtException {
@Test
@EnabledIfSystemProperty(named = "USE_TENSORRT", matches = "1")
+ @DisabledIfSystemProperty(named = "NO_CUDA_TEST", matches = "1")
public void testTensorRT() throws OrtException {
// Test standard options
OrtTensorRTProviderOptions rtOpts = new OrtTensorRTProviderOptions(0);
diff --git a/js/.eslintrc.js b/js/.eslintrc.js
index bd1e9061355f5..462e417df1d66 100644
--- a/js/.eslintrc.js
+++ b/js/.eslintrc.js
@@ -198,19 +198,6 @@ module.exports = {
'_OrtReleaseTensor',
'_OrtRun',
'_OrtRunWithBinding',
- '_OrtTrainingCopyParametersFromBuffer',
- '_OrtTrainingCopyParametersToBuffer',
- '_OrtTrainingCreateSession',
- '_OrtTrainingEvalStep',
- '_OrtTrainingGetModelInputOutputCount',
- '_OrtTrainingGetModelInputOutputName',
- '_OrtTrainingGetParametersSize',
- '_OrtTrainingLazyResetGrad',
- '_OrtTrainingLoadCheckpoint',
- '_OrtTrainingOptimizerStep',
- '_OrtTrainingReleaseCheckpoint',
- '_OrtTrainingReleaseSession',
- '_OrtTrainingRunTrainStep',
],
},
],
diff --git a/js/common/lib/backend.ts b/js/common/lib/backend.ts
index e27e67622aa82..e63f9c6c9147f 100644
--- a/js/common/lib/backend.ts
+++ b/js/common/lib/backend.ts
@@ -3,7 +3,6 @@
import { InferenceSession } from './inference-session.js';
import { OnnxValue } from './onnx-value.js';
-import { TrainingSession } from './training-session.js';
/**
* @ignore
@@ -42,33 +41,6 @@ export interface InferenceSessionHandler extends SessionHandler {
): Promise;
}
-/**
- * Represent a handler instance of a training inference session.
- *
- * @ignore
- */
-export interface TrainingSessionHandler extends SessionHandler {
- readonly evalInputNames: readonly string[];
- readonly evalOutputNames: readonly string[];
-
- lazyResetGrad(): Promise;
- runTrainStep(
- feeds: SessionHandler.FeedsType,
- fetches: SessionHandler.FetchesType,
- options: InferenceSession.RunOptions,
- ): Promise;
- runOptimizerStep(options: InferenceSession.RunOptions): Promise;
- runEvalStep(
- feeds: SessionHandler.FeedsType,
- fetches: SessionHandler.FetchesType,
- options: InferenceSession.RunOptions,
- ): Promise;
-
- getParametersSize(trainableOnly: boolean): Promise;
- loadParametersBuffer(buffer: Uint8Array, trainableOnly: boolean): Promise;
- getContiguousParameters(trainableOnly: boolean): Promise;
-}
-
/**
* Represent a backend that provides implementation of model inferencing.
*
@@ -84,14 +56,6 @@ export interface Backend {
uriOrBuffer: string | Uint8Array,
options?: InferenceSession.SessionOptions,
): Promise;
-
- createTrainingSessionHandler?(
- checkpointStateUriOrBuffer: TrainingSession.UriOrBuffer,
- trainModelUriOrBuffer: TrainingSession.UriOrBuffer,
- evalModelUriOrBuffer: TrainingSession.UriOrBuffer,
- optimizerModelUriOrBuffer: TrainingSession.UriOrBuffer,
- options: InferenceSession.SessionOptions,
- ): Promise;
}
export { registerBackend } from './backend-impl.js';
diff --git a/js/common/lib/env.ts b/js/common/lib/env.ts
index 642a897a90d26..e70f608ad7030 100644
--- a/js/common/lib/env.ts
+++ b/js/common/lib/env.ts
@@ -2,6 +2,7 @@
// Licensed under the MIT License.
import { env as envImpl } from './env-impl.js';
+import { TryGetGlobalType } from './type-helper.js';
export declare namespace Env {
export type WasmPathPrefix = string;
@@ -14,7 +15,6 @@ export declare namespace Env {
* If not modified, the filename of the .wasm file is:
* - `ort-wasm-simd-threaded.wasm` for default build
* - `ort-wasm-simd-threaded.jsep.wasm` for JSEP build (with WebGPU and WebNN)
- * - `ort-training-wasm-simd-threaded.wasm` for training build
*/
wasm?: URL | string;
/**
@@ -25,7 +25,6 @@ export declare namespace Env {
* If not modified, the filename of the .mjs file is:
* - `ort-wasm-simd-threaded.mjs` for default build
* - `ort-wasm-simd-threaded.jsep.mjs` for JSEP build (with WebGPU and WebNN)
- * - `ort-training-wasm-simd-threaded.mjs` for training build
*/
mjs?: URL | string;
}
@@ -200,22 +199,16 @@ export declare namespace Env {
* value will be the GPU adapter that created by the underlying WebGPU backend.
*
* When use with TypeScript, the type of this property is `GPUAdapter` defined in "@webgpu/types".
- * Use `const adapter = env.webgpu.adapter as GPUAdapter;` in TypeScript to access this property with correct type.
- *
- * see comments on {@link Tensor.GpuBufferType}
*/
- adapter: unknown;
+ adapter: TryGetGlobalType<'GPUAdapter'>;
/**
* Get the device for WebGPU.
*
* This property is only available after the first WebGPU inference session is created.
*
* When use with TypeScript, the type of this property is `GPUDevice` defined in "@webgpu/types".
- * Use `const device = env.webgpu.device as GPUDevice;` in TypeScript to access this property with correct type.
- *
- * see comments on {@link Tensor.GpuBufferType} for more details about why not use types defined in "@webgpu/types".
*/
- readonly device: unknown;
+ readonly device: TryGetGlobalType<'GPUDevice'>;
/**
* Set or get whether validate input content.
*
diff --git a/js/common/lib/index.ts b/js/common/lib/index.ts
index 3ed56b3c2e812..d75e6a477258d 100644
--- a/js/common/lib/index.ts
+++ b/js/common/lib/index.ts
@@ -26,4 +26,3 @@ export * from './tensor-factory.js';
export * from './trace.js';
export * from './onnx-model.js';
export * from './onnx-value.js';
-export * from './training-session.js';
diff --git a/js/common/lib/inference-session.ts b/js/common/lib/inference-session.ts
index 547db029471a2..e62c6579e8333 100644
--- a/js/common/lib/inference-session.ts
+++ b/js/common/lib/inference-session.ts
@@ -4,6 +4,7 @@
import { InferenceSession as InferenceSessionImpl } from './inference-session-impl.js';
import { OnnxModelOptions } from './onnx-model.js';
import { OnnxValue, OnnxValueDataLocation } from './onnx-value.js';
+import { TryGetGlobalType } from './type-helper.js';
/* eslint-disable @typescript-eslint/no-redeclare */
@@ -282,7 +283,7 @@ export declare namespace InferenceSession {
extends WebNNExecutionProviderName,
Omit<WebNNContextOptions, 'deviceType'>,
Required<Pick<WebNNContextOptions, 'deviceType'>> {
- context: unknown /* MLContext */;
+ context: TryGetGlobalType<'MLContext'>;
}
/**
@@ -291,8 +292,8 @@ export declare namespace InferenceSession {
* @see https://www.w3.org/TR/webnn/#dom-ml-createcontext-gpudevice
*/
export interface WebNNOptionsWebGpu extends WebNNExecutionProviderName {
- context: unknown /* MLContext */;
- gpuDevice: unknown /* GPUDevice */;
+ context: TryGetGlobalType<'MLContext'>;
+ gpuDevice: TryGetGlobalType<'GPUDevice'>;
}
/**
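For context: a hedged sketch of passing a pre-created `MLContext` to the WebNN execution provider, which these typed options describe. Illustrative only, not part of the patch; the `navigator.ml` cast is an assumption for setups where the WebNN IDL is not declared globally, and the browser must actually support WebNN:

```ts
import * as ort from 'onnxruntime-web';

async function createWebNNSession(model: Uint8Array): Promise<ort.InferenceSession> {
  // Create an MLContext up front so it can be shared with other WebNN work.
  // Cast because the WebNN IDL may not be declared globally in every setup.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const context = await (navigator as any).ml.createContext({ deviceType: 'gpu' });

  return ort.InferenceSession.create(model, {
    executionProviders: [{ name: 'webnn', deviceType: 'gpu', context }],
  });
}
```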
diff --git a/js/common/lib/tensor.ts b/js/common/lib/tensor.ts
index af918705b97e3..05553bd96662b 100644
--- a/js/common/lib/tensor.ts
+++ b/js/common/lib/tensor.ts
@@ -4,6 +4,7 @@
import { TensorFactory } from './tensor-factory.js';
import { Tensor as TensorImpl } from './tensor-impl.js';
import { TypedTensorUtils } from './tensor-utils.js';
+import { TryGetGlobalType } from './type-helper.js';
/* eslint-disable @typescript-eslint/no-redeclare */
@@ -131,24 +132,19 @@ export declare namespace Tensor {
*/
export type TextureDataTypes = 'float32';
+ type GpuBufferTypeFallback = { size: number; mapState: 'unmapped' | 'pending' | 'mapped' };
/**
* type alias for WebGPU buffer
- *
- * The reason why we don't use type "GPUBuffer" defined in webgpu.d.ts from @webgpu/types is because "@webgpu/types"
- * requires "@types/dom-webcodecs" as peer dependency when using TypeScript < v5.1 and its version need to be chosen
- * carefully according to the TypeScript version being used. This means so far there is not a way to keep every
- * TypeScript version happy. It turns out that we will easily broke users on some TypeScript version.
- *
- * for more info see https://github.com/gpuweb/types/issues/127
*/
- export type GpuBufferType = { size: number; mapState: 'unmapped' | 'pending' | 'mapped' };
+ export type GpuBufferType = TryGetGlobalType<'GPUBuffer', GpuBufferTypeFallback>;
+ type MLTensorTypeFallback = { destroy(): void };
/**
* type alias for WebNN MLTensor
*
* The specification for WebNN's MLTensor is currently in flux.
*/
- export type MLTensorType = unknown;
+ export type MLTensorType = TryGetGlobalType<'MLTensor', MLTensorTypeFallback>;
/**
* supported data types for constructing a tensor from a WebGPU buffer
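For context: `Tensor.fromGpuBuffer` accepts a WebGPU buffer; with this change the buffer type resolves to the real `GPUBuffer` when "@webgpu/types" is installed, and to the small structural fallback above otherwise. A hedged sketch (the dims and dataType are arbitrary examples, and the `GPUBuffer` parameter type assumes "@webgpu/types" is available):

```ts
import * as ort from 'onnxruntime-web';

function wrapGpuBuffer(buffer: GPUBuffer): ort.Tensor {
  // The buffer is referenced in place; the caller stays responsible for its lifetime.
  return ort.Tensor.fromGpuBuffer(buffer, { dataType: 'float32', dims: [1, 3, 224, 224] });
}
```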
diff --git a/js/common/lib/training-session-impl.ts b/js/common/lib/training-session-impl.ts
deleted file mode 100644
index 21dbe5fe51bb9..0000000000000
--- a/js/common/lib/training-session-impl.ts
+++ /dev/null
@@ -1,273 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-import { resolveBackendAndExecutionProviders } from './backend-impl.js';
-import { SessionHandler, TrainingSessionHandler } from './backend.js';
-import { InferenceSession as InferenceSession } from './inference-session.js';
-import { OnnxValue } from './onnx-value.js';
-import { Tensor } from './tensor.js';
-import { TrainingSession as TrainingSessionInterface, TrainingSessionCreateOptions } from './training-session.js';
-
-type SessionOptions = InferenceSession.SessionOptions;
-type FeedsType = InferenceSession.FeedsType;
-type FetchesType = InferenceSession.FetchesType;
-type ReturnType = InferenceSession.ReturnType;
-type RunOptions = InferenceSession.RunOptions;
-
-const noBackendErrMsg: string =
- 'Training backend could not be resolved. ' + "Make sure you're using the correct configuration & WebAssembly files.";
-
-export class TrainingSession implements TrainingSessionInterface {
- private constructor(handler: TrainingSessionHandler, hasOptimizerModel: boolean, hasEvalModel: boolean) {
- this.handler = handler;
- this.hasOptimizerModel = hasOptimizerModel;
- this.hasEvalModel = hasEvalModel;
- }
- private handler: TrainingSessionHandler;
- private hasOptimizerModel: boolean;
- private hasEvalModel: boolean;
-
- get trainingInputNames(): readonly string[] {
- return this.handler.inputNames;
- }
- get trainingOutputNames(): readonly string[] {
- return this.handler.outputNames;
- }
-
- get evalInputNames(): readonly string[] {
- if (this.hasEvalModel) {
- return this.handler.evalInputNames;
- } else {
- throw new Error('This training session has no evalModel loaded.');
- }
- }
- get evalOutputNames(): readonly string[] {
- if (this.hasEvalModel) {
- return this.handler.evalOutputNames;
- } else {
- throw new Error('This training session has no evalModel loaded.');
- }
- }
-
- static async create(
- trainingOptions: TrainingSessionCreateOptions,
- sessionOptions?: SessionOptions,
- ): Promise<TrainingSession> {
- const evalModel: string | Uint8Array = trainingOptions.evalModel || '';
- const optimizerModel: string | Uint8Array = trainingOptions.optimizerModel || '';
- const options: SessionOptions = sessionOptions || {};
-
- // resolve backend, update session options with validated EPs, and create session handler
- const [backend, optionsWithValidatedEPs] = await resolveBackendAndExecutionProviders(options);
- if (backend.createTrainingSessionHandler) {
- const handler = await backend.createTrainingSessionHandler(
- trainingOptions.checkpointState,
- trainingOptions.trainModel,
- evalModel,
- optimizerModel,
- optionsWithValidatedEPs,
- );
- return new TrainingSession(handler, !!trainingOptions.optimizerModel, !!trainingOptions.evalModel);
- } else {
- throw new Error(noBackendErrMsg);
- }
- }
-
- /**
- * Helper function for runTrainStep and future runStep methods that handles the type-narrowing conversion from
- * the given parameters to SessionHandler.FetchesType and RunOptions.
- *
- * @param inputNames the feeds object is checked that they contain all input names in the provided list of input
- * names.
- * @param outputNames the fetches object is checked that their keys match up with valid names in the list of output
- * names.
- * @param feeds the required input
- * @param arg1 narrowed & converted into the SessionHandler.FetchesType or RunOptions object
- * @param arg2 optional RunOptions object.
- * @returns
- */
- typeNarrowingForRunStep(
- inputNames: readonly string[],
- outputNames: readonly string[],
- feeds: FeedsType,
- arg1?: FetchesType | RunOptions,
- arg2?: RunOptions,
- ): [SessionHandler.FetchesType, RunOptions] {
- const fetches: { [name: string]: OnnxValue | null } = {};
- let options: RunOptions = {};
- // check inputs
- if (typeof feeds !== 'object' || feeds === null || feeds instanceof Tensor || Array.isArray(feeds)) {
- throw new TypeError(
- "'feeds' must be an object that use input names as keys and OnnxValue as corresponding values.",
- );
- }
-
- let isFetchesEmpty = true;
- // determine which override is being used
- if (typeof arg1 === 'object') {
- if (arg1 === null) {
- throw new TypeError('Unexpected argument[1]: cannot be null.');
- }
- if (arg1 instanceof Tensor) {
- throw new TypeError("'fetches' cannot be a Tensor");
- }
-
- if (Array.isArray(arg1)) {
- if (arg1.length === 0) {
- throw new TypeError("'fetches' cannot be an empty array.");
- }
- isFetchesEmpty = false;
- // output names
- for (const name of arg1) {
- if (typeof name !== 'string') {
- throw new TypeError("'fetches' must be a string array or an object.");
- }
- if (outputNames.indexOf(name) === -1) {
- throw new RangeError(`'fetches' contains invalid output name: ${name}.`);
- }
- fetches[name] = null;
- }
-
- if (typeof arg2 === 'object' && arg2 !== null) {
- options = arg2;
- } else if (typeof arg2 !== 'undefined') {
- throw new TypeError("'options' must be an object.");
- }
- } else {
- // decide whether arg1 is fetches or options
- // if any output name is present and its value is valid OnnxValue, we consider it fetches
- let isFetches = false;
- const arg1Keys = Object.getOwnPropertyNames(arg1);
- for (const name of outputNames) {
- if (arg1Keys.indexOf(name) !== -1) {
- const v = (arg1 as InferenceSession.NullableOnnxValueMapType)[name];
- if (v === null || v instanceof Tensor) {
- isFetches = true;
- isFetchesEmpty = false;
- fetches[name] = v;
- }
- }
- }
-
- if (isFetches) {
- if (typeof arg2 === 'object' && arg2 !== null) {
- options = arg2;
- } else if (typeof arg2 !== 'undefined') {
- throw new TypeError("'options' must be an object.");
- }
- } else {
- options = arg1 as RunOptions;
- }
- }
- } else if (typeof arg1 !== 'undefined') {
- throw new TypeError("Unexpected argument[1]: must be 'fetches' or 'options'.");
- }
-
- // check if all inputs are in feed
- for (const name of inputNames) {
- if (typeof feeds[name] === 'undefined') {
- throw new Error(`input '${name}' is missing in 'feeds'.`);
- }
- }
-
- // if no fetches is specified, we use the full output names list
- if (isFetchesEmpty) {
- for (const name of outputNames) {
- fetches[name] = null;
- }
- }
-
- return [fetches, options];
- }
-
- /**
- * Helper method for runTrainStep and any other runStep methods. Takes the ReturnType result from the SessionHandler
- * and changes it into a map of Tensors.
- *
- * @param results
- * @returns
- */
- convertHandlerReturnTypeToMapOfTensors(results: SessionHandler.ReturnType): ReturnType {
- const returnValue: { [name: string]: OnnxValue } = {};
- for (const key in results) {
- if (Object.hasOwnProperty.call(results, key)) {
- const result = results[key];
- if (result instanceof Tensor) {
- returnValue[key] = result;
- } else {
- returnValue[key] = new Tensor(result.type, result.data, result.dims);
- }
- }
- }
- return returnValue;
- }
-
- async lazyResetGrad(): Promise<void> {
- await this.handler.lazyResetGrad();
- }
-
- runTrainStep(feeds: FeedsType, options?: RunOptions): Promise<ReturnType>;
- runTrainStep(feeds: FeedsType, fetches: FetchesType, options?: RunOptions): Promise<ReturnType>;
- async runTrainStep(feeds: FeedsType, arg1?: FetchesType | RunOptions, arg2?: RunOptions): Promise<ReturnType> {
- const [fetches, options] = this.typeNarrowingForRunStep(
- this.trainingInputNames,
- this.trainingOutputNames,
- feeds,
- arg1,
- arg2,
- );
- const results = await this.handler.runTrainStep(feeds, fetches, options);
- return this.convertHandlerReturnTypeToMapOfTensors(results);
- }
-
- async runOptimizerStep(options?: InferenceSession.RunOptions | undefined): Promise<void> {
- if (this.hasOptimizerModel) {
- await this.handler.runOptimizerStep(options || {});
- } else {
- throw new Error('This TrainingSession has no OptimizerModel loaded.');
- }
- }
-
- runEvalStep(feeds: FeedsType, options?: RunOptions | undefined): Promise<ReturnType>;
- runEvalStep(feeds: FeedsType, fetches: FetchesType, options?: RunOptions | undefined): Promise<ReturnType>;
- async runEvalStep(feeds: FeedsType, arg1?: FetchesType | RunOptions, arg2?: RunOptions): Promise<ReturnType> {
- if (this.hasEvalModel) {
- const [fetches, options] = this.typeNarrowingForRunStep(
- this.evalInputNames,
- this.evalOutputNames,
- feeds,
- arg1,
- arg2,
- );
- const results = await this.handler.runEvalStep(feeds, fetches, options);
- return this.convertHandlerReturnTypeToMapOfTensors(results);
- } else {
- throw new Error('This TrainingSession has no EvalModel loaded.');
- }
- }
-
- async getParametersSize(trainableOnly = true): Promise<number> {
- return this.handler.getParametersSize(trainableOnly);
- }
-
- async loadParametersBuffer(array: Uint8Array, trainableOnly = true): Promise<void> {
- const paramsSize = await this.getParametersSize(trainableOnly);
- // checking that the size of the Uint8Array is equivalent to the byte length of a Float32Array of the number
- // of parameters
- if (array.length !== 4 * paramsSize) {
- throw new Error(
- 'Size of the buffer passed into loadParametersBuffer must match the number of parameters in ' +
- 'the model. Please use getParametersSize method to check.',
- );
- }
- return this.handler.loadParametersBuffer(array, trainableOnly);
- }
-
- async getContiguousParameters(trainableOnly = true): Promise<OnnxValue> {
- return this.handler.getContiguousParameters(trainableOnly);
- }
-
- async release(): Promise<void> {
- return this.handler.dispose();
- }
-}
diff --git a/js/common/lib/training-session.ts b/js/common/lib/training-session.ts
deleted file mode 100644
index 45dcafc46deb5..0000000000000
--- a/js/common/lib/training-session.ts
+++ /dev/null
@@ -1,206 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-import { InferenceSession } from './inference-session.js';
-import { OnnxValue } from './onnx-value.js';
-import { TrainingSession as TrainingSessionImpl } from './training-session-impl.js';
-
-/* eslint-disable @typescript-eslint/no-redeclare */
-
-export declare namespace TrainingSession {
- /**
- * Either URI file path (string) or Uint8Array containing model or checkpoint information.
- */
- type UriOrBuffer = string | Uint8Array;
-}
-
-/**
- * Represent a runtime instance of an ONNX training session,
- * which contains a model that can be trained, and, optionally,
- * an eval and optimizer model.
- */
-export interface TrainingSession {
- // #region run()
-
- /**
- * Lazily resets the gradients of all trainable parameters to zero. Should happen after the invocation of
- * runOptimizerStep.
- */
- lazyResetGrad(): Promise<void>;
-
- /**
- * Run TrainStep asynchronously with the given feeds and options.
- *
- * @param feeds - Representation of the model input. See type description of `InferenceSession.InputType` for
- detail.
- * @param options - Optional. A set of options that controls the behavior of model training.
- * @returns A promise that resolves to a map, which uses output names as keys and OnnxValue as corresponding values.
- */
- runTrainStep(
- feeds: InferenceSession.FeedsType,
- options?: InferenceSession.RunOptions,
- ): Promise<InferenceSession.ReturnType>;
-
- /**
- * Run a single train step with the given inputs and options.
- *
- * @param feeds - Representation of the model input.
- * @param fetches - Representation of the model output.
- * detail.
- * @param options - Optional. A set of options that controls the behavior of model training.
- * @returns A promise that resolves to a map, which uses output names as keys and OnnxValue as corresponding
- values.
- */
- runTrainStep(
- feeds: InferenceSession.FeedsType,
- fetches: InferenceSession.FetchesType,
- options?: InferenceSession.RunOptions,
- ): Promise<InferenceSession.ReturnType>;
-
- /**
- * Runs a single optimizer step, which performs weight updates for the trainable parameters using the optimizer model.
- *
- * @param options - Optional. A set of options that controls the behavior of model optimizing.
- */
- runOptimizerStep(options?: InferenceSession.RunOptions): Promise<void>;
-
- /**
- * Run a single eval step with the given inputs and options using the eval model.
- *
- * @param feeds - Representation of the model input.
- * @param options - Optional. A set of options that controls the behavior of model eval step.
- * @returns A promise that resolves to a map, which uses output names as keys and OnnxValue as corresponding
- values.
- */
- runEvalStep(
- feeds: InferenceSession.FeedsType,
- options?: InferenceSession.RunOptions,
- ): Promise<InferenceSession.ReturnType>;
-
- /**
- * Run a single eval step with the given inputs and options using the eval model.
- *
- * @param feeds - Representation of the model input.
- * @param fetches - Representation of the model output.
- * detail.
- * @param options - Optional. A set of options that controls the behavior of model eval step.
- * @returns A promise that resolves to a map, which uses output names as keys and OnnxValue as corresponding
- values.
- */
- runEvalStep(
- feeds: InferenceSession.FeedsType,
- fetches: InferenceSession.FetchesType,
- options?: InferenceSession.RunOptions,
- ): Promise<InferenceSession.ReturnType>;
-
- // #endregion
-
- // #region copy parameters
-
- /**
- * Retrieves the size of all parameters for the training state. Calculates the total number of primitive (datatype of
- * the parameters) elements of all the parameters in the training state.
- *
- * @param trainableOnly - When set to true, the size is calculated for trainable params only. Default value is true.
- */
- getParametersSize(trainableOnly: boolean): Promise<number>;
-
- /**
- * Copies parameter values from the given buffer to the training state. Currently, only supporting models with
- * parameters of type Float32.
- *
- * @param buffer - A Uint8Array representation of Float32 parameters.
- * @param trainableOnly - True if trainable parameters only to be modified, false otherwise. Default value is true.
- */
- loadParametersBuffer(buffer: Uint8Array, trainableOnly: boolean): Promise<void>;
-
- /**
- * Copies the model parameters to a contiguous buffer. Usually used in the context of Federated Learning.
- * Currently, only supporting models with parameters of type Float32.
- *
- * @param trainableOnly - When set to true, only trainable parameters are copied. Trainable parameters are parameters
- * for which requires_grad is set to true. Default value is true.
- * @returns A promise that resolves to a Float32 OnnxValue of the requested parameters.
- */
- getContiguousParameters(trainableOnly: boolean): Promise<OnnxValue>;
- // #endregion
-
- // #region release()
-
- /**
- * Release the inference session and the underlying resources.
- */
- release(): Promise<void>;
- // #endregion
-
- // #region metadata
-
- /**
- * Get input names of the loaded training model.
- */
- readonly trainingInputNames: readonly string[];
-
- /**
- * Get output names of the loaded training model.
- */
- readonly trainingOutputNames: readonly string[];
-
- /**
- * Get input names of the loaded eval model. Is an empty array if no eval model is loaded.
- */
- readonly evalInputNames: readonly string[];
-
- /**
- * Get output names of the loaded eval model. Is an empty array if no eval model is loaded.
- */
- readonly evalOutputNames: readonly string[];
-
- // #endregion
-}
-
-/**
- * Represents the optional parameters that can be passed into the TrainingSessionFactory.
- */
-export interface TrainingSessionCreateOptions {
- /**
- * URI or buffer for a .ckpt file that contains the checkpoint for the training model.
- */
- checkpointState: TrainingSession.UriOrBuffer;
- /**
- * URI or buffer for the .onnx training file.
- */
- trainModel: TrainingSession.UriOrBuffer;
- /**
- * Optional. URI or buffer for the .onnx optimizer model file.
- */
- optimizerModel?: TrainingSession.UriOrBuffer;
- /**
- * Optional. URI or buffer for the .onnx eval model file.
- */
- evalModel?: TrainingSession.UriOrBuffer;
-}
-
-/**
- * Defines method overload possibilities for creating a TrainingSession.
- */
-export interface TrainingSessionFactory {
- // #region create()
-
- /**
- * Creates a new TrainingSession and asynchronously loads any models passed in through trainingOptions
- *
- * @param trainingOptions specify models and checkpoints to load into the Training Session
- * @param sessionOptions specify configuration for training session behavior
- *
- * @returns Promise that resolves to a TrainingSession object
- */
- create(
- trainingOptions: TrainingSessionCreateOptions,
- sessionOptions?: InferenceSession.SessionOptions,
- ): Promise<TrainingSession>;
-
- // #endregion
-}
-
-// eslint-disable-next-line @typescript-eslint/naming-convention
-export const TrainingSession: TrainingSessionFactory = TrainingSessionImpl;
diff --git a/js/common/lib/type-helper.ts b/js/common/lib/type-helper.ts
new file mode 100644
index 0000000000000..845ba3018d443
--- /dev/null
+++ b/js/common/lib/type-helper.ts
@@ -0,0 +1,31 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+/**
+ * A helper type to get certain types if they are declared in global scope.
+ *
+ * For example, if you installed "@webgpu/types" as a dev dependency, then `TryGetGlobalType<'GPUDevice'>` will
+ * be type `GPUDevice`; otherwise it will be type `unknown`.
+ *
+ *
+ * We don't want to introduce "@webgpu/types" as a dependency of this package because:
+ *
+ * (1) For JavaScript users, it's not needed. TypeScript users can install it themselves as a dev dependency.
+ *
+ * (2) "@webgpu/types" requires "@types/dom-webcodecs" as a peer dependency when using TypeScript < v5.1, and its
+ * version needs to be chosen carefully according to the TypeScript version in use. So far there is no way to keep
+ * every TypeScript version happy, and we could easily break users on some TypeScript versions.
+ *
+ * for more info see https://github.com/gpuweb/types/issues/127
+ *
+ * Update (2024-08-07): Reason (2) may no longer be valid. Most people should be using TypeScript >= 5.1 by now.
+ * However, we are still not sure whether introducing "@webgpu/types" as a direct dependency is a good idea. We find
+ * this type helper useful for TypeScript users.
+ *
+ * @ignore
+ */
+export type TryGetGlobalType<Name extends string, Fallback = unknown> = typeof globalThis extends {
+ [k in Name]: { prototype: infer T };
+}
+ ? T
+ : Fallback;
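A short illustration of how this conditional type resolves (not part of the patch; the aliases below are hypothetical and assume the helper is in scope). When the named constructor exists on `globalThis`, the helper infers its `prototype` type; otherwise it yields the fallback:

```ts
// With "@webgpu/types" installed, globalThis has `GPUDevice: { prototype: GPUDevice }`,
// so the lookup resolves to GPUDevice; without it, the default fallback `unknown` is used.
type DeviceType = TryGetGlobalType<'GPUDevice'>;

// WebNN's MLTensor is not in standard lib typings yet, so this currently resolves
// to the provided fallback shape.
type TensorHandle = TryGetGlobalType<'MLTensor', { destroy(): void }>;
```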
diff --git a/js/common/typedoc.json b/js/common/typedoc.json
index 088c7ba4053e6..f9c7e7b19db41 100644
--- a/js/common/typedoc.json
+++ b/js/common/typedoc.json
@@ -1,6 +1,7 @@
{
"entryPoints": ["lib/index.ts"],
"excludeInternal": true,
+ "intentionallyNotExported": ["TryGetGlobalType"],
"name": "ONNX Runtime JavaScript API",
"readme": "none",
"cleanOutputDir": true
diff --git a/js/node/package-lock.json b/js/node/package-lock.json
index 239c0b1ba557b..6d3c96e579a47 100644
--- a/js/node/package-lock.json
+++ b/js/node/package-lock.json
@@ -276,12 +276,12 @@
"dev": true
},
"node_modules/axios": {
- "version": "1.6.1",
- "resolved": "https://registry.npmjs.org/axios/-/axios-1.6.1.tgz",
- "integrity": "sha512-vfBmhDpKafglh0EldBEbVuoe7DyAavGSLWhuSm5ZSEKQnHhBf0xAAwybbNH1IkrJNGnS/VG4I5yxig1pCEXE4g==",
+ "version": "1.7.9",
+ "resolved": "https://registry.npmjs.org/axios/-/axios-1.7.9.tgz",
+ "integrity": "sha512-LhLcE7Hbiryz8oMDdDptSrWowmB4Bl6RCt6sIJKpRB4XtVf0iEgewX3au/pJqm+Py1kCASkb/FFKjxQaLtxJvw==",
"dev": true,
"dependencies": {
- "follow-redirects": "^1.15.0",
+ "follow-redirects": "^1.15.6",
"form-data": "^4.0.0",
"proxy-from-env": "^1.1.0"
}
@@ -455,9 +455,9 @@
"dev": true
},
"node_modules/cross-spawn": {
- "version": "7.0.3",
- "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz",
- "integrity": "sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==",
+ "version": "7.0.6",
+ "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz",
+ "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==",
"dependencies": {
"path-key": "^3.1.0",
"shebang-command": "^2.0.0",
@@ -1581,12 +1581,12 @@
"dev": true
},
"axios": {
- "version": "1.6.1",
- "resolved": "https://registry.npmjs.org/axios/-/axios-1.6.1.tgz",
- "integrity": "sha512-vfBmhDpKafglh0EldBEbVuoe7DyAavGSLWhuSm5ZSEKQnHhBf0xAAwybbNH1IkrJNGnS/VG4I5yxig1pCEXE4g==",
+ "version": "1.7.9",
+ "resolved": "https://registry.npmjs.org/axios/-/axios-1.7.9.tgz",
+ "integrity": "sha512-LhLcE7Hbiryz8oMDdDptSrWowmB4Bl6RCt6sIJKpRB4XtVf0iEgewX3au/pJqm+Py1kCASkb/FFKjxQaLtxJvw==",
"dev": true,
"requires": {
- "follow-redirects": "^1.15.0",
+ "follow-redirects": "^1.15.6",
"form-data": "^4.0.0",
"proxy-from-env": "^1.1.0"
}
@@ -1725,9 +1725,9 @@
"dev": true
},
"cross-spawn": {
- "version": "7.0.3",
- "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz",
- "integrity": "sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==",
+ "version": "7.0.6",
+ "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz",
+ "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==",
"requires": {
"path-key": "^3.1.0",
"shebang-command": "^2.0.0",
diff --git a/js/node/script/install.js b/js/node/script/install.js
index b15bc03840599..fef93f9169a2c 100644
--- a/js/node/script/install.js
+++ b/js/node/script/install.js
@@ -21,6 +21,7 @@ const os = require('os');
const fs = require('fs');
const path = require('path');
const tar = require('tar');
+const { execFileSync } = require('child_process');
const { Readable } = require('stream');
// commandline flag:
@@ -58,10 +59,23 @@ if (NO_INSTALL || !shouldInstall) {
// Step.2: Download the required binaries
const artifactUrl = {
- 11: `https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/onnxruntime-linux-x64-gpu-${
- ORT_VERSION
- }.tgz`,
- 12: `https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/onnxruntime-linux-x64-gpu-cuda12-${
+ get 11() {
+ // TODO: support ORT Cuda v11 binaries
+ throw new Error(`CUDA 11 binaries are not supported by this script yet.
+
+To use ONNX Runtime Node.js binding with CUDA v11 support, please follow the manual steps:
+
+1. Use "--onnxruntime-node-install-cuda=skip" to skip the auto installation.
+2. Navigate to https://aiinfra.visualstudio.com/PublicPackages/_artifacts/feed/onnxruntime-cuda-11
+3. Download the binaries for your platform and architecture
+4. Extract the following binaries to "node_modules/onnxruntime-node/bin/napi-v3/linux/x64":
+ - libonnxruntime_providers_tensorrt.so
+ - libonnxruntime_providers_shared.so
+ - libonnxruntime.so.${ORT_VERSION}
+ - libonnxruntime_providers_cuda.so
+`);
+ },
+ 12: `https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/onnxruntime-linux-x64-gpu-${
ORT_VERSION
}.tgz`,
}[INSTALL_CUDA_FLAG || tryGetCudaVersion()];
@@ -108,9 +122,27 @@ Use "--onnxruntime-node-install-cuda=skip" to skip the installation. You will st
function tryGetCudaVersion() {
// Should only return 11 or 12.
- // TODO: try to get the CUDA version from the system ( `nvcc --version` )
+ // try to get the CUDA version from the system ( `nvcc --version` )
+ let ver = 12;
+ try {
+ const nvccVersion = execFileSync('nvcc', ['--version'], { encoding: 'utf8' });
+ const match = nvccVersion.match(/release (\d+)/);
+ if (match) {
+ ver = parseInt(match[1]);
+ if (ver !== 11 && ver !== 12) {
+ throw new Error(`Unsupported CUDA version: ${ver}`);
+ }
+ }
+ } catch (e) {
+ if (e?.code === 'ENOENT') {
+ console.warn('`nvcc` not found. Assuming CUDA 12.');
+ } else {
+ console.warn('Failed to detect CUDA version from `nvcc --version`:', e.message);
+ }
+ }
- return 11;
+ // assume CUDA 12 if detection failed
+ return ver;
}
function parseInstallCudaFlag() {
diff --git a/js/node/tsconfig.json b/js/node/tsconfig.json
index c154c3e148ed0..0401fb9609ad6 100644
--- a/js/node/tsconfig.json
+++ b/js/node/tsconfig.json
@@ -1,7 +1,8 @@
{
"extends": "../tsconfig.json",
"compilerOptions": {
- "outDir": "dist"
+ "outDir": "dist",
+ "declaration": true
},
"include": ["lib"]
}
diff --git a/js/package-lock.json b/js/package-lock.json
index 594d0584ad80e..f4401c6e98c75 100644
--- a/js/package-lock.json
+++ b/js/package-lock.json
@@ -1573,9 +1573,9 @@
"dev": true
},
"node_modules/cross-spawn": {
- "version": "7.0.3",
- "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz",
- "integrity": "sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==",
+ "version": "7.0.6",
+ "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz",
+ "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==",
"dev": true,
"dependencies": {
"path-key": "^3.1.0",
@@ -5922,9 +5922,9 @@
"dev": true
},
"cross-spawn": {
- "version": "7.0.3",
- "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz",
- "integrity": "sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==",
+ "version": "7.0.6",
+ "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz",
+ "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==",
"dev": true,
"requires": {
"path-key": "^3.1.0",
diff --git a/js/react_native/android/build.gradle b/js/react_native/android/build.gradle
index 825990eba0fb8..521866ff0f3e2 100644
--- a/js/react_native/android/build.gradle
+++ b/js/react_native/android/build.gradle
@@ -7,7 +7,7 @@ buildscript {
}
dependencies {
- classpath 'com.android.tools.build:gradle:4.1.2'
+ classpath 'com.android.tools.build:gradle:7.4.2'
// noinspection DifferentKotlinGradleVersion
}
}
@@ -221,9 +221,8 @@ dependencies {
api "com.facebook.react:react-native:" + REACT_NATIVE_VERSION
api "org.mockito:mockito-core:2.28.2"
- androidTestImplementation "androidx.test:runner:1.1.0"
- androidTestImplementation "androidx.test:rules:1.1.0"
-
+ androidTestImplementation "androidx.test:runner:1.5.2"
+ androidTestImplementation "androidx.test:rules:1.5.0"
implementation "junit:junit:4.12"
androidTestImplementation "com.linkedin.dexmaker:dexmaker-mockito-inline-extended:2.28.1"
diff --git a/js/react_native/android/gradle.properties b/js/react_native/android/gradle.properties
index 465b04d1f5813..8fe6e40d76911 100644
--- a/js/react_native/android/gradle.properties
+++ b/js/react_native/android/gradle.properties
@@ -4,7 +4,7 @@
# Specifies the JVM arguments used for the daemon process.
# The setting is particularly useful for tweaking memory settings.
# Default value: -Xmx1024m -XX:MaxPermSize=256m
-# org.gradle.jvmargs=-Xmx2048m -XX:MaxPermSize=512m -XX:+HeapDumpOnOutOfMemoryError -Dfile.encoding=UTF-8
+org.gradle.jvmargs=-Xmx4096m -XX:+HeapDumpOnOutOfMemoryError -Dfile.encoding=UTF-8
#
# When configured, Gradle will run in incubating parallel mode.
# This option should only be used with decoupled projects. More details, visit
diff --git a/js/react_native/android/gradle/wrapper/gradle-wrapper.jar b/js/react_native/android/gradle/wrapper/gradle-wrapper.jar
index 62d4c053550b9..249e5832f090a 100644
Binary files a/js/react_native/android/gradle/wrapper/gradle-wrapper.jar and b/js/react_native/android/gradle/wrapper/gradle-wrapper.jar differ
diff --git a/js/react_native/android/gradle/wrapper/gradle-wrapper.properties b/js/react_native/android/gradle/wrapper/gradle-wrapper.properties
index 51d930a381f3a..012d6d90445b4 100644
--- a/js/react_native/android/gradle/wrapper/gradle-wrapper.properties
+++ b/js/react_native/android/gradle/wrapper/gradle-wrapper.properties
@@ -1,6 +1,6 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
-distributionSha256Sum=7faa7198769f872826c8ef4f1450f839ec27f0b4d5d1e51bade63667cbccd205
-distributionUrl=https\://services.gradle.org/distributions/gradle-6.8.3-bin.zip
+distributionSha256Sum=cb87f222c5585bd46838ad4db78463a5c5f3d336e5e2b98dc7c0c586527351c2
+distributionUrl=https\://services.gradle.org/distributions/gradle-7.5-bin.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
diff --git a/js/react_native/android/gradlew b/js/react_native/android/gradlew
index fbd7c515832da..a69d9cb6c2065 100755
--- a/js/react_native/android/gradlew
+++ b/js/react_native/android/gradlew
@@ -1,7 +1,7 @@
-#!/usr/bin/env sh
+#!/bin/sh
#
-# Copyright 2015 the original author or authors.
+# Copyright © 2015-2021 the original authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -17,67 +17,101 @@
#
##############################################################################
-##
-## Gradle start up script for UN*X
-##
+#
+# Gradle start up script for POSIX generated by Gradle.
+#
+# Important for running:
+#
+# (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is
+# noncompliant, but you have some other compliant shell such as ksh or
+# bash, then to run this script, type that shell name before the whole
+# command line, like:
+#
+# ksh Gradle
+#
+# Busybox and similar reduced shells will NOT work, because this script
+# requires all of these POSIX shell features:
+# * functions;
+# * expansions «$var», «${var}», «${var:-default}», «${var+SET}»,
+# «${var#prefix}», «${var%suffix}», and «$( cmd )»;
+# * compound commands having a testable exit status, especially «case»;
+# * various built-in commands including «command», «set», and «ulimit».
+#
+# Important for patching:
+#
+# (2) This script targets any POSIX shell, so it avoids extensions provided
+# by Bash, Ksh, etc; in particular arrays are avoided.
+#
+# The "traditional" practice of packing multiple parameters into a
+# space-separated string is a well documented source of bugs and security
+# problems, so this is (mostly) avoided, by progressively accumulating
+# options in "$@", and eventually passing that to Java.
+#
+# Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS,
+# and GRADLE_OPTS) rely on word-splitting, this is performed explicitly;
+# see the in-line comments for details.
+#
+# There are tweaks for specific operating systems such as AIX, CygWin,
+# Darwin, MinGW, and NonStop.
+#
+# (3) This script is generated from the Groovy template
+# https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
+# within the Gradle project.
+#
+# You can find Gradle at https://github.com/gradle/gradle/.
+#
##############################################################################
# Attempt to set APP_HOME
+
# Resolve links: $0 may be a link
-PRG="$0"
-# Need this for relative symlinks.
-while [ -h "$PRG" ] ; do
- ls=`ls -ld "$PRG"`
- link=`expr "$ls" : '.*-> \(.*\)$'`
- if expr "$link" : '/.*' > /dev/null; then
- PRG="$link"
- else
- PRG=`dirname "$PRG"`"/$link"
- fi
+app_path=$0
+
+# Need this for daisy-chained symlinks.
+while
+ APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path
+ [ -h "$app_path" ]
+do
+ ls=$( ls -ld "$app_path" )
+ link=${ls#*' -> '}
+ case $link in #(
+ /*) app_path=$link ;; #(
+ *) app_path=$APP_HOME$link ;;
+ esac
done
-SAVED="`pwd`"
-cd "`dirname \"$PRG\"`/" >/dev/null
-APP_HOME="`pwd -P`"
-cd "$SAVED" >/dev/null
+
+APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit
APP_NAME="Gradle"
-APP_BASE_NAME=`basename "$0"`
+APP_BASE_NAME=${0##*/}
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
# Use the maximum available, or set MAX_FD != -1 to use that value.
-MAX_FD="maximum"
+MAX_FD=maximum
warn () {
echo "$*"
-}
+} >&2
die () {
echo
echo "$*"
echo
exit 1
-}
+} >&2
# OS specific support (must be 'true' or 'false').
cygwin=false
msys=false
darwin=false
nonstop=false
-case "`uname`" in
- CYGWIN* )
- cygwin=true
- ;;
- Darwin* )
- darwin=true
- ;;
- MINGW* )
- msys=true
- ;;
- NONSTOP* )
- nonstop=true
- ;;
+case "$( uname )" in #(
+ CYGWIN* ) cygwin=true ;; #(
+ Darwin* ) darwin=true ;; #(
+ MSYS* | MINGW* ) msys=true ;; #(
+ NONSTOP* ) nonstop=true ;;
esac
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
@@ -87,9 +121,9 @@ CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
if [ -n "$JAVA_HOME" ] ; then
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
# IBM's JDK on AIX uses strange locations for the executables
- JAVACMD="$JAVA_HOME/jre/sh/java"
+ JAVACMD=$JAVA_HOME/jre/sh/java
else
- JAVACMD="$JAVA_HOME/bin/java"
+ JAVACMD=$JAVA_HOME/bin/java
fi
if [ ! -x "$JAVACMD" ] ; then
die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
@@ -98,7 +132,7 @@ Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
else
- JAVACMD="java"
+ JAVACMD=java
which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
Please set the JAVA_HOME variable in your environment to match the
@@ -106,80 +140,101 @@ location of your Java installation."
fi
# Increase the maximum file descriptors if we can.
-if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
- MAX_FD_LIMIT=`ulimit -H -n`
- if [ $? -eq 0 ] ; then
- if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
- MAX_FD="$MAX_FD_LIMIT"
- fi
- ulimit -n $MAX_FD
- if [ $? -ne 0 ] ; then
- warn "Could not set maximum file descriptor limit: $MAX_FD"
- fi
- else
- warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
- fi
+if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then
+ case $MAX_FD in #(
+ max*)
+ MAX_FD=$( ulimit -H -n ) ||
+ warn "Could not query maximum file descriptor limit"
+ esac
+ case $MAX_FD in #(
+ '' | soft) :;; #(
+ *)
+ ulimit -n "$MAX_FD" ||
+ warn "Could not set maximum file descriptor limit to $MAX_FD"
+ esac
fi
-# For Darwin, add options to specify how the application appears in the dock
-if $darwin; then
- GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
-fi
+# Collect all arguments for the java command, stacking in reverse order:
+# * args from the command line
+# * the main class name
+# * -classpath
+# * -D...appname settings
+# * --module-path (only if needed)
+# * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables.
# For Cygwin or MSYS, switch paths to Windows format before running java
-if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
- APP_HOME=`cygpath --path --mixed "$APP_HOME"`
- CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
-
- JAVACMD=`cygpath --unix "$JAVACMD"`
-
- # We build the pattern for arguments to be converted via cygpath
- ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
- SEP=""
- for dir in $ROOTDIRSRAW ; do
- ROOTDIRS="$ROOTDIRS$SEP$dir"
- SEP="|"
- done
- OURCYGPATTERN="(^($ROOTDIRS))"
- # Add a user-defined pattern to the cygpath arguments
- if [ "$GRADLE_CYGPATTERN" != "" ] ; then
- OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
- fi
+if "$cygwin" || "$msys" ; then
+ APP_HOME=$( cygpath --path --mixed "$APP_HOME" )
+ CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" )
+
+ JAVACMD=$( cygpath --unix "$JAVACMD" )
+
# Now convert the arguments - kludge to limit ourselves to /bin/sh
- i=0
- for arg in "$@" ; do
- CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
- CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
-
- if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
- eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
- else
- eval `echo args$i`="\"$arg\""
+ for arg do
+ if
+ case $arg in #(
+ -*) false ;; # don't mess with options #(
+ /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath
+ [ -e "$t" ] ;; #(
+ *) false ;;
+ esac
+ then
+ arg=$( cygpath --path --ignore --mixed "$arg" )
fi
- i=`expr $i + 1`
+ # Roll the args list around exactly as many times as the number of
+ # args, so each arg winds up back in the position where it started, but
+ # possibly modified.
+ #
+ # NB: a `for` loop captures its iteration list before it begins, so
+ # changing the positional parameters here affects neither the number of
+ # iterations, nor the values presented in `arg`.
+ shift # remove old arg
+ set -- "$@" "$arg" # push replacement arg
done
- case $i in
- 0) set -- ;;
- 1) set -- "$args0" ;;
- 2) set -- "$args0" "$args1" ;;
- 3) set -- "$args0" "$args1" "$args2" ;;
- 4) set -- "$args0" "$args1" "$args2" "$args3" ;;
- 5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
- 6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
- 7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
- 8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
- 9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
- esac
fi
-# Escape application args
-save () {
- for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
- echo " "
-}
-APP_ARGS=`save "$@"`
+# Collect all arguments for the java command;
+# * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of
+# shell script including quotes and variable substitutions, so put them in
+# double quotes to make sure that they get re-expanded; and
+# * put everything else in single quotes, so that it's not re-expanded.
+
+set -- \
+ "-Dorg.gradle.appname=$APP_BASE_NAME" \
+ -classpath "$CLASSPATH" \
+ org.gradle.wrapper.GradleWrapperMain \
+ "$@"
+
+# Stop when "xargs" is not available.
+if ! command -v xargs >/dev/null 2>&1
+then
+ die "xargs is not available"
+fi
+
+# Use "xargs" to parse quoted args.
+#
+# With -n1 it outputs one arg per line, with the quotes and backslashes removed.
+#
+# In Bash we could simply go:
+#
+# readarray ARGS < <( xargs -n1 <<<"$var" ) &&
+# set -- "${ARGS[@]}" "$@"
+#
+# but POSIX shell has neither arrays nor command substitution, so instead we
+# post-process each arg (as a line of input to sed) to backslash-escape any
+# character that might be a shell metacharacter, then use eval to reverse
+# that process (while maintaining the separation between arguments), and wrap
+# the whole thing up as a single "set" statement.
+#
+# This will of course break if any of these variables contains a newline or
+# an unmatched quote.
+#
-# Collect all arguments for the java command, following the shell quoting and substitution rules
-eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
+eval "set -- $(
+ printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" |
+ xargs -n1 |
+ sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' |
+ tr '\n' ' '
+ )" '"$@"'
exec "$JAVACMD" "$@"
diff --git a/js/react_native/android/gradlew.bat b/js/react_native/android/gradlew.bat
index 5093609d512a9..f127cfd49d402 100644
--- a/js/react_native/android/gradlew.bat
+++ b/js/react_native/android/gradlew.bat
@@ -14,7 +14,7 @@
@rem limitations under the License.
@rem
-@if "%DEBUG%" == "" @echo off
+@if "%DEBUG%"=="" @echo off
@rem ##########################################################################
@rem
@rem Gradle startup script for Windows
@@ -25,7 +25,7 @@
if "%OS%"=="Windows_NT" setlocal
set DIRNAME=%~dp0
-if "%DIRNAME%" == "" set DIRNAME=.
+if "%DIRNAME%"=="" set DIRNAME=.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%
@@ -40,7 +40,7 @@ if defined JAVA_HOME goto findJavaFromJavaHome
set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
-if "%ERRORLEVEL%" == "0" goto init
+if %ERRORLEVEL% equ 0 goto execute
echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
@@ -54,7 +54,7 @@ goto fail
set JAVA_HOME=%JAVA_HOME:"=%
set JAVA_EXE=%JAVA_HOME%/bin/java.exe
-if exist "%JAVA_EXE%" goto init
+if exist "%JAVA_EXE%" goto execute
echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
@@ -64,21 +64,6 @@ echo location of your Java installation.
goto fail
-:init
-@rem Get command-line arguments, handling Windows variants
-
-if not "%OS%" == "Windows_NT" goto win9xME_args
-
-:win9xME_args
-@rem Slurp the command line arguments.
-set CMD_LINE_ARGS=
-set _SKIP=2
-
-:win9xME_args_slurp
-if "x%~1" == "x" goto execute
-
-set CMD_LINE_ARGS=%*
-
:execute
@rem Setup the command line
@@ -86,17 +71,19 @@ set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
@rem Execute Gradle
-"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
+"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
:end
@rem End local scope for the variables with windows NT shell
-if "%ERRORLEVEL%"=="0" goto mainEnd
+if %ERRORLEVEL% equ 0 goto mainEnd
:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
-if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
-exit /b 1
+set EXIT_CODE=%ERRORLEVEL%
+if %EXIT_CODE% equ 0 set EXIT_CODE=1
+if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE%
+exit /b %EXIT_CODE%
:mainEnd
if "%OS%"=="Windows_NT" endlocal
diff --git a/js/react_native/e2e/android/app/build.gradle b/js/react_native/e2e/android/app/build.gradle
index 8a84b0d5065a8..526259e3f8d8f 100644
--- a/js/react_native/e2e/android/app/build.gradle
+++ b/js/react_native/e2e/android/app/build.gradle
@@ -193,7 +193,7 @@ dependencies {
implementation "com.facebook.react:react-native:+" // From node_modules
implementation "androidx.swiperefreshlayout:swiperefreshlayout:1.0.0"
- implementation 'androidx.test.ext:junit:1.1.3'
+ implementation 'androidx.test.ext:junit:1.1.5'
debugImplementation("com.facebook.flipper:flipper:${FLIPPER_VERSION}") {
exclude group:'com.facebook.fbjni'
}
@@ -213,9 +213,9 @@ dependencies {
implementation jscFlavor
}
- androidTestImplementation 'androidx.test.espresso:espresso-core:3.4.0'
- androidTestImplementation 'androidx.test:runner:1.4.0'
- androidTestImplementation 'androidx.test:rules:1.4.0'
+ androidTestImplementation "androidx.test.espresso:espresso-core:3.5.0"
+ androidTestImplementation "androidx.test:runner:1.5.2"
+ androidTestImplementation "androidx.test:rules:1.5.0"
implementation project(':onnxruntime-react-native')
// specify ORT dependency here so it can be found in libs flatDir repository
diff --git a/js/web/docs/webgpu-operators.md b/js/web/docs/webgpu-operators.md
index 5f329b5ff8b39..5c8748d75c2bc 100644
--- a/js/web/docs/webgpu-operators.md
+++ b/js/web/docs/webgpu-operators.md
@@ -50,12 +50,14 @@ Do not modify directly.*
| Gather | ai.onnx(1-10,11-12,13+) | |
| GatherBlockQuantized | com.microsoft(1+) | |
| GatherElements | ai.onnx(11-12,13+) | |
+| GatherND | ai.onnx(11,12,13+) | |
| Gelu | ai.onnx(20+); com.microsoft(1+) | |
| Gemm | ai.onnx(7-8,9-10,11-12,13+) | |
| GlobalAveragePool | ai.onnx(1+); com.ms.internal.nhwc(1+) | |
| GlobalMaxPool | ai.onnx(1+); com.ms.internal.nhwc(1+) | |
| Greater | ai.onnx(7-8,9-12,13+) | |
| GreaterOrEqual | ai.onnx(12-15,16+) | |
+| GridSample | ai.onnx(16-19); com.ms.internal.nhwc(16-19) | |
| GroupQueryAttention | com.microsoft(1+) | |
| HardSigmoid | ai.onnx(6+) | |
| If | ai.onnx(1-10,11-12,13-18,19-20,21+) | |
@@ -93,6 +95,7 @@ Do not modify directly.*
| Reshape | ai.onnx(5-12,13,14-18,19-20,21+) | no GPU kernel |
| Resize | ai.onnx(10,11-12,13-17,18,19+); com.ms.internal.nhwc(10,11-12,13-17,18,19+) | CoordinateTransformMode align_corners is not supported with downsampling |
| RotaryEmbedding | com.microsoft(1+) | |
+| ScatterND | ai.onnx(11-12,13-15,16-17,18+) | |
| Shape | ai.onnx(1-12,13-14,15-18,19-20,21+) | no GPU kernel; an ORT warning is generated - need to fix |
| Sigmoid | ai.onnx(6-12,13+) | |
| SimplifiedLayerNormalization | ai.onnx(1+) | |
diff --git a/js/web/docs/webnn-operators.md b/js/web/docs/webnn-operators.md
index b8c3b2ec8ec57..e0012e70a7dec 100644
--- a/js/web/docs/webnn-operators.md
+++ b/js/web/docs/webnn-operators.md
@@ -25,10 +25,11 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim
| Conv | ai.onnx(7-10, 11+) | conv2d | ✓ | ✓ | Only supports 3-D or 4-D input and 'W' (weight) |
| ConvTranspose | ai.onnx(7-10, 11+) | convTranspose2d | ✓ | ✓ | Only supports 3-D or 4-D input and 'W' (weight). WebNN CPU backend only supports default dilations and group |
| Cos | ai.onnx(7+) | cos | ✓ | ✓ | |
-| CumSum | ai.onnx(11-13, 14+) | cumulativeSum | ✓ | ✓ | |
+| CumSum | ai.onnx(11-13, 14+) | cumulativeSum | ✓ | ✓ | 'axis' input should be a constant |
| Div | ai.onnx(7-12, 13, 14+) | div | ✓ | ✓ | |
| DequantizeLinear | ai.onnx(10-12, 13-18, 19-20, 21-22, 23+) | dequantizeLinear | ✗ | ✓ | |
| Dropout | ai.onnx(7-9, 10-11, 12, 13-21, 22+) | identity | ✓ | ✓ | Only supports test mode |
+| Einsum | ai.onnx(12+) | reshape, transpose, matmul, reduceSum, mul, triangular | ✓ | ✓ | |
| Elu | ai.onnx(7+) | elu | ✓ | ✓ | WebNN CPU backend only supports 'alpha' value is 1.0 |
| Equal | ai.onnx(7-10, 11-12, 13-18, 19+) | equal | ✓ | ✓ | |
| Erf | ai.onnx(7-9, 10-12, 13+) | erf | ✓ | ✓ | |
@@ -57,6 +58,7 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim
| LessOrEqual | ai.onnx(12-15, 16+) | lesserOrEqual | ✓ | ✓ | |
| Log | ai.onnx(7-12, 13+) | log | ✓ | ✓ | |
| LpPool | ai.onnx(7-10, 11-17, 18+) | l2Pool2d | ✗ | ✓ | Only supports 4-D input, 2-D 'kernel_shape', 'p' value is 2 |
+| LRN | ai.onnx(7-12, 13+) | pad, averagePool2d, transpose, add, mul, pow, div | ✓ | ✓ | |
| LSTM | ai.onnx(7-13, 14-21, 22+) | lstm | ✓ | ✓ | Only supports 'layout' == 0, 'input_forget' == 0. 'clip' is not supported. The activation functions in 'activations' must be one of 'Relu', 'Tanh', 'Sigmoid'. Forward and backward activations must be the same if bidirectional. 'sequence_lens' if present should be constant with values equal to the first dimension length of input 'X' |
| MatMul | ai.onnx(7-8, 9-12, 13+) | matmul | ✓ | ✓ | |
| Max | ai.onnx(7, 8-11, 12, 13+) | max | ✓ | ✓ | |
@@ -83,7 +85,7 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim
| ReduceSumSquare | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceSumSquare | ✓ | ✓ | Input 'axes' if present should be a constant |
| Relu | ai.onnx(7-12, 13, 14+) | relu | ✓ | ✓ | |
| Reshape | ai.onnx(7-12, 13, 14-18, 19-20, 21+) | reshape | ✓ | ✓ | Input 'shape' should be a constant, 0 dimension value in 'shape' is not supported |
-| Resize | ai.onnx(11-12, 13-17, 18, 19+) | resample2d | ✓ | ✓ | Only supports 4-D input, antialias == 0, coordinate_transformation_mode == 'half_pixel', exclude_outside == 0, keep_aspect_ratio_policy == 'stretch', 'linear' and 'nearest' modes, input 'scales' and 'sizes' if present must be a constant |
+| Resize | ai.onnx(11-12, 13-17, 18, 19+) | resample2d | ✓ | ✓ | Only supports 4-D input, antialias == 0, exclude_outside == 0, keep_aspect_ratio_policy == 'stretch', 'linear' and 'nearest' modes, input 'scales' and 'sizes' if present must be a constant |
| ScatterElements | ai.onnx(11-12, 13-15, 16-17, 18+) | scatterElements | ✗ | ✓ | Only supports 'reduction' == 'none' |
| ScatterND | ai.onnx(11-12, 13-15, 16-17, 18+) | scatterND | ✗ | ✓ | Only supports 'reduction' == 'none' |
| Shape | ai.onnx(7-12, 13-14, 15-18, 19-20, 21+) | slice | ✓ | ✓ | |
@@ -93,7 +95,7 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim
| Softplus | ai.onnx(7+) | softplus | ✓ | ✓ | |
| Softsign | ai.onnx(7+) | softsign | ✓ | ✓ | |
| Sin | ai.onnx(7+) | sin | ✓ | ✓ | |
-| Slice | ai.onnx(7-9, 10, 11-12, 13+) | slice | ✓ | ✓ | Input 'starts', 'ends', 'axes', and 'steps' if present must be a constant, only supports 'steps' value 1 |
+| Slice | ai.onnx(7-9, 10, 11-12, 13+) | slice, reverse | ✓ | ✓ | Input 'starts', 'ends', 'axes', and 'steps' if present must be a constant |
| Softmax | ai.onnx(7-10, 11-12, 13+) | softmax | ✓ | ✓ | |
| Split | ai.onnx(7-10, 11-12, 13-17, 18+) | split | ✓ | ✓ | Input 'split' if present should be a constant |
| Sqrt | ai.onnx(7-12, 13+) | sqrt | ✓ | ✓ | |
diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts
index 371e19e00d95e..7c5f99f1a4c67 100644
--- a/js/web/lib/wasm/jsep/backend-webgpu.ts
+++ b/js/web/lib/wasm/jsep/backend-webgpu.ts
@@ -13,6 +13,7 @@ import { ProgramManager } from './webgpu/program-manager';
import {
AdapterInfo,
ComputeContext,
+ DeviceInfo,
GpuArchitecture,
GpuData,
GpuVendor,
@@ -134,6 +135,26 @@ class AdapterInfoImpl implements AdapterInfo {
}
}
+class DeviceInfoImpl implements DeviceInfo {
+ readonly subgroupsSupported: boolean;
+ readonly subgroupsF16Supported: boolean;
+ readonly subgroupSizeRange?: readonly [number, number];
+
+ constructor(device: GPUDevice) {
+ this.subgroupsSupported = device.features.has('subgroups' as GPUFeatureName);
+ this.subgroupsF16Supported = device.features.has('subgroups' as GPUFeatureName);
+ // Currently the subgroups feature is still experimental and its size attributes are not in the WebGPU IDL, so we
+ // have to work around the IDL type checks.
+ // TODO: clean this after subgroups feature is settled in IDL.
+ const deviceSubgroupsLimits = device.limits as { minSubgroupSize?: number; maxSubgroupSize?: number };
+ if (!this.subgroupsSupported || !deviceSubgroupsLimits.minSubgroupSize || !deviceSubgroupsLimits.maxSubgroupSize) {
+ this.subgroupSizeRange = undefined;
+ } else {
+ this.subgroupSizeRange = [deviceSubgroupsLimits.minSubgroupSize, deviceSubgroupsLimits.maxSubgroupSize];
+ }
+ }
+}
+
/**
* this class is designed to store status and being used as a singleton for JSEP. It will be passed to jsepInit() as
* the first parameter so that it is stored for future use.
@@ -141,6 +162,7 @@ class AdapterInfoImpl implements AdapterInfo {
export class WebGpuBackend {
adapterInfo: AdapterInfoImpl;
device: GPUDevice;
+ deviceInfo: DeviceInfoImpl;
/**
* an instance of GpuDataManager to manage a GpuDataId -> GpuBuffer mapping
*/
@@ -243,19 +265,25 @@ export class WebGpuBackend {
requiredFeatures,
};
- if (adapter.features.has('chromium-experimental-timestamp-query-inside-passes')) {
- requiredFeatures.push('chromium-experimental-timestamp-query-inside-passes' as GPUFeatureName);
- } else if (adapter.features.has('timestamp-query')) {
- requiredFeatures.push('timestamp-query');
+ // Try requiring WebGPU features
+ const requireFeatureIfAvailable = (feature: GPUFeatureName) =>
+ adapter.features.has(feature) && requiredFeatures.push(feature) && true;
+ // Try chromium-experimental-timestamp-query-inside-passes and fallback to timestamp-query
+ if (!requireFeatureIfAvailable('chromium-experimental-timestamp-query-inside-passes' as GPUFeatureName)) {
+ requireFeatureIfAvailable('timestamp-query');
}
- if (adapter.features.has('shader-f16')) {
- requiredFeatures.push('shader-f16');
+ requireFeatureIfAvailable('shader-f16');
+ // Try subgroups
+ if (requireFeatureIfAvailable('subgroups' as GPUFeatureName)) {
+ // If subgroups feature is available, also try subgroups-f16
+ requireFeatureIfAvailable('subgroups-f16' as GPUFeatureName);
}
if (adapter.features.has('chromium-experimental-subgroups')) {
requiredFeatures.push('chromium-experimental-subgroups' as GPUFeatureName);
}
this.device = await adapter.requestDevice(deviceDescriptor);
+ this.deviceInfo = new DeviceInfoImpl(this.device);
this.adapterInfo = new AdapterInfoImpl(adapter.info || (await adapter.requestAdapterInfo()));
this.gpuDataManager = createGpuDataManager(this);
this.programManager = new ProgramManager(this);
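For context: a hedged sketch of how a WebGPU kernel could consult the new `DeviceInfo` through `ComputeContext` to pick a shader variant. Illustrative only; `chooseMatMulVariant` and the size threshold are made up, and the import path assumes a module that sits next to `webgpu/types.ts`:

```ts
import type { ComputeContext } from './webgpu/types';

function chooseMatMulVariant(context: ComputeContext): 'subgroup' | 'baseline' {
  const info = context.deviceInfo;
  // Use a subgroup-based shader only when the feature was actually granted on the
  // device and the adapter reports a usable subgroup size range.
  if (info.subgroupsSupported && info.subgroupSizeRange) {
    const [minSubgroupSize] = info.subgroupSizeRange;
    return minSubgroupSize >= 8 ? 'subgroup' : 'baseline';
  }
  return 'baseline';
}
```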
diff --git a/js/web/lib/wasm/jsep/backend-webnn.ts b/js/web/lib/wasm/jsep/backend-webnn.ts
index d13c663651127..b302354c46eeb 100644
--- a/js/web/lib/wasm/jsep/backend-webnn.ts
+++ b/js/web/lib/wasm/jsep/backend-webnn.ts
@@ -226,7 +226,7 @@ export class WebNNBackend {
return id;
}
- // Register WebNN Constant operands from external data.
+ // Register a WebNN Constant operand from external data.
public registerMLConstant(
externalFilePath: string,
dataOffset: number,
diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts
index fddc061cd775a..48bd3ef2bc36f 100644
--- a/js/web/lib/wasm/jsep/init.ts
+++ b/js/web/lib/wasm/jsep/init.ts
@@ -11,7 +11,13 @@ import { WebGpuBackend } from './backend-webgpu';
import { LOG_DEBUG } from './log';
import { TensorView } from './tensor-view';
import { ShapeUtil } from './util';
-import { AdapterInfo, ComputeContext, ComputeContextInputsOutputsMapping, ProgramInfo } from './webgpu/types';
+import {
+ AdapterInfo,
+ ComputeContext,
+ ComputeContextInputsOutputsMapping,
+ DeviceInfo,
+ ProgramInfo,
+} from './webgpu/types';
import { WebNNBackend } from './backend-webnn';
/* eslint-disable no-bitwise */
@@ -70,6 +76,7 @@ class TensorViewImpl implements TensorView {
class ComputeContextImpl implements ComputeContext {
readonly adapterInfo: AdapterInfo;
+ readonly deviceInfo: DeviceInfo;
readonly opKernelContext: number;
readonly inputs: readonly TensorView[];
readonly outputCount: number;
@@ -87,6 +94,7 @@ class ComputeContextImpl implements ComputeContext {
contextDataOffset: number,
) {
this.adapterInfo = backend.adapterInfo;
+ this.deviceInfo = backend.deviceInfo;
// extract context data
const ptrSize = module.PTR_SIZE;
@@ -112,18 +120,6 @@ class ComputeContextImpl implements ComputeContext {
this.inputs = inputs;
}
- getMaxComputeWorkgroupSizes(): [number, number, number] {
- return [
- this.backend.device.limits.maxComputeWorkgroupSizeX,
- this.backend.device.limits.maxComputeWorkgroupSizeY,
- this.backend.device.limits.maxComputeWorkgroupSizeZ,
- ];
- }
-
- getMaxComputeWorkgroupStoragesize(): number {
- return this.backend.device.limits.maxComputeWorkgroupStorageSize;
- }
-
compute(program: ProgramInfo, inputsOutputsMapping?: ComputeContextInputsOutputsMapping): TensorView[] {
// prepare inputs. inputs should always be valid data.
const mappedInputs =
diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
index 09c786daa3fcd..6c7afbc7365bb 100644
--- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
+++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
@@ -16,9 +16,11 @@ import { einsum, parseEinsumAttributes } from './ops/einsum';
import { expand } from './ops/expand';
import { fastGelu } from './ops/fast-gelu';
import { gather, parseGatherAttributes } from './ops/gather';
+import { gatherND, parseGatherNDAttributes } from './ops/gather-nd';
import { gatherBlockQuantized, parseGatherBlockQuantizedAttributes } from './ops/gather-block-quantized';
import { gatherElements, parseGatherElementsAttributes } from './ops/gather-elements';
import { gemm, parseGemmAttributes } from './ops/gemm';
+import { gridSample, parseGridSampleAttributes } from './ops/grid-sample';
import { groupQueryAttention } from './ops/group-query-attention';
import { instanceNorm } from './ops/instance-norm';
import { layerNorm } from './ops/layer-norm';
@@ -29,6 +31,7 @@ import { pad } from './ops/pad';
import * as pool from './ops/pool';
import { dequantizeLinear, parseDequantizeLinearAttributes } from './ops/quantize-linear';
import { range } from './ops/range';
+import { scatterND, parseScatterNDAttributes } from './ops/scatter-nd';
import {
reduceL1,
reduceL2,
@@ -98,12 +101,14 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new
['Gather', [gather, parseGatherAttributes]],
['GatherElements', [gatherElements, parseGatherElementsAttributes]],
['GatherBlockQuantized', [gatherBlockQuantized, parseGatherBlockQuantizedAttributes]],
+ ['GatherND', [gatherND, parseGatherNDAttributes]],
['Gelu', [unaryOps.gelu]],
['Gemm', [gemm, parseGemmAttributes]],
['GlobalAveragePool', [pool.globalAveragePool, pool.parseGlobalAveragePoolAttributes]],
['GlobalMaxPool', [pool.globalMaxPool, pool.parseGlobalMaxPoolAttributes]],
['Greater', [binaryOps.greater]],
['GreaterOrEqual', [binaryOps.greaterOrEqual]],
+ ['GridSample', [gridSample, parseGridSampleAttributes]],
['GroupQueryAttention', [groupQueryAttention]],
['HardSigmoid', [unaryOps.hardSigmoid, unaryOps.parseHardSigmoidAttributes]],
['InstanceNormalization', [instanceNorm]],
@@ -138,6 +143,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new
['Relu', [unaryOps.relu]],
['Resize', [resize, parseResizeAttributes]],
['RotaryEmbedding', [rotaryEmbedding]],
+ ['ScatterND', [scatterND, parseScatterNDAttributes]],
['Sigmoid', [unaryOps.sigmoid]],
['Sin', [unaryOps.sin]],
['Sinh', [unaryOps.sinh]],
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts
index 2a8756e435b8e..cb1f30ecdd1f4 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts
@@ -29,229 +29,27 @@ import {
ShaderHelper,
tensorTypeToWsglStorageType,
UniformsArrayType,
+ getMaxComponents,
} from '../common';
import { ConvTransposeAttributes } from '../conv-transpose';
-const createConvTranspose2DOpProgramShaderSource = (
- shaderHelper: ShaderHelper,
- inputs: readonly TensorView[],
- outputShape: readonly number[],
- hasBias: boolean,
- is1DimensionDispatch: boolean,
- isVec4 = false,
- dataType: string,
- uniforms: UniformsArrayType,
- isChannelsLast = false,
-): string => {
- const rowDim = isChannelsLast ? 1 : 2;
- const colDim = isChannelsLast ? 2 : 3;
- const channelDim = isChannelsLast ? 3 : 1;
- const workPerThread = isVec4 ? 2 : 1;
-
- let declareFunctions = `
- fn setOutputAtIndex(flatIndex : u32, value : ${isVec4 ? `vec4<${dataType}>` : dataType}) {
- result[flatIndex] = ${isVec4 ? `vec4<${dataType}>` : dataType}(value);
- }`;
- if (hasBias) {
- declareFunctions += `
- fn getBiasByOutputCoords(coords : vec4<u32>) -> ${isVec4 ? `vec4<${dataType}>` : dataType} {
- return bias[coords.${isChannelsLast ? 'w' : 'y'}${isVec4 ? '/ 4' : ''}];
- }`;
- }
- const components = isVec4 ? 4 : 1;
- const w = inputVariable('W', inputs[1].dataType, inputs[1].dims.length, components);
- const dy = inputVariable('Dy', inputs[0].dataType, inputs[0].dims.length, components);
- const inputVariables = [dy, w];
- if (hasBias) {
- inputVariables.push(inputVariable('bias', inputs[2].dataType, [outputShape[channelDim]].length, components));
- }
- const output = outputVariable('result', inputs[0].dataType, outputShape.length, components);
-
- const codeSnippet4 = `{
- let batch: u32 = ${is1DimensionDispatch ? 'global_id.z' : 'workgroup_id.z'} / uniforms.result_shape[1];
- let r = ${is1DimensionDispatch ? 'global_id.z' : 'workgroup_id.z'} % uniforms.result_shape[1];
- let c = ${is1DimensionDispatch ? 'global_id.y' : 'workgroup_id.y'} * ${workPerThread};
- let d1: u32 = ${is1DimensionDispatch ? 'global_id.x' : 'workgroup_id.x'} * 4;
-
- let dyCorner = vec2(i32(r), i32(c)) - vec2(uniforms.pads);
-
- // Convolve dy(?, ?, d2) with w(:, :, d1, d2) to compute dx(xR, xC, d1).
- // ? = to be determined. : = across all values in that axis.
- var dotProd: array<vec4<${dataType}>, ${workPerThread}>;
- for (var i = 0; i < ${workPerThread}; i++) {
- dotProd[i] = vec4<${dataType}>(0.0);
- }
- for (var wR: u32 = 0; wR < uniforms.filter_dims[0]; wR = wR + 1) {
- var dyR = (${dataType}(dyCorner.x) + ${dataType}(wR)) / ${dataType}(uniforms.strides.x);
- let wRPerm = uniforms.filter_dims[0] - 1 - wR;
- if (dyR < 0.0 || dyR >= ${dataType}(uniforms.Dy_shape[1]) ||
- fract(dyR) > 0.0 || wRPerm < 0) {
- continue;
- }
- let idyR: u32 = u32(dyR);
-
- for (var wC: u32 = 0; wC < uniforms.filter_dims[1]; wC = wC + 1) {
- let dyC = (${dataType}(dyCorner.y) + ${dataType}(wC)) / ${dataType}(uniforms.strides.y);
- let dyC2 = (${dataType}(dyCorner.y) + 1.0 + ${dataType}(wC)) / ${dataType}(uniforms.strides.y);
- let wCPerm = uniforms.filter_dims[1] - 1 - wC;
- if (wCPerm < 0) {
- continue;
- }
- var bDyCVal = true;
- var bDyCVal2 = true;
- if (dyC < 0.0 || dyC >= ${dataType}(uniforms.Dy_shape[2]) ||
- fract(dyC) > 0.0) {
- bDyCVal = false;
- }
- if (dyC2 < 0.0 || dyC2 >= ${dataType}(uniforms.Dy_shape[2]) ||
- fract(dyC2) > 0.0) {
- bDyCVal2 = false;
- }
-
- let idyC: u32 = u32(dyC);
- let idyC2: u32 = u32(dyC2);
- if (bDyCVal && bDyCVal2) {
- let d2Length = uniforms.Dy_shape[3];
- for (var d2 :u32 = 0; d2 < d2Length; d2 = d2 + 4) {
- let wValue0 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1', 'd2')};
- let wValue1 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1 + 1', 'd2')};
- let wValue2 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1 + 2', 'd2')};
- let wValue3 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1 + 3', 'd2')};
-
- var xValue = ${dy.get('batch', 'idyR', 'idyC', 'd2')};
- let tmpval = vec4<${dataType}>(dot(xValue, wValue0),
- dot(xValue, wValue1),
- dot(xValue, wValue2),
- dot(xValue, wValue3));
- dotProd[0] = dotProd[0] + tmpval;
-
- xValue = ${dy.get('batch', 'idyR', 'idyC2', 'd2')};
-
- dotProd[1] = dotProd[1] + vec4<${dataType}>(dot(xValue, wValue0),
- dot(xValue, wValue1),
- dot(xValue, wValue2),
- dot(xValue, wValue3));
- }
- } else if (bDyCVal) {
- let d2Length = uniforms.Dy_shape[${channelDim}];
- for (var d2: u32 = 0; d2 < d2Length; d2 = d2 + 4) {
- let wValue0 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1', 'd2')};
- let wValue1 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1 + 1', 'd2')};
- let wValue2 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1 + 2', 'd2')};
- let wValue3 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1 + 3', 'd2')};
-
- var xValue = ${dy.get('batch', 'idyR', 'idyC', 'd2')};
- let tmpval = vec4<${dataType}>(dot(xValue, wValue0),
- dot(xValue, wValue1),
- dot(xValue, wValue2),
- dot(xValue, wValue3));
- dotProd[0] = dotProd[0] + tmpval;
- }
- } else if (bDyCVal2) {
- let d2Length = uniforms.Dy_shape[3];
- for (var d2: u32 = 0; d2 < d2Length; d2 = d2 + 4) {
- let wValue0 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1', 'd2')};
- let wValue1 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1 + 1', 'd2')};
- let wValue2 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1 + 2', 'd2')};
- let wValue3 = ${w.get('u32(wRPerm)', 'u32(wCPerm)', 'd1 + 3', 'd2')};
-
- var xValue = ${dy.get('batch', 'idyR', 'idyC2', 'd2')};
- let tmpval = vec4<${dataType}>(dot(xValue, wValue0),
- dot(xValue, wValue1),
- dot(xValue, wValue2),
- dot(xValue, wValue3));
- dotProd[1] = dotProd[1] + tmpval;
- }
- }
- }
- }
-
- for (var i: u32 = 0; i < ${workPerThread}; i = i + 1) {
- let value = dotProd[i] + ${hasBias ? 'bias[c+i]' : `vec4<${dataType}>(0.0)`};
- ${output.set('batch', 'r', 'c + i', 'd1', 'value')};
- }
- }`;
- const codeSnippet = `
- let outputIndices = ${output.offsetToIndices('global_idx')};
- let batch = ${output.indicesGet('outputIndices', 0)};
- let d1 = ${output.indicesGet('outputIndices', channelDim)};
- let r = ${output.indicesGet('outputIndices', rowDim)};
- let c = ${output.indicesGet('outputIndices', colDim)};
- let dyCorner = vec2(i32(r), i32(c)) - uniforms.pads;
- let dyRCorner = dyCorner.x;
- let dyCCorner = dyCorner.y;
- let groupId = d1 / uniforms.output_channels_per_group;
- let wOutChannel = d1 - groupId * uniforms.output_channels_per_group;
- // Convolve dy(?, ?, d2) with w(:, :, d1, d2) to compute dx(xR, xC, d1).
- // ? = to be determined. : = across all values in that axis.
- var dotProd = ${dataType}(0.0);
- for (var wR: u32 = 0; wR < uniforms.effective_filter_dims.x; wR = wR + 1) {
- if (wR % uniforms.dilations.x != 0) {
- continue;
- }
- let dyR = (${dataType}(dyRCorner) + ${dataType}(wR)) / ${dataType}(uniforms.strides[0]);
- let wRPerm = uniforms.filter_dims.x - 1 - wR / uniforms.dilations.x;
- if (dyR < 0.0 || dyR >= ${dataType}(uniforms.Dy_shape[${rowDim}]) || fract(dyR) > 0.0 ||
- wRPerm < 0) {
- continue;
- }
- let idyR: u32 = u32(dyR);
-
- for (var wC: u32 = 0; wC < uniforms.effective_filter_dims.y; wC = wC + 1) {
- if (wC % uniforms.dilations.y != 0) {
- continue;
- }
- let dyC = (${dataType}(dyCCorner) + ${dataType}(wC)) / ${dataType}(uniforms.strides.y);
- let wCPerm = uniforms.filter_dims.y - 1 - wC / uniforms.dilations.y;
- if (dyC < 0.0 || dyC >= ${dataType}(uniforms.Dy_shape[${colDim}]) ||
- fract(dyC) > 0.0 || wCPerm < 0) {
- continue;
- }
- let idyC: u32 = u32(dyC);
- var inputChannel = groupId * uniforms.input_channels_per_group;
- for (var d2: u32 = 0; d2 < uniforms.input_channels_per_group; d2 = d2 + 1) {
- let xValue = ${
- isChannelsLast
- ? dy.get('batch', 'idyR', 'idyC', 'inputChannel')
- : dy.get('batch', 'inputChannel', 'idyR', 'idyC')
- };
- let wValue = ${w.get('inputChannel', 'wOutChannel', 'u32(wRPerm)', 'u32(wCPerm)')};
- dotProd = dotProd + xValue * wValue;
- inputChannel = inputChannel + 1;
- }
- }
- }
- let value = dotProd + ${hasBias ? 'bias[d1]' : `${dataType}(0.0)`};
- ${output.setByOffset('global_idx', 'value')};
- `;
-
- return `
- ${shaderHelper.registerUniforms(uniforms).declareVariables(...inputVariables, output)}
- ${declareFunctions}
-
- ${shaderHelper.mainStart()}
- ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')};
- ${isVec4 ? codeSnippet4 : codeSnippet}}`;
-};
-
export const createConvTranspose2DProgramInfo = (
inputs: readonly TensorView[],
attributes: ConvTransposeAttributes,
squeezeOutputShapeFunction?: (shape: readonly number[]) => number[],
): ProgramInfo => {
const hasBias = inputs.length > 2;
- // const isChannelsLast = attributes.format === 'NHWC';
const outputShape = attributes.outputShape;
- const outputSize = ShapeUtil.size(outputShape);
-
- // const inChannels = inputs[0].dims[isChannelsLast ? 3 : 1];
- // TODO Enable isVec4 for performance
- // Disabled due to weight matrix layout issue
- // const isVec4 = attributes.group === 1 && isChannelsLast && inChannels % 4 === 0 && outChannels % 4 === 0;
+ const isChannelsLast = attributes.format === 'NHWC';
+ const group = attributes.group;
+ const wShape = inputs[1].dims;
+ const inputChannelsPerGroup = wShape[2] / group;
+ const outputChannelsPerGroup = wShape[3];
+ const components = isChannelsLast ? getMaxComponents(outputChannelsPerGroup) : 1;
+ const outputSize = ShapeUtil.size(outputShape) / components;
const dispatch = [Math.ceil(outputSize / 64), 1, 1];
LOG_DEBUG('verbose', () => `[conv2d_backprop_webgpu] dispatch = ${dispatch}`);
- const isChannelsLast = attributes.format === 'NHWC';
const inputDependencies: ProgramInputTensorInfoDependency[] = ['rank', 'rank'];
const strides = [attributes.strides[0], attributes.strides[1]];
const filterDims = [attributes.kernelShape[isChannelsLast ? 1 : 2], attributes.kernelShape[isChannelsLast ? 2 : 3]];
@@ -268,15 +66,9 @@ export const createConvTranspose2DProgramInfo = (
];
const pads = [
effectiveFilterDims[0] - 1 - Math.floor((attributes.pads[0] + attributes.pads[2]) / 2),
- effectiveFilterDims[1] - 1 - Math.floor(attributes.pads[1] + attributes.pads[3]) / 2,
+ effectiveFilterDims[1] - 1 - Math.floor((attributes.pads[1] + attributes.pads[3]) / 2),
];
- const isVec4 = false;
- const group = attributes.group;
- const wShape = inputs[1].dims;
- const inputChannelsPerGroup = wShape[0] / group;
- const outputChannelsPerGroup = wShape[1];
-
const programUniforms: ProgramUniform[] = [
{ type: DataType.uint32, data: outputSize },
{ type: DataType.uint32, data: strides },
@@ -294,7 +86,6 @@ export const createConvTranspose2DProgramInfo = (
}
programUniforms.push(...createTensorShapeVariables(outputShape));
- const is1DimensionDispatch = dispatch[1] === 1 && dispatch[2] === 1;
const getShaderSource = (shaderHelper: ShaderHelper) => {
const uniforms: UniformsArrayType = [
{ name: 'output_size', type: 'u32' },
@@ -307,21 +98,83 @@ export const createConvTranspose2DProgramInfo = (
{ name: 'output_channels_per_group', type: 'u32' },
];
const dataType = tensorTypeToWsglStorageType(inputs[0].dataType);
- return `${createConvTranspose2DOpProgramShaderSource(
- shaderHelper,
- inputs,
- outputShape,
- hasBias,
- is1DimensionDispatch,
- isVec4,
- dataType,
- uniforms,
- isChannelsLast,
- )}`;
+ const rowDim = isChannelsLast ? 1 : 2;
+ const colDim = isChannelsLast ? 2 : 3;
+ const channelDim = isChannelsLast ? 3 : 1;
+
+ const w = inputVariable('W', inputs[1].dataType, inputs[1].dims.length, components);
+ const dy = inputVariable('Dy', inputs[0].dataType, inputs[0].dims.length);
+ const inputVariables = [dy, w];
+ if (hasBias) {
+ inputVariables.push(inputVariable('bias', inputs[2].dataType, [outputShape[channelDim]].length, components));
+ }
+ const output = outputVariable('result', inputs[0].dataType, outputShape.length, components);
+
+ const codeSnippet = `
+ let outputIndices = ${output.offsetToIndices(`global_idx * ${components}`)};
+ let batch = ${output.indicesGet('outputIndices', 0)};
+ let d1 = ${output.indicesGet('outputIndices', channelDim)};
+ let r = ${output.indicesGet('outputIndices', rowDim)};
+ let c = ${output.indicesGet('outputIndices', colDim)};
+ let dyCorner = vec2(i32(r), i32(c)) - uniforms.pads;
+ let dyRCorner = dyCorner.x;
+ let dyCCorner = dyCorner.y;
+ let groupId = d1 / uniforms.output_channels_per_group;
+ let wOutChannel = d1 - groupId * uniforms.output_channels_per_group;
+ // Convolve dy(?, ?, d2) with w(:, :, d1, d2) to compute dx(xR, xC, d1).
+ // ? = to be determined. : = across all values in that axis.
+ var dotProd = ${output.type.value}(0.0);
+ for (var wR: u32 = 0; wR < uniforms.effective_filter_dims.x; wR = wR + 1) {
+ if (wR % uniforms.dilations.x != 0) {
+ continue;
+ }
+ let dyR = (${dataType}(dyRCorner) + ${dataType}(wR)) / ${dataType}(uniforms.strides[0]);
+ let wRPerm = uniforms.filter_dims.x - 1 - wR / uniforms.dilations.x;
+ if (dyR < 0.0 || dyR >= ${dataType}(uniforms.Dy_shape[${rowDim}]) || fract(dyR) > 0.0 ||
+ wRPerm < 0) {
+ continue;
+ }
+ let idyR: u32 = u32(dyR);
+
+ for (var wC: u32 = 0; wC < uniforms.effective_filter_dims.y; wC = wC + 1) {
+ if (wC % uniforms.dilations.y != 0) {
+ continue;
+ }
+ let dyC = (${dataType}(dyCCorner) + ${dataType}(wC)) / ${dataType}(uniforms.strides.y);
+ let wCPerm = uniforms.filter_dims.y - 1 - wC / uniforms.dilations.y;
+ if (dyC < 0.0 || dyC >= ${dataType}(uniforms.Dy_shape[${colDim}]) ||
+ fract(dyC) > 0.0 || wCPerm < 0) {
+ continue;
+ }
+ let idyC: u32 = u32(dyC);
+ var inputChannel = groupId * uniforms.input_channels_per_group;
+ for (var d2: u32 = 0; d2 < uniforms.input_channels_per_group; d2 = d2 + 1) {
+ let xValue = ${
+ isChannelsLast
+ ? dy.get('batch', 'idyR', 'idyC', 'inputChannel')
+ : dy.get('batch', 'inputChannel', 'idyR', 'idyC')
+ };
+ let w_offset = ${w.indicesToOffset(`${w.type.indices}(u32(wRPerm), u32(wCPerm), inputChannel, wOutChannel)`)};
+ let wValue = ${w.getByOffset(`w_offset / ${components}`)};
+ dotProd = dotProd + xValue * wValue;
+ inputChannel = inputChannel + 1;
+ }
+ }
+ }
+ let value = dotProd${hasBias ? ` + bias[d1 / ${components}]` : ''};
+ ${output.setByOffset('global_idx', 'value')};
+ `;
+
+ return `
+ ${shaderHelper.registerUniforms(uniforms).declareVariables(...inputVariables, output)}
+ ${shaderHelper.mainStart()}
+ ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')};
+ ${codeSnippet}}`;
};
+
return {
name: 'ConvTranspose2D',
- shaderCache: { hint: `${attributes.cacheKey};`, inputDependencies },
+ shaderCache: { hint: `${attributes.cacheKey};${components}`, inputDependencies },
getRunData: () => ({
dispatchGroup: { x: dispatch[0], y: dispatch[1], z: dispatch[2] },
outputs: [
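One small but consequential change in this file is the pads computation: the old expression applied Math.floor to the pad sum before dividing, so odd totals produced fractional pads. A standalone arithmetic check of the corrected form (values are made up for illustration):

const padBegin = 1;
const padEnd = 2;
const effectiveFilterDim = 5;
// old: Math.floor(padBegin + padEnd) / 2   -> 1.5 (fractional, wrong)
// new: Math.floor((padBegin + padEnd) / 2) -> 1   (integral, as intended)
const pad = effectiveFilterDim - 1 - Math.floor((padBegin + padEnd) / 2); // 3
console.log(pad);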
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts
index f0287529ca08b..c6341f94cf191 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts
@@ -25,7 +25,6 @@ import { ShapeUtil } from '../../../util';
import { ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform } from '../../types';
import {
createTensorShapeVariables,
- getBroadcastDims,
IndicesHelper,
inputVariable,
internalVariable,
@@ -40,6 +39,7 @@ import {
getActivationSnippet,
InternalActivationAttributes,
} from '../fuse-utils';
+import { convertOutputBatchIndicesToInputBatchIndices } from '../matmul-shaders';
import { typeSnippet } from './activation_util';
@@ -373,42 +373,11 @@ const matMulReadWriteFnSource = (
hasBias: boolean,
applyActivation: string,
variables: IndicesHelper[],
- batchShapes: Array<readonly number[]>,
isChannelsLast = false,
): string => {
- const [batchAShape, batchBShape, batchShape] = batchShapes;
const [batchVariable, aVariable, bVariable, outputVariable] = variables;
- const broadCastADims = getBroadcastDims(batchAShape, batchShape);
- const broadCastBDims = getBroadcastDims(batchBShape, batchShape);
const dataType = tensorTypeToWsglStorageType(variables[0].type.tensor);
- const getAIndices = () => {
- const aRank = aVariable.rank;
- const batchRank = batchVariable.rank;
- let resStr = `var aIndices: ${aVariable.type.indices};`;
- for (let i = aRank - 2 - 1, j = batchRank - 1; i >= 0; i--, j--) {
- resStr += `\naIndices[${i}] = ${batchRank > 1 ? `batchIndices[${j}]` : 'batchIndices'};`;
- }
- broadCastADims.forEach((i) => {
- resStr += `\naIndices[${i}] = 0;`;
- });
- resStr += `\naIndices[${aRank - 2}] = u32(row);
- aIndices[${aRank - 1}] = u32(colIn);`;
- return resStr;
- };
- const getBIndices = () => {
- const bRank = bVariable.rank;
- const batchRank = batchVariable.rank;
- let resStr = `var bIndices: ${bVariable.type.indices};`;
- for (let i = bRank - 2 - 1, j = batchRank - 1; i >= 0; i--, j--) {
- resStr += `\nbIndices[${i}] = ${batchRank > 1 ? `batchIndices[${j}]` : 'batchIndices'};`;
- }
- broadCastBDims.forEach((i) => {
- resStr += `\nbIndices[${i}] = 0;`;
- });
- resStr += `\nbIndices[${bRank - 2}] = u32(row);
- bIndices[${bRank - 1}] = u32(colIn);`;
- return resStr;
- };
+
const source = `
fn mm_readA(batch: i32, row: i32, colIn: i32, batchIndices: ${batchVariable.type.indices}) -> ${typeSnippet(
component,
@@ -418,7 +387,16 @@ const matMulReadWriteFnSource = (
let col = colIn * ${component};
if(row < uniforms.dim_a_outer && col < uniforms.dim_inner)
{
- ${getAIndices()}
+ var aIndices: ${aVariable.type.indices};
+ ${convertOutputBatchIndicesToInputBatchIndices(
+ 'aIndices',
+ aVariable,
+ aVariable.rank - 2,
+ batchVariable.rank,
+ 'batchIndices',
+ )}
+ ${aVariable.indicesSet('aIndices', aVariable.rank - 2, 'u32(row)')}
+ ${aVariable.indicesSet('aIndices', aVariable.rank - 1, 'u32(colIn)')}
value = ${aVariable.getByIndices('aIndices')};
}
return value;
@@ -432,7 +410,16 @@ const matMulReadWriteFnSource = (
let col = colIn * ${component};
if(row < uniforms.dim_inner && col < uniforms.dim_b_outer)
{
- ${getBIndices()}
+ var bIndices: ${bVariable.type.indices};
+ ${convertOutputBatchIndicesToInputBatchIndices(
+ 'bIndices',
+ bVariable,
+ bVariable.rank - 2,
+ batchVariable.rank,
+ 'batchIndices',
+ )}
+ ${bVariable.indicesSet('bIndices', bVariable.rank - 2, 'u32(row)')}
+ ${bVariable.indicesSet('bIndices', bVariable.rank - 1, 'u32(colIn)')}
value = ${bVariable.getByIndices('bIndices')};
}
return value;
@@ -532,7 +519,6 @@ export const createMatmulProgramInfo = (
hasBias,
applyActivation,
[batchDims, A, B, output],
- [outerDimsA, outerDimsB, outerDims],
isChannelsLast,
);
return `
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/common.ts b/js/web/lib/wasm/jsep/webgpu/ops/common.ts
index 793f26fe901e3..0b9173403cd7d 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/common.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/common.ts
@@ -195,7 +195,7 @@ export interface IndicesHelper {
/**
* whether the helper is for an input, an output or an internal variable.
*/
- readonly usage: 'input' | 'output' | 'internal';
+ readonly usage: 'input' | 'output' | 'atomicOutput' | 'internal';
/**
* the rank of the input or output.
@@ -733,6 +733,20 @@ export const outputVariable = (
components: 1 | 2 | 3 | 4 = 1,
): IndicesHelper => createIndicesHelper(name, type, shapeOrRank, 'output', components);
+/**
+ * Create a IndicesHelper for an atomic output.
+ *
+ * @param name - the name of the output.
+ * @param type - the tensor type of the output.
+ * @param shapeOrRank - the tensor shape or the rank of the output.
+ * @returns an IndicesHelper for the output.
+ */
+export const atomicOutputVariable = (
+ name: string,
+ type: number,
+ shapeOrRank: number | readonly number[],
+): IndicesHelper => createIndicesHelper(name, type, shapeOrRank, 'atomicOutput', 1);
+
/**
* Create a IndicesHelper for an internal variable.
*
@@ -905,9 +919,8 @@ class ShaderHelperImpl implements ShaderHelper {
}
this.variables.push(variable);
this.appendVariableUniforms(variable);
-
const access = variable.usage === 'input' ? 'read' : 'read_write';
- const storageType = variable.type.storage;
+ const storageType = variable.usage === 'atomicOutput' ? `atomic<i32>` : variable.type.storage;
return `@group(0) @binding(${bindingIndex}) var<storage, ${access}> ${variable.name}: array<${storageType}>;`;
}
@@ -996,27 +1009,3 @@ class ShaderHelperImpl implements ShaderHelper {
export const createShaderHelper = (dispatchGroup: [number, number, number], limits: GPUSupportedLimits) =>
new ShaderHelperImpl(dispatchGroup, limits);
-
-/**
- * This function comes from https://github.com/tensorflow/tfjs/blob/master/tfjs-core/src/ops/broadcast_util.ts#L18-L40
- * Returns the dimensions in the input shape that are broadcasted to
- * produce the provided output shape.
- *
- * The returned dimensions are 0-indexed and sorted. An example:
- * inShape = [4, 1, 3]
- * outShape = [5, 4, 3, 3]
- * result = [1]. Dimension 1 (2nd dimension of input) gets broadcasted 1 => 3.
- */
-export const getBroadcastDims = (inShape: readonly number[], outShape: readonly number[]): number[] => {
- const inRank = inShape.length;
- const dims: number[] = [];
- for (let i = 0; i < inRank; i++) {
- const dim = inRank - 1 - i;
- const a = inShape[dim] || 1;
- const b = outShape[outShape.length - 1 - i] || 1;
- if (b > 1 && a === 1) {
- dims.unshift(dim);
- }
- }
- return dims;
-};
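The removed getBroadcastDims helper is no longer needed here because matMulReadWriteFnSource now delegates batch-index conversion to convertOutputBatchIndicesToInputBatchIndices from matmul-shaders. For reference, a standalone copy of the removed logic reproduces the example from its docstring:

const getBroadcastDimsExample = (inShape: readonly number[], outShape: readonly number[]): number[] => {
  const dims: number[] = [];
  for (let i = 0; i < inShape.length; i++) {
    const dim = inShape.length - 1 - i;
    const a = inShape[dim] || 1;
    const b = outShape[outShape.length - 1 - i] || 1;
    if (b > 1 && a === 1) {
      dims.unshift(dim); // this input dim is broadcast to match the output
    }
  }
  return dims;
};
console.log(getBroadcastDimsExample([4, 1, 3], [5, 4, 3, 3])); // [1]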
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts
index 236f1b09a6c93..3e168ddedac86 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts
@@ -4,7 +4,6 @@
import { TensorView } from '../../tensor-view';
import { ComputeContext } from '../types';
-import { createConv2DTransposeMatMulProgramInfo } from './3rd-party/conv_backprop_mm_webgpu';
import { createConvTranspose2DProgramInfo } from './3rd-party/conv_backprop_webgpu';
import { ConvAttributes } from './conv';
import { parseInternalActivationAttributes } from './fuse-utils';
@@ -227,41 +226,16 @@ const validateInputs = (inputs: readonly TensorView[], attributes: ConvTranspose
}
};
-// for transposing weight tensor from [C, M/group, KH, KW] to [KH, KW, M/group, C]
-const weightTransposePerm = [2, 3, 1, 0];
-
const convTranspose2d = (
context: ComputeContext,
inputs: readonly TensorView[],
attributes: ConvTransposeAttributes,
+ squeezeOutputShapeFunction?: (shape: readonly number[]) => number[],
): void => {
- const adjustedAttributes = getAdjustedConvTransposeAttributes(attributes, inputs);
- const isChannelsLast = attributes.format === 'NHWC';
- const outputShape = adjustedAttributes.outputShape;
- const outChannels = outputShape[isChannelsLast ? 3 : 1];
- const inputChannels = inputs[0].dims[isChannelsLast ? 3 : 1];
- // Switch to naive method when outChannels and inputChannels are very small. It's because that in this case it's
- // not suitable for matmul version since matmul uses tile size 32x32 resulting the underlying execution unit
- // utilization rate is very low.
- if (adjustedAttributes.group !== 1 || (outChannels === 1 && inputChannels === 1)) {
- context.compute(createConvTranspose2DProgramInfo(inputs, adjustedAttributes));
- return;
- }
- const outHeight = outputShape[isChannelsLast ? 1 : 2];
- const outWidth = outputShape[isChannelsLast ? 2 : 3];
- const weightHeight = inputs[1].dims[2];
- const weightWidth = inputs[1].dims[3];
-
- const dimAOuter = isChannelsLast ? outHeight * outWidth : outChannels;
- const dimBOuter = isChannelsLast ? outChannels : outHeight * outWidth;
- const dimInner = weightHeight * weightWidth * inputChannels;
-
- const sequentialAccessByThreads = /* backend.adapterInfo.isIntel() */ true;
-
// STEP.1: transpose weight
const transposedWeight =
(context.kernelCustomData.wT as TensorView | undefined) ??
- context.compute(createTransposeProgramInfo(inputs[1], weightTransposePerm), {
+ context.compute(createTransposeProgramInfo(inputs[1], [2, 3, 0, 1]), {
inputs: [1],
outputs: [attributes.wIsConst ? -2 : -1],
})[0];
@@ -271,29 +245,12 @@ const convTranspose2d = (
// STEP.2: prepare reshaped inputs
const convTransposeInputs = [inputs[0], transposedWeight];
- const hasBias = inputs.length === 3;
- if (hasBias) {
- if (!isChannelsLast && inputs[2].dims.length === 1) {
- convTransposeInputs.push(inputs[2].reshape([inputs[2].dims[0], 1, 1]));
- } else {
- convTransposeInputs.push(inputs[2]);
- }
+ if (inputs.length === 3) {
+ convTransposeInputs.push(inputs[2]);
}
-
- // STEP.3: compute matmul
- context.compute(
- createConv2DTransposeMatMulProgramInfo(
- convTransposeInputs,
- adjustedAttributes,
- outputShape,
- dimAOuter,
- dimBOuter,
- dimInner,
- hasBias,
- sequentialAccessByThreads,
- ),
- { inputs: convTransposeInputs },
- );
+ context.compute(createConvTranspose2DProgramInfo(convTransposeInputs, attributes, squeezeOutputShapeFunction), {
+ inputs: convTransposeInputs,
+ });
};
const convTranspose1d = (context: ComputeContext, attributes: ConvTransposeAttributes): void => {
@@ -338,12 +295,9 @@ const convTranspose1d = (context: ComputeContext, attributes: ConvTransposeAttri
{ ...attributes, pads, strides, dilations, kernelShape },
inputs,
);
- context.compute(
- createConvTranspose2DProgramInfo(inputs, adjustedAttributes, (outputShape) =>
- isChannelLast
- ? [outputShape[0], outputShape[2], outputShape[3]]
- : [outputShape[0], outputShape[1], outputShape[3]],
- ),
+
+ convTranspose2d(context, inputs, adjustedAttributes, (outputShape) =>
+ isChannelLast ? [outputShape[0], outputShape[2], outputShape[3]] : [outputShape[0], outputShape[1], outputShape[3]],
);
};
@@ -352,6 +306,7 @@ export const convTranspose = (context: ComputeContext, attributes: ConvTranspose
if (context.inputs[0].dims.length === 3) {
convTranspose1d(context, attributes);
} else {
- convTranspose2d(context, context.inputs, attributes);
+ const adjustedAttributes = getAdjustedConvTransposeAttributes(attributes, context.inputs);
+ convTranspose2d(context, context.inputs, adjustedAttributes);
}
};
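The weight transpose permutation changes from [2, 3, 1, 0] to [2, 3, 0, 1], so the ConvTranspose weight [C, M/group, KH, KW] is now laid out as [KH, KW, C, M/group], which matches the (wRPerm, wCPerm, inputChannel, wOutChannel) access order in the rewritten shader and the wShape[2]/wShape[3] reads in createConvTranspose2DProgramInfo. A quick shape check (example dims assumed):

const permute = (shape: readonly number[], perm: readonly number[]): number[] => perm.map((p) => shape[p]);
console.log(permute([8, 4, 3, 3], [2, 3, 0, 1])); // [3, 3, 8, 4] = [KH, KW, C, M/group]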
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/expand.ts b/js/web/lib/wasm/jsep/webgpu/ops/expand.ts
index 4e2bfa9d89924..3691b5ecb602b 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/expand.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/expand.ts
@@ -48,11 +48,18 @@ const createExpandProgramInfo = (inputs: readonly TensorView[]): ProgramInfo =>
const shape = Array.from(inputs[1].getBigInt64Array(), Number);
const outputShape: number[] = calculateOutputShape(inputShape, shape);
const dataType = inputs[0].dataType;
- const components = dataType === DataType.bool ? 4 : 1;
+ const isBoolOrScalar = dataType === DataType.bool || ShapeUtil.size(inputShape) === 1;
+ const iComponents =
+ dataType === DataType.bool ? 4 : inputShape.length > 0 && inputShape[inputShape.length - 1] % 4 === 0 ? 4 : 1;
+ const components = isBoolOrScalar
+ ? 4
+ : outputShape.length > 0 && outputShape[outputShape.length - 1] % 4 === 0
+ ? 4
+ : 1;
const outputSize = Math.ceil(ShapeUtil.size(outputShape) / components);
const getShaderSource = (shaderHelper: ShaderHelper) => {
- const input = inputVariable('input', dataType, inputShape.length, components);
+ const input = inputVariable('input', dataType, inputShape.length, iComponents);
const output = outputVariable('output', dataType, outputShape.length, components);
let assignment: string;
if (dataType === DataType.bool) {
@@ -74,9 +81,10 @@ const createExpandProgramInfo = (inputs: readonly TensorView[]): ProgramInfo =>
}`;
} else {
assignment = `
- let outputIndices = ${output.offsetToIndices('global_idx')};
+ let outputIndices = ${output.offsetToIndices(`global_idx * ${components}`)};
let inputOffset = ${input.broadcastedIndicesToOffset('outputIndices', output)};
- ${output.setByOffset('global_idx', input.getByOffset('inputOffset'))}
+ let data = ${output.type.value}(${input.getByOffset(`inputOffset / ${iComponents}`)});
+ ${output.setByOffset('global_idx', 'data')}
}`;
}
return `
@@ -92,7 +100,7 @@ const createExpandProgramInfo = (inputs: readonly TensorView[]): ProgramInfo =>
];
return {
name: 'Expand',
- shaderCache: { hint: `${outputShape.length}`, inputDependencies: ['rank'] },
+ shaderCache: { hint: `${outputShape.length};${iComponents}${components}`, inputDependencies: ['rank'] },
getShaderSource,
getRunData: () => ({
outputs: [{ dims: outputShape, dataType: inputs[0].dataType }],
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gather-nd.ts b/js/web/lib/wasm/jsep/webgpu/ops/gather-nd.ts
new file mode 100644
index 0000000000000..43b51f6e94a66
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/ops/gather-nd.ts
@@ -0,0 +1,179 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+import { DataType } from '../../../wasm-common';
+import { TensorView } from '../../tensor-view';
+import { ShapeUtil } from '../../util';
+import { AttributeWithCacheKey } from '../attribute-with-cache-key';
+import { ComputeContext, ProgramUniform } from '../types';
+
+import { createTensorShapeVariables, inputVariable, outputVariable, ShaderHelper, UniformsArrayType } from './common';
+
+export interface GatherNDAttributes extends AttributeWithCacheKey {
+ readonly batchDims: number;
+}
+
+const computeSliceOffsets = (
+ context: ComputeContext,
+ indicesData: TensorView,
+ sizesFromSliceDimsData: number[],
+ batchDims: number,
+ inputDims: readonly number[],
+ numSlices: number,
+ numSlicesPerBatch: number,
+ inputBatchStride: number,
+ numSliceDims: number,
+) => {
+ const programUniforms: ProgramUniform[] = [
+ { type: DataType.uint32, data: numSlices },
+ { type: DataType.uint32, data: batchDims },
+ { type: DataType.uint32, data: inputDims },
+ { type: DataType.uint32, data: sizesFromSliceDimsData },
+ { type: DataType.uint32, data: numSlicesPerBatch },
+ { type: DataType.uint32, data: inputBatchStride },
+ { type: DataType.uint32, data: numSliceDims },
+ ];
+
+ const outputShape = [numSlices];
+ programUniforms.push(...createTensorShapeVariables(indicesData.dims, outputShape));
+
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const indices = inputVariable('indices_data', indicesData.dataType, indicesData.dims.length);
+ const output = outputVariable('input_slice_offsets_data', DataType.uint32, 1, 1);
+ const variables = [indices, output];
+ const uniforms: UniformsArrayType = [
+ { name: 'output_size', type: 'u32' },
+ { name: 'batch_dims', type: 'u32' },
+ { name: 'input_dims', type: 'u32', length: inputDims.length },
+ { name: 'sizes_from_slice_dims_data', type: 'u32', length: sizesFromSliceDimsData.length },
+ { name: 'num_slices_per_batch', type: 'u32' },
+ { name: 'input_batch_stride', type: 'u32' },
+ { name: 'num_slice_dims', type: 'u32' },
+ ];
+ return `
+ ${shaderHelper.registerUniforms(uniforms).declareVariables(...variables)}
+ ${shaderHelper.mainStart()}
+ ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')}
+ let batch_idx = global_idx / uniforms.num_slices_per_batch;
+ let base_offset = batch_idx * uniforms.input_batch_stride;
+
+ let slice_indices_base_offset = global_idx * uniforms.num_slice_dims;
+ var relative_slice_offset = 0;
+ for (var dim_idx = 0u; dim_idx < uniforms.num_slice_dims; dim_idx ++) {
+ var index = i32(indices_data[dim_idx + slice_indices_base_offset].x);
+ let input_dim_idx = uniforms.batch_dims + dim_idx;
+ if (index < 0) {
+ ${
+ inputDims.length === 1
+ ? 'index += i32(uniforms.input_dims);'
+ : 'index += i32(uniforms.input_dims[input_dim_idx]);'
+ }
+ }
+ ${
+ sizesFromSliceDimsData.length === 1
+ ? 'relative_slice_offset += index * i32(uniforms.sizes_from_slice_dims_data);'
+ : 'relative_slice_offset += index * i32(uniforms.sizes_from_slice_dims_data[dim_idx]);'
+ }
+ }
+
+ input_slice_offsets_data[global_idx] = base_offset + u32(relative_slice_offset);
+ }`;
+ };
+
+ return context.compute(
+ {
+ name: 'computeSliceOffsets',
+ shaderCache: { hint: `${inputDims.length}_${sizesFromSliceDimsData.length}`, inputDependencies: ['rank'] },
+ getRunData: () => ({
+ outputs: [{ dims: outputShape, dataType: context.inputs[1].dataType }],
+ dispatchGroup: { x: Math.ceil(numSlices / 64) },
+ programUniforms,
+ }),
+ getShaderSource,
+ },
+ { inputs: [indicesData], outputs: [-1] },
+ )[0];
+};
+
+export const gatherND = (context: ComputeContext, attributes: GatherNDAttributes) => {
+ const inputs = context.inputs;
+ const inputShape = inputs[0].dims;
+ const inputType = inputs[0].dataType;
+ const indicesShape = inputs[1].dims;
+ const numSliceDims = indicesShape[indicesShape.length - 1];
+ const numSlices = ShapeUtil.sizeToDimension(indicesShape, indicesShape.length - 1);
+ const sliceSize = ShapeUtil.sizeFromDimension(inputShape, attributes.batchDims + numSliceDims);
+ const numBatches = ShapeUtil.sizeToDimension(inputShape, attributes.batchDims);
+ const inputBatchStride = ShapeUtil.sizeFromDimension(inputShape, attributes.batchDims);
+ const numSlicesPerBatch = numSlices / numBatches;
+ const sizesFromSliceDims = new Array(numSliceDims);
+ let runningProduct = sliceSize;
+ for (let i = 0; i < numSliceDims; ++i) {
+ sizesFromSliceDims[numSliceDims - 1 - i] = runningProduct;
+ runningProduct *= inputShape[attributes.batchDims + numSliceDims - 1 - i];
+ }
+
+ const inputSliceOffsets = computeSliceOffsets(
+ context,
+ inputs[1],
+ sizesFromSliceDims,
+ attributes.batchDims,
+ inputShape,
+ numSlices,
+ numSlicesPerBatch,
+ inputBatchStride,
+ numSliceDims,
+ );
+
+ const lastIndicesDimension = attributes.batchDims + numSliceDims;
+ if (lastIndicesDimension > inputShape.length) {
+ throw new Error('last dimension of indices must not be larger than rank of input tensor');
+ }
+
+ const outputShape = indicesShape.slice(0, -1).concat(inputShape.slice(lastIndicesDimension));
+ const outputSize = ShapeUtil.size(outputShape);
+
+ const programUniforms: ProgramUniform[] = [
+ { type: DataType.uint32, data: outputSize },
+ { type: DataType.uint32, data: sliceSize },
+ ...createTensorShapeVariables(inputs[0].dims, inputSliceOffsets.dims, outputShape),
+ ];
+
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const input = inputVariable('data', inputs[0].dataType, inputs[0].dims.length);
+ const indices = inputVariable('slice_offsets', DataType.uint32, inputSliceOffsets.dims.length);
+
+ const output = outputVariable('output', inputs[0].dataType, outputShape.length);
+ return `
+ ${shaderHelper
+ .registerUniform('output_size', 'u32')
+ .registerUniform('slice_size', 'u32')
+ .declareVariables(input, indices, output)}
+ ${shaderHelper.mainStart()}
+ ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')}
+ let slice_offset = slice_offsets[global_idx / uniforms.slice_size];
+ output[global_idx] = data[u32(slice_offset) + global_idx % uniforms.slice_size];
+ }`;
+ };
+ context.compute(
+ {
+ name: 'GatherND',
+ shaderCache: { hint: attributes.cacheKey, inputDependencies: ['rank', 'rank'] },
+ getRunData: () => ({
+ outputs: [{ dims: outputShape, dataType: inputType }],
+ dispatchGroup: { x: Math.ceil(outputSize / 64 /* workgroup size */) },
+ programUniforms,
+ }),
+ getShaderSource,
+ },
+ { inputs: [inputs[0], inputSliceOffsets] },
+ );
+};
+
+export const parseGatherNDAttributes = (attributes: Record<string, unknown>): GatherNDAttributes => {
+ const batchDims = attributes.batch_dims as number;
+ return {
+ batchDims,
+ cacheKey: '',
+ };
+};
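The sizesFromSliceDims loop above precomputes, for each slice dimension, how many elements one index step spans. A worked example with assumed shapes (input [2, 3, 4], indices [5, 2], batchDims = 0):

const inputShape = [2, 3, 4];
const batchDims = 0;
const numSliceDims = 2; // last dimension of the indices shape [5, 2]
const sliceSize = 4;    // product of inputShape[batchDims + numSliceDims ..]
const sizesFromSliceDims = new Array<number>(numSliceDims);
let runningProduct = sliceSize;
for (let i = 0; i < numSliceDims; ++i) {
  sizesFromSliceDims[numSliceDims - 1 - i] = runningProduct;
  runningProduct *= inputShape[batchDims + numSliceDims - 1 - i];
}
console.log(sizesFromSliceDims); // [12, 4]: an index (i, j) selects the slice starting at offset i * 12 + j * 4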
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/grid-sample.ts b/js/web/lib/wasm/jsep/webgpu/ops/grid-sample.ts
new file mode 100644
index 0000000000000..50c71472434ad
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/ops/grid-sample.ts
@@ -0,0 +1,279 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+import { DataType } from '../../../wasm-common';
+import { TensorView } from '../../tensor-view';
+import { ShapeUtil } from '../../util';
+import { AttributeWithCacheKey, createAttributeWithCacheKey } from '../attribute-with-cache-key';
+import { ComputeContext, ProgramInfo, ProgramUniform } from '../types';
+
+import { createTensorShapeVariables, IndicesHelper, inputVariable, outputVariable, ShaderHelper } from './common';
+
+let [idxN, idxC, idxH, idxW] = [0, 1, 2, 3]; // NCHW
+type Mode = 'bilinear' | 'nearest' | 'bicubic';
+type PaddingMode = 'zeros' | 'border' | 'reflection';
+type Format = 'NHWC' | 'NCHW';
+export interface GridSampeAttributes extends AttributeWithCacheKey {
+ alignCorners: number;
+ mode: Mode;
+ paddingMode: PaddingMode;
+ format: Format;
+}
+
+const validateInputs = (inputs: readonly TensorView[]): void => {
+ if (inputs[0].dims.length !== 4) {
+ throw new Error('only 4-D tensor is supported.');
+ }
+ if (inputs[0].dims.length !== inputs[1].dims.length) {
+ throw new Error('input dimensions must be equal to grid dimensions');
+ }
+
+ if (inputs[0].dims.length - 2 !== inputs[1].dims[inputs[1].dims.length - 1]) {
+ throw new Error(`last dimension of grid must be equal to ${inputs[0].dims.length - 2}`);
+ }
+
+ if (inputs[0].dims[0] !== inputs[1].dims[0]) {
+ throw new Error('grid batch size must match input batch size');
+ }
+};
+
+const gsGetCubicCoeffs = `
+ fn gs_get_cubic_coeffs(x: f32) -> vec4<f32> {
+ let cubic_alpha = -0.75f;
+ let x_abs = abs(x);
+ var coeffs: vec4<f32>;
+ coeffs[0] = (((cubic_alpha * (x_abs + 1) - 5 * cubic_alpha) * (x_abs + 1) + 8 * cubic_alpha) * (x_abs + 1) - 4 * cubic_alpha);
+ coeffs[1] = (((cubic_alpha + 2) * x_abs - (cubic_alpha + 3)) * x_abs * x_abs + 1);
+ coeffs[2] = (((cubic_alpha + 2) * (1 - x_abs) - (cubic_alpha + 3)) * (1 - x_abs) * (1 - x_abs) + 1);
+ coeffs[3] = (((cubic_alpha * (2 - x_abs) - 5 * cubic_alpha) * (2 - x_abs) + 8 * cubic_alpha) * (2 - x_abs) - 4 * cubic_alpha);
+ return coeffs;
+ }
+`;
+
+const gsBicubicInterpolate = (dataType: string): string => `
+ fn gs_bicubic_interpolate(p: mat4x4<${dataType}>, x: f32, y: f32) -> ${dataType} {
+ var v: vec4<f32>;
+ var coeffs = gs_get_cubic_coeffs(x);
+ for (var i = 0; i < 4; i++) {
+ v[i] = coeffs[0] * p[i][0] + coeffs[1] * p[i][1] + coeffs[2] * p[i][2] + coeffs[3] * p[i][3];
+ }
+ coeffs = gs_get_cubic_coeffs(y);
+ let pixel = ${dataType}(coeffs[0] * v[0] + coeffs[1] * v[1] + coeffs[2] * v[2] + coeffs[3] * v[3]);
+ return pixel;
+ }
+`;
+
+const gsDenormalize = (attributes: GridSampeAttributes): string => `
+ fn gs_denormalize(n: f32, length: i32) -> f32 {
+ ${
+ attributes.alignCorners === 0
+ ? `
+ // alignCorners: false => [-1, 1] to [-0.5, length - 0.5]
+ return ((n + 1.0) * f32(length) - 1.0) / 2.0;
+ `
+ : `
+ // alignCorners: true => [-1, 1] to [0, length - 1]
+ return (n + 1.0) / 2.0 * (f32(length - 1));
+ `
+ }
+ }
+`;
+
+const gsReflect = (attributes: GridSampeAttributes): string => `
+ ${
+ attributes.paddingMode === 'reflection'
+ ? `
+ fn gs_reflect(x: i32, x_min: f32, x_max: f32) -> u32 {
+ var dx = 0.0;
+ var fx = f32(x);
+ let range = x_max - x_min;
+ if (fx < x_min) {
+ dx = x_min - fx;
+ let n = u32(dx / range);
+ let r = dx - f32(n) * range;
+ if (n % 2 == 0) {
+ fx = x_min + r;
+ } else {
+ fx = x_max - r;
+ }
+ } else if (fx > x_max) {
+ dx = fx - x_max;
+ let n = u32(dx / range);
+ let r = dx - f32(n) * range;
+ if (n % 2 == 0) {
+ fx = x_max - r;
+ } else {
+ fx = x_min + r;
+ }
+ }
+ return u32(fx);
+ }`
+ : ''
+ }
+`;
+
+const pixelAtGrid = (input: IndicesHelper, dataType: string, attributes: GridSampeAttributes): string =>
+ `
+ fn pixel_at_grid(r: i32, c: i32, H: i32, W: i32, batch: u32, channel: u32, border: vec4<f32>) -> ${dataType} {
+ var pixel = ${dataType}(0);
+ var indices = vec4<u32>(0);
+ indices[${idxN}] = batch;
+ indices[${idxC}] = channel;` +
+ (() => {
+ switch (attributes.paddingMode) {
+ case 'zeros':
+ return `
+ if (r >= 0 && r < H && c >=0 && c < W) {
+ indices[${idxH}] = u32(r);
+ indices[${idxW}] = u32(c);
+ }
+ `;
+ case 'border':
+ return `
+ indices[${idxH}] = u32(clamp(r, 0, H - 1));
+ indices[${idxW}] = u32(clamp(c, 0, W - 1));
+ `;
+ case 'reflection':
+ return `
+ indices[${idxH}] = gs_reflect(r, border[1], border[3]);
+ indices[${idxW}] = gs_reflect(c, border[0], border[2]);
+ `;
+ default:
+ throw new Error(`padding mode ${attributes.paddingMode} is not supported`);
+ }
+ })() +
+ `
+ return ${input.getByIndices('indices')};
+ }
+`;
+
+const computePixel = (output: IndicesHelper, dataType: string, attributes: GridSampeAttributes): string =>
+ (() => {
+ switch (attributes.mode) {
+ case 'nearest':
+ return `
+ let result = pixel_at_grid(i32(round(y)), i32(round(x)), H_in, W_in, indices[${idxN}], indices[${idxC}], border);
+ `;
+ case 'bilinear':
+ return `
+ let x1 = i32(floor(x));
+ let y1 = i32(floor(y));
+ let x2 = x1 + 1;
+ let y2 = y1 + 1;
+
+ let p11 = pixel_at_grid(y1, x1, H_in, W_in, indices[${idxN}], indices[${idxC}], border);
+ let p12 = pixel_at_grid(y1, x2, H_in, W_in, indices[${idxN}], indices[${idxC}], border);
+ let p21 = pixel_at_grid(y2, x1, H_in, W_in, indices[${idxN}], indices[${idxC}], border);
+ let p22 = pixel_at_grid(y2, x2, H_in, W_in, indices[${idxN}], indices[${idxC}], border);
+
+ let dx2 = ${dataType}(f32(x2) - x);
+ let dx1 = ${dataType}(x - f32(x1));
+ let dy2 = ${dataType}(f32(y2) - y);
+ let dy1 = ${dataType}(y - f32(y1));
+ let result = dy2 * (dx2 * p11 + dx1 * p12) + dy1 * (dx2 * p21 + dx1 * p22);
+ `;
+ case 'bicubic':
+ return `
+ let x0 = i32(floor(x)) - 1;
+ let y0 = i32(floor(y)) - 1;
+ var p: mat4x4<${dataType}>;
+ for (var h = 0; h < 4; h++) {
+ for (var w = 0; w < 4; w++) {
+ p[h][w] = pixel_at_grid(h + y0, w + x0, H_in, W_in, indices[${idxN}], indices[${idxC}], border);
+ }
+ }
+
+ let dx = x - f32(x0 + 1);
+ let dy = y - f32(y0 + 1);
+ let result = gs_bicubic_interpolate(p, dx, dy);
+ `;
+ default:
+ throw new Error(`mode ${attributes.mode} is not supported`);
+ }
+ })() + `${output.setByOffset('global_idx', 'result')}`;
+
+const createGridSampleProgramInfo = (inputs: readonly TensorView[], attributes: GridSampeAttributes): ProgramInfo => {
+ const x = inputVariable('x', inputs[0].dataType, inputs[0].dims.length);
+ // drop the last dimension so the grid data can be accessed as vec2
+ const gridShape = [inputs[1].dims[0], inputs[1].dims[1], inputs[1].dims[2]];
+ const grid = inputVariable('grid', inputs[1].dataType, gridShape.length, 2);
+ let outputShape = [inputs[0].dims[0], inputs[0].dims[1], inputs[1].dims[1], inputs[1].dims[2]];
+ if (attributes.format === 'NHWC') {
+ outputShape = [inputs[0].dims[0], inputs[1].dims[1], inputs[1].dims[2], inputs[0].dims[3]];
+ [idxN, idxC, idxH, idxW] = [0, 3, 1, 2];
+ }
+ const output = outputVariable('output', inputs[0].dataType, outputShape.length);
+ const dataType = x.type.value;
+ const outputSize = ShapeUtil.size(outputShape);
+
+ const programUniforms: ProgramUniform[] = [
+ { type: DataType.uint32, data: outputSize },
+ ...createTensorShapeVariables(inputs[0].dims, gridShape, outputShape),
+ ];
+
+ const getShaderSource = (shaderHelper: ShaderHelper) => `
+ ${shaderHelper.registerUniform('output_size', 'u32').declareVariables(x, grid, output)}
+ ${gsGetCubicCoeffs}
+ ${gsBicubicInterpolate(dataType)}
+ ${gsDenormalize(attributes)}
+ ${gsReflect(attributes)}
+ ${pixelAtGrid(x, dataType, attributes)}
+
+ ${shaderHelper.mainStart()}
+ ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')}
+ let H_in = i32(uniforms.x_shape[${idxH}]);
+ let W_in = i32(uniforms.x_shape[${idxW}]);
+
+ ${
+ attributes.alignCorners === 0
+ ? `
+ let x_min = -0.5;
+ let x_max = f32(W_in) - 0.5;
+ let y_min = -0.5;
+ let y_max = f32(H_in) - 0.5;
+ `
+ : `
+ let x_min = 0.0;
+ let x_max = f32(W_in) - 1.0;
+ let y_min = 0.0;
+ let y_max = f32(H_in) - 1.0;
+ `
+ };
+ let border = vec4(x_min, y_min, x_max, y_max);
+
+ let indices = ${output.offsetToIndices('global_idx')};
+ var grid_indices = vec3(indices[${idxN}], indices[${idxH}], indices[${idxW}]);
+ let nxy = ${grid.getByIndices('grid_indices')};
+ var x = gs_denormalize(f32(nxy[0]), W_in);
+ var y = gs_denormalize(f32(nxy[1]), H_in);
+
+ ${computePixel(output, dataType, attributes)}
+ }`;
+
+ return {
+ name: 'GridSample',
+ shaderCache: { hint: `${attributes.cacheKey}`, inputDependencies: ['type', 'type'] },
+ getRunData: (inputs) => {
+ const outputSize = ShapeUtil.size(outputShape);
+ return {
+ outputs: [{ dims: outputShape, dataType: inputs[0].dataType }],
+ dispatchGroup: { x: Math.ceil(outputSize / 64 /* workgroup size */) },
+ programUniforms,
+ };
+ },
+ getShaderSource,
+ };
+};
+
+export const gridSample = (context: ComputeContext, attributes: GridSampeAttributes): void => {
+ validateInputs(context.inputs);
+ context.compute(createGridSampleProgramInfo(context.inputs, attributes));
+};
+
+export const parseGridSampleAttributes = (attributes: Record<string, unknown>): GridSampeAttributes =>
+ createAttributeWithCacheKey({
+ alignCorners: attributes.align_corners as number,
+ mode: attributes.mode as Mode,
+ paddingMode: attributes.padding_mode as PaddingMode,
+ format: attributes.format as Format,
+ });
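The gs_denormalize function above maps normalized grid coordinates in [-1, 1] to pixel coordinates, with alignCorners deciding whether the endpoints land on pixel edges or pixel centres. A standalone numeric check for a width of 4 (illustrative only):

const denormalize = (n: number, length: number, alignCorners: boolean): number =>
  alignCorners ? ((n + 1) / 2) * (length - 1) : ((n + 1) * length - 1) / 2;
console.log(denormalize(-1, 4, false)); // -0.5: left edge of the first pixel
console.log(denormalize(1, 4, false));  //  3.5: right edge of the last pixel
console.log(denormalize(-1, 4, true));  //  0:   centre of the first pixel
console.log(denormalize(1, 4, true));   //  3:   centre of the last pixel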
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts
index e40cfa5200a08..327e972257d35 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts
@@ -42,6 +42,7 @@ export const matMul = (context: ComputeContext): void => {
const batchA = ShapeUtil.size(context.inputs[0].dims.slice(0, -2));
const batchB = ShapeUtil.size(context.inputs[1].dims.slice(0, -2));
if (batchA !== 1 && M === 1 && batchB === 1) {
+ // Optimization for batched vec-mat-mul
const reshapedA = context.inputs[0].reshape([1, batchA, K]);
const reshapedB = context.inputs[1].reshape([1, K, N]);
const matmulOutputShape = [1, batchA, N];
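The fast path above rewrites a stack of independent vector-matrix products as one matmul. With assumed shapes A = [12, 1, 256] (batchA = 12, M = 1, K = 256) and B = [256, 256] (batchB = 1, N = 256), the reshapes turn 12 separate (1 x 256) x (256 x 256) products into a single (12 x 256) x (256 x 256) product:

const batchA = 12, K = 256, N = 256;
const reshapedA = [1, batchA, K];         // [1, 12, 256]
const reshapedB = [1, K, N];              // [1, 256, 256]
const matmulOutputShape = [1, batchA, N]; // [1, 12, 256], viewed back as [12, 1, 256] afterwards
console.log(reshapedA, reshapedB, matmulOutputShape);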
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/scatter-nd.ts b/js/web/lib/wasm/jsep/webgpu/ops/scatter-nd.ts
new file mode 100644
index 0000000000000..8c24232d63c0c
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/ops/scatter-nd.ts
@@ -0,0 +1,177 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+import { DataType } from '../../../wasm-common';
+import { TensorView } from '../../tensor-view';
+import { ShapeUtil } from '../../util';
+import { AttributeWithCacheKey, createAttributeWithCacheKey } from '../attribute-with-cache-key';
+import { ComputeContext, ProgramInfo, ProgramUniform } from '../types';
+
+import {
+ atomicOutputVariable,
+ createTensorShapeVariables,
+ inputVariable,
+ outputVariable,
+ ShaderHelper,
+} from './common';
+
+export interface ScatterNDAttributes extends AttributeWithCacheKey {
+ reduction: string;
+}
+
+type ReductionType = 'i32' | 'u32' | 'f32';
+
+const atomicReductionSnippet = (reduction: string, ptr: string, v: string, type: ReductionType) => {
+ if (reduction !== 'none' && type !== 'i32' && type !== 'u32' && type !== 'f32') {
+ throw new Error(`Input ${type} is not supported with reduction ${reduction}.`);
+ }
+
+ const floatStart = `{
+ var oldValue = 0;
+ loop {
+ let newValueF32 =`;
+ const floatEnd = `;
+ let newValue = bitcast<i32>(newValueF32);
+ let res = atomicCompareExchangeWeak(&${ptr}, oldValue, newValue);
+ if res.exchanged {
+ break;
+ }
+ oldValue = res.old_value;
+ }
+ }`;
+
+ switch (reduction) {
+ case 'none':
+ return `${ptr}=${v};`;
+ case 'add':
+ if (type === 'i32' || type === 'u32') {
+ return `atomicAdd(&${ptr}, bitcast<${type}>(${v}));`;
+ } else {
+ // atomicAdd only supports uint/int type. For float, we use
+ // atomicCompareExchangeWeak to simulate.
+ return `
+ ${floatStart}bitcast<${type}>(oldValue) + (${v})${floatEnd}`;
+ }
+ case 'max':
+ if (type === 'i32' || type === 'u32') {
+ return `atomicMax(&${ptr}, bitcast<${type}>(${v}));`;
+ } else {
+ // atomicMax only supports uint/int type. For float, we use
+ // atomicCompareExchangeWeak to simulate.
+ return `
+ ${floatStart}max(bitcast<${type}>(oldValue), (${v}))${floatEnd}`;
+ }
+ case 'min':
+ if (type === 'i32' || type === 'u32') {
+ return `atomicMin(&${ptr}, bitcast<${type}>(${v}));`;
+ } else {
+ // atomicMin only supports uint/int type. For float, we use
+ // atomicCompareExchangeWeak to simulate.
+ return `${floatStart}min(bitcast<${type}>(oldValue), (${v}))${floatEnd}`;
+ }
+ case 'mul':
+ // atomicMul is not supported, so we emulate it with atomicCompareExchangeWeak.
+ return `${floatStart}(bitcast<${type}>(oldValue) * (${v}))${floatEnd}`;
+
+ default:
+ throw new Error(`Reduction ${reduction} is not supported.`);
+ }
+};
+
+const createScatterNDProgramInfo = (inputs: readonly TensorView[], attributes: ScatterNDAttributes): ProgramInfo => {
+ const inputShape = inputs[0].dims;
+ const indicesShape = inputs[1].dims;
+ const outputShape = inputShape;
+ // TODO: support bool with components 4.
+ const components = 1;
+ const outputSize = Math.ceil(ShapeUtil.size(indicesShape) / components);
+ const lastIndexDimension = indicesShape[indicesShape.length - 1];
+ const numUpdatesElements = ShapeUtil.sizeFromDimension(inputShape, lastIndexDimension);
+
+ const programUniforms: ProgramUniform[] = [
+ { type: DataType.uint32, data: outputSize },
+ { type: DataType.uint32, data: lastIndexDimension },
+ { type: DataType.uint32, data: numUpdatesElements },
+ ...createTensorShapeVariables(inputs[1].dims, inputs[2].dims, outputShape),
+ ];
+
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const indices = inputVariable('indices', inputs[1].dataType, inputs[1].dims.length);
+ const updates = inputVariable('updates', inputs[2].dataType, inputs[2].dims.length, components);
+ const output =
+ attributes.reduction !== 'none' && attributes.reduction !== ''
+ ? atomicOutputVariable('output', inputs[0].dataType, outputShape.length)
+ : outputVariable('output', inputs[0].dataType, outputShape.length, components);
+
+ return `
+ ${shaderHelper
+ .registerUniform('output_size', 'u32')
+ .registerUniform('last_index_dimension', 'u32')
+ .registerUniform('num_updates_elements', 'u32')
+ .declareVariables(indices, updates, output)}
+ ${shaderHelper.mainStart()}
+ ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')}
+ var data_offset = 0u;
+ let indices_start = uniforms.last_index_dimension * global_idx;
+ let indices_end = indices_start + uniforms.last_index_dimension;
+ for (var i = indices_start; i < indices_end; i++) {
+ var index = i32(indices[i].x);
+ ${
+ inputs[0].dims.length === 1
+ ? `
+ let element_count_dim = uniforms.output_strides;
+ let dim_value = uniforms.output_shape;`
+ : `
+ let element_count_dim = uniforms.output_strides[i - indices_start];
+ let dim_value = uniforms.output_shape[i - indices_start + uniforms.last_index_dimension];`
+ }
+ if (index >= 0) {
+ if (index >= i32(dim_value)) {
+ index = i32(dim_value - 1);
+ }
+ } else {
+ if (index < -i32(dim_value)) {
+ index = 0;
+ } else {
+ index += i32(dim_value);
+ }
+ }
+ data_offset += u32((u32(index) * element_count_dim));
+ }
+
+ for (var i = 0u; i < uniforms.num_updates_elements; i++) {
+ let value = updates[uniforms.num_updates_elements * global_idx + i];
+ ${atomicReductionSnippet(
+ attributes.reduction,
+ 'output[data_offset + i]',
+ 'value',
+ output.type.value as ReductionType,
+ )}
+ }
+
+ }`;
+ };
+ return {
+ name: 'ScatterND',
+ shaderCache: {
+ hint: `${attributes.cacheKey}_${attributes.reduction}`,
+ inputDependencies: ['rank', 'rank'],
+ },
+ getRunData: () => ({
+ outputs: [{ dims: outputShape, dataType: inputs[0].dataType }],
+ dispatchGroup: { x: Math.ceil(outputSize / 64 /* workgroup size */) },
+ programUniforms,
+ }),
+ getShaderSource,
+ };
+};
+
+export const parseScatterNDAttributes = (attributes: Record<string, unknown>): ScatterNDAttributes =>
+ createAttributeWithCacheKey({ reduction: attributes.reduction as string });
+
+export const scatterND = (context: ComputeContext, attributes: ScatterNDAttributes): void => {
+ context.compute(createScatterNDProgramInfo(context.inputs, attributes), {
+ inputs: [context.inputs[1], context.inputs[2]],
+ outputs: [],
+ });
+};
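For the non-'none' reductions on float data, atomicReductionSnippet emulates the missing float atomics with a compare-and-swap loop. Expanding the template by hand for reduction 'add' and type 'f32' gives roughly the following WGSL (shown as a constant purely for readability; the real string is assembled at runtime from floatStart/floatEnd):

const floatAtomicAddWGSL = `{
  var oldValue = 0;
  loop {
    let newValueF32 = bitcast<f32>(oldValue) + (value);
    let newValue = bitcast<i32>(newValueF32);
    let res = atomicCompareExchangeWeak(&output[data_offset + i], oldValue, newValue);
    if res.exchanged {
      break;
    }
    oldValue = res.old_value;
  }
}`;
console.log(floatAtomicAddWGSL);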
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts
index 1fd99d085e0ed..5059645211aea 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts
@@ -29,7 +29,9 @@ const permFunctionBody = (perm: number[], rank: number, input: IndicesHelper, ou
let reverseFunc = `fn perm(i: ${output.type.indices}) -> ${input.type.indices} {
var a: ${input.type.indices};`;
for (let i = 0; i < rank; ++i) {
- reverseFunc += input.indicesSet('a', perm[i], `i[${i}]`);
+ // the ranks of the input and output indices are always at least 2,
+ // so indexing `a` and `i` directly is always valid.
+ reverseFunc += `a[${perm[i]}]=i[${i}];`;
}
return (reverseFunc += 'return a;}');
};
@@ -48,17 +50,61 @@ const squeezeShape = (shape: readonly number[], adjustedPerm: number[]): { newSh
return { newShape, newPerm };
};
+const isTransposeReshape = (perm: number[], shape: readonly number[]) => {
+ // As long as the dims with values > 1 stay in the same order, it's a reshape.
+ // Example: Shape=(1,1,1024,4096) -> perm=(2,0,3,1).
+ let lastPermutedAxis = 0;
+ for (let i = 0; i < perm.length; ++i) {
+ if (shape[perm[i]] === 1) {
+ continue;
+ }
+ if (perm[i] < lastPermutedAxis) {
+ return false;
+ }
+ lastPermutedAxis = perm[i];
+ }
+ return true;
+};
+
export const createTransposeProgramInfo = (inputTensor: TensorView, permAttr: number[]): ProgramInfo => {
const inputDataType = inputTensor.dataType;
const inputRank = inputTensor.dims.length;
const perm = getAdjustedPerm(inputRank, permAttr);
const outputShape = getOutputShape(inputTensor.dims, perm);
+ let newInputShape = inputTensor.dims;
+ let newOutputShape = outputShape;
+ const transposeAsReshape = inputRank < 2 || isTransposeReshape(perm, inputTensor.dims);
+ let getShaderSource;
+ if (transposeAsReshape) {
+ getShaderSource = (shaderHelper: ShaderHelper) => {
+ const input = inputVariable('input', inputDataType, newInputShape, 4);
+ const output = outputVariable('output', inputDataType, newOutputShape, 4);
+ return `
+ ${shaderHelper.registerUniform('output_size', 'u32').declareVariables(input, output)}
+ ${shaderHelper.mainStart()}
+ ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')}
+ output[global_idx] = input[global_idx];
+ }`;
+ };
+
+ return {
+ name: 'TransposeCopy',
+ shaderCache: { inputDependencies: ['type'] },
+ getRunData: () => {
+ const outputSize = ShapeUtil.size(outputShape);
+ return {
+ outputs: [{ dims: outputShape, dataType: inputTensor.dataType }],
+ dispatchGroup: { x: Math.ceil(outputSize / 64 /* workgroup size */ / 4 /* components */) },
+ programUniforms: [{ type: DataType.uint32, data: Math.ceil(outputSize / 4) }],
+ };
+ },
+ getShaderSource,
+ };
+ }
const { newShape, newPerm } = squeezeShape(inputTensor.dims, perm);
const channelsLast = ShapeUtil.areEqual(newPerm, [2, 3, 1]);
const channelsFirst = ShapeUtil.areEqual(newPerm, [3, 1, 2]);
- const useShared = (newShape.length === 2 && newPerm[0] > newPerm[1]) || channelsLast || channelsFirst;
- let newInputShape = useShared ? newShape : inputTensor.dims;
- let newOutputShape = outputShape;
+ const useShared = newShape.length === 2 || channelsLast || channelsFirst;
if (useShared) {
newInputShape = channelsLast
? [newShape[0], newShape[1] * newShape[2]]
@@ -66,13 +112,11 @@ export const createTransposeProgramInfo = (inputTensor: TensorView, permAttr: nu
? [newShape[0] * newShape[1], newShape[2]]
: newShape;
newOutputShape = [newInputShape[1], newInputShape[0]];
- }
- const input = inputVariable('a', inputDataType, newInputShape.length);
- const output = outputVariable('output', inputDataType, newOutputShape.length);
- const tileSize = 16;
- let getShaderSource;
- if (useShared) {
- getShaderSource = (shaderHelper: ShaderHelper) => `
+ const tileSize = 16;
+ getShaderSource = (shaderHelper: ShaderHelper) => {
+ const input = inputVariable('a', inputDataType, newInputShape.length);
+ const output = outputVariable('output', inputDataType, newOutputShape.length);
+ return `
${shaderHelper.registerUniform('output_size', 'u32').declareVariables(input, output)}
  var<workgroup> tile : array<array<${output.type.value}, ${tileSize}>, ${tileSize}>;
${shaderHelper.mainStart([tileSize, tileSize, 1])}
@@ -92,8 +136,29 @@ export const createTransposeProgramInfo = (inputTensor: TensorView, permAttr: nu
${output.setByIndices(`${output.type.indices}(output_row, output_col)`, 'tile[local_id.x][local_id.y]')}
}
}`;
- } else {
- getShaderSource = (shaderHelper: ShaderHelper) => `
+ };
+ return {
+ name: 'TransposeShared',
+ shaderCache: { inputDependencies: ['type'] },
+ getRunData: () => {
+ const outputSize = ShapeUtil.size(outputShape);
+ return {
+ outputs: [{ dims: outputShape, dataType: inputTensor.dataType }],
+ dispatchGroup: { x: Math.ceil(newOutputShape[1] / tileSize), y: Math.ceil(newOutputShape[0] / tileSize) },
+ programUniforms: [
+ { type: DataType.uint32, data: outputSize },
+ ...createTensorShapeVariables(newInputShape, newOutputShape),
+ ],
+ };
+ },
+ getShaderSource,
+ };
+ }
+
+ getShaderSource = (shaderHelper: ShaderHelper) => {
+ const input = inputVariable('a', inputDataType, newInputShape.length);
+ const output = outputVariable('output', inputDataType, newOutputShape.length);
+ return `
${shaderHelper.registerUniform('output_size', 'u32').declareVariables(input, output)}
${permFunctionBody(perm, inputRank, input, output)}
@@ -106,17 +171,15 @@ export const createTransposeProgramInfo = (inputTensor: TensorView, permAttr: nu
${output.setByOffset('global_idx', input.getByIndices('aIndices'))}
}`;
- }
+ };
return {
- name: useShared ? 'TransposeShared' : 'Transpose',
+ name: 'Transpose',
shaderCache: { hint: `${permAttr}`, inputDependencies: ['rank'] },
getRunData: () => {
const outputSize = ShapeUtil.size(outputShape);
return {
outputs: [{ dims: outputShape, dataType: inputTensor.dataType }],
- dispatchGroup: useShared
- ? { x: Math.ceil(newOutputShape[1] / tileSize), y: Math.ceil(newOutputShape[0] / tileSize) }
- : { x: Math.ceil(outputSize / 64 /* workgroup size */) },
+ dispatchGroup: { x: Math.ceil(outputSize / 64 /* workgroup size */) },
programUniforms: [
{ type: DataType.uint32, data: outputSize },
...createTensorShapeVariables(newInputShape, newOutputShape),
diff --git a/js/web/lib/wasm/jsep/webgpu/program-manager.ts b/js/web/lib/wasm/jsep/webgpu/program-manager.ts
index 042b8d9efaae9..2c5180c5db3ee 100644
--- a/js/web/lib/wasm/jsep/webgpu/program-manager.ts
+++ b/js/web/lib/wasm/jsep/webgpu/program-manager.ts
@@ -93,16 +93,23 @@ export class ProgramManager {
build(programInfo: ProgramInfo, normalizedDispatchGroupSize: [number, number, number]): Artifact {
TRACE_FUNC_BEGIN(programInfo.name);
const device = this.backend.device;
- const extensions: string[] = [];
- if (device.features.has('shader-f16')) {
- extensions.push('enable f16;');
- }
- if (device.features.has('chromium-experimental-subgroups')) {
- extensions.push('enable chromium_experimental_subgroups;');
- }
+ const enableDirectives: string[] = [];
+
+ // Enable WGSL extensions based on available WebGPU features
+ const extensionsInfo: Array<{ feature: GPUFeatureName; extension: string }> = [
+ { feature: 'shader-f16', extension: 'f16' },
+ { feature: 'subgroups' as GPUFeatureName, extension: 'subgroups' },
+ { feature: 'subgroups-f16' as GPUFeatureName, extension: 'subgroups_f16' },
+ ];
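+    // e.g. this pushes 'enable f16;' when the device reports the 'shader-f16' feature.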
+ extensionsInfo.forEach((info) => {
+ if (device.features.has(info.feature)) {
+ enableDirectives.push(`enable ${info.extension};`);
+ }
+ });
+
const shaderHelper = createShaderHelper(normalizedDispatchGroupSize, this.backend.device.limits);
const userCode = programInfo.getShaderSource(shaderHelper);
- const code = `${extensions.join('\n')}\n${shaderHelper.additionalImplementations}\n${userCode}`;
+ const code = `${enableDirectives.join('\n')}\n${shaderHelper.additionalImplementations}\n${userCode}`;
const shaderModule = device.createShaderModule({ code, label: programInfo.name });
LOG_DEBUG('verbose', () => `[WebGPU] ${programInfo.name} shader code: ${code}`);
diff --git a/js/web/lib/wasm/jsep/webgpu/types.ts b/js/web/lib/wasm/jsep/webgpu/types.ts
index 3b3c55733c973..9321ac170d036 100644
--- a/js/web/lib/wasm/jsep/webgpu/types.ts
+++ b/js/web/lib/wasm/jsep/webgpu/types.ts
@@ -21,6 +21,11 @@ export interface AdapterInfo {
isArchitecture: (architecture: GpuArchitecture) => boolean;
isVendor: (vendor: GpuVendor) => boolean;
}
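+/**
+ * Static capability information about the GPU device: subgroup support and, when known,
+ * the supported subgroup size range.
+ */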
+export interface DeviceInfo {
+ readonly subgroupsSupported: boolean;
+ readonly subgroupsF16Supported: boolean;
+ readonly subgroupSizeRange?: readonly [number, number];
+}
export interface GpuData {
type: GpuDataType;
@@ -160,6 +165,11 @@ export interface ComputeContext {
*/
readonly adapterInfo: AdapterInfo;
+ /**
+ * gpu device info
+ */
+ readonly deviceInfo: DeviceInfo;
+
/**
* stores the pointer to OpKernelContext
*/
@@ -187,8 +197,6 @@ export interface ComputeContext {
compute(program: ProgramInfo, inputsOutputsMapping?: ComputeContextInputsOutputsMapping): TensorView[];
output(index: number, dims: readonly number[]): number;
- getMaxComputeWorkgroupSizes(): [number, number, number];
- getMaxComputeWorkgroupStoragesize(): number;
}
export type TimestampQuery = 'none' | 'inside-passes' | 'at-passes';
diff --git a/js/web/lib/wasm/jsep/webnn/tensor-manager.ts b/js/web/lib/wasm/jsep/webnn/tensor-manager.ts
index a19afd4bac732..4932691bda65b 100644
--- a/js/web/lib/wasm/jsep/webnn/tensor-manager.ts
+++ b/js/web/lib/wasm/jsep/webnn/tensor-manager.ts
@@ -78,7 +78,7 @@ const calculateByteLength = (dataType: MLOperandDataType, shape: readonly number
if (!size) {
throw new Error('Unsupported data type.');
}
- return Math.ceil((shape.reduce((a, b) => a * b) * size) / 8);
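+  // Guard the empty-shape (scalar) case: reduce() with no initial value throws on an empty array.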
+ return shape.length > 0 ? Math.ceil((shape.reduce((a, b) => a * b) * size) / 8) : 0;
};
/**
@@ -195,7 +195,7 @@ class TensorIdTracker {
}
// eslint-disable-next-line no-bitwise
- const usage = MLTensorUsage.READ | MLTensorUsage.WRITE;
+ const usage = typeof MLTensorUsage == 'undefined' ? undefined : MLTensorUsage.READ | MLTensorUsage.WRITE;
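+      // MLTensorUsage is deprecated and may be undefined in newer WebNN implementations;
+      // in that case the writable/readable arguments passed to getCachedTensor describe the access instead.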
this.wrapper = await this.tensorManager.getCachedTensor(dataType, shape, usage, true, true);
if (copyOld && this.activeUpload) {
@@ -349,7 +349,7 @@ class TensorManagerImpl implements TensorManager {
public async getCachedTensor(
dataType: MLOperandDataType,
shape: readonly number[],
- usage: MLTensorUsageFlags,
+ usage: MLTensorUsageFlags | undefined,
writable: boolean,
readable: boolean,
): Promise {
diff --git a/js/web/lib/wasm/jsep/webnn/webnn.d.ts b/js/web/lib/wasm/jsep/webnn/webnn.d.ts
index ca06f5c695f8d..c513b2ec2ed8b 100644
--- a/js/web/lib/wasm/jsep/webnn/webnn.d.ts
+++ b/js/web/lib/wasm/jsep/webnn/webnn.d.ts
@@ -400,7 +400,8 @@ declare const MLTensorUsage: {
};
interface MLTensorDescriptor extends MLOperandDescriptor {
- usage: MLTensorUsageFlags;
+  /** @deprecated Use readable/writable instead of usage */
+ usage: MLTensorUsageFlags | undefined;
importableToWebGPU?: boolean;
readable?: boolean;
writable?: boolean;
diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts
index f3794a72efbe8..da8939cd0263a 100644
--- a/js/web/lib/wasm/wasm-core-impl.ts
+++ b/js/web/lib/wasm/wasm-core-impl.ts
@@ -291,9 +291,6 @@ export const createSession = async (
const providerName = typeof provider === 'string' ? provider : provider.name;
if (providerName === 'webnn') {
wasm.shouldTransferToMLTensor = false;
- if (wasm.currentContext) {
- throw new Error('WebNN execution provider is already set.');
- }
if (typeof provider !== 'string') {
const webnnOptions = provider as InferenceSession.WebNNExecutionProviderOption;
const context = (webnnOptions as InferenceSession.WebNNOptionsWithMLContext)?.context;
@@ -490,7 +487,7 @@ export const prepareInputOutputTensor = (
}
if (location === 'gpu-buffer') {
- const gpuBuffer = tensor[2].gpuBuffer as GPUBuffer;
+ const gpuBuffer = tensor[2].gpuBuffer;
dataByteLength = calculateTensorSizeInBytes(tensorDataTypeStringToEnum(dataType), dims)!;
const registerBuffer = wasm.jsepRegisterBuffer;
diff --git a/js/web/lib/wasm/wasm-types.ts b/js/web/lib/wasm/wasm-types.ts
index 40c614fdf866a..ebeac5dc9e587 100644
--- a/js/web/lib/wasm/wasm-types.ts
+++ b/js/web/lib/wasm/wasm-types.ts
@@ -232,6 +232,23 @@ export declare namespace JSEP {
* @returns
*/
  jsepCreateMLContext(optionsOrGpuDevice?: MLContextOptions | GPUDevice): Promise<MLContext>;
+
+ /**
+ * [exported from pre-jsep.js] Register a WebNN Constant operand from external data.
+ * @param externalFilePath - specify the external file path.
+ * @param dataOffset - specify the external data offset.
+ * @param dataLength - specify the external data length.
+ * @param builder - specify the MLGraphBuilder used for constructing the Constant.
+ * @param desc - specify the MLOperandDescriptor of the Constant.
+ * @returns the WebNN Constant operand for the specified external data.
+ */
+ jsepRegisterMLConstant(
+ externalFilePath: string,
+ dataOffset: number,
+ dataLength: number,
+ builder: MLGraphBuilder,
+ desc: MLOperandDescriptor,
+ ): MLOperand;
}
}
diff --git a/js/web/package-lock.json b/js/web/package-lock.json
index 894667ad58933..07c8f0bf3b940 100644
--- a/js/web/package-lock.json
+++ b/js/web/package-lock.json
@@ -861,9 +861,9 @@
}
},
"node_modules/cross-spawn": {
- "version": "6.0.5",
- "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-6.0.5.tgz",
- "integrity": "sha512-eTVLrBSt7fjbDygz805pMnstIs2VTBNkRm0qxZd+M7A5XDdxVRWO5MxGBXZhjY4cqLYLdtrGqRf8mBPmzwSpWQ==",
+ "version": "6.0.6",
+ "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-6.0.6.tgz",
+ "integrity": "sha512-VqCUuhcd1iB+dsv8gxPttb5iZh/D0iubSP21g36KXdEuf6I5JiioesUVjpCdHV9MZRUfVFlvwtIUyPfxo5trtw==",
"dev": true,
"dependencies": {
"nice-try": "^1.0.4",
@@ -4312,9 +4312,9 @@
}
},
"cross-spawn": {
- "version": "6.0.5",
- "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-6.0.5.tgz",
- "integrity": "sha512-eTVLrBSt7fjbDygz805pMnstIs2VTBNkRm0qxZd+M7A5XDdxVRWO5MxGBXZhjY4cqLYLdtrGqRf8mBPmzwSpWQ==",
+ "version": "6.0.6",
+ "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-6.0.6.tgz",
+ "integrity": "sha512-VqCUuhcd1iB+dsv8gxPttb5iZh/D0iubSP21g36KXdEuf6I5JiioesUVjpCdHV9MZRUfVFlvwtIUyPfxo5trtw==",
"dev": true,
"requires": {
"nice-try": "^1.0.4",
diff --git a/js/web/package.json b/js/web/package.json
index 656cd7b56b039..181d6127f5455 100644
--- a/js/web/package.json
+++ b/js/web/package.json
@@ -83,7 +83,7 @@
"types": "./types.d.ts"
},
"./wasm": {
- "import": "./dist/ort.wasm.min.mjs",
+ "import": "./dist/ort.wasm.bundle.min.mjs",
"require": "./dist/ort.wasm.min.js",
"types": "./types.d.ts"
},
diff --git a/js/web/script/build.ts b/js/web/script/build.ts
index 408f9e00a5cbd..529e9d1065e69 100644
--- a/js/web/script/build.ts
+++ b/js/web/script/build.ts
@@ -591,14 +591,14 @@ async function main() {
// ort[.min].[m]js
await addAllWebBuildTasks({
outputName: 'ort',
- define: { ...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_JSEP': 'true' },
+ define: { ...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGL': 'true' },
});
// ort.bundle.min.mjs
await buildOrt({
isProduction: true,
outputName: 'ort.bundle',
format: 'esm',
- define: { ...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_JSEP': 'true', 'BUILD_DEFS.DISABLE_DYNAMIC_IMPORT': 'true' },
+ define: { ...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGL': 'true', 'BUILD_DEFS.DISABLE_DYNAMIC_IMPORT': 'true' },
});
// ort.webgpu[.min].[m]js
@@ -619,6 +619,13 @@ async function main() {
outputName: 'ort.wasm',
define: { ...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_JSEP': 'true', 'BUILD_DEFS.DISABLE_WEBGL': 'true' },
});
+ // ort.wasm.bundle.min.mjs
+ await buildOrt({
+ isProduction: true,
+ outputName: 'ort.wasm.bundle',
+ format: 'esm',
+ define: { ...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_JSEP': 'true', 'BUILD_DEFS.DISABLE_WEBGL': 'true' },
+ });
// ort.webgl[.min].[m]js
await addAllWebBuildTasks({
outputName: 'ort.webgl',
diff --git a/js/web/test/data/ops/expand.jsonc b/js/web/test/data/ops/expand.jsonc
index 613b4507b2b15..8fbe9339feb9b 100644
--- a/js/web/test/data/ops/expand.jsonc
+++ b/js/web/test/data/ops/expand.jsonc
@@ -134,6 +134,56 @@
"type": "float32"
}
]
+ },
+ {
+ "name": "Expand in components = 1, out components = 4",
+ "inputs": [
+ {
+ "data": [1, 2, 3, 4, 5, 6],
+ "dims": [3, 2, 1],
+ "type": "float32"
+ },
+ {
+ "data": [3, 1, 8],
+ "dims": [3],
+ "type": "int64"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [
+ 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5,
+ 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6
+ ],
+ "dims": [3, 2, 8],
+ "type": "float32"
+ }
+ ]
+ },
+ {
+ "name": "Expand in components = 4, out components = 4",
+ "inputs": [
+ {
+ "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+ "dims": [1, 1, 2, 8],
+ "type": "float32"
+ },
+ {
+ "data": [2, 1, 8],
+ "dims": [3],
+ "type": "int64"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16
+ ],
+ "dims": [1, 2, 2, 8],
+ "type": "float32"
+ }
+ ]
}
]
},
diff --git a/js/web/test/data/ops/gather-nd.jsonc b/js/web/test/data/ops/gather-nd.jsonc
new file mode 100644
index 0000000000000..209c7d1f74087
--- /dev/null
+++ b/js/web/test/data/ops/gather-nd.jsonc
@@ -0,0 +1,147 @@
+[
+ {
+ "name": "GatherND int32",
+ "operator": "GatherND",
+ "attributes": [],
+ "cases": [
+ {
+ "name": "data[4] indices[]",
+ "inputs": [
+ {
+ "data": [100, 101, 102, 777, 778, 779, 1000, 1001, 1002],
+ "dims": [9],
+ "type": "int32"
+ },
+ {
+ "data": [0, 4, 8],
+ "dims": [3, 1],
+ "type": "int64"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [100, 778, 1002],
+ "dims": [3],
+ "type": "int32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "GatherND float32",
+ "operator": "GatherND",
+ "attributes": [],
+ "cases": [
+ {
+ "name": "data[4] indices[]",
+ "inputs": [
+ {
+ "data": [100.1, 101.2, 102.3, 777.4, 778.5, 779.6, 1000.7, 1001.8, 1002.9],
+ "dims": [9],
+ "type": "float32"
+ },
+ {
+ "data": [0, 4, 8],
+ "dims": [3, 1],
+ "type": "int64"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [100.0999984741211, 778.5, 1002.9000244140625],
+ "dims": [3],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "GatherND int32 [2 2 2], batch_dims",
+ "operator": "GatherND",
+ "attributes": [{ "name": "batch_dims", "data": 1, "type": "int" }],
+ "cases": [
+ {
+ "name": "data[4] indices[]",
+ "inputs": [
+ {
+ "data": [0, 1, 2, 3, 4, 5, 6, 7],
+ "dims": [2, 2, 2],
+ "type": "int32"
+ },
+ {
+ "data": [1, 0],
+ "dims": [2, 1],
+ "type": "int64"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [2, 3, 4, 5],
+ "dims": [2, 2],
+ "type": "int32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "GatherND float16",
+ "operator": "GatherND",
+ "attributes": [],
+ "cases": [
+ {
+ "name": "data[4] indices[]",
+ "inputs": [
+ {
+ "data": [100.1, 101.2, 102.3, 777.4, 778.5, 779.6, 1000.7, 1001.8, 1002.9],
+ "dims": [9],
+ "type": "float16"
+ },
+ {
+ "data": [0, 4, 8],
+ "dims": [3, 1],
+ "type": "int64"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [100.0999984741211, 778.5, 1002.9000244140625],
+ "dims": [3],
+ "type": "float16"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "GatherND uint32 [2 2 2], batch_dims",
+ "operator": "GatherND",
+ "attributes": [{ "name": "batch_dims", "data": 1, "type": "int" }],
+ "cases": [
+ {
+ "name": "data[4] indices[]",
+ "inputs": [
+ {
+ "data": [0, 1, 2, 3, 4, 5, 6, 7],
+ "dims": [2, 2, 2],
+ "type": "uint32"
+ },
+ {
+ "data": [1, 0],
+ "dims": [2, 1],
+ "type": "int64"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [2, 3, 4, 5],
+ "dims": [2, 2],
+ "type": "uint32"
+ }
+ ]
+ }
+ ]
+ }
+]
diff --git a/js/web/test/data/ops/matmul.jsonc b/js/web/test/data/ops/matmul.jsonc
index ead6427350bca..f5996db1aecb6 100644
--- a/js/web/test/data/ops/matmul.jsonc
+++ b/js/web/test/data/ops/matmul.jsonc
@@ -363,6 +363,100 @@
"type": "float32"
}
]
+ },
+ {
+ "name": "same ranks different broadcast small 0",
+ "inputs": [
+ {
+ "data": [0, 1, 2, 3, 4, 5, 6, 7],
+ "dims": [1, 2, 2, 2],
+ "type": "float32"
+ },
+ {
+ "data": [8, 9, 10, 11],
+ "dims": [2, 1, 2, 1],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [9, 43, 77, 111, 11, 53, 95, 137],
+ "dims": [2, 2, 2, 1],
+ "type": "float32"
+ }
+ ]
+ },
+ {
+ "name": "same ranks different broadcast small 1",
+ "inputs": [
+ {
+ "data": [0, 1, 2, 3, 4, 5, 6, 7],
+ "dims": [2, 1, 2, 2],
+ "type": "float32"
+ },
+ {
+ "data": [8, 9, 10, 11],
+ "dims": [1, 2, 2, 1],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [9, 43, 11, 53, 77, 111, 95, 137],
+ "dims": [2, 2, 2, 1],
+ "type": "float32"
+ }
+ ]
+ },
+ {
+ "name": "same ranks different broadcast larger 0",
+ "inputs": [
+ {
+ "data": [
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
+ 29, 30, 31
+ ],
+ "dims": [1, 2, 2, 8],
+ "type": "float32"
+ },
+ {
+ "data": [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
+ "dims": [2, 1, 8, 1],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [1036, 3308, 5580, 7852, 1260, 4044, 6828, 9612],
+ "dims": [2, 2, 2, 1],
+ "type": "float32"
+ }
+ ]
+ },
+ {
+ "name": "same ranks different broadcast larger 1",
+ "inputs": [
+ {
+ "data": [
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
+ 29, 30, 31
+ ],
+ "dims": [2, 1, 2, 8],
+ "type": "float32"
+ },
+ {
+ "data": [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
+ "dims": [1, 2, 8, 1],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [1036, 3308, 1260, 4044, 5580, 7852, 6828, 9612],
+ "dims": [2, 2, 2, 1],
+ "type": "float32"
+ }
+ ]
}
]
}
diff --git a/js/web/test/data/ops/scatternd.jsonc b/js/web/test/data/ops/scatternd.jsonc
new file mode 100644
index 0000000000000..5135bb9e4d3a5
--- /dev/null
+++ b/js/web/test/data/ops/scatternd.jsonc
@@ -0,0 +1,472 @@
+[
+ {
+ "name": "ScatterND int32",
+ "operator": "ScatterND",
+ "attributes": [],
+ "opset": { "domain": "", "version": 13 },
+ "cases": [
+ {
+ "name": "int32",
+ "inputs": [
+ {
+ "data": [1, 2, 3, 4, 5, 6, 7, 8],
+ "dims": [8],
+ "type": "int32"
+ },
+ {
+ "data": [4, 3, 1, 7],
+ "dims": [1, 4, 1],
+ "type": "int64"
+ },
+ {
+ "data": [9, 10, 11, 12],
+ "dims": [1, 4],
+ "type": "int32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [1, 11, 3, 10, 9, 6, 7, 12],
+ "dims": [8],
+ "type": "int32"
+ }
+ ]
+ },
+ {
+ "name": "int32",
+ "inputs": [
+ {
+ "data": [
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63, 64
+ ],
+ "dims": [4, 4, 4],
+ "type": "int32"
+ },
+ {
+ "data": [1, 2],
+ "dims": [2, 1],
+ "type": "int64"
+ },
+ {
+ "data": [
+ 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
+ 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131
+ ],
+ "dims": [2, 4, 4],
+ "type": "int32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
+ 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
+ 131, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64
+ ],
+ "dims": [4, 4, 4],
+ "type": "int32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "ScatterND float32",
+ "operator": "ScatterND",
+ "attributes": [],
+ "opset": { "domain": "", "version": 13 },
+ "cases": [
+ {
+ "name": "float32",
+ "inputs": [
+ {
+ "data": [1.1, 2.2, 3.1, 4.5, 5.3, 6.1, 7.8, 8.9],
+ "dims": [8],
+ "type": "float32"
+ },
+ {
+ "data": [4, 3, 1, 7],
+ "dims": [1, 4, 1],
+ "type": "int64"
+ },
+ {
+ "data": [9.1, 10.2, 11.3, 12.5],
+ "dims": [1, 4],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [1.1, 11.3, 3.1, 10.2, 9.1, 6.1, 7.8, 12.5],
+ "dims": [8],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "ScatterND add int32",
+ "operator": "ScatterND",
+ "attributes": [{ "name": "reduction", "data": "add", "type": "string" }],
+ "opset": { "domain": "", "version": 16 },
+ "cases": [
+ {
+ "name": "int32",
+ "inputs": [
+ {
+ "data": [1, 2, 3, 4, 5, 6, 7, 8],
+ "dims": [8],
+ "type": "int32"
+ },
+ {
+ "data": [4, 3, 1, 7],
+ "dims": [1, 4, 1],
+ "type": "int64"
+ },
+ {
+ "data": [9, 10, 11, 12],
+ "dims": [1, 4],
+ "type": "int32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [1, 13, 3, 14, 14, 6, 7, 20],
+ "dims": [8],
+ "type": "int32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "ScatterND add float32",
+ "operator": "ScatterND",
+ "attributes": [{ "name": "reduction", "data": "add", "type": "string" }],
+ "opset": { "domain": "", "version": 16 },
+ "cases": [
+ {
+ "name": "float32",
+ "inputs": [
+ {
+ "data": [1.1, 2.2, 3.1, 4.5, 5.3, 6.1, 7.8, 8.9],
+ "dims": [8],
+ "type": "float32"
+ },
+ {
+ "data": [4, 3, 1, 7],
+ "dims": [1, 4, 1],
+ "type": "int64"
+ },
+ {
+ "data": [9.1, 10.2, 11.3, 12.5],
+ "dims": [1, 4],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [
+ 1.100000023841858, 13.5, 3.0999999046325684, 14.699999809265137, 14.40000057220459, 6.099999904632568,
+ 7.800000190734863, 21.399999618530273
+ ],
+ "dims": [8],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "ScatterND mul int32",
+ "operator": "ScatterND",
+ "attributes": [{ "name": "reduction", "data": "mul", "type": "string" }],
+ "opset": { "domain": "", "version": 16 },
+ "cases": [
+ {
+ "name": "int32",
+ "inputs": [
+ {
+ "data": [11, 22, 31, 45, 53, 61, 78, 89],
+ "dims": [8],
+ "type": "int32"
+ },
+ {
+ "data": [4, 3, 1, 7],
+ "dims": [1, 4, 1],
+ "type": "int64"
+ },
+ {
+ "data": [91, 102, 113, 125],
+ "dims": [1, 4],
+ "type": "int32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [11, 2486, 31, 4590, 4823, 61, 78, 11125],
+ "dims": [8],
+ "type": "int32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "ScatterND min int32",
+ "operator": "ScatterND",
+ "attributes": [{ "name": "reduction", "data": "min", "type": "string" }],
+ "opset": { "domain": "", "version": 16 },
+ "cases": [
+ {
+ "name": "int32",
+ "inputs": [
+ {
+ "data": [11, 22, 31, 45, 53, 61, 78, 89],
+ "dims": [8],
+ "type": "int32"
+ },
+ {
+ "data": [4, 3, 1, 7],
+ "dims": [1, 4, 1],
+ "type": "int64"
+ },
+ {
+ "data": [91, 102, 113, 125],
+ "dims": [1, 4],
+ "type": "int32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [11, 22, 31, 45, 53, 61, 78, 89],
+ "dims": [8],
+ "type": "int32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "ScatterND max int32",
+ "operator": "ScatterND",
+ "attributes": [{ "name": "reduction", "data": "max", "type": "string" }],
+ "opset": { "domain": "", "version": 16 },
+ "cases": [
+ {
+ "name": "int32",
+ "inputs": [
+ {
+ "data": [11, 22, 31, 45, 53, 61, 78, 89],
+ "dims": [8],
+ "type": "int32"
+ },
+ {
+ "data": [4, 3, 1, 7],
+ "dims": [1, 4, 1],
+ "type": "int64"
+ },
+ {
+ "data": [91, 102, 113, 125],
+ "dims": [1, 4],
+ "type": "int32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [11, 113, 31, 102, 91, 61, 78, 125],
+ "dims": [8],
+ "type": "int32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "ScatterND mul float32",
+ "operator": "ScatterND",
+ "attributes": [{ "name": "reduction", "data": "mul", "type": "string" }],
+ "opset": { "domain": "", "version": 16 },
+ "cases": [
+ {
+ "name": "float32",
+ "inputs": [
+ {
+ "data": [1.1, 2.2, 3.1, 4.5, 5.3, 6.1, 7.8, 8.9],
+ "dims": [8],
+ "type": "float32"
+ },
+ {
+ "data": [4, 3, 1, 7],
+ "dims": [1, 4, 1],
+ "type": "int64"
+ },
+ {
+ "data": [9.1, 10.2, 11.3, 12.5],
+ "dims": [1, 4],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [
+ 1.100000023841858, 24.860000610351562, 3.0999999046325684, 45.89999771118164, 48.230003356933594,
+ 6.099999904632568, 7.800000190734863, 111.24999237060547
+ ],
+ "dims": [8],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "ScatterND min float32",
+ "operator": "ScatterND",
+ "attributes": [{ "name": "reduction", "data": "min", "type": "string" }],
+ "opset": { "domain": "", "version": 16 },
+ "cases": [
+ {
+ "name": "float32",
+ "inputs": [
+ {
+ "data": [1.1, 2.2, 3.1, 4.5, 5.3, 6.1, 7.8, 8.9],
+ "dims": [8],
+ "type": "float32"
+ },
+ {
+ "data": [4, 3, 1, 7],
+ "dims": [1, 4, 1],
+ "type": "int64"
+ },
+ {
+ "data": [9.1, 10.2, 11.3, 12.5],
+ "dims": [1, 4],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [
+ 1.100000023841858, 2.200000047683716, 3.0999999046325684, 4.5, 5.300000190734863, 6.099999904632568,
+ 7.800000190734863, 8.899999618530273
+ ],
+ "dims": [8],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "ScatterND max float32",
+ "operator": "ScatterND",
+ "attributes": [{ "name": "reduction", "data": "max", "type": "string" }],
+ "opset": { "domain": "", "version": 16 },
+ "cases": [
+ {
+ "name": "float32",
+ "inputs": [
+ {
+ "data": [1.1, 2.2, 3.1, 4.5, 5.3, 6.1, 7.8, 8.9],
+ "dims": [8],
+ "type": "float32"
+ },
+ {
+ "data": [4, 3, 1, 7],
+ "dims": [1, 4, 1],
+ "type": "int64"
+ },
+ {
+ "data": [9.1, 10.2, 11.3, 12.5],
+ "dims": [1, 4],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [
+ 1.100000023841858, 11.300000190734863, 3.0999999046325684, 10.199999809265137, 9.100000381469727,
+ 6.099999904632568, 7.800000190734863, 12.5
+ ],
+ "dims": [8],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "ScatterND float16",
+ "operator": "ScatterND",
+ "attributes": [],
+ "opset": { "domain": "", "version": 11 },
+ "cases": [
+ {
+ "name": "float16",
+ "inputs": [
+ {
+ "data": [1.1, 2.2, 3.1, 4.5, 5.3, 6.1, 7.8, 8.9],
+ "dims": [8],
+ "type": "float16"
+ },
+ {
+ "data": [4, 3, 1, 7],
+ "dims": [1, 4, 1],
+ "type": "int64"
+ },
+ {
+ "data": [9.1, 10.2, 11.3, 12.5],
+ "dims": [1, 4],
+ "type": "float16"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [1.1, 11.3, 3.1, 10.2, 9.1, 6.1, 7.8, 12.5],
+ "dims": [8],
+ "type": "float16"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "ScatterND mul uint32",
+ "operator": "ScatterND",
+ "attributes": [{ "name": "reduction", "data": "mul", "type": "string" }],
+ "opset": { "domain": "", "version": 16 },
+ "cases": [
+ {
+ "name": "uint32",
+ "inputs": [
+ {
+ "data": [11, 22, 31, 45, 53, 61, 78, 89],
+ "dims": [8],
+ "type": "uint32"
+ },
+ {
+ "data": [4, 3, 1, 7],
+ "dims": [1, 4, 1],
+ "type": "int64"
+ },
+ {
+ "data": [91, 102, 113, 125],
+ "dims": [1, 4],
+ "type": "uint32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [11, 2486, 31, 4590, 4823, 61, 78, 11125],
+ "dims": [8],
+ "type": "uint32"
+ }
+ ]
+ }
+ ]
+ }
+]
diff --git a/js/web/test/data/ops/transpose.jsonc b/js/web/test/data/ops/transpose.jsonc
index a7265d6444118..d431ceb1712a5 100644
--- a/js/web/test/data/ops/transpose.jsonc
+++ b/js/web/test/data/ops/transpose.jsonc
@@ -263,6 +263,30 @@
}
]
},
+ {
+ "name": "Transpose as reshape - perms:[1, 0, 2, 4, 3]",
+ "operator": "Transpose",
+ "attributes": [{ "name": "perm", "data": [1, 0, 2, 4, 3], "type": "ints" }],
+ "cases": [
+ {
+ "name": "T[3, 1, 2, 1, 4]",
+ "inputs": [
+ {
+ "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24],
+ "dims": [3, 1, 2, 1, 4],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24],
+ "dims": [1, 3, 2, 4, 1],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
{
"name": "Transpose - perms:[1, 0]",
"operator": "Transpose",
diff --git a/js/web/test/e2e/browser-test-wasm-binary-override.js b/js/web/test/e2e/browser-test-wasm-binary-override.js
index 471c26f6990b5..27cce2ca06236 100644
--- a/js/web/test/e2e/browser-test-wasm-binary-override.js
+++ b/js/web/test/e2e/browser-test-wasm-binary-override.js
@@ -7,7 +7,7 @@ const documentUrl = document.currentScript.src;
it('Browser E2E testing - WebAssembly backend', async function () {
// preload .wasm file binary
- const wasmUrl = new URL('./node_modules/onnxruntime-web/dist/ort-wasm-simd-threaded.wasm', documentUrl).href;
+ const wasmUrl = new URL('./node_modules/onnxruntime-web/dist/ort-wasm-simd-threaded.jsep.wasm', documentUrl).href;
const response = await fetch(wasmUrl);
// make sure the .wasm file is loaded successfully
diff --git a/js/web/test/e2e/browser-test-wasm-path-override-filename-jsep.js b/js/web/test/e2e/browser-test-wasm-path-override-filename-jsep.js
new file mode 100644
index 0000000000000..d325a5ca7187d
--- /dev/null
+++ b/js/web/test/e2e/browser-test-wasm-path-override-filename-jsep.js
@@ -0,0 +1,28 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+'use strict';
+
+it('Browser E2E testing - WebAssembly backend (path override filename)', async function () {
+ // check base URL port from test args
+ if (typeof __ort_arg_port === 'undefined') {
+ throw new Error('test flag --port= is required');
+ }
+ const base = `http://localhost:${__ort_arg_port}/`;
+
+ ort.env.wasm.wasmPaths = {};
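+  // Start from an empty wasmPaths map and only override the artifacts requested via the --files test argument.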
+
+ if (typeof __ort_arg_files === 'string' && __ort_arg_files.includes('wasm')) {
+ const overrideWasmUrl = new URL('./test-wasm-path-override/jsep-renamed.wasm', base).href;
+ console.log(`ort.env.wasm.wasmPaths['wasm'] = ${JSON.stringify(overrideWasmUrl)};`);
+ ort.env.wasm.wasmPaths.wasm = overrideWasmUrl;
+ }
+
+ if (typeof __ort_arg_files === 'string' && __ort_arg_files.includes('mjs')) {
+ const overrideMjsUrl = new URL('./test-wasm-path-override/jsep-renamed.mjs', base).href;
+ console.log(`ort.env.wasm.wasmPaths['mjs'] = ${JSON.stringify(overrideMjsUrl)};`);
+ ort.env.wasm.wasmPaths.mjs = overrideMjsUrl;
+ }
+
+ await testFunction(ort, { executionProviders: ['wasm'] });
+});
diff --git a/js/web/test/e2e/run-data.js b/js/web/test/e2e/run-data.js
index 04079b042bc23..dbc3ca0bd2460 100644
--- a/js/web/test/e2e/run-data.js
+++ b/js/web/test/e2e/run-data.js
@@ -14,7 +14,7 @@ const NODEJS_TEST_CASES = [
// [test_for_same_origin, test_for_cross_origin, main_js, ort_main_js, [test_args]]
const BROWSER_TEST_CASES = [
// IIFE
- [true, true, './browser-test-webgl.js', 'ort.min.js'], // webgl
+ [true, true, './browser-test-webgl.js', 'ort.all.min.js'], // webgl
[true, true, './browser-test-webgl.js', 'ort.webgl.min.js'], // webgl
[true, true, './browser-test-wasm.js', 'ort.wasm.min.js'], // wasm, ort.wasm
[true, true, './browser-test-wasm-multi-session-create.js', 'ort.min.js'], // wasm, multi-session create
@@ -24,7 +24,7 @@ const BROWSER_TEST_CASES = [
[true, true, './browser-test-wasm.js', 'ort.min.js', ['num_threads=1', 'proxy=1']], // wasm, 1 thread, proxy
// ort.min.mjs
- [true, true, './browser-test-webgl.js', 'ort.min.mjs'], // webgl
+ [true, true, './browser-test-webgl.js', 'ort.webgl.min.mjs'], // webgl
[true, true, './browser-test-wasm.js', 'ort.min.mjs', ['num_threads=1']], // wasm, 1 thread
[true, true, './browser-test-wasm.js', 'ort.min.mjs', ['num_threads=2']], // wasm, 2 threads
[true, true, './browser-test-wasm.js', 'ort.min.mjs', ['num_threads=2', 'proxy=1']], // wasm, 2 threads, proxy
@@ -41,22 +41,22 @@ const BROWSER_TEST_CASES = [
// path override:
// wasm, path override filenames for both mjs and wasm, same origin
- [true, false, './browser-test-wasm-path-override-filename.js', 'ort.min.js', ['port=9876', 'files=mjs,wasm']],
+ [true, false, './browser-test-wasm-path-override-filename-jsep.js', 'ort.min.js', ['port=9876', 'files=mjs,wasm']],
[true, false, './browser-test-wasm-path-override-filename.js', 'ort.wasm.min.js', ['port=9876', 'files=mjs,wasm']],
// wasm, path override filenames for both mjs and wasm, cross origin
- [false, true, './browser-test-wasm-path-override-filename.js', 'ort.min.js', ['port=8081', 'files=mjs,wasm']],
+ [false, true, './browser-test-wasm-path-override-filename-jsep.js', 'ort.min.js', ['port=8081', 'files=mjs,wasm']],
[false, true, './browser-test-wasm-path-override-filename.js', 'ort.wasm.min.js', ['port=8081', 'files=mjs,wasm']],
// wasm, path override filename for wasm, same origin
- [true, false, './browser-test-wasm-path-override-filename.js', 'ort.min.js', ['port=9876', 'files=wasm']],
+ [true, false, './browser-test-wasm-path-override-filename-jsep.js', 'ort.min.js', ['port=9876', 'files=wasm']],
[true, false, './browser-test-wasm-path-override-filename.js', 'ort.wasm.min.js', ['port=9876', 'files=wasm']],
// wasm, path override filename for wasm, cross origin
- [false, true, './browser-test-wasm-path-override-filename.js', 'ort.min.js', ['port=8081', 'files=wasm']],
+ [false, true, './browser-test-wasm-path-override-filename-jsep.js', 'ort.min.js', ['port=8081', 'files=wasm']],
[false, true, './browser-test-wasm-path-override-filename.js', 'ort.wasm.min.js', ['port=8081', 'files=wasm']],
// wasm, path override filename for mjs, same origin
- [true, false, './browser-test-wasm-path-override-filename.js', 'ort.min.js', ['port=9876', 'files=mjs']],
+ [true, false, './browser-test-wasm-path-override-filename-jsep.js', 'ort.min.js', ['port=9876', 'files=mjs']],
[true, false, './browser-test-wasm-path-override-filename.js', 'ort.wasm.min.js', ['port=9876', 'files=mjs']],
// wasm, path override filename for mjs, cross origin
- [false, true, './browser-test-wasm-path-override-filename.js', 'ort.min.js', ['port=8081', 'files=mjs']],
+ [false, true, './browser-test-wasm-path-override-filename-jsep.js', 'ort.min.js', ['port=8081', 'files=mjs']],
[false, true, './browser-test-wasm-path-override-filename.js', 'ort.wasm.min.js', ['port=8081', 'files=mjs']],
// wasm, path override prefix, same origin
[true, false, './browser-test-wasm-path-override-prefix.js', 'ort.min.js', ['port=9876']],
diff --git a/js/web/test/e2e/run.js b/js/web/test/e2e/run.js
index 93f9d4a144bf2..3361bbece64ed 100644
--- a/js/web/test/e2e/run.js
+++ b/js/web/test/e2e/run.js
@@ -146,6 +146,10 @@ function prepareWasmPathOverrideFiles() {
fs.copyFileSync(`${sourceFile}.wasm`, path.join(folder, 'ort-wasm-simd-threaded.wasm'));
fs.copyFileSync(`${sourceFile}.mjs`, path.join(folder, 'renamed.mjs'));
fs.copyFileSync(`${sourceFile}.wasm`, path.join(folder, 'renamed.wasm'));
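+  // Also stage the JSEP-enabled artifacts under both their default and renamed filenames
+  // for the JSEP path-override test cases.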
+ fs.copyFileSync(`${sourceFile}.jsep.mjs`, path.join(folder, 'ort-wasm-simd-threaded.jsep.mjs'));
+ fs.copyFileSync(`${sourceFile}.jsep.wasm`, path.join(folder, 'ort-wasm-simd-threaded.jsep.wasm'));
+ fs.copyFileSync(`${sourceFile}.jsep.mjs`, path.join(folder, 'jsep-renamed.mjs'));
+ fs.copyFileSync(`${sourceFile}.jsep.wasm`, path.join(folder, 'jsep-renamed.wasm'));
}
async function testAllNodejsCases() {
diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc
index 1f379e0de6165..f179756967d49 100644
--- a/js/web/test/suite-test-list.jsonc
+++ b/js/web/test/suite-test-list.jsonc
@@ -570,14 +570,14 @@
"test_greater_equal_expanded",
"test_greater_equal",
"test_greater",
- // // "test_gridsample_aligncorners_true",
- // // "test_gridsample_bicubic",
- // // "test_gridsample_bilinear",
- // // "test_gridsample_border_padding",
- // // "test_gridsample_nearest",
- // // "test_gridsample_reflection_padding",
- // // "test_gridsample_zeros_padding",
- // // "test_gridsample",
+ "test_gridsample_aligncorners_true",
+ "test_gridsample_bicubic",
+ "test_gridsample_bilinear",
+ "test_gridsample_border_padding",
+ "test_gridsample_nearest",
+ "test_gridsample_reflection_padding",
+ "test_gridsample_zeros_padding",
+ "test_gridsample",
// // "test_gru_batchwise",
// // "test_gru_defaults",
// // "test_gru_seq_length",
@@ -1365,6 +1365,7 @@
"gather.jsonc",
"gather-block-quantized.jsonc",
"gather-elements.jsonc",
+ "gather-nd.jsonc",
"gemm.jsonc",
"global-average-pool.jsonc",
"greater.jsonc",
@@ -1396,6 +1397,7 @@
"pow-big-number.jsonc",
"reshape.jsonc",
"rotary-embedding.jsonc",
+ "scatternd.jsonc",
"simplified-layer-norm.jsonc",
"skip-layer-norm.jsonc",
"skip-simplified-layer-norm.jsonc",
@@ -2362,14 +2364,14 @@
// "test_sinh",
// // "test_size_example",
// // "test_size",
- // "test_slice_default_axes",
- // "test_slice_default_steps",
- // "test_slice_end_out_of_bounds",
- // "test_slice_neg_steps",
- // "test_slice_neg",
- // "test_slice_negative_axes",
- // "test_slice_start_out_of_bounds",
- // "test_slice",
+ "test_slice_default_axes",
+ "test_slice_default_steps",
+ "test_slice_end_out_of_bounds",
+ "test_slice_neg_steps",
+ "test_slice_neg",
+ "test_slice_negative_axes",
+ "test_slice_start_out_of_bounds",
+ "test_slice",
// "test_softmax_axis_0_expanded",
"test_softmax_axis_0",
// "test_softmax_axis_1_expanded",
diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts
index c37c10c781400..d54ba32f9f494 100644
--- a/js/web/test/test-runner.ts
+++ b/js/web/test/test-runner.ts
@@ -661,7 +661,7 @@ async function createMLTensorForOutput(mlContext: MLContext, type: ort.Tensor.Ty
shape: dims as number[],
// Assign both shape and dimensions while transitioning to new API.
dimensions: dims as number[],
- usage: MLTensorUsage.READ,
+ usage: typeof MLTensorUsage == 'undefined' ? undefined : MLTensorUsage.READ,
readable: true,
});
@@ -686,7 +686,7 @@ async function createMLTensorForInput(mlContext: MLContext, cpuTensor: ort.Tenso
shape: cpuTensor.dims as number[],
// Assign both shape and dimensions while transitioning to new API.
dimensions: cpuTensor.dims as number[],
- usage: MLTensorUsage.WRITE,
+ usage: typeof MLTensorUsage == 'undefined' ? undefined : MLTensorUsage.WRITE,
writable: true,
});
mlContext.writeTensor(mlTensor, cpuTensor.data);
diff --git a/objectivec/error_utils.mm b/objectivec/error_utils.mm
index 335cf8894d549..e8d4d5bb365c9 100644
--- a/objectivec/error_utils.mm
+++ b/objectivec/error_utils.mm
@@ -11,7 +11,7 @@ void ORTSaveCodeAndDescriptionToError(int code, const char* descriptionCstr, NSE
if (!error) return;
NSString* description = [NSString stringWithCString:descriptionCstr
- encoding:NSASCIIStringEncoding];
+ encoding:NSUTF8StringEncoding];
*error = [NSError errorWithDomain:kOrtErrorDomain
code:code
diff --git a/objectivec/include/ort_coreml_execution_provider.h b/objectivec/include/ort_coreml_execution_provider.h
index d7d873f5eb0e0..41d15aa39453a 100644
--- a/objectivec/include/ort_coreml_execution_provider.h
+++ b/objectivec/include/ort_coreml_execution_provider.h
@@ -70,7 +70,22 @@ NS_ASSUME_NONNULL_BEGIN
*/
- (BOOL)appendCoreMLExecutionProviderWithOptions:(ORTCoreMLExecutionProviderOptions*)options
error:(NSError**)error;
-
+/**
+ * Enables the CoreML execution provider in the session configuration options.
+ * It is appended to the execution provider list which is ordered by
+ * decreasing priority.
+ *
+ * @param provider_options The CoreML execution provider options as a dictionary.
+ *        Available key-value pairs are described in core/providers/coreml/coreml_execution_provider.h:
+ * kCoremlProviderOption_MLComputeUnits: one of "CPUAndNeuralEngine", "CPUAndGPU", "CPUOnly", "All"
+ * kCoremlProviderOption_ModelFormat: one of "MLProgram", "NeuralNetwork"
+ * kCoremlProviderOption_RequireStaticInputShapes: "1" or "0"
+ * kCoremlProviderOption_EnableOnSubgraphs: "1" or "0"
+ * @param error Optional error information set if an error occurs.
+ * @return Whether the provider was enabled successfully.
+ */
+- (BOOL)appendCoreMLExecutionProviderWithOptionsV2:(NSDictionary<NSString*, NSString*>*)provider_options
+ error:(NSError**)error;
@end
NS_ASSUME_NONNULL_END
diff --git a/objectivec/ort_coreml_execution_provider.mm b/objectivec/ort_coreml_execution_provider.mm
index 6cb5026b93521..0c790a91fb8b9 100644
--- a/objectivec/ort_coreml_execution_provider.mm
+++ b/objectivec/ort_coreml_execution_provider.mm
@@ -43,6 +43,21 @@ - (BOOL)appendCoreMLExecutionProviderWithOptions:(ORTCoreMLExecutionProviderOpti
#endif
}
+- (BOOL)appendCoreMLExecutionProviderWithOptionsV2:(NSDictionary<NSString*, NSString*>*)provider_options
+ error:(NSError**)error {
+#if ORT_OBJC_API_COREML_EP_AVAILABLE
+ try {
+ return [self appendExecutionProvider:@"CoreML" providerOptions:provider_options error:error];
+ }
+ ORT_OBJC_API_IMPL_CATCH_RETURNING_BOOL(error);
+
+#else // !ORT_OBJC_API_COREML_EP_AVAILABLE
+  static_cast<void>(provider_options);
+ ORTSaveCodeAndDescriptionToError(ORT_FAIL, "CoreML execution provider is not enabled.", error);
+ return NO;
+#endif
+}
+
@end
NS_ASSUME_NONNULL_END
diff --git a/objectivec/test/ort_session_test.mm b/objectivec/test/ort_session_test.mm
index 508289f7bc748..409ee7e1584e2 100644
--- a/objectivec/test/ort_session_test.mm
+++ b/objectivec/test/ort_session_test.mm
@@ -223,6 +223,28 @@ - (void)testAppendCoreMLEP {
ORTAssertNullableResultSuccessful(session, err);
}
+- (void)testAppendCoreMLEP_v2 {
+ NSError* err = nil;
+ ORTSessionOptions* sessionOptions = [ORTSessionTest makeSessionOptions];
+ NSDictionary* provider_options = @{@"EnableOnSubgraphs" : @"1"}; // set an arbitrary option
+
+ BOOL appendResult = [sessionOptions appendCoreMLExecutionProviderWithOptionsV2:provider_options
+ error:&err];
+
+ if (!ORTIsCoreMLExecutionProviderAvailable()) {
+ ORTAssertBoolResultUnsuccessful(appendResult, err);
+ return;
+ }
+
+ ORTAssertBoolResultSuccessful(appendResult, err);
+
+ ORTSession* session = [[ORTSession alloc] initWithEnv:self.ortEnv
+ modelPath:[ORTSessionTest getAddModelPath]
+ sessionOptions:sessionOptions
+ error:&err];
+ ORTAssertNullableResultSuccessful(session, err);
+}
+
- (void)testAppendXnnpackEP {
NSError* err = nil;
ORTSessionOptions* sessionOptions = [ORTSessionTest makeSessionOptions];
diff --git a/onnxruntime/contrib_ops/cpu/bert/attention.cc b/onnxruntime/contrib_ops/cpu/bert/attention.cc
index b15e865aa423c..ad14fb8258656 100644
--- a/onnxruntime/contrib_ops/cpu/bert/attention.cc
+++ b/onnxruntime/contrib_ops/cpu/bert/attention.cc
@@ -30,7 +30,6 @@ class Attention : public OpKernel, public AttentionCPUBase {
Status Compute(OpKernelContext* context) const override;
Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
- bool save_prepacked_initializers,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) override;
@@ -102,7 +101,6 @@ bool Attention::IsPackWeightsSuccessful(int qkv_index,
template <typename T>
Status Attention<T>::PrePack(const Tensor& weights, int input_idx, AllocatorPtr alloc,
- bool /*save_prepacked_initializers*/,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) {
/* The PrePack() massages the weights to speed up Compute(), there is an option to
diff --git a/onnxruntime/contrib_ops/cpu/bert/group_query_attention_helper.h b/onnxruntime/contrib_ops/cpu/bert/group_query_attention_helper.h
index 0bdee151d2173..4cc5a4228dc8c 100644
--- a/onnxruntime/contrib_ops/cpu/bert/group_query_attention_helper.h
+++ b/onnxruntime/contrib_ops/cpu/bert/group_query_attention_helper.h
@@ -11,18 +11,19 @@ namespace onnxruntime {
namespace contrib {
namespace group_query_attention_helper {
-Status CheckInputs(const Tensor* query,
- const Tensor* key,
- const Tensor* value,
- const Tensor* past_key,
- const Tensor* past_value,
- const Tensor* cos_cache,
- const Tensor* sin_cache,
+template <typename T = Tensor>
+Status CheckInputs(const T* query,
+ const T* key,
+ const T* value,
+ const T* past_key,
+ const T* past_value,
+ const T* cos_cache,
+ const T* sin_cache,
void* parameters,
int num_heads,
int kv_num_heads,
- const Tensor* seqlens_k,
- const Tensor* total_seqlen,
+ const T* seqlens_k,
+ const T* total_seqlen,
float scale,
float softcap) {
// Note: Here S* is seqlen_past_kv_cache, S+ is seqlen_present_kv_cache
@@ -265,18 +266,19 @@ Status CheckInputs(const Tensor* query,
return Status::OK();
}
-Status CheckInputs(const Tensor* query,
- const Tensor* key,
- const Tensor* value,
- const Tensor* past_key,
- const Tensor* past_value,
- const Tensor* cos_cache,
- const Tensor* sin_cache,
+template <typename T = Tensor>
+Status CheckInputs(const T* query,
+ const T* key,
+ const T* value,
+ const T* past_key,
+ const T* past_value,
+ const T* cos_cache,
+ const T* sin_cache,
void* parameters,
int num_heads,
int kv_num_heads,
- const Tensor* seqlens_k,
- const Tensor* total_seqlen,
+ const T* seqlens_k,
+ const T* total_seqlen,
float scale,
float softcap,
int max_threads_per_block) {
diff --git a/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.cc b/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.cc
index cbfd2f0949363..9a6c2af022c91 100644
--- a/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.cc
+++ b/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.cc
@@ -4,6 +4,7 @@
#include "contrib_ops/cpu/bert/rotary_embedding.h"
#include "contrib_ops/cpu/bert/rotary_embedding_helper.h"
+#include "core/mlas/inc/mlas.h"
#include "core/platform/threadpool.h"
using onnxruntime::concurrency::ThreadPool;
@@ -78,31 +79,12 @@ Status RunRotaryEmbedding(concurrency::ThreadPool* tp, RotaryParameters paramete
const T* cos_data = cos_cache + cache_offset;
const T* sin_data = sin_cache + cache_offset;
- int cache_idx = 0;
- bool sign = false;
- int j = 0;
- for (int i = 0; i < rotary_emb_dim; i++) {
- if (interleaved) {
- cache_idx = (i / 2) % half_rotary_emb_dim;
- sign = i & 1;
- j = sign ? i - 1 : i + 1; // i - sign
- } else {
- cache_idx = i % half_rotary_emb_dim;
- sign = (i >= half_rotary_emb_dim);
- j = (i + half_rotary_emb_dim) % rotary_emb_dim;
- }
-      float output_data_i = static_cast<float>(input_data[i]) * static_cast<float>(cos_data[cache_idx]);
-      float input_data_j = static_cast<float>(input_data[j]);
-      float sin_data_cache_idx = static_cast<float>(sin_data[cache_idx]);
- if (sign) {
- output_data_i += input_data_j * sin_data_cache_idx;
- } else {
- output_data_i -= input_data_j * sin_data_cache_idx;
- }
-      output_data[i] = static_cast<T>(output_data_i);
- }
- for (int i = rotary_emb_dim; i < head_size; i++) {
- output_data[i] = input_data[i];
+ MlasRotaryEmbedOneRow(input_data, sin_data, cos_data, rotary_emb_dim, interleaved, output_data);
+
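+    // Copy any non-rotary tail of each head through unchanged.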
+ if (rotary_emb_dim < head_size) {
+ std::memcpy(output_data + rotary_emb_dim,
+ input_data + rotary_emb_dim,
+ (head_size - rotary_emb_dim) * sizeof(T));
}
}
});
diff --git a/onnxruntime/contrib_ops/cpu/quantization/attention_quant.cc b/onnxruntime/contrib_ops/cpu/quantization/attention_quant.cc
index 71a66ea368943..2c897f183164f 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/attention_quant.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/attention_quant.cc
@@ -24,7 +24,6 @@ class QAttention : public OpKernel, public AttentionCPUBase {
Status Compute(OpKernelContext* context) const override;
Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
- bool save_prepacked_initializers,
bool& /*out*/ is_packed,
/*out*/ PrePackedWeights* prepacked_weights) override;
@@ -59,7 +58,6 @@ QAttention::QAttention(const OpKernelInfo& info) : OpKernel(info), AttentionC
template <typename T>
Status QAttention<T>::PrePack(const Tensor& weights, int input_idx, AllocatorPtr alloc,
- bool /*save_prepacked_initializers*/,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) {
if (1 != input_idx) {
diff --git a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_lstm.cc b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_lstm.cc
index 4148aae4b9a35..aa47f365c0005 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_lstm.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_lstm.cc
@@ -13,7 +13,7 @@ class DynamicQuantizeLSTM : public OpKernel, public LSTMBase {
DynamicQuantizeLSTM(const OpKernelInfo& info) : OpKernel(info), LSTMBase(info) {}
Status PrePack(const Tensor& tensor, int input_idx,
- AllocatorPtr alloc, bool save_prepacked_initializers, /*out*/ bool& is_packed,
+ AllocatorPtr alloc, /*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) override;
  Status UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prepacked_buffers,
@@ -91,7 +91,6 @@ static void UseSharedPrePackedBuffersImpl(std::vector& prepacke
}
Status DynamicQuantizeLSTM::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
- bool /*save_prepacked_initializers*/,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) {
is_packed = false;
diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
index cee3dfc6b3f28..c3e43f897c509 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
@@ -32,24 +32,47 @@ constexpr size_t A = 0,
bias = 5;
};
-int64_t GetAccuracyLevel(size_t nbits, size_t block_size, int64_t accuracy_level_attr) {
- const auto accuracy_level = std::clamp(accuracy_level_attr,
-                                         static_cast<int64_t>(CompMostAccurate),
-                                         static_cast<int64_t>(CompLeastAccurate));
-
- // Find a supported accuracy level that is not less accurate than the one given.
- // CompMostAccurate is always supported with the fallback implementation.
- // Note: A higher numeric accuracy level value means lower accuracy, so the comparison order is reversed.
- int64_t effective_accuracy_level = accuracy_level;
- for (; effective_accuracy_level > CompMostAccurate; --effective_accuracy_level) {
-    const auto compute_type = static_cast<MLAS_SQNBIT_GEMM_COMPUTE_TYPE>(effective_accuracy_level);
- if (MlasIsSQNBitGemmAvailable(nbits, block_size, compute_type)) {
- break;
- }
- }
+typedef enum {
+ Level0, /*!< input fp32, accumulator fp32 */
+ Level1, /*!< input fp32, accumulator fp32 */
+ Level2, /*!< input fp16, accumulator fp16 */
+ Level3, /*!< input bf16, accumulator fp32 */
+ Level4, /*!< input int8, accumulator int32 */
+} ACCURACY_LEVEL;
+
+// T: A data type.
+template <typename T>
+MLAS_QNBIT_GEMM_COMPUTE_TYPE
+GetComputeType(size_t nbits, size_t block_size, int64_t accuracy_level_attr) {
+  // For Fp32 inputs, only accuracy level 1 or 4 makes sense.
+  // (Non-ARM CPUs convert Fp16 to Fp32 anyway.) Converting Fp32 down to Fp16 loses precision,
+  // and the extra casting means there is no performance gain either.
+  if (accuracy_level_attr == static_cast<int64_t>(Level4) &&
+ MlasIsQNBitGemmAvailable(nbits, block_size, SQNBIT_CompInt8)) {
+ return SQNBIT_CompInt8;
+ }
+
+ return SQNBIT_CompFp32;
+}
- return effective_accuracy_level;
+#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
+template <>
+MLAS_QNBIT_GEMM_COMPUTE_TYPE
+GetComputeType<MLFloat16>(size_t nbits, size_t block_size, int64_t accuracy_level_attr) {
+  // For Fp16 inputs, only accuracy level 2 or 4 makes sense.
+  // Converting Fp16 to Fp32 gives no precision increase and only hurts performance.
+  if (accuracy_level_attr == static_cast<int64_t>(Level4) &&
+ MlasIsQNBitGemmAvailable(nbits, block_size, HQNBIT_CompInt8)) {
+ return HQNBIT_CompInt8;
+ }
+
+  // If HQNBIT_CompFp16 is not supported, the implementation falls back to unpacked computation.
+ return HQNBIT_CompFp16;
}
+#endif // !MLAS_F16VEC_INTRINSICS_SUPPORTED || !MLAS_TARGET_ARM64
+
} // namespace
bool GetType(const NodeArg& node_arg, int32_t& type) {
@@ -74,10 +97,9 @@ class MatMulNBits final : public OpKernel {
        N_{narrow<size_t>(info.GetAttr<int64_t>("N"))},
        block_size_{narrow<size_t>(info.GetAttr<int64_t>("block_size"))},
        nbits_{narrow<size_t>(info.GetAttr<int64_t>("bits"))},
-      accuracy_level_{GetAccuracyLevel(nbits_, block_size_, info.GetAttr<int64_t>("accuracy_level"))},
has_g_idx_{info.GetInputCount() > InputIndex::g_idx && info.node().InputDefs()[InputIndex::g_idx]->Exists()},
has_bias_{info.GetInputCount() > InputIndex::bias && info.node().InputDefs()[InputIndex::bias]->Exists()},
-      compute_type_{static_cast<MLAS_SQNBIT_GEMM_COMPUTE_TYPE>(accuracy_level_)} {
+      compute_type_{GetComputeType<T1>(nbits_, block_size_, info.GetAttr<int64_t>("accuracy_level"))} {
const auto& node = info.node();
auto input_defs = node.InputDefs();
const NodeArg* zero_point_arg =
@@ -98,36 +120,26 @@ class MatMulNBits final : public OpKernel {
Status Compute(OpKernelContext* context) const override;
Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
- bool save_prepacked_initializers,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) override;
- void ConvertPrepackWeightIntoTensor(const onnxruntime::Tensor& tensor, int input_idx);
-
  Status UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prepacked_buffers, int input_idx,
/*out*/ bool& used_shared_buffers) override;
-  std::optional<Tensor> GetPrePackTensor(int /*input_idx*/) override;
-
- Status SetPrePackTensor(int input_idx, const Tensor& pre_packed_tensor) override;
-
private:
const size_t K_;
const size_t N_;
const size_t block_size_;
const size_t nbits_;
- const int64_t accuracy_level_;
const bool has_g_idx_;
const bool has_bias_;
- const MLAS_SQNBIT_GEMM_COMPUTE_TYPE compute_type_;
+ const MLAS_QNBIT_GEMM_COMPUTE_TYPE compute_type_;
bool has_unquantized_zero_point_{false};
const bool column_wise_quant_{true};
  IAllocatorUniquePtr<void> packed_b_{};
  size_t packed_b_size_{0};
  IAllocatorUniquePtr<float> scales_fp32_{};
  IAllocatorUniquePtr<float> bias_fp32_{};
-  std::optional<Tensor> packed_tensor_{std::nullopt};
- MLDataType prepack_tensor_data_type_;
bool has_zp_input_{false};
@@ -152,27 +164,11 @@ class MatMulNBits final : public OpKernel {
Tensor* y,
AllocatorPtr& allocator,
concurrency::ThreadPool* thread_pool,
- const MatMulComputeHelper& helper) const {
- ORT_THROW("ComputeBPacked is not supported for T1 type.");
- }
+ const MatMulComputeHelper& helper) const;
};
-template <typename T1>
-void MatMulNBits<T1>::ConvertPrepackWeightIntoTensor(const onnxruntime::Tensor& tensor, int input_idx) {
- if (input_idx == InputIndex::B) {
- prepack_tensor_data_type_ = tensor.DataType();
- }
-
-  TensorShapeVector weights_dims = {static_cast<int64_t>((packed_b_size_ - 1) / prepack_tensor_data_type_->Size()) + 1};
- packed_tensor_ = Tensor(prepack_tensor_data_type_,
- TensorShape(weights_dims),
- packed_b_.get(),
- OrtMemoryInfo(CPU, OrtAllocatorType::OrtDeviceAllocator));
-}
-
template <typename T1>
Status MatMulNBits<T1>::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc,
- bool save_prepacked_initializers,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) {
ORT_UNUSED_PARAMETER(prepacked_weights);
@@ -181,43 +177,40 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All
return Status::OK();
}
- if (!MlasIsSQNBitGemmAvailable(nbits_, block_size_, compute_type_)) {
+ if (!MlasIsQNBitGemmAvailable(nbits_, block_size_, compute_type_)) {
return Status::OK();
}
if (input_idx == InputIndex::B) {
- packed_b_size_ = MlasSQNBitGemmPackQuantBDataSize(N_, K_, nbits_, block_size_, compute_type_);
+ packed_b_size_ = MlasQNBitGemmPackQuantBDataSize(N_, K_, nbits_, block_size_, compute_type_);
if (packed_b_size_ == 0) {
return Status::OK();
}
auto qptr = tensor.DataRaw();
    packed_b_ = IAllocator::MakeUniquePtr<void>(alloc, packed_b_size_, true);
- MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, qptr, packed_b_.get(), nullptr, has_zp_input_, nullptr, nullptr);
+ MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, qptr, packed_b_.get(), nullptr, has_zp_input_, nullptr, nullptr);
is_packed = true;
- } else if (compute_type_ == CompInt8) {
+ } else if (compute_type_ == SQNBIT_CompInt8) {
#ifdef MLAS_TARGET_AMD64_IX86
if (input_idx == InputIndex::scales && packed_b_ != nullptr) {
auto sptr = tensor.Data();
- MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), sptr,
- has_zp_input_, nullptr, nullptr);
+ MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), sptr,
+ has_zp_input_, nullptr, nullptr);
is_packed = false;
} else if (input_idx == InputIndex::zero_points && packed_b_ != nullptr) {
auto zptr = tensor.Data();
- MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), nullptr, has_zp_input_, zptr, nullptr);
+ MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), nullptr, has_zp_input_, zptr, nullptr);
is_packed = false;
}
#endif // MLAS_TARGET_AMD64_IX86
}
- if (save_prepacked_initializers) {
- ConvertPrepackWeightIntoTensor(tensor, input_idx);
- }
-
return Status::OK();
}
+#if !defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || !defined(MLAS_TARGET_ARM64)
+// On platforms without ARM fp16 intrinsics, fp16 falls back to fp32.
template <>
Status MatMulNBits<MLFloat16>::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc,
- bool save_prepacked_initializers,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) {
ORT_UNUSED_PARAMETER(prepacked_weights);
@@ -239,64 +232,37 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*ou
return Status::OK();
}
- if (!MlasIsSQNBitGemmAvailable(nbits_, block_size_, compute_type_)) {
+ if (!MlasIsQNBitGemmAvailable(nbits_, block_size_, compute_type_)) {
return Status::OK();
}
if (input_idx == InputIndex::B) {
- packed_b_size_ = MlasSQNBitGemmPackQuantBDataSize(N_, K_, nbits_, block_size_, compute_type_);
+ packed_b_size_ = MlasQNBitGemmPackQuantBDataSize(N_, K_, nbits_, block_size_, compute_type_);
if (packed_b_size_ == 0) {
return Status::OK();
}
auto qptr = tensor.DataRaw();
packed_b_ = IAllocator::MakeUniquePtr<void>(alloc, packed_b_size_, true);
- MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, qptr, packed_b_.get(),
- nullptr, has_zp_input_, nullptr, nullptr);
+ MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, qptr, packed_b_.get(),
+ nullptr, has_zp_input_, nullptr, nullptr);
is_packed = true;
- } else if (compute_type_ == CompInt8) {
+ } else if (compute_type_ == SQNBIT_CompInt8) {
#ifdef MLAS_TARGET_AMD64_IX86
if (input_idx == InputIndex::scales && packed_b_ != nullptr) {
- MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(),
- scales_fp32_.get(), has_zp_input_, nullptr, nullptr);
+ MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(),
+ scales_fp32_.get(), has_zp_input_, nullptr, nullptr);
is_packed = false;
} else if (input_idx == InputIndex::zero_points && packed_b_ != nullptr) {
auto zptr = tensor.Data<uint8_t>();
- MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(),
- nullptr, has_zp_input_, zptr, nullptr);
+ MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(),
+ nullptr, has_zp_input_, zptr, nullptr);
is_packed = false;
}
#endif // MLAS_TARGET_AMD64_IX86
}
- if (save_prepacked_initializers) {
- ConvertPrepackWeightIntoTensor(tensor, input_idx);
- }
-
- return Status::OK();
-}
-
-template <typename T1>
-std::optional<Tensor> MatMulNBits<T1>::GetPrePackTensor(int input_idx) {
- // For this kernel, prepack is performed on input_B and possibly scales and zero_points.
- // During compute, scales and zero_points are used as-is; only the prepacked
- // buffer replaces input_B.
- // To cope with this logic, we need to return the latest prepacked buffer and only serialize
- // the latest one, so we always return packed_tensor_ here, not only for input_B.
- ORT_UNUSED_PARAMETER(input_idx);
- return std::move(packed_tensor_);
-}
-
-template <typename T1>
-Status MatMulNBits<T1>::SetPrePackTensor(int input_idx, const Tensor& pre_packed_tensor) {
- if (input_idx == 1) {
- // pre_packed_tensor is a constant initialized tensor whose lifecycle is managed by session_state;
- // session_state will release its memory. packed_b_ must not release that memory, so
- // pass an empty/default buffer deleter here.
- // const_cast here is temporary and will be fixed in a follow-up PR.
- packed_b_ = BufferUniquePtr(const_cast<void*>(pre_packed_tensor.DataRaw()), BufferDeleter());
- }
-
return Status::OK();
}
+#endif // end !MLAS_F16VEC_INTRINSICS_SUPPORTED || !MLAS_TARGET_ARM64
template <typename T1>
Status MatMulNBits<T1>::UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prepacked_buffers, int input_idx,
@@ -311,20 +277,20 @@ Status MatMulNBits::UseSharedPrePackedBuffers(std::vector&
return Status::OK();
}
-template <>
-Status MatMulNBits<float>::ComputeBPacked(const Tensor* a,
- const Tensor* scales,
- const Tensor* zero_points,
- const Tensor* bias,
- Tensor* y,
- AllocatorPtr& allocator,
- concurrency::ThreadPool* thread_pool,
- const MatMulComputeHelper& helper) const {
- const auto* a_data = a->Data<float>();
- const auto* scales_data = scales->Data<float>();
+template <typename T1>
+Status MatMulNBits<T1>::ComputeBPacked(const Tensor* a,
+ const Tensor* scales,
+ const Tensor* zero_points,
+ const Tensor* bias,
+ Tensor* y,
+ AllocatorPtr& allocator,
+ concurrency::ThreadPool* thread_pool,
+ const MatMulComputeHelper& helper) const {
+ const auto* a_data = a->Data<T1>();
+ const auto* scales_data = scales->Data<T1>();
const auto* zero_points_data = zero_points == nullptr ? nullptr : zero_points->DataRaw();
- const auto* bias_data = bias == nullptr ? nullptr : bias->Data<float>();
- auto* y_data = y->MutableData<float>();
+ const auto* bias_data = bias == nullptr ? nullptr : bias->Data<T1>();
+ auto* y_data = y->MutableData<T1>();
const size_t batch_count = helper.OutputOffsets().size();
const size_t M = static_cast<size_t>(helper.M());
@@ -333,19 +299,19 @@ Status MatMulNBits::ComputeBPacked(const Tensor* a,
const size_t lda = helper.Lda(false);
IAllocatorUniquePtr<std::byte> workspace{};
- const size_t workspace_size = MlasSQNBitGemmBatchWorkspaceSize(
+ const size_t workspace_size = MlasQNBitGemmBatchWorkspaceSize(
M, N, K, batch_count, nbits_, block_size_, compute_type_);
if (workspace_size > 0) {
// Use reserve since no caching is needed
workspace = IAllocator::MakeUniquePtr<std::byte>(allocator, workspace_size, true);
}
- InlinedVector<MLAS_SQNBIT_GEMM_DATA_PARAMS> data(batch_count);
+ InlinedVector<MLAS_QNBIT_GEMM_DATA_PARAMS<T1>> data(batch_count);
for (size_t i = 0; i < batch_count; ++i) {
data[i].A = a_data + helper.LeftOffsets()[i];
data[i].lda = lda;
#ifdef MLAS_TARGET_AMD64_IX86
- if (compute_type_ == CompInt8) {
+ if (compute_type_ == SQNBIT_CompInt8) {
data[i].QuantBDataWorkspace = packed_b_.get();
}
#endif
@@ -356,11 +322,12 @@ Status MatMulNBits::ComputeBPacked(const Tensor* a,
data[i].C = y_data + helper.OutputOffsets()[i];
data[i].ldc = N;
}
- MlasSQNBitGemmBatch(M, N, K, batch_count, nbits_, block_size_, compute_type_, data.data(), workspace.get(),
- thread_pool);
+ MlasQNBitGemmBatch(M, N, K, batch_count, nbits_, block_size_, compute_type_, data.data(), workspace.get(),
+ thread_pool);
return Status::OK();
}
+#if !defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || !defined(MLAS_TARGET_ARM64)
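+// Fallback fp16 ComputeBPacked: convert the fp16 inputs to fp32 buffers, run the fp32
+// QNBit GEMM batch, then convert the fp32 result back to fp16 for the output tensor.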
template <>
Status MatMulNBits<MLFloat16>::ComputeBPacked(const Tensor* a,
const Tensor* scales,
@@ -383,7 +350,7 @@ Status MatMulNBits::ComputeBPacked(const Tensor* a,
const size_t lda = helper.Lda(false);
IAllocatorUniquePtr<std::byte> workspace{};
- const size_t workspace_size = MlasSQNBitGemmBatchWorkspaceSize(
+ const size_t workspace_size = MlasQNBitGemmBatchWorkspaceSize(
M, N, K, batch_count, nbits_, block_size_, compute_type_);
if (workspace_size > 0) {
// Use reserve since no caching is needed
@@ -417,12 +384,12 @@ Status MatMulNBits::ComputeBPacked(const Tensor* a,
size_t c_size = static_cast<size_t>(y->Shape().Size());
std::vector<float> c_v(c_size);
- InlinedVector<MLAS_SQNBIT_GEMM_DATA_PARAMS> data(batch_count);
+ InlinedVector<MLAS_QNBIT_GEMM_DATA_PARAMS<float>> data(batch_count);
for (size_t i = 0; i < batch_count; ++i) {
data[i].A = tmp_a_data_ptr.get() + helper.LeftOffsets()[i];
data[i].lda = lda;
#ifdef MLAS_TARGET_AMD64_IX86
- if (compute_type_ == CompInt8) {
+ if (compute_type_ == SQNBIT_CompInt8) {
data[i].QuantBDataWorkspace = packed_b_.get();
}
#endif
@@ -433,11 +400,12 @@ Status MatMulNBits::ComputeBPacked(const Tensor* a,
data[i].C = c_v.data() + helper.OutputOffsets()[i];
data[i].ldc = N;
}
- MlasSQNBitGemmBatch(M, N, K, batch_count, nbits_, block_size_, compute_type_, data.data(), workspace.get(),
- thread_pool);
+ MlasQNBitGemmBatch(M, N, K, batch_count, nbits_, block_size_, compute_type_, data.data(), workspace.get(),
+ thread_pool);
MlasConvertFloatToHalfBuffer(c_v.data(), y_data, c_size);
return Status::OK();
}
+#endif // end of !MLAS_F16VEC_INTRINSICS_SUPPORTED || !MLAS_TARGET_ARM64
template <>
Status MatMulNBits<MLFloat16>::ComputeBUnpacked(const Tensor* a,
@@ -573,9 +541,10 @@ Status MatMulNBits::ComputeBUnpacked(const Tensor* a,
const size_t ldb = helper.Ldb(true);
float* scales_ptr = nullptr;
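+ // Declared outside the if-block so the converted fp32 scales outlive it and scales_ptr stays valid.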
+ IAllocatorUniquePtr<float> temp_scales;
if (!scales_fp32_) {
auto scales_size = static_cast<size_t>(scales->Shape().Size());
- auto temp_scales = IAllocator::MakeUniquePtr<float>(allocator, scales_size, true);
+ temp_scales = IAllocator::MakeUniquePtr<float>(allocator, scales_size, true);
MlasConvertHalfToFloatBuffer(scales_data, temp_scales.get(), scales_size);
scales_ptr = temp_scales.get();
} else {
@@ -656,8 +625,9 @@ Status MatMulNBits::ComputeBUnpacked(const Tensor* a,
if (bias) {
float* bias_ptr = nullptr;
const size_t bias_size = static_cast<size_t>(bias->Shape().Size());
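+ // Declared outside the if-block so the converted fp32 bias outlives it and bias_ptr stays valid.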
+ IAllocatorUniquePtr<float> bias_temp;
if (!bias_fp32_) {
- auto bias_temp = IAllocator::MakeUniquePtr<float>(allocator, bias_size, true);
+ bias_temp = IAllocator::MakeUniquePtr<float>(allocator, bias_size, true);
MlasConvertHalfToFloatBuffer(bias->Data<MLFloat16>(), bias_temp.get(), bias_size);
bias_ptr = bias_temp.get();
} else {
@@ -710,11 +680,11 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const {
// clang-format on
if (has_single_b_matrix &&
- packed_b_) { // Assume that MlasSQNBitGemmBatch() always requires packed B.
- // If this changes, i.e., if MlasIsSQNBitGemmAvailable() can return true while
- // MlasSQNBitGemmPackQuantBDataSize() returns 0, we can consider calling MlasSQNBitGemmBatch()
+ packed_b_) { // Assume that MlasQNBitGemmBatch() always requires packed B.
+ // If this changes, i.e., if MlasIsQNBitGemmAvailable() can return true while
+ // MlasQNBitGemmPackQuantBDataSize() returns 0, we can consider calling MlasQNBitGemmBatch()
// with B directly too.
- if (MlasIsSQNBitGemmAvailable(nbits_, block_size_, compute_type_)) {
+ if (MlasIsQNBitGemmAvailable(nbits_, block_size_, compute_type_)) {
return ComputeBPacked(a, scales, zero_points, bias, y, allocator, thread_pool, helper);
}
}
diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
index c9ee9e2cb760d..d5b8961cf8c5a 100644
--- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
+++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
@@ -46,24 +46,13 @@ void ComputeJob(
const T* gamma_data,
const T* beta_data,
const T* bias_data,
- IAllocatorUniquePtr<float>& skip_float_uptr,
- IAllocatorUniquePtr<float>& gamma_float_uptr,
- IAllocatorUniquePtr<float>& beta_float_uptr,
- IAllocatorUniquePtr<float>& bias_float_uptr,
ptrdiff_t task_idx,
int hidden_size,
int64_t skip_size,
float epsilon,
bool simplified,
T* output_data,
- T* skip_input_bias_add_output_data,
- AllocatorPtr alloc) {
- ORT_UNUSED_PARAMETER(skip_float_uptr); // only used in MLFloat16 overload
- ORT_UNUSED_PARAMETER(gamma_float_uptr); // only used in MLFloat16 overload
- ORT_UNUSED_PARAMETER(beta_float_uptr); // only used in MLFloat16 overload
- ORT_UNUSED_PARAMETER(bias_float_uptr); // only used in MLFloat16 overload
- ORT_UNUSED_PARAMETER(alloc);
-
+ T* skip_input_bias_add_output_data) {
auto offset = task_idx * hidden_size;
const T* p_input = input_data + offset;
const T* p_skip = skip_data + (offset % skip_size);
@@ -107,101 +96,6 @@ void ComputeJob(
}
}
-void ComputeJob(
- const MLFloat16* input_data,
- const MLFloat16* skip_data,
- const MLFloat16* gamma_data,
- const MLFloat16* beta_data,
- const MLFloat16* bias_data,
- IAllocatorUniquePtr<float>& skip_float_uptr,
- IAllocatorUniquePtr<float>& gamma_float_uptr,
- IAllocatorUniquePtr<float>& beta_float_uptr,
- IAllocatorUniquePtr<float>& bias_float_uptr,
- ptrdiff_t task_idx,
- int hidden_size,
- int64_t skip_size,
- float epsilon,
- bool simplified,
- MLFloat16* output_data,
- MLFloat16* skip_input_bias_add_output_data,
- AllocatorPtr alloc) {
- auto offset = task_idx * hidden_size;
- const MLFloat16* p_input = input_data + offset;
- const MLFloat16* p_skip = skip_data + (offset % skip_size);
- MLFloat16* p_output = output_data + offset;
- MLFloat16* p_skip_input_bias_add_output = skip_input_bias_add_output_data == nullptr ? nullptr : skip_input_bias_add_output_data + offset;
-
- float mean(0.0f);
- float mean_square(0.0f);
- const size_t num_elems = static_cast<size_t>(hidden_size);
-
- IAllocatorUniquePtr<float> input_float_uptr = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
- MlasConvertHalfToFloatBuffer(p_input, input_float_uptr.get(), num_elems);
-
- if (!skip_float_uptr) {
- skip_float_uptr = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
- MlasConvertHalfToFloatBuffer(p_skip, skip_float_uptr.get(), num_elems);
- }
-
- if (bias_data && !bias_float_uptr) {
- bias_float_uptr = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
- MlasConvertHalfToFloatBuffer(bias_data, bias_float_uptr.get(), num_elems);
- }
-
- IAllocatorUniquePtr<float> output_float_uptr = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
- float* output_float_ptr = output_float_uptr.get();
-
- const float* input_float_ptr = input_float_uptr.get();
- const float* skip_float_ptr = skip_float_uptr.get();
- const float* bias_float_ptr = bias_float_uptr.get();
- for (size_t h = 0; h < num_elems; h++) {
- float val = input_float_ptr[h] + skip_float_ptr[h];
-
- if (bias_float_uptr) {
- val += bias_float_ptr[h];
- }
-
- output_float_ptr[h] = val;
- mean += val;
- mean_square += val * val;
- }
-
- if (nullptr != p_skip_input_bias_add_output) {
- MlasConvertFloatToHalfBuffer(output_float_ptr, p_skip_input_bias_add_output, num_elems);
- }
-
- mean = mean / hidden_size;
- if (simplified) {
- mean_square = sqrt(mean_square / hidden_size + epsilon);
- } else {
- mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon);
- }
-
- if (!gamma_float_uptr) {
- gamma_float_uptr = std::move(input_float_uptr); // overwrite input with gamma values, since they have the same size
- MlasConvertHalfToFloatBuffer(gamma_data, gamma_float_uptr.get(), num_elems);
- }
-
- if (beta_data && !beta_float_uptr) {
- beta_float_uptr = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
- MlasConvertHalfToFloatBuffer(beta_data, beta_float_uptr.get(), num_elems);
- }
-
- const float* gamma_float_ptr = gamma_float_uptr.get();
- const float* beta_float_ptr = beta_float_uptr.get();
- for (size_t h = 0; h < num_elems; h++) {
- if (simplified) {
- output_float_ptr[h] = output_float_ptr[h] / mean_square * gamma_float_ptr[h];
- } else if (nullptr == beta_float_uptr) {
- output_float_ptr[h] = (output_float_ptr[h] - mean) / mean_square * gamma_float_ptr[h];
- } else {
- output_float_ptr[h] = (output_float_ptr[h] - mean) / mean_square * gamma_float_ptr[h] + beta_float_ptr[h];
- }
- }
-
- MlasConvertFloatToHalfBuffer(output_float_ptr, p_output, num_elems);
-}
-
void ConvertMLFloat16ToFloatIfNeeded(const Tensor& tensor, AllocatorPtr alloc, IAllocatorUniquePtr<float>& dest, bool& is_packed) {
if (tensor.GetElementType() == utils::ToTensorProtoElementType<MLFloat16>()) {
auto tensor_data_ptr = tensor.Data<MLFloat16>();
@@ -218,7 +112,12 @@ void ConvertMLFloat16ToFloatIfNeeded(const Tensor& tensor, AllocatorPtr alloc, I
template <typename T, bool simplified>
SkipLayerNorm<T, simplified>::SkipLayerNorm(const OpKernelInfo& op_kernel_info)
- : OpKernel(op_kernel_info), skip_fp32_(nullptr), gamma_fp32_(nullptr), beta_fp32_(nullptr), bias_fp32_(nullptr) {
+ : OpKernel(op_kernel_info),
+ prepacked_skip_fp32_size_(0),
+ prepacked_skip_fp32_data_(nullptr),
+ prepacked_gamma_fp32_data_(nullptr),
+ prepacked_beta_fp32_data_(nullptr),
+ prepacked_bias_fp32_data_(nullptr) {
ORT_ENFORCE(op_kernel_info.GetAttr("epsilon", &epsilon_).IsOK());
ORT_ENFORCE(epsilon_ >= 0);
}
@@ -226,10 +125,10 @@ SkipLayerNorm::SkipLayerNorm(const OpKernelInfo& op_kernel_info)
template