diff --git a/.github/workflows/cffconvert.yml b/.github/workflows/cffconvert.yml index 6851c52d380ec..7144363717749 100644 --- a/.github/workflows/cffconvert.yml +++ b/.github/workflows/cffconvert.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out a copy of the repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Check whether the citation metadata from CITATION.cff is valid uses: citation-file-format/cffconvert-github-action@2.0.0 diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 2fe66013ebbbc..d3ecf44fe5733 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -33,7 +33,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL diff --git a/.github/workflows/gradle-wrapper-validation.yml b/.github/workflows/gradle-wrapper-validation.yml index 07346b38b2151..03ea773a25130 100644 --- a/.github/workflows/gradle-wrapper-validation.yml +++ b/.github/workflows/gradle-wrapper-validation.yml @@ -10,5 +10,5 @@ jobs: name: "Validation" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: gradle/wrapper-validation-action@v1 diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 91f9a8ee3df40..432c789e943b5 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -12,7 +12,7 @@ jobs: name: Optional Lint runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: misspell # Check spellings as well uses: reviewdog/action-misspell@v1 with: @@ -34,7 +34,7 @@ jobs: name: Python format runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Python uses: actions/setup-python@v4 with: @@ -100,7 +100,7 @@ jobs: name: Lint JavaScript runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: reviewdog/action-eslint@v1 with: reporter: github-pr-check diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index dceb15b446a8a..7b314d845d9b4 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -14,7 +14,7 @@ jobs: Onnxruntime-TVM: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - uses: actions/setup-python@v4 diff --git a/.github/workflows/publish-c-apidocs.yml b/.github/workflows/publish-c-apidocs.yml index 2fbd8e521aeee..0a3e9ed2594c1 100644 --- a/.github/workflows/publish-c-apidocs.yml +++ b/.github/workflows/publish-c-apidocs.yml @@ -24,19 +24,19 @@ jobs: name: Generate C/C++ API docs runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install doxygen and dependencies run: | sudo apt update sudo apt-get install libclang-dev sudo apt-get install libclang-cpp14 - wget https://www.doxygen.nl/files/doxygen-1.9.6.linux.bin.tar.gz - tar xvzf doxygen-1.9.6.linux.bin.tar.gz + wget https://www.doxygen.nl/files/doxygen-1.9.8.linux.bin.tar.gz + tar xvzf doxygen-1.9.8.linux.bin.tar.gz - name: Run doxygen run: | mkdir -p build/doxygen cd docs/c_cxx - ../../doxygen-1.9.6/bin/doxygen + ../../doxygen-1.9.8/bin/doxygen - name: Log source commit run: git rev-parse --short HEAD > build/doxygen/html/source-version.txt - name: Move C/C++ docs into site diff --git a/.github/workflows/publish-csharp-apidocs.yml b/.github/workflows/publish-csharp-apidocs.yml index 
097d4a1cdff5e..9b9ca924bd008 100644 --- a/.github/workflows/publish-csharp-apidocs.yml +++ b/.github/workflows/publish-csharp-apidocs.yml @@ -24,7 +24,7 @@ jobs: env: DOCFXVERSION: 2.62.2 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup .NET uses: actions/setup-dotnet@v3 with: diff --git a/.github/workflows/publish-java-apidocs.yml b/.github/workflows/publish-java-apidocs.yml index cea350ba54de0..9ea9bda7e7c53 100644 --- a/.github/workflows/publish-java-apidocs.yml +++ b/.github/workflows/publish-java-apidocs.yml @@ -23,7 +23,7 @@ jobs: name: Generate Java docs runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up JDK 11 uses: actions/setup-java@v3 with: diff --git a/.github/workflows/publish-js-apidocs.yml b/.github/workflows/publish-js-apidocs.yml index 5668be77c98a4..ba8bfd718abfa 100644 --- a/.github/workflows/publish-js-apidocs.yml +++ b/.github/workflows/publish-js-apidocs.yml @@ -23,7 +23,7 @@ jobs: name: Generate JS API docs runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Node.js uses: actions/setup-node@v3 with: diff --git a/.github/workflows/publish-objectivec-apidocs.yml b/.github/workflows/publish-objectivec-apidocs.yml index b966793cc0d06..1b327eebfa8a8 100644 --- a/.github/workflows/publish-objectivec-apidocs.yml +++ b/.github/workflows/publish-objectivec-apidocs.yml @@ -23,7 +23,7 @@ jobs: name: Generate Objective-C API docs runs-on: macos-13 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install Jazzy run: | diff --git a/.github/workflows/publish-python-apidocs.yml b/.github/workflows/publish-python-apidocs.yml index 4ca1249fc1d8e..ab9d4781afb83 100644 --- a/.github/workflows/publish-python-apidocs.yml +++ b/.github/workflows/publish-python-apidocs.yml @@ -24,7 +24,7 @@ jobs: name: Generate Python API docs runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install tools run: | sudo apt-get update diff --git a/.github/workflows/sca.yml b/.github/workflows/sca.yml deleted file mode 100644 index 1416f5a4d33a9..0000000000000 --- a/.github/workflows/sca.yml +++ /dev/null @@ -1,133 +0,0 @@ -name: Windows_SCA -on: - push: - branches: - - main - - rel-* - pull_request: - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -env: - AZCOPY_AUTO_LOGIN_TYPE: MSI - AZCOPY_MSI_CLIENT_ID: 63b63039-6328-442f-954b-5a64d124e5b4 - -jobs: - Onnxruntime-SCA-training-CUDA: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] - steps: - - uses: actions/checkout@v3 - with: - submodules: false - - uses: actions/setup-python@v4 - with: - python-version: '3.11.x' - architecture: 'x64' - - - uses: actions/setup-node@v3 - with: - node-version: 18 - - - name: Download cuda - run: azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v11.8" cuda_sdk - - - - name: Delete build folder - run: | - if (Test-Path D:\b) { Remove-Item -Recurse -Force D:\b } - &tools\ci_build\github\windows\install_third_party_deps.ps1 -cpu_arch x64 -install_prefix D:\b\Debug\installed -build_config Debug - - # The build machine doesn't have a GPU. So the value of CMAKE_CUDA_ARCHITECTURES doesn't matter. 
- - name: Build code - env: - CAExcludePath: 'C:\Program Files;D:\b;${{ github.workspace }}\cmake' - run: python tools\ci_build\build.py --windows_sdk_version 10.0.22621.0 --enable_training --build_java --compile_no_warning_as_error --config Debug --build_dir D:\b --skip_submodule_sync --build_csharp --update --build --parallel --cmake_generator "Visual Studio 17 2022" --build_shared_lib --enable_pybind --cmake_extra_defines onnxruntime_USE_CUSTOM_STATIC_ANALYSIS_RULES=ON --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON --cmake_extra_defines onnxruntime_REDIRECT_STATIC_ANALYSIS_OUTPUTS_TO_FILE=ON --use_cuda --cuda_home=${{ github.workspace }}\cuda_sdk\v11.8 --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75 - - - name: Generate sarif - working-directory: D:\b - run: npx @microsoft/sarif-multitool merge *.sarif --recurse --output-directory=${{ github.workspace }}\output --output-file=MergeResult.sarif --merge-runs && dir ${{ github.workspace }}\output - - - name: Upload SARIF to GitHub - uses: github/codeql-action/upload-sarif@v2 - continue-on-error: true - with: - sarif_file: ${{ github.workspace }}\output\MergeResult.sarif - category: VS_SCA - - # No python - Onnxruntime-SCA-win32-WINML-x64: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] - steps: - - uses: actions/checkout@v3 - with: - submodules: false - - uses: actions/setup-python@v4 - with: - python-version: '3.11.x' - architecture: 'x64' - - - uses: actions/setup-node@v3 - with: - node-version: 18 - - - name: Delete build folder - run: | - if (Test-Path D:\b) { Remove-Item -Recurse -Force D:\b } - &tools\ci_build\github\windows\install_third_party_deps.ps1 -cpu_arch x64 -install_prefix D:\b\Debug\installed -build_config Debug - - # The build machine doesn't have a GPU. So the value of CMAKE_CUDA_ARCHITECTURES doesn't matter. 
- - name: Build code - env: - CAExcludePath: 'C:\Program Files;D:\b;${{ github.workspace }}\cmake' - run: python tools\ci_build\build.py --build_java --compile_no_warning_as_error --config Debug --build_dir D:\b --skip_submodule_sync --build_csharp --update --build --parallel --cmake_generator "Visual Studio 17 2022" --build_shared_lib --cmake_extra_defines onnxruntime_USE_CUSTOM_STATIC_ANALYSIS_RULES=ON --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON --cmake_extra_defines onnxruntime_REDIRECT_STATIC_ANALYSIS_OUTPUTS_TO_FILE=ON --ms_experimental --use_dml --use_winml --disable_rtti --enable_wcos --build_shared_lib - - - name: Generate sarif - working-directory: D:\b - run: npx @microsoft/sarif-multitool merge *.sarif --recurse --output-directory=${{ github.workspace }}\output --output-file=MergeResult.sarif --merge-runs && dir ${{ github.workspace }}\output - - - name: Upload SARIF to GitHub - uses: github/codeql-action/upload-sarif@v2 - continue-on-error: true - with: - sarif_file: ${{ github.workspace }}\output\MergeResult.sarif - category: VS_SCA_WIN32_WINML_X64 - - # No java, No python - Onnxruntime-SCA-win32-WINML-x86: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] - steps: - - uses: actions/checkout@v3 - with: - submodules: false - - uses: actions/setup-python@v4 - with: - python-version: '3.11.x' - architecture: 'x86' - - - uses: actions/setup-node@v3 - with: - node-version: 18 - - - name: Delete build folder - run: | - if (Test-Path D:\b) { Remove-Item -Recurse -Force D:\b } - &tools\ci_build\github\windows\install_third_party_deps.ps1 -cpu_arch x86 -install_prefix D:\b\Debug\installed -build_config Debug - - # The build machine doesn't have a GPU. So the value of CMAKE_CUDA_ARCHITECTURES doesn't matter. - - name: Build code - env: - CAExcludePath: 'C:\Program Files;D:\b;${{ github.workspace }}\cmake' - run: python tools\ci_build\build.py --compile_no_warning_as_error --config Debug --build_dir D:\b --skip_submodule_sync --build_csharp --update --build --parallel --cmake_generator "Visual Studio 17 2022" --build_shared_lib --cmake_extra_defines onnxruntime_USE_CUSTOM_STATIC_ANALYSIS_RULES=ON --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON --cmake_extra_defines onnxruntime_REDIRECT_STATIC_ANALYSIS_OUTPUTS_TO_FILE=ON --ms_experimental --use_dml --use_winml --disable_rtti --enable_wcos --build_shared_lib - - - name: Generate sarif - working-directory: D:\b - run: npx @microsoft/sarif-multitool merge *.sarif --recurse --output-directory=${{ github.workspace }}\output --output-file=MergeResult.sarif --merge-runs && dir ${{ github.workspace }}\output - - - name: Upload SARIF to GitHub - uses: github/codeql-action/upload-sarif@v2 - continue-on-error: true - with: - sarif_file: ${{ github.workspace }}\output\MergeResult.sarif - category: VS_SCA_WIN32_WINML_X86 diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 8cd62db77b744..ba24e7eebfb03 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -18,7 +18,7 @@ jobs: Windows-CUDA-12: runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: false - uses: actions/setup-python@v4 @@ -46,7 +46,7 @@ jobs: Onnxruntime-TVM: runs-on: windows-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - uses: actions/setup-python@v4 diff --git a/cgmanifests/generate_cgmanifest.py b/cgmanifests/generate_cgmanifest.py index 
7d775996835da..a9eaacc6f2938 100644 --- a/cgmanifests/generate_cgmanifest.py +++ b/cgmanifests/generate_cgmanifest.py @@ -91,7 +91,7 @@ def add_github_dep(name, parsed_url): with open( - os.path.join(REPO_DIR, "tools", "ci_build", "github", "linux", "docker", "Dockerfile.manylinux2014_cuda11"), + os.path.join(REPO_DIR, "tools", "ci_build", "github", "linux", "docker", "Dockerfile.manylinux2_28_cuda11"), ) as f: for line in f: if not line.strip(): diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index 4eb061ffa1cbd..6f1ca84e1a304 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -112,7 +112,7 @@ "component": { "type": "git", "git": { - "commitHash": "8c0b94e793a66495e0b1f34a5eb26bd7dc672db0", + "commitHash": "29bf8085f3bf17b84d30e34b3d7ff8248fda404e", "repositoryUrl": "https://github.com/abseil/abseil-cpp.git" }, "comments": "abseil_cpp" @@ -132,7 +132,7 @@ "component": { "type": "git", "git": { - "commitHash": "e7e1482087f58913b80a20b04d5c58d9d6d90155", + "commitHash": "6e921e1b1d21e84a5c82416ba7ecd98e33a436d0", "repositoryUrl": "https://github.com/HowardHinnant/date.git" }, "comments": "date" @@ -192,7 +192,7 @@ "component": { "type": "git", "git": { - "commitHash": "436617053d0f39a1019a371c3a9aa599b3cb2cea", + "commitHash": "13de152c2a1cd73ff4df97bd2c406b6d15d34af3", "repositoryUrl": "https://github.com/google/nsync.git" }, "comments": "google_nsync" @@ -202,7 +202,7 @@ "component": { "type": "git", "git": { - "commitHash": "519beb0e52c842729b4b53731d27c0e0c32ab4a2", + "commitHash": "f8d7d77c06936315286eb55f8de22cd23c188571", "repositoryUrl": "https://github.com/google/googletest.git" }, "comments": "googletest" @@ -242,7 +242,7 @@ "component": { "type": "git", "git": { - "commitHash": "5f4caba4e7a9017816e47becdd918fcc872039ba", + "commitHash": "55f373e164d3f092dd6c7a56e3de6f90c4c6f3dc", "repositoryUrl": "https://github.com/microsoft/wil.git" }, "comments": "microsoft_wil" @@ -262,7 +262,7 @@ "component": { "type": "git", "git": { - "commitHash": "7bc4e1ae9b36ec8ee635c3629b59ec525bbe82b9", + "commitHash": "0a0b5fb001ce0233ae3a6f99d849c0649e5a7361", "repositoryUrl": "https://github.com/boostorg/mp11.git" }, "comments": "mp11" @@ -322,7 +322,7 @@ "component": { "type": "git", "git": { - "commitHash": "5916273f79a21551890fd3d56fc5375a78d1598d", + "commitHash": "959002f82d7962a473d8bf301845f2af720e0aa4", "repositoryUrl": "https://github.com/pytorch/cpuinfo.git" }, "comments": "pytorch_cpuinfo" @@ -342,7 +342,7 @@ "component": { "type": "git", "git": { - "commitHash": "ff15c6ada150a5018c5ef2172401cb4529eac9c0", + "commitHash": "4cafc9196c4da9c817992b20f5253ef967685bf8", "repositoryUrl": "https://github.com/dcleblanc/SafeInt.git" }, "comments": "safeint" @@ -368,6 +368,16 @@ "comments": "cutlass" } }, + { + "component": { + "type": "git", + "git": { + "commitHash": "72c943dea2b9240cd09efde15191e144bc7c7d38", + "repositoryUrl": "https://github.com/protocolbuffers/utf8_range.git" + }, + "comments": "utf8_range" + } + }, { "component": { "type": "git", diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 82a454791d159..496ca72bb1b6c 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -54,8 +54,8 @@ if (NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose build type: Debug Release RelWithDebInfo MinSizeRel." 
FORCE) endif() -if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 9) - message(FATAL_ERROR "GCC version must be greater than or equal to 9") +if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 8) + message(FATAL_ERROR "GCC version must be greater than or equal to 8") endif() # Options @@ -554,6 +554,8 @@ if (WIN32) list(APPEND ORT_WARNING_FLAGS "/wd4251") # issued by thrust nonstandard extension used: nameless struct/union list(APPEND ORT_WARNING_FLAGS "/wd4201") + # structure was padded due to __declspec(align()) + list(APPEND ORT_WARNING_FLAGS "/wd4324") # warning C4800: Implicit conversion from 'X' to bool. Possible information loss if (onnxruntime_USE_OPENVINO) list(APPEND ORT_WARNING_FLAGS "/wd4800") @@ -1336,9 +1338,11 @@ set(ORT_BUILD_INFO "ORT Build Info: ") find_package(Git) if (Git_FOUND) execute_process(COMMAND ${GIT_EXECUTABLE} log -1 --format=%h + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} OUTPUT_VARIABLE ORT_GIT_COMMIT) string(STRIP "${ORT_GIT_COMMIT}" ORT_GIT_COMMIT) execute_process(COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref HEAD + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} OUTPUT_VARIABLE ORT_GIT_BRANCH) string(STRIP "${ORT_GIT_BRANCH}" ORT_GIT_BRANCH) string(APPEND ORT_BUILD_INFO "git-branch=${ORT_GIT_BRANCH}, git-commit-id=${ORT_GIT_COMMIT}, ") diff --git a/cmake/adjust_global_compile_flags.cmake b/cmake/adjust_global_compile_flags.cmake index 68522a7dda7ea..e825bfeaea952 100644 --- a/cmake/adjust_global_compile_flags.cmake +++ b/cmake/adjust_global_compile_flags.cmake @@ -319,6 +319,11 @@ else() string(APPEND CMAKE_CXX_FLAGS " -g -O0 --coverage ") string(APPEND CMAKE_C_FLAGS " -g -O0 --coverage ") endif() + if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU") + # suppress warnings from flatbuffers + string(APPEND CMAKE_CXX_FLAGS " -Wno-restrict ") + string(APPEND CMAKE_C_FLAGS " -Wno-restrict ") + endif() # Check support for AVX and f16c. include(CheckCXXCompilerFlag) check_cxx_compiler_flag("-mf16c" COMPILER_SUPPORT_MF16C) diff --git a/cmake/deps.txt b/cmake/deps.txt index 0ab283698bc97..279b5ca649dba 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -5,24 +5,25 @@ #URLs can be either https URLs or local file paths in cmake-style(directory separator is a forward slash character). #SHA1 hashes can be generated by running sha1sum command. #If you need to change abseil's version to a different one, you may also want to update external\abseil-cpp.natvis -#since the file contains a version string: "lts_20220623". However, the file is for debugging purposes only and would +#since the file contains a version string: "lts_20230802". However, the file is for debugging purposes only and would #not affect built binaries. 
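The cmake/deps.txt entries below follow a name;URL;SHA1 layout, and the header above notes that the SHA1 column is produced with sha1sum. As a rough sketch of how an entry such as the new abseil pin might be verified (the temporary download path and the use of curl are illustrative assumptions, not part of this change):

```bash
# Download the pinned archive and print the SHA1 that belongs in the third
# field of cmake/deps.txt; it should match the hash recorded for abseil_cpp.
URL="https://github.com/abseil/abseil-cpp/archive/refs/tags/20230802.0.zip"
curl -sSL -o /tmp/dep.zip "$URL"
sha1sum /tmp/dep.zip   # expected: 04271dfbfac59269b6939e1e9d5faf0d18a7ba91
```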
-abseil_cpp;https://github.com/abseil/abseil-cpp/archive/refs/tags/20220623.1.zip;50c137c88965cba015dfcc8fd5d9b46d23146751 +abseil_cpp;https://github.com/abseil/abseil-cpp/archive/refs/tags/20230802.0.zip;04271dfbfac59269b6939e1e9d5faf0d18a7ba91 cxxopts;https://github.com/jarro2783/cxxopts/archive/3c73d91c0b04e2b59462f0a741be8c07024c1bc0.zip;6c6ca7f8480b26c8d00476e0e24b7184717fe4f0 -date;https://github.com/HowardHinnant/date/archive/refs/tags/v2.4.1.zip;ea99f021262b1d804a872735c658860a6a13cc98 +date;https://github.com/HowardHinnant/date/archive/refs/tags/v3.0.1.zip;2dac0c81dc54ebdd8f8d073a75c053b04b56e159 dlpack;https://github.com/dmlc/dlpack/archive/refs/tags/v0.6.zip;4d565dd2e5b31321e5549591d78aa7f377173445 +eigen;https://gitlab.com/libeigen/eigen/-/archive/3.4/eigen-3.4.zip;ee201b07085203ea7bd8eb97cbcb31b07cfa3efb flatbuffers;https://github.com/google/flatbuffers/archive/refs/tags/v1.12.0.zip;ba0a75fd12dbef8f6557a74e611b7a3d0c5fe7bf fp16;https://github.com/Maratyszcza/FP16/archive/0a92994d729ff76a58f692d3028ca1b64b145d91.zip;b985f6985a05a1c03ff1bb71190f66d8f98a1494 fxdiv;https://github.com/Maratyszcza/FXdiv/archive/63058eff77e11aa15bf531df5dd34395ec3017c8.zip;a5658f4036402dbca7cebee32be57fb8149811e1 google_benchmark;https://github.com/google/benchmark/archive/refs/tags/v1.7.0.zip;e97c368b176e8614e3f1bf13dd9abcf6a7ad9908 -google_nsync;https://github.com/google/nsync/archive/refs/tags/1.23.0.zip;f3233450cf7156fc0bedd1b0e884eddec264897c -googletest;https://github.com/google/googletest/archive/519beb0e52c842729b4b53731d27c0e0c32ab4a2.zip;4b3c37972e4c1bef1185d46f702082f8772ee73f +google_nsync;https://github.com/google/nsync/archive/refs/tags/1.26.0.zip;5e7c00ef6bf5b787386fc040067903ec774e2752 +googletest;https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip;0ac421f2ec11af38b0fff0f1992184032731a8bc googlexnnpack;https://github.com/google/XNNPACK/archive/003c580e696a774afdc984996ee909b7c8d8128c.zip;9f192e3f15e1e37ae9c78d53eeea47e45c5eb31c json;https://github.com/nlohmann/json/archive/refs/tags/v3.10.5.zip;f257f8dc27c5b8c085dc887b40cddd18ae1f725c microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf368104cd22a87b4dd0c80228919bb2df3e2a14 -microsoft_wil;https://github.com/microsoft/wil/archive/5f4caba4e7a9017816e47becdd918fcc872039ba.zip;fd119887d0d17c37adf1fc227b054befa28158ad +microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5 mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41 -mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.79.0.zip;c8f04e378535ededbe5af52c8f969d2dedbe73d5 +mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 onnx;https://github.com/onnx/onnx/archive/e2525550194ce3d8a2c4a3af451c9d9b3ae6650e.zip;782f23d788185887f520a90535513e244218e928 #use the commit of supporting all the plugins and TRT 8.6-GA (https://github.com/onnx/onnx-tensorrt/commit/0462dc31ae78f48744b6141ae376df1f96d3f459) onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/0462dc31ae78f48744b6141ae376df1f96d3f459.zip;5ff086361956cceb81ed17453a1fd8db2aa4328d @@ -36,11 +37,11 @@ protoc_mac_universal;https://github.com/protocolbuffers/protobuf/releases/downlo psimd;https://github.com/Maratyszcza/psimd/archive/072586a71b55b7f8c584153d223e95687148a900.zip;1f5454b01f06f9656b77e4a5e2e31d7422487013 
pthreadpool;https://github.com/Maratyszcza/pthreadpool/archive/1787867f6183f056420e532eec640cba25efafea.zip;e43e80781560c5ab404a4da20f34d846f5f5d101 pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.10.1.zip;769b6aa67a77f17a770960f604b727645b6f6a13 -pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/5916273f79a21551890fd3d56fc5375a78d1598d.zip;2be4d2ae321fada97cb39eaf4eeba5f8c85597cf +pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/959002f82d7962a473d8bf301845f2af720e0aa4.zip;85da3caa60eb2b148613b443fbc2bfdc30689965 re2;https://github.com/google/re2/archive/refs/tags/2022-06-01.zip;aa77313b76e91b531ee7f3e45f004c6a502a5374 -safeint;https://github.com/dcleblanc/SafeInt/archive/ff15c6ada150a5018c5ef2172401cb4529eac9c0.zip;913a4046e5274d329af2806cb53194f617d8c0ab +safeint;https://github.com/dcleblanc/SafeInt/archive/refs/tags/3.0.28.zip;23f252040ff6cb9f1fd18575b32fa8fb5928daac tensorboard;https://github.com/tensorflow/tensorboard/archive/373eb09e4c5d2b3cc2493f0949dc4be6b6a45e81.zip;67b833913605a4f3f499894ab11528a702c2b381 cutlass;https://github.com/NVIDIA/cutlass/archive/refs/tags/v3.0.0.zip;0f95b3c1fc1bd1175c4a90b2c9e39074d1bccefd +utf8_range;https://github.com/protocolbuffers/utf8_range/archive/72c943dea2b9240cd09efde15191e144bc7c7d38.zip;9925739c9debc0efa2adcb194d371a35b6a03156 extensions;https://github.com/microsoft/onnxruntime-extensions/archive/94142d8391c9791ec71c38336436319a2d4ac7a0.zip;4365ac5140338b4cb75a39944a4be276e3829b3c -eigen;https://gitlab.com/libeigen/eigen/-/archive/3.4/eigen-3.4.zip;ee201b07085203ea7bd8eb97cbcb31b07cfa3efb -composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/d52ec01652b7d620386251db92455968d8d90bdc.zip;6b5ce8edf3625f8817086c194fbf94b664e1b0e0 +composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/d52ec01652b7d620386251db92455968d8d90bdc.zip;6b5ce8edf3625f8817086c194fbf94b664e1b0e0 \ No newline at end of file diff --git a/cmake/external/abseil-cpp.cmake b/cmake/external/abseil-cpp.cmake index 54d2f9c5c19df..3bcd4109e2888 100644 --- a/cmake/external/abseil-cpp.cmake +++ b/cmake/external/abseil-cpp.cmake @@ -6,15 +6,20 @@ include(FetchContent) # Pass to build set(ABSL_PROPAGATE_CXX_STD 1) set(BUILD_TESTING 0) - +set(ABSL_BUILD_TESTING OFF) +set(ABSL_BUILD_TEST_HELPERS OFF) +set(ABSL_USE_EXTERNAL_GOOGLETEST ON) if(Patch_FOUND AND WIN32) set(ABSL_PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/abseil/absl_windows.patch) else() set(ABSL_PATCH_COMMAND "") endif() - +if(WIN32 AND NOT Patch_FOUND) + #see https://github.com/google/re2/issues/425 and https://github.com/google/re2/issues/436 + set(ABSL_ENABLE_INSTALL ON) +endif() # NB! Advancing Abseil version changes its internal namespace, -# currently absl::lts_20211102 which affects abseil-cpp.natvis debugger +# currently absl::lts_20230125 which affects abseil-cpp.natvis debugger # visualization file, that must be adjusted accordingly, unless we eliminate # that namespace at build time. 
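Since the comment above warns that advancing Abseil changes its inline namespace and that abseil-cpp.natvis has to be kept in sync, a quick consistency check can be handy. This is only a sketch under assumptions: the _deps path reflects FetchContent's default layout, and the build directory name is a placeholder rather than something this patch prescribes.

```bash
# Compare the lts_* namespace referenced by the debugger visualizers with the
# one defined by the fetched Abseil sources (adjust BUILD_DIR to your build output).
BUILD_DIR=build/Linux/Release
grep -o 'lts_[0-9]*' cmake/external/abseil-cpp.natvis | sort -u
grep 'ABSL_OPTION_INLINE_NAMESPACE_NAME' "$BUILD_DIR/_deps/abseil_cpp-src/absl/base/options.h"
```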
FetchContent_Declare( @@ -22,6 +27,7 @@ FetchContent_Declare( URL ${DEP_URL_abseil_cpp} URL_HASH SHA1=${DEP_SHA1_abseil_cpp} PATCH_COMMAND ${ABSL_PATCH_COMMAND} + FIND_PACKAGE_ARGS NAMES absl ) onnxruntime_fetchcontent_makeavailable(abseil_cpp) @@ -37,8 +43,26 @@ if (GDK_PLATFORM) target_compile_definitions(absl_symbolize PRIVATE WINAPI_FAMILY=WINAPI_FAMILY_DESKTOP_APP) endif() -if(NOT onnxruntime_DISABLE_ABSEIL) - set(ABSEIL_LIBS absl::inlined_vector absl::flat_hash_set - absl::flat_hash_map absl::node_hash_set absl::node_hash_map absl::base absl::throw_delegate absl::raw_hash_set - absl::hash absl::city absl::low_level_hash absl::raw_logging_internal) -endif() \ No newline at end of file +# TODO: since multiple ORT's dependencies depend on Abseil, the list below would vary from version to version. +# We'd better to not manually manage the list. +set(ABSEIL_LIBS absl::base +absl::city +absl::core_headers +absl::fixed_array +absl::flags +absl::flat_hash_map +absl::flat_hash_set +absl::hash +absl::inlined_vector +absl::low_level_hash +absl::node_hash_map +absl::node_hash_set +absl::optional +absl::raw_hash_set +absl::raw_logging_internal +absl::span +absl::str_format +absl::strings +absl::synchronization +absl::throw_delegate +absl::time) diff --git a/cmake/external/abseil-cpp.natvis b/cmake/external/abseil-cpp.natvis index e0294ba6f7b55..e923d5862ec2e 100644 --- a/cmake/external/abseil-cpp.natvis +++ b/cmake/external/abseil-cpp.natvis @@ -1,6 +1,6 @@ - + @@ -24,7 +24,7 @@ - + empty {{ size={size_} }} @@ -44,7 +44,7 @@ - + {{ {value.first}:{value.second} }} value.first diff --git a/cmake/external/extensions.cmake b/cmake/external/extensions.cmake index 5039929062445..68796ad02d982 100644 --- a/cmake/external/extensions.cmake +++ b/cmake/external/extensions.cmake @@ -22,7 +22,8 @@ if (onnxruntime_REDUCED_OPS_BUILD) endif() if (onnxruntime_WEBASSEMBLY_DEFAULT_EXTENSION_FLAGS) - set(OCOS_ENABLE_SPM_TOKENIZER ON CACHE INTERNAL "") + #The generated protobuf files in ORT-extension needs be updated to work with the current protobuf version ORT is using. 
+ set(OCOS_ENABLE_SPM_TOKENIZER OFF CACHE INTERNAL "") set(OCOS_ENABLE_GPT2_TOKENIZER ON CACHE INTERNAL "") set(OCOS_ENABLE_WORDPIECE_TOKENIZER ON CACHE INTERNAL "") set(OCOS_ENABLE_BERT_TOKENIZER ON CACHE INTERNAL "") @@ -54,9 +55,11 @@ endif() target_include_directories(ocos_operators PRIVATE ${RE2_INCLUDE_DIR} ${json_SOURCE_DIR}/include) target_include_directories(ortcustomops PUBLIC ${onnxruntime_EXTENSIONS_PATH}/includes) if(OCOS_ENABLE_SPM_TOKENIZER) - onnxruntime_add_include_to_target(sentencepiece-static ${PROTOBUF_LIB}) + onnxruntime_add_include_to_target(sentencepiece-static ${PROTOBUF_LIB} ${ABSEIL_LIBS}) endif() -onnxruntime_add_include_to_target(ocos_operators ${PROTOBUF_LIB}) +onnxruntime_add_include_to_target(ocos_operators ${PROTOBUF_LIB} ${ABSEIL_LIBS}) +onnxruntime_add_include_to_target(noexcep_operators ${PROTOBUF_LIB} ${ABSEIL_LIBS}) + add_dependencies(ocos_operators ${onnxruntime_EXTERNAL_DEPENDENCIES}) add_dependencies(ortcustomops ${onnxruntime_EXTERNAL_DEPENDENCIES}) diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index c087ad8f6d81e..e1671bcf43ed9 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -19,11 +19,10 @@ endforeach() message("Loading Dependencies ...") # ABSL should be included before protobuf because protobuf may use absl -if(NOT onnxruntime_DISABLE_ABSEIL) - include(external/abseil-cpp.cmake) -endif() +include(external/abseil-cpp.cmake) set(RE2_BUILD_TESTING OFF CACHE BOOL "" FORCE) + FetchContent_Declare( re2 URL ${DEP_URL_re2} @@ -38,16 +37,14 @@ if (onnxruntime_BUILD_UNIT_TESTS) set(gtest_disable_pthreads ON) endif() set(INSTALL_GTEST OFF CACHE BOOL "" FORCE) - if(NOT onnxruntime_DISABLE_ABSEIL) - # It uses both ABSL and re2 - set(GTEST_HAS_ABSL OFF CACHE BOOL "" FORCE) - endif() + # Set it to ON will cause crashes in onnxruntime_test_all when onnxruntime_USE_CUDA is ON + set(GTEST_HAS_ABSL OFF CACHE BOOL "" FORCE) # gtest and gmock FetchContent_Declare( googletest URL ${DEP_URL_googletest} + FIND_PACKAGE_ARGS 1.14.0...<2.0.0 NAMES GTest URL_HASH SHA1=${DEP_SHA1_googletest} - OVERRIDE_FIND_PACKAGE ) endif() @@ -161,6 +158,19 @@ if(Patch_FOUND) else() set(ONNXRUNTIME_PROTOBUF_PATCH_COMMAND "") endif() + +FetchContent_Declare( + utf8_range + URL ${DEP_URL_utf8_range} + URL_HASH SHA1=${DEP_SHA1_utf8_range} + FIND_PACKAGE_ARGS NAMES utf8_range +) + +set(utf8_range_ENABLE_TESTS OFF CACHE BOOL "Build test suite" FORCE) +set(utf8_range_ENABLE_INSTALL OFF CACHE BOOL "Configure installation" FORCE) + + +#Protobuf depends on absl and utf8_range FetchContent_Declare( Protobuf URL ${DEP_URL_protobuf} @@ -168,7 +178,15 @@ FetchContent_Declare( PATCH_COMMAND ${ONNXRUNTIME_PROTOBUF_PATCH_COMMAND} FIND_PACKAGE_ARGS 3.21.12 NAMES Protobuf ) + set(protobuf_BUILD_TESTS OFF CACHE BOOL "Build protobuf tests" FORCE) +#TODO: we'd better to turn the following option off. However, it will cause +# ".\build.bat --config Debug --parallel --skip_submodule_sync --update" fail with an error message: +# install(EXPORT "ONNXTargets" ...) includes target "onnx_proto" which requires target "libprotobuf-lite" that is +# not in any export set. 
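Several FetchContent_Declare calls touched in this change (abseil, googletest, date, utf8_range, protobuf) now pass FIND_PACKAGE_ARGS, which lets CMake satisfy the dependency from an already-installed package instead of building the downloaded sources. A minimal sketch of how that might be used, assuming a hypothetical install prefix /opt/ort-deps that is not part of this change:

```bash
# Point CMake at an existing install tree so the FIND_PACKAGE_ARGS declarations
# can resolve absl/GTest/date/utf8_range/Protobuf via find_package instead of
# rebuilding them from the fetched archives.
./build.sh --config Release --parallel --skip_submodule_sync \
  --cmake_extra_defines "CMAKE_PREFIX_PATH=/opt/ort-deps"
```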
+#set(protobuf_INSTALL OFF CACHE BOOL "Install protobuf binaries and files" FORCE) +set(protobuf_USE_EXTERNAL_GTEST ON CACHE BOOL "" FORCE) + if (CMAKE_SYSTEM_NAME STREQUAL "Android") set(protobuf_BUILD_PROTOC_BINARIES OFF CACHE BOOL "Build protobuf tests" FORCE) set(protobuf_WITH_ZLIB OFF CACHE BOOL "Build with zlib support" FORCE) @@ -184,14 +202,13 @@ set(ENABLE_DATE_TESTING OFF CACHE BOOL "" FORCE) set(USE_SYSTEM_TZ_DB ON CACHE BOOL "" FORCE) FetchContent_Declare( - date - URL ${DEP_URL_date} - URL_HASH SHA1=${DEP_SHA1_date} - ) + date + URL ${DEP_URL_date} + URL_HASH SHA1=${DEP_SHA1_date} + FIND_PACKAGE_ARGS 3...<4 NAMES date +) onnxruntime_fetchcontent_makeavailable(date) - - FetchContent_Declare( mp11 URL ${DEP_URL_mp11} @@ -252,6 +269,20 @@ else() set(CPUINFO_SUPPORTED FALSE) endif() +# xnnpack depends on clog +# Android build should use the system's log library instead of clog +if ((CPUINFO_SUPPORTED OR onnxruntime_USE_XNNPACK) AND NOT ANDROID) + set(CLOG_BUILD_TESTS OFF CACHE BOOL "" FORCE) + FetchContent_Declare( + pytorch_clog + URL ${DEP_URL_pytorch_cpuinfo} + URL_HASH SHA1=${DEP_SHA1_pytorch_cpuinfo} + SOURCE_SUBDIR deps/clog + ) + set(ONNXRUNTIME_CLOG_PROJ pytorch_clog) + set(ONNXRUNTIME_CLOG_TARGET_NAME clog) +endif() + if (CPUINFO_SUPPORTED) if (CMAKE_SYSTEM_NAME STREQUAL "iOS") set(IOS ON CACHE INTERNAL "") @@ -276,7 +307,7 @@ if (CPUINFO_SUPPORTED) URL_HASH SHA1=${DEP_SHA1_pytorch_cpuinfo} FIND_PACKAGE_ARGS NAMES cpuinfo ) - + set(ONNXRUNTIME_CPUINFO_PROJ pytorch_cpuinfo) endif() @@ -316,8 +347,14 @@ FetchContent_Declare( URL_HASH SHA1=${DEP_SHA1_safeint} ) +# use fetch content rather than makeavailable because safeint only includes unconditional test targets +FetchContent_Populate(safeint) # The next line will generate an error message "fatal: not a git repository", but it is ok. 
It is from flatbuffers -onnxruntime_fetchcontent_makeavailable(Protobuf nlohmann_json mp11 re2 safeint GSL flatbuffers) +onnxruntime_fetchcontent_makeavailable(utf8_range) +# protobuf's cmake/utf8_range.cmake has the following line +include_directories(${utf8_range_SOURCE_DIR}) + +onnxruntime_fetchcontent_makeavailable(Protobuf nlohmann_json mp11 re2 GSL flatbuffers ${ONNXRUNTIME_CPUINFO_PROJ} ${ONNXRUNTIME_CLOG_PROJ}) if(NOT flatbuffers_FOUND) if(NOT TARGET flatbuffers::flatbuffers) add_library(flatbuffers::flatbuffers ALIAS flatbuffers) @@ -413,15 +450,7 @@ FetchContent_Declare( ) -if (CPUINFO_SUPPORTED) - onnxruntime_fetchcontent_makeavailable(pytorch_cpuinfo) - if (pytorch_cpuinfo_SOURCE_DIR) - # shouldn't need to define these aliases after we use a version of cpuinfo with this commit: - # https://github.com/pytorch/cpuinfo/commit/082deffc80ce517f81dc2f3aebe6ba671fcd09c9 - add_library(cpuinfo::cpuinfo ALIAS cpuinfo) - add_library(cpuinfo::clog ALIAS clog) - endif() -endif() + @@ -462,7 +491,7 @@ endif() #onnxruntime_EXTERNAL_LIBRARIES could contain onnx, onnx_proto,libprotobuf, cuda/cudnn, # dnnl/mklml, onnxruntime_codegen_tvm, tvm and pthread # pthread is always at the last -set(onnxruntime_EXTERNAL_LIBRARIES ${onnxruntime_EXTERNAL_LIBRARIES_XNNPACK} ${WIL_TARGET} nlohmann_json::nlohmann_json onnx onnx_proto ${PROTOBUF_LIB} re2::re2 Boost::mp11 safeint_interface flatbuffers::flatbuffers ${GSL_TARGET} ${ABSEIL_LIBS} date_interface) +set(onnxruntime_EXTERNAL_LIBRARIES ${onnxruntime_EXTERNAL_LIBRARIES_XNNPACK} ${WIL_TARGET} nlohmann_json::nlohmann_json onnx onnx_proto ${PROTOBUF_LIB} re2::re2 Boost::mp11 safeint_interface flatbuffers::flatbuffers ${GSL_TARGET} ${ABSEIL_LIBS} date::date ${ONNXRUNTIME_CLOG_TARGET_NAME}) # The source code of onnx_proto is generated, we must build this lib first before starting to compile the other source code that uses ONNX protobuf types. # The other libs do not have the problem. All the sources are already there. We can compile them in any order. set(onnxruntime_EXTERNAL_DEPENDENCIES onnx_proto flatbuffers::flatbuffers) diff --git a/cmake/onnxruntime_common.cmake b/cmake/onnxruntime_common.cmake index 687a231e051e6..43d5fa9bdee34 100644 --- a/cmake/onnxruntime_common.cmake +++ b/cmake/onnxruntime_common.cmake @@ -121,7 +121,7 @@ if (MSVC) endif() endif() -onnxruntime_add_include_to_target(onnxruntime_common date_interface ${WIL_TARGET}) +onnxruntime_add_include_to_target(onnxruntime_common date::date ${WIL_TARGET}) target_include_directories(onnxruntime_common PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT} ${eigen_INCLUDE_DIRS} # propagate include directories of dependencies that are part of public interface @@ -208,7 +208,7 @@ if (ARM64 OR ARM OR X86 OR X64 OR X86_64) # Its functionality in detecting x86 cpu features are lacking, so is support for Windows. 
if (CPUINFO_SUPPORTED) onnxruntime_add_include_to_target(onnxruntime_common cpuinfo::cpuinfo) - list(APPEND onnxruntime_EXTERNAL_LIBRARIES cpuinfo::cpuinfo cpuinfo::clog) + list(APPEND onnxruntime_EXTERNAL_LIBRARIES cpuinfo::cpuinfo ${ONNXRUNTIME_CLOG_TARGET_NAME}) endif() endif() endif() diff --git a/cmake/onnxruntime_graph.cmake b/cmake/onnxruntime_graph.cmake index 18104f361c401..735c86956ec4f 100644 --- a/cmake/onnxruntime_graph.cmake +++ b/cmake/onnxruntime_graph.cmake @@ -98,7 +98,7 @@ if (MSVC) endif() if(NOT MSVC) - target_compile_options(onnxruntime_graph PRIVATE "-Wno-parentheses") + target_compile_options(onnxruntime_graph PRIVATE "-Wno-parentheses" "-Wno-deprecated-declarations") endif() if (onnxruntime_ENABLE_TRAINING) #TODO: the graph library should focus on ONNX IR, it shouldn't depend on math libraries like MKLML/OpenBlas diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake index 19075128476aa..b9e7873132089 100644 --- a/cmake/onnxruntime_providers.cmake +++ b/cmake/onnxruntime_providers.cmake @@ -729,6 +729,7 @@ if (onnxruntime_USE_TENSORRT) onnxruntime_fetchcontent_makeavailable(onnx_tensorrt) include_directories(${onnx_tensorrt_SOURCE_DIR}) set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS}) + set(CUDA_INCLUDE_DIR ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # onnx-tensorrt repo needs this variable to build if ( CMAKE_COMPILER_IS_GNUCC ) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter") endif() @@ -1696,6 +1697,8 @@ if (onnxruntime_USE_ROCM) device_gemm_instance device_gemm_add_fastgelu_instance device_gemm_fastgelu_instance + device_gemm_splitk_instance + device_gemm_streamk_instance device_batched_gemm_instance device_softmax_instance ) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 3b9727ec08970..ec83eb2095071 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -23,7 +23,6 @@ function(AddTest) else() onnxruntime_add_executable(${_UT_TARGET} ${_UT_SOURCES}) endif() - if (_UT_DEPENDS) list(REMOVE_DUPLICATES _UT_DEPENDS) endif(_UT_DEPENDS) @@ -40,6 +39,9 @@ function(AddTest) endif() if (MSVC) target_compile_options(${_UT_TARGET} PRIVATE "/wd6330") + #Abseil has a lot of C4127/C4324 warnings. 
+ target_compile_options(${_UT_TARGET} PRIVATE "/wd4127") + target_compile_options(${_UT_TARGET} PRIVATE "/wd4324") endif() set_target_properties(${_UT_TARGET} PROPERTIES FOLDER "ONNXRuntimeTest") @@ -61,7 +63,7 @@ function(AddTest) target_link_libraries(${_UT_TARGET} PRIVATE ${_UT_LIBS} GTest::gtest GTest::gmock ${onnxruntime_EXTERNAL_LIBRARIES}) endif() - onnxruntime_add_include_to_target(${_UT_TARGET} date_interface flatbuffers::flatbuffers) + onnxruntime_add_include_to_target(${_UT_TARGET} date::date flatbuffers::flatbuffers) target_include_directories(${_UT_TARGET} PRIVATE ${TEST_INC_DIR}) if (onnxruntime_USE_CUDA) target_include_directories(${_UT_TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${onnxruntime_CUDNN_HOME}/include) @@ -147,7 +149,7 @@ function(AddTest) else() target_link_libraries(${_UT_TARGET}_xc PRIVATE ${_UT_LIBS} GTest::gtest GTest::gmock ${onnxruntime_EXTERNAL_LIBRARIES}) endif() - onnxruntime_add_include_to_target(${_UT_TARGET}_xc date_interface flatbuffers::flatbuffers) + onnxruntime_add_include_to_target(${_UT_TARGET}_xc date::date flatbuffers::flatbuffers) target_include_directories(${_UT_TARGET}_xc PRIVATE ${TEST_INC_DIR}) get_target_property(${_UT_TARGET}_DEFS ${_UT_TARGET} COMPILE_DEFINITIONS) target_compile_definitions(${_UT_TARGET}_xc PRIVATE ${_UT_TARGET}_DEFS) @@ -202,11 +204,15 @@ function(AddTest) WORKING_DIRECTORY $ ) endif() + # Set test timeout to 3 hours. + set_tests_properties(${_UT_TARGET} PROPERTIES TIMEOUT 7200) else() add_test(NAME ${_UT_TARGET} COMMAND ${_UT_TARGET} ${TEST_ARGS} WORKING_DIRECTORY $ ) + # Set test timeout to 3 hours. + set_tests_properties(${_UT_TARGET} PROPERTIES TIMEOUT 7200) endif() endif() endfunction(AddTest) diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake index d7712a7b70c98..c6510c97a617e 100644 --- a/cmake/onnxruntime_webassembly.cmake +++ b/cmake/onnxruntime_webassembly.cmake @@ -236,6 +236,7 @@ else() "SHELL:-s ASYNCIFY=1" "SHELL:-s ASYNCIFY_STACK_SIZE=65536" ) + set_target_properties(onnxruntime_webassembly PROPERTIES LINK_DEPENDS ${ONNXRUNTIME_ROOT}/wasm/js_internal_api.js) endif() if (onnxruntime_EMSCRIPTEN_SETTINGS) @@ -283,23 +284,23 @@ else() ) endif() - set(target_name ort) + set(target_name_list ort) if (onnxruntime_ENABLE_TRAINING_APIS) - list(APPEND target_name "training") + list(APPEND target_name_list "training") endif() - list(APPEND target_name "wasm") + list(APPEND target_name_list "wasm") if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD) - list(APPEND target_name "simd") + list(APPEND target_name_list "simd") endif() if (onnxruntime_ENABLE_WEBASSEMBLY_THREADS) - list(APPEND target_name "threaded") + list(APPEND target_name_list "threaded") endif() - list(JOIN target_name "-" target_name) + list(JOIN target_name_list "-" target_name) set_target_properties(onnxruntime_webassembly PROPERTIES OUTPUT_NAME ${target_name}) endif() diff --git a/cmake/winml_unittests.cmake b/cmake/winml_unittests.cmake index e0c0eeb4ffa05..b655e60a8aec9 100644 --- a/cmake/winml_unittests.cmake +++ b/cmake/winml_unittests.cmake @@ -49,7 +49,10 @@ function(add_winml_test) if (_UT_DEPENDS) add_dependencies(${_UT_TARGET} ${_UT_DEPENDS}) endif() - target_link_libraries(${_UT_TARGET} PRIVATE ${_UT_LIBS} gtest winml_google_test_lib ${onnxruntime_EXTERNAL_LIBRARIES} winml_lib_common onnxruntime windowsapp.lib) + target_link_libraries(${_UT_TARGET} PRIVATE ${_UT_LIBS} GTest::gtest winml_google_test_lib ${onnxruntime_EXTERNAL_LIBRARIES} winml_lib_common onnxruntime windowsapp.lib) + #Abseil has a 
lot of C4127/C4324 warnings. + target_compile_options(${_UT_TARGET} PRIVATE "/wd4127") + target_compile_options(${_UT_TARGET} PRIVATE "/wd4324") target_compile_options(${_UT_TARGET} PRIVATE /wd5205) # workaround cppwinrt SDK bug https://github.com/microsoft/cppwinrt/issues/584 # if building inbox @@ -174,15 +177,18 @@ target_compile_options(winml_test_common PRIVATE /wd5205) # workaround cppwinrt if (onnxruntime_WINML_NAMESPACE_OVERRIDE STREQUAL "Windows") target_compile_definitions(winml_test_common PRIVATE "BUILD_INBOX=1") endif() +#Abseil has a lot of C4127/C4324 warnings. +target_compile_options(winml_test_common PRIVATE "/wd4127") +target_compile_options(winml_test_common PRIVATE "/wd4324") add_dependencies(winml_test_common onnx winml_api winml_dll ) -onnxruntime_add_include_to_target(winml_test_common onnx_proto gtest ${PROTOBUF_LIB} ${WIL_TARGET} safeint_interface ${GSL_TARGET}) +onnxruntime_add_include_to_target(winml_test_common onnx_proto GTest::gtest ${PROTOBUF_LIB} ${WIL_TARGET} safeint_interface ${GSL_TARGET}) onnxruntime_add_static_library(winml_google_test_lib ${WINML_TEST_SRC_DIR}/common/googletest/main.cpp) -onnxruntime_add_include_to_target(winml_google_test_lib gtest) +onnxruntime_add_include_to_target(winml_google_test_lib GTest::gtest) set_winml_target_properties(winml_google_test_lib) set_winml_target_properties(winml_test_common) diff --git a/dockerfiles/Dockerfile.arm32v7 b/dockerfiles/Dockerfile.arm32v7 deleted file mode 100644 index 285f790598061..0000000000000 --- a/dockerfiles/Dockerfile.arm32v7 +++ /dev/null @@ -1,17 +0,0 @@ -# -------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------- -# Dockerfile to run ONNXRuntime with source build for CPU - -FROM arm32v7/fedora:34 -MAINTAINER Changming Sun "chasun@microsoft.com" -ADD . /code - -RUN /code/dockerfiles/scripts/install_fedora_arm32.sh -RUN cd /code && ./build.sh --allow_running_as_root --skip_submodule_sync --config Release --build_wheel --update --build --parallel --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) - -FROM arm64v8/centos:7 -COPY --from=0 /code/build/Linux/Release/dist /root -COPY --from=0 /code/dockerfiles/LICENSE-IMAGE.txt /code/LICENSE-IMAGE.txt -RUN yum install -y python3-wheel python3-pip && python3 -m pip install --upgrade pip && python3 -m pip install /root/*.whl && rm -rf /root/*.whl diff --git a/dockerfiles/Dockerfile.arm64 b/dockerfiles/Dockerfile.arm64 deleted file mode 100644 index 06ce9c1e38040..0000000000000 --- a/dockerfiles/Dockerfile.arm64 +++ /dev/null @@ -1,17 +0,0 @@ -# -------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------- -# Dockerfile to run ONNXRuntime with source build for CPU - -FROM arm64v8/centos:7 -MAINTAINER Changming Sun "chasun@microsoft.com" -ADD . 
/code - - -RUN /code/dockerfiles/scripts/install_centos_arm64.sh && cd /code && CC=/opt/rh/devtoolset-10/root/usr/bin/gcc CXX=/opt/rh/devtoolset-10/root/usr/bin/g++ ./build.sh --allow_running_as_root --skip_submodule_sync --config Release --build_wheel --update --build --parallel --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) - -FROM arm64v8/centos:7 -COPY --from=0 /code/build/Linux/Release/dist /root -COPY --from=0 /code/dockerfiles/LICENSE-IMAGE.txt /code/LICENSE-IMAGE.txt -RUN yum install -y python3-wheel python3-pip && python3 -m pip install --upgrade pip && python3 -m pip install /root/*.whl && rm -rf /root/*.whl diff --git a/dockerfiles/Dockerfile.cuda b/dockerfiles/Dockerfile.cuda index dfc76c4bb385c..a03a6b0a6dcdc 100644 --- a/dockerfiles/Dockerfile.cuda +++ b/dockerfiles/Dockerfile.cuda @@ -11,7 +11,7 @@ MAINTAINER Changming Sun "chasun@microsoft.com" ADD . /code ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} -RUN apt-get update && apt-get install -y --no-install-recommends python3-dev ca-certificates g++ python3-numpy gcc make git python3-setuptools python3-wheel python3-packaging python3-pip aria2 && aria2c -q -d /tmp -o cmake-3.26.3-linux-x86_64.tar.gz https://github.com/Kitware/CMake/releases/download/v3.26.3/cmake-3.26.3-linux-x86_64.tar.gz && tar -zxf /tmp/cmake-3.26.3-linux-x86_64.tar.gz --strip=1 -C /usr +RUN apt-get update && apt-get install -y --no-install-recommends python3-dev ca-certificates g++ python3-numpy gcc make git python3-setuptools python3-wheel python3-packaging python3-pip aria2 && aria2c -q -d /tmp -o cmake-3.27.3-linux-x86_64.tar.gz https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3-linux-x86_64.tar.gz && tar -zxf /tmp/cmake-3.27.3-linux-x86_64.tar.gz --strip=1 -C /usr RUN cd /code && python3 -m pip install -r tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/requireme\ nts.txt && /bin/bash ./build.sh --allow_running_as_root --skip_submodule_sync --cuda_home /usr/local/cuda --cudnn_home /usr/lib/x86_64-linux-gnu/ --use_cuda --config Release --build_wheel --update --build --parallel --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) 'CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;86' diff --git a/dockerfiles/Dockerfile.migraphx b/dockerfiles/Dockerfile.migraphx index 886b863f2fc57..bc513a8e8ba6d 100644 --- a/dockerfiles/Dockerfile.migraphx +++ b/dockerfiles/Dockerfile.migraphx @@ -30,14 +30,14 @@ RUN apt-get update &&\ apt-get install -y sudo git bash build-essential rocm-dev python3-dev python3-pip miopen-hip \ rocblas half aria2 libnuma-dev pkg-config -RUN aria2c -q -d /tmp -o cmake-3.26.3-linux-x86_64.tar.gz \ -https://github.com/Kitware/CMake/releases/download/v3.26.3/cmake-3.26.3-linux-x86_64.tar.gz &&\ -tar -zxf /tmp/cmake-3.26.3-linux-x86_64.tar.gz --strip=1 -C /usr +RUN aria2c -q -d /tmp -o cmake-3.27.3-linux-x86_64.tar.gz \ +https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3-linux-x86_64.tar.gz &&\ +tar -zxf /tmp/cmake-3.27.3-linux-x86_64.tar.gz --strip=1 -C /usr # Install rbuild RUN pip3 install https://github.com/RadeonOpenCompute/rbuild/archive/master.tar.gz numpy yapf==0.28.0 -ENV PATH /opt/miniconda/bin:/code/cmake-3.26.3-linux-x86_64/bin:${PATH} +ENV PATH /opt/miniconda/bin:/code/cmake-3.27.3-linux-x86_64/bin:${PATH} # Install MIGraphX from source RUN mkdir -p /migraphx diff --git a/dockerfiles/Dockerfile.openvino-centos7 b/dockerfiles/Dockerfile.openvino-centos7 index 8b7555a940d96..697db44801e3b 100755 --- 
a/dockerfiles/Dockerfile.openvino-centos7 +++ b/dockerfiles/Dockerfile.openvino-centos7 @@ -31,9 +31,9 @@ RUN yum update -y && \ yum clean packages && yum clean all && rm -rf /var/cache/yum && \ # Install cmake cd $MY_ROOT && \ - wget https://github.com/Kitware/CMake/releases/download/v3.26.3/cmake-3.26.3.tar.gz && \ - tar -zxvf cmake-3.26.3.tar.gz && rm -rf cmake-3.26.3.tar.gz && \ - cd cmake-3.26.3 && \ + wget https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3.tar.gz && \ + tar -zxvf cmake-3.27.3.tar.gz && rm -rf cmake-3.27.3.tar.gz && \ + cd cmake-3.27.3 && \ ./bootstrap && \ make && \ make install && \ diff --git a/dockerfiles/Dockerfile.rocm b/dockerfiles/Dockerfile.rocm index c3c45af59e724..35a676383337b 100644 --- a/dockerfiles/Dockerfile.rocm +++ b/dockerfiles/Dockerfile.rocm @@ -12,7 +12,7 @@ ARG ONNXRUNTIME_BRANCH=main WORKDIR /code -ENV PATH /opt/miniconda/bin:/code/cmake-3.26.3-linux-x86_64/bin:${PATH} +ENV PATH /opt/miniconda/bin:/code/cmake-3.27.3-linux-x86_64/bin:${PATH} # Prepare onnxruntime repository & build onnxruntime RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\ diff --git a/dockerfiles/Dockerfile.source b/dockerfiles/Dockerfile.source index 87ec529b65f5d..110e484e77d21 100644 --- a/dockerfiles/Dockerfile.source +++ b/dockerfiles/Dockerfile.source @@ -4,17 +4,17 @@ # -------------------------------------------------------------- # Dockerfile to run ONNXRuntime with source build for CPU -FROM ubuntu:22.04 +FROM mcr.microsoft.com/cbl-mariner/base/python:3 MAINTAINER Changming Sun "chasun@microsoft.com" ADD . /code -ENV DEBIAN_FRONTEND=noninteractive -RUN apt-get update && apt-get install -y --no-install-recommends python3-dev ca-certificates g++ python3-numpy gcc make git python3-setuptools python3-wheel python3-pip aria2 && aria2c -q -d /tmp -o cmake-3.26.3-linux-x86_64.tar.gz https://github.com/Kitware/CMake/releases/download/v3.26.3/cmake-3.26.3-linux-x86_64.tar.gz && tar -zxf /tmp/cmake-3.26.3-linux-x86_64.tar.gz --strip=1 -C /usr +RUN tdnf install -y tar ca-certificates build-essential python3-numpy cmake python3-setuptools python3-wheel python3-pip curl python3-devel +RUN /code/dockerfiles/scripts/install_cmake.sh # Prepare onnxruntime repository & build onnxruntime RUN cd /code && python3 -m pip install -r tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/requirements.txt && /bin/bash ./build.sh --allow_running_as_root --skip_submodule_sync --config Release --build_wheel --update --build --parallel --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) -FROM ubuntu:22.04 +FROM mcr.microsoft.com/cbl-mariner/base/python:3 COPY --from=0 /code/build/Linux/Release/dist /root COPY --from=0 /code/dockerfiles/LICENSE-IMAGE.txt /code/LICENSE-IMAGE.txt -RUN apt-get update && apt-get install -y --no-install-recommends libstdc++6 ca-certificates python3-setuptools python3-wheel python3-pip unattended-upgrades && unattended-upgrade && python3 -m pip install /root/*.whl && rm -rf /root/*.whl +RUN tdnf install -y ca-certificates python3-setuptools python3-wheel python3-pip && python3 -m pip install /root/*.whl && rm -rf /root/*.whl diff --git a/dockerfiles/Dockerfile.tensorrt b/dockerfiles/Dockerfile.tensorrt index 452cae54b57a2..ef51d41c5ff1b 100644 --- a/dockerfiles/Dockerfile.tensorrt +++ b/dockerfiles/Dockerfile.tensorrt @@ -17,7 +17,7 @@ RUN apt-get update &&\ RUN unattended-upgrade WORKDIR /code -ENV PATH 
/usr/local/nvidia/bin:/usr/local/cuda/bin:/code/cmake-3.26.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH} +ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/code/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH} # Prepare onnxruntime repository & build onnxruntime with TensorRT RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\ diff --git a/dockerfiles/Dockerfile.vitisai b/dockerfiles/Dockerfile.vitisai index 3a0d75d4d3cb2..e11ab70a61332 100644 --- a/dockerfiles/Dockerfile.vitisai +++ b/dockerfiles/Dockerfile.vitisai @@ -22,7 +22,7 @@ RUN apt-get update && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -ENV PATH /code/cmake-3.26.3-linux-x86_64/bin:$PATH +ENV PATH /code/cmake-3.27.3-linux-x86_64/bin:$PATH ENV LD_LIBRARY_PATH /opt/xilinx/xrt/lib:$LD_LIBRARY_PATH WORKDIR /code @@ -41,4 +41,4 @@ RUN . $VAI_ROOT/conda/etc/profile.d/conda.sh &&\ /bin/sh ./build.sh --allow_running_as_root --config RelWithDebInfo --enable_pybind --build_wheel --use_vitisai --parallel --update --build --build_shared_lib &&\ pip install /code/onnxruntime/build/Linux/RelWithDebInfo/dist/*-linux_x86_64.whl &&\ cd .. &&\ - rm -rf onnxruntime cmake-3.26.3-linux-x86_64 + rm -rf onnxruntime cmake-3.27.3-linux-x86_64 diff --git a/dockerfiles/README.md b/dockerfiles/README.md index fc4179d906c8b..f226ebfe8b193 100644 --- a/dockerfiles/README.md +++ b/dockerfiles/README.md @@ -7,10 +7,6 @@ - OpenVINO: [Dockerfile](Dockerfile.openvino), [Instructions](#openvino) - TensorRT: [Dockerfile](Dockerfile.tensorrt), [Instructions](#tensorrt) - VitisAI: [Dockerfile](Dockerfile.vitisai) - -**Platforms** -- ARM 32v7: [Dockerfile](Dockerfile.arm32v7), [Instructions](#arm-3264) -- ARM 64: [Dockerfile](Dockerfile.arm64), [Instructions](#arm-3264) - NVIDIA Jetson TX1/TX2/Nano/Xavier: [Dockerfile](Dockerfile.jetson), [Instructions](#nvidia-jetson-tx1tx2nanoxavier) **Other** @@ -22,38 +18,36 @@ # Instructions ## CPU -**Ubuntu 22.04, CPU, Python Bindings** +**Mariner 2.0, CPU, Python Bindings** -1. Update submodules -``` -git submodule update --init -``` -2. Build the docker image from the Dockerfile in this repository. - ``` +1. Build the docker image from the Dockerfile in this repository. + ```bash docker build -t onnxruntime-source -f Dockerfile.source .. ``` -3. Run the Docker image +2. Run the Docker image - ``` + ```bash docker run -it onnxruntime-source ``` -## CUDA -**Ubuntu 20.04, CUDA 11.4, CuDNN 8** +The Dockerfile supports both x86_64 and ARM64 (aarch64). You may use docker's "--platform" parameter to explicitly specify which CPU architecture you want to build for. For example: -1. Update submodules -``` -git submodule update --init +```bash + docker build --platform linux/arm64/v8 -f Dockerfile.source ``` +However, we cannot build the code for 32-bit ARM this way, since a 32-bit compiler/linker might not have enough memory to generate the binaries. -2. Build the docker image from the Dockerfile in this repository. +## CUDA +**Ubuntu 22.04, CUDA 12.1, CuDNN 8** + +1. Build the docker image from the Dockerfile in this repository. ``` docker build -t onnxruntime-cuda -f Dockerfile.cuda .. ``` -3. Run the Docker image +2.
Run the Docker image ``` docker run --gpus all -it onnxruntime-cuda diff --git a/dockerfiles/scripts/install_centos_arm64.sh b/dockerfiles/scripts/install_centos_arm64.sh deleted file mode 100755 index b3dbb8b001422..0000000000000 --- a/dockerfiles/scripts/install_centos_arm64.sh +++ /dev/null @@ -1,23 +0,0 @@ -yum-config-manager --enable extras -yum -y install centos-release-scl-rh -# EPEL support (for yasm) -if ! rpm -q --quiet epel-release ; then - yum -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm -fi -yum install -y devtoolset-10-binutils devtoolset-10-gcc devtoolset-10-gcc-c++ devtoolset-10-gcc aria2 python3-pip python3-wheel git python3-devel -ARCH=`uname -m` -if [ "$ARCH" = "aarch64" ]; then - aria2c -q -d /tmp -o cmake-3.26.3-linux-aarch64.tar.gz https://github.com/Kitware/CMake/releases/download/v3.26.3/cmake-3.26.3-linux-aarch64.tar.gz && tar -zxf /tmp/cmake-3.26.3-linux-aarch64.tar.gz --strip=1 -C /usr -else - aria2c -q -d /tmp https://github.com/Kitware/CMake/releases/download/v3.26.3/cmake-3.26.3.tar.gz - cd /tmp - mkdir cmake - cd cmake - tar --strip=1 -zxvf /tmp/cmake-3.26.3.tar.gz - ./configure --prefix=/usr --parallel=$(nproc) - make -j$(nproc) - make install -fi -python3 -m pip install --upgrade pip -python3 -m pip install numpy -python3 -m pip install packaging diff --git a/dockerfiles/scripts/install_cmake.sh b/dockerfiles/scripts/install_cmake.sh new file mode 100755 index 0000000000000..e89c323460ac4 --- /dev/null +++ b/dockerfiles/scripts/install_cmake.sh @@ -0,0 +1,11 @@ +#!/bin/bash +set -e -x +mkdir -p /tmp/src +cd /tmp/src + +echo "Installing cmake" +CPU_ARCH=`uname -m` +CMAKE_VERSION='3.27.3' +curl https://github.com/Kitware/CMake/releases/download/v$CMAKE_VERSION/cmake-$CMAKE_VERSION-linux-$CPU_ARCH.tar.gz -sSL --retry 5 -o /tmp/src/cmake.tar.gz +tar -zxf /tmp/src/cmake.tar.gz --strip=1 -C /usr +rm -f /tmp/src/cmake.tar.gz diff --git a/dockerfiles/scripts/install_common_deps.sh b/dockerfiles/scripts/install_common_deps.sh index 460df850b985f..786a6f076a71b 100644 --- a/dockerfiles/scripts/install_common_deps.sh +++ b/dockerfiles/scripts/install_common_deps.sh @@ -21,6 +21,6 @@ pip install "wheel>=0.35.1" rm -rf /opt/miniconda/pkgs # Dependencies: cmake -wget --quiet https://github.com/Kitware/CMake/releases/download/v3.26.3/cmake-3.26.3-linux-x86_64.tar.gz -tar zxf cmake-3.26.3-linux-x86_64.tar.gz -rm -rf cmake-3.26.3-linux-x86_64.tar.gz +wget --quiet https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3-linux-x86_64.tar.gz +tar zxf cmake-3.27.3-linux-x86_64.tar.gz +rm -rf cmake-3.27.3-linux-x86_64.tar.gz diff --git a/dockerfiles/scripts/install_fedora_arm32.sh b/dockerfiles/scripts/install_fedora_arm32.sh deleted file mode 100755 index c32859e696c1e..0000000000000 --- a/dockerfiles/scripts/install_fedora_arm32.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -set -e -dnf install -y binutils gcc gcc-c++ aria2 python3-pip python3-wheel git python3-devel cmake -python3 -m pip install --upgrade pip -python3 -m pip install numpy diff --git a/docs/c_cxx/Doxyfile b/docs/c_cxx/Doxyfile index 94b39d2045f69..aedb1fdcfee75 100644 --- a/docs/c_cxx/Doxyfile +++ b/docs/c_cxx/Doxyfile @@ -1,4 +1,4 @@ -# Doxyfile 1.9.2 +# Doxyfile 1.9.8 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. @@ -12,6 +12,16 @@ # For lists, items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (\" \"). 
+# +# Note: +# +# Use doxygen to compare the used configuration file with the template +# configuration file: +# doxygen -x [configFile] +# Use doxygen to compare the used configuration file with the template +# configuration file without replacing the environment variables or CMake type +# replacement variables: +# doxygen -x_noenv [configFile] #--------------------------------------------------------------------------- # Project related configuration options @@ -60,16 +70,28 @@ PROJECT_LOGO = "../images/ONNX_Runtime_logo - Docs.png" OUTPUT_DIRECTORY = ../../build/doxygen -# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- -# directories (in 2 levels) under the output directory of each output format and -# will distribute the generated files over these directories. Enabling this +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create up to 4096 +# sub-directories (in 2 levels) under the output directory of each output format +# and will distribute the generated files over these directories. Enabling this # option can be useful when feeding doxygen a huge amount of source files, where # putting all generated files in the same directory would otherwise causes -# performance problems for the file system. +# performance problems for the file system. Adapt CREATE_SUBDIRS_LEVEL to +# control the number of sub-directories. # The default value is: NO. CREATE_SUBDIRS = NO +# Controls the number of sub-directories that will be created when +# CREATE_SUBDIRS tag is set to YES. Level 0 represents 16 directories, and every +# level increment doubles the number of directories, resulting in 4096 +# directories at level 8 which is the default and also the maximum value. The +# sub-directories are organized in 2 levels, the first level always has a fixed +# number of 16 directories. +# Minimum value: 0, maximum value: 8, default value: 8. +# This tag requires that the tag CREATE_SUBDIRS is set to YES. + +CREATE_SUBDIRS_LEVEL = 8 + # If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII # characters to appear in the names of generated files. If set to NO, non-ASCII # characters will be escaped, for example _xE3_x81_x84 will be used for Unicode @@ -81,14 +103,14 @@ ALLOW_UNICODE_NAMES = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. -# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, -# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), -# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, -# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), -# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, -# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, -# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, -# Ukrainian and Vietnamese. 
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Bulgarian, +# Catalan, Chinese, Chinese-Traditional, Croatian, Czech, Danish, Dutch, English +# (United States), Esperanto, Farsi (Persian), Finnish, French, German, Greek, +# Hindi, Hungarian, Indonesian, Italian, Japanese, Japanese-en (Japanese with +# English messages), Korean, Korean-en (Korean with English messages), Latvian, +# Lithuanian, Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, +# Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, +# Swedish, Turkish, Ukrainian and Vietnamese. # The default value is: English. OUTPUT_LANGUAGE = English @@ -341,6 +363,17 @@ MARKDOWN_SUPPORT = YES TOC_INCLUDE_HEADINGS = 5 +# The MARKDOWN_ID_STYLE tag can be used to specify the algorithm used to +# generate identifiers for the Markdown headings. Note: Every identifier is +# unique. +# Possible values are: DOXYGEN use a fixed 'autotoc_md' string followed by a +# sequence number starting at 0 and GITHUB use the lower case version of title +# with any whitespace replaced by '-' and punctuation characters removed. +# The default value is: DOXYGEN. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +MARKDOWN_ID_STYLE = DOXYGEN + # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by putting a % sign in front of the word or @@ -437,7 +470,7 @@ INLINE_SIMPLE_STRUCTS = NO # types are typedef'ed and only the typedef is referenced, never the tag name. # The default value is: NO. -TYPEDEF_HIDES_STRUCT = YES +TYPEDEF_HIDES_STRUCT = NO # The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This # cache is used to resolve symbols given their name and scope. Since this can be @@ -452,7 +485,7 @@ TYPEDEF_HIDES_STRUCT = YES LOOKUP_CACHE_SIZE = 0 -# The NUM_PROC_THREADS specifies the number threads doxygen is allowed to use +# The NUM_PROC_THREADS specifies the number of threads doxygen is allowed to use # during processing. When set to 0 doxygen will based this on the number of # cores available in the system. You can set it explicitly to a value larger # than 0 to get more control over the balance between CPU load and processing @@ -465,6 +498,14 @@ LOOKUP_CACHE_SIZE = 0 NUM_PROC_THREADS = 1 +# If the TIMESTAMP tag is set different from NO then each generated page will +# contain the date or date and time when the page was generated. Setting this to +# NO can help when comparing the output of multiple runs. +# Possible values are: YES, NO, DATETIME and DATE. +# The default value is: NO. + +TIMESTAMP = NO + #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- @@ -546,7 +587,8 @@ HIDE_UNDOC_MEMBERS = NO # If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. If set # to NO, these classes will be included in the various overviews. This option -# has no effect if EXTRACT_ALL is enabled. +# will also hide undocumented C++ concepts if enabled. This option has no effect +# if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_CLASSES = NO @@ -577,14 +619,15 @@ INTERNAL_DOCS = NO # filesystem is case sensitive (i.e. 
it supports files in the same directory # whose names only differ in casing), the option must be set to YES to properly # deal with such files in case they appear in the input. For filesystems that -# are not case sensitive the option should be be set to NO to properly deal with +# are not case sensitive the option should be set to NO to properly deal with # output files written for symbols that only differ in casing, such as for two # classes, one named CLASS and the other named Class, and to also support # references to files without having to specify the exact matching casing. On # Windows (including Cygwin) and MacOS, users should typically set this option # to NO, whereas on Linux or other Unix flavors it should typically be set to # YES. -# The default value is: system dependent. +# Possible values are: SYSTEM, NO and YES. +# The default value is: SYSTEM. CASE_SENSE_NAMES = NO @@ -836,11 +879,26 @@ WARN_IF_INCOMPLETE_DOC = YES WARN_NO_PARAMDOC = YES +# If WARN_IF_UNDOC_ENUM_VAL option is set to YES, doxygen will warn about +# undocumented enumeration values. If set to NO, doxygen will accept +# undocumented enumeration values. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: NO. + +WARN_IF_UNDOC_ENUM_VAL = NO + # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when # a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS # then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but # at the end of the doxygen process doxygen will return with a non-zero status. -# Possible values are: NO, YES and FAIL_ON_WARNINGS. +# If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS_PRINT then doxygen behaves +# like FAIL_ON_WARNINGS but in case no WARN_LOGFILE is defined doxygen will not +# write the warning messages in between other messages but write them at the end +# of a run, in case a WARN_LOGFILE is defined the warning messages will be +# besides being in the defined file also be shown at the end of a run, unless +# the WARN_LOGFILE is defined as - i.e. standard output (stdout) in that case +# the behavior will remain as with the setting FAIL_ON_WARNINGS. +# Possible values are: NO, YES, FAIL_ON_WARNINGS and FAIL_ON_WARNINGS_PRINT. # The default value is: NO. WARN_AS_ERROR = YES @@ -851,13 +909,27 @@ WARN_AS_ERROR = YES # and the warning text. Optionally the format may contain $version, which will # be replaced by the version of the file (if it could be obtained via # FILE_VERSION_FILTER) +# See also: WARN_LINE_FORMAT # The default value is: $file:$line: $text. WARN_FORMAT = "$file:$line: $text" +# In the $text part of the WARN_FORMAT command it is possible that a reference +# to a more specific place is given. To make it easier to jump to this place +# (outside of doxygen) the user can define a custom "cut" / "paste" string. +# Example: +# WARN_LINE_FORMAT = "'vi $file +$line'" +# See also: WARN_FORMAT +# The default value is: at line $line of file $file. + +WARN_LINE_FORMAT = "at line $line of file $file" + # The WARN_LOGFILE tag can be used to specify a file to which warning and error # messages should be written. If left blank the output is written to standard -# error (stderr). +# error (stderr). In case the file specified cannot be opened for writing the +# warning and error messages are written to standard error. When as file - is +# specified the warning and error messages are written to standard output +# (stdout). 
WARN_LOGFILE = @@ -881,10 +953,21 @@ INPUT = ../../include/onnxruntime/core/session/onnxruntime_c_ap # libiconv (or the iconv built into libc) for the transcoding. See the libiconv # documentation (see: # https://www.gnu.org/software/libiconv/) for the list of possible encodings. +# See also: INPUT_FILE_ENCODING # The default value is: UTF-8. INPUT_ENCODING = UTF-8 +# This tag can be used to specify the character encoding of the source files +# that doxygen parses The INPUT_FILE_ENCODING tag can be used to specify +# character encoding on a per file pattern basis. Doxygen will compare the file +# name with each pattern and apply the encoding instead of the default +# INPUT_ENCODING) if there is a match. The character encodings are a list of the +# form: pattern=encoding (like *.php=ISO-8859-1). See cfg_input_encoding +# "INPUT_ENCODING" for further information on supported encodings. + +INPUT_FILE_ENCODING = + # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and # *.h) to filter out the source-files in the directories. @@ -896,18 +979,21 @@ INPUT_ENCODING = UTF-8 # Note the list of default checked file patterns might differ from the list of # default file extension mappings. # -# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, -# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, -# *.hh, *.hxx, *.hpp, *.h++, *.l, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, -# *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C -# comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, -# *.vhdl, *.ucf, *.qsf and *.ice. +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cxxm, +# *.cpp, *.cppm, *.c++, *.c++m, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, +# *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp, *.h++, *.ixx, *.l, *.cs, *.d, *.php, +# *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be +# provided as doxygen C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, +# *.f18, *.f, *.for, *.vhd, *.vhdl, *.ucf, *.qsf and *.ice. FILE_PATTERNS = *.c \ *.cc \ *.cxx \ + *.cxxm \ *.cpp \ + *.cppm \ *.c++ \ + *.c++m \ *.java \ *.ii \ *.ixx \ @@ -922,6 +1008,8 @@ FILE_PATTERNS = *.c \ *.hxx \ *.hpp \ *.h++ \ + *.ixx \ + *.l \ *.cs \ *.d \ *.php \ @@ -984,10 +1072,7 @@ EXCLUDE_PATTERNS = # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, -# AClass::ANamespace, ANamespace::*Test -# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories use the pattern */test/* +# ANamespace::AClass, ANamespace::*Test EXCLUDE_SYMBOLS = @@ -1032,6 +1117,11 @@ IMAGE_PATH = # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. # +# Note that doxygen will use the data processed and written to standard output +# for further processing, therefore nothing else, like debug statements or used +# commands (so in case of a Windows batch file always use @echo OFF), should be +# written to standard output. +# # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # properly processed by doxygen. 
@@ -1073,6 +1163,15 @@ FILTER_SOURCE_PATTERNS = USE_MDFILE_AS_MAINPAGE = +# The Fortran standard specifies that for fixed formatted Fortran code all +# characters from position 72 are to be considered as comment. A common +# extension is to allow longer lines before the automatic comment starts. The +# setting FORTRAN_COMMENT_AFTER will also make it possible that longer lines can +# be processed before the automatic comment starts. +# Minimum value: 7, maximum value: 10000, default value: 72. + +FORTRAN_COMMENT_AFTER = 72 + #--------------------------------------------------------------------------- # Configuration options related to source browsing #--------------------------------------------------------------------------- @@ -1210,10 +1309,11 @@ CLANG_DATABASE_PATH = ALPHABETICAL_INDEX = YES -# In case all classes in a project start with a common prefix, all classes will -# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag -# can be used to specify a prefix (or a list of prefixes) that should be ignored -# while generating the index headers. +# The IGNORE_PREFIX tag can be used to specify a prefix (or a list of prefixes) +# that should be ignored while generating the index headers. The IGNORE_PREFIX +# tag works for classes, function and member names. The entity will be placed in +# the alphabetical list under the first letter of the entity name that remains +# after removing the prefix. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. IGNORE_PREFIX = @@ -1292,7 +1392,12 @@ HTML_STYLESHEET = # Doxygen will copy the style sheet files to the output directory. # Note: The order of the extra style sheet files is of importance (e.g. the last # style sheet in the list overrules the setting of the previous ones in the -# list). For an example see the documentation. +# list). +# Note: Since the styling of scrollbars can currently not be overruled in +# Webkit/Chromium, the styling will be left out of the default doxygen.css if +# one or more extra stylesheets have been specified. So if scrollbar +# customization is desired it has to be added explicitly. For an example see the +# documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_STYLESHEET = @@ -1307,6 +1412,19 @@ HTML_EXTRA_STYLESHEET = HTML_EXTRA_FILES = +# The HTML_COLORSTYLE tag can be used to specify if the generated HTML output +# should be rendered with a dark or light theme. +# Possible values are: LIGHT always generate light mode output, DARK always +# generate dark mode output, AUTO_LIGHT automatically set the mode according to +# the user preference, use light mode if no preference is set (the default), +# AUTO_DARK automatically set the mode according to the user preference, use +# dark mode if no preference is set and TOGGLE allow to user to switch between +# light and dark mode via a button. +# The default value is: AUTO_LIGHT. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE = AUTO_LIGHT + # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to # this color. Hue is specified as an angle on a color-wheel, see @@ -1337,15 +1455,6 @@ HTML_COLORSTYLE_SAT = 100 HTML_COLORSTYLE_GAMMA = 80 -# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML -# page will contain the date and time when the page was generated. 
Setting this -# to YES can help to show when doxygen was last run and thus if the -# documentation is up to date. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_TIMESTAMP = NO - # If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML # documentation will contain a main index with vertical navigation menus that # are dynamically created via JavaScript. If disabled, the navigation index will @@ -1365,6 +1474,13 @@ HTML_DYNAMIC_MENUS = YES HTML_DYNAMIC_SECTIONS = NO +# If the HTML_CODE_FOLDING tag is set to YES then classes and functions can be +# dynamically folded and expanded in the generated HTML source code. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_CODE_FOLDING = YES + # With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries # shown in the various tree structured indices initially; the user can expand # and collapse entries dynamically later on. Doxygen will expand the tree to @@ -1401,6 +1517,13 @@ GENERATE_DOCSET = NO DOCSET_FEEDNAME = "Doxygen generated docs" +# This tag determines the URL of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDURL = + # This tag specifies a string that should uniquely identify the documentation # set bundle. This should be a reverse domain-name style string, e.g. # com.mycompany.MyDocSet. Doxygen will append .docset to the name. @@ -1488,6 +1611,16 @@ BINARY_TOC = NO TOC_EXPAND = NO +# The SITEMAP_URL tag is used to specify the full URL of the place where the +# generated documentation will be placed on the server by the user during the +# deployment of the documentation. The generated sitemap is called sitemap.xml +# and placed on the directory specified by HTML_OUTPUT. In case no SITEMAP_URL +# is specified no sitemap is generated. For information about the sitemap +# protocol see https://www.sitemaps.org +# This tag requires that the tag GENERATE_HTML is set to YES. + +SITEMAP_URL = + # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that # can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help @@ -1605,7 +1738,7 @@ GENERATE_TREEVIEW = YES # area (value NO) or if it should extend to the full height of the window (value # YES). Setting this to YES gives a layout similar to # https://docs.readthedocs.io with more room for contents, but less room for the -# project logo, title, and description. If either GENERATOR_TREEVIEW or +# project logo, title, and description. If either GENERATE_TREEVIEW or # DISABLE_INDEX is set to NO, this option has no effect. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. @@ -1636,6 +1769,13 @@ TREEVIEW_WIDTH = 250 EXT_LINKS_IN_WINDOW = NO +# If the OBFUSCATE_EMAILS tag is set to YES, doxygen will obfuscate email +# addresses. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +OBFUSCATE_EMAILS = YES + # If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg # tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see # https://inkscape.org) to generate formulas as SVG images instead of PNGs for @@ -1969,9 +2109,16 @@ PDF_HYPERLINKS = YES USE_PDFLATEX = YES -# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode -# command to the generated LaTeX files. This will instruct LaTeX to keep running -# if errors occur, instead of asking the user for help. +# The LATEX_BATCHMODE tag signals the behavior of LaTeX in case of an error. +# Possible values are: NO same as ERROR_STOP, YES same as BATCH, BATCH In batch +# mode nothing is printed on the terminal, errors are scrolled as if is +# hit at every error; missing files that TeX tries to input or request from +# keyboard input (\read on a not open input stream) cause the job to abort, +# NON_STOP In nonstop mode the diagnostic message will appear on the terminal, +# but there is no possibility of user interaction just like in batch mode, +# SCROLL In scroll mode, TeX will stop only for missing files to input or if +# keyboard input is necessary and ERROR_STOP In errorstop mode, TeX will stop at +# each error, asking for user intervention. # The default value is: NO. # This tag requires that the tag GENERATE_LATEX is set to YES. @@ -1992,14 +2139,6 @@ LATEX_HIDE_INDICES = NO LATEX_BIB_STYLE = plain -# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated -# page will contain the date and time when the page was generated. Setting this -# to NO can help when comparing the output of multiple runs. -# The default value is: NO. -# This tag requires that the tag GENERATE_LATEX is set to YES. - -LATEX_TIMESTAMP = NO - # The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute) # path from which the emoji images will be read. If a relative path is entered, # it will be relative to the LATEX_OUTPUT directory. If left blank the @@ -2165,7 +2304,7 @@ DOCBOOK_OUTPUT = docbook #--------------------------------------------------------------------------- # If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an -# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures +# AutoGen Definitions (see https://autogen.sourceforge.net/) file that captures # the structure of the code including all documentation. Note that this feature # is still experimental and incomplete at the moment. # The default value is: NO. @@ -2176,6 +2315,28 @@ GENERATE_AUTOGEN_DEF = NO # Configuration options related to Sqlite3 output #--------------------------------------------------------------------------- +# If the GENERATE_SQLITE3 tag is set to YES doxygen will generate a Sqlite3 +# database with symbols found by doxygen stored in tables. +# The default value is: NO. + +GENERATE_SQLITE3 = NO + +# The SQLITE3_OUTPUT tag is used to specify where the Sqlite3 database will be +# put. If a relative path is entered the value of OUTPUT_DIRECTORY will be put +# in front of it. +# The default directory is: sqlite3. +# This tag requires that the tag GENERATE_SQLITE3 is set to YES. + +SQLITE3_OUTPUT = sqlite3 + +# The SQLITE3_OVERWRITE_DB tag is set to YES, the existing doxygen_sqlite3.db +# database file will be recreated with each doxygen run. If set to NO, doxygen +# will warn if an a database file is already found and not modify it. +# The default value is: YES. +# This tag requires that the tag GENERATE_SQLITE3 is set to YES. 
+ +SQLITE3_RECREATE_DB = YES + #--------------------------------------------------------------------------- # Configuration options related to the Perl module output #--------------------------------------------------------------------------- @@ -2250,7 +2411,8 @@ SEARCH_INCLUDES = YES # The INCLUDE_PATH tag can be used to specify one or more directories that # contain include files that are not input files but should be processed by the -# preprocessor. +# preprocessor. Note that the INCLUDE_PATH is not recursive, so the setting of +# RECURSIVE has no effect here. # This tag requires that the tag SEARCH_INCLUDES is set to YES. # onnxruntime-training and onnxruntime core headers are in different directories. @@ -2324,15 +2486,15 @@ TAGFILES = GENERATE_TAGFILE = -# If the ALLEXTERNALS tag is set to YES, all external class will be listed in -# the class index. If set to NO, only the inherited external classes will be -# listed. +# If the ALLEXTERNALS tag is set to YES, all external classes and namespaces +# will be listed in the class and namespace index. If set to NO, only the +# inherited external classes will be listed. # The default value is: NO. ALLEXTERNALS = NO # If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed -# in the modules index. If set to NO, only the current project's groups will be +# in the topic index. If set to NO, only the current project's groups will be # listed. # The default value is: YES. @@ -2346,16 +2508,9 @@ EXTERNAL_GROUPS = YES EXTERNAL_PAGES = YES #--------------------------------------------------------------------------- -# Configuration options related to the dot tool +# Configuration options related to diagram generator tools #--------------------------------------------------------------------------- -# You can include diagrams made with dia in doxygen documentation. Doxygen will -# then run dia to produce the diagram and insert it in the documentation. The -# DIA_PATH tag allows you to specify the directory where the dia binary resides. -# If left empty dia is assumed to be found in the default search path. - -DIA_PATH = - # If set to YES the inheritance and collaboration graphs will hide inheritance # and usage relations if the target is undocumented or is not a class. # The default value is: YES. @@ -2364,7 +2519,7 @@ HIDE_UNDOC_RELATIONS = YES # If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is # available from the path. This tool is part of Graphviz (see: -# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent +# https://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent # Bell Labs. The other options in this section have no effect if this option is # set to NO # The default value is: NO. @@ -2381,32 +2536,73 @@ HAVE_DOT = NO DOT_NUM_THREADS = 0 -# By default doxygen will tell dot to use the default font as specified with -# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set -# the path where dot can find it using this tag. +# DOT_COMMON_ATTR is common attributes for nodes, edges and labels of +# subgraphs. When you want a differently looking font in the dot files that +# doxygen generates you can specify fontname, fontcolor and fontsize attributes. 
+# For details please see Node, +# Edge and Graph Attributes specification You need to make sure dot is able +# to find the font, which can be done by putting it in a standard location or by +# setting the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the +# directory containing the font. Default graphviz fontsize is 14. +# The default value is: fontname=Helvetica,fontsize=10. +# This tag requires that the tag HAVE_DOT is set to YES. + +DOT_COMMON_ATTR = "fontname=Helvetica,fontsize=10" + +# DOT_EDGE_ATTR is concatenated with DOT_COMMON_ATTR. For elegant style you can +# add 'arrowhead=open, arrowtail=open, arrowsize=0.5'. Complete documentation about +# arrows shapes. +# The default value is: labelfontname=Helvetica,labelfontsize=10. +# This tag requires that the tag HAVE_DOT is set to YES. + +DOT_EDGE_ATTR = "labelfontname=Helvetica,labelfontsize=10" + +# DOT_NODE_ATTR is concatenated with DOT_COMMON_ATTR. For view without boxes +# around nodes set 'shape=plain' or 'shape=plaintext' Shapes specification +# The default value is: shape=box,height=0.2,width=0.4. +# This tag requires that the tag HAVE_DOT is set to YES. + +DOT_NODE_ATTR = "shape=box,height=0.2,width=0.4" + +# You can set the path where dot can find font specified with fontname in +# DOT_COMMON_ATTR and others dot attributes. # This tag requires that the tag HAVE_DOT is set to YES. DOT_FONTPATH = -# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for -# each documented class showing the direct and indirect inheritance relations. -# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO. +# If the CLASS_GRAPH tag is set to YES or GRAPH or BUILTIN then doxygen will +# generate a graph for each documented class showing the direct and indirect +# inheritance relations. In case the CLASS_GRAPH tag is set to YES or GRAPH and +# HAVE_DOT is enabled as well, then dot will be used to draw the graph. In case +# the CLASS_GRAPH tag is set to YES and HAVE_DOT is disabled or if the +# CLASS_GRAPH tag is set to BUILTIN, then the built-in generator will be used. +# If the CLASS_GRAPH tag is set to TEXT the direct and indirect inheritance +# relations will be shown as texts / links. +# Possible values are: NO, YES, TEXT, GRAPH and BUILTIN. # The default value is: YES. -# This tag requires that the tag HAVE_DOT is set to YES. CLASS_GRAPH = YES # If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a # graph for each documented class showing the direct and indirect implementation # dependencies (inheritance, containment, and class references variables) of the -# class with other documented classes. +# class with other documented classes. Explicit enabling a collaboration graph, +# when COLLABORATION_GRAPH is set to NO, can be accomplished by means of the +# command \collaborationgraph. Disabling a collaboration graph can be +# accomplished by means of the command \hidecollaborationgraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. COLLABORATION_GRAPH = YES # If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for -# groups, showing the direct groups dependencies. +# groups, showing the direct groups dependencies. Explicit enabling a group +# dependency graph, when GROUP_GRAPHS is set to NO, can be accomplished by means +# of the command \groupgraph. Disabling a directory graph can be accomplished by +# means of the command \hidegroupgraph. See also the chapter Grouping in the +# manual. 
# The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. @@ -2466,7 +2662,9 @@ TEMPLATE_RELATIONS = NO # If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to # YES then doxygen will generate a graph for each documented file showing the # direct and indirect include dependencies of the file with other documented -# files. +# files. Explicit enabling an include graph, when INCLUDE_GRAPH is is set to NO, +# can be accomplished by means of the command \includegraph. Disabling an +# include graph can be accomplished by means of the command \hideincludegraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. @@ -2475,7 +2673,10 @@ INCLUDE_GRAPH = YES # If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are # set to YES then doxygen will generate a graph for each documented file showing # the direct and indirect include dependencies of the file with other documented -# files. +# files. Explicit enabling an included by graph, when INCLUDED_BY_GRAPH is set +# to NO, can be accomplished by means of the command \includedbygraph. Disabling +# an included by graph can be accomplished by means of the command +# \hideincludedbygraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. @@ -2515,16 +2716,26 @@ GRAPHICAL_HIERARCHY = YES # If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the # dependencies a directory has on other directories in a graphical way. The # dependency relations are determined by the #include relations between the -# files in the directories. +# files in the directories. Explicit enabling a directory graph, when +# DIRECTORY_GRAPH is set to NO, can be accomplished by means of the command +# \directorygraph. Disabling a directory graph can be accomplished by means of +# the command \hidedirectorygraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. DIRECTORY_GRAPH = YES +# The DIR_GRAPH_MAX_DEPTH tag can be used to limit the maximum number of levels +# of child directories generated in directory dependency graphs by dot. +# Minimum value: 1, maximum value: 25, default value: 1. +# This tag requires that the tag DIRECTORY_GRAPH is set to YES. + +DIR_GRAPH_MAX_DEPTH = 1 + # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images # generated by dot. For an explanation of the image formats see the section # output formats in the documentation of the dot tool (Graphviz (see: -# http://www.graphviz.org/)). +# https://www.graphviz.org/)). # Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order # to make the SVG files visible in IE 9+ (other browsers do not have this # requirement). @@ -2561,11 +2772,12 @@ DOT_PATH = DOTFILE_DIRS = -# The MSCFILE_DIRS tag can be used to specify one or more directories that -# contain msc files that are included in the documentation (see the \mscfile -# command). +# You can include diagrams made with dia in doxygen documentation. Doxygen will +# then run dia to produce the diagram and insert it in the documentation. The +# DIA_PATH tag allows you to specify the directory where the dia binary resides. +# If left empty dia is assumed to be found in the default search path. 
-MSCFILE_DIRS = +DIA_PATH = # The DIAFILE_DIRS tag can be used to specify one or more directories that # contain dia files that are included in the documentation (see the \diafile @@ -2574,10 +2786,10 @@ MSCFILE_DIRS = DIAFILE_DIRS = # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the -# path where java can find the plantuml.jar file. If left blank, it is assumed -# PlantUML is not used or called during a preprocessing step. Doxygen will -# generate a warning when it encounters a \startuml command in this case and -# will not generate output for the diagram. +# path where java can find the plantuml.jar file or to the filename of jar file +# to be used. If left blank, it is assumed PlantUML is not used or called during +# a preprocessing step. Doxygen will generate a warning when it encounters a +# \startuml command in this case and will not generate output for the diagram. PLANTUML_JAR_PATH = @@ -2627,6 +2839,8 @@ DOT_MULTI_TARGETS = NO # If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page # explaining the meaning of the various boxes and arrows in the dot generated # graphs. +# Note: This tag requires that UML_LOOK isn't set, i.e. the doxygen internal +# graphical representation for inheritance and collaboration diagrams is used. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. @@ -2640,3 +2854,19 @@ GENERATE_LEGEND = YES # The default value is: YES. DOT_CLEANUP = YES + +# You can define message sequence charts within doxygen comments using the \msc +# command. If the MSCGEN_TOOL tag is left empty (the default), then doxygen will +# use a built-in version of mscgen tool to produce the charts. Alternatively, +# the MSCGEN_TOOL tag can also specify the name an external tool. For instance, +# specifying prog as the value, doxygen will call the tool as prog -T +# -o . The external tool should support +# output file formats "png", "eps", "svg", and "ismap". + +MSCGEN_TOOL = + +# The MSCFILE_DIRS tag can be used to specify one or more directories that +# contain msc files that are included in the documentation (see the \mscfile +# command). + +MSCFILE_DIRS = diff --git a/include/onnxruntime/core/common/status.h b/include/onnxruntime/core/common/status.h index d6e1992944feb..8f171daabbb1e 100644 --- a/include/onnxruntime/core/common/status.h +++ b/include/onnxruntime/core/common/status.h @@ -19,7 +19,6 @@ limitations under the License. #ifdef _WIN32 #include #endif -#include "core/common/gsl.h" namespace onnxruntime { namespace common { @@ -121,10 +120,8 @@ class [[nodiscard]] Status { Status(StatusCategory category, int code); - GSL_SUPPRESS(r.11) Status(const Status& other) : state_((other.state_ == nullptr) ? 
nullptr : new State(*other.state_)) {} - GSL_SUPPRESS(r.11) Status& operator=(const Status& other) { if (state_ != other.state_) { if (other.state_ == nullptr) { diff --git a/include/onnxruntime/core/framework/allocator.h b/include/onnxruntime/core/framework/allocator.h index e2a90852e5de0..cbc2208b6bbd7 100644 --- a/include/onnxruntime/core/framework/allocator.h +++ b/include/onnxruntime/core/framework/allocator.h @@ -48,6 +48,7 @@ constexpr const char* HIP = "Hip"; constexpr const char* HIP_PINNED = "HipPinned"; constexpr const char* OpenVINO_CPU = "OpenVINO_CPU"; constexpr const char* OpenVINO_GPU = "OpenVINO_GPU"; +constexpr const char* WEBGPU_BUFFER = "WebGPU_Buffer"; constexpr size_t kAllocAlignment = 256; diff --git a/include/onnxruntime/core/framework/float8.h b/include/onnxruntime/core/framework/float8.h index 0a318dac17c98..0fd04f28d44b7 100644 --- a/include/onnxruntime/core/framework/float8.h +++ b/include/onnxruntime/core/framework/float8.h @@ -44,35 +44,36 @@ struct Float8E4M3FN { std::memcpy(&b, &v, sizeof(b)); val = static_cast((b & 0x80000000) >> 24); // sign - if ((b & 0x7fc00000) == 0x7fc00000) { - val |= 0x7f; - } else if ((b & 0x7fffffff) == 0x7f800000) { + if ((b & 0x7fffffff) == 0x7f800000) { // infinity if (saturate) { val |= 126; } else { val |= 0x7f; } + } else if ((b & 0x7F800000) == 0x7F800000) { // NaN + val |= 0x7f; } else { uint8_t e = static_cast((b & 0x7F800000) >> 23); // exponent uint32_t m = static_cast(b & 0x007FFFFF); // mantissa if (e != 0) { - if (e < 117) { // 0b1110101 - } else if (e < 118) { // 0b1110110 - val |= 1; - if ((m >> 23) & 1) { - // rounding - val += 1; + if (e < 117) { + } else if (e < 121) { + // denormalized number + auto d = 120 - e; + if (d < 3) { + val |= 1 << (2 - d); + val |= m >> (21 + d); + } else if (m > 0) { + val |= 1; } - } else if (e < 121) { // 127 - 7 + 1 // 0b1111001 - auto d = 120 - e; // 0b1111000 - val |= 1 << (2 - d); - val |= m >> (21 + d); - if ((m >> (20 + d)) & 1) { + auto mask = 1 << (20 + d); + if ((m & mask) && ((val & 1) || ((m & (mask - 1)) > 0) || ((m & mask) && (m & (mask << 1)) && ((m & (mask - 1)) == 0)))) { // rounding val += 1; } - } else if (e < 136) { // 127 + 8 + 1 // 0b10001000 - auto ex = e - 120; // 127 - 7 + } else if (e < 136) { + // normalized number + auto ex = e - 120; if (ex == 0) { val |= 0x4; val |= m >> 21; @@ -83,7 +84,7 @@ struct Float8E4M3FN { val &= 0xFE; } } - if ((m & 0x80000) && ((m & 0x100000) || (m & 0x7C000))) { + if ((m & 0x80000) && ((m & 0x100000) || (m & 0x7FFFF))) { if ((val & 0x7F) < 0x7E) { // rounding val += 1; @@ -205,36 +206,37 @@ struct Float8E4M3FNUZ { std::memcpy(&b, &v, sizeof(b)); val = static_cast((b & 0x80000000) >> 24); // sign - if ((b & 0x7fc00000) == 0x7fc00000) { - val = 0x80; - } else if ((b & 0x7fffffff) == 0x7f800000) { + if ((b & 0x7fffffff) == 0x7f800000) { // infinity if (saturate) { val |= 0x7F; } else { // infinity val = 0x80; } + } else if ((b & 0x7F800000) == 0x7F800000) { // NaN + val = 0x80; } else { uint8_t e = static_cast((b & 0x7F800000) >> 23); // exponent uint32_t m = static_cast(b & 0x007FFFFF); // mantissa if (e != 0) { if (e < 116) { - } else if (e < 117) { - val |= 1; - if ((m >> 23) & 1) { - // rounding - val += 1; - } - } else if (e < 120) { // 127 - 8 + 1 + } else if (e < 120) { + // denormalized number auto d = 119 - e; - val |= 1 << (2 - d); - val |= m >> (21 + d); - if ((m >> (20 + d)) & 1) { + if (d < 3) { + val |= 1 << (2 - d); + val |= m >> (21 + d); + } else if (m > 0) { + val |= 1; + } + auto mask = 1 << (20 + d); + if ((m 
& mask) && ((val & 1) || ((m & (mask - 1)) > 0) || ((m & mask) && (m & (mask << 1)) && ((m & (mask - 1)) == 0)))) { // rounding val += 1; } - } else if (e < 135) { // 127 + 8 - auto ex = e - 119; // 127 - 7 + } else if (e < 135) { + // normalized number + auto ex = e - 119; if (ex == 0) { val |= 0x4; val |= m >> 21; @@ -242,7 +244,7 @@ struct Float8E4M3FNUZ { val |= ex << 3; val |= m >> 20; } - if (m & 0x80000) { + if ((m & 0x80000) && ((m & 0x100000) || (m & 0x7FFFF))) { if ((val & 0x7F) < 0x7F) { // rounding val += 1; @@ -357,32 +359,32 @@ struct Float8E5M2 { uint32_t b; std::memcpy(&b, &v, sizeof(b)); - val = (b & 0x80000000) >> 24; // sign - if ((b & 0x7fc00000) == 0x7fc00000) { - val |= 0x7f; - } else if ((b & 0x7FFFFFFF) == 0x7F800000) { // inf + val = (b & 0x80000000) >> 24; // sign + if ((b & 0x7FFFFFFF) == 0x7F800000) { // inf if (saturate) { val |= 0x7B; } else { val |= 0x7C; } + } else if ((b & 0x7F800000) == 0x7F800000) { // NaN + val |= 0x7f; } else { uint32_t e = (b & 0x7F800000) >> 23; // exponent uint32_t m = b & 0x007FFFFF; // mantissa if (e != 0) { if (e < 110) { - } else if (e < 111) { - val |= 1; - if ((m >> 23) & 1) { - // rounding - val += 1; - } - } else if (e < 113) { // 127 - 15 + 1 + } else if (e < 113) { + // denormalized number auto d = 112 - e; - val |= 1 << (1 - d); - val |= m >> (22 + d); - if ((m >> (21 + d)) & 1) { + if (d < 2) { + val |= 1 << (1 - d); + val |= m >> (22 + d); + } else if (m > 0) { + val |= 1; + } + auto mask = 1 << (21 + d); + if ((m & mask) && ((val & 1) || ((m & (mask - 1)) > 0) || ((m & mask) && (m & (mask << 1)) && ((m & (mask - 1)) == 0)))) { // rounding val += 1; } @@ -513,40 +515,41 @@ struct Float8E5M2FNUZ { uint32_t b; std::memcpy(&b, &v, sizeof(b)); - val = (b & 0x80000000) >> 24; // sign - if ((b & 0x7fc00000) == 0x7fc00000) { - val = 0x80; - } else if ((b & 0x7FFFFFFF) == 0x7F800000) { // inf + val = (b & 0x80000000) >> 24; // sign + if ((b & 0x7FFFFFFF) == 0x7F800000) { // inf if (saturate) { val |= 0x7F; } else { val = 0x80; } + } else if ((b & 0x7F800000) == 0x7F800000) { // NaN + val = 0x80; } else { uint32_t e = (b & 0x7F800000) >> 23; // exponent uint32_t m = b & 0x007FFFFF; // mantissa if (e != 0) { if (e < 109) { - } else if (e < 110) { - val |= 1; - if ((m >> 23) & 1) { - // rounding - val += 1; - } - } else if (e < 112) { // 127 - 16 + 1 + } else if (e < 112) { + // denormalized number auto d = 111 - e; - val |= 1 << (1 - d); - val |= m >> (22 + d); - if ((m >> (21 + d)) & 1) { + if (d < 2) { + val |= 1 << (1 - d); + val |= m >> (22 + d); + } else if (m > 0) { + val |= 1; + } + auto mask = 1 << (21 + d); + if ((m & mask) && ((val & 1) || ((m & (mask - 1)) > 0) || ((m & mask) && (m & (mask << 1)) && ((m & (mask - 1)) == 0)))) { // rounding val += 1; } - } else if (e < 143) { // 127 + 15 + 1 + } else if (e < 143) { + // normalized number auto ex = e - 111; val |= ex << 2; val |= m >> 21; - if (m & 0x100000) { + if ((m & 0x100000) && ((m & 0xFFFFF) || (m & 0x200000))) { if ((val & 0x7F) < 0x7F) { // rounding val += 1; @@ -554,7 +557,7 @@ struct Float8E5M2FNUZ { val = 0x80; } } - } else if ((e == 255) && (m == 0)) { // inf + } else if ((e == 255) && (m == 0)) { val = 0x80; } else if (saturate) { val |= 0x7F; diff --git a/js/common/lib/backend-impl.ts b/js/common/lib/backend-impl.ts index ef8c23c5b6725..75feba1d0ae08 100644 --- a/js/common/lib/backend-impl.ts +++ b/js/common/lib/backend-impl.ts @@ -12,7 +12,7 @@ interface BackendInfo { aborted?: boolean; } -const backends: {[name: string]: BackendInfo} = {}; +const 
backends: Map = new Map(); const backendsSortedByPriority: string[] = []; /** @@ -23,13 +23,13 @@ const backendsSortedByPriority: string[] = []; * @param priority - an integer indicating the priority of the backend. Higher number means higher priority. if priority * < 0, it will be considered as a 'beta' version and will not be used as a fallback backend by default. * - * @internal + * @ignore */ export const registerBackend = (name: string, backend: Backend, priority: number): void => { if (backend && typeof backend.init === 'function' && typeof backend.createSessionHandler === 'function') { - const currentBackend = backends[name]; + const currentBackend = backends.get(name); if (currentBackend === undefined) { - backends[name] = {backend, priority}; + backends.set(name, {backend, priority}); } else if (currentBackend.priority > priority) { // same name is already registered with a higher priority. skip registeration. return; @@ -46,7 +46,7 @@ export const registerBackend = (name: string, backend: Backend, priority: number } for (let i = 0; i < backendsSortedByPriority.length; i++) { - if (backends[backendsSortedByPriority[i]].priority <= priority) { + if (backends.get(backendsSortedByPriority[i])!.priority <= priority) { backendsSortedByPriority.splice(i, 0, name); return; } @@ -65,13 +65,13 @@ export const registerBackend = (name: string, backend: Backend, priority: number * @param backendHints - a list of execution provider names to lookup. If omitted use registered backends as list. * @returns a promise that resolves to the backend. * - * @internal + * @ignore */ export const resolveBackend = async(backendHints: readonly string[]): Promise => { const backendNames = backendHints.length === 0 ? backendsSortedByPriority : backendHints; const errors = []; for (const backendName of backendNames) { - const backendInfo = backends[backendName]; + const backendInfo = backends.get(backendName); if (backendInfo) { if (backendInfo.initialized) { return backendInfo.backend; diff --git a/js/common/lib/backend.ts b/js/common/lib/backend.ts index 226abaf033435..804f33f00d103 100644 --- a/js/common/lib/backend.ts +++ b/js/common/lib/backend.ts @@ -5,7 +5,7 @@ import {InferenceSession} from './inference-session.js'; import {OnnxValue} from './onnx-value.js'; /** - * @internal + * @ignore */ export declare namespace SessionHandler { type FeedsType = {[name: string]: OnnxValue}; @@ -16,7 +16,7 @@ export declare namespace SessionHandler { /** * Represent a handler instance of an inference session. * - * @internal + * @ignore */ export interface SessionHandler { dispose(): Promise; @@ -34,7 +34,7 @@ export interface SessionHandler { /** * Represent a backend that provides implementation of model inferencing. 
* - * @internal + * @ignore */ export interface Backend { /** diff --git a/js/common/lib/inference-session-impl.ts b/js/common/lib/inference-session-impl.ts index 2403b82ea80af..06949b4a26c0d 100644 --- a/js/common/lib/inference-session-impl.ts +++ b/js/common/lib/inference-session-impl.ts @@ -109,7 +109,12 @@ export class InferenceSession implements InferenceSessionInterface { const returnValue: {[name: string]: OnnxValue} = {}; for (const key in results) { if (Object.hasOwnProperty.call(results, key)) { - returnValue[key] = new Tensor(results[key].type, results[key].data, results[key].dims); + const result = results[key]; + if (result instanceof Tensor) { + returnValue[key] = result; + } else { + returnValue[key] = new Tensor(result.type, result.data, result.dims); + } } } return returnValue; diff --git a/js/common/lib/inference-session.ts b/js/common/lib/inference-session.ts index ec030084c9675..71a5912df2464 100644 --- a/js/common/lib/inference-session.ts +++ b/js/common/lib/inference-session.ts @@ -66,6 +66,13 @@ export declare namespace InferenceSession { */ interOpNumThreads?: number; + /** + * The free dimension override. + * + * This setting is available only in ONNXRuntime (Node.js binding and react-native) or WebAssembly backend + */ + freeDimensionOverrides?: {readonly [dimensionName: string]: number}; + /** * The optimization level. * diff --git a/js/common/lib/tensor-factory-impl.ts b/js/common/lib/tensor-factory-impl.ts index 926312e62c856..7228c4a97055b 100644 --- a/js/common/lib/tensor-factory-impl.ts +++ b/js/common/lib/tensor-factory-impl.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {GpuBufferDataTypes, OptionsDimensions, OptionsFormat, OptionsNormalizationParameters, OptionsTensorFormat, OptionsTensorLayout, TensorFromGpuBufferOptions, TensorFromImageBitmapOptions, TensorFromImageDataOptions, TensorFromImageElementOptions, TensorFromTextureOptions, TensorFromUrlOptions, TextureDataTypes} from './tensor-factory.js'; +import {OptionsDimensions, OptionsFormat, OptionsNormalizationParameters, OptionsTensorFormat, OptionsTensorLayout, TensorFromGpuBufferOptions, TensorFromImageBitmapOptions, TensorFromImageDataOptions, TensorFromImageElementOptions, TensorFromTextureOptions, TensorFromUrlOptions} from './tensor-factory.js'; import {Tensor} from './tensor-impl.js'; import {Tensor as TensorInterface} from './tensor.js'; @@ -239,7 +239,7 @@ export const tensorFromImage = async( /** * implementation of Tensor.fromTexture(). */ -export const tensorFromTexture = ( +export const tensorFromTexture = ( texture: TensorInterface.TextureType, options: TensorFromTextureOptions): Tensor => { const {width, height, download, dispose} = options; // Always assume RGBAF32. TODO: support different texture format @@ -250,7 +250,7 @@ export const tensorFromTexture = ( /** * implementation of Tensor.fromGpuBuffer(). */ -export const tensorFromGpuBuffer = ( +export const tensorFromGpuBuffer = ( gpuBuffer: TensorInterface.GpuBufferType, options: TensorFromGpuBufferOptions): Tensor => { const {dataType, dims, download, dispose} = options; return new Tensor({location: 'gpu-buffer', type: dataType ?? 'float32', gpuBuffer, dims, download, dispose}); @@ -259,6 +259,6 @@ export const tensorFromGpuBuffer = ( /** * implementation of Tensor.fromPinnedBuffer(). 
*/ -export const tensorFromPinnedBuffer = >( +export const tensorFromPinnedBuffer = ( type: T, buffer: TensorInterface.DataTypeMap[T], dims?: readonly number[]): Tensor => new Tensor({location: 'cpu-pinned', type, data: buffer, dims: dims ?? [buffer.length]}); diff --git a/js/common/lib/tensor-factory.ts b/js/common/lib/tensor-factory.ts index 38d3106d56bcd..6e19d7fb898a3 100644 --- a/js/common/lib/tensor-factory.ts +++ b/js/common/lib/tensor-factory.ts @@ -39,15 +39,10 @@ interface GpuResourceConstructorParameters { dispose?(): void; } -/** - * supported data types for constructing a tensor from a pinned CPU buffer - */ -export type CpuPinnedDataTypes = Exclude; - /** * represent the parameter for constructing a tensor from a pinned CPU buffer */ -export interface CpuPinnedConstructorParameters extends +export interface CpuPinnedConstructorParameters extends CommonConstructorParameters { /** * Specify the location of the data to be 'cpu-pinned'. @@ -59,15 +54,10 @@ export interface CpuPinnedConstructorParameters extends +export interface TextureConstructorParameters extends CommonConstructorParameters, GpuResourceConstructorParameters { /** * Specify the location of the data to be 'texture'. @@ -79,15 +69,10 @@ export interface TextureConstructorParameters extends +export interface GpuBufferConstructorParameters extends CommonConstructorParameters, GpuResourceConstructorParameters { /** * Specify the location of the data to be 'gpu-buffer'. @@ -203,11 +188,11 @@ export interface TensorFromUrlOptions extends OptionsDimensions, OptionResizedDi export interface TensorFromImageBitmapOptions extends OptionResizedDimensions, OptionsTensorFormat, OptionsTensorLayout, OptionsTensorDataType, OptionsNormalizationParameters {} -export interface TensorFromTextureOptions extends +export interface TensorFromTextureOptions extends Required, OptionsFormat, GpuResourceConstructorParameters/* TODO: add more */ {} -export interface TensorFromGpuBufferOptions extends Pick, - GpuResourceConstructorParameters { +export interface TensorFromGpuBufferOptions extends + Pick, GpuResourceConstructorParameters { /** * Describes the data type of the tensor. 
*/ @@ -298,7 +283,7 @@ export interface TensorFactory { * * @returns a tensor object */ - fromTexture( + fromTexture( texture: Tensor.TextureType, options: TensorFromTextureOptions): TypedTensor<'float32'>; /** @@ -318,7 +303,7 @@ export interface TensorFactory { * * @returns a tensor object */ - fromGpuBuffer( + fromGpuBuffer( buffer: Tensor.GpuBufferType, options: TensorFromGpuBufferOptions): TypedTensor; /** diff --git a/js/common/lib/tensor-impl.ts b/js/common/lib/tensor-impl.ts index dbd8685de43f4..e3e2b9c728556 100644 --- a/js/common/lib/tensor-impl.ts +++ b/js/common/lib/tensor-impl.ts @@ -4,7 +4,7 @@ import {tensorToDataURL, tensorToImageData} from './tensor-conversion-impl.js'; import {TensorToDataUrlOptions, TensorToImageDataOptions} from './tensor-conversion.js'; import {tensorFromGpuBuffer, tensorFromImage, tensorFromPinnedBuffer, tensorFromTexture} from './tensor-factory-impl.js'; -import {CpuPinnedConstructorParameters, CpuPinnedDataTypes, GpuBufferConstructorParameters, GpuBufferDataTypes, TensorFromGpuBufferOptions, TensorFromImageBitmapOptions, TensorFromImageDataOptions, TensorFromImageElementOptions, TensorFromTextureOptions, TensorFromUrlOptions, TextureConstructorParameters} from './tensor-factory.js'; +import {CpuPinnedConstructorParameters, GpuBufferConstructorParameters, TensorFromGpuBufferOptions, TensorFromImageBitmapOptions, TensorFromImageDataOptions, TensorFromImageElementOptions, TensorFromTextureOptions, TensorFromUrlOptions, TextureConstructorParameters} from './tensor-factory.js'; import {checkBigInt, NUMERIC_TENSOR_TYPE_TO_TYPEDARRAY_MAP, NUMERIC_TENSOR_TYPEDARRAY_TO_TYPE_MAP, SupportedTypedArray, SupportedTypedArrayConstructors} from './tensor-impl-type-mapping.js'; import {calculateSize, tensorReshape} from './tensor-utils-impl.js'; import {Tensor as TensorInterface} from './tensor.js'; @@ -20,7 +20,7 @@ type TensorGpuBufferType = TensorInterface.GpuBufferType; /** * the implementation of Tensor interface. * - * @internal + * @ignore */ export class Tensor implements TensorInterface { // #region constructors @@ -102,7 +102,8 @@ export class Tensor implements TensorInterface { break; } case 'gpu-buffer': { - if (type !== 'float32' && type !== 'int32') { + if ((type !== 'float32' && type !== 'float16' && type !== 'int32' && type !== 'int64' && type !== 'uint32' && + type !== 'bool')) { throw new TypeError(`unsupported type "${type}" to create tensor from gpu buffer`); } this.gpuBufferData = arg0.gpuBuffer; @@ -240,16 +241,17 @@ export class Tensor implements TensorInterface { return tensorFromImage(image, options); } - static fromTexture(texture: TensorTextureType, options: TensorFromTextureOptions<'float32'>): TensorInterface { + static fromTexture( + texture: TensorTextureType, options: TensorFromTextureOptions): TensorInterface { return tensorFromTexture(texture, options); } - static fromGpuBuffer( + static fromGpuBuffer( gpuBuffer: TensorGpuBufferType, options: TensorFromGpuBufferOptions): TensorInterface { return tensorFromGpuBuffer(gpuBuffer, options); } - static fromPinnedBuffer( + static fromPinnedBuffer( type: T, buffer: TensorInterface.DataTypeMap[T], dims?: readonly number[]): Tensor { return tensorFromPinnedBuffer(type, buffer, dims); } @@ -316,7 +318,7 @@ export class Tensor implements TensorInterface { if (!this.cpuData) { throw new Error( 'The data is not on CPU. 
Use `getData()` to download GPU data to CPU, ' + - 'or use `texture` property to access the GPU data directly.'); + 'or use `texture` or `gpuBuffer` property to access the GPU data directly.'); } return this.cpuData; } diff --git a/js/common/lib/tensor.ts b/js/common/lib/tensor.ts index 10071eda39405..6c08d1fe8e057 100644 --- a/js/common/lib/tensor.ts +++ b/js/common/lib/tensor.ts @@ -105,11 +105,21 @@ export declare namespace Tensor { type DataType = DataTypeMap[Type]; type ElementType = ElementTypeMap[Type]; + /** + * supported data types for constructing a tensor from a pinned CPU buffer + */ + export type CpuPinnedDataTypes = Exclude; + /** * type alias for WebGL texture */ export type TextureType = WebGLTexture; + /** + * supported data types for constructing a tensor from a WebGL texture + */ + export type TextureDataTypes = 'float32'; + /** * type alias for WebGPU buffer * @@ -122,6 +132,11 @@ export declare namespace Tensor { */ export type GpuBufferType = {size: number; mapState: 'unmapped' | 'pending' | 'mapped'}; + /** + * supported data types for constructing a tensor from a WebGPU buffer + */ + export type GpuBufferDataTypes = 'float32'|'float16'|'int32'|'int64'|'uint32'|'bool'; + /** * represent where the tensor data is stored */ diff --git a/js/node/script/prepack.ts b/js/node/script/prepack.ts index be86c5687bec0..4c5941d8dae12 100644 --- a/js/node/script/prepack.ts +++ b/js/node/script/prepack.ts @@ -11,7 +11,7 @@ function updatePackageJson() { const packageCommon = fs.readJSONSync(commonPackageJsonPath); const packageSelf = fs.readJSONSync(selfPackageJsonPath); const version = packageCommon.version; - packageSelf.dependencies['onnxruntime-common'] = `~${version}`; + packageSelf.dependencies['onnxruntime-common'] = `${version}`; fs.writeJSONSync(selfPackageJsonPath, packageSelf, {spaces: 2}); console.log('=== finished updating package.json.'); } diff --git a/js/node/test/test-runner.ts b/js/node/test/test-runner.ts index 5d9cf61c8d45b..06ed0acfca36c 100644 --- a/js/node/test/test-runner.ts +++ b/js/node/test/test-runner.ts @@ -112,6 +112,14 @@ export function run(testDataRoot: string): void { }); } } + + if (!skipModel) { + after(async () => { + if (session !== null) { + await session.release(); + } + }); + } }); } } diff --git a/js/react_native/android/src/androidTest/java/ai/onnxruntime/reactnative/TensorHelperTest.java b/js/react_native/android/src/androidTest/java/ai/onnxruntime/reactnative/TensorHelperTest.java index 76fd608e4362b..72518488e6682 100644 --- a/js/react_native/android/src/androidTest/java/ai/onnxruntime/reactnative/TensorHelperTest.java +++ b/js/react_native/android/src/androidTest/java/ai/onnxruntime/reactnative/TensorHelperTest.java @@ -238,6 +238,34 @@ public void createInputTensor_double() throws Exception { outputTensor.close(); } + @Test + public void createInputTensor_bool() throws Exception { + OnnxTensor outputTensor = OnnxTensor.createTensor(ortEnvironment, new boolean[] {false, true}); + + JavaOnlyMap inputTensorMap = new JavaOnlyMap(); + + JavaOnlyArray dims = new JavaOnlyArray(); + dims.pushInt(2); + inputTensorMap.putArray("dims", dims); + + inputTensorMap.putString("type", TensorHelper.JsTensorTypeBool); + + ByteBuffer dataByteBuffer = ByteBuffer.allocate(2); + dataByteBuffer.put((byte)0); + dataByteBuffer.put((byte)1); + inputTensorMap.putMap("data", blobModule.testCreateData(dataByteBuffer.array())); + + OnnxTensor inputTensor = TensorHelper.createInputTensor(blobModule, inputTensorMap, ortEnvironment); + + 
Assert.assertEquals(inputTensor.getInfo().onnxType, TensorInfo.OnnxTensorType.ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL); + Assert.assertEquals(outputTensor.getInfo().onnxType, TensorInfo.OnnxTensorType.ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL); + Assert.assertEquals(inputTensor.toString(), outputTensor.toString()); + Assert.assertArrayEquals(inputTensor.getByteBuffer().array(), outputTensor.getByteBuffer().array()); + + inputTensor.close(); + outputTensor.close(); + } + @Test public void createOutputTensor_bool() throws Exception { MockitoSession mockSession = mockitoSession().mockStatic(Arguments.class).startMocking(); diff --git a/js/react_native/android/src/main/java/ai/onnxruntime/reactnative/TensorHelper.java b/js/react_native/android/src/main/java/ai/onnxruntime/reactnative/TensorHelper.java index d9c2e3bac5d9b..63cddace36640 100644 --- a/js/react_native/android/src/main/java/ai/onnxruntime/reactnative/TensorHelper.java +++ b/js/react_native/android/src/main/java/ai/onnxruntime/reactnative/TensorHelper.java @@ -174,7 +174,11 @@ private static OnnxTensor createInputTensor(TensorInfo.OnnxTensorType tensorType tensor = OnnxTensor.createTensor(ortEnvironment, buffer, dims, OnnxJavaType.UINT8); break; } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: { + ByteBuffer buffer = values; + tensor = OnnxTensor.createTensor(ortEnvironment, buffer, dims, OnnxJavaType.BOOL); + break; + } case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16: case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32: diff --git a/js/react_native/scripts/prepack.ts b/js/react_native/scripts/prepack.ts index 15ae69722108c..2e43294165a83 100644 --- a/js/react_native/scripts/prepack.ts +++ b/js/react_native/scripts/prepack.ts @@ -18,7 +18,7 @@ function updatePackageJson() { delete packageSelf.dependencies['onnxruntime-common']; } else { const version = packageCommon.version; - packageSelf.dependencies['onnxruntime-common'] = `~${version}`; + packageSelf.dependencies['onnxruntime-common'] = `${version}`; } fs.writeJSONSync(selfPackageJsonPath, packageSelf, {spaces: 2}); console.log('=== finished updating package.json.'); diff --git a/js/web/docs/webgpu-operators.md b/js/web/docs/webgpu-operators.md index a969e1b86bf99..de53f943bc9ef 100644 --- a/js/web/docs/webgpu-operators.md +++ b/js/web/docs/webgpu-operators.md @@ -30,6 +30,7 @@ Do not modify directly.* | Cos | ai.onnx(7+) | | | Cosh | ai.onnx(9+) | | | Div | ai.onnx(7-12,13,14+) | | +| Einsum | ai.onnx(12+) | | | Elu | ai.onnx(6+) | | | Equal | ai.onnx(7-10,11-12,13-18,19+) | | | Erf | ai.onnx(9-12,13+) | | @@ -44,10 +45,12 @@ Do not modify directly.* | GlobalAveragePool | ai.onnx(1+); com.ms.internal.nhwc(1+) | | | GlobalMaxPool | ai.onnx(1+); com.ms.internal.nhwc(1+) | | | Greater | ai.onnx(7-8,9-12,13+) | | +| GreaterOrEqual | ai.onnx(12-15,16+) | | | InstanceNormalization | ai.onnx(6+); com.ms.internal.nhwc(6+) | | | LayerNormalization | ai.onnx(17+) | | | LeakyRelu | ai.onnx(6-15,16+) | | | Less | ai.onnx(7-8,9-12,13+) | | +| LessOrEqual | ai.onnx(12-15,16+) | | | Log | ai.onnx(6-12,13+) | | | MatMul | ai.onnx(1-12,13+) | | | MaxPool | ai.onnx(1-7,8-9,10,11,12+); com.ms.internal.nhwc(11,12+) | need perf optimization; need implementing activation | diff --git a/js/web/lib/wasm/binding/ort-wasm.d.ts b/js/web/lib/wasm/binding/ort-wasm.d.ts index 7f0430b7b28b9..59da1369e152e 100644 --- a/js/web/lib/wasm/binding/ort-wasm.d.ts +++ b/js/web/lib/wasm/binding/ort-wasm.d.ts @@ -54,6 +54,7 @@ export interface OrtWasmModule extends 
EmscriptenModule { enableProfiling: boolean, profileFilePrefix: number, logId: number, logSeverityLevel: number, logVerbosityLevel: number, optimizedModelFilePath: number): number; _OrtAppendExecutionProvider(sessionOptionsHandle: number, name: number): number; + _OrtAddFreeDimensionOverride(sessionOptionsHandle: number, name: number, dim: number): number; _OrtAddSessionConfigEntry(sessionOptionsHandle: number, configKey: number, configValue: number): number; _OrtReleaseSessionOptions(sessionOptionsHandle: number): void; diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index 653957a9a3489..e6e78df2cfb23 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -110,6 +110,7 @@ export class WebGpuBackend { } this.env = env; + const requiredFeatures: GPUFeatureName[] = []; const deviceDescriptor: GPUDeviceDescriptor = { requiredLimits: { maxComputeWorkgroupStorageSize: adapter.limits.maxComputeWorkgroupStorageSize, @@ -121,13 +122,16 @@ export class WebGpuBackend { maxComputeWorkgroupSizeY: adapter.limits.maxComputeWorkgroupSizeY, maxComputeWorkgroupSizeZ: adapter.limits.maxComputeWorkgroupSizeZ, }, + requiredFeatures, }; // WebGPU Spec: Timestamp Queries Inside Passes // https://github.com/gpuweb/gpuweb/blob/main/proposals/timestamp-query-inside-passes.md if (adapter.features.has('timestamp-query-inside-passes')) { this.supportTimestampQuery = true; - // eslint-disable-next-line @typescript-eslint/no-explicit-any - deviceDescriptor.requiredFeatures = ['timestamp-query-inside-passes' as any]; + requiredFeatures.push('timestamp-query-inside-passes' as GPUFeatureName); + } + if (adapter.features.has('shader-f16')) { + requiredFeatures.push('shader-f16'); } this.device = await adapter.requestDevice(deviceDescriptor); diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index 23aabb6531f01..9c46b97694903 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -6,6 +6,7 @@ import * as binaryOps from './ops/binary-op'; import {concat, parseConcatAttributes} from './ops/concat'; import {conv, parseConvAttributes} from './ops/conv'; import {convTranspose, parseConvTransposeAttributes} from './ops/conv-transpose'; +import {einsum, parseEinsumAttributes} from './ops/einsum'; import {expand} from './ops/expand'; import {gather, parseGatherAttributes} from './ops/gather'; import {gatherElements, parseGatherElementsAttributes} from './ops/gather-elements'; @@ -52,6 +53,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new ['Cos', [unaryOps.cos]], ['Cosh', [unaryOps.cosh]], ['Div', [binaryOps.div]], + ['Einsum', [einsum, parseEinsumAttributes]], ['Elu', [unaryOps.elu, unaryOps.parseAlphaAttributes]], ['Equal', [binaryOps.equal]], ['Erf', [unaryOps.erf]], @@ -65,10 +67,12 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new ['GlobalAveragePool', [pool.globalAveragePool, pool.parseGlobalAveragePoolAttributes]], ['GlobalMaxPool', [pool.globalMaxPool, pool.parseGlobalMaxPoolAttributes]], ['Greater', [binaryOps.greater]], + ['GreaterOrEqual', [binaryOps.greaterOrEqual]], ['InstanceNormalization', [instanceNorm, parseInstanceNormAttributes]], ['LayerNormalization', [layerNorm, parseLayerNormAttributes]], ['LeakyRelu', [unaryOps.leakyRelu, unaryOps.parseAlphaAttributes]], ['Less', [binaryOps.less]], + ['LessOrEqual', [binaryOps.lessOrEqual]], ['Log', [unaryOps.log]], ['MatMul', [matMul]], // TODO: support new 
attributes for MaxPool-8 and MaxPool-10 diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts index fee872f4120e3..ab4f608451101 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts @@ -416,24 +416,23 @@ const matMulReadWriteFnSource = export const createMatmulProgramInfo = (metadata: ProgramMetadata, inputs: readonly TensorView[], activationAttributes: InternalActivationAttributes, - outputShape: readonly number[]): ProgramInfo => { + outputShape: readonly number[], reshapedOutputShape?: readonly number[]): ProgramInfo => { const aShape = inputs[0].dims; const bShape = inputs[1].dims; const outerDimsA = aShape.slice(0, -2); const outerDimsB = bShape.slice(0, -2); - const outerDims = outputShape.slice(0, -2); + const outerDims = reshapedOutputShape ? reshapedOutputShape.slice(0, -2) : outputShape.slice(0, -2); const batchDims = inputVariable('batchDims', inputs[0].dataType, outerDims); const batchADims = inputVariable('batchADims', inputs[0].dataType, outerDimsA); const batchBDims = inputVariable('batchBDims', inputs[0].dataType, outerDimsB); const variables = [batchADims, batchBDims, batchDims]; const batchSize = ShapeUtil.size(outerDims); - const dimAOuter = outputShape[outputShape.length - 2]; + const dimAOuter = aShape[aShape.length - 2]; const dimInner = aShape[aShape.length - 1]; - const dimBOuter = outputShape[outputShape.length - 1]; + const dimBOuter = bShape[bShape.length - 1]; const isVec4 = dimInner % 4 === 0 && dimBOuter % 4 === 0; - const component = isVec4 ? 4 : 1; const {activationFunction, applyActivation} = getActicationSnippet(activationAttributes); // TODO: fine tune size @@ -455,7 +454,7 @@ export const createMatmulProgramInfo = variables.push(output); const inputVariables = [A, B]; const hasBias = inputs.length > 2; - const declareFunctions = matMulReadWriteFnSource(component, hasBias, applyActivation, variables); + const declareFunctions = matMulReadWriteFnSource(components, hasBias, applyActivation, variables); if (hasBias) { inputVariables.push(inputVariable('bias', inputs[2].dataType, [dimBOuter / components], components)); } diff --git a/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts index 28284554f97fc..b004ca37a2ea8 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts @@ -240,3 +240,16 @@ export const less = (context: ComputeContext): void => { context.inputs, 'Less', ({scalar: (a, b) => `u32(${a}<${b})`, vector: (a, b) => `vec4(${a}<${b})`}), undefined, undefined, DataType.bool)); }; + +export const greaterOrEqual = (context: ComputeContext): void => { + context.compute(createBinaryOpProgramInfoLoader( + context.inputs, 'GreaterOrEqual', + ({scalar: (a, b) => `u32(${a}>=${b})`, vector: (a, b) => `vec4(${a}>=${b})`}), undefined, undefined, + DataType.bool)); +}; + +export const lessOrEqual = (context: ComputeContext): void => { + context.compute(createBinaryOpProgramInfoLoader( + context.inputs, 'LessOrEqual', ({scalar: (a, b) => `u32(${a}<=${b})`, vector: (a, b) => `vec4(${a}<=${b})`}), + undefined, undefined, DataType.bool)); +}; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/common.ts b/js/web/lib/wasm/jsep/webgpu/ops/common.ts index c96f4858db2ae..f3845e3110905 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/common.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/common.ts @@ -192,11 
+192,14 @@ export interface IndicesHelper { } const getWgslMappedType = (type: number, components: 1|2|3|4): string|[string, string] => { + if (components === 3) { + throw new Error('vec3 has same alignment as vec4, use vec4 instead'); + } + // return type is [ storage type, runtime type ] or a single string for both switch (type) { - // TODO: enable after "shader-f16" WSGL extension release - // case DataType.float16: - // return components > 1 ? `vec${components}` : 'f16'; + case DataType.float16: + return components > 1 ? `vec${components}` : 'f16'; case DataType.float: return components > 1 ? `vec${components}` : 'f32'; case DataType.int32: diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index afac503290c4d..3a83b1c2de6c1 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -147,6 +147,10 @@ const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attribut const hasBias = inputs.length === 3; // const hasPreluActivationWeights = false; /* TODO: add support for prelu activation weights */ const isChannelsLast = attributes.format === 'NHWC'; + if (!isChannelsLast || attributes.group !== 1) { + context.compute(createGroupedConvProgramInfoLoader(inputs, adjustedAttributes)); + return; + } // const batchSize = context.inputs[0].dims[0]; const inputHeight = inputs[0].dims[isChannelsLast ? 1 : 2]; @@ -169,36 +173,30 @@ const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attribut (weightHeight === 1 && weightWidth === 1 && attributes.dilations[0] === 1 && attributes.dilations[1] === 1 && attributes.strides[0] === 1 && attributes.strides[1] === 1 && attributes.pads[0] === 0 && attributes.pads[1] === 0)) { - if (isChannelsLast && attributes.group === 1) { - // conv2dByMatMul - const transposedWeight = (context.kernelCustomData.wT as TensorView | undefined) ?? - context.compute( - { - ...transposeProgramMetadata, - cacheHint: weightTransposeAttribute.cacheKey, - get: () => createTransposeProgramInfo(inputs[1], weightTransposeAttribute.perm) - }, - {inputs: [1], outputs: [attributes.wIsConst ? -2 : -1]})[0]; - if (attributes.wIsConst && !context.kernelCustomData.wT) { - context.kernelCustomData.wT = transposedWeight; - } - - const matmulInputs = []; - matmulInputs.push(inputs[0].reshape([batch, inputHeight * inputWidth, inputChannels])); - matmulInputs.push(transposedWeight.reshape([1, inputChannels, outChannels])); - if (hasBias) { - matmulInputs.push(inputs[2]); - } - context.compute( - createMatmulProgramInfoLoader(matmulInputs, adjustedAttributes, outputShape), {inputs: matmulInputs}); - } else { - context.compute(createGroupedConvProgramInfoLoader(inputs, adjustedAttributes)); + // conv2dByMatMul + const transposedWeight = (context.kernelCustomData.wT as TensorView | undefined) ?? + context.compute( + { + ...transposeProgramMetadata, + cacheHint: weightTransposeAttribute.cacheKey, + get: () => createTransposeProgramInfo(inputs[1], weightTransposeAttribute.perm) + }, + {inputs: [1], outputs: [attributes.wIsConst ? 
-2 : -1]})[0]; + if (attributes.wIsConst && !context.kernelCustomData.wT) { + context.kernelCustomData.wT = transposedWeight; } - return; - } - if (!isChannelsLast || attributes.group !== 1) { - context.compute(createGroupedConvProgramInfoLoader(inputs, adjustedAttributes)); + const matmulInputs = []; + matmulInputs.push(inputs[0].reshape([batch, inputHeight * inputWidth, inputChannels])); + matmulInputs.push(transposedWeight.reshape([1, inputChannels, outChannels])); + if (hasBias) { + matmulInputs.push(inputs[2]); + } + const matmulOutputShape = [batch, outHeight * outWidth, outChannels]; + context.compute( + createMatmulProgramInfoLoader(matmulInputs, adjustedAttributes, outputShape, matmulOutputShape), + {inputs: matmulInputs}); + return; } diff --git a/js/web/lib/wasm/jsep/webgpu/ops/einsum.ts b/js/web/lib/wasm/jsep/webgpu/ops/einsum.ts new file mode 100644 index 0000000000000..f0196f37c3153 --- /dev/null +++ b/js/web/lib/wasm/jsep/webgpu/ops/einsum.ts @@ -0,0 +1,290 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import {TensorView} from '../../tensor'; +import {ShapeUtil} from '../../util'; +import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; +import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; + +import {IndicesHelper, inputVariable, outputVariable, ShaderHelper} from './common'; + +export interface EinsumAttributes extends AttributeWithCacheKey { + readonly equation: string; +} +// The equation attribute value is a string which consists of a left hand side (LHS) and optionally a right hand side (RHS) +// separated by '->'. Ex. "ij,jk -> ik" expresses matrix multiplication +// "ij->ji" expresses matrix transpose +// "ii->i" diagonal elements of a square matrix +// The LHS consists of a sequence of terms separated by commas. Each term corresponds to an input variable. +// Each symbol corresponds to a dimension in the input variable. A symbol can be either a letter, 'a' to 'z' or 'A' to +// 'Z', or '...' to represent arbitrary dimensions. + +const symbolPattern = + '[a-zA-Z]|\\.\\.\\.'; // The pattern each symbol in each term in the symbolic equation should match +const termPattern = '(' + symbolPattern + ')+'; // The pattern each term in the symbolic equation should match +const termPatternOnly = '^' + termPattern + '$'; // The pattern only matches a term from beginning to end. +const lhsPattern = '(' + termPattern + ',)*' + termPattern; // The pattern the LHS should match +const lhsPatternOnly = '^' + lhsPattern + '$'; // The pattern only matches an LHS from beginning to end. + +interface SymbolInfo { + count: number; // Number of times the symbol occurs across the input terms + inputIndices: number[]; // Indices of the input terms the symbol appears in + dimValue: number; // Dimension size the symbol corresponds to +} + +class EinsumTerm { + constructor(inputIndex = -1) { + this.symbolToIndices = new Map(); + this.inputIndex = inputIndex; + } + + // Add a symbol to the term + addSymbol(symbol: string, index: number) { + let value = this.symbolToIndices.get(symbol); + if (value === undefined) { + value = [index]; + } else { + value.push(index); + } + this.symbolToIndices.set(symbol, value); + } + + symbolToIndices: Map; // Map from symbol to dimension indices of the input corresponding to the term + inputIndex: number; // -1 for output and 0, 1, 2, ...
for inputs +} + +class EinsumEquation { + constructor(inputs: readonly TensorView[], equation: string) { + this.hasEllipsis = false; + this.symbolToInfo = new Map(); + this.lhs = new Array(); + this.outputDims = []; + // As rhs needs to be updated, allow using let instead of const for both lhs and rhs. + // eslint-disable-next-line prefer-const + let [lhs, rhs] = equation.includes('->') ? equation.split('->', 2) : [equation, '']; + if (!lhs.match(RegExp(lhsPatternOnly))) { + throw new Error('Invalid LHS term'); + } + const inputTerms = lhs.split(','); + inputTerms.forEach((inputTerm, index) => { + const dims = inputs[index].dims.slice(); + if (!inputTerm.match(RegExp(termPatternOnly))) { + throw new Error('Invalid LHS term'); + } + const einsumTerm = this.processTerm(inputTerm, true, dims, index); + this.lhs.push(einsumTerm); + }); + + // Initialize the RHS if not specified + if (rhs === '') { + // Construct RHS from LHS terms/symbols + rhs += [...this.symbolToInfo.entries()] + .filter(([sym, info]) => (info.count === 1 || sym === '...')) + .map(([sym]) => sym) + .join(''); + } else { + if (!rhs.match(RegExp(termPattern))) { + throw new Error('Invalid RHS'); + } + } + + // Compute output dims + const rhsSymbols = rhs.match(RegExp(symbolPattern, 'g')); + rhsSymbols?.forEach((symbol) => { + if (symbol === '...') { + this.outputDims = this.outputDims.concat(this.ellipsisDims); + } else { + const info = this.symbolToInfo.get(symbol); + if (info === undefined) { + throw new Error('Invalid RHS symbol'); + } + this.outputDims.push(info.dimValue); + } + }); + this.rhs = this.processTerm(rhs, true, this.outputDims); + } // End of EinsumEquation constructor + + // Add a symbol to the equation + addSymbol(symbol: string, dimValue: number, inputIndex: number) { + let info = this.symbolToInfo.get(symbol); + if (info !== undefined) { + if (info.dimValue !== dimValue && info.count !== 1) { + throw new Error('Dimension mismatch'); + } else { + info.count++; + info.inputIndices.push(inputIndex); + } + } else { + info = {count: 1, dimValue, inputIndices: [inputIndex]}; + } + this.symbolToInfo.set(symbol, info); + } + + // Process one input/output term + processTerm(term: string, isInput: boolean, dims: readonly number[], index = -1): EinsumTerm { + const rank = dims.length; + let ellipsis = false; + let ellipsisDims = []; + let nextDim = 0; + // For the output, an empty string is allowed because the output may be reduced to a scalar value + if (!term.match(RegExp(termPatternOnly)) && (!isInput && term !== '')) { + throw new Error('Invalid LHS term'); + } + const indexSymbols = term.match(RegExp(symbolPattern, 'g')); + const einsumTerm = new EinsumTerm(index); + // a symbol can be either a letter, 'a' to 'z' or 'A' to 'Z', or '...'
+ indexSymbols?.forEach((symbol: string, i: number) => { + if (symbol === '...') { + if (ellipsis) { + throw new Error('Only one ellipsis is allowed per input term'); + } + ellipsis = true; + const ellipsisDimLength = rank - indexSymbols.length + 1; + if (ellipsisDimLength < 0) { + throw new Error('Ellipsis out of bounds'); + } + ellipsisDims = dims.slice(nextDim, nextDim + ellipsisDimLength); + if (this.hasEllipsis) { + if (this.ellipsisDims.length !== ellipsisDims.length || + this.ellipsisDims.toString() !== ellipsisDims.toString()) { + throw new Error('Ellipsis dimensions mismatch'); + } + } else if (isInput) { + this.hasEllipsis = true; + this.ellipsisDims = ellipsisDims; + } else { + throw new Error('Ellipsis must be specified in the LHS'); + } + // Add '0', '1', '2', '3', '4', etc to represent ellipsis dimensions to avoid special handling + for (let j = 0; j < ellipsisDims.length; j++) { + const symbol = String.fromCharCode('0'.charCodeAt(0) + i); + einsumTerm.addSymbol(symbol, i + j); + this.addSymbol(symbol, dims[nextDim++], index); + } + } else { + einsumTerm.addSymbol(symbol, i); + this.addSymbol(symbol, dims[nextDim++], index); + } + }); + return einsumTerm; + } + + symbolToInfo: Map; // All symbols in the equation + hasEllipsis: boolean; // The equation has ellipsis or not + ellipsisDims: number[]; // The dimensions of the equation ellipsis corresponds to. + lhs: EinsumTerm[]; // Terms on the left-hand side of the equation + rhs: EinsumTerm; // Term on the right-hand side of the equation + outputDims: number[]; // Output dimensions of the equation +} // End of class EinsumEquation + + +const createEinsumProgramMetadata = (inputCount: number, cacheHint: string): ProgramMetadata => + ({name: 'Einsum', inputTypes: Array(inputCount).fill(GpuDataType.default), cacheHint}); + +const createEinsumProgramInfo = + (metadata: ProgramMetadata, inputs: readonly TensorView[], einsumEquation: EinsumEquation): ProgramInfo => { + const dataType = inputs[0].dataType; + const inputVars = new Array(inputs.length); + for (let i = 0; i < inputs.length; ++i) { + inputVars[i] = inputVariable(`input${i}`, dataType, inputs[i].dims); + } + const outputShape = einsumEquation.outputDims; + const outputSize = ShapeUtil.size(outputShape); + const output = outputVariable('output', dataType, outputShape); + const idxCopy: string[] = []; + const rhsSymbols = Array.from(einsumEquation.rhs.symbolToIndices.keys()); + const initProd = 'var prod = 1.0;'; + const initSum = 'var sum = 0.0;'; + const updateSum = 'sum += prod;'; + const reduceOpsSetIndices: string[] = []; + const reduceOpsLoopHeaders: string[] = []; + const reduceOpsLoopFooters: string[] = []; + const reduceOpCompute: string[] = []; + const isReduceOpsWithoutLoop = einsumEquation.symbolToInfo.size === rhsSymbols.length; + einsumEquation.symbolToInfo.forEach((info, symbol) => { + if (rhsSymbols.includes(symbol)) { + const outputIndex = rhsSymbols.indexOf(symbol); + einsumEquation.lhs.forEach((term, i) => { + if (info.inputIndices.includes(i)) { + const indices = term.symbolToIndices.get(symbol); + if (indices === undefined) { + throw new Error('Invalid symbol error'); + } + indices.forEach((index) => { + idxCopy.push(`${ + inputVars[i].indicesSet( + `input${i}Indices`, index, output.indicesGet('outputIndices', outputIndex))}`); + }); + } + }); + } else { + einsumEquation.lhs.forEach((term, i) => { + const info = einsumEquation.symbolToInfo.get(symbol); + if (info === undefined) { + throw new Error('Invalid symbol error'); + } + if 
(info.inputIndices.includes(i)) { + const indices = term.symbolToIndices.get(symbol); + if (indices === undefined) { + throw new Error('Invalid symbol error'); + } + indices.forEach((index) => { + reduceOpsSetIndices.push(`${inputVars[i].indicesSet(`input${i}Indices`, index, `${symbol}`)}`); + }); + reduceOpCompute.push(`prod *= ${inputVars[i].getByIndices(`input${i}Indices`)};`); + } + }); + reduceOpsLoopHeaders.push(`for(var ${symbol}: u32 = 0; ${symbol} < ${ + einsumEquation.symbolToInfo.get(symbol)?.dimValue}; ${symbol}++) {`); + reduceOpsLoopFooters.push('}'); + } + }); + const reduceOps = isReduceOpsWithoutLoop ? + [ + ...idxCopy, + `let sum = ${inputVars.map((inputVar, i) => inputVar.getByIndices(`input${i}Indices`)).join(' * ')};` + ] : + [ + ...idxCopy, + initSum, + ...reduceOpsLoopHeaders, + ...reduceOpsSetIndices, + initProd, + ...reduceOpCompute, + updateSum, + ...reduceOpsLoopFooters, + ]; + const getShaderSource = (shaderHelper: ShaderHelper) => ` + ${shaderHelper.declareVariables(...inputVars, output)} + + ${shaderHelper.mainStart()} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)} + var outputIndices = ${output.offsetToIndices('global_idx')}; + ${inputVars.map((inputVar, i) => `var input${i}Indices: ${inputVars[i].type.indices};`).join('\n')} + ${reduceOps.join('\n')}; + ${output.setByOffset('global_idx', 'sum')}; + }`; + return { + ...metadata, + outputs: [{dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default}], + getShaderSource, + dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) + }; + }; + +const createEinsumProgramInfoLoader = + (inputs: readonly TensorView[], einsumEquation: EinsumEquation, attributes: EinsumAttributes): + ProgramInfoLoader => { + const metadata = createEinsumProgramMetadata(inputs.length, attributes.cacheKey); + return {...metadata, get: () => createEinsumProgramInfo(metadata, inputs, einsumEquation)}; + }; + +export const einsum = (context: ComputeContext, attributes: EinsumAttributes): void => { + const einsumEquation = new EinsumEquation(context.inputs, attributes.equation); + context.compute(createEinsumProgramInfoLoader(context.inputs, einsumEquation, attributes)); +}; + +export const parseEinsumAttributes = (attributes: Record): EinsumAttributes => { + const equation = (attributes.equation as string).replace(/\s+/g, ''); + return createAttributeWithCacheKey({equation}); +}; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts index 46816a2410586..3ce963b54f3ee 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts @@ -1,13 +1,12 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
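As a worked reference for the equation format parsed by the new einsum.ts above, the sketch below spells out what "ij,jk->ik" computes with plain loops. It is not part of the diff; the helper name is made up, and the expected result matches the "Multiply" case in the einsum.jsonc test data added later in this change.

```typescript
// Worked example of the einsum equation "ij,jk->ik": the output keeps the symbols
// i and k, while j appears only on the LHS and is therefore summed over:
//   C[i][k] = sum_j A[i][j] * B[j][k]   (ordinary matrix multiplication)
function einsumIjJkToIk(a: number[][], b: number[][]): number[][] {
  const [iDim, jDim] = [a.length, a[0].length];
  const kDim = b[0].length;
  const c = Array.from({length: iDim}, () => new Array<number>(kDim).fill(0));
  for (let i = 0; i < iDim; i++) {
    for (let k = 0; k < kDim; k++) {
      for (let j = 0; j < jDim; j++) {
        c[i][k] += a[i][j] * b[j][k];
      }
    }
  }
  return c;
}

// einsumIjJkToIk([[1, 2, 3], [4, 5, 6]], [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
// yields [[38, 44, 50, 56], [83, 98, 113, 128]], the values used by the
// "ij,jk->ik" / "Multiply" case in js/web/test/data/ops/einsum.jsonc.
```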
-import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor'; import {GemmUtil, ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; -import {ShaderHelper} from './common'; +import {ShaderHelper, tensorTypeToWsglStorageType} from './common'; const validateInputs = (inputs: readonly TensorView[]): void => { if (!inputs) { @@ -22,11 +21,6 @@ const validateInputs = (inputs: readonly TensorView[]): void => { throw new Error('Invalid input shape of C'); } - if ((inputs[0].dataType !== DataType.float) || (inputs[1].dataType !== DataType.float) || - (inputs.length === 3 && inputs[2].dataType !== DataType.float)) { - throw new Error('Invalid input type.'); - } - if ((inputs[0].dataType !== inputs[1].dataType) || (inputs.length === 3 && inputs[0].dataType !== inputs[2].dataType)) { throw new Error('Input types are mismatched'); @@ -81,7 +75,7 @@ const createGemmProgramInfo = line = 'value += a[m * K + k] * b[k * N + n];'; } - const dataType = 'f32'; // TODO: support other data type + const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); const calculateAlpha = attributes.alpha === 1 ? '' : 'value *= alpha;'; const calculateC = inputs.length === 3 ? `value += beta * c[${offsetC(M, N, inputs[2].dims)}];` : ''; const inputStorageBuffersDeclarations = [ diff --git a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts index 2d5750c3e2a88..e4dae00db6305 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts @@ -18,11 +18,14 @@ const createMatmulProgramMetadata = (hasBias: boolean, cacheHint: string) => ({ }); export const createMatmulProgramInfoLoader = - (inputs: readonly TensorView[], activationAttributes: InternalActivationAttributes, outputShape: readonly number[]): - ProgramInfoLoader => { - const metadata = createMatmulProgramMetadata(inputs.length > 2, activationAttributes.activationCacheKey); - return {...metadata, get: () => createMatmulProgramInfo(metadata, inputs, activationAttributes, outputShape)}; - }; + (inputs: readonly TensorView[], activationAttributes: InternalActivationAttributes, outputShape: readonly number[], + reshapedOutputShape?: readonly number[]): ProgramInfoLoader => { + const metadata = createMatmulProgramMetadata(inputs.length > 2, activationAttributes.activationCacheKey); + return { + ...metadata, + get: () => createMatmulProgramInfo(metadata, inputs, activationAttributes, outputShape, reshapedOutputShape) + }; + }; const validateInputs = (inputs: readonly TensorView[]): void => { if (!inputs || inputs.length !== 2) { diff --git a/js/web/lib/wasm/jsep/webgpu/ops/softmax.ts b/js/web/lib/wasm/jsep/webgpu/ops/softmax.ts index bdbf05e2f185e..e2443b24410a5 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/softmax.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/softmax.ts @@ -5,21 +5,17 @@ // performance limitations when the reduced axis is long. Need to add // a optimized codepath for this. 
-import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo} from '../types'; -import {ShaderHelper} from './common'; +import {ShaderHelper, tensorTypeToWsglStorageType} from './common'; const validateInputs = (inputs: readonly TensorView[]): void => { if (!inputs || inputs.length !== 1) { throw new Error('Softmax op requires 1 input.'); } - if (inputs[0].dataType !== DataType.float) { - throw new Error('Softmax input needs to be float.'); - } }; export interface SoftmaxAttributes extends AttributeWithCacheKey { @@ -33,7 +29,7 @@ export const softmaxProgramMetadata = { const createSoftmaxProgramInfo = (input: TensorView, attributes: SoftmaxAttributes): ProgramInfo => { - const dataType = 'f32'; + const dataType = tensorTypeToWsglStorageType(input.dataType); const shape = input.dims; const outputSize = ShapeUtil.size(shape); const WG = 64; @@ -48,6 +44,8 @@ const createSoftmaxProgramInfo = (input: TensorView, attributes: SoftmaxAttribut const cols = shape[axis]; const rows = outputSize / cols; + // 6.2.4 in wgsl spec + const threadMaxDecl = dataType === 'f32' ? 'var threadMax: f32 = -3.402823e+38f;' : 'var threadMax: f16 = -65504.0h;'; const getShaderSource = (_shaderHelper: ShaderHelper) => ` var rowMaxShared : ${dataType}; var rowSumShared : ${dataType}; @@ -76,7 +74,7 @@ const createSoftmaxProgramInfo = (input: TensorView, attributes: SoftmaxAttribut let row_stride : i32 = ${cols}; // find the rows max - var threadMax = -3.402823e+38f; // 6.2.4 in wgsl spec + ${threadMaxDecl} for (var col = lindex; col < cols; col += wg) { let value = getValue(row, col, row_stride); threadMax = max(threadMax, value); @@ -100,7 +98,7 @@ const createSoftmaxProgramInfo = (input: TensorView, attributes: SoftmaxAttribut workgroupBarrier(); // find the rows sum - var threadSum = 0.0; + var threadSum: ${dataType} = 0.0; for (var col = lindex; col < cols; col += wg) { let subExp = exp(getValue(row, col, row_stride) - rowMaxShared); threadSum += subExp; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts index ebedc61712e8a..9243b0e4af6b6 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts @@ -1,7 +1,6 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; @@ -22,11 +21,6 @@ const validateInputs = (inputs: readonly TensorView[]): void => { if (!inputs || inputs.length !== 1) { throw new Error('Transpose requires 1 input.'); } - - if (inputs[0].dataType !== DataType.float && inputs[0].dataType !== DataType.int32 && - inputs[0].dataType !== DataType.uint32) { - throw new Error('Transpose only support float, int32, and uint32 data types'); - } }; const getAdjustedPerm = (inputShape: readonly number[], perm: number[]): number[] => diff --git a/js/web/lib/wasm/jsep/webgpu/program-manager.ts b/js/web/lib/wasm/jsep/webgpu/program-manager.ts index a02d2ebeebf78..cce61be3448cd 100644 --- a/js/web/lib/wasm/jsep/webgpu/program-manager.ts +++ b/js/web/lib/wasm/jsep/webgpu/program-manager.ts @@ -126,10 +126,13 @@ export class ProgramManager { } build(programInfo: ProgramInfo, normalizedDispatchGroupSize: [number, number, number]): Artifact { const device = this.backend.device; - + const extensions: string[] = []; + if (device.features.has('shader-f16')) { + extensions.push('enable f16;'); + } const shaderHelper = createShaderHelper(normalizedDispatchGroupSize); const userCode = programInfo.getShaderSource(shaderHelper); - const code = `${shaderHelper.additionalImplementations}\n${userCode}`; + const code = `${extensions.join('\n')}\n${shaderHelper.additionalImplementations}\n${userCode}`; const shaderModule = device.createShaderModule({code, label: programInfo.name}); LOG_DEBUG('verbose', () => `[WebGPU] shader code: ${code}`); diff --git a/js/web/lib/wasm/session-handler.ts b/js/web/lib/wasm/session-handler.ts index d35f295592685..d8c5ae7886fe4 100644 --- a/js/web/lib/wasm/session-handler.ts +++ b/js/web/lib/wasm/session-handler.ts @@ -9,6 +9,7 @@ import {SerializableModeldata} from './proxy-messages'; import {createSession, createSessionAllocate, createSessionFinalize, endProfiling, initializeRuntime, releaseSession, run} from './proxy-wrapper'; let runtimeInitialized: boolean; +let runtimeInitializationPromise: Promise|undefined; export class OnnxruntimeWebAssemblySessionHandler implements SessionHandler { private sessionId: number; @@ -29,7 +30,11 @@ export class OnnxruntimeWebAssemblySessionHandler implements SessionHandler { async loadModel(pathOrBuffer: string|Uint8Array, options?: InferenceSession.SessionOptions): Promise { if (!runtimeInitialized) { - await initializeRuntime(env); + if (!runtimeInitializationPromise) { + runtimeInitializationPromise = initializeRuntime(env); + } + await runtimeInitializationPromise; + runtimeInitializationPromise = undefined; runtimeInitialized = true; } diff --git a/js/web/lib/wasm/session-options.ts b/js/web/lib/wasm/session-options.ts index 38caa9076e3c0..2659b471733f5 100644 --- a/js/web/lib/wasm/session-options.ts +++ b/js/web/lib/wasm/session-options.ts @@ -143,6 +143,21 @@ export const setSessionOptions = (options?: InferenceSession.SessionOptions): [n setExecutionProviders(sessionOptionsHandle, sessionOptions.executionProviders, allocs); } + if (sessionOptions.freeDimensionOverrides) { + for (const [name, value] of Object.entries(sessionOptions.freeDimensionOverrides)) { + if (typeof name !== 'string') { + throw new Error(`free dimension override name must be a string: ${name}`); + } + if (typeof value !== 'number' || !Number.isInteger(value) || value < 0) { + throw new Error(`free 
dimension override value must be a non-negative integer: ${value}`); + } + const nameOffset = allocWasmString(name, allocs); + if (wasm._OrtAddFreeDimensionOverride(sessionOptionsHandle, nameOffset, value) !== 0) { + checkLastError(`Can't set a free dimension override: ${name} - ${value}.`); + } + } + } + if (sessionOptions.extra !== undefined) { iterateExtraOptions(sessionOptions.extra, '', new WeakSet>(), (key, value) => { const keyDataOffset = allocWasmString(key, allocs); diff --git a/js/web/package-lock.json b/js/web/package-lock.json index 4c5649d8806c9..eabd641914170 100644 --- a/js/web/package-lock.json +++ b/js/web/package-lock.json @@ -1042,9 +1042,9 @@ "dev": true }, "node_modules/electron": { - "version": "23.1.2", - "resolved": "https://registry.npmjs.org/electron/-/electron-23.1.2.tgz", - "integrity": "sha512-ajE6xzIwH7swf8TlTU5WklDqpI3mPj4Am6690YrpCXzcp+E+dmMBXIajUUNt4joDrFhJC/lC6ZqDS2Q1BApKgQ==", + "version": "23.3.13", + "resolved": "https://registry.npmjs.org/electron/-/electron-23.3.13.tgz", + "integrity": "sha512-BaXtHEb+KYKLouUXlUVDa/lj9pj4F5kiE0kwFdJV84Y2EU7euIDgPthfKtchhr5MVHmjtavRMIV/zAwEiSQ9rQ==", "dev": true, "hasInstallScript": true, "dependencies": { @@ -4339,9 +4339,9 @@ "dev": true }, "electron": { - "version": "23.1.2", - "resolved": "https://registry.npmjs.org/electron/-/electron-23.1.2.tgz", - "integrity": "sha512-ajE6xzIwH7swf8TlTU5WklDqpI3mPj4Am6690YrpCXzcp+E+dmMBXIajUUNt4joDrFhJC/lC6ZqDS2Q1BApKgQ==", + "version": "23.3.13", + "resolved": "https://registry.npmjs.org/electron/-/electron-23.3.13.tgz", + "integrity": "sha512-BaXtHEb+KYKLouUXlUVDa/lj9pj4F5kiE0kwFdJV84Y2EU7euIDgPthfKtchhr5MVHmjtavRMIV/zAwEiSQ9rQ==", "dev": true, "requires": { "@electron/get": "^2.0.0", diff --git a/js/web/script/prepack.ts b/js/web/script/prepack.ts index be86c5687bec0..4c5941d8dae12 100644 --- a/js/web/script/prepack.ts +++ b/js/web/script/prepack.ts @@ -11,7 +11,7 @@ function updatePackageJson() { const packageCommon = fs.readJSONSync(commonPackageJsonPath); const packageSelf = fs.readJSONSync(selfPackageJsonPath); const version = packageCommon.version; - packageSelf.dependencies['onnxruntime-common'] = `~${version}`; + packageSelf.dependencies['onnxruntime-common'] = `${version}`; fs.writeJSONSync(selfPackageJsonPath, packageSelf, {spaces: 2}); console.log('=== finished updating package.json.'); } diff --git a/js/web/script/pull-prebuilt-wasm-artifacts.ts b/js/web/script/pull-prebuilt-wasm-artifacts.ts index a96741e4cf6ee..aa0a1f88e1d6b 100644 --- a/js/web/script/pull-prebuilt-wasm-artifacts.ts +++ b/js/web/script/pull-prebuilt-wasm-artifacts.ts @@ -145,12 +145,14 @@ downloadJson( extractFile(zip, WASM_FOLDER, 'ort-wasm-simd-threaded.wasm', folderName); extractFile(zip, WASM_FOLDER, 'ort-wasm-simd.jsep.wasm', folderName); extractFile(zip, WASM_FOLDER, 'ort-wasm-simd-threaded.jsep.wasm', folderName); + extractFile(zip, WASM_FOLDER, 'ort-training-wasm-simd.wasm', folderName); extractFile(zip, JS_FOLDER, 'ort-wasm.js', folderName); extractFile(zip, JS_FOLDER, 'ort-wasm-threaded.js', folderName); extractFile(zip, JS_FOLDER, 'ort-wasm-threaded.worker.js', folderName); extractFile(zip, JS_FOLDER, 'ort-wasm-simd.jsep.js', folderName); extractFile(zip, JS_FOLDER, 'ort-wasm-simd-threaded.jsep.js', folderName); + extractFile(zip, JS_FOLDER, 'ort-training-wasm-simd.js', folderName); }); }); }); diff --git a/js/web/test/data/ops/abs_int32.jsonc b/js/web/test/data/ops/abs-int32.jsonc similarity index 100% rename from js/web/test/data/ops/abs_int32.jsonc rename to 
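For illustration, a minimal sketch of how the new `freeDimensionOverrides` handling in session-options.ts above might be driven from application code. It is not part of the diff; the model path and dimension names are hypothetical placeholders, and the values follow the validation added above (non-negative integers keyed by dimension name).

```typescript
// Hypothetical usage sketch - not part of this change.
import {InferenceSession} from 'onnxruntime-web';

async function createSessionWithFixedShapes(): Promise<InferenceSession> {
  // 'model.onnx', 'batch_size' and 'sequence_length' are placeholders; the real
  // names come from the model's free (symbolic) input dimensions.
  return InferenceSession.create('model.onnx', {
    freeDimensionOverrides: {batch_size: 1, sequence_length: 128},
  });
}
```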
js/web/test/data/ops/abs-int32.jsonc diff --git a/js/web/test/data/ops/concat_int32.jsonc b/js/web/test/data/ops/concat_int32.jsonc new file mode 100644 index 0000000000000..6e2ce18c6f7c5 --- /dev/null +++ b/js/web/test/data/ops/concat_int32.jsonc @@ -0,0 +1,406 @@ +[ + { + "name": "Concat 2D axis=0", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 0, "type": "int" }], + "cases": [ + { + "name": "[4,4]", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + "dims": [4, 4], + "type": "int32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + "dims": [4, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16 + ], + "dims": [8, 4], + "type": "int32" + } + ] + }, + { + "name": "[2,4]", + "inputs": [ + { + "data": [1, 2, 5, 6, 3, 4, 7, 8], + "dims": [2, 4], + "type": "int32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8], + "dims": [2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [1, 2, 5, 6, 3, 4, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8], + "dims": [4, 4], + "type": "int32" + } + ] + }, + { + "name": "[2,3]", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [2, 3], + "type": "int32" + }, + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [2, 3], + "type": "int32" + } + ], + "outputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6], + "dims": [4, 3], + "type": "int32" + } + ] + } + ] + }, + { + "name": "Concat 2D axis=1", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 1, "type": "int" }], + "cases": [ + { + "name": "[4,4]", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + "dims": [4, 4], + "type": "int32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + "dims": [4, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 7, 8, 5, 6, 7, 8, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16, 13, 14, 15, + 16 + ], + "dims": [4, 8], + "type": "int32" + } + ] + }, + { + "name": "[2,4]", + "inputs": [ + { + "data": [1, 2, 5, 6, 3, 4, 7, 8], + "dims": [2, 4], + "type": "int32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8], + "dims": [2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [1, 2, 5, 6, 1, 2, 3, 4, 3, 4, 7, 8, 5, 6, 7, 8], + "dims": [2, 8], + "type": "int32" + } + ] + }, + { + "name": "[2,3]", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [2, 3], + "type": "int32" + }, + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [2, 3], + "type": "int32" + } + ], + "outputs": [ + { + "data": [1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6], + "dims": [2, 6], + "type": "int32" + } + ] + } + ] + }, + { + "name": "Concat 3D axis=0", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 0, "type": "int" }], + "cases": [ + { + "name": "[2,2,4]", + "inputs": [ + { + "data": [1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16], + "dims": [2, 2, 4], + "type": "int32" + }, + { + "data": [1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16], + "dims": [2, 2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, + 16 + ], + "dims": [4, 2, 4], + "type": "int32" + } + ] + } + ] + }, + { + "name": "Concat 3D axis=1", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 1, "type": "int" }], + "cases": [ + { + "name": "[2,2,4]", + "inputs": [ + { + 
"data": [1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16], + "dims": [2, 2, 4], + "type": "int32" + }, + { + "data": [1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16], + "dims": [2, 2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 9, 10, 13, 14, 11, 12, 15, + 16 + ], + "dims": [2, 4, 4], + "type": "int32" + } + ] + } + ] + }, + { + "name": "Concat 3D axis=2", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 2, "type": "int" }], + "cases": [ + { + "name": "[2,2,4]", + "inputs": [ + { + "data": [1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16], + "dims": [2, 2, 4], + "type": "int32" + }, + { + "data": [1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16], + "dims": [2, 2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 5, 6, 1, 2, 5, 6, 3, 4, 7, 8, 3, 4, 7, 8, 9, 10, 13, 14, 9, 10, 13, 14, 11, 12, 15, 16, 11, 12, 15, + 16 + ], + "dims": [2, 2, 8], + "type": "int32" + } + ] + } + ] + }, + { + "name": "Concat 4D axis=0", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 0, "type": "int" }], + "cases": [ + { + "name": "[2,2,2,4]", + "inputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32 + ], + "dims": [2, 2, 2, 4], + "type": "int32" + }, + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32 + ], + "dims": [2, 2, 2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32, 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, + 29, 30, 27, 28, 31, 32 + ], + "dims": [4, 2, 2, 4], + "type": "int32" + } + ] + } + ] + }, + { + "name": "Concat 4D axis=1", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 1, "type": "int" }], + "cases": [ + { + "name": "[2,2,2,4]", + "inputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32 + ], + "dims": [2, 2, 2, 4], + "type": "int32" + }, + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32 + ], + "dims": [2, 2, 2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, + 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, 28, 31, 32, 17, 18, 21, 22, 19, 20, 23, 24, 25, + 26, 29, 30, 27, 28, 31, 32 + ], + "dims": [2, 4, 2, 4], + "type": "int32" + } + ] + } + ] + }, + { + "name": "Concat 4D axis=2", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 2, "type": "int" }], + "cases": [ + { + "name": "[2,2,2,4]", + "inputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32 + ], + "dims": [2, 2, 2, 4], + "type": "int32" + }, + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32 + ], + "dims": [2, 2, 2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 9, 10, 13, 14, 11, 12, 
15, + 16, 17, 18, 21, 22, 19, 20, 23, 24, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, 28, 31, 32, 25, + 26, 29, 30, 27, 28, 31, 32 + ], + "dims": [2, 2, 4, 4], + "type": "int32" + } + ] + } + ] + }, + { + "name": "Concat 4D axis=3", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 3, "type": "int" }], + "cases": [ + { + "name": "[2,2,2,4]", + "inputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32 + ], + "dims": [2, 2, 2, 4], + "type": "int32" + }, + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32 + ], + "dims": [2, 2, 2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 5, 6, 1, 2, 5, 6, 3, 4, 7, 8, 3, 4, 7, 8, 9, 10, 13, 14, 9, 10, 13, 14, 11, 12, 15, 16, 11, 12, 15, + 16, 17, 18, 21, 22, 17, 18, 21, 22, 19, 20, 23, 24, 19, 20, 23, 24, 25, 26, 29, 30, 25, 26, 29, 30, 27, + 28, 31, 32, 27, 28, 31, 32 + ], + "dims": [2, 2, 2, 8], + "type": "int32" + } + ] + } + ] + } +] diff --git a/js/web/test/data/ops/einsum.jsonc b/js/web/test/data/ops/einsum.jsonc new file mode 100644 index 0000000000000..baf30cf982148 --- /dev/null +++ b/js/web/test/data/ops/einsum.jsonc @@ -0,0 +1,635 @@ +[ + { + "name": "einsum", + "operator": "Einsum", + "opset": { + "domain": "", + "version": 12 + }, + "attributes": [ + { + "name": "equation", + "data": "i,i", + "type": "string" + } + ], + "cases": [ + { + "name": "Dotproduct/scalar product", + "inputs": [ + { + "data": [1, 2, 3], + "dims": [3], + "type": "float32" + }, + { + "data": [4, 5, 6], + "dims": [3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [32], + "dims": [], + "type": "float32" + } + ] + } + ] + }, + { + "name": "einsum", + "operator": "Einsum", + "opset": { + "domain": "", + "version": 12 + }, + "attributes": [ + { + "name": "equation", + "data": "i,i->i", + "type": "string" + } + ], + "cases": [ + { + "name": "elementwise product", + "inputs": [ + { + "data": [1, 2, 3], + "dims": [3], + "type": "float32" + }, + { + "data": [4, 5, 6], + "dims": [3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [4, 10, 18], + "dims": [3], + "type": "float32" + } + ] + } + ] + }, + { + "name": "einsum", + "operator": "Einsum", + "opset": { + "domain": "", + "version": 12 + }, + "attributes": [ + { + "name": "equation", + "data": "i,j", + "type": "string" + } + ], + "cases": [ + { + "name": "Product without specifying RSH", + "inputs": [ + { + "data": [1, 2, 3], + "dims": [3], + "type": "float32" + }, + { + "data": [4, 5, 6], + "dims": [3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [4, 5, 6, 8, 10, 12, 12, 15, 18], + "dims": [3, 3], + "type": "float32" + } + ] + } + ] + }, + { + "name": "einsum", + "operator": "Einsum", + "opset": { + "domain": "", + "version": 12 + }, + "attributes": [ + { + "name": "equation", + "data": "i,j->ij", + "type": "string" + } + ], + "cases": [ + { + "name": "Product", + "inputs": [ + { + "data": [1, 2, 3], + "dims": [3], + "type": "float32" + }, + { + "data": [4, 5, 6], + "dims": [3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [4, 5, 6, 8, 10, 12, 12, 15, 18], + "dims": [3, 3], + "type": "float32" + } + ] + } + ] + }, + { + "name": "einsum", + "operator": "Einsum", + "opset": { + "domain": "", + "version": 12 + }, + "attributes": [ + { + "name": "equation", + "data": "ii,jj", + "type": "string" + } + ], + "cases": [ + { + "name": "Diagonal elementwise 
multiplication", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "dims": [3, 3], + "type": "float32" + }, + { + "data": [1, 0, 0, 0, 1, 0, 0, 0, 1], + "dims": [3, 3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [45], + "dims": [], + "type": "float32" + } + ] + } + ] + }, + { + "name": "einsum", + "operator": "Einsum", + "opset": { + "domain": "", + "version": 12 + }, + "attributes": [ + { + "name": "equation", + "data": "ii,jj -> ij", + "type": "string" + } + ], + "cases": [ + { + "name": "Dotproduct", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "dims": [3, 3], + "type": "float32" + }, + { + "data": [1, 0, 0, 0, 1, 0, 0, 0, 1], + "dims": [3, 3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [1, 1, 1, 5, 5, 5, 9, 9, 9], + "dims": [3, 3], + "type": "float32" + } + ] + } + ] + }, + { + "name": "einsum", + "operator": "Einsum", + "opset": { + "domain": "", + "version": 12 + }, + "attributes": [ + { + "name": "equation", + "data": "ij,jk->ik", + "type": "string" + } + ], + "cases": [ + { + "name": "Multiply", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [2, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], + "dims": [3, 4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [38, 44, 50, 56, 83, 98, 113, 128], + "dims": [2, 4], + "type": "float32" + } + ] + } + ] + }, + { + "name": "einsum", + "operator": "Einsum", + "opset": { + "domain": "", + "version": 12 + }, + "attributes": [ + { + "name": "equation", + "data": "ij->ji", + "type": "string" + } + ], + "cases": [ + { + "name": "Transpose", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [2, 3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [1, 4, 2, 5, 3, 6], + "dims": [3, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "einsum", + "operator": "Einsum", + "opset": { + "domain": "", + "version": 12 + }, + "attributes": [ + { + "name": "equation", + "data": "ij->i", + "type": "string" + } + ], + "cases": [ + { + "name": "ReduceSum", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "dims": [3, 3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [6, 15, 24], + "dims": [3], + "type": "float32" + } + ] + } + ] + }, + { + "name": "einsum", + "operator": "Einsum", + "opset": { + "domain": "", + "version": 12 + }, + "attributes": [ + { + "name": "equation", + "data": "ii->i", + "type": "string" + } + ], + "cases": [ + { + "name": "Diagonal", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "dims": [3, 3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [1, 5, 9], + "dims": [3], + "type": "float32" + } + ] + } + ] + }, + { + "name": "einsum", + "operator": "Einsum", + "opset": { + "domain": "", + "version": 12 + }, + "attributes": [ + { + "name": "equation", + "data": "ij...,jk...->ik...", + "type": "string" + } + ], + "cases": [ + { + "name": "Multiply with ellipsis - A", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [2, 3, 1], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], + "dims": [3, 4, 1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [38, 44, 50, 56, 83, 98, 113, 128], + "dims": [2, 4, 1], + "type": "float32" + } + ] + } + ] + }, + { + "name": "einsum", + "operator": "Einsum", + "opset": { + "domain": "", + "version": 12 + }, + "attributes": [ + { + "name": "equation", + "data": "...ij,...jk->...ik", + "type": "string" + } + ], + "cases": [ + { + "name": "Multiply with ellipsis - B", + "inputs": [ + { + "data": 
[1, 2, 3, 4, 5, 6], + "dims": [1, 2, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], + "dims": [1, 3, 4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [38, 44, 50, 56, 83, 98, 113, 128], + "dims": [1, 2, 4], + "type": "float32" + } + ] + } + ] + }, + { + "name": "einsum", + "operator": "Einsum", + "opset": { + "domain": "", + "version": 12 + }, + "attributes": [ + { + "name": "equation", + "data": "i...j,j...k->i...k", + "type": "string" + } + ], + "cases": [ + { + "name": "Multiply with ellipsis - C", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [2, 1, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], + "dims": [3, 1, 4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [38, 44, 50, 56, 83, 98, 113, 128], + "dims": [2, 1, 4], + "type": "float32" + } + ] + } + ] + }, + { + "name": "einsum", + "operator": "Einsum", + "opset": { + "domain": "", + "version": 12 + }, + "attributes": [ + { + "name": "equation", + "data": "...ij,jk->...ik", + "type": "string" + } + ], + "cases": [ + { + "name": "Multiply with ellipsis - D", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [1, 2, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], + "dims": [3, 4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [38, 44, 50, 56, 83, 98, 113, 128], + "dims": [1, 2, 4], + "type": "float32" + } + ] + } + ] + }, + { + "name": "einsum", + "operator": "Einsum", + "opset": { + "domain": "", + "version": 12 + }, + "attributes": [ + { + "name": "equation", + "data": "...ij->...ji", + "type": "string" + } + ], + "cases": [ + { + "name": "Transpose with ellipsis", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [1, 2, 3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [1, 4, 2, 5, 3, 6], + "dims": [1, 3, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "einsum", + "operator": "Einsum", + "opset": { + "domain": "", + "version": 12 + }, + "attributes": [ + { + "name": "equation", + "data": "...ij->...i", + "type": "string" + } + ], + "cases": [ + { + "name": "ReduceSum with ellipsis", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "dims": [1, 3, 3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [6, 15, 24], + "dims": [1, 3], + "type": "float32" + } + ] + } + ] + }, + { + "name": "einsum", + "operator": "Einsum", + "opset": { + "domain": "", + "version": 12 + }, + "attributes": [ + { + "name": "equation", + "data": "...ii->...i", + "type": "string" + } + ], + "cases": [ + { + "name": "Diagonal with ellipsis", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "dims": [1, 3, 3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [1, 5, 9], + "dims": [1, 3], + "type": "float32" + } + ] + } + ] + } +] diff --git a/js/web/test/data/ops/matmul-broadcast.jsonc b/js/web/test/data/ops/matmul-broadcast.jsonc new file mode 100644 index 0000000000000..170924bb585af --- /dev/null +++ b/js/web/test/data/ops/matmul-broadcast.jsonc @@ -0,0 +1,219 @@ +[ + { + "name": "matmul tests with no attributes", + "operator": "MatMul", + "attributes": [], + "cases": [ + { + "name": "multiplies 5D broadcasted to 6D tensors", + "inputs": [ + { + "data": [ + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, + 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 
114, 115, 116, 117, 118, 119, 120, 121, + 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, + 143, 144, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 145, 146, 147, + 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, + 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, + 190, 191, 192 + ], + "dims": [4, 3, 2, 4, 2], + "type": "float32" + }, + { + "data": [ + 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, + 72, 64, 65, 66, 67, 68, 69, 70, 71, 72, 64, 65, 66, 67, 68, 69, 70, 71, 72 + ], + "dims": [2, 4, 3, 2, 2, 3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 7377, 7476, 7575, 7675, 7778, 7881, 7973, 8080, 8187, 8271, 8382, 8493, 9259, 9374, 9489, 9581, 9700, + 9819, 9903, 10026, 10149, 10225, 10352, 10479, 11333, 11464, 11595, 11679, 11814, 11949, 12025, 12164, + 12303, 12371, 12514, 12657, 369, 516, 663, 379, 530, 681, 389, 544, 699, 399, 558, 717, 1387, 1550, 1713, + 1421, 1588, 1755, 1455, 1626, 1797, 1489, 1664, 1839, 2597, 2776, 2955, 2655, 2838, 3021, 2713, 2900, + 3087, 2771, 2962, 3153, 3999, 4194, 4389, 4081, 4280, 4479, 4163, 4366, 4569, 4245, 4452, 4659, 5593, + 5804, 6015, 5699, 5914, 6129, 5805, 6024, 6243, 5911, 6134, 6357, 7379, 7606, 7833, 7509, 7740, 7971, + 7639, 7874, 8109, 7769, 8008, 8247, 9357, 9600, 9843, 9511, 9758, 10005, 9665, 9916, 10167, 9819, 10074, + 10329, 11527, 11786, 12045, 11705, 11968, 12231, 11883, 12150, 12417, 12061, 12332, 12603, 13889, 14164, + 14439, 14091, 14370, 14649, 14293, 14576, 14859, 14495, 14782, 15069, 171, 174, 177, 397, 404, 411, 623, + 634, 645, 849, 864, 879, 1189, 1208, 1227, 1439, 1462, 1485, 1689, 1716, 1743, 1939, 1970, 2001, 2399, + 2434, 2469, 2673, 2712, 2751, 2947, 2990, 3033, 3221, 3268, 3315, 3801, 3852, 3903, 4099, 4154, 4209, + 4397, 4456, 4515, 4695, 4758, 4821, 5395, 5462, 5529, 5717, 5788, 5859, 6039, 6114, 6189, 6361, 6440, + 6519, 7181, 7264, 7347, 7527, 7614, 7701, 7873, 7964, 8055, 8219, 8314, 8409, 729, 1020, 1311, 739, 1034, + 1329, 749, 1048, 1347, 759, 1062, 1365, 2611, 2918, 3225, 2645, 2956, 3267, 2679, 2994, 3309, 2713, 3032, + 3351, 4685, 5008, 5331, 4743, 5070, 5397, 4801, 5132, 5463, 4859, 5194, 5529, 6951, 7290, 7629, 7033, + 7376, 7719, 7115, 7462, 7809, 7197, 7548, 7899, 9409, 9764, 10119, 
9515, 9874, 10233, 9621, 9984, 10347, + 9727, 10094, 10461, 12059, 12430, 12801, 12189, 12564, 12939, 12319, 12698, 13077, 12449, 12832, 13215, + 3813, 3912, 4011, 3967, 4070, 4173, 4121, 4228, 4335, 4275, 4386, 4497, 5119, 5234, 5349, 5297, 5416, + 5535, 5475, 5598, 5721, 5653, 5780, 5907, 6617, 6748, 6879, 6819, 6954, 7089, 7021, 7160, 7299, 7223, + 7366, 7509, 8307, 8454, 8601, 8533, 8684, 8835, 8759, 8914, 9069, 8985, 9144, 9303, 10189, 10352, 10515, + 10439, 10606, 10773, 10689, 10860, 11031, 10939, 11114, 11289, 12263, 12442, 12621, 12537, 12720, 12903, + 12811, 12998, 13185, 13085, 13276, 13467, 14529, 14724, 14919, 14827, 15026, 15225, 15125, 15328, 15531, + 15423, 15630, 15837, 16987, 17198, 17409, 17309, 17524, 17739, 17631, 17850, 18069, 17953, 18176, 18399, + 19637, 19864, 20091, 19983, 20214, 20445, 20329, 20564, 20799, 20675, 20914, 21153, 609, 852, 1095, 619, + 866, 1113, 629, 880, 1131, 639, 894, 1149, 2203, 2462, 2721, 2237, 2500, 2763, 2271, 2538, 2805, 2305, + 2576, 2847, 3989, 4264, 4539, 4047, 4326, 4605, 4105, 4388, 4671, 4163, 4450, 4737, 63, 66, 69, 145, 152, + 159, 227, 238, 249, 309, 324, 339, 505, 524, 543, 611, 634, 657, 717, 744, 771, 823, 854, 885, 1139, 1174, + 1209, 1269, 1308, 1347, 1399, 1442, 1485, 1529, 1576, 1623, 1965, 2016, 2067, 2119, 2174, 2229, 2273, + 2332, 2391, 2427, 2490, 2553, 2983, 3050, 3117, 3161, 3232, 3303, 3339, 3414, 3489, 3517, 3596, 3675, + 4193, 4276, 4359, 4395, 4482, 4569, 4597, 4688, 4779, 4799, 4894, 4989, 16443, 16734, 17025, 16669, 16964, + 17259, 16895, 17194, 17493, 17121, 17424, 17727, 19189, 19496, 19803, 19439, 19750, 20061, 19689, 20004, + 20319, 19939, 20258, 20577, 22127, 22450, 22773, 22401, 22728, 23055, 22675, 23006, 23337, 22949, 23284, + 23619, 22206, 22545, 22884, 22468, 22811, 23154, 22730, 23077, 23424, 22992, 23343, 23694, 23782, 24137, + 24492, 24050, 24409, 24768, 24318, 24681, 25044, 24586, 24953, 25320, 25415, 25786, 26157, 25689, 26064, + 26439, 25963, 26342, 26721, 26237, 26620, 27003 + ], + "dims": [2, 4, 3, 2, 4, 3], + "type": "float32" + } + ] + }, + { + "name": "multiplies 4D broadcasted to 6D tensors", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 17, 18, 19, 20, 21, 22, 23, 24, 9, 10, 11, 12, 13, 14, 15, 16], + "dims": [3, 1, 4, 2], + "type": "float32" + }, + { + "data": [ + 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, + 72, 64, 65, 66, 67, 68, 69, 70, 71, 72, 64, 65, 66, 67, 68, 69, 70, 71, 72 + ], + "dims": [2, 4, 3, 2, 2, 3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 
225, 228, 231, 523, 530, 537, 821, 832, 843, 1119, 1134, 1149, 243, 246, 249, 565, 572, 579, 887, 898, + 909, 1209, 1224, 1239, 3029, 3064, 3099, 3375, 3414, 3453, 3721, 3764, 3807, 4067, 4114, 4161, 89, 124, + 159, 99, 138, 177, 109, 152, 195, 119, 166, 213, 163, 182, 201, 197, 220, 243, 231, 258, 285, 265, 296, + 327, 277, 296, 315, 335, 358, 381, 393, 420, 447, 451, 482, 513, 63, 66, 69, 145, 152, 159, 227, 238, 249, + 309, 324, 339, 81, 84, 87, 187, 194, 201, 293, 304, 315, 399, 414, 429, 1139, 1174, 1209, 1269, 1308, + 1347, 1399, 1442, 1485, 1529, 1576, 1623, 1349, 1384, 1419, 1503, 1542, 1581, 1657, 1700, 1743, 1811, + 1858, 1905, 847, 866, 885, 1025, 1048, 1071, 1203, 1230, 1257, 1381, 1412, 1443, 961, 980, 999, 1163, + 1186, 1209, 1365, 1392, 1419, 1567, 1598, 1629, 171, 174, 177, 397, 404, 411, 623, 634, 645, 849, 864, + 879, 189, 192, 195, 439, 446, 453, 689, 700, 711, 939, 954, 969, 2399, 2434, 2469, 2673, 2712, 2751, 2947, + 2990, 3033, 3221, 3268, 3315, 2609, 2644, 2679, 2907, 2946, 2985, 3205, 3248, 3291, 3503, 3550, 3597, + 1531, 1550, 1569, 1853, 1876, 1899, 2175, 2202, 2229, 2497, 2528, 2559, 1645, 1664, 1683, 1991, 2014, + 2037, 2337, 2364, 2391, 2683, 2714, 2745, 9, 12, 15, 19, 26, 33, 29, 40, 51, 39, 54, 69, 27, 30, 33, 61, + 68, 75, 95, 106, 117, 129, 144, 159, 509, 544, 579, 567, 606, 645, 625, 668, 711, 683, 730, 777, 719, 754, + 789, 801, 840, 879, 883, 926, 969, 965, 1012, 1059, 505, 524, 543, 611, 634, 657, 717, 744, 771, 823, 854, + 885, 619, 638, 657, 749, 772, 795, 879, 906, 933, 1009, 1040, 1071, 117, 120, 123, 271, 278, 285, 425, + 436, 447, 579, 594, 609, 135, 138, 141, 313, 320, 327, 491, 502, 513, 669, 684, 699, 1769, 1804, 1839, + 1971, 2010, 2049, 2173, 2216, 2259, 2375, 2422, 2469, 1979, 2014, 2049, 2205, 2244, 2283, 2431, 2474, + 2517, 2657, 2704, 2751, 1189, 1208, 1227, 1439, 1462, 1485, 1689, 1716, 1743, 1939, 1970, 2001, 1303, + 1322, 1341, 1577, 1600, 1623, 1851, 1878, 1905, 2125, 2156, 2187, 225, 228, 231, 523, 530, 537, 821, 832, + 843, 1119, 1134, 1149, 243, 246, 249, 565, 572, 579, 887, 898, 909, 1209, 1224, 1239, 3029, 3064, 3099, + 3375, 3414, 3453, 3721, 3764, 3807, 4067, 4114, 4161, 89, 124, 159, 99, 138, 177, 109, 152, 195, 119, 166, + 213, 163, 182, 201, 197, 220, 243, 231, 258, 285, 265, 296, 327, 277, 296, 315, 335, 358, 381, 393, 420, + 447, 451, 482, 513, 63, 66, 69, 145, 152, 159, 227, 238, 249, 309, 324, 339, 81, 84, 87, 187, 194, 201, + 293, 304, 315, 399, 414, 429, 1139, 1174, 1209, 1269, 1308, 1347, 1399, 1442, 1485, 1529, 1576, 1623, + 1349, 1384, 1419, 1503, 1542, 1581, 1657, 1700, 1743, 1811, 1858, 1905, 847, 866, 885, 1025, 1048, 1071, + 1203, 1230, 1257, 1381, 1412, 1443, 961, 980, 999, 1163, 1186, 1209, 1365, 1392, 1419, 1567, 1598, 1629, + 171, 174, 177, 397, 404, 411, 623, 634, 645, 849, 864, 879, 189, 192, 195, 439, 446, 453, 689, 700, 711, + 939, 954, 969, 2399, 2434, 2469, 2673, 2712, 2751, 2947, 2990, 3033, 3221, 3268, 3315, 2294, 2329, 2364, + 2556, 2595, 2634, 2818, 2861, 2904, 3080, 3127, 3174, 1270, 1289, 1308, 1538, 1561, 1584, 1806, 1833, + 1860, 2074, 2105, 2136, 1303, 1322, 1341, 1577, 1600, 1623, 1851, 1878, 1905, 2125, 2156, 2187 + ], + "dims": [2, 4, 3, 2, 4, 3], + "type": "float32" + } + ] + }, + { + "name": "multiplies 6D with 4D tensors", + "inputs": [ + { + "data": [ + 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 
43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, + 72, 64, 65, 66, 67, 68, 69, 70, 71, 72, 64, 65, 66, 67, 68, 69, 70, 71, 72 + ], + "dims": [2, 4, 3, 2, 3, 2], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 17, 18, 19, 20, 21, 22, 23, 24, 9, 10, 11, 12, 13, 14, 15, 16], + "dims": [3, 1, 2, 4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 443, 590, 737, 884, 455, 606, 757, 908, 467, 622, 777, 932, 479, 638, 797, 956, 491, 654, 817, 980, 503, + 670, 837, 1004, 3251, 3422, 3593, 3764, 3327, 3502, 3677, 3852, 3403, 3582, 3761, 3940, 59, 62, 65, 68, + 135, 142, 149, 156, 211, 222, 233, 244, 167, 182, 197, 212, 211, 230, 249, 268, 255, 278, 301, 324, 299, + 326, 353, 380, 343, 374, 405, 436, 387, 422, 457, 492, 119, 158, 197, 236, 131, 174, 217, 260, 143, 190, + 237, 284, 155, 206, 257, 308, 167, 222, 277, 332, 179, 238, 297, 356, 1199, 1262, 1325, 1388, 1275, 1342, + 1409, 1476, 1351, 1422, 1493, 1564, 1427, 1502, 1577, 1652, 1503, 1582, 1661, 1740, 1579, 1662, 1745, + 1828, 959, 1046, 1133, 1220, 1003, 1094, 1185, 1276, 1047, 1142, 1237, 1332, 1091, 1190, 1289, 1388, 1135, + 1238, 1341, 1444, 1179, 1286, 1393, 1500, 335, 446, 557, 668, 347, 462, 577, 692, 359, 478, 597, 716, 371, + 494, 617, 740, 383, 510, 637, 764, 395, 526, 657, 788, 2567, 2702, 2837, 2972, 2643, 2782, 2921, 3060, + 2719, 2862, 3005, 3148, 2795, 2942, 3089, 3236, 2871, 3022, 3173, 3324, 2947, 3102, 3257, 3412, 1751, + 1910, 2069, 2228, 1795, 1958, 2121, 2284, 1839, 2006, 2173, 2340, 1883, 2054, 2225, 2396, 1927, 2102, + 2277, 2452, 1971, 2150, 2329, 2508, 11, 14, 17, 20, 23, 30, 37, 44, 35, 46, 57, 68, 47, 62, 77, 92, 59, + 78, 97, 116, 71, 94, 117, 140, 515, 542, 569, 596, 591, 622, 653, 684, 667, 702, 737, 772, 743, 782, 821, + 860, 819, 862, 905, 948, 895, 942, 989, 1036, 563, 614, 665, 716, 607, 662, 717, 772, 651, 710, 769, 828, + 695, 758, 821, 884, 739, 806, 873, 940, 783, 854, 925, 996, 227, 302, 377, 452, 239, 318, 397, 476, 251, + 334, 417, 500, 263, 350, 437, 524, 275, 366, 457, 548, 287, 382, 477, 572, 1883, 1982, 2081, 2180, 1959, + 2062, 2165, 2268, 2035, 2142, 2249, 2356, 2111, 2222, 2333, 2444, 2187, 2302, 2417, 2532, 2263, 2382, + 2501, 2620, 1355, 1478, 1601, 1724, 1399, 1526, 1653, 1780, 1443, 1574, 1705, 1836, 1487, 1622, 1757, + 1892, 1531, 1670, 1809, 1948, 1575, 1718, 1861, 2004, 443, 590, 737, 884, 455, 606, 757, 908, 467, 622, + 777, 932, 479, 638, 797, 956, 491, 654, 817, 980, 503, 670, 837, 1004, 3251, 3422, 3593, 3764, 3327, 3502, + 3677, 3852, 3403, 3582, 3761, 3940, 59, 62, 65, 68, 135, 142, 149, 156, 211, 222, 233, 244, 167, 182, 197, + 212, 211, 230, 249, 268, 255, 278, 301, 324, 299, 326, 353, 380, 343, 374, 405, 436, 387, 422, 457, 492, + 119, 158, 197, 236, 131, 174, 217, 260, 143, 190, 
237, 284, 155, 206, 257, 308, 167, 222, 277, 332, 179, + 238, 297, 356, 1199, 1262, 1325, 1388, 1275, 1342, 1409, 1476, 1351, 1422, 1493, 1564, 1427, 1502, 1577, + 1652, 1503, 1582, 1661, 1740, 1579, 1662, 1745, 1828, 959, 1046, 1133, 1220, 1003, 1094, 1185, 1276, 1047, + 1142, 1237, 1332, 1091, 1190, 1289, 1388, 1135, 1238, 1341, 1444, 1179, 1286, 1393, 1500, 335, 446, 557, + 668, 347, 462, 577, 692, 359, 478, 597, 716, 371, 494, 617, 740, 383, 510, 637, 764, 395, 526, 657, 788, + 2567, 2702, 2837, 2972, 2643, 2782, 2921, 3060, 2719, 2862, 3005, 3148, 2453, 2582, 2711, 2840, 2529, + 2662, 2795, 2928, 2605, 2742, 2879, 3016, 1553, 1694, 1835, 1976, 1480, 1616, 1752, 1888, 1443, 1574, + 1705, 1836, 1487, 1622, 1757, 1892, 1531, 1670, 1809, 1948, 1575, 1718, 1861, 2004 + ], + "dims": [2, 4, 3, 2, 3, 4], + "type": "float32" + } + ] + } + ] + } +] diff --git a/js/web/test/data/ops/neg-int32.jsonc b/js/web/test/data/ops/neg-int32.jsonc new file mode 100644 index 0000000000000..807333db4a96d --- /dev/null +++ b/js/web/test/data/ops/neg-int32.jsonc @@ -0,0 +1,26 @@ +[ + { + "name": "neg with no attributes", + "operator": "Neg", + "attributes": [], + "cases": [ + { + "name": "T[2,4] (int32)", + "inputs": [ + { + "data": [1, 2, -1, -2, 0, 1, -1, 0], + "dims": [2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [-1, -2, 1, 2, 0, -1, 1, 0], + "dims": [2, 4], + "type": "int32" + } + ] + } + ] + } +] diff --git a/js/web/test/data/ops/slice.jsonc b/js/web/test/data/ops/slice.jsonc new file mode 100644 index 0000000000000..9c90817a80c36 --- /dev/null +++ b/js/web/test/data/ops/slice.jsonc @@ -0,0 +1,40 @@ +[ + { + "name": "Slice float32", + "operator": "Slice", + "attributes": [], + "cases": [ + { + "name": "T[5] T[1] T[1] (float32)", + "inputs": [ + { + "data": [ + 0.3964604139328003, -0.8916832804679871, -1.6578896045684814, 1.960708737373352, 1.181204915046692 + ], + "dims": [5], + "type": "float32" + }, + { "data": [3], "dims": [1], "type": "int64" }, + { "data": [4], "dims": [1], "type": "int64" } + ], + "outputs": [{ "data": [1.960708737373352], "dims": [1], "type": "float32" }] + } + ] + }, + { + "name": "Slice int32", + "operator": "Slice", + "attributes": [], + "cases": [ + { + "name": "T[5] T[1] T[1] (int32)", + "inputs": [ + { "data": [0, 0, -1, 1, 0], "dims": [5], "type": "int32" }, + { "data": [3], "dims": [1], "type": "int64" }, + { "data": [4], "dims": [1], "type": "int64" } + ], + "outputs": [{ "data": [1], "dims": [1], "type": "int32" }] + } + ] + } +] diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index aca3526115c7e..f4249b24101e5 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -432,18 +432,18 @@ // // "test_compress_1", // // "test_compress_default_axis", // // "test_compress_negative_axis", - // "test_concat_1d_axis_0", - // "test_concat_1d_axis_negative_1", - // "test_concat_2d_axis_0", - // "test_concat_2d_axis_1", - // "test_concat_2d_axis_negative_1", - // "test_concat_2d_axis_negative_2", - // "test_concat_3d_axis_0", - // "test_concat_3d_axis_1", - // "test_concat_3d_axis_2", - // "test_concat_3d_axis_negative_1", - // "test_concat_3d_axis_negative_2", - // "test_concat_3d_axis_negative_3", + "test_concat_1d_axis_0", + "test_concat_1d_axis_negative_1", + "test_concat_2d_axis_0", + "test_concat_2d_axis_1", + "test_concat_2d_axis_negative_1", + "test_concat_2d_axis_negative_2", + "test_concat_3d_axis_0", + "test_concat_3d_axis_1", + "test_concat_3d_axis_2", + 
"test_concat_3d_axis_negative_1", + "test_concat_3d_axis_negative_2", + "test_concat_3d_axis_negative_3", "test_conv_with_autopad_same", "test_conv_with_strides_and_asymmetric_padding", "test_conv_with_strides_no_padding", @@ -904,14 +904,15 @@ "test_reduce_log_sum_asc_axes", "test_reduce_log_sum_default", "test_reduce_log_sum_desc_axes", - "test_reduce_log_sum_exp_default_axes_keepdims_example", - "test_reduce_log_sum_exp_default_axes_keepdims_random", - "test_reduce_log_sum_exp_do_not_keepdims_example", - "test_reduce_log_sum_exp_do_not_keepdims_random", - "test_reduce_log_sum_exp_keepdims_example", - "test_reduce_log_sum_exp_keepdims_random", - "test_reduce_log_sum_exp_negative_axes_keepdims_example", - "test_reduce_log_sum_exp_negative_axes_keepdims_random", + // tests "test_reduce_log_sum_exp_*" on opset17/opset18 are excluded because they use float64. + "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_example", + "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_random", + "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_example", + "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_random", + "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_example", + "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_random", + "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_example", + "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_random", "test_reduce_log_sum_negative_axes", "test_reduce_log_sum", "test_reduce_max_default_axes_keepdim_example", @@ -1322,14 +1323,15 @@ ], "ops": [ "abs.jsonc", - "abs_int32.jsonc", + "abs-int32.jsonc", "acos.jsonc", "add.jsonc", "add_int32.jsonc", //"and.jsonc", "asin.jsonc", "ceil.jsonc", - //"concat.jsonc", + "concat.jsonc", + "concat_int32.jsonc", "cast.jsonc", "conv.jsonc", "cos.jsonc", @@ -1347,9 +1349,11 @@ "less.jsonc", "log.jsonc", "matmul.jsonc", + "matmul-broadcast.jsonc", "mul.jsonc", "mul_int32.jsonc", //"neg.jsonc", + "neg-int32.jsonc", "not.jsonc", //"or.jsonc", "layer-norm.jsonc", @@ -1364,6 +1368,7 @@ "pow-big-number.jsonc", "reshape.jsonc", "skip-layer-norm.jsonc", + "slice.jsonc", //"softmax.jsonc", "sin.jsonc", //"split.jsonc", diff --git a/js/web/test/test-main.ts b/js/web/test/test-main.ts index e614cc8e67e71..49d0ac225be2f 100644 --- a/js/web/test/test-main.ts +++ b/js/web/test/test-main.ts @@ -110,9 +110,9 @@ for (const group of ORT_WEB_TEST_CONFIG.model) { test, ORT_WEB_TEST_CONFIG.profile, ORT_WEB_TEST_CONFIG.options.sessionOptions); }); - after('release session', () => { + after('release session', async () => { if (context) { - context.release(); + await context.release(); } }); diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index 9802f00f7a866..46d80a9f56f35 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -210,11 +210,12 @@ export class ModelTestContext { Logger.verbose('TestRunner.Perf', '***Perf Data End'); } - release(): void { + async release(): Promise { if (this.profile) { this.session.endProfiling(); } this.logPerfData(); + await this.session.release(); } /** diff --git a/onnxruntime/core/framework/allocator.cc b/onnxruntime/core/framework/allocator.cc index ceb3b3bbd72ab..2499ead9effbd 100644 --- a/onnxruntime/core/framework/allocator.cc +++ b/onnxruntime/core/framework/allocator.cc @@ -137,26 +137,18 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtA enum OrtMemType mem_type1, _Outptr_ OrtMemoryInfo** out) { if (strcmp(name1, onnxruntime::CPU) == 0) { *out = new OrtMemoryInfo(onnxruntime::CPU, type, 
OrtDevice(), id1, mem_type1); - } else if (strcmp(name1, onnxruntime::CUDA) == 0) { + } else if (strcmp(name1, onnxruntime::CUDA) == 0 || + strcmp(name1, onnxruntime::OpenVINO_GPU) == 0 || + strcmp(name1, onnxruntime::DML) == 0 || + strcmp(name1, onnxruntime::HIP) == 0 || + strcmp(name1, onnxruntime::WEBGPU_BUFFER) == 0) { *out = new OrtMemoryInfo( - onnxruntime::CUDA, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, static_cast(id1)), id1, + name1, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, static_cast(id1)), id1, mem_type1); } else if (strcmp(name1, onnxruntime::CUDA_PINNED) == 0) { *out = new OrtMemoryInfo( onnxruntime::CUDA_PINNED, type, OrtDevice(OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, static_cast(id1)), id1, mem_type1); - } else if (strcmp(name1, onnxruntime::OpenVINO_GPU) == 0) { - *out = new OrtMemoryInfo( - onnxruntime::OpenVINO_GPU, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, static_cast(id1)), - id1, mem_type1); - } else if (strcmp(name1, onnxruntime::DML) == 0) { - *out = new OrtMemoryInfo( - onnxruntime::DML, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, static_cast(id1)), - id1, mem_type1); - } else if (strcmp(name1, onnxruntime::HIP) == 0) { - *out = new OrtMemoryInfo( - onnxruntime::HIP, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, static_cast(id1)), id1, - mem_type1); } else if (strcmp(name1, onnxruntime::HIP_PINNED) == 0) { *out = new OrtMemoryInfo( onnxruntime::HIP_PINNED, type, OrtDevice(OrtDevice::CPU, OrtDevice::MemType::HIP_PINNED, static_cast(id1)), diff --git a/onnxruntime/core/framework/tuning_context.h b/onnxruntime/core/framework/tuning_context.h index b6569a21e4c91..aae70d85814bc 100644 --- a/onnxruntime/core/framework/tuning_context.h +++ b/onnxruntime/core/framework/tuning_context.h @@ -3,6 +3,7 @@ #pragma once +#include #include #include "core/common/common.h" diff --git a/onnxruntime/core/graph/function_utils.cc b/onnxruntime/core/graph/function_utils.cc index aa0727e3750b0..7477f48088a15 100644 --- a/onnxruntime/core/graph/function_utils.cc +++ b/onnxruntime/core/graph/function_utils.cc @@ -269,7 +269,7 @@ static void IOTypeConstraintHelper(const ONNX_NAMESPACE::FunctionProto& onnx_fun std::unique_ptr CreateSchema(const std::string& function_domain, const std::string& function_name, - const InlinedHashMap& model_local_functions, + const std::unordered_map& model_local_functions, const std::unordered_map& domain_version_map, const SchemaRegistryManager& schema_registry, const logging::Logger& logger, @@ -315,6 +315,7 @@ std::unique_ptr CreateSchema(const std::string& functi schema_registry.GetLastReleasedOpsetVersions(false); std::unordered_map func_domain_to_version; + func_domain_to_version.reserve(onnx_func_proto->opset_import().size()); for (auto& opSet : onnx_func_proto->opset_import()) { const auto& domain = opSet.domain(); const auto version = gsl::narrow_cast(opSet.version()); @@ -332,18 +333,16 @@ std::unique_ptr CreateSchema(const std::string& functi } } - op_schema->TypeAndShapeInferenceFunction( - [onnx_func_proto, func_domain_to_version, &model_local_functions](ONNX_NAMESPACE::InferenceContext& ctx) { - auto schema_registry = ONNX_NAMESPACE::OpSchemaRegistry::Instance(); - - constexpr bool check_type_true = true; - constexpr int error_mode_throw = 1; - constexpr bool enable_data_propagation_false = false; - ONNX_NAMESPACE::ShapeInferenceOptions options{check_type_true, error_mode_throw, enable_data_propagation_false}; + // Instantiate once and reuse for all 
shape inference calls. + constexpr bool check_type_true = true; + constexpr int error_mode_throw = 1; + constexpr bool enable_data_propagation_false = false; + static const ONNX_NAMESPACE::ShapeInferenceOptions inference_options{check_type_true, error_mode_throw, enable_data_propagation_false}; - std::unordered_map map_copy(model_local_functions.begin(), - model_local_functions.end()); - std::unordered_map empty_map; + // model_local_functions is a member of Model instance and will be alive at the time this is invoked. + op_schema->TypeAndShapeInferenceFunction( + [onnx_func_proto, func_domain_to_version = std::move(func_domain_to_version), &model_local_functions](ONNX_NAMESPACE::InferenceContext& ctx) { + auto* schema_registry = ONNX_NAMESPACE::OpSchemaRegistry::Instance(); // https://github.com/microsoft/onnxruntime/issues/17061 // We are passing a nullptr for the symbol table, because symbol table must be global @@ -351,8 +350,8 @@ std::unique_ptr CreateSchema(const std::string& functi // the same symbolic shapes and are marked for memory re-use. This is a Temp fix. constexpr ONNX_NAMESPACE::shape_inference::SymbolTableImpl* symbolTable = nullptr; ONNX_NAMESPACE::shape_inference::InferShapeForFunctionNode(*onnx_func_proto, func_domain_to_version, - schema_registry, ctx, options, map_copy, - symbolTable, &empty_map); + schema_registry, ctx, inference_options, model_local_functions, + symbolTable, nullptr); }); op_schema->Finalize(); diff --git a/onnxruntime/core/graph/function_utils.h b/onnxruntime/core/graph/function_utils.h index d2bb86d107e20..34e5e57189bd5 100644 --- a/onnxruntime/core/graph/function_utils.h +++ b/onnxruntime/core/graph/function_utils.h @@ -31,6 +31,8 @@ std::unique_ptr CreateSchema(const Graph& graph, * @param function_name The name of the function. * @param model_local_functions The map of local functions in the same onnx model. * This will be used as context for the function's type/shape inference. + * This argument is captured by shape inferencing lambda by reference and must + * be alive at the time of the shape inferencing. * @param domain_version_map Domain to version map used in current onnx model. * @param schema_registry The schema registry current model is using. * @param logger The logger current model is using. 
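The doc comment added above states the contract that the function_utils.cc change relies on: the generated schema's type/shape-inference lambda now captures model_local_functions by reference instead of copying it on every call, so the owning Model must stay alive whenever that schema's inference runs. A minimal stand-alone C++ sketch of the same pattern (illustrative only; Registry, local_functions and infer are invented names, not ONNX Runtime APIs):

#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

struct Registry {
  // Owned here; the callback references it rather than copying it per call.
  std::unordered_map<std::string, int> local_functions;
  std::function<void(const std::string&)> infer;

  Registry() {
    local_functions = {{"my.domain:Foo", 1}, {"my.domain:Bar", 2}};
    // Capture by reference: no per-call copy, but the map (and this Registry)
    // must outlive every invocation of `infer`.
    infer = [&funcs = local_functions](const std::string& id) {
      std::cout << id << (funcs.count(id) ? " found\n" : " missing\n");
    };
  }
};

int main() {
  Registry r;
  r.infer("my.domain:Foo");   // found
  r.infer("my.domain:Baz");   // missing
}

If the owner could be destroyed before the last inference call, the old copy-per-call behaviour would be the safer (if slower) choice; the patch instead documents the lifetime requirement and drops the copy.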
@@ -38,7 +40,7 @@ std::unique_ptr CreateSchema(const Graph& graph, */ std::unique_ptr CreateSchema(const std::string& function_domain, const std::string& function_name, - const InlinedHashMap& model_local_functions, + const std::unordered_map& model_local_functions, const std::unordered_map& domain_version_map, const SchemaRegistryManager& schema_registry, const logging::Logger& logger, diff --git a/onnxruntime/core/graph/model.cc b/onnxruntime/core/graph/model.cc index d206af1acfb88..05747a7e5124d 100644 --- a/onnxruntime/core/graph/model.cc +++ b/onnxruntime/core/graph/model.cc @@ -91,10 +91,11 @@ Model::Model(const std::string& graph_name, opset_id_proto->set_version(version); } + model_local_functions_.reserve(model_local_functions.size()); for (auto& func : model_local_functions) { auto func_ptr = model_proto_.add_functions(); func_ptr->CopyFrom(func); - model_local_functions_[function_utils::GetFunctionIdentifier(func_ptr->domain(), func_ptr->name())] = func_ptr; + model_local_functions_.insert_or_assign(function_utils::GetFunctionIdentifier(func_ptr->domain(), func_ptr->name()), func_ptr); } model_local_function_templates_.reserve(model_proto_.functions().size()); @@ -214,9 +215,9 @@ Model::Model(ModelProto&& model_proto, const PathString& model_path, } } - std::vector model_local_functions; + model_local_functions_.reserve(model_proto_.functions().size()); for (auto& func : model_proto_.functions()) { - model_local_functions_[function_utils::GetFunctionIdentifier(func.domain(), func.name())] = &func; + model_local_functions_.insert_or_assign(function_utils::GetFunctionIdentifier(func.domain(), func.name()), &func); } model_local_function_templates_.reserve(model_proto_.functions().size()); diff --git a/onnxruntime/core/graph/model.h b/onnxruntime/core/graph/model.h index 7e3942b029251..6bdb68dd734f0 100644 --- a/onnxruntime/core/graph/model.h +++ b/onnxruntime/core/graph/model.h @@ -310,7 +310,8 @@ class Model { // map from function id to pointer of model local function proto // FunctionProto is hosted in ModelProto. // this map will be used for the local functions' schema's type/shape inference. - InlinedHashMap model_local_functions_; + // This container is used by ONNX code and must be an std::unordered_map. + std::unordered_map model_local_functions_; // this is the container that host the generated schemas for model local functions. // the generated schemare will be used for graph resolving and type/shape inference. // those schemas' type/shape inference will reference to the model_local_functions_ as context, diff --git a/onnxruntime/core/optimizer/constant_folding.cc b/onnxruntime/core/optimizer/constant_folding.cc index eb130785add1c..f46273f2680a9 100644 --- a/onnxruntime/core/optimizer/constant_folding.cc +++ b/onnxruntime/core/optimizer/constant_folding.cc @@ -185,19 +185,24 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level, fetch_mlvalue_idxs.push_back(info.GetMLValueIndex(node_out->Name())); } - auto& ep_type = node->GetExecutionProviderType(); - const bool node_on_cpu_ep = ep_type == kCpuExecutionProvider; + const bool node_on_cpu_ep = node->GetExecutionProviderType() == kCpuExecutionProvider; + + std::unique_ptr kernel; - // override the EP assigned to the node so that it will use the CPU kernel for Compute. 
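// Why the string copy below matters: the removed code bound `auto& ep_type` to the node's EP
// string, so after SetExecutionProviderType(kCpuExecutionProvider) that reference observed the
// new value and the later "restore" simply re-assigned CPU, losing the originally assigned EP.
// Copying the string (`auto ep_type = ...`) before overriding preserves the original value so it
// can be reinstated once the CPU kernel has been created.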
if (!node_on_cpu_ep) { + // We need to copy the string here instead of taking a reference to it since node->SetExecutionProviderType + // will change the value of the reference + auto ep_type = node->GetExecutionProviderType(); + + // override the EP assigned to the node so that it will use the CPU kernel for Compute. node->SetExecutionProviderType(kCpuExecutionProvider); - } - auto kernel = info.CreateKernel(node); + kernel = info.CreateKernel(node); - // undo the EP change to the value that was assigned at graph partitioning time - if (!node_on_cpu_ep) { + // undo the EP change to the value that was assigned at graph partitioning time node->SetExecutionProviderType(ep_type); + } else { + kernel = info.CreateKernel(node); } // We currently constant fold using the CPU EP only. diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc index cc7a892d1c445..7783d3b3f36b7 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc @@ -60,6 +60,7 @@ static const OpVersionsAndSelector::OpVersionsMap GetUnaryOpVersionsMap() { {"HardSwish", {}}, {"Sigmoid", {}}, {"Slice", {}}, + {"LogSoftmax", {}}, {"Softmax", {}}, {"Sqrt", {}}, {"Atan", {}}, @@ -72,7 +73,10 @@ static const OpVersionsAndSelector::OpVersionsMap GetUnaryOpVersionsMap() { {"Log", {}}, {"LRN", {}}, {"Ceil", {}}, + {"Floor", {}}, + {"Round", {}}, {"Abs", {}}, + {"Neg", {}}, {"DepthToSpace", {}}, {"SpaceToDepth", {}}}; } @@ -82,10 +86,13 @@ static const OpVersionsAndSelector::OpVersionsMap GetBinaryOpVersionsMap() { {"Mul", {}}, {"Pow", {}}, {"Sub", {}}, + {"PRelu", {}}, {"GridSample", {}}}; } static const OpVersionsAndSelector::OpVersionsMap GetVariadicOpVersionsMap() { - return {{"Concat", {}}}; + return {{"Concat", {}}, + {"Max", {}}, + {"Min", {}}}; } static const OpVersionsAndSelector::OpVersionsMap GetConvOpVersionsMap() { return {{"Conv", {}}}; diff --git a/onnxruntime/core/providers/cpu/ml/tree_ensemble_aggregator.h b/onnxruntime/core/providers/cpu/ml/tree_ensemble_aggregator.h index aba9a798cf786..b9f3050e59c5b 100644 --- a/onnxruntime/core/providers/cpu/ml/tree_ensemble_aggregator.h +++ b/onnxruntime/core/providers/cpu/ml/tree_ensemble_aggregator.h @@ -64,6 +64,18 @@ enum MissingTrack : uint8_t { kFalse = 0 }; +template +struct TreeNodeElement; + +template +union PtrOrWeight { + TreeNodeElement* ptr; + struct WeightData { + int32_t weight; + int32_t n_weights; + } weight_data; +}; + template struct TreeNodeElement { int feature_id; @@ -71,24 +83,19 @@ struct TreeNodeElement { // Stores the node threshold or the weights if the tree has one target. T value_or_unique_weight; - // onnx specification says hitrates is used to store information about the node, + // The onnx specification says hitrates is used to store information about the node, // but this information is not used for inference. // T hitrates; - // True node, false node are obtained by computing `this + truenode_inc_or_first_weight`, - // `this + falsenode_inc_or_n_weights` if the node is not a leaf. - // In case of a leaf, these attributes are used to indicate the position of the weight - // in array `TreeEnsembleCommon::weights_`. If the number of targets or classes is one, - // the weight is also stored in `value_or_unique_weight`. 
- // This implementation assumes a tree has less than 2^31 nodes, - // and the total number of leave in the set of trees is below 2^31. - // A node cannot point to itself. - int32_t truenode_inc_or_first_weight; - // In case of a leaf, the following attribute indicates the number of weights - // in array `TreeEnsembleCommon::weights_`. If not a leaf, it indicates - // `this + falsenode_inc_or_n_weights` is the false node. - // A node cannot point to itself. - int32_t falsenode_inc_or_n_weights; + // PtrOrWeight acts as a tagged union, with the "tag" being whether the node is a leaf or not (see `is_not_leaf`). + + // If it is not a leaf, it is a pointer to the true child node when traversing the decision tree. The false branch is + // always 1 position away from the TreeNodeElement in practice in `TreeEnsembleCommon::nodes_` so it is not stored. + + // If it is a leaf, it contains `weight` and `n_weights` attributes which are used to indicate the position of the + // weight in array `TreeEnsembleCommon::weights_`. If the number of targets or classes is one, the weight is also + // stored in `value_or_unique_weight`. + PtrOrWeight truenode_or_weight; uint8_t flags; inline NODE_MODE mode() const { return NODE_MODE(flags & 0xF); } @@ -189,8 +196,8 @@ class TreeAggregatorSum : public TreeAggregator>& predictions, const TreeNodeElement& root, gsl::span> weights) const { - auto it = weights.begin() + root.truenode_inc_or_first_weight; - for (int32_t i = 0; i < root.falsenode_inc_or_n_weights; ++i, ++it) { + auto it = weights.begin() + root.truenode_or_weight.weight_data.weight; + for (int32_t i = 0; i < root.truenode_or_weight.weight_data.n_weights; ++i, ++it) { ORT_ENFORCE(it->i < (int64_t)predictions.size()); predictions[onnxruntime::narrow(it->i)].score += it->value; predictions[onnxruntime::narrow(it->i)].has_score = 1; @@ -292,8 +299,8 @@ class TreeAggregatorMin : public TreeAggregator>& predictions, const TreeNodeElement& root, gsl::span> weights) const { - auto it = weights.begin() + root.truenode_inc_or_first_weight; - for (int32_t i = 0; i < root.falsenode_inc_or_n_weights; ++i, ++it) { + auto it = weights.begin() + root.truenode_or_weight.weight_data.weight; + for (int32_t i = 0; i < root.truenode_or_weight.weight_data.n_weights; ++i, ++it) { predictions[onnxruntime::narrow(it->i)].score = (!predictions[onnxruntime::narrow(it->i)].has_score || it->value < predictions[onnxruntime::narrow(it->i)].score) ? it->value @@ -349,8 +356,8 @@ class TreeAggregatorMax : public TreeAggregator>& predictions, const TreeNodeElement& root, gsl::span> weights) const { - auto it = weights.begin() + root.truenode_inc_or_first_weight; - for (int32_t i = 0; i < root.falsenode_inc_or_n_weights; ++i, ++it) { + auto it = weights.begin() + root.truenode_or_weight.weight_data.weight; + for (int32_t i = 0; i < root.truenode_or_weight.weight_data.n_weights; ++i, ++it) { predictions[onnxruntime::narrow(it->i)].score = (!predictions[onnxruntime::narrow(it->i)].has_score || it->value > predictions[onnxruntime::narrow(it->i)].score) ? 
it->value diff --git a/onnxruntime/core/providers/cpu/ml/tree_ensemble_common.h b/onnxruntime/core/providers/cpu/ml/tree_ensemble_common.h index 161bb2b0820eb..8f847fe66aa73 100644 --- a/onnxruntime/core/providers/cpu/ml/tree_ensemble_common.h +++ b/onnxruntime/core/providers/cpu/ml/tree_ensemble_common.h @@ -85,6 +85,13 @@ class TreeEnsembleCommon : public TreeEnsembleCommonAttributes { template void ComputeAgg(concurrency::ThreadPool* ttp, const Tensor* X, Tensor* Y, Tensor* label, const AGG& agg) const; + + private: + size_t AddNodes(const size_t i, const InlinedVector& cmodes, const InlinedVector& truenode_ids, + const InlinedVector& falsenode_ids, const std::vector& nodes_featureids, + const std::vector& nodes_values_as_tensor, const std::vector& node_values, + const std::vector& nodes_missing_value_tracks_true, std::vector& updated_mapping, + int64_t tree_id, const InlinedVector& node_tree_ids); }; template @@ -186,7 +193,7 @@ Status TreeEnsembleCommon::Init( max_tree_depth_ = 1000; ORT_ENFORCE(nodes_modes.size() < std::numeric_limits::max()); - // additional members + // Additional members size_t limit; uint32_t i; InlinedVector cmodes; @@ -195,18 +202,14 @@ Status TreeEnsembleCommon::Init( int fpos = -1; for (i = 0, limit = nodes_modes.size(); i < limit; ++i) { cmodes.push_back(MakeTreeNodeMode(nodes_modes[i])); - if (cmodes[i] == NODE_MODE::LEAF) - continue; + if (cmodes[i] == NODE_MODE::LEAF) continue; if (fpos == -1) { fpos = static_cast(i); continue; } - if (cmodes[i] != cmodes[fpos]) - same_mode_ = false; + if (cmodes[i] != cmodes[fpos]) same_mode_ = false; } - // filling nodes - n_nodes_ = nodes_treeids.size(); limit = static_cast(n_nodes_); InlinedVector node_tree_ids; @@ -214,156 +217,185 @@ Status TreeEnsembleCommon::Init( nodes_.clear(); nodes_.reserve(limit); roots_.clear(); - std::unordered_map idi; - idi.reserve(limit); + std::unordered_map node_tree_ids_map; + node_tree_ids_map.reserve(limit); + + InlinedVector truenode_ids, falsenode_ids; + truenode_ids.reserve(limit); + falsenode_ids.reserve(limit); max_feature_id_ = 0; + // Build node_tree_ids and node_tree_ids_map and truenode_ids and falsenode_ids for (i = 0; i < limit; ++i) { - TreeNodeElementId node_tree_id{static_cast(nodes_treeids[i]), - static_cast(nodes_nodeids[i])}; - TreeNodeElement node; - node.feature_id = static_cast(nodes_featureids[i]); - if (node.feature_id > max_feature_id_) { - max_feature_id_ = node.feature_id; - } - node.value_or_unique_weight = nodes_values_as_tensor.empty() - ? static_cast(nodes_values[i]) - : nodes_values_as_tensor[i]; - - /* hitrates is not used for inference, they are ignored. - if (nodes_hitrates_as_tensor.empty()) { - node.hitrates = static_cast(i < nodes_hitrates.size() ? nodes_hitrates[i] : -1); - } else { - node.hitrates = i < nodes_hitrates_as_tensor.size() ? 
nodes_hitrates_as_tensor[i] : -1; - } */ - - node.flags = static_cast(cmodes[i]); - node.truenode_inc_or_first_weight = 0; // nodes_truenodeids[i] if not a leaf - node.falsenode_inc_or_n_weights = 0; // nodes_falsenodeids[i] if not a leaf - - if (i < static_cast(nodes_missing_value_tracks_true.size()) && nodes_missing_value_tracks_true[i] == 1) { - node.flags |= static_cast(MissingTrack::kTrue); - } - auto p = idi.insert(std::pair(node_tree_id, i)); + TreeNodeElementId node_tree_id{static_cast(nodes_treeids[i]), static_cast(nodes_nodeids[i])}; + auto p = node_tree_ids_map.insert(std::pair(node_tree_id, i)); if (!p.second) { ORT_THROW("Node ", node_tree_id.node_id, " in tree ", node_tree_id.tree_id, " is already there."); } - nodes_.emplace_back(node); node_tree_ids.emplace_back(node_tree_id); } - InlinedVector truenode_ids, falsenode_ids; - truenode_ids.reserve(limit); - falsenode_ids.reserve(limit); TreeNodeElementId coor; - i = 0; - for (auto it = nodes_.begin(); it != nodes_.end(); ++it, ++i) { - if (!it->is_not_leaf()) { + for (i = 0; i < limit; ++i) { + if (cmodes[i] == NODE_MODE::LEAF) { truenode_ids.push_back(0); falsenode_ids.push_back(0); - continue; - } - - TreeNodeElementId& node_tree_id = node_tree_ids[i]; - coor.tree_id = node_tree_id.tree_id; - coor.node_id = static_cast(nodes_truenodeids[i]); - ORT_ENFORCE((coor.node_id >= 0 && coor.node_id < n_nodes_)); + } else { + TreeNodeElementId& node_tree_id = node_tree_ids[i]; + coor.tree_id = node_tree_id.tree_id; + coor.node_id = static_cast(nodes_truenodeids[i]); + ORT_ENFORCE((coor.node_id >= 0 && coor.node_id < n_nodes_)); + + auto found = node_tree_ids_map.find(coor); + if (found == node_tree_ids_map.end()) { + ORT_THROW("Unable to find node ", coor.tree_id, "-", coor.node_id, " (truenode)."); + } + if (found->second == truenode_ids.size()) { + ORT_THROW("A node cannot point to itself: ", coor.tree_id, "-", node_tree_id.node_id, " (truenode)."); + } + truenode_ids.emplace_back(found->second); - auto found = idi.find(coor); - if (found == idi.end()) { - ORT_THROW("Unable to find node ", coor.tree_id, "-", coor.node_id, " (truenode)."); - } - if (found->second == truenode_ids.size()) { - ORT_THROW("A node cannot point to itself: ", coor.tree_id, "-", node_tree_id.node_id, " (truenode)."); + coor.node_id = static_cast(nodes_falsenodeids[i]); + ORT_ENFORCE((coor.node_id >= 0 && coor.node_id < n_nodes_)); + found = node_tree_ids_map.find(coor); + if (found == node_tree_ids_map.end()) { + ORT_THROW("Unable to find node ", coor.tree_id, "-", coor.node_id, " (falsenode)."); + } + if (found->second == falsenode_ids.size()) { + ORT_THROW("A node cannot point to itself: ", coor.tree_id, "-", node_tree_id.node_id, " (falsenode)."); + } + falsenode_ids.emplace_back(found->second); + // We could also check that truenode_ids[truenode_ids.size() - 1] != falsenode_ids[falsenode_ids.size() - 1]). + // It is valid but no training algorithm would produce a tree where left and right nodes are the same. } - truenode_ids.emplace_back(found->second); + } - coor.node_id = static_cast(nodes_falsenodeids[i]); - ORT_ENFORCE((coor.node_id >= 0 && coor.node_id < n_nodes_)); - found = idi.find(coor); - if (found == idi.end()) { - ORT_THROW("Unable to find node ", coor.tree_id, "-", coor.node_id, " (falsenode)."); - } - if (found->second == falsenode_ids.size()) { - ORT_THROW("A node cannot point to itself: ", coor.tree_id, "-", node_tree_id.node_id, " (falsenode)."); + // Let's construct nodes_ such that the false branch is always the next element in nodes_. 
+ // updated_mapping will translates the old position of each node to the new node position in nodes_. + std::vector updated_mapping(nodes_treeids.size(), 0); + int64_t previous_tree_id = -1; + for (i = 0; i < n_nodes_; ++i) { + if (previous_tree_id == -1 || (previous_tree_id != node_tree_ids[i].tree_id)) { + // New tree. + int64_t tree_id = node_tree_ids[i].tree_id; + size_t root_position = + AddNodes(i, cmodes, truenode_ids, falsenode_ids, nodes_featureids, nodes_values_as_tensor, nodes_values, + nodes_missing_value_tracks_true, updated_mapping, tree_id, node_tree_ids); + roots_.push_back(&nodes_[root_position]); + previous_tree_id = tree_id; } - falsenode_ids.emplace_back(found->second); - // We could also check that truenode_ids[truenode_ids.size() - 1] != falsenode_ids[falsenode_ids.size() - 1]). - // It is valid but no training algorithm would produce a tree where left and right nodes are the same. } - // sort targets + n_trees_ = roots_.size(); + if (((int64_t)nodes_.size()) != n_nodes_) { + ORT_THROW("Number of nodes in nodes_ (", nodes_.size(), ") is different from n_nodes (", n_nodes_, ")."); + } + + // Sort targets InlinedVector> indices; indices.reserve(target_class_nodeids.size()); for (i = 0, limit = target_class_nodeids.size(); i < limit; i++) { - indices.emplace_back(std::pair( - TreeNodeElementId{target_class_treeids[i], target_class_nodeids[i]}, - i)); + indices.emplace_back( + std::pair(TreeNodeElementId{target_class_treeids[i], target_class_nodeids[i]}, i)); } + std::sort(indices.begin(), indices.end()); - // Initialize the leaves. TreeNodeElementId ind; SparseValue w; size_t indi; for (indi = 0, limit = target_class_nodeids.size(); indi < limit; ++indi) { ind = indices[indi].first; i = indices[indi].second; - auto found = idi.find(ind); - if (found == idi.end()) { + auto found = node_tree_ids_map.find(ind); + if (found == node_tree_ids_map.end()) { ORT_THROW("Unable to find node ", ind.tree_id, "-", ind.node_id, " (weights)."); } - TreeNodeElement& leaf = nodes_[found->second]; + TreeNodeElement& leaf = nodes_[updated_mapping[found->second]]; if (leaf.is_not_leaf()) { // An exception should be raised in that case. But this case may happen in // models converted with an old version of onnxmltools. These weights are ignored. // ORT_THROW("Node ", ind.tree_id, "-", ind.node_id, " is not a leaf."); continue; } - w.i = target_class_ids[i]; - w.value = target_class_weights_as_tensor.empty() - ? static_cast(target_class_weights[i]) - : target_class_weights_as_tensor[i]; - if (leaf.falsenode_inc_or_n_weights == 0) { - leaf.truenode_inc_or_first_weight = static_cast(weights_.size()); + w.value = target_class_weights_as_tensor.empty() ? static_cast(target_class_weights[i]) + : target_class_weights_as_tensor[i]; + if (leaf.truenode_or_weight.weight_data.n_weights == 0) { + leaf.truenode_or_weight.weight_data.weight = static_cast(weights_.size()); leaf.value_or_unique_weight = w.value; } - ++leaf.falsenode_inc_or_n_weights; + ++leaf.truenode_or_weight.weight_data.n_weights; weights_.push_back(w); } - // Initialize all the nodes but the leaves. 
- int64_t previous = -1; - for (i = 0, limit = static_cast(n_nodes_); i < limit; ++i) { - if ((previous == -1) || (previous != node_tree_ids[i].tree_id)) - roots_.push_back(&(nodes_[idi[node_tree_ids[i]]])); - previous = node_tree_ids[i].tree_id; - if (!nodes_[i].is_not_leaf()) { - if (nodes_[i].falsenode_inc_or_n_weights == 0) { - ORT_THROW("Target is missing for leaf ", ind.tree_id, "-", ind.node_id, "."); - } - continue; - } - ORT_ENFORCE(truenode_ids[i] != i); // That would mean the left node is itself, leading to an infinite loop. - nodes_[i].truenode_inc_or_first_weight = static_cast(truenode_ids[i] - i); - ORT_ENFORCE(falsenode_ids[i] != i); // That would mean the right node is itself, leading to an infinite loop. - nodes_[i].falsenode_inc_or_n_weights = static_cast(falsenode_ids[i] - i); - } - - n_trees_ = roots_.size(); has_missing_tracks_ = false; - for (auto itm = nodes_missing_value_tracks_true.begin(); - itm != nodes_missing_value_tracks_true.end(); ++itm) { + for (auto itm = nodes_missing_value_tracks_true.begin(); itm != nodes_missing_value_tracks_true.end(); ++itm) { if (*itm) { has_missing_tracks_ = true; break; } } + return Status::OK(); } +template +size_t TreeEnsembleCommon::AddNodes( + const size_t i, const InlinedVector& cmodes, const InlinedVector& truenode_ids, + const InlinedVector& falsenode_ids, const std::vector& nodes_featureids, + const std::vector& nodes_values_as_tensor, const std::vector& node_values, + const std::vector& nodes_missing_value_tracks_true, std::vector& updated_mapping, int64_t tree_id, + const InlinedVector& node_tree_ids) { + // Validate this index maps to the same tree_id as the one we should be building. + if (node_tree_ids[i].tree_id != tree_id) { + ORT_THROW("Tree id mismatch. Expected ", tree_id, " but got ", node_tree_ids[i].tree_id, " at position ", i); + } + + if (updated_mapping[i] != 0) { + // In theory we should not accept any cycles, however in practice LGBM conversion implements set membership via a + // series of "Equals" nodes, with the true branches directed at the same child node (a cycle). + // We may instead seek to formalize set membership in the future. + return updated_mapping[i]; + } + + size_t node_pos = nodes_.size(); + updated_mapping[i] = node_pos; + + TreeNodeElement node; + node.flags = static_cast(cmodes[i]); + node.feature_id = static_cast(nodes_featureids[i]); + if (node.feature_id > max_feature_id_) { + max_feature_id_ = node.feature_id; + } + node.value_or_unique_weight = + nodes_values_as_tensor.empty() ? 
static_cast(node_values[i]) : nodes_values_as_tensor[i]; + if (i < static_cast(nodes_missing_value_tracks_true.size()) && nodes_missing_value_tracks_true[i] == 1) { + node.flags |= static_cast(MissingTrack::kTrue); + } + nodes_.push_back(std::move(node)); + if (nodes_[node_pos].is_not_leaf()) { + size_t false_branch = + AddNodes(falsenode_ids[i], cmodes, truenode_ids, falsenode_ids, nodes_featureids, nodes_values_as_tensor, + node_values, nodes_missing_value_tracks_true, updated_mapping, tree_id, node_tree_ids); + if (false_branch != node_pos + 1) { + ORT_THROW("False node must always be the next node, but it isn't at index ", node_pos, " with flags ", + static_cast(nodes_[node_pos].flags)); + } + size_t true_branch = + AddNodes(truenode_ids[i], cmodes, truenode_ids, falsenode_ids, nodes_featureids, nodes_values_as_tensor, + node_values, nodes_missing_value_tracks_true, updated_mapping, tree_id, node_tree_ids); + // We don't need to store the false branch pointer since we know it is always in the immediate next entry in nodes_. + // nodes_[node_pos].falsenode_inc_or_n_weights.ptr = &nodes_[false_branch]; + nodes_[node_pos].truenode_or_weight.ptr = &nodes_[true_branch]; + } else { + nodes_[node_pos].truenode_or_weight.weight_data.weight = 0; + nodes_[node_pos].truenode_or_weight.weight_data.n_weights = 0; + } + return node_pos; +} + template Status TreeEnsembleCommon::compute(OpKernelContext* ctx, const Tensor* X, @@ -637,22 +669,19 @@ void TreeEnsembleCommon::ComputeAgg(concur } } // namespace detail -#define TREE_FIND_VALUE(CMP) \ - if (has_missing_tracks_) { \ - while (root->is_not_leaf()) { \ - val = x_data[root->feature_id]; \ - root += (val CMP root->value_or_unique_weight || \ - (root->is_missing_track_true() && _isnan_(val))) \ - ? root->truenode_inc_or_first_weight \ - : root->falsenode_inc_or_n_weights; \ - } \ - } else { \ - while (root->is_not_leaf()) { \ - val = x_data[root->feature_id]; \ - root += val CMP root->value_or_unique_weight \ - ? root->truenode_inc_or_first_weight \ - : root->falsenode_inc_or_n_weights; \ - } \ +#define TREE_FIND_VALUE(CMP) \ + if (has_missing_tracks_) { \ + while (root->is_not_leaf()) { \ + val = x_data[root->feature_id]; \ + root = (val CMP root->value_or_unique_weight || (root->is_missing_track_true() && _isnan_(val))) \ + ? root->truenode_or_weight.ptr \ + : root + 1; \ + } \ + } else { \ + while (root->is_not_leaf()) { \ + val = x_data[root->feature_id]; \ + root = val CMP root->value_or_unique_weight ? root->truenode_or_weight.ptr : root + 1; \ + } \ } inline bool _isnan_(float x) { return std::isnan(x); } @@ -671,15 +700,14 @@ TreeEnsembleCommon::ProcessTreeNodeLeave( if (has_missing_tracks_) { while (root->is_not_leaf()) { val = x_data[root->feature_id]; - root += (val <= root->value_or_unique_weight || - (root->is_missing_track_true() && _isnan_(val))) - ? root->truenode_inc_or_first_weight - : root->falsenode_inc_or_n_weights; + root = (val <= root->value_or_unique_weight || (root->is_missing_track_true() && _isnan_(val))) + ? root->truenode_or_weight.ptr + : root + 1; } } else { while (root->is_not_leaf()) { val = x_data[root->feature_id]; - root += val <= root->value_or_unique_weight ? root->truenode_inc_or_first_weight : root->falsenode_inc_or_n_weights; + root = val <= root->value_or_unique_weight ? root->truenode_or_weight.ptr : root + 1; } } break; @@ -703,42 +731,36 @@ TreeEnsembleCommon::ProcessTreeNodeLeave( } } else { // Different rules to compare to node thresholds. 
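// With the restructured layout, an interior node stores only a pointer to its true child
// (truenode_or_weight.ptr); its false child is, by construction, the next element of nodes_.
// A condensed sketch of the traversal this enables (illustrative only; take_true_branch stands
// in for the per-mode comparisons and missing-value handling in the switch below):
//
//   while (root->is_not_leaf()) {
//     val  = x_data[root->feature_id];
//     root = take_true_branch(val, root->value_or_unique_weight)
//                ? root->truenode_or_weight.ptr   // follow the stored true-branch pointer
//                : root + 1;                      // false branch is always the adjacent node
//   }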
ThresholdType threshold; - while (root->is_not_leaf()) { + while (1) { val = x_data[root->feature_id]; threshold = root->value_or_unique_weight; switch (root->mode()) { case NODE_MODE::BRANCH_LEQ: - root += val <= threshold || (root->is_missing_track_true() && _isnan_(val)) - ? root->truenode_inc_or_first_weight - : root->falsenode_inc_or_n_weights; + root = val <= threshold || (root->is_missing_track_true() && _isnan_(val)) ? root->truenode_or_weight.ptr + : root + 1; break; case NODE_MODE::BRANCH_LT: - root += val < threshold || (root->is_missing_track_true() && _isnan_(val)) - ? root->truenode_inc_or_first_weight - : root->falsenode_inc_or_n_weights; + root = val < threshold || (root->is_missing_track_true() && _isnan_(val)) ? root->truenode_or_weight.ptr + : root + 1; break; case NODE_MODE::BRANCH_GTE: - root += val >= threshold || (root->is_missing_track_true() && _isnan_(val)) - ? root->truenode_inc_or_first_weight - : root->falsenode_inc_or_n_weights; + root = val >= threshold || (root->is_missing_track_true() && _isnan_(val)) ? root->truenode_or_weight.ptr + : root + 1; break; case NODE_MODE::BRANCH_GT: - root += val > threshold || (root->is_missing_track_true() && _isnan_(val)) - ? root->truenode_inc_or_first_weight - : root->falsenode_inc_or_n_weights; + root = val > threshold || (root->is_missing_track_true() && _isnan_(val)) ? root->truenode_or_weight.ptr + : root + 1; break; case NODE_MODE::BRANCH_EQ: - root += val == threshold || (root->is_missing_track_true() && _isnan_(val)) - ? root->truenode_inc_or_first_weight - : root->falsenode_inc_or_n_weights; + root = val == threshold || (root->is_missing_track_true() && _isnan_(val)) ? root->truenode_or_weight.ptr + : root + 1; break; case NODE_MODE::BRANCH_NEQ: - root += val != threshold || (root->is_missing_track_true() && _isnan_(val)) - ? root->truenode_inc_or_first_weight - : root->falsenode_inc_or_n_weights; + root = val != threshold || (root->is_missing_track_true() && _isnan_(val)) ? root->truenode_or_weight.ptr + : root + 1; break; case NODE_MODE::LEAF: - break; + return root; } } } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp index 5dbea41901b80..c24257071eda5 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp @@ -212,15 +212,6 @@ namespace Dml ORT_THROW_HR(E_INVALIDARG); } const auto* allocInfo = static_cast(opaqueHandle); - - auto owner = allocInfo->GetOwner(); - //The owner can be null if the resource was wrapped via CreateGPUAllocationFromD3DResource - if (owner != nullptr && owner != this) - { - // This allocation doesn't belong to this allocator! 
- ORT_THROW_HR(E_INVALIDARG); - } - return allocInfo; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h index 4c24cb174f6ed..196fba5d7689d 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h @@ -83,16 +83,16 @@ namespace Dml std::vector m_pool; size_t m_currentAllocationId = 0; uint64_t m_currentResourceId = 0; - - // Unless specifically requested, allocation sizes are not rounded to enable pooling - // until SetDefaultRoundingMode is called. This should be done at completion of session + + // Unless specifically requested, allocation sizes are not rounded to enable pooling + // until SetDefaultRoundingMode is called. This should be done at completion of session // initialization. AllocatorRoundingMode m_defaultRoundingMode = AllocatorRoundingMode::Disabled; std::shared_ptr m_context; std::unique_ptr m_subAllocator; - #if _DEBUG + #ifndef NDEBUG // Useful for debugging; keeps track of all allocations that haven't been freed yet std::map m_outstandingAllocationsById; #endif diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp index a9d19a022d3e7..4813707cdf50c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp @@ -38,6 +38,16 @@ namespace Dml bool& modified, int graph_level, const onnxruntime::logging::Logger& logger) const + { + return ApplyImplHelper(graph, modified, graph_level, logger, {}); + } + + onnxruntime::common::Status DmlGraphFusionTransformer::ApplyImplHelper( + onnxruntime::Graph& graph, + bool& modified, + int graph_level, + const onnxruntime::logging::Logger& logger, + const std::unordered_map& implicitInputDefs) const { onnxruntime::ProviderType provider_type = onnxruntime::kDmlExecutionProvider; const gsl::not_null registry = m_providerImpl->GetKernelRegistry().get(); @@ -49,6 +59,30 @@ namespace Dml std::vector> compiledPartitionInfos; std::vector additionalSplittingNodes; + onnxruntime::GraphViewer graph_viewer(graph); + const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder(); + + for (auto node_index : node_topology_list) + { + auto* node = graph.GetNode(node_index); + if (!node) + { + continue; // node was removed + } + + std::unordered_map subgraphImplicitInputDefs; + for (const onnxruntime::NodeArg* inputDef : node->ImplicitInputDefs()) + { + subgraphImplicitInputDefs[inputDef->Name()] = inputDef; + } + + for (auto& entry : node->GetAttributeNameToMutableSubgraphMap()) + { + auto& subgraph = *entry.second; + ORT_RETURN_IF_ERROR(ApplyImplHelper(subgraph, modified, graph_level + 1, logger, subgraphImplicitInputDefs)); + } + } + do { // Initializers needed by any graph partition @@ -62,7 +96,8 @@ namespace Dml m_providerImpl->GetSupportedDeviceDataTypeMask(), graphNodePropertyMap, requiredInitializerMap, - additionalSplittingNodes); + additionalSplittingNodes, + implicitInputDefs); // Reset the splitting nodes for the current iteration additionalSplittingNodes.clear(); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.h 
b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.h index b546f29f59719..19dab0c89943c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.h @@ -2,32 +2,41 @@ // Licensed under the MIT License. #pragma once - +#include +#include #include "core/optimizer/graph_transformer.h" #include "core/framework/execution_providers.h" namespace Dml { - class ExecutionProviderImpl; +class ExecutionProviderImpl; + +class DmlGraphFusionTransformer : public onnxruntime::GraphTransformer +{ +public: + DmlGraphFusionTransformer( + const std::string& name, + const onnxruntime::IExecutionProvider* provider + ); + +public: + static inline const char* const DML_GRAPH_FUSION_NODE_NAME_PREFIX = "DmlFusedNode_"; + static inline const char* const DML_GRAPH_FUSION_NODE_DOMAIN = "DmlFusedNodeDomain"; - class DmlGraphFusionTransformer : public onnxruntime::GraphTransformer - { - public: - DmlGraphFusionTransformer( - const std::string& name, - const onnxruntime::IExecutionProvider* provider - ); +private: + onnxruntime::common::Status ApplyImpl(onnxruntime::Graph& graph, + bool& modified, + int graph_level, + const onnxruntime::logging::Logger& logger) const final; - public: - inline const static char* const DML_GRAPH_FUSION_NODE_NAME_PREFIX = "DmlFusedNode_"; - inline const static char* const DML_GRAPH_FUSION_NODE_DOMAIN = "DmlFusedNodeDomain"; + onnxruntime::common::Status ApplyImplHelper( + onnxruntime::Graph& graph, + bool& modified, + int graph_level, + const onnxruntime::logging::Logger& logger, + const std::unordered_map& implicitInputDefs) const; - private: - onnxruntime::common::Status ApplyImpl(onnxruntime::Graph& graph, - bool& modified, - int graph_level, - const onnxruntime::logging::Logger& logger) const final; - private: - const ExecutionProviderImpl* m_providerImpl = nullptr; - }; +private: + const ExecutionProviderImpl* m_providerImpl = nullptr; +}; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.cpp index 2c8d4e4459f7f..18943878ccedc 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.cpp @@ -345,13 +345,8 @@ namespace Dml // Whether any operator in the model contains a subgraph. This is true // if the graph being partitioned is itself within a subgraph, or contains // an operator with a subgraph. - bool ModelUsesSubgraph(const onnxruntime::GraphViewer& graph) + bool ContainsSubgraph(const onnxruntime::GraphViewer& graph) { - if (graph.IsSubgraph()) - { - return true; - } - const std::vector& toplogicalOrder = graph.GetNodesInTopologicalOrder(); for (size_t nodeIndex : toplogicalOrder) @@ -384,7 +379,8 @@ namespace Dml uint32_t supportedDeviceDataTypeMask, // Each bit corresponds to each DML_TENSOR_DATA_TYPE. std::unordered_map& graphNodePropertyMap, std::unordered_set& requiredInitializerMap, - gsl::span additionalSplittingNodes) + gsl::span additionalSplittingNodes, + const std::unordered_map& implicitInputs) { // Nodes are uniquely identified by the name of their first output argument std::vector> partitions; @@ -419,7 +415,7 @@ namespace Dml } // Check whether this graph is a subgraph, or contains any node with a subgraph. 
- bool modelUsesSubgraph = ModelUsesSubgraph(graph); + bool containsSubgraph = ContainsSubgraph(graph); uint32_t splittingNodeIndex = 0; @@ -454,10 +450,10 @@ namespace Dml // Add a unique partition if graph node usage is not supported. // // Partitioning is disabled in models with subgraphs to work around issues with implicit inputs. - // The partitioning algorithm does not currently consider such inputs. Transfering shared initializers + // The partitioning algorithm does not currently consider such inputs. Transferring shared initializers // for partitions could also cause problems. Note, operators with subgraphs are currently not efficient // anyhow due to CPU/GPU copies. - if (modelUsesSubgraph || !isDmlGraphNode) + if (containsSubgraph || !isDmlGraphNode) { partitions.push_back(CreatePartitionAndFinalizeInputs(node, isDmlNode, false, nodeNameToPartitionMap)); continue; @@ -505,7 +501,7 @@ namespace Dml firstNonFinalInputPartition->AddInput(arg->Name()); } - if (graphInputs.find(arg->Name()) != graphInputs.end()) + if (graphInputs.find(arg->Name()) != graphInputs.end() || implicitInputs.find(arg->Name()) != implicitInputs.end()) { firstNonFinalInputPartition->AddInput(arg->Name()); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.h index 990ba00fc4672..37d577f647fb5 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.h @@ -3,6 +3,8 @@ #pragma once +#include +#include #include "core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.h" namespace Dml @@ -48,5 +50,6 @@ namespace Dml uint32_t supportedDeviceDataTypeMask, // Each bit corresponds to each DML_TENSOR_DATA_TYPE. 
std::unordered_map& graphNodePropertyMap, std::unordered_set& requiredInitializerMap, - gsl::span additionalSplittingNodes); + gsl::span additionalSplittingNodes, + const std::unordered_map& implicitInputs); } // namespace Dml diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h index e9c63cc72a837..f94270cfadb8b 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h @@ -5,6 +5,7 @@ #include "core/providers/dml/DmlExecutionProvider/inc/MLOperatorAuthor.h" #include "MLOperatorAuthorPrivate.h" +#include "core/common/gsl.h" #ifdef ORT_NO_EXCEPTIONS #define ML_CHECK_BOOL(x) ORT_THROW_HR_IF(E_INVALIDARG, !(x)) diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc index a46f820c6207f..fde61e73c2124 100644 --- a/onnxruntime/core/providers/dml/dml_provider_factory.cc +++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc @@ -128,21 +128,13 @@ Microsoft::WRL::ComPtr DMLProviderFactoryCreator::CreateD3D12Devic return d3d12_device; } -std::shared_ptr DMLProviderFactoryCreator::Create(int device_id, bool skip_software_device_check) { - ComPtr d3d12_device = CreateD3D12Device(device_id, skip_software_device_check); - - D3D12_COMMAND_QUEUE_DESC cmd_queue_desc = {}; - cmd_queue_desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT; - cmd_queue_desc.Flags = D3D12_COMMAND_QUEUE_FLAG_DISABLE_GPU_TIMEOUT; - - ComPtr cmd_queue; - ORT_THROW_IF_FAILED(d3d12_device->CreateCommandQueue(&cmd_queue_desc, IID_GRAPHICS_PPV_ARGS(cmd_queue.ReleaseAndGetAddressOf()))); - +Microsoft::WRL::ComPtr DMLProviderFactoryCreator::CreateDMLDevice(ID3D12Device* d3d12_device) +{ DML_CREATE_DEVICE_FLAGS flags = DML_CREATE_DEVICE_FLAG_NONE; // In debug builds, enable the DML debug layer if the D3D12 debug layer is also enabled #if _DEBUG && !_GAMING_XBOX - ComPtr debug_device; + Microsoft::WRL::ComPtr debug_device; (void)d3d12_device->QueryInterface(IID_PPV_ARGS(&debug_device)); // ignore failure const bool is_d3d12_debug_layer_enabled = (debug_device != nullptr); @@ -151,12 +143,27 @@ std::shared_ptr DMLProviderFactoryCreator::Create(int } #endif - ComPtr dml_device; - ORT_THROW_IF_FAILED(DMLCreateDevice1(d3d12_device.Get(), - flags, - DML_FEATURE_LEVEL_5_0, - IID_PPV_ARGS(&dml_device))); + Microsoft::WRL::ComPtr dml_device; + ORT_THROW_IF_FAILED(DMLCreateDevice1( + d3d12_device, + flags, + DML_FEATURE_LEVEL_5_0, + IID_PPV_ARGS(&dml_device))); + + return dml_device; +} + +std::shared_ptr DMLProviderFactoryCreator::Create(int device_id, bool skip_software_device_check) { + ComPtr d3d12_device = CreateD3D12Device(device_id, skip_software_device_check); + + D3D12_COMMAND_QUEUE_DESC cmd_queue_desc = {}; + cmd_queue_desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT; + cmd_queue_desc.Flags = D3D12_COMMAND_QUEUE_FLAG_DISABLE_GPU_TIMEOUT; + + ComPtr cmd_queue; + ORT_THROW_IF_FAILED(d3d12_device->CreateCommandQueue(&cmd_queue_desc, IID_GRAPHICS_PPV_ARGS(cmd_queue.ReleaseAndGetAddressOf()))); + auto dml_device = CreateDMLDevice(d3d12_device.Get()); return CreateExecutionProviderFactory_DML(dml_device.Get(), cmd_queue.Get()); } diff --git a/onnxruntime/core/providers/dml/dml_provider_factory_creator.h b/onnxruntime/core/providers/dml/dml_provider_factory_creator.h index b1c9bb3f6f679..574f4410fe3e3 100644 --- 
a/onnxruntime/core/providers/dml/dml_provider_factory_creator.h +++ b/onnxruntime/core/providers/dml/dml_provider_factory_creator.h @@ -16,5 +16,6 @@ struct DMLProviderFactoryCreator { static std::shared_ptr Create(int device_id); static std::shared_ptr Create(int device_id, bool skip_software_device_check); static Microsoft::WRL::ComPtr CreateD3D12Device(int device_id, bool skip_software_device_check); + static Microsoft::WRL::ComPtr CreateDMLDevice(ID3D12Device* d3d12_device); }; } // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/js_data_types.cc b/onnxruntime/core/providers/js/js_data_types.cc index 69d5bd4f9de8f..341d2cc19506f 100644 --- a/onnxruntime/core/providers/js/js_data_types.cc +++ b/onnxruntime/core/providers/js/js_data_types.cc @@ -9,12 +9,24 @@ namespace js { using SupportedTypes = TypeList< float, + MLFloat16, int32_t, uint32_t>; +using SupportedFloats = + TypeList< + float, + MLFloat16>; + const std::vector& JsepSupportedDataTypes() { static const std::vector supportedDataTypes = BuildKernelDefConstraintsFromTypeList(); return supportedDataTypes; } + +const std::vector& JsepSupportedFloatTypes() { + static const std::vector supportedDataTypes = BuildKernelDefConstraintsFromTypeList(); + return supportedDataTypes; +} + } // namespace js } // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/js/js_data_types.h b/onnxruntime/core/providers/js/js_data_types.h index d6b6ac00401b3..968e79124645f 100644 --- a/onnxruntime/core/providers/js/js_data_types.h +++ b/onnxruntime/core/providers/js/js_data_types.h @@ -6,5 +6,6 @@ namespace onnxruntime { namespace js { std::vector& JsepSupportedDataTypes(); -} +std::vector& JsepSupportedFloatTypes(); +} // namespace js } // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index 829f3e5f4f143..6b548921cdc8c 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -205,9 +205,13 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 19, Equ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 8, Greater); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 9, 12, Greater); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Greater); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, 15, GreaterOrEqual); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 16, GreaterOrEqual); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 8, Less); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 9, 12, Less); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Less); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, 15, LessOrEqual); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 16, LessOrEqual); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 12, Shape); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 14, Shape); @@ -240,10 +244,10 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnn class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 
float, Conv); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ConvTranspose); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, float, ConvTranspose); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 8, float, Gemm); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 9, 10, float, Gemm); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, Gemm); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, float, Gemm); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 8, Gemm); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 9, 10, Gemm); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, Gemm); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Gemm); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 12, MatMul); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, MatMul); @@ -265,9 +269,9 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnn class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ArgMin); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, float, ArgMin); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, Softmax); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, Softmax); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, float, Softmax); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, Softmax); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, Softmax); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Softmax); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 3, Concat); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 4, 10, Concat); @@ -315,6 +319,8 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 6, float, InstanceNormalization); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 6, float, InstanceNormalization); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, float, Einsum); + std::unique_ptr RegisterKernels() { auto kernel_registry = std::make_unique(); @@ -403,9 +409,13 @@ std::unique_ptr RegisterKernels() { KERNEL_CREATE_INFO_VERSIONED(7, 8, Greater), KERNEL_CREATE_INFO_VERSIONED(9, 12, Greater), KERNEL_CREATE_INFO(13, Greater), + KERNEL_CREATE_INFO_VERSIONED(12, 15, GreaterOrEqual), + KERNEL_CREATE_INFO(16, GreaterOrEqual), KERNEL_CREATE_INFO_VERSIONED(7, 8, Less), KERNEL_CREATE_INFO_VERSIONED(9, 12, Less), KERNEL_CREATE_INFO(13, Less), + KERNEL_CREATE_INFO_VERSIONED(12, 15, LessOrEqual), + KERNEL_CREATE_INFO(16, LessOrEqual), BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -488,10 +498,10 @@ std::unique_ptr RegisterKernels() { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - 
BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -514,9 +524,9 @@ std::unique_ptr RegisterKernels() { BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -565,6 +575,8 @@ std::unique_ptr RegisterKernels() { BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + }; for (auto& function_table_entry : function_table) { diff --git a/onnxruntime/core/providers/js/js_export.cc b/onnxruntime/core/providers/js/js_export.cc index 2c99e246b69d0..2402bb33ce9d0 100644 --- a/onnxruntime/core/providers/js/js_export.cc +++ b/onnxruntime/core/providers/js/js_export.cc @@ -9,9 +9,7 @@ const void* JsepOutput(void* context, int index, const void* data) { const uint32_t* data_offset = reinterpret_cast(data); uint32_t dim = *data_offset++; size_t dim_size = static_cast(dim); - std::vector dims; - dims.reserve(dim_size); - dims.resize(dim_size); + std::vector dims(dim_size); for (size_t i = 0; i < dim_size; i++) { dims[i] = static_cast(*data_offset++); } diff --git a/onnxruntime/core/providers/js/js_kernel.h b/onnxruntime/core/providers/js/js_kernel.h index 3accd80875d1b..177c0a9e691ed 100644 --- a/onnxruntime/core/providers/js/js_kernel.h +++ b/onnxruntime/core/providers/js/js_kernel.h @@ -11,6 +11,7 @@ #include "core/framework/op_kernel.h" #include "core/providers/js/js_execution_provider.h" +#include "core/providers/js/js_data_types.h" struct pthreadpool; diff --git a/onnxruntime/core/providers/js/operators/binary.cc b/onnxruntime/core/providers/js/operators/binary.cc index 2a96619c2c659..98f7ca6e613b0 100644 --- a/onnxruntime/core/providers/js/operators/binary.cc +++ b/onnxruntime/core/providers/js/operators/binary.cc @@ -63,10 +63,18 @@ REG_ELEMENTWISE_VERSIONED_KERNEL(Greater, 7, 8, Greater); REG_ELEMENTWISE_VERSIONED_KERNEL(Greater, 9, 12, Greater); REG_ELEMENTWISE_KERNEL(Greater, 13, Greater); +JSEP_KERNEL_IMPL(GreaterOrEqual, GreaterOrEqual) +REG_ELEMENTWISE_VERSIONED_KERNEL(GreaterOrEqual, 12, 15, GreaterOrEqual); +REG_ELEMENTWISE_KERNEL(GreaterOrEqual, 16, GreaterOrEqual); + JSEP_KERNEL_IMPL(Less, Less) REG_ELEMENTWISE_VERSIONED_KERNEL(Less, 7, 8, Less); REG_ELEMENTWISE_VERSIONED_KERNEL(Less, 9, 12, Less); REG_ELEMENTWISE_KERNEL(Less, 13, Less); +JSEP_KERNEL_IMPL(LessOrEqual, LessOrEqual) +REG_ELEMENTWISE_VERSIONED_KERNEL(LessOrEqual, 12, 15, LessOrEqual); +REG_ELEMENTWISE_KERNEL(LessOrEqual, 16, LessOrEqual); + } // namespace js } // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/operators/concat.cc b/onnxruntime/core/providers/js/operators/concat.cc index 7d50d78c82851..3a6a7e1cafd7a 100644 --- a/onnxruntime/core/providers/js/operators/concat.cc +++ b/onnxruntime/core/providers/js/operators/concat.cc @@ -12,7 +12,8 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX( 1, 3, kJsExecutionProvider, (*KernelDefBuilder::Create()) - .TypeConstraint("T", DataTypeImpl::GetTensorType()), + .TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), Concat); ONNX_OPERATOR_VERSIONED_KERNEL_EX( @@ -21,7 +22,8 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX( 4, 10, kJsExecutionProvider, (*KernelDefBuilder::Create()) - .TypeConstraint("T", DataTypeImpl::GetTensorType()), + .TypeConstraint("T", 
{DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), Concat); ONNX_OPERATOR_VERSIONED_KERNEL_EX( @@ -30,7 +32,8 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX( 11, 12, kJsExecutionProvider, (*KernelDefBuilder::Create()) - .TypeConstraint("T", DataTypeImpl::GetTensorType()), + .TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), Concat); ONNX_OPERATOR_KERNEL_EX( @@ -39,7 +42,8 @@ ONNX_OPERATOR_KERNEL_EX( 13, kJsExecutionProvider, (*KernelDefBuilder::Create()) - .TypeConstraint("T", DataTypeImpl::GetTensorType()), + .TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), Concat); } // namespace js diff --git a/onnxruntime/core/providers/js/operators/einsum.cc b/onnxruntime/core/providers/js/operators/einsum.cc new file mode 100644 index 0000000000000..2fdc14fa3a831 --- /dev/null +++ b/onnxruntime/core/providers/js/operators/einsum.cc @@ -0,0 +1,21 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/js/js_kernel.h" +#include "einsum.h" + +namespace onnxruntime { +namespace js { + +ONNX_OPERATOR_TYPED_KERNEL_EX( + Einsum, + kOnnxDomain, + 12, + float, + kJsExecutionProvider, + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::GetTensorType()), + Einsum); + +} // namespace js +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/operators/einsum.h b/onnxruntime/core/providers/js/operators/einsum.h new file mode 100644 index 0000000000000..ec8b6f5dab240 --- /dev/null +++ b/onnxruntime/core/providers/js/operators/einsum.h @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/providers/js/js_kernel.h" + +namespace onnxruntime { +namespace js { + +class Einsum final : public JsKernel { + public: + Einsum(const OpKernelInfo& info) : JsKernel(info) { + std::string equation; + ORT_ENFORCE(info.GetAttr("equation", &equation).IsOK(), + "Missing 'equation' attribute"); + JSEP_INIT_KERNEL_ATTRIBUTE(Einsum, ({ + "equation" : UTF8ToString($1), + }), + equation.c_str()); + } +}; + +} // namespace js +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/operators/gemm.cc b/onnxruntime/core/providers/js/operators/gemm.cc index 04700d0f54705..de27288f2ee0e 100644 --- a/onnxruntime/core/providers/js/operators/gemm.cc +++ b/onnxruntime/core/providers/js/operators/gemm.cc @@ -8,41 +8,34 @@ namespace onnxruntime { namespace js { -#define REGISTER_KERNEL_TYPED(T) \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ - Gemm, \ - kOnnxDomain, \ - 13, \ - T, \ - kJsExecutionProvider, \ - (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - Gemm); \ - ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ - Gemm, \ - kOnnxDomain, \ - 11, 12, \ - T, \ - kJsExecutionProvider, \ - (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - Gemm); \ - ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ - Gemm, \ - kOnnxDomain, \ - 9, 10, \ - T, \ - kJsExecutionProvider, \ - (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - Gemm); \ - ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ - Gemm, \ - kOnnxDomain, \ - 7, 8, \ - T, \ - kJsExecutionProvider, \ - (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - Gemm); - -REGISTER_KERNEL_TYPED(float) +ONNX_OPERATOR_KERNEL_EX( + Gemm, + kOnnxDomain, + 13, + kJsExecutionProvider, + 
(*KernelDefBuilder::Create()).TypeConstraint("T", JsepSupportedFloatTypes()), + Gemm); +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Gemm, + kOnnxDomain, + 11, 12, + kJsExecutionProvider, + (*KernelDefBuilder::Create()).TypeConstraint("T", JsepSupportedFloatTypes()), + Gemm); +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Gemm, + kOnnxDomain, + 9, 10, + kJsExecutionProvider, + (*KernelDefBuilder::Create()).TypeConstraint("T", JsepSupportedFloatTypes()), + Gemm); +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Gemm, + kOnnxDomain, + 7, 8, + kJsExecutionProvider, + (*KernelDefBuilder::Create()).TypeConstraint("T", JsepSupportedFloatTypes()), + Gemm); } // namespace js } // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/operators/gemm.h b/onnxruntime/core/providers/js/operators/gemm.h index 27c41788ccfbd..74091526f8411 100644 --- a/onnxruntime/core/providers/js/operators/gemm.h +++ b/onnxruntime/core/providers/js/operators/gemm.h @@ -8,7 +8,6 @@ namespace onnxruntime { namespace js { -template class Gemm : public JsKernel { public: Gemm(const OpKernelInfo& info) : JsKernel(info) { diff --git a/onnxruntime/core/providers/js/operators/slice.cc b/onnxruntime/core/providers/js/operators/slice.cc index 9cc96a53083b0..bbafe40ea92ac 100644 --- a/onnxruntime/core/providers/js/operators/slice.cc +++ b/onnxruntime/core/providers/js/operators/slice.cc @@ -12,7 +12,8 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX( 1, 9, kJsExecutionProvider, (*KernelDefBuilder::Create()) - .TypeConstraint("T", DataTypeImpl::GetTensorType()), + .TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), Slice_1); ONNX_OPERATOR_VERSIONED_KERNEL_EX( @@ -25,7 +26,8 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX( .InputMemoryType(OrtMemTypeCPU, 2) .InputMemoryType(OrtMemTypeCPU, 3) .InputMemoryType(OrtMemTypeCPU, 4) - .TypeConstraint("T", DataTypeImpl::GetTensorType()), + .TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), Slice); ONNX_OPERATOR_VERSIONED_KERNEL_EX( @@ -38,7 +40,8 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX( .InputMemoryType(OrtMemTypeCPU, 2) .InputMemoryType(OrtMemTypeCPU, 3) .InputMemoryType(OrtMemTypeCPU, 4) - .TypeConstraint("T", DataTypeImpl::GetTensorType()), + .TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), Slice); ONNX_OPERATOR_KERNEL_EX( @@ -51,7 +54,8 @@ ONNX_OPERATOR_KERNEL_EX( .InputMemoryType(OrtMemTypeCPU, 2) .InputMemoryType(OrtMemTypeCPU, 3) .InputMemoryType(OrtMemTypeCPU, 4) - .TypeConstraint("T", DataTypeImpl::GetTensorType()), + .TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), Slice); } // namespace js diff --git a/onnxruntime/core/providers/js/operators/softmax.cc b/onnxruntime/core/providers/js/operators/softmax.cc index cbaecf9e4c975..292bd5006fb30 100644 --- a/onnxruntime/core/providers/js/operators/softmax.cc +++ b/onnxruntime/core/providers/js/operators/softmax.cc @@ -7,27 +7,25 @@ namespace onnxruntime { namespace js { #define REGISTER_SOFTMAX_ELEMENTWISE_VERSIONED_KERNEL(SoftmaxOp, sinceVersion, endVersion) \ - ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + ONNX_OPERATOR_VERSIONED_KERNEL_EX( \ SoftmaxOp, \ kOnnxDomain, \ sinceVersion, endVersion, \ - float, \ kJsExecutionProvider, \ (*KernelDefBuilder::Create()) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - SoftmaxOp); + .TypeConstraint("T", JsepSupportedFloatTypes()), \ + SoftmaxOp); #define REGISTER_SOFTMAX_ELEMENTWISE_KERNEL(SoftmaxOp, sinceVersion) \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ + ONNX_OPERATOR_KERNEL_EX( 
\ SoftmaxOp, \ kOnnxDomain, \ sinceVersion, \ - float, \ kJsExecutionProvider, \ (*KernelDefBuilder::Create()) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("T", JsepSupportedFloatTypes()) \ .InputMemoryType(OrtMemTypeCPU, 1), \ - SoftmaxOp); + SoftmaxOp); REGISTER_SOFTMAX_ELEMENTWISE_VERSIONED_KERNEL(Softmax, 1, 10); REGISTER_SOFTMAX_ELEMENTWISE_VERSIONED_KERNEL(Softmax, 11, 12); diff --git a/onnxruntime/core/providers/js/operators/softmax.h b/onnxruntime/core/providers/js/operators/softmax.h index 068a59e6b24e3..87259e8b6f206 100644 --- a/onnxruntime/core/providers/js/operators/softmax.h +++ b/onnxruntime/core/providers/js/operators/softmax.h @@ -8,7 +8,6 @@ namespace onnxruntime { namespace js { -template class Softmax : public JsKernel { public: Softmax(const OpKernelInfo& info) : JsKernel(info) { diff --git a/onnxruntime/core/providers/js/operators/transpose.cc b/onnxruntime/core/providers/js/operators/transpose.cc index ef1e49046ae8c..332bd35f2434c 100644 --- a/onnxruntime/core/providers/js/operators/transpose.cc +++ b/onnxruntime/core/providers/js/operators/transpose.cc @@ -12,9 +12,7 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX( 1, 12, kJsExecutionProvider, (*KernelDefBuilder::Create()) - .TypeConstraint("T", {DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType()}), + .TypeConstraint("T", JsepSupportedDataTypes()), Transpose); ONNX_OPERATOR_KERNEL_EX( @@ -23,9 +21,7 @@ ONNX_OPERATOR_KERNEL_EX( 13, kJsExecutionProvider, (*KernelDefBuilder::Create()) - .TypeConstraint("T", {DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType()}), + .TypeConstraint("T", JsepSupportedDataTypes()), Transpose); } // namespace js diff --git a/onnxruntime/core/providers/js/operators/unary.cc b/onnxruntime/core/providers/js/operators/unary.cc index cf9433767c3d7..5e972e43e4566 100644 --- a/onnxruntime/core/providers/js/operators/unary.cc +++ b/onnxruntime/core/providers/js/operators/unary.cc @@ -38,8 +38,8 @@ JSEP_ELEMENTWISE_MULTI_TYPED_VERSIONED_KERNEL(Abs, 6, 12, Abs) JSEP_ELEMENTWISE_MULTI_TYPED_KERNEL(Abs, 13, Abs) JSEP_KERNEL_IMPL(Neg, Neg) -JSEP_ELEMENTWISE_VERSIONED_KERNEL(Neg, 6, 12, float, Neg) -JSEP_ELEMENTWISE_KERNEL(Neg, 13, float, Neg) +JSEP_ELEMENTWISE_MULTI_TYPED_VERSIONED_KERNEL(Neg, 6, 12, Neg) +JSEP_ELEMENTWISE_MULTI_TYPED_KERNEL(Neg, 13, Neg) JSEP_KERNEL_IMPL(Floor, Floor) JSEP_ELEMENTWISE_VERSIONED_KERNEL(Floor, 6, 12, float, Floor) diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h index 421c55a2c91a8..766034b3decea 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h @@ -8,6 +8,7 @@ #include "core/common/inlined_containers.h" #include "core/graph/basic_types.h" #include "core/providers/nnapi/nnapi_builtin/nnapi_lib/NeuralNetworksTypes.h" +#include "core/common/gsl.h" // This is the minimal Android API Level required by ORT NNAPI EP to run // ORT running on any host system with Android API level less than this will fall back to CPU EP diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc index ca18c051a9922..8abb847b20b46 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc @@ -29,26 +29,37 
@@ class SimpleOpBuilder : public BaseOpBuilder { bool do_op_validation) const override ORT_MUST_USE_RESULT; private: - Status ExplictOpCheck(const QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const; + Status ExplicitOpCheck(const QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const; static constexpr std::array gridsample_supported_modes = {"bilinear", "nearest"}; static constexpr std::array gridsample_supported_padding_modes = {"zeros", "border", "reflection"}; }; -Status SimpleOpBuilder::ExplictOpCheck(const QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const { - // QNN Softmax only supports an axis value equal to input_rank - 1 (i.e., same as -1). - if (node_unit.OpType() == "Softmax") { - int32_t axis = node_unit.SinceVersion() < 13 ? 1 : -1; // Default axis changed from 1 to -1 in opset 13. +static int32_t GetDefaultAxisAttribute(const std::string& op_type, int opset_version) { + if (op_type == "Softmax" || op_type == "LogSoftmax") { + // Default axis changed from 1 to -1 in opset 13. + return opset_version < 13 ? 1 : -1; + } + + return 0; +} + +Status SimpleOpBuilder::ExplicitOpCheck(const QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const { + const std::string& op_type = node_unit.OpType(); + + // QNN Softmax and LogSoftmax only support an axis value equal to input_rank - 1 (i.e., same as -1). + if (op_type == "Softmax" || op_type == "LogSoftmax") { + int32_t axis = GetDefaultAxisAttribute(op_type, node_unit.SinceVersion()); Qnn_Scalar_t axis_qnn_scalar = QNN_SCALAR_INIT; ORT_RETURN_IF_ERROR(ProcessAxisAttribute(qnn_model_wrapper, node_unit, axis_qnn_scalar, axis)); std::vector input_shape; ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(node_unit.Inputs()[0].node_arg, input_shape), "QNN EP: Cannot get shape for Softmax input"); ORT_RETURN_IF(axis != static_cast(input_shape.size() - 1), - "QNN Softmax only supports an `axis` attribute equal to input_rank-1 (or -1)"); + "QNN ", op_type.c_str(), " only supports an `axis` attribute equal to input_rank-1 (or -1)"); } - if (node_unit.OpType() == "GridSample") { + if (op_type == "GridSample") { NodeAttrHelper node_helper(node_unit); std::string mode = node_helper.Get("mode", "linear"); ORT_RETURN_IF_NOT(utils::ArrayHasString(gridsample_supported_modes, mode), "GridSample does not support mode ", @@ -58,6 +69,13 @@ Status SimpleOpBuilder::ExplictOpCheck(const QnnModelWrapper& qnn_model_wrapper, padding_mode.c_str()); } + // ONNX's Min and Max operators accept a variable number of inputs (i.e., variadic). + // However, QNN's Min and Max operators must take in exactly two inputs. 
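The axis restriction checked above can be stated in a few lines: Softmax and LogSoftmax default to axis 1 before opset 13 and -1 from opset 13 on, and QNN only accepts an axis that normalizes to the last dimension. A small, self-contained illustration (simplified; the real check also reads an explicit `axis` attribute via ProcessAxisAttribute):

#include <cstddef>
#include <cstdint>

constexpr int32_t DefaultSoftmaxAxis(int opset_version) {
  return opset_version < 13 ? 1 : -1;  // ONNX changed the default axis in opset 13
}

constexpr bool AxisIsLastDim(int32_t axis, size_t input_rank) {
  const int64_t normalized = axis < 0 ? static_cast<int64_t>(input_rank) + axis : axis;
  return normalized == static_cast<int64_t>(input_rank) - 1;
}

// Opset 13's default (-1) always lands on the last dimension; opset 11's default (1) only
// does so for rank-2 inputs, which is why older models can be rejected by this check.
static_assert(AxisIsLastDim(DefaultSoftmaxAxis(13), 4), "-1 maps to rank-1");
static_assert(!AxisIsLastDim(DefaultSoftmaxAxis(11), 4), "axis 1 is not rank-1 for rank 4");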
+ if (op_type == "Min" || op_type == "Max") { + ORT_RETURN_IF_NOT(node_unit.Inputs().size() == 2, + "QNN EP only supports Min and Max operators with exactly 2 inputs."); + } + return Status::OK(); } @@ -207,7 +225,7 @@ Status SimpleOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w const std::string& op_type = node_unit.OpType(); if (do_op_validation) { - ORT_RETURN_IF_ERROR(ExplictOpCheck(qnn_model_wrapper, node_unit)); + ORT_RETURN_IF_ERROR(ExplicitOpCheck(qnn_model_wrapper, node_unit)); // Skip the op validation for DepthToSpace & SpaceToDepth if it's not NHWC data layout if (node_unit.Domain() != kMSInternalNHWCDomain && (op_type == "DepthToSpace" || op_type == "SpaceToDepth" || op_type == "GridSample")) { return Status::OK(); @@ -217,7 +235,7 @@ Status SimpleOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w std::vector param_tensor_names; // Add attribute if (op_type == "LogSoftmax" || op_type == "Softmax" || op_type == "Concat") { - int32_t default_axis = ("Softmax" == op_type) ? -1 : 0; + int32_t default_axis = GetDefaultAxisAttribute(op_type, node_unit.SinceVersion()); Qnn_Scalar_t axis_qnn_scalar = QNN_SCALAR_INIT; ORT_RETURN_IF_ERROR(ProcessAxisAttribute(qnn_model_wrapper, node_unit, axis_qnn_scalar, default_axis)); QnnParamWrapper axis_param(node_unit.Index(), node_unit.Name(), QNN_OP_SOFTMAX_PARAM_AXIS, axis_qnn_scalar); diff --git a/onnxruntime/core/providers/rocm/tunable/gemm_ck.cuh b/onnxruntime/core/providers/rocm/tunable/gemm_ck.cuh index 3e6f1612f2fd8..86d023886cfaf 100644 --- a/onnxruntime/core/providers/rocm/tunable/gemm_ck.cuh +++ b/onnxruntime/core/providers/rocm/tunable/gemm_ck.cuh @@ -13,6 +13,8 @@ #include "ck/ck.hpp" #include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp" #include "ck/library/tensor_operation_instance/gpu/gemm.hpp" +#include "ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp" +#include "ck/library/tensor_operation_instance/gpu/gemm_streamk.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_batched_gemm.hpp" #include "ck/tensor_operation/gpu/device/device_gemm.hpp" @@ -50,9 +52,8 @@ auto GetCKGemmTypeStringAndOps() { auto ck_gemm_op = [impl = std::move(impl), invoker = std::move(invoker)](const GemmParams* params) -> Status { auto one = ToHipType::FromFloat(1.0f); auto zero = ToHipType::FromFloat(0.0f); - TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( - params->alpha != one || params->beta != zero, - impl->GetTypeString(), " only supports alpha == 1 and beta == 0", params->Signature()); + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(params->alpha != one || params->beta != zero, + impl->GetTypeString(), " only supports alpha == 1 and beta == 0"); auto nop = Nop{}; auto arg = impl->MakeArgumentPointer(params->a, params->b, params->c, @@ -69,6 +70,80 @@ auto GetCKGemmTypeStringAndOps() { return ret; } +template +auto GetCKStreamKGemmTypeStringAndOps() { + using CKDataType = typename CKDataTypeAdaptor::type; + using DeviceGemm = ck::tensor_operation::device::DeviceGemmStreamK< + ALayout, BLayout, Row, + CKDataType, CKDataType, CKDataType, + Nop, Nop, Nop>; + using InstanceFactory = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory; + + std::vector>>> ret; + for (auto&& impl : InstanceFactory::GetInstances()) { + auto type_string = impl->GetTypeString(); + auto invoker = impl->MakeInvokerPointer(); + auto ck_gemm_op = [impl = std::move(impl), invoker = std::move(invoker)](const GemmParams* params) -> Status { + auto one = 
ToHipType::FromFloat(1.0f); + auto zero = ToHipType::FromFloat(0.0f); + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(params->alpha != one || params->beta != zero, + impl->GetTypeString(), " only supports alpha == 1 and beta == 0"); + + auto nop = Nop{}; + auto arg = impl->MakeArgumentPointer(params->a, params->b, params->c, + params->m, params->n, params->k, + params->lda, params->ldb, params->ldc, + nop, nop, nop); + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(!impl->IsSupportedArgument(arg.get()), + impl->GetTypeString(), " does not support ", params->Signature()); + invoker->Run(arg.get(), StreamConfig{params->StreamHandle()}); + return Status::OK(); + }; + ret.emplace_back(std::make_pair(std::move(type_string), std::move(ck_gemm_op))); + } + return ret; +} + +template +auto GetCKSplitKGemmTypeStringAndOps() { + using CKDataType = typename CKDataTypeAdaptor::type; + using DeviceGemm = ck::tensor_operation::device::DeviceGemmSplitK< + ALayout, BLayout, Row, + CKDataType, CKDataType, CKDataType, + Nop, Nop, Nop>; + using InstanceFactory = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory; + + std::vector>>> ret; + for (auto num_split : {4, 16, 64}) { + auto instances = InstanceFactory::GetInstances(); + for (auto&& impl : instances) { + auto type_string = impl->GetTypeString() + "_SplitK" + std::to_string(num_split); + auto invoker = impl->MakeInvokerPointer(); + auto ck_gemm_op = [num_split, impl = std::move(impl), invoker = std::move(invoker)](const GemmParams* params) -> Status { + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( + params->k < 128 * num_split, "k=", params->k, " is too small, it makes no sense to use this split-k gemm."); + + auto one = ToHipType::FromFloat(1.0f); + auto zero = ToHipType::FromFloat(0.0f); + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(params->alpha != one || params->beta != zero, + impl->GetTypeString(), " only supports alpha == 1 and beta == 0"); + + auto nop = Nop{}; + auto arg = impl->MakeArgumentPointer(params->a, params->b, params->c, + params->m, params->n, params->k, + params->lda, params->ldb, params->ldc, + nop, nop, nop, num_split); + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(!impl->IsSupportedArgument(arg.get()), + impl->GetTypeString(), " does not support ", params->Signature()); + invoker->Run(arg.get(), StreamConfig{params->StreamHandle()}); + return Status::OK(); + }; + ret.emplace_back(std::make_pair(std::move(type_string), std::move(ck_gemm_op))); + } + } + return ret; +} + template auto GetCKStridedBatchedGemmTypeStringAndOps() { using CKDataType = typename CKDataTypeAdaptor::type; diff --git a/onnxruntime/core/providers/rocm/tunable/gemm_tunable.cuh b/onnxruntime/core/providers/rocm/tunable/gemm_tunable.cuh index d39fa3e66209f..dbef772f8cd96 100644 --- a/onnxruntime/core/providers/rocm/tunable/gemm_tunable.cuh +++ b/onnxruntime/core/providers/rocm/tunable/gemm_tunable.cuh @@ -58,6 +58,15 @@ class GemmTunableOp : public TunableOp> { ORT_UNUSED_PARAMETER(_); this->RegisterOp(std::move(op)); } + + for (auto&& [_, op] : GetCKStreamKGemmTypeStringAndOps()) { + ORT_UNUSED_PARAMETER(_); + this->RegisterOp(std::move(op)); + } + for (auto&& [_, op] : GetCKSplitKGemmTypeStringAndOps()) { + ORT_UNUSED_PARAMETER(_); + this->RegisterOp(std::move(op)); + } #endif } diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index e90417a6d14fc..96893f63b4540 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ 
b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -2433,6 +2433,11 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector(state); + + // The whole compute_function should be considered the critical section where multiple threads may update kernel function state, access one builder, create/serialize/save engine, + // save profile and serialize/save timing cache. Therefore, those operations should be synchronized across different threads when ORT is using multithreading. + // More details here, https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading + std::lock_guard lock(*(trt_state->tensorrt_mu_ptr)); const std::unordered_map& input_indexes = (trt_state->input_info)[0]; const std::unordered_map& output_indexes = (trt_state->output_info)[0]; const std::unordered_map& output_types = (trt_state->output_info)[1]; @@ -2475,237 +2480,230 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector lock(*(trt_state->tensorrt_mu_ptr)); - - // Load serialized engine - if (trt_state->engine_cache_enable && trt_engine == nullptr) { - std::ifstream engine_file(engine_cache_path, std::ios::binary | std::ios::in); - std::ifstream profile_file(profile_cache_path, std::ios::binary | std::ios::in); - if (engine_file && !trt_state->engine_decryption_enable && profile_file) { - // Deserialize profile - shape_ranges = DeserializeProfileV2(profile_file); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + profile_cache_path; - - // Prepare buffer - engine_file.seekg(0, std::ios::end); - size_t engine_size = engine_file.tellg(); - engine_file.seekg(0, std::ios::beg); - std::unique_ptr engine_buf{new char[engine_size]}; - engine_file.read((char*)engine_buf.get(), engine_size); - - // Deserialize engine - // Note: Deserializing an engine from a TensorRT runtime is thread safe per TRT doc - // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading - trt_state->engine->reset(); - *(trt_state->engine) = std::unique_ptr( - trt_state->runtime->deserializeCudaEngine(engine_buf.get(), engine_size, nullptr)); - if (*(trt_state->engine) == nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP Failed to Build Engine."); - } - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + engine_cache_path; - trt_engine = trt_state->engine->get(); - context_update = true; - } else if (trt_state->engine_decryption_enable && std::filesystem::exists(encrypted_engine_cache_path) && profile_file) { - shape_ranges = DeserializeProfileV2(profile_file); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + profile_cache_path; - // Decrypt engine - size_t engine_size = 0; - if (!trt_state->engine_decryption(encrypted_engine_cache_path.c_str(), nullptr, &engine_size)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT EP could not get engine buffer size"); - } - std::unique_ptr engine_buf{new char[engine_size]}; - if (!trt_state->engine_decryption(encrypted_engine_cache_path.c_str(), &engine_buf[0], &engine_size)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT EP could not call engine decryption function decrypt"); - } - // Deserialize engine - // Note: Deserializing an engine from a TensorRT runtime is thread safe per TRT doc - // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading - trt_state->engine->reset(); - *(trt_state->engine) = std::unique_ptr(trt_state->runtime->deserializeCudaEngine(engine_buf.get(), engine_size, nullptr)); - if 
(*(trt_state->engine) == nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT EP could not deserialize engine from encrypted cache: " + encrypted_engine_cache_path); - } - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Decrypted and DeSerialized " + encrypted_engine_cache_path; - trt_engine = trt_state->engine->get(); - context_update = true; + // Load serialized engine + if (trt_state->engine_cache_enable && trt_engine == nullptr) { + std::ifstream engine_file(engine_cache_path, std::ios::binary | std::ios::in); + std::ifstream profile_file(profile_cache_path, std::ios::binary | std::ios::in); + if (engine_file && !trt_state->engine_decryption_enable && profile_file) { + // Deserialize profile + shape_ranges = DeserializeProfileV2(profile_file); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + profile_cache_path; + + // Prepare buffer + engine_file.seekg(0, std::ios::end); + size_t engine_size = engine_file.tellg(); + engine_file.seekg(0, std::ios::beg); + std::unique_ptr engine_buf{new char[engine_size]}; + engine_file.read((char*)engine_buf.get(), engine_size); + + // Deserialize engine + // Note: Deserializing an engine from a TensorRT runtime is thread safe per TRT doc + // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading + trt_state->engine->reset(); + *(trt_state->engine) = std::unique_ptr( + trt_state->runtime->deserializeCudaEngine(engine_buf.get(), engine_size, nullptr)); + if (*(trt_state->engine) == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP Failed to Build Engine."); + } + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + engine_cache_path; + trt_engine = trt_state->engine->get(); + context_update = true; + } else if (trt_state->engine_decryption_enable && std::filesystem::exists(encrypted_engine_cache_path) && profile_file) { + shape_ranges = DeserializeProfileV2(profile_file); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + profile_cache_path; + // Decrypt engine + size_t engine_size = 0; + if (!trt_state->engine_decryption(encrypted_engine_cache_path.c_str(), nullptr, &engine_size)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not get engine buffer size"); + } + std::unique_ptr engine_buf{new char[engine_size]}; + if (!trt_state->engine_decryption(encrypted_engine_cache_path.c_str(), &engine_buf[0], &engine_size)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not call engine decryption function decrypt"); + } + // Deserialize engine + // Note: Deserializing an engine from a TensorRT runtime is thread safe per TRT doc + // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading + trt_state->engine->reset(); + *(trt_state->engine) = std::unique_ptr(trt_state->runtime->deserializeCudaEngine(engine_buf.get(), engine_size, nullptr)); + if (*(trt_state->engine) == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not deserialize engine from encrypted cache: " + encrypted_engine_cache_path); } + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Decrypted and DeSerialized " + encrypted_engine_cache_path; + trt_engine = trt_state->engine->get(); + context_update = true; } + } - // Check and update shape ranges for dynamic shape inputs. - for (int i = 0, end = num_inputs; i < end; ++i) { - auto input = trt_state->network->get()->getInput(i); - const std::string& input_name = input->getName(); - input_names.insert(input_name); + // Check and update shape ranges for dynamic shape inputs. 
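The shape-range bookkeeping referenced here boils down to: for each dynamic-shape input, keep a per-dimension [min, max] window, widen it when an incoming shape falls outside, and flag that the engine must be rebuilt against the widened optimization profile. A rough, self-contained sketch of that idea (the real EP also tracks an "opt" value and shape-tensor values):

#include <cstdint>
#include <map>
#include <string>
#include <utility>
#include <vector>

using ShapeRanges = std::map<std::string, std::vector<std::pair<int64_t, int64_t>>>;

bool UpdateShapeRanges(ShapeRanges& ranges, const std::string& input_name,
                       const std::vector<int64_t>& shape) {
  auto it = ranges.find(input_name);
  if (it == ranges.end()) return false;  // not a dynamic-shape input: nothing to update
  bool engine_update = false;
  auto& dims = it->second;
  for (size_t i = 0; i < shape.size() && i < dims.size(); ++i) {
    if (shape[i] < dims[i].first)  { dims[i].first = shape[i];  engine_update = true; }
    if (shape[i] > dims[i].second) { dims[i].second = shape[i]; engine_update = true; }
  }
  return engine_update;  // true => caller regenerates the engine with the new profile
}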
+ for (int i = 0, end = num_inputs; i < end; ++i) { + auto input = trt_state->network->get()->getInput(i); + const std::string& input_name = input->getName(); + input_names.insert(input_name); - // If there is any input tensor in shape_ranges, it means this input tensor has dynamic shape and its profile shape values have not yet resolved. - // TRT EP will help determine the min/max/opt profile values based on current input tensor value. - if (shape_ranges.find(input_name) != shape_ranges.end()) { - auto status = ApplyProfileShapesFromInputTensorValue(trt_profiles, ctx, input, shape_ranges, input_indexes, tensor_shape_values, stream, &engine_update); - if (status != Status::OK()) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to parse input tensor and generate optimization profiles."); - } + // If there is any input tensor in shape_ranges, it means this input tensor has dynamic shape and its profile shape values have not yet resolved. + // TRT EP will help determine the min/max/opt profile values based on current input tensor value. + if (shape_ranges.find(input_name) != shape_ranges.end()) { + auto status = ApplyProfileShapesFromInputTensorValue(trt_profiles, ctx, input, shape_ranges, input_indexes, tensor_shape_values, stream, &engine_update); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to parse input tensor and generate optimization profiles."); } } + } - // Regenerate engine - if (engine_update) { - // Destroy the IExecutionContext objects before destroying an engine object, otherwise it will lead to undefined behavior. - if (GetPerThreadContext().IsTensorRTContextInMap(fused_node_name)) { - GetPerThreadContext().ResetTensorRTContext(fused_node_name); - } + // Regenerate engine + if (engine_update) { + // Destroy the IExecutionContext objects before destroying an engine object, otherwise it will lead to undefined behavior. 
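The destruction-order rule stated in that comment is easy to get wrong, so here it is in miniature: every execution context created from an engine must be released before the engine itself is reset, and contexts are then recreated against the new engine. Toy types stand in for the TensorRT interfaces; this is an illustration of the ordering only.

#include <memory>

struct ToyEngine {};
struct ToyContext { ToyEngine* engine; };  // non-owning back-pointer, like IExecutionContext

void RebuildEngine(std::unique_ptr<ToyEngine>& engine, std::unique_ptr<ToyContext>& context) {
  context.reset();                               // 1. drop contexts built from the old engine
  engine.reset(new ToyEngine);                   // 2. only then replace the engine
  context.reset(new ToyContext{engine.get()});   // 3. recreate contexts on the new engine
}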
+ if (GetPerThreadContext().IsTensorRTContextInMap(fused_node_name)) { + GetPerThreadContext().ResetTensorRTContext(fused_node_name); + } - trt_state->engine->reset(); - auto trt_config = std::unique_ptr(trt_builder->createBuilderConfig()); - trt_config->setMaxWorkspaceSize(*(trt_state->max_workspace_size_ptr)); - for (auto trt_profile : trt_profiles) { - trt_config->addOptimizationProfile(trt_profile); - } + trt_state->engine->reset(); + auto trt_config = std::unique_ptr(trt_builder->createBuilderConfig()); + trt_config->setMaxWorkspaceSize(*(trt_state->max_workspace_size_ptr)); + for (auto trt_profile : trt_profiles) { + trt_config->addOptimizationProfile(trt_profile); + } - // Set INT8 Per Tensor Dynamic range - if (trt_state->int8_enable && trt_builder->platformHasFastInt8() && trt_state->int8_calibration_cache_available) { - trt_config->setInt8Calibrator(nullptr); - if (!SetDynamicRange(*trt_state->network->get(), trt_state->dynamic_range_map)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to set INT8 dynamic range."); - } + // Set INT8 Per Tensor Dynamic range + if (trt_state->int8_enable && trt_builder->platformHasFastInt8() && trt_state->int8_calibration_cache_available) { + trt_config->setInt8Calibrator(nullptr); + if (!SetDynamicRange(*trt_state->network->get(), trt_state->dynamic_range_map)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to set INT8 dynamic range."); } + } - // Set precision - if (trt_state->fp16_enable && trt_state->int8_enable) { - trt_config->setFlags(1U << static_cast(nvinfer1::BuilderFlag::kFP16) | 1U << static_cast(nvinfer1::BuilderFlag::kINT8)); - } else if (trt_state->fp16_enable) { - trt_config->setFlag(nvinfer1::BuilderFlag::kFP16); - } else if (trt_state->int8_enable) { - trt_config->setFlag(nvinfer1::BuilderFlag::kINT8); - } + // Set precision + if (trt_state->fp16_enable && trt_state->int8_enable) { + trt_config->setFlags(1U << static_cast(nvinfer1::BuilderFlag::kFP16) | 1U << static_cast(nvinfer1::BuilderFlag::kINT8)); + } else if (trt_state->fp16_enable) { + trt_config->setFlag(nvinfer1::BuilderFlag::kFP16); + } else if (trt_state->int8_enable) { + trt_config->setFlag(nvinfer1::BuilderFlag::kINT8); + } - // Set DLA (DLA can only run with FP16 or INT8) - if ((trt_state->fp16_enable || trt_state->int8_enable) && trt_state->dla_enable) { - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] use DLA core " << trt_state->dla_core; - trt_config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK); - trt_config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA); - trt_config->setDLACore(trt_state->dla_core); - } + // Set DLA (DLA can only run with FP16 or INT8) + if ((trt_state->fp16_enable || trt_state->int8_enable) && trt_state->dla_enable) { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] use DLA core " << trt_state->dla_core; + trt_config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK); + trt_config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA); + trt_config->setDLACore(trt_state->dla_core); + } - // enable sparse weights - if (trt_state->sparsity_enable) { - trt_config->setFlag(nvinfer1::BuilderFlag::kSPARSE_WEIGHTS); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Sparse weights are allowed"; - } + // enable sparse weights + if (trt_state->sparsity_enable) { + trt_config->setFlag(nvinfer1::BuilderFlag::kSPARSE_WEIGHTS); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Sparse weights are allowed"; + } - // enable builder heuristics - if (trt_state->build_heuristics_enable) { - 
trt_config->setFlag(nvinfer1::BuilderFlag::kENABLE_TACTIC_HEURISTIC); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Builder heuristics are enabled"; - } + // enable builder heuristics + if (trt_state->build_heuristics_enable) { + trt_config->setFlag(nvinfer1::BuilderFlag::kENABLE_TACTIC_HEURISTIC); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Builder heuristics are enabled"; + } #if NV_TENSORRT_MINOR > 5 && NV_TENSORRT_MAJOR >= 8 - // switch optimizaion level - if (trt_state->builder_optimization_level != 3) { - trt_config->setBuilderOptimizationLevel(trt_state->builder_optimization_level); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Builder optimization level is set to " << builder_optimization_level_; - } + // switch optimization level + if (trt_state->builder_optimization_level != 3) { + trt_config->setBuilderOptimizationLevel(trt_state->builder_optimization_level); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Builder optimization level is set to " << builder_optimization_level_; + } - // limit auxiliary streams - if (trt_state->auxiliary_streams >= 0) { - trt_config->setMaxAuxStreams(trt_state->auxiliary_streams); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Auxiliary streams are se to " << trt_state->auxiliary_streams; - } + // limit auxiliary streams + if (trt_state->auxiliary_streams >= 0) { + trt_config->setMaxAuxStreams(trt_state->auxiliary_streams); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Auxiliary streams are set to " << trt_state->auxiliary_streams; + } #else - if (trt_state->builder_optimization_level != 3) { - LOGS_DEFAULT(WARNING) << "[TensorRT EP] Builder optimization level can only be used on TRT 8.6 onwards!"; - } - if (trt_state->auxiliary_streams >= 0) { - LOGS_DEFAULT(WARNING) << "[TensorRT EP] Auxiliary streams can only be set on TRT 8.6 onwards!"; - } + if (trt_state->builder_optimization_level != 3) { + LOGS_DEFAULT(WARNING) << "[TensorRT EP] Builder optimization level can only be used on TRT 8.6 onwards!"; + } + if (trt_state->auxiliary_streams >= 0) { + LOGS_DEFAULT(WARNING) << "[TensorRT EP] Auxiliary streams can only be set on TRT 8.6 onwards!"; + } #endif - // limit used tactic sources - if (trt_state->filter_tactic_sources) { - nvinfer1::TacticSources tactics = trt_config->getTacticSources(); - tactics |= trt_state->tactic_sources; - trt_config->setTacticSources(tactics); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Tactic sources are limited using bitmask " << tactics; + // limit used tactic sources + if (trt_state->filter_tactic_sources) { + nvinfer1::TacticSources tactics = trt_config->getTacticSources(); + tactics |= trt_state->tactic_sources; + trt_config->setTacticSources(tactics); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Tactic sources are limited using bitmask " << tactics; + } + + // Load timing cache from file. Create a fresh cache if the file doesn't exist + std::unique_ptr timing_cache = nullptr; + if (trt_state->timing_cache_enable) { + std::vector loaded_timing_cache = loadTimingCacheFile(timing_cache_path); + timing_cache.reset(trt_config->createTimingCache(static_cast(loaded_timing_cache.data()), loaded_timing_cache.size())); + if (timing_cache == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not create timing cache: " + timing_cache_path); } - - // Load timing cache from file.
Create a fresh cache if the file doesn't exist - std::unique_ptr timing_cache = nullptr; - if (trt_state->timing_cache_enable) { - std::vector loaded_timing_cache = loadTimingCacheFile(timing_cache_path); - timing_cache.reset(trt_config->createTimingCache(static_cast(loaded_timing_cache.data()), loaded_timing_cache.size())); - if (timing_cache == nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT EP could not create timing cache: " + timing_cache_path); - } - trt_config->setTimingCache(*timing_cache, force_timing_cache_match_); - if (detailed_build_log_) { - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Deserialized timing cache from " + timing_cache_path; - } + trt_config->setTimingCache(*timing_cache, force_timing_cache_match_); + if (detailed_build_log_) { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Deserialized timing cache from " + timing_cache_path; } + } - // Build engine - { - auto lock = GetApiLock(); - std::chrono::steady_clock::time_point engine_build_start; - if (detailed_build_log_) { - engine_build_start = std::chrono::steady_clock::now(); - } - *(trt_state->engine) = std::unique_ptr( - trt_builder->buildEngineWithConfig(*trt_state->network->get(), *trt_config)); - if (detailed_build_log_) { - auto engine_build_stop = std::chrono::steady_clock::now(); - LOGS_DEFAULT(INFO) << "TensorRT engine build for " << trt_state->trt_node_name_with_precision << " took: " << std::chrono::duration_cast(engine_build_stop - engine_build_start).count() << "ms" << std::endl; - } - } - if (*(trt_state->engine) == nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP Failed to Build Engine."); + // Build engine + { + auto lock = GetApiLock(); + std::chrono::steady_clock::time_point engine_build_start; + if (detailed_build_log_) { + engine_build_start = std::chrono::steady_clock::now(); } - trt_engine = trt_state->engine->get(); - if (trt_state->engine_cache_enable) { - // Serialize engine profile - SerializeProfileV2(profile_cache_path, shape_ranges); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + profile_cache_path; - - // Serialize engine - std::unique_ptr serializedModel(trt_engine->serialize()); - size_t engine_size = serializedModel->size(); - if (trt_state->engine_decryption_enable) { - // Encrypt engine. The library is not always deployed with the encrypt function, so check if it is available first. - if (trt_state->engine_encryption != nullptr) { - if (!trt_state->engine_encryption(encrypted_engine_cache_path.c_str(), reinterpret_cast(serializedModel->data()), engine_size)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT EP could not call engine encryption function encrypt"); - } - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized and encrypted engine " + encrypted_engine_cache_path; - } else { - LOGS_DEFAULT(WARNING) << "[TensorRT EP] Engine cache encryption function is not found. 
No cache is written to disk"; + *(trt_state->engine) = std::unique_ptr( + trt_builder->buildEngineWithConfig(*trt_state->network->get(), *trt_config)); + if (detailed_build_log_) { + auto engine_build_stop = std::chrono::steady_clock::now(); + LOGS_DEFAULT(INFO) << "TensorRT engine build for " << trt_state->trt_node_name_with_precision << " took: " << std::chrono::duration_cast(engine_build_stop - engine_build_start).count() << "ms" << std::endl; + } + } + if (*(trt_state->engine) == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP Failed to Build Engine."); + } + trt_engine = trt_state->engine->get(); + if (trt_state->engine_cache_enable) { + // Serialize engine profile + SerializeProfileV2(profile_cache_path, shape_ranges); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + profile_cache_path; + + // Serialize engine + std::unique_ptr serializedModel(trt_engine->serialize()); + size_t engine_size = serializedModel->size(); + if (trt_state->engine_decryption_enable) { + // Encrypt engine. The library is not always deployed with the encrypt function, so check if it is available first. + if (trt_state->engine_encryption != nullptr) { + if (!trt_state->engine_encryption(encrypted_engine_cache_path.c_str(), reinterpret_cast(serializedModel->data()), engine_size)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not call engine encryption function encrypt"); } + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized and encrypted engine " + encrypted_engine_cache_path; } else { - std::ofstream file(engine_cache_path, std::ios::binary | std::ios::out); - file.write(reinterpret_cast(serializedModel->data()), engine_size); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + engine_cache_path; + LOGS_DEFAULT(WARNING) << "[TensorRT EP] Engine cache encryption function is not found. 
No cache is written to disk"; } + } else { + std::ofstream file(engine_cache_path, std::ios::binary | std::ios::out); + file.write(reinterpret_cast(serializedModel->data()), engine_size); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + engine_cache_path; } + } - // serialize and save timing cache - if (trt_state->timing_cache_enable) { - auto timing_cache = trt_config->getTimingCache(); - std::unique_ptr timingCacheHostData{timing_cache->serialize()}; - if (timingCacheHostData == nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT EP could not serialize timing cache: " + timing_cache_path); - } - saveTimingCacheFile(timing_cache_path, timingCacheHostData.get()); - if (detailed_build_log_) { - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized timing cache " + timing_cache_path; - } + // serialize and save timing cache + if (trt_state->timing_cache_enable) { + auto timing_cache = trt_config->getTimingCache(); + std::unique_ptr timingCacheHostData{timing_cache->serialize()}; + if (timingCacheHostData == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not serialize timing cache: " + timing_cache_path); + } + saveTimingCacheFile(timing_cache_path, timingCacheHostData.get()); + if (detailed_build_log_) { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized timing cache " + timing_cache_path; } - context_update = true; } + context_update = true; } // Build execution context if either of the following conditions is true: diff --git a/onnxruntime/core/providers/vitisai/imp/graph.cc b/onnxruntime/core/providers/vitisai/imp/graph.cc index b5f45b15a5992..cca680baf7dc0 100644 --- a/onnxruntime/core/providers/vitisai/imp/graph.cc +++ b/onnxruntime/core/providers/vitisai/imp/graph.cc @@ -151,8 +151,10 @@ void graph_save(const Graph& graph, const std::string& filename, const std::stri } } // use relative path as data storage. 
- for (auto i = 0; i < model_proto.graph().initializer_size(); ++i) { - auto initializer = model_proto.mutable_graph()->mutable_initializer(i); + auto graph_proto = model_proto.mutable_graph(); + *graph_proto = graph.ToGraphProto(); + for (auto i = 0; i < graph_proto->initializer_size(); ++i) { + auto initializer = graph_proto->mutable_initializer(i); for (auto j = 0; j < initializer->external_data_size(); ++j) { auto external_data = initializer->mutable_external_data(j); if (external_data->key() == "location") { diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 6a70176ebcc8c..5a2a6efb6df4b 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -1829,83 +1829,102 @@ const DataTransferManager& InferenceSession::GetDataTransferManager() const { return data_transfer_mgr_; } -common::Status InferenceSession::CheckShapes(const std::string& input_name, const TensorShape& input_shape, - const TensorShape& expected_shape) const { - auto input_shape_sz = input_shape.NumDimensions(); - auto expected_shape_sz = expected_shape.NumDimensions(); - if (input_shape_sz != expected_shape_sz) { - std::ostringstream ostr; - ostr << "Invalid rank for input: " << input_name << " Got: " << input_shape_sz << " Expected: " << expected_shape_sz - << " Please fix either the inputs or the model."; - return Status(ONNXRUNTIME, INVALID_ARGUMENT, ostr.str()); - } - - std::vector invalid_dim_indices; - for (size_t i = 0; i < input_shape_sz; ++i) { +common::Status InferenceSession::CheckShapes(const std::string& input_output_name, const TensorShape& input_output_shape, + const TensorShape& expected_shape, const char* input_output_moniker) const { + const auto shape_size = input_output_shape.NumDimensions(); + const auto expected_shape_size = expected_shape.NumDimensions(); + if (shape_size != expected_shape_size) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid rank for ", input_output_moniker, ": ", + input_output_name, " Got: ", shape_size, " Expected: ", expected_shape_size, + " Please fix either the inputs/outputs or the model."); + } + + InlinedVector invalid_dim_indices; + for (size_t i = 0; i < shape_size; ++i) { if (expected_shape[i] < 0) { continue; // this represents a symbolic shape dimension } - if (input_shape[i] != expected_shape[i]) { + if (input_output_shape[i] != expected_shape[i]) { invalid_dim_indices.push_back(i); } } if (!invalid_dim_indices.empty()) { std::ostringstream ostr; - ostr << "Got invalid dimensions for input: " << input_name << " for the following indices\n"; + ostr << "Got invalid dimensions for " << input_output_moniker << ": " << input_output_name << " for the following indices\n"; for (size_t i = 0, end = invalid_dim_indices.size(); i < end; ++i) { size_t idx = invalid_dim_indices[i]; - ostr << " index: " << idx << " Got: " << input_shape[idx] << " Expected: " << expected_shape[idx] << "\n"; + ostr << " index: " << idx << " Got: " << input_output_shape[idx] << " Expected: " << expected_shape[idx] << "\n"; } - ostr << " Please fix either the inputs or the model."; + ostr << " Please fix either the inputs/outputs or the model."; return Status(ONNXRUNTIME, INVALID_ARGUMENT, ostr.str()); } return Status::OK(); } -static common::Status CheckTypes(MLDataType actual, MLDataType expected, const std::string& base_type) { +static common::Status CheckTypes(MLDataType actual, MLDataType expected, const std::string& base_type, + const char* 
input_output_moniker) { if (actual == expected) { return Status::OK(); } - std::ostringstream ostr; - ostr << "Unexpected input data type. Actual: ("; - ostr << base_type; - ostr << "("; - ostr << DataTypeImpl::ToString(actual); - ostr << ")) , expected: ("; - ostr << base_type; - ostr << "("; - ostr << DataTypeImpl::ToString(expected); - ostr << "))"; - return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, ostr.str()); + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unexpected ", input_output_moniker, " data type. Actual: (", + base_type, "(", + DataTypeImpl::ToString(actual), ")) , expected: (", base_type, "(", + DataTypeImpl::ToString(expected), "))"); } -common::Status InferenceSession::ValidateInputs(gsl::span feed_names, - gsl::span feeds) const { - if (feed_names.size() != feeds.size()) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Size mismatch: feed_names has ", feed_names.size(), - "elements, but feeds has ", feeds.size(), " elements."); +common::Status InferenceSession::ValidateInputsOutputs(gsl::span names, + gsl::span feeds_fetches, + const InputOutputDefMetaMap& input_output_meta_map, + ArgType arg_type) const { + ORT_ENFORCE(arg_type == ArgType::kInput || arg_type == ArgType::kOutput, "Valid values kInput, kOutput"); + + const bool is_inputs = arg_type == ArgType::kInput; + + const char* const input_output_moniker = is_inputs ? "input" : "output"; + const char* const feed_fetches_moniker = is_inputs ? "feed" : "fetch"; + +#if !defined(DISABLE_SPARSE_TENSORS) + auto is_sparse_initializer = [this](const std::string& name) -> bool { + int idx = -1; + if (session_state_->GetOrtValueNameIdxMap().GetIdx(name, idx).IsOK()) { + return session_state_->IsSparseInitializer(idx); + } + return false; + }; +#endif + + if (names.size() != feeds_fetches.size()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, feed_fetches_moniker, " names has ", names.size(), + " elements, but ", feed_fetches_moniker, " has ", feeds_fetches.size(), " elements."); } - for (size_t i = 0; i < feeds.size(); ++i) { - const auto& feed_name = feed_names[i]; + for (size_t i = 0; i < feeds_fetches.size(); ++i) { + const auto& name = names[i]; + + auto iter = input_output_meta_map.find(name); + if (input_output_meta_map.end() == iter) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid ", input_output_moniker, " name: ", name); + } + + const auto& input_output_ml_value = feeds_fetches[i]; - auto iter = input_def_map_.find(feed_name); - if (input_def_map_.end() == iter) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid Feed Input Name:", feed_name); + // For outputs the user may supply an unallocated placeholder. 
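The unified rank/dimension check that CheckShapes now performs for both feeds and fetches can be summarized by this minimal Python sketch (symbolic dimensions are encoded as negative values and skipped; the function and names are illustrative only, not the actual implementation):

from typing import Sequence

def check_shape(name: str, got: Sequence[int], expected: Sequence[int], moniker: str = "input") -> None:
    # A rank mismatch is reported first, then any fixed dimension that differs.
    if len(got) != len(expected):
        raise ValueError(f"Invalid rank for {moniker}: {name} Got: {len(got)} Expected: {len(expected)}")
    bad = [i for i, (g, e) in enumerate(zip(got, expected)) if e >= 0 and g != e]
    if bad:
        detail = ", ".join(f"index {i}: Got {got[i]} Expected {expected[i]}" for i in bad)
        raise ValueError(f"Got invalid dimensions for {moniker}: {name} ({detail})")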
+ if (!is_inputs && !input_output_ml_value.IsAllocated()) { + continue; } auto expected_type = iter->second.ml_data_type; - auto& input_ml_value = feeds[i]; - if (input_ml_value.IsTensor()) { + + if (input_output_ml_value.IsTensor()) { if (!expected_type->IsTensorType() #if !defined(DISABLE_OPTIONAL_TYPE) && !utils::IsOptionalTensor(expected_type) #endif ) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input with name: ", feed_name, - " is not expected to be of type tensor."); + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, input_output_moniker, " with name: '", name, + "' expected to be of type: ", static_cast(expected_type->type_), " but received a tensor"); } // check for type @@ -1919,44 +1938,56 @@ common::Status InferenceSession::ValidateInputs(gsl::span fee auto expected_element_type = expected_type->AsTensorType()->GetElementType(); #endif - auto input_element_type = input_ml_value.Get().DataType(); - ORT_RETURN_IF_ERROR_SESSIONID_(CheckTypes(input_element_type, expected_element_type, "tensor")); + const auto& input_output_tensor = input_output_ml_value.Get(); + ORT_RETURN_IF_ERROR_SESSIONID_(CheckTypes(input_output_tensor.DataType(), + expected_element_type, "tensor", input_output_moniker)); // check for shape - const auto& expected_shape = iter->second.tensor_shape; - if (expected_shape.NumDimensions() > 0) { - const auto& input_shape = input_ml_value.Get().Shape(); - ORT_RETURN_IF_ERROR_SESSIONID_(CheckShapes(feed_name, input_shape, expected_shape)); + if (iter->second.tensor_shape.has_value()) { + ORT_RETURN_IF_ERROR_SESSIONID_(CheckShapes(name, input_output_tensor.Shape(), + *iter->second.tensor_shape, input_output_moniker)); } - } else if (input_ml_value.IsSparseTensor()) { + } else if (input_output_ml_value.IsSparseTensor()) { #if !defined(DISABLE_SPARSE_TENSORS) - if (!expected_type->IsSparseTensorType()) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input with name: ", feed_name, - " is not expected to be of type sparse tensor."); - } - auto expected_element_type = expected_type->AsSparseTensorType()->GetElementType(); - const SparseTensor& sparse_tensor = input_ml_value.Get(); - auto input_element_type = sparse_tensor.DataType(); - ORT_RETURN_IF_ERROR_SESSIONID_(CheckTypes(input_element_type, expected_element_type, "sparse_tensor")); - // Check shape - const auto& expected_shape = iter->second.tensor_shape; - if (expected_shape.NumDimensions() > 0) { - const auto& input_shape = sparse_tensor.DenseShape(); - ORT_RETURN_IF_ERROR_SESSIONID_(CheckShapes(feed_name, input_shape, expected_shape)); + + const SparseTensor& sparse_tensor = input_output_ml_value.Get(); + if (expected_type->IsSparseTensorType()) { + auto expected_element_type = expected_type->AsSparseTensorType()->GetElementType(); + ORT_RETURN_IF_ERROR_SESSIONID_(CheckTypes(sparse_tensor.DataType(), expected_element_type, + "sparse_tensor", input_output_moniker)); + // Check shape + if (iter->second.tensor_shape.has_value()) { + ORT_RETURN_IF_ERROR_SESSIONID_(CheckShapes(name, sparse_tensor.DenseShape(), + *iter->second.tensor_shape, input_output_moniker)); + } + } else if (is_sparse_initializer(name) && + expected_type->IsTensorType()) { + // If this metadata came from a sparse initializer converted to dense, then still validate it. 
+ auto expected_element_type = expected_type->AsTensorType()->GetElementType(); + ORT_RETURN_IF_ERROR_SESSIONID_(CheckTypes(sparse_tensor.DataType(), expected_element_type, + "sparse_tensor", input_output_moniker)); + // Check shape + if (iter->second.tensor_shape.has_value()) { + ORT_RETURN_IF_ERROR_SESSIONID_(CheckShapes(name, sparse_tensor.DenseShape(), + *iter->second.tensor_shape, input_output_moniker)); + } + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, input_output_moniker, " with name: '", name, + "' expected to be of type: ", static_cast(expected_type->type_), " but received a sparse tensor"); } #else - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input with name ", feed_name, + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, input_output_moniker, " with name ", name, " is a sparse tensor, which is not supported in this build."); #endif - } else if (input_ml_value.IsTensorSequence()) { + } else if (input_output_ml_value.IsTensorSequence()) { if (!expected_type->IsTensorSequenceType() #if !defined(DISABLE_OPTIONAL_TYPE) && !utils::IsOptionalSeqTensor(expected_type) #endif ) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input with name: ", feed_name, - " is not expected to be of type tensor sequence."); + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, input_output_moniker, " with name: '", name, + "' expected to be of type: ", static_cast(expected_type->type_), " but received a tensor sequence"); } #if !defined(DISABLE_OPTIONAL_TYPE) @@ -1969,43 +2000,40 @@ common::Status InferenceSession::ValidateInputs(gsl::span fee auto expected_element_type = expected_type->AsSequenceTensorType()->GetElementType(); #endif - auto input_element_type = input_ml_value.Get().DataType(); - ORT_RETURN_IF_ERROR_SESSIONID_(CheckTypes(input_element_type, expected_element_type, "seq")); + auto input_output_element_type = input_output_ml_value.Get().DataType(); + ORT_RETURN_IF_ERROR_SESSIONID_(CheckTypes(input_output_element_type, expected_element_type, "seq", input_output_moniker)); } else { - auto input_type = input_ml_value.Type(); - ORT_RETURN_IF_ERROR_SESSIONID_(CheckTypes(input_type, expected_type, "")); + auto input_output_type = input_output_ml_value.Type(); + ORT_RETURN_IF_ERROR_SESSIONID_(CheckTypes(input_output_type, expected_type, "", input_output_moniker)); } } return Status::OK(); } +common::Status InferenceSession::ValidateInputs(gsl::span feed_names, + gsl::span feeds) const { + return ValidateInputsOutputs(feed_names, feeds, input_def_map_, ArgType::kInput); +} + common::Status InferenceSession::ValidateOutputs(gsl::span output_names, const std::vector* p_fetches) const { - if (p_fetches == nullptr) { - return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Output vector pointer is NULL"); - } - if (output_names.empty()) { return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "At least one output should be requested."); } - if (!p_fetches->empty() && (output_names.size() != p_fetches->size())) { - std::ostringstream ostr; - ostr << "Output vector incorrectly sized: output_names.size(): " << output_names.size() - << "p_fetches->size(): " << p_fetches->size(); - return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, ostr.str()); - } + const auto fetches = (p_fetches == nullptr) ? 
EmptySpan() : gsl::make_span(*p_fetches); - for (const auto& name : output_names) { - if (model_output_names_.find(name) == model_output_names_.end()) { - return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Invalid Output Name:" + name); + if (fetches.empty()) { + for (const auto& name : output_names) { + if (output_def_map_.count(name) == 0) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid output name:", name); + } } + return Status::OK(); } - // TODO add more validation here like checking shape of the allocated buffers - - return common::Status::OK(); + return ValidateInputsOutputs(output_names, fetches, output_def_map_, ArgType::kOutput); } #ifdef ENABLE_TRAINING @@ -2483,7 +2511,7 @@ std::pair InferenceSession::GetModelOutput } } - return std::make_pair(common::Status::OK(), &output_def_list_); + return std::make_pair(common::Status::OK(), &model_->MainGraph().GetOutputs()); } common::Status InferenceSession::NewIOBinding(std::unique_ptr* io_binding) { @@ -2697,43 +2725,40 @@ common::Status InferenceSession::SaveModelMetadata(const onnxruntime::Model& mod model_metadata_.custom_metadata_map = model.MetaData(); model_metadata_.graph_name = graph.Name(); - required_inputs_.clear(); - for (auto input : graph.GetInputs()) { - required_inputs_.insert(input->Name()); - } - - auto add_inputs = [this](const InputDefList& inputs) { - input_def_map_.clear(); - input_def_map_.reserve(inputs.size()); - for (auto elem : inputs) { + auto add_inputs_outputs = [](const InputDefList& inputs_outputs, InputOutputDefMetaMap& map) { + map.reserve(inputs_outputs.size()); + for (auto elem : inputs_outputs) { auto elem_type = utils::GetMLDataType(*elem); - auto elem_shape_proto = elem->Shape(); - input_def_map_.insert( - {elem->Name(), - InputDefMetaData( - elem, elem_type, - elem_shape_proto ? utils::GetTensorShapeFromTensorShapeProto(*elem_shape_proto) : TensorShape())}); + const auto* elem_shape_proto = elem->Shape(); + if (elem_shape_proto != nullptr) { + map.emplace(elem->Name(), InputOutputDefMetaData( + elem, elem_type, + utils::GetTensorShapeFromTensorShapeProto(*elem_shape_proto))); + } else { + map.emplace(elem->Name(), InputOutputDefMetaData(elem, elem_type)); + } } }; - if (graph.CanOverrideInitializer()) { - // for IR 4 or higher it is optional to have a matching graph input for an initializer, and if one exists the - // initializer is explicitly overridable. - add_inputs(graph.GetInputsIncludingInitializers()); - } else { - // for IR < 4 we don't allow overriding initializers so that they can be treated as constant. exclude them from - // the list of valid inputs by just using the GetInputs() list. - add_inputs(graph.GetInputs()); + { + InputOutputDefMetaMap input_defs; + if (graph.CanOverrideInitializer()) { + // for IR 4 or higher it is optional to have a matching graph input for an initializer, and if one exists the + // initializer is explicitly overridable. + add_inputs_outputs(graph.GetInputsIncludingInitializers(), input_defs); + } else { + // for IR < 4 we don't allow overriding initializers so that they can be treated as constant. exclude them from + // the list of valid inputs by just using the GetInputs() list. 
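For reference, the input/output definition metadata captured here is the same information exposed through the Python API, and it is what user-supplied feeds and fetches end up being validated against. A small sketch ("model.onnx" is a placeholder):

import onnxruntime as ort

sess = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
for inp in sess.get_inputs():
    print("input ", inp.name, inp.type, inp.shape)   # symbolic dims appear as strings or None
for out in sess.get_outputs():
    print("output", out.name, out.type, out.shape)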
+ add_inputs_outputs(graph.GetInputs(), input_defs); + } + input_def_map_.swap(input_defs); } - // save outputs const auto& outputs = graph.GetOutputs(); - output_def_list_ = outputs; // A direct copy of outputs - - model_output_names_.clear(); - model_output_names_.reserve(outputs.size()); - for (const auto& elem : outputs) { - model_output_names_.insert(elem->Name()); + { + InputOutputDefMetaMap output_defs; + add_inputs_outputs(outputs, output_defs); + output_def_map_.swap(output_defs); } VLOGS(*session_logger_, 1) << "Done saving model metadata"; diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h index e4127085b3184..9259e014b9860 100644 --- a/onnxruntime/core/session/inference_session.h +++ b/onnxruntime/core/session/inference_session.h @@ -3,6 +3,7 @@ #pragma once +#include #include #include @@ -103,6 +104,22 @@ struct ModelMetadata { */ class InferenceSession { + struct InputOutputDefMetaData { + InputOutputDefMetaData(const NodeArg* node_arg0, MLDataType ml_data_type0, TensorShape&& tensor_shape0) + : node_arg(node_arg0), ml_data_type(ml_data_type0), tensor_shape(std::move(tensor_shape0)) { + } + + InputOutputDefMetaData(const NodeArg* node_arg0, MLDataType ml_data_type0) + : node_arg(node_arg0), ml_data_type(ml_data_type0) { + } + + gsl::not_null node_arg; + MLDataType ml_data_type; + std::optional tensor_shape; // not applicable if the input is non-tensor type + }; + + using InputOutputDefMetaMap = InlinedHashMap; + public: #if !defined(ORT_MINIMAL_BUILD) @@ -570,9 +587,6 @@ class InferenceSession { // if they need. std::shared_ptr model_; - // names of model outputs used for quick validation. - std::unordered_set model_output_names_; - // The file path of where the model was loaded. e.g. /tmp/test_squeezenet/model.onnx PathString model_location_; @@ -628,7 +642,7 @@ class InferenceSession { void InitLogger(logging::LoggingManager* logging_manager); [[nodiscard]] common::Status CheckShapes(const std::string& input_name, const TensorShape& input_shape, - const TensorShape& expected_shape) const; + const TensorShape& expected_shape, const char* input_output_moniker) const; [[nodiscard]] common::Status ValidateInputs(gsl::span feed_names, gsl::span feeds) const; @@ -636,6 +650,11 @@ class InferenceSession { [[nodiscard]] common::Status ValidateOutputs(gsl::span output_names, const std::vector* p_fetches) const; + [[nodiscard]] common::Status ValidateInputsOutputs(gsl::span feed_fetches_names, + gsl::span feeds_fetches, + const InputOutputDefMetaMap& input_output_meta_map, + ArgType arg_type) const; + [[nodiscard]] common::Status WaitForNotification(Notification* p_executor_done, int64_t timeout_in_ms); template @@ -737,19 +756,9 @@ class InferenceSession { #endif ModelMetadata model_metadata_; - std::unordered_set required_inputs_; - - struct InputDefMetaData { - InputDefMetaData(const NodeArg* node_arg0, MLDataType ml_data_type0, TensorShape&& tensor_shape0) - : node_arg(node_arg0), ml_data_type(ml_data_type0), tensor_shape(std::move(tensor_shape0)) { - } - const NodeArg* node_arg; - MLDataType ml_data_type; - TensorShape tensor_shape; // not applicable if the input is non-tensor type - }; - std::unordered_map input_def_map_; - OutputDefList output_def_list_; + InputOutputDefMetaMap input_def_map_; + InputOutputDefMetaMap output_def_map_; // Data transfer manager. 
DataTransferManager data_transfer_mgr_; diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc index 10c8a2de7c3df..f470e9f6b6ed1 100644 --- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc +++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc @@ -26,7 +26,18 @@ #include "core/framework/provider_options_utils.h" #ifdef USE_DML -#include "core/providers/dml/DmlExecutionProvider/src/DmlExternalBufferAllocator.h" +using Microsoft::WRL::ComPtr; + +#include +#include "core/providers/dml/DmlExecutionProvider/src/External/D3DX12/d3dx12.h" +#include "core/providers/dml/DmlExecutionProvider/src/ErrorHandling.h" +#include "core/providers/dml/DmlExecutionProvider/src/DescriptorPool.h" +#include "core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.h" +#include "core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h" +#include "core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h" +#include "core/providers/dml/DmlExecutionProvider/src/PooledUploadHeap.h" +#include "core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h" +#include "core/providers/dml/DmlExecutionProvider/src/AllocationInfo.h" #endif namespace onnxruntime { @@ -186,6 +197,11 @@ std::unique_ptr GetGPUDataTransfer() { #endif #ifdef USE_DML + +constexpr GUID execution_context_guid = {0x50fd773b, 0x4462, 0x4b28, {0x98, 0x9e, 0x8c, 0xa0, 0x54, 0x05, 0xbd, 0x4a}}; +constexpr GUID upload_heap_guid = {0x125235f9, 0xef41, 0x4043, {0xa4, 0x9d, 0xdd, 0xc9, 0x61, 0xe7, 0xdb, 0xee}}; +constexpr GUID dml_readback_heap_guid = {0x00d32df8, 0xea2d, 0x40bf, {0xa4, 0x47, 0x9c, 0xb4, 0xbc, 0xf1, 0x1d, 0x5e}}; + AllocatorPtr GetDmlAllocator(OrtDevice::DeviceId id) { // Current approach is not thread-safe, but there are some bigger infra pieces to put together in order to make // multi-threaded DML allocation work, including maintaining a per-thread DML allocator. 
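The pybind changes in the hunks below stand up a dedicated D3D12/DML allocator plus upload and readback heaps so that Python code can place OrtValues directly on a DirectML device. A usage sketch, assuming the DirectML package of onnxruntime is installed; the "dml" device string is the path these changes enable:

import numpy as np
import onnxruntime as ort

x = np.random.rand(1, 3, 224, 224).astype(np.float32)
dml_value = ort.OrtValue.ortvalue_from_numpy(x, "dml", 0)  # uploaded via CpuToDmlMemCpy
print(dml_value.device_name())
x_back = dml_value.numpy()                                 # read back via DmlToCpuMemCpy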
@@ -196,13 +212,100 @@ AllocatorPtr GetDmlAllocator(OrtDevice::DeviceId id) { auto hit = id_to_allocator_map->find(id); if (hit == id_to_allocator_map->end()) { - auto dml_allocator = std::make_shared(id); + constexpr uint32_t device_id = 0; + auto d3d12_device = onnxruntime::DMLProviderFactoryCreator::CreateD3D12Device(device_id, false); + auto dml_device = onnxruntime::DMLProviderFactoryCreator::CreateDMLDevice(d3d12_device.Get()); + + D3D12_COMMAND_QUEUE_DESC cmd_queue_desc = {}; + cmd_queue_desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT; + cmd_queue_desc.Flags = D3D12_COMMAND_QUEUE_FLAG_DISABLE_GPU_TIMEOUT; + + ComPtr cmd_queue; + ORT_THROW_IF_FAILED( + d3d12_device->CreateCommandQueue(&cmd_queue_desc, IID_PPV_ARGS(cmd_queue.ReleaseAndGetAddressOf()))); + + auto context = std::make_shared(d3d12_device.Get(), dml_device.Get(), cmd_queue.Get()); + + // We leak the upload and readback heaps to keep them alive, just like the map + auto upload_heap = std::make_unique(d3d12_device.Get(), context).release(); + auto readback_heap = std::make_unique(d3d12_device.Get(), context).release(); + + auto dml_allocator = std::make_shared( + d3d12_device.Get(), + context, + CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT), + D3D12_HEAP_FLAG_NONE, + D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + std::make_unique(d3d12_device.Get())); + dml_allocator->SetDefaultRoundingMode(AllocatorRoundingMode::Enabled); + context->SetAllocator(dml_allocator); + + auto context_ptr = context.get(); + + ORT_THROW_IF_FAILED(d3d12_device->SetPrivateData(execution_context_guid, sizeof(context_ptr), &context_ptr)); + ORT_THROW_IF_FAILED(d3d12_device->SetPrivateData(upload_heap_guid, sizeof(upload_heap), &upload_heap)); + ORT_THROW_IF_FAILED(d3d12_device->SetPrivateData(dml_readback_heap_guid, sizeof(readback_heap), &readback_heap)); + hit = id_to_allocator_map->emplace(id, std::move(dml_allocator)).first; } return hit->second; } +void CpuToDmlMemCpy(void* dst, const void* src, size_t num_bytes) { + const auto* allocInfo = static_cast(dst); + ID3D12Resource* dst_data = allocInfo->GetResource(); + + ComPtr d3d12_device; + ORT_THROW_IF_FAILED(dst_data->GetDevice(IID_PPV_ARGS(d3d12_device.ReleaseAndGetAddressOf()))); + + Dml::ExecutionContext* context = nullptr; + uint32_t context_size = gsl::narrow_cast(sizeof(context)); + ORT_THROW_IF_FAILED(d3d12_device->GetPrivateData(execution_context_guid, &context_size, &context)); + + Dml::PooledUploadHeap* upload_heap = nullptr; + uint32_t upload_heap_size = gsl::narrow_cast(sizeof(upload_heap)); + ORT_THROW_IF_FAILED(d3d12_device->GetPrivateData(upload_heap_guid, &upload_heap_size, &upload_heap)); + + upload_heap->BeginUploadToGpu( + dst_data, 0, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, gsl::make_span(static_cast(src), num_bytes)); + context->Flush(); + + // We don't use the same command queue as the execution provider, so we need to sync to make sure that all data has + // been uploaded to the resource. This function is usually called before inference just to upload initial data to the + // GPU, so it shouldn't be a bottleneck. 
+ context->GetCurrentCompletionEvent().WaitForSignal(); +} + +void DmlToCpuMemCpy(void* dst, const void* src, size_t num_bytes) { + const auto* allocInfo = static_cast(src); + ID3D12Resource* src_data = allocInfo->GetResource(); + + ComPtr d3d12_device; + ORT_THROW_IF_FAILED(src_data->GetDevice(IID_PPV_ARGS(d3d12_device.ReleaseAndGetAddressOf()))); + + Dml::ExecutionContext* context = nullptr; + uint32_t context_size = gsl::narrow_cast(sizeof(context)); + ORT_THROW_IF_FAILED(d3d12_device->GetPrivateData(execution_context_guid, &context_size, &context)); + + Dml::ReadbackHeap* readback_heap = nullptr; + uint32_t readback_heap_size = gsl::narrow_cast(sizeof(readback_heap)); + ORT_THROW_IF_FAILED(d3d12_device->GetPrivateData(dml_readback_heap_guid, &readback_heap_size, &readback_heap)); + + // ReadbackFromGpu already syncs with the CPU and waits for the copy to be completed, so we don't need to sync after + // this call + readback_heap->ReadbackFromGpu( + gsl::make_span(static_cast(dst), num_bytes), src_data, 0, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); +} + +const std::unordered_map* GetDmlToHostMemCpyFunction() { + static std::unordered_map map{ + {OrtDevice::GPU, DmlToCpuMemCpy}}; + + return ↦ +} + #endif #ifdef USE_CANN diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.h b/onnxruntime/python/onnxruntime_pybind_mlvalue.h index 4ac9c70468b19..e3f277bcb9c41 100644 --- a/onnxruntime/python/onnxruntime_pybind_mlvalue.h +++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.h @@ -77,6 +77,12 @@ std::unique_ptr GetGPUDataTransfer(); AllocatorPtr GetDmlAllocator(OrtDevice::DeviceId id); +void CpuToDmlMemCpy(void* dst, const void* src, size_t num_bytes); + +void DmlToCpuMemCpy(void* dst, const void* src, size_t num_bytes); + +const std::unordered_map* GetDmlToHostMemCpyFunction(); + #endif #ifdef USE_CANN diff --git a/onnxruntime/python/onnxruntime_pybind_ortvalue.cc b/onnxruntime/python/onnxruntime_pybind_ortvalue.cc index f9d908e0ac518..dc4a4dcc13b7f 100644 --- a/onnxruntime/python/onnxruntime_pybind_ortvalue.cc +++ b/onnxruntime/python/onnxruntime_pybind_ortvalue.cc @@ -63,7 +63,12 @@ void addOrtValueMethods(pybind11::module& m) { // Likewise, there is no need to specify the name (as the name was previously used to lookup the def list) // TODO: Add check to ensure that string arrays are not passed - we currently don't support string tensors in CUDA CreateGenericMLValue(nullptr, GetRocmAllocator(device.Id()), "", array_on_cpu, ml_value.get(), true, false, CpuToRocmMemCpy); - +#elif USE_DML + // InputDeflist is null because OrtValue creation is not tied to a specific model + // Likewise, there is no need to specify the name (as the name was previously used to lookup the def list) + // TODO: Add check to ensure that string arrays are not passed - we currently don't support string tensors in DML + CreateGenericMLValue( + nullptr, GetDmlAllocator(device.Id()), "", array_on_cpu, ml_value.get(), true, false, CpuToDmlMemCpy); #else throw std::runtime_error( "Can't allocate memory on the CUDA device using this package of OnnxRuntime. 
" @@ -126,6 +131,12 @@ void addOrtValueMethods(pybind11::module& m) { values_type, *(ml_value->GetMutable()), CpuToRocmMemCpy); +#elif USE_DML + onnxruntime::python::CopyDataToTensor( + py_values, + values_type, + *(ml_value->GetMutable()), + CpuToDmlMemCpy); #else throw std::runtime_error( "Unsupported GPU device: Cannot find the supported GPU device."); @@ -158,12 +169,18 @@ void addOrtValueMethods(pybind11::module& m) { throw std::runtime_error("The provided device id doesn't match any available GPUs on the machine."); } allocator = GetCudaAllocator(device.Id()); -#elif USE_DML - allocator = GetDmlAllocator(device.Id()); #else throw std::runtime_error( "Can't allocate memory on the CUDA device using this package of OnnxRuntime. " "Please use the CUDA package of OnnxRuntime to use this feature."); +#endif + } else if (strcmp(GetDeviceName(device), DML) == 0) { +#if USE_DML + allocator = GetDmlAllocator(device.Id()); +#else + throw std::runtime_error( + "Can't allocate memory on the DirectML device using this package of OnnxRuntime. " + "Please use the DirectML package of OnnxRuntime to use this feature."); #endif } else { throw std::runtime_error("Unsupported device: Cannot place the OrtValue on this device"); @@ -290,11 +307,13 @@ void addOrtValueMethods(pybind11::module& m) { #ifdef USE_CUDA GetPyObjFromTensor(ml_value->Get(), obj, nullptr, GetCudaToHostMemCpyFunction()); #elif USE_ROCM - GetPyObjFromTensor(ml_value->Get(), obj, nullptr, GetRocmToHostMemCpyFunction()); + GetPyObjFromTensor(ml_value->Get(), obj, nullptr, GetRocmToHostMemCpyFunction()); #elif USE_CANN - GetPyObjFromTensor(ml_value->Get(), obj, nullptr, GetCannToHostMemCpyFunction()); + GetPyObjFromTensor(ml_value->Get(), obj, nullptr, GetCannToHostMemCpyFunction()); +#elif USE_DML + GetPyObjFromTensor(ml_value->Get(), obj, nullptr, GetDmlToHostMemCpyFunction()); #else - GetPyObjFromTensor(ml_value->Get(), obj, nullptr, nullptr); + GetPyObjFromTensor(ml_value->Get(), obj, nullptr, nullptr); #endif return obj; }) diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 82d119894a5d8..907ea0ec41e23 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -237,7 +237,11 @@ const char* GetDeviceName(const OrtDevice& device) { case OrtDevice::CPU: return CPU; case OrtDevice::GPU: +#ifdef USE_DML + return DML; +#else return CUDA; +#endif case OrtDevice::FPGA: return "FPGA"; case OrtDevice::NPU: diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/rocm/gemm_ck.cu b/onnxruntime/python/tools/kernel_explorer/kernels/rocm/gemm_ck.cu index 242035371435c..6707892cca50e 100644 --- a/onnxruntime/python/tools/kernel_explorer/kernels/rocm/gemm_ck.cu +++ b/onnxruntime/python/tools/kernel_explorer/kernels/rocm/gemm_ck.cu @@ -60,6 +60,14 @@ class CKGemm : public IKernelExplorer { type_strings_.emplace_back(std::move(type_string)); ops_.emplace_back(std::move(op)); } + for (auto&& [type_string, op] : GetCKStreamKGemmTypeStringAndOps()) { + type_strings_.emplace_back(std::move(type_string)); + ops_.emplace_back(std::move(op)); + } + for (auto&& [type_string, op] : GetCKSplitKGemmTypeStringAndOps()) { + type_strings_.emplace_back(std::move(type_string)); + ops_.emplace_back(std::move(op)); + } ORT_ENFORCE(!ops_.empty()); } diff --git a/onnxruntime/python/tools/transformers/bert_perf_test.py b/onnxruntime/python/tools/transformers/bert_perf_test.py index ddf152ba13964..9c743a83819c3 100644 --- 
a/onnxruntime/python/tools/transformers/bert_perf_test.py +++ b/onnxruntime/python/tools/transformers/bert_perf_test.py @@ -235,7 +235,12 @@ def to_string(model_path, session, test_setting): option += "graph_optimization_level={},intra_op_num_threads={},".format( sess_options.graph_optimization_level, sess_options.intra_op_num_threads ).replace("GraphOptimizationLevel.ORT_", "") - option += f"batch_size={test_setting.batch_size},sequence_length={test_setting.sequence_length},test_cases={test_setting.test_cases},test_times={test_setting.test_times},use_gpu={test_setting.use_gpu}" + + option += f"batch_size={test_setting.batch_size},sequence_length={test_setting.sequence_length}," + option += f"test_cases={test_setting.test_cases},test_times={test_setting.test_times}," + option += f"use_gpu={test_setting.use_gpu},use_io_binding={test_setting.use_io_binding}," + option += f"average_sequence_length={test_setting.average_sequence_length}," + option += f"random_sequence_length={test_setting.random_sequence_length}" return option diff --git a/onnxruntime/python/tools/transformers/compare_bert_results.py b/onnxruntime/python/tools/transformers/compare_bert_results.py index 4cb9585962143..61e4c97c75c8c 100644 --- a/onnxruntime/python/tools/transformers/compare_bert_results.py +++ b/onnxruntime/python/tools/transformers/compare_bert_results.py @@ -33,18 +33,20 @@ def run_model(model_path, all_inputs, use_gpu, disable_optimization): return results, latency_list, output_names -def compare(baseline_results, treatment_results, verbose, rtol=1e-3, atol=1e-4): +def compare(baseline_results, treatment_results, verbose, rtol=1e-1, atol=1e-3): # Validate the output of baseline and treatment, to make sure the results are similar. diff_count = 0 - max_rel_diff = 0 max_abs_diff = 0 for test_case_id, results in enumerate(baseline_results): case_passed = True for i in range(len(results)): treatment_output = treatment_results[test_case_id][i] - rel_diff = np.amax(np.abs((treatment_output - results[i]) / results[i])) abs_diff = np.amax(np.abs(treatment_output - results[i])) - max_rel_diff = max(max_rel_diff, rel_diff) + if verbose and abs_diff > atol: + print("abs_diff", abs_diff) + print("treatment", treatment_output) + print("baseline", results[i]) + max_abs_diff = max(max_abs_diff, abs_diff) if not np.allclose(results[i].tolist(), treatment_output.tolist(), rtol=rtol, atol=atol): if case_passed: @@ -54,7 +56,7 @@ def compare(baseline_results, treatment_results, verbose, rtol=1e-3, atol=1e-4): if verbose: print(f"case {test_case_id} output {i}") print(f"baseline={results[i].tolist()}\ntreatment={treatment_output}") - print(f"rel_diff={rel_diff} abs_diff={abs_diff}") + print(f"abs_diff={abs_diff}") if diff_count == 0: print( @@ -70,8 +72,7 @@ def compare(baseline_results, treatment_results, verbose, rtol=1e-3, atol=1e-4): ) print(f"maximum absolute difference={max_abs_diff}") - - print(f"maximum relative difference={max_rel_diff}") + return max_abs_diff, case_passed def run_test( @@ -133,7 +134,7 @@ def run_test( print(f"treatment average latency: {statistics.mean(treatment_latency) * 1000} ms") # Validate the output of baseline and treatment, to make sure the results are similar. 
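The updated compare() below drops the relative-difference bookkeeping and instead returns the maximum absolute difference together with a pass/fail flag. The core of that comparison is roughly the following sketch (not the exact implementation):

import numpy as np

def max_abs_diff_all_close(baseline_outputs, treatment_outputs, rtol=1e-1, atol=1e-3):
    # Track the worst absolute difference and whether every output pair stays
    # within the np.allclose tolerances used by the test.
    worst, all_close = 0.0, True
    for b, t in zip(baseline_outputs, treatment_outputs):
        worst = max(worst, float(np.amax(np.abs(np.asarray(t) - np.asarray(b)))))
        all_close = all_close and np.allclose(b, t, rtol=rtol, atol=atol)
    return worst, all_close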
- compare(baseline_results, treatment_results, verbose, rtol, atol) + return compare(baseline_results, treatment_results, verbose, rtol, atol) def parse_arguments(): diff --git a/onnxruntime/python/tools/transformers/convert_generation.py b/onnxruntime/python/tools/transformers/convert_generation.py index 73561d312e4d4..63c991167d235 100644 --- a/onnxruntime/python/tools/transformers/convert_generation.py +++ b/onnxruntime/python/tools/transformers/convert_generation.py @@ -883,7 +883,8 @@ def remove_shared_initializers( graph2: GraphProto, shared_prefix: str = "shared_", min_elements: int = 1024, - require_raw_data: bool = False, + signature_cache1: Optional[dict] = None, + signature_cache2: Optional[dict] = None, ): """Remove initializers with same value from two graphs. @@ -892,7 +893,8 @@ def remove_shared_initializers( graph2 (GraphProto): the second graph to process shared_prefix (str): add prefix to the shared initializers among two graphs min_elements (int, optional): minimal number of elements for initializers to be considered. Defaults to 1024. - require_raw_data (bool, optional): Only remove tensors with raw_data field to speed up method + signature_cache1 (dict): Optional dictionary to store data signatures of tensors in graph1 in order to speed up comparison + signature_cache2 (dict): Optional dictionary to store data signatures of tensors in graph2 in order to speed up comparison """ mapping_initializers_1 = {} @@ -909,7 +911,7 @@ def remove_shared_initializers( if not (initializer2.dims and sum(initializer2.dims) >= min_elements): continue - if OnnxModel.has_same_value(initializer1, initializer2, require_raw_data=True): + if OnnxModel.has_same_value(initializer1, initializer2, signature_cache1, signature_cache2): mapping_initializers_1[initializer1.name] = shared_prefix + initializer2.name shared_initializers_1.append(initializer1) @@ -982,14 +984,17 @@ def remove_shared_initializers( return shared_initializers_2 -def get_shared_initializers(encoder_model: ModelProto, decoder_model: ModelProto, require_raw_data: bool = False): +def get_shared_initializers(encoder_model: ModelProto, decoder_model: ModelProto): encoder = OnnxModel(encoder_model) decoder = OnnxModel(decoder_model) encoder.add_prefix_to_names("e_") decoder.add_prefix_to_names("d_") - encoder.remove_duplicated_initializer(require_raw_data) - decoder.remove_duplicated_initializer(require_raw_data) - initializers = remove_shared_initializers(decoder.model.graph, encoder.model.graph, "s_", require_raw_data) + signature_cache1, signature_cache2 = {}, {} + encoder.remove_duplicated_initializer(signature_cache1) + decoder.remove_duplicated_initializer(signature_cache2) + initializers = remove_shared_initializers( + decoder.model.graph, encoder.model.graph, "s_", signature_cache1, signature_cache2 + ) return initializers diff --git a/onnxruntime/python/tools/transformers/fusion_attention.py b/onnxruntime/python/tools/transformers/fusion_attention.py index 31496c50523da..40f2aee875382 100644 --- a/onnxruntime/python/tools/transformers/fusion_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_attention.py @@ -78,14 +78,7 @@ def process_mask(self, input: str) -> str: # ReduceSum-13: axes is moved from attribute to input axes_name = "ort_const_1_reduce_sum_axes" if self.model.get_initializer(axes_name) is None: - self.model.add_initializer( - helper.make_tensor( - name=axes_name, - data_type=TensorProto.INT64, - dims=[1], - vals=[1], - ) - ) + self.add_initializer(name=axes_name, data_type=TensorProto.INT64, 
dims=[1], vals=[1], raw=False) mask_index_node = helper.make_node( "ReduceSum", inputs=[input_name, axes_name], @@ -203,7 +196,7 @@ def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int] def get_add_qk_str(self, add_qk: NodeProto): shape_infer = self.model.infer_runtime_shape(update=True) if shape_infer is None: - return + return None input_0_shape = shape_infer.get_edge_shape(add_qk.input[0]) input_1_shape = shape_infer.get_edge_shape(add_qk.input[1]) @@ -428,19 +421,12 @@ def create_combined_qkv_bias( qkv_bias_dim = 3 * np.prod(qb.shape) bias_name = name_prefix + "_qkv_bias" - bias = helper.make_tensor( + self.add_initializer( name=bias_name, - data_type=TensorProto.FLOAT, + data_type=q_bias.data_type, dims=[qkv_bias_dim], - vals=qkv_bias.flatten().tolist(), + vals=qkv_bias, ) - - # Convert bias to FP16 if model is using FP16 - if q_bias.data_type == 10: - bias.CopyFrom(numpy_helper.from_array(NumpyHelper.to_array(bias).astype(np.float16), bias.name)) - - self.model.add_initializer(bias, self.this_graph_name) - return bias_name def create_packed_qkv_matmul_node( @@ -488,13 +474,13 @@ def create_packed_qkv_matmul_node( qkv_weight = np.stack((qw, kw, vw), axis=1).reshape((d, 3 * d)) qkv_weight_name = matmul_node_name + "_qkv_weight" - weight = helper.make_tensor( + + self.add_initializer( name=qkv_weight_name, - data_type=TensorProto.FLOAT, + data_type=q_weight.data_type, dims=[qkv_weight.shape[0], qkv_weight.shape[1]], - vals=qkv_weight.flatten().tolist(), + vals=qkv_weight, ) - self.model.add_initializer(weight, self.this_graph_name) # Created packed QKV MatMul with output (B, S, 3*D) # Output is of the form: @@ -519,23 +505,15 @@ def create_packed_qkv_matmul_node( # Create Slice nodes to access Q, K, V q_slice_name = matmul_node_name + "_q_start_index" - q_start_tensor = helper.make_tensor(name=q_slice_name, data_type=TensorProto.INT64, dims=[1], vals=[0]) + self.add_initializer(name=q_slice_name, data_type=TensorProto.INT64, dims=[1], vals=[0], raw=False) k_slice_name = matmul_node_name + "_k_start_index" - k_start_tensor = helper.make_tensor(name=k_slice_name, data_type=TensorProto.INT64, dims=[1], vals=[d]) + self.add_initializer(name=k_slice_name, data_type=TensorProto.INT64, dims=[1], vals=[d], raw=False) v_slice_name = matmul_node_name + "_v_start_index" - v_start_tensor = helper.make_tensor(name=v_slice_name, data_type=TensorProto.INT64, dims=[1], vals=[2 * d]) + self.add_initializer(name=v_slice_name, data_type=TensorProto.INT64, dims=[1], vals=[2 * d], raw=False) end_of_qkv_name = matmul_node_name + "_end_of_qkv_index" - end_of_qkv_tensor = helper.make_tensor( - name=end_of_qkv_name, data_type=TensorProto.INT64, dims=[1], vals=[3 * d] - ) + self.add_initializer(name=end_of_qkv_name, data_type=TensorProto.INT64, dims=[1], vals=[3 * d], raw=False) qkv_last_axis_name = matmul_node_name + "_qkv_last_axis" - qkv_axis_tensor = helper.make_tensor(name=qkv_last_axis_name, data_type=TensorProto.INT64, dims=[1], vals=[-1]) - - self.model.add_initializer(q_start_tensor, self.this_graph_name) - self.model.add_initializer(k_start_tensor, self.this_graph_name) - self.model.add_initializer(v_start_tensor, self.this_graph_name) - self.model.add_initializer(end_of_qkv_tensor, self.this_graph_name) - self.model.add_initializer(qkv_axis_tensor, self.this_graph_name) + self.add_initializer(name=qkv_last_axis_name, data_type=TensorProto.INT64, dims=[1], vals=[-1], raw=False) q_slice_output = matmul_node_name + "_q_out" q_slice = helper.make_node( @@ -719,6 +697,7 @@ def 
create_attention_node( present_k: str = "", present_v: str = "", scale: Optional[float] = None, + causal: bool = False, ) -> Union[NodeProto, None]: """Create an Attention node. @@ -739,6 +718,8 @@ def create_attention_node( past_v (str): name of input for past V value present_k (str): name of output to store present K value present_v (str): name of output to store present V value + scale: scale before softmax + causal: whether it is uni-directional mask. Returns: Union[NodeProto, None]: the node created or None if failed. @@ -823,7 +804,6 @@ def create_attention_node( assert q_bias_shape == k_bias_shape == qw_out_size assert v_bias_shape == vw_out_size - qkv_bias_dim = 0 if is_qkv_diff_dims: qkv_bias = np.concatenate((qb, kb, vb), axis=0) qkv_bias_dim = q_bias_shape + k_bias_shape + v_bias_shape @@ -834,33 +814,24 @@ def create_attention_node( attention_node_name = self.model.create_node_name("Attention") if not self.use_multi_head_attention: - weight = helper.make_tensor( + self.add_initializer( name=attention_node_name + "_qkv_weight", - data_type=TensorProto.FLOAT, + data_type=q_weight.data_type, dims=[qw_in_size, qkv_weight_dim], - vals=qkv_weight.flatten().tolist(), + vals=qkv_weight, ) - # Sometimes weights and bias are stored in fp16 - if q_weight.data_type == 10: - weight.CopyFrom(numpy_helper.from_array(NumpyHelper.to_array(weight).astype(np.float16), weight.name)) - self.model.add_initializer(weight, self.this_graph_name) - - bias = None if has_bias: - bias = helper.make_tensor( + self.add_initializer( name=attention_node_name + "_qkv_bias", - data_type=TensorProto.FLOAT, + data_type=q_bias.data_type, dims=[qkv_bias_dim], - vals=qkv_bias.flatten().tolist(), + vals=qkv_bias, ) - if q_bias.data_type == 10: - bias.CopyFrom(numpy_helper.from_array(NumpyHelper.to_array(bias).astype(np.float16), bias.name)) - self.model.add_initializer(bias, self.this_graph_name) # For MultiHeadAttention operator, use separated inputs for query, key and value, and no weights. if self.use_multi_head_attention: - if add_qk_str is not None: + if add_qk_str: logger.debug("MultiHeadAttention does not support relative_position_bias: cannot fuse the attention.") return None @@ -896,7 +867,7 @@ def create_attention_node( past_kv = self.concat_kv(past_k, past_v) attention_inputs.append(past_kv) - if add_qk_str is not None: + if add_qk_str: # Convert 4d mask from (B,1,M,M) to (B,N,M,M) # B = batch size, M = max sequence length, N = num heads concat_node_name = self.model.create_node_name("Concat") @@ -933,6 +904,9 @@ def create_attention_node( attention_node.domain = "com.microsoft" attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) + if causal: + attention_node.attribute.extend([helper.make_attribute("unidirectional", 1)]) + if scale is not None: attention_node.attribute.extend([helper.make_attribute("scale", scale)]) @@ -1166,6 +1140,13 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): attention_last_node = reshape_qkv if einsum_node is None else transpose_qkv q_num_heads, q_hidden_size = self.get_num_heads_and_hidden_size(reshape_q) + if q_num_heads <= 0 or q_hidden_size <= 0: + logger.warning( + "Failed to detect num_heads and hidden_size for Attention fusion. " + "Please specify those parameters in argument." 
+ ) + return + # number of heads are same for all the paths, hence to create attention node, we pass the q_num_heads # the input_hidden_size represents the input hidden size, this is used as needed but hidden sizes for Q, K are extracted appropriately new_node = self.create_attention_node( @@ -1191,14 +1172,15 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): if einsum_node is not None: unique_index = einsum_node.input[0] new_edge = "edge_modified_" + unique_index - shape_tensor = helper.make_tensor( + + shape_tensor = self.add_initializer( name="shape_modified_tensor" + unique_index, data_type=TensorProto.INT64, dims=[4], - vals=np.int64([0, 0, q_num_heads, int(q_hidden_size / q_num_heads)]).tobytes(), - raw=True, + vals=np.int64([0, 0, q_num_heads, int(q_hidden_size / q_num_heads)]), + raw=False, ) - self.model.add_initializer(shape_tensor, self.this_graph_name) + self.model.add_node( helper.make_node( "Reshape", diff --git a/onnxruntime/python/tools/transformers/fusion_attention_clip.py b/onnxruntime/python/tools/transformers/fusion_attention_clip.py new file mode 100644 index 0000000000000..d400e248d6cca --- /dev/null +++ b/onnxruntime/python/tools/transformers/fusion_attention_clip.py @@ -0,0 +1,218 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +from logging import getLogger +from typing import Tuple + +from fusion_attention import AttentionMask, FusionAttention +from fusion_options import AttentionMaskFormat +from onnx import NodeProto +from onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionAttentionClip(FusionAttention): + """ + Fuse Attention subgraph of Clip into one Attention node. 
+ """ + + def __init__( + self, + model: OnnxModel, + hidden_size: int, + num_heads: int, + ): + attention_mask = AttentionMask(model) + attention_mask.mask_format = AttentionMaskFormat.NoMask + + super().__init__( + model, + hidden_size, + num_heads, + attention_mask, + use_multi_head_attention=False, + search_op_types=["SkipLayerNormalization"], + ) + + def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]: + """Detect num_heads and hidden_size for ONNX model from MiDaS + Args: + reshape_q (NodeProto): reshape node for q + Returns: + Tuple[int, int]: num_heads and hidden_size + """ + concat = self.model.match_parent(reshape_q, "Concat", 1) + if concat is None or len(concat.input) != 4: + return self.num_heads, self.hidden_size + + # The shape is a tensor like [?, ?, num_heads, head_size] + num_head_value = self.model.get_constant_value(concat.input[2]) + if num_head_value is None: + return self.num_heads, self.hidden_size # Fall back to user specified value + + if len(num_head_value) != 1 or num_head_value[0] <= 0: + return self.num_heads, self.hidden_size # Fall back to user specified value + + num_heads = num_head_value[0] + + head_size_value = self.model.get_constant_value(concat.input[3]) + if head_size_value is None: + return self.num_heads, self.hidden_size # Fall back to user specified value + + if len(head_size_value) != 1 or head_size_value[0] <= 0: + return self.num_heads, self.hidden_size # Fall back to user specified value + + head_size = head_size_value[0] + + hidden_size = num_heads * head_size + + if self.num_heads > 0 and num_heads != self.num_heads: + if self.num_heads_warning: + logger.warning(f"--num_heads is {self.num_heads}. Detected value is {num_heads}. Using detected value.") + self.num_heads_warning = False # Do not show the warning more than once + + if self.hidden_size > 0 and hidden_size != self.hidden_size: + if self.hidden_size_warning: + logger.warning( + f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value." + ) + self.hidden_size_warning = False # Do not show the warning more than once + + return num_heads, hidden_size + + def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): + skip_input_index = None + node_before_layer_norm = None + for i in [1, 0]: + parent = self.model.match_parent(normalize_node, "SkipLayerNormalization", i) + if parent is not None: + skip_input_index = i + node_before_layer_norm = parent + + root_input = None + if node_before_layer_norm is not None: + root_input = node_before_layer_norm.output[0] + else: + # Deal with the first attention after the embedding layer. 
+ for i in [0, 1]: + node_before_layer_norm = self.model.match_parent(normalize_node, "Add", i) + if node_before_layer_norm is None: + continue + child = self.model.find_first_child_by_type( + node_before_layer_norm, "LayerNormalization", input_name_to_nodes, False + ) + if child is None: + continue + root_input = child.output[0] + skip_input_index = i + break + + if skip_input_index is None: + return + + qkv_nodes = self.model.match_parent_path( + normalize_node, + ["Add", "MatMul", "Reshape", "Transpose", "Reshape", "MatMul"], + [1 - skip_input_index, None, None, 0, 0, 0], + ) + if qkv_nodes is None: + return + + (_, _, reshape_qkv, transpose_qkv, _, matmul_qkv) = qkv_nodes + + v_nodes = self.model.match_parent_path( + matmul_qkv, ["Reshape", "Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, 0, None] + ) + if v_nodes is None: + logger.debug("fuse_attention: failed to match v path") + return + (_, _, reshape_v, add_v, matmul_v) = v_nodes + + add_mask_indices = [] + qk_nodes = self.model.match_parent_path( + matmul_qkv, + ["Softmax", "Reshape", "Add", "Reshape", "MatMul"], + [0, 0, 0, None, 0], + return_indice=add_mask_indices, + ) + if qk_nodes is None: + logger.debug("fuse_attention: failed to match qk path") + return + assert len(add_mask_indices) == 1 + causal_mask_input_index = 1 - add_mask_indices[0] + + (_softmax_qk, _, add_mask, _, matmul_qk) = qk_nodes + + q_nodes = self.model.match_parent_path( + matmul_qk, ["Reshape", "Transpose", "Reshape", "Mul", "Add", "MatMul"], [0, 0, 0, 0, None, None] + ) + if q_nodes is None: + logger.debug("fuse_attention: failed to match q path") + return + (_, _transpose_q, reshape_q, mul_q, add_q, matmul_q) = q_nodes + + k_nodes = self.model.match_parent_path( + matmul_qk, ["Transpose", "Reshape", "Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, 0, 0, None] + ) + if k_nodes is None: + logger.debug("fuse_attention: failed to match k path") + return + + (_transpose_k, _reshape_k, _, _, add_k, matmul_k) = k_nodes + if matmul_q.input[0] != root_input or matmul_k.input[0] != root_input or matmul_v.input[0] != root_input: + logger.debug("fuse_attention: expect to have same input to q, k and v matmul") + return + + num_heads, hidden_size = self.get_num_heads_and_hidden_size(reshape_q) + if num_heads <= 0 or hidden_size <= 0: + logger.debug("fuse_attention: failed to detect num_heads or hidden_size") + return + + attention_last_node = reshape_qkv + + # Here we do not match the whole subgraph since it is very complex. Instead, we just check whether a key path + # of computing causal mask. 
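For context, the subgraph matched below produces an additive causal mask: positions after the current token receive a large negative value so Softmax zeroes them out, and the fusion then replaces that subgraph with the Attention node's unidirectional attribute. Conceptually (a numpy sketch, not the exporter's exact graph):

import numpy as np

seq_len = 8
rows = np.arange(seq_len)[:, None]   # query position i
cols = np.arange(seq_len)[None, :]   # key position j
causal_mask = np.where(cols <= rows, 0.0, np.finfo(np.float32).min).astype(np.float32)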
+ causal_mask_nodes = self.model.match_parent_path( + add_mask, + ["Concat", "Expand", "Unsqueeze", "Unsqueeze", "Where", "Less"], + [causal_mask_input_index, 0, 0, 0, 0, 0], + ) + if causal_mask_nodes is None: + # If the model is exported with batch_size == 1, there is no Concat node + causal_mask_nodes = self.model.match_parent_path( + add_mask, + ["Expand", "Unsqueeze", "Unsqueeze", "Where", "Less"], + [causal_mask_input_index, 0, 0, 0, 0], + ) + if causal_mask_nodes is None: + logger.debug("fuse_attention: failed to match causal mask subgraph") + return + + new_node = self.create_attention_node( + mask_index=None, + q_matmul=matmul_q, + k_matmul=matmul_k, + v_matmul=matmul_v, + q_add=add_q, + k_add=add_k, + v_add=add_v, + num_heads=num_heads, + hidden_size=hidden_size, + input=root_input, + output=attention_last_node.output[0], + add_qk_str=None, + scale=None, + causal=True, + ) + if new_node is None: + return + + self.nodes_to_add.append(new_node) + self.node_name_to_graph_name[new_node.name] = self.this_graph_name + + self.nodes_to_remove.extend([attention_last_node, transpose_qkv]) + + # Use prune graph to remove nodes since they are shared by all attention nodes. + self.prune_graph = True diff --git a/onnxruntime/python/tools/transformers/fusion_attention_unet.py b/onnxruntime/python/tools/transformers/fusion_attention_unet.py index 902b1f4f9549e..250ec5f3eb159 100644 --- a/onnxruntime/python/tools/transformers/fusion_attention_unet.py +++ b/onnxruntime/python/tools/transformers/fusion_attention_unet.py @@ -210,15 +210,13 @@ def create_attention_node( ) matmul_node_name = self.model.create_node_name("MatMul", name_prefix="MatMul_QKV") - weight = helper.make_tensor( + self.add_initializer( name=matmul_node_name + "_weight", data_type=TensorProto.FLOAT, dims=[qkv_weight.shape[0], qkv_weight.shape[1]], - vals=qkv_weight.flatten().tolist(), + vals=qkv_weight, ) - self.model.add_initializer(weight, self.this_graph_name) - matmul_node = helper.make_node( "MatMul", inputs=[k_matmul.input[0], matmul_node_name + "_weight"], @@ -227,13 +225,13 @@ def create_attention_node( ) self.node_name_to_graph_name[matmul_node.name] = self.this_graph_name - shape_tensor = helper.make_tensor( + self.add_initializer( name=matmul_node_name + "_reshape_shape", data_type=TensorProto.INT64, dims=[5], vals=[0, 0, n, 3, h], + raw=False, ) - self.model.add_initializer(shape_tensor, self.this_graph_name) reshape_node = helper.make_node( "Reshape", @@ -251,14 +249,12 @@ def create_attention_node( attention_node_name = self.model.create_node_name("Attention") - weight = helper.make_tensor( + self.add_initializer( name=attention_node_name + "_qkv_weight", data_type=TensorProto.FLOAT, dims=[qw_in_size, qkv_weight_dim], - vals=qkv_weight.flatten().tolist(), + vals=qkv_weight, ) - - self.model.add_initializer(weight, self.this_graph_name) else: # cross attention attention_node_name = self.model.create_node_name("MultiHeadAttention") if self.enable_packed_kv: @@ -282,15 +278,13 @@ def create_attention_node( kv_weight = np.dstack([kw.reshape(c, n, h), vw.reshape(c, n, h)]).reshape(c, n * 2 * h) matmul_node_name = self.model.create_node_name("MatMul", name_prefix="MatMul_KV") - weight = helper.make_tensor( + self.add_initializer( name=matmul_node_name + "_weight", data_type=TensorProto.FLOAT, dims=[kv_weight.shape[0], kv_weight.shape[1]], - vals=kv_weight.flatten().tolist(), + vals=kv_weight, ) - self.model.add_initializer(weight, self.this_graph_name) - matmul_node = helper.make_node( "MatMul", inputs=[k_matmul.input[0], 
matmul_node_name + "_weight"], @@ -299,13 +293,13 @@ def create_attention_node( ) self.node_name_to_graph_name[matmul_node.name] = self.this_graph_name - shape_tensor = helper.make_tensor( + self.add_initializer( name=matmul_node_name + "_reshape_shape", data_type=TensorProto.INT64, dims=[5], vals=[0, 0, n, 2, h], + raw=False, ) - self.model.add_initializer(shape_tensor, self.this_graph_name) reshape_node = helper.make_node( "Reshape", @@ -321,13 +315,12 @@ def create_attention_node( qkv_bias = np.zeros([3, hidden_size], dtype=np.float32) qkv_bias_dim = 3 * hidden_size - bias = helper.make_tensor( + self.add_initializer( name=attention_node_name + "_qkv_bias", data_type=TensorProto.FLOAT, dims=[qkv_bias_dim], - vals=qkv_bias.flatten().tolist(), + vals=qkv_bias, ) - self.model.add_initializer(bias, self.this_graph_name) if is_self_attention: if not self.enable_packed_qkv: @@ -519,15 +512,13 @@ def create_attention_node_lora( ) matmul_node_name = self.model.create_node_name("MatMul", name_prefix="MatMul_QKV") - weight = helper.make_tensor( + self.add_initializer( name=matmul_node_name + "_weight", data_type=TensorProto.FLOAT, dims=[qkv_weight.shape[0], qkv_weight.shape[1]], - vals=qkv_weight.flatten().tolist(), + vals=qkv_weight, ) - self.model.add_initializer(weight, self.this_graph_name) - matmul_node = helper.make_node( "MatMul", inputs=[k_matmul.input[0], matmul_node_name + "_weight"], @@ -539,13 +530,14 @@ def create_attention_node_lora( # Do the same thing with the LoRA weights, but don't constant fold the result. The goal is to allow # the Q/K/V weights to be changed without having to re-run the optimizer. lora_weight_shape_tensor_name = q_lora_last_node.name + "_reshape_shape" - lora_weight_shape_tensor = helper.make_tensor( + + self.add_initializer( name=lora_weight_shape_tensor_name, data_type=TensorProto.INT64, dims=[4], vals=[0, 0, n, h], + raw=False, ) - self.model.add_initializer(lora_weight_shape_tensor, self.this_graph_name) # Reshape the LoRA Q weights q_lora_reshape_node_name = self.model.create_node_name("Reshape", name_prefix="Reshape_LoRA_Q") @@ -594,13 +586,13 @@ def create_attention_node_lora( # Reshape the LoRA concatenated weights to [..., n * 3 * h] reshaped_lora_weights_shape_tensor_name = qkv_lora_concat_node.name + "_reshape_shape" - reshaped_lora_weights_shape_tensor = helper.make_tensor( + self.add_initializer( name=reshaped_lora_weights_shape_tensor_name, data_type=TensorProto.INT64, dims=[3], vals=[0, 0, n * 3 * h], + raw=False, ) - self.model.add_initializer(reshaped_lora_weights_shape_tensor, self.this_graph_name) qkv_lora_reshaped_node_name = self.model.create_node_name("Reshape", name_prefix="Reshape_LoRA_QKV") qkv_lora_reshaped_node = helper.make_node( @@ -623,13 +615,13 @@ def create_attention_node_lora( # Finally, reshape the concatenated Q/K/V result to 5D shape_tensor_name = add_weights_node_name + "_reshape_shape" - shape_tensor = helper.make_tensor( + self.add_initializer( name=shape_tensor_name, data_type=TensorProto.INT64, dims=[5], vals=[0, 0, n, 3, h], + raw=False, ) - self.model.add_initializer(shape_tensor, self.this_graph_name) reshape_node = helper.make_node( "Reshape", @@ -678,15 +670,13 @@ def create_attention_node_lora( kv_weight = np.dstack([kw.reshape(c, n, h), vw.reshape(c, n, h)]).reshape(c, n * 2 * h) matmul_node_name = self.model.create_node_name("MatMul", name_prefix="MatMul_KV") - weight = helper.make_tensor( + self.add_initializer( name=matmul_node_name + "_weight", data_type=TensorProto.FLOAT, dims=[kv_weight.shape[0], 
kv_weight.shape[1]], - vals=kv_weight.flatten().tolist(), + vals=kv_weight, ) - self.model.add_initializer(weight, self.this_graph_name) - matmul_node = helper.make_node( "MatMul", inputs=[k_matmul.input[0], matmul_node_name + "_weight"], @@ -698,13 +688,13 @@ def create_attention_node_lora( # Do the same thing with the LoRA weights, but don't constant fold the result. The goal is to allow # the Q/K/V weights to be changed without having to re-run the optimizer. kv_lora_weight_shape_tensor_name = q_lora_last_node.name + "_reshape_shape" - lora_weight_shape_tensor = helper.make_tensor( + self.add_initializer( name=kv_lora_weight_shape_tensor_name, data_type=TensorProto.INT64, dims=[4], vals=[0, 0, n, h], + raw=False, ) - self.model.add_initializer(lora_weight_shape_tensor, self.this_graph_name) # Reshape the LoRA K weights k_lora_reshape_node_name = self.model.create_node_name("Reshape", name_prefix="Reshape_LoRA_K") @@ -739,13 +729,13 @@ def create_attention_node_lora( # Reshape the LoRA concatenated weights to [..., n * 2 * h] reshaped_kv_lora_weights_shape_tensor_name = kv_lora_concat_node.name + "_reshape_shape" - reshaped_kv_lora_weights_shape_tensor = helper.make_tensor( + self.add_initializer( name=reshaped_kv_lora_weights_shape_tensor_name, data_type=TensorProto.INT64, dims=[3], vals=[0, 0, n * 2 * h], + raw=False, ) - self.model.add_initializer(reshaped_kv_lora_weights_shape_tensor, self.this_graph_name) kv_lora_reshaped_node_name = self.model.create_node_name("Reshape", name_prefix="Reshape_LoRA_KV") kv_lora_reshaped_node = helper.make_node( @@ -768,13 +758,13 @@ def create_attention_node_lora( # Finally, reshape the concatenated K/V result to 5D shape_tensor_name = add_kv_weights_node_name + "_reshape_shape" - shape_tensor = helper.make_tensor( + self.add_initializer( name=shape_tensor_name, data_type=TensorProto.INT64, dims=[5], vals=[0, 0, n, 2, h], + raw=False, ) - self.model.add_initializer(shape_tensor, self.this_graph_name) reshape_node = helper.make_node( "Reshape", @@ -802,14 +792,12 @@ def create_attention_node_lora( # No bias, use zeros qkv_bias = np.zeros([3, hidden_size], dtype=np.float32) qkv_bias_dim = 3 * hidden_size - - bias = helper.make_tensor( + self.add_initializer( name=attention_node_name + "_qkv_bias", data_type=TensorProto.FLOAT, dims=[qkv_bias_dim], - vals=qkv_bias.flatten().tolist(), + vals=qkv_bias, ) - self.model.add_initializer(bias, self.this_graph_name) if is_self_attention: if not self.enable_packed_qkv: diff --git a/onnxruntime/python/tools/transformers/fusion_attention_vae.py b/onnxruntime/python/tools/transformers/fusion_attention_vae.py index e91a8a61fcc24..151c04f9334fe 100644 --- a/onnxruntime/python/tools/transformers/fusion_attention_vae.py +++ b/onnxruntime/python/tools/transformers/fusion_attention_vae.py @@ -170,26 +170,23 @@ def create_attention_node( qkv_bias = np.stack((q_bias, k_bias, v_bias), axis=0) qkv_bias_dim = 3 * q_bias_shape - weight = helper.make_tensor( + self.add_initializer( name=attention_node_name + "_qkv_weight", data_type=TensorProto.FLOAT, dims=[qw_in_size, qkv_weight_dim], - vals=qkv_weight.flatten().tolist(), + vals=qkv_weight, ) - self.model.add_initializer(weight, self.this_graph_name) - # No bias, use zeros qkv_bias = np.zeros([3, hidden_size], dtype=np.float32) qkv_bias_dim = 3 * hidden_size - bias = helper.make_tensor( + self.add_initializer( name=attention_node_name + "_qkv_bias", data_type=TensorProto.FLOAT, dims=[qkv_bias_dim], - vals=qkv_bias.flatten().tolist(), + vals=qkv_bias, ) - 
self.model.add_initializer(bias, self.this_graph_name) attention_inputs = [ input_name, diff --git a/onnxruntime/python/tools/transformers/fusion_bart_attention.py b/onnxruntime/python/tools/transformers/fusion_bart_attention.py index 513c68a29dbd1..71801401e9d06 100644 --- a/onnxruntime/python/tools/transformers/fusion_bart_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_bart_attention.py @@ -4,6 +4,7 @@ # -------------------------------------------------------------------------- import logging +import numpy as np from fusion_attention import AttentionMask, FusionAttention from onnx import TensorProto, helper from onnx_model import OnnxModel @@ -259,8 +260,12 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): empty_bias_name = "empty_bias" empty_tensor = self.model.get_initializer(empty_bias_name) if empty_tensor is None: - empty_tensor = helper.make_tensor(empty_bias_name, TensorProto.FLOAT, [bias_dim], [0.0] * bias_dim) - self.model.add_initializer(empty_tensor, self.this_graph_name) + self.add_initializer( + empty_bias_name, + TensorProto.FLOAT, + dims=[bias_dim], + vals=np.array([0.0] * bias_dim, dtype=np.float32), + ) add_name = self.model.create_node_name("Add") add_k = helper.make_node("Add", [empty_bias_name, matmul_k.output[0]], [reshape_k_1.name], add_name) diff --git a/onnxruntime/python/tools/transformers/fusion_base.py b/onnxruntime/python/tools/transformers/fusion_base.py index d53a2f4ba4d2b..117468be412fa 100644 --- a/onnxruntime/python/tools/transformers/fusion_base.py +++ b/onnxruntime/python/tools/transformers/fusion_base.py @@ -4,9 +4,10 @@ # -------------------------------------------------------------------------- from collections import defaultdict from logging import getLogger -from typing import Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Sequence, Union -from onnx import NodeProto +import numpy as np +from onnx import NodeProto, helper from onnx_model import OnnxModel logger = getLogger(__name__) @@ -86,3 +87,29 @@ def apply(self): self.model.prune_graph() elif self.nodes_to_remove or self.nodes_to_add: self.model.update_graph() + + def add_initializer(self, name: str, data_type: int, dims: Sequence[int], vals: Any, raw: bool = True): + if raw: + np_type = helper.tensor_dtype_to_np_dtype(data_type) + if not isinstance(vals, np.ndarray): + bytes = np.array(vals, dtype=np_type).tobytes() + else: + bytes = vals.astype(np_type).tobytes() + tensor = helper.make_tensor( + name=name, + data_type=data_type, + dims=dims, + vals=bytes, + raw=True, + ) + else: + tensor = helper.make_tensor( + name=name, + data_type=data_type, + dims=dims, + vals=vals, + raw=False, + ) + + self.model.add_initializer(tensor, self.this_graph_name) + return tensor diff --git a/onnxruntime/python/tools/transformers/fusion_embedlayer.py b/onnxruntime/python/tools/transformers/fusion_embedlayer.py index a20febb9f0a9a..bc38399e3cce5 100644 --- a/onnxruntime/python/tools/transformers/fusion_embedlayer.py +++ b/onnxruntime/python/tools/transformers/fusion_embedlayer.py @@ -378,7 +378,7 @@ def check_embedding(self, word_embedding_gather, segment_embedding_gather, posit logger.info("Cannot fuse EmbedLayerNormalization: segment embedding table is not expected") return False - # In normal case, word embeding table is the largest, and segment embedding table is the smallest, while postion embedding table is in between. 
+ # In normal case, word embedding table is the largest, and segment embedding table is the smallest, while position embedding table is in between. # TODO: use other information (like initializer names) to identify different embedding weights automatically. if word_embedding_table.shape[0] <= position_embedding_table.shape[0]: logger.warning( @@ -430,6 +430,7 @@ def create_fused_node( segment_embedding_gather: Union[None, NodeProto], position_ids: Optional[str] = None, embedding_sum_output=False, + embedding_sum_name=None, ): """Create an EmbedLayerNormalization node. Note that segment embedding is optional. @@ -487,7 +488,8 @@ def create_fused_node( embed_node_outputs = [node_name + "_output", node_name + "_dummy_mask_index"] if embedding_sum_output: - embed_node_outputs.append(node_name + "_embedding_sum") + name = embedding_sum_name if embedding_sum_name is not None else node_name + "_embedding_sum" + embed_node_outputs.append(name) embed_node = helper.make_node( "EmbedLayerNormalization", @@ -522,19 +524,8 @@ def finish_fusion(self, layernorm, embed_node): # use prune graph to remove nodes that is not needed self.prune_graph = True - def is_embedding_sum_needed(self, add_before_layer_norm): - """Check that Add before layer norm has an output to add before next layernorm - - Args: - add_before_layer_norm (NodeProto): Add before any LayerNormalization node in topological order of graph - - Returns: - bool: whether there is an extra output needed out of embed layer norm node - """ - - nodes = self.model.get_children(add_before_layer_norm) - - return len(nodes) > 1 + def is_skip_layer_norm_with_sum_output(self, node): + return (node.op_type == "SkipLayerNormalization") and len(node.output) > 3 and len(node.output[3]) > 0 def fuse_gpt2( self, layernorm, add_before_layernorm, input_name_to_nodes, output_name_to_node, optional_segment_gather=None @@ -570,21 +561,31 @@ def fuse_gpt2( if not self.check_embedding(word_embedding_gather, None, position_embedding_gather): return False - # If the add_before_layernorm node is an Add node, then the add_output output is the first index - # output of this node. - - # If the add_before_layernorm node is SkipLayerNormalization node, then the add_output output + # If layernorm node is SkipLayerNormalization, we need to look at its optional fourth output. + # If the add_before_layernorm node is an Add node, then the add_output output is the first output of this node. + # If the add_before_layernorm node is a SkipLayerNormalization node, then the add_output output # is the (optional) fourth index output of this node. - add_output = None - optional_embedding_sum_output = False - if (add_before_layernorm.op_type == "Add" and self.is_embedding_sum_needed(add_before_layernorm)) or ( - add_before_layernorm.op_type == "SkipLayerNormalization" and len(add_before_layernorm.output) >= 4 - ): - optional_embedding_sum_output = True - add_output = ( - add_before_layernorm.output[0] - if add_before_layernorm.op_type == "Add" - else add_before_layernorm.output[3] + # When add_before_layernorm is SkipLayerNormalization, add_before_layernorm and layernorm are the same node.
+ if layernorm.op_type == "SkipLayerNormalization": + need_embedding_sum_output = self.is_skip_layer_norm_with_sum_output(layernorm) + sum_output_index = 3 + node_with_sum_output = layernorm + sum_output = layernorm.output[3] if need_embedding_sum_output else None + is_sum_graph_output = (sum_output is not None) and (self.model.find_graph_output(sum_output) is not None) + else: # layernorm.op_type == "LayerNormalization" + node_with_sum_output = add_before_layernorm + sum_output_index = 0 if add_before_layernorm.op_type == "Add" else 3 + sum_output = ( + add_before_layernorm.output[sum_output_index] + if len(add_before_layernorm.output) > sum_output_index + else None + ) + is_sum_graph_output = (sum_output is not None) and (self.model.find_graph_output(sum_output) is not None) + is_sum_used_by_multiple_nodes = ( + sum_output and (sum_output in input_name_to_nodes) and len(input_name_to_nodes[sum_output]) > 1 + ) + need_embedding_sum_output = (sum_output is not None) and ( + add_before_layernorm.op_type != "Add" or is_sum_graph_output or is_sum_used_by_multiple_nodes ) # make the fused node @@ -595,14 +596,16 @@ def fuse_gpt2( position_embedding_gather, optional_segment_gather, position_ids, - optional_embedding_sum_output, + embedding_sum_output=need_embedding_sum_output, + embedding_sum_name=sum_output if is_sum_graph_output else None, ) - # direct the output to another add too - self.model.replace_input_of_all_nodes(layernorm.output[0], embed_node.output[0]) - if optional_embedding_sum_output: - self.model.replace_input_of_all_nodes(add_output, embed_node.output[2]) + if need_embedding_sum_output: + node_with_sum_output.output[sum_output_index] = "_no_use__to_be_removed_" + if not is_sum_graph_output: + self.model.replace_input_of_all_nodes(sum_output, embed_node.output[2]) + self.finish_fusion(layernorm, embed_node) return True def fuse_distilbert(self, layernorm, add_before_layernorm, input_name_to_nodes, output_name_to_node): @@ -707,9 +710,14 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node): gather_0_path = self.model.match_parent_path(node, ["Gather"], [0]) gather_1_path = self.model.match_parent_path(node, ["Gather"], [1]) if gather_0_path is None and gather_1_path is not None: + if first_add_path is None: + return add_before_layernorm = first_add_path[0] optional_segment_gather = gather_1_path[0] elif gather_0_path is not None and gather_1_path is None: + first_add_path = self.model.match_parent_path(node, ["Add"], [1]) + if first_add_path is None: + return add_before_layernorm = first_add_path[0] optional_segment_gather = gather_0_path[0] else: diff --git a/onnxruntime/python/tools/transformers/fusion_gpt_attention.py b/onnxruntime/python/tools/transformers/fusion_gpt_attention.py index 7b9e758178e2d..a3f98d411ebad 100644 --- a/onnxruntime/python/tools/transformers/fusion_gpt_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_gpt_attention.py @@ -239,7 +239,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): [0, None, 0, 0, 0, 0, 0], output_name_to_node=output_name_to_node, return_indice=return_indice, - ) # yapf: disable + ) else: qkv_nodes = self.model.match_parent_path( normalize_node, @@ -247,7 +247,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): [None, 0, 0, 0, 0, 0], output_name_to_node=output_name_to_node, return_indice=return_indice, - ) # yapf: disable + ) if qkv_nodes is None: return @@ -361,7 +361,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): "Div", 
], [1, 0, 1, 0, 1, 0, 0, 0, 0, 0], - ) # yapf: disable + ) if mask_nodes is None: logger.debug("fuse_attention: failed to match unidirectional mask path") return @@ -414,7 +414,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): ), # useless cast and reshape are removed. ], output_name_to_node, - ) # yapf: disable + ) if input_mask_nodes is None: logger.debug("fuse_attention: failed to match input attention mask path") return @@ -437,7 +437,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): ), ], output_name_to_node, - ) # yapf: disable + ) if mask_nodes is None: # TODO: match mask path for GPT2LMHeadModel_BeamSearchStep. logger.debug("fuse_attention: failed to match mask path") diff --git a/onnxruntime/python/tools/transformers/fusion_gpt_attention_megatron.py b/onnxruntime/python/tools/transformers/fusion_gpt_attention_megatron.py index 052dd243fd788..7eb774b746cac 100644 --- a/onnxruntime/python/tools/transformers/fusion_gpt_attention_megatron.py +++ b/onnxruntime/python/tools/transformers/fusion_gpt_attention_megatron.py @@ -72,9 +72,7 @@ def fuse_attention_node( self.prune_graph = True def match_mask(self, sub_qk, mul_qk, matmul_qk, layernorm_before_attention): - mask_nodes = self.model.match_parent_path( - sub_qk, ["Mul", "Sub", "Slice", "Slice"], [1, 0, 1, 0] - ) # yapf: disable + mask_nodes = self.model.match_parent_path(sub_qk, ["Mul", "Sub", "Slice", "Slice"], [1, 0, 1, 0]) if mask_nodes is None: logger.debug("fuse_attention: failed to match unidirectional mask path") return None @@ -176,14 +174,14 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): ["Add", "Add", "MatMul", "Reshape", "Transpose", "MatMul"], [0, 1, None, 0, 0, 0], output_name_to_node=output_name_to_node, - ) # yapf: disable + ) else: qkv_nodes = self.model.match_parent_path( normalize_node, ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], [1, None, 0, 0, 0], output_name_to_node=output_name_to_node, - ) # yapf: disable + ) if qkv_nodes is None: return @@ -223,7 +221,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): "LayerNormalization", ], [1, 1, 0, 0, 0, None, 0], - ) # yapf: disable + ) if v_nodes is None: v_nodes = self.model.match_parent_path( @@ -238,7 +236,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): "SkipLayerNormalization", ], [1, 1, 0, 0, 0, None, 0], - ) # yapf: disable + ) if v_nodes is None: logger.debug("fuse_attention: failed to match v path") diff --git a/onnxruntime/python/tools/transformers/fusion_gpt_attention_no_past.py b/onnxruntime/python/tools/transformers/fusion_gpt_attention_no_past.py index 83fa51dcfafa6..b217743c4ab14 100644 --- a/onnxruntime/python/tools/transformers/fusion_gpt_attention_no_past.py +++ b/onnxruntime/python/tools/transformers/fusion_gpt_attention_no_past.py @@ -76,7 +76,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): [0, None, 0, 0, 0, 0, 0], output_name_to_node=output_name_to_node, return_indice=return_indice, - ) # yapf: disable + ) else: qkv_nodes = self.model.match_parent_path( normalize_node, @@ -84,7 +84,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): [None, 0, 0, 0, 0, 0], output_name_to_node=output_name_to_node, return_indice=return_indice, - ) # yapf: disable + ) if qkv_nodes is None: return @@ -116,7 +116,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): matmul_qkv, ["Transpose", "Reshape", "Split", "Reshape", "Gemm", 
"Reshape"], [1, 0, 0, 0, 0, 0], - ) # yapf: disable + ) if v_nodes is None: logger.debug("fuse_attention: failed to match v path") return @@ -168,7 +168,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): "Div", ], [1, 0, 1, 0, 1, 0, 0, 0, 0, 0], - ) # yapf: disable + ) if mask_nodes is None: logger.debug("fuse_attention: failed to match mask path") return @@ -201,7 +201,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): "Div", ], [0, 0, 0, 1, 0, 0, 0, 0, 0], - ) # yapf: disable + ) if mask_nodes is None: logger.debug("fuse_attention: failed to match mask path") return @@ -225,7 +225,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): mul_qk, ["Slice", "Slice", "Unsqueeze", "Squeeze", "Slice", "Shape", "Div"], [1, 0, 2, 0, 0, 0, 0], - ) # yapf: disable + ) if mask_nodes is None: logger.debug("fuse_attention: failed to match mask path") return diff --git a/onnxruntime/python/tools/transformers/fusion_group_norm.py b/onnxruntime/python/tools/transformers/fusion_group_norm.py index 2cae366d3f9bd..a4491d29b3698 100644 --- a/onnxruntime/python/tools/transformers/fusion_group_norm.py +++ b/onnxruntime/python/tools/transformers/fusion_group_norm.py @@ -107,21 +107,19 @@ def fuse(self, add_node, input_name_to_nodes: Dict, output_name_to_node: Dict): if weight_elements not in [320, 640, 960, 1280, 1920, 2560, 128, 256, 512]: logger.info("GroupNorm channels=%d", weight_elements) - gamma = helper.make_tensor( + self.add_initializer( name=group_norm_name + "_gamma", data_type=TensorProto.FLOAT, dims=[weight_elements], - vals=weight.flatten().tolist(), + vals=weight, ) - self.model.add_initializer(gamma, self.this_graph_name) - beta = helper.make_tensor( + self.add_initializer( name=group_norm_name + "_beta", data_type=TensorProto.FLOAT, dims=[bias_elements], - vals=bias.flatten().tolist(), + vals=bias, ) - self.model.add_initializer(beta, self.this_graph_name) last_node = add_node subgraph_nodes = [add_node, weight_mul, reshape_4d, instance_norm, reshape_3d, shape_node] diff --git a/onnxruntime/python/tools/transformers/fusion_layernorm.py b/onnxruntime/python/tools/transformers/fusion_layernorm.py index ec485e0dfaac0..68d26fc46fa23 100644 --- a/onnxruntime/python/tools/transformers/fusion_layernorm.py +++ b/onnxruntime/python/tools/transformers/fusion_layernorm.py @@ -187,7 +187,7 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): ), ], output_name_to_node, - ) # yapf: disable + ) if parent_nodes is None: return diff --git a/onnxruntime/python/tools/transformers/fusion_nhwc_conv.py b/onnxruntime/python/tools/transformers/fusion_nhwc_conv.py index d8ecb652800f6..141ebb1f95a11 100644 --- a/onnxruntime/python/tools/transformers/fusion_nhwc_conv.py +++ b/onnxruntime/python/tools/transformers/fusion_nhwc_conv.py @@ -54,13 +54,12 @@ def fuse(self, conv, input_name_to_nodes, output_name_to_node): weight = weight.transpose(0, 2, 3, 1) weight_name = node_name + "_weight_NHWC" - nhwc_weight = helper.make_tensor( + self.add_initializer( name=weight_name, data_type=TensorProto.FLOAT, dims=list(weight.shape), - vals=weight.flatten().tolist(), + vals=weight, ) - self.model.add_initializer(nhwc_weight, self.this_graph_name) weight_transpose_node = None else: weight_transpose_node = self.create_transpose_node(conv.input[1], [0, 2, 3, 1]) diff --git a/onnxruntime/python/tools/transformers/fusion_options.py b/onnxruntime/python/tools/transformers/fusion_options.py index 57f0fea99d145..69b5cd26f4525 100644 --- 
a/onnxruntime/python/tools/transformers/fusion_options.py +++ b/onnxruntime/python/tools/transformers/fusion_options.py @@ -45,6 +45,9 @@ def __init__(self, model_type): self.enable_gemm_fast_gelu = False self.group_norm_channels_last = True + if model_type == "clip": + self.enable_embed_layer_norm = False + # Set default to sequence length for BERT model to use fused attention to speed up. # Note that embed layer normalization will convert 2D mask to 1D when mask type is MaskIndexEnd. self.attention_mask_format = AttentionMaskFormat.AttentionMask diff --git a/onnxruntime/python/tools/transformers/fusion_transpose.py b/onnxruntime/python/tools/transformers/fusion_transpose.py index 6602d168309f0..2762d95dd7b00 100644 --- a/onnxruntime/python/tools/transformers/fusion_transpose.py +++ b/onnxruntime/python/tools/transformers/fusion_transpose.py @@ -139,23 +139,23 @@ def fuse( # Here we use hard-coded name so that it could be shared for the whole model. axes_1 = "ort_const_unsqueeze_axes_1" if self.model.get_initializer(axes_1) is None: - axes_1_tensor = helper.make_tensor( + self.add_initializer( name=axes_1, data_type=TensorProto.INT64, dims=[1], vals=[1], + raw=False, ) - self.model.add_initializer(axes_1_tensor, self.this_graph_name) axes_2 = "ort_const_unsqueeze_axes_2" if self.model.get_initializer(axes_2) is None: - axes_2_tensor = helper.make_tensor( + self.add_initializer( name=axes_2, data_type=TensorProto.INT64, dims=[1], vals=[2], + raw=False, ) - self.model.add_initializer(axes_2_tensor, self.this_graph_name) unsqueeze_3.input[1] = "ort_const_unsqueeze_axes_2" unsqueeze_2.input[1] = "ort_const_unsqueeze_axes_1" diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md b/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md index d184224317a71..facbd3bf6944e 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md @@ -384,8 +384,7 @@ Some kernels are enabled by MIOpen. We hereby thank for the AMD developers' coll There are other optimizations might improve the performance or reduce memory footprint: * Export the whole pipeline into a single ONNX model. Currently, there are multiple ONNX models (CLIP, VAE and U-Net etc). Each model uses separated thread pool and memory allocator. Combine them into one model could share thread pool and memory allocator. The end result is more efficient and less memory footprint. -* For Stable Diffusion 2.1, we disable TensorRT flash attention kernel and use only memory efficient attention. It is possible to add flash attention using Triton compiler to improve performance. +* For Stable Diffusion 2.1, we disable TensorRT flash attention kernel and use only memory efficient attention. It is possible to add flash attention in Windows to improve performance. * Reduce GPU memory footprint by actively deleting buffers for intermediate results. 
-* Attention fusion in CLIP * Safety Checker Optimization * Leverage FP8 in latest GPU diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py index a8e3c69332339..22fee4bfeab29 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py @@ -150,18 +150,19 @@ def optimize_sd_pipeline( op_block_list=op_block_list + force_fp32_operators[name], ) - if enable_runtime_optimization and (float16 or (name not in ["unet"])): + if enable_runtime_optimization: # Use this step to see the final graph that executed by Onnx Runtime. - # Note that ORT cannot save model larger than 2GB so we exclude unet float32 model. - # This step is optional since it has no impact on performance except model loading time. with tempfile.TemporaryDirectory() as tmp_dir: # Save to a temporary file so that we can load it with Onnx Runtime. logger.info("Saving a temporary model to run OnnxRuntime graph optimizations...") tmp_model_path = Path(tmp_dir) / "model.onnx" - m.save_model_to_file(str(tmp_model_path)) - ort_optimized_model_path = tmp_model_path + m.save_model_to_file(str(tmp_model_path), use_external_data_format=use_external_data_format) + ort_optimized_model_path = Path(tmp_dir) / "optimized.onnx" optimize_by_onnxruntime( - str(tmp_model_path), use_gpu=True, optimized_model_path=str(ort_optimized_model_path) + str(tmp_model_path), + use_gpu=True, + optimized_model_path=str(ort_optimized_model_path), + save_as_external_data=use_external_data_format, ) model = onnx.load(str(ort_optimized_model_path), load_external_data=True) m = model_type_class_mapping[model_type](model) diff --git a/onnxruntime/python/tools/transformers/models/whisper/whisper_chain.py b/onnxruntime/python/tools/transformers/models/whisper/whisper_chain.py index 7e2325c148efa..3b1e656136547 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/whisper_chain.py +++ b/onnxruntime/python/tools/transformers/models/whisper/whisper_chain.py @@ -135,7 +135,7 @@ def chain_model(args): # Initializers/opsets # Delete shared data between decoder/encoder and move to larger graph initializers - initializers = get_shared_initializers(encoder_model, decoder_model, require_raw_data=True) + initializers = get_shared_initializers(encoder_model, decoder_model) node.attribute.extend( [ helper.make_attribute("decoder", decoder_model.graph), diff --git a/onnxruntime/python/tools/transformers/onnx_model.py b/onnxruntime/python/tools/transformers/onnx_model.py index 4f74da577dfee..60be2d84b2bc8 100644 --- a/onnxruntime/python/tools/transformers/onnx_model.py +++ b/onnxruntime/python/tools/transformers/onnx_model.py @@ -23,6 +23,7 @@ numpy_helper, save_model, ) +from onnx.external_data_helper import load_external_data_for_tensor, uses_external_data from shape_infer_helper import SymbolicShapeInferenceHelper logger = logging.getLogger(__name__) @@ -815,51 +816,77 @@ def prune_graph(self, outputs=None, allow_remove_graph_inputs=True): """ if len(self.graphs()) > 1: + # TODO(tianleiwu): handle subgraph logger.debug("Skip prune_graph since graph has subgraph") return - if outputs is None: - outputs = [output.name for output in self.model.graph.output] + keep_outputs = [output.name for output in self.model.graph.output] if outputs is None else outputs output_name_to_node = self.output_name_to_node() - all_nodes = [] - for 
output in outputs: - if output in output_name_to_node: - last_node = output_name_to_node[output] - if last_node in all_nodes: - continue - nodes = self.get_parent_subgraph_nodes(last_node, []) - all_nodes.append(last_node) - all_nodes.extend(nodes) - nodes_to_remove = [node for node in self.model.graph.node if node not in all_nodes] + def get_first_output(node): + if node.output[0]: + return node.output[0] + return next(iter([o for o in node.output if o]), None) - self.remove_nodes(nodes_to_remove) + # Keep track of nodes to keep. The key is first output of node, and the value is the node. + output_to_node = {} - # remove outputs not in list - output_to_remove = [] - for output in self.model.graph.output: - if output.name not in outputs: - output_to_remove.append(output) - for output in output_to_remove: - self.model.graph.output.remove(output) + # Start from graph outputs, and find parent nodes recursively, and add nodes to the output_to_node dictionary. + dq = deque() + for output in keep_outputs: + if output in output_name_to_node: + dq.append(output_name_to_node[output]) + while len(dq) > 0: + node = dq.pop() + first_output = get_first_output(node) + if first_output and (first_output not in output_to_node): + output_to_node[first_output] = node + for name in node.input: + if len(name) > 0 and (name in output_name_to_node) and (name not in output_to_node): + dq.appendleft(output_name_to_node[name]) + + # Keep only those nodes in the output_to_node dictionary. + nodes_to_keep = [] + num_nodes_removed = 0 + for node in self.model.graph.node: + first_output = get_first_output(node) + kept_node = output_to_node[first_output] if first_output in output_to_node else None + + # Need to double check the node since fused node might reuse output name of some nodes to be removed. + # It is slow to compare whole node, so we compare op_type first to avoid comparing node in most cases. + if kept_node and kept_node.op_type == node.op_type and kept_node == node: + nodes_to_keep.append(node) + else: + num_nodes_removed += 1 + self.model.graph.ClearField("node") + self.model.graph.node.extend(nodes_to_keep) - # remove inputs not used by any node. + # Remove graph outputs not in list + output_to_remove = [] + if outputs is not None: + for output in self.model.graph.output: + if output.name not in outputs: + output_to_remove.append(output) + for output in output_to_remove: + self.model.graph.output.remove(output) + + # Remove graph inputs not used by any node.
input_to_remove = [] if allow_remove_graph_inputs: input_name_to_nodes = self.input_name_to_nodes() input_to_remove = [input for input in self.model.graph.input if input.name not in input_name_to_nodes] - for input in input_to_remove: - self.model.graph.input.remove(input) + for name in input_to_remove: + self.model.graph.input.remove(name) - if input_to_remove or output_to_remove or nodes_to_remove: + if input_to_remove or output_to_remove or num_nodes_removed > 0: removed = [] if input_to_remove: removed.append(f"{len(input_to_remove)} inputs") if output_to_remove: removed.append(f"{len(output_to_remove)} outputs") - if nodes_to_remove: - removed.append(f"{len(nodes_to_remove)} nodes") + if num_nodes_removed > 0: + removed.append(f"{num_nodes_removed} nodes") logger.info("Removed %s", ", ".join(removed)) self.update_graph() @@ -1091,29 +1118,72 @@ def get_operator_statistics(self, include_domain=False): return op_count @staticmethod - def has_same_value(tensor1: TensorProto, tensor2: TensorProto, require_raw_data: bool = False) -> bool: + def to_data_hash(tensor: TensorProto, base_dir: str = "") -> int: + """Converts a tensor def object to a hash for data comparison purposes. + Args: + tensor: a TensorProto object. + base_dir: if external tensor exists, base_dir can help to find the path to it + Returns: + hash: a hash of the data. + """ + if tensor.HasField("segment"): + raise ValueError("Currently not supporting loading segments.") + if tensor.data_type == TensorProto.UNDEFINED: + raise TypeError("The element type in the input tensor is not defined.") + tensor_dtype = tensor.data_type + storage_field = helper.tensor_dtype_to_field(tensor_dtype) + + if tensor.data_type == TensorProto.STRING: + utf8_strings = getattr(tensor, storage_field) + return hash(tuple(s.decode("utf-8") for s in utf8_strings)) + # Load raw data from external tensor if it exists + if uses_external_data(tensor): + load_external_data_for_tensor(tensor, base_dir) + if tensor.HasField("raw_data"): + return hash(tensor.raw_data) + else: + np_data = numpy_helper.to_array(tensor) + return hash(np_data.tobytes()) + + @staticmethod + def has_same_value( + tensor1: TensorProto, + tensor2: TensorProto, + signature_cache1: Optional[dict] = None, + signature_cache2: Optional[dict] = None, + ) -> bool: """Returns True when two tensors have same value. Note that name can be different. Args: tensor1 (TensorProto): initializer 1 tensor2 (TensorProto): initializer 2 - require_raw_data (bool): ignore tensors without raw_data - Note: Flag can speed up runtime significantly - + signature_cache1 (dict): Optional dictionary to store data signatures of tensor1 in order to speed up comparison. + signature_cache2 (dict): Optional dictionary to store data signatures of tensor2 in order to speed up comparison. Returns: bool: True when two intializers has same value. 
""" - if tensor1.data_type != tensor2.data_type or tensor1.dims != tensor2.dims: - return False - if tensor1.HasField("raw_data") and tensor2.HasField("raw_data"): - return tensor1.raw_data == tensor2.raw_data - if require_raw_data: - return False + sig1 = ( + signature_cache1[tensor1.name] + if signature_cache1 and tensor1.name in signature_cache1 + else OnnxModel.to_data_hash(tensor1) + ) + sig2 = ( + signature_cache2[tensor2.name] + if signature_cache2 and tensor2.name in signature_cache2 + else OnnxModel.to_data_hash(tensor2) + ) + if signature_cache1 is not None: + signature_cache1[tensor1.name] = sig1 + if signature_cache2 is not None: + signature_cache2[tensor2.name] = sig2 + if sig1 == sig2 and tensor1.data_type == tensor2.data_type and tensor1.dims == tensor2.dims: + # Same signature, now do the expensive check to confirm the data is the same + return (numpy_helper.to_array(tensor1) == numpy_helper.to_array(tensor2)).all() - return (numpy_helper.to_array(tensor1) == numpy_helper.to_array(tensor2)).all() + return False - def remove_duplicated_initializer(self, require_raw_data: bool = False): + def remove_duplicated_initializer(self, cache: Optional[dict] = None): """Remove initializers with duplicated values, and only keep the first one. It could help reduce size of models (like ALBert) with shared weights. If require_raw_data passed, method will only compare raw_data initializers to speed runtime @@ -1130,7 +1200,7 @@ def remove_duplicated_initializer(self, require_raw_data: bool = False): continue for j in range(i + 1, initializer_count): if OnnxModel.has_same_value( - self.model.graph.initializer[i], self.model.graph.initializer[j], require_raw_data + self.model.graph.initializer[i], self.model.graph.initializer[j], cache, cache ): same[j] = i diff --git a/onnxruntime/python/tools/transformers/onnx_model_bert_keras.py b/onnxruntime/python/tools/transformers/onnx_model_bert_keras.py index 1229825fec3d4..c781a91c9e493 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_bert_keras.py +++ b/onnxruntime/python/tools/transformers/onnx_model_bert_keras.py @@ -435,7 +435,7 @@ def remove_extra_reshape_2(self): "SkipLayerNormalization", ], [None, 0, 0, 0, 0, 0, 0, 0, 0, 0], - ) # yapf: disable + ) if path is None: continue diff --git a/onnxruntime/python/tools/transformers/onnx_model_clip.py b/onnxruntime/python/tools/transformers/onnx_model_clip.py index 93e8623768067..9b4ca03a47a5b 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_clip.py +++ b/onnxruntime/python/tools/transformers/onnx_model_clip.py @@ -5,15 +5,17 @@ from logging import getLogger +from fusion_attention_clip import FusionAttentionClip from onnx import ModelProto -from onnx_model_unet import UnetOnnxModel +from onnx_model_bert import BertOnnxModel logger = getLogger(__name__) -class ClipOnnxModel(UnetOnnxModel): +class ClipOnnxModel(BertOnnxModel): def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0): super().__init__(model, num_heads=num_heads, hidden_size=hidden_size) + self.clip_attention_fusion = FusionAttentionClip(self, self.hidden_size, self.num_heads) def get_fused_operator_statistics(self): """ @@ -31,3 +33,6 @@ def get_fused_operator_statistics(self): logger.info(f"Optimized operators:{op_count}") return op_count + + def fuse_attention(self): + self.clip_attention_fusion.apply() diff --git a/onnxruntime/python/tools/transformers/onnx_model_t5.py b/onnxruntime/python/tools/transformers/onnx_model_t5.py index 8fb31da4a61f7..ab6a7c72a2c7a 100644 --- 
a/onnxruntime/python/tools/transformers/onnx_model_t5.py +++ b/onnxruntime/python/tools/transformers/onnx_model_t5.py @@ -111,7 +111,8 @@ def create_attention_node( name=attention_node_name + "_qkv_weight", data_type=TensorProto.FLOAT, dims=[qw_in_size, qkv_weight_dim], - vals=qkv_weight.flatten().tolist(), + vals=qkv_weight.tobytes(), + raw=True, ) self.model.add_initializer(weight, self.this_graph_name) @@ -665,7 +666,8 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node): name=self.model.create_node_name("bias_table_weight", name_prefix=node_name_prefix), data_type=TensorProto.FLOAT, dims=[np.shape(table_weight)[0], np.shape(table_weight)[1]], - vals=table_weight_t.flatten().tolist(), + vals=table_weight_t.tobytes(), + raw=True, ) self.model.add_initializer(bias_table, self.this_graph_name) diff --git a/onnxruntime/python/tools/transformers/onnx_model_tnlr.py b/onnxruntime/python/tools/transformers/onnx_model_tnlr.py index d1815394e9661..98235de6ba6fd 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_tnlr.py +++ b/onnxruntime/python/tools/transformers/onnx_model_tnlr.py @@ -5,10 +5,9 @@ import logging from typing import Union -import numpy as np from fusion_attention import AttentionMask, FusionAttention from fusion_utils import NumpyHelper -from onnx import NodeProto, TensorProto, helper, numpy_helper +from onnx import NodeProto, helper from onnx_model import OnnxModel from onnx_model_bert import BertOnnxModel @@ -57,26 +56,24 @@ def create_attention_node( attention_node_name = self.model.create_node_name("Attention") + tensor_dtype = weight.data_type + np_type = helper.tensor_dtype_to_np_dtype(tensor_dtype) weight = helper.make_tensor( name=attention_node_name + "_qkv_weight", - data_type=TensorProto.FLOAT, + data_type=tensor_dtype, dims=[hidden_size, 3 * hidden_size], - vals=qkv_weight.flatten().tolist(), + vals=qkv_weight.astype(np_type).tobytes(), + raw=True, ) - - # Sometimes weights and bias are stored in fp16 - if weight.data_type == 10: - weight.CopyFrom(numpy_helper.from_array(NumpyHelper.to_array(weight).astype(np.float16), weight.name)) self.model.add_initializer(weight, self.this_graph_name) bias = helper.make_tensor( name=attention_node_name + "_qkv_bias", - data_type=TensorProto.FLOAT, + data_type=tensor_dtype, dims=[3 * hidden_size], - vals=qkv_bias.flatten().tolist(), + vals=qkv_bias.astype(np_type).tobytes(), + raw=True, ) - if bias.data_type == 10: - bias.CopyFrom(numpy_helper.from_array(NumpyHelper.to_array(bias).astype(np.float16), bias.name)) self.model.add_initializer(bias, self.this_graph_name) attention_inputs = [ diff --git a/onnxruntime/test/framework/execution_frame_test.cc b/onnxruntime/test/framework/execution_frame_test.cc index 4da0d9b4880f6..ec572ce9deed8 100644 --- a/onnxruntime/test/framework/execution_frame_test.cc +++ b/onnxruntime/test/framework/execution_frame_test.cc @@ -496,14 +496,16 @@ TEST(ExecutionFrameTestInit, InitializerAsOutput) { #if !defined(DISABLE_SPARSE_TENSORS) TEST(ExecutionFrameTestInit, SparseInitializerAsOutput) { - const std::vector dense_shape{3, 3}; - std::vector dense_data = { - 0, 0, 1.764052391052246f, - 0.40015721321105957f, 0, 0.978738009929657f, - 0, 0, 0}; + constexpr std::array dense_shape{3, 3}; - const std::vector expected_values = {1.764052391052246f, 0.40015721321105957f, 0.978738009929657f}; - const std::vector expected_linear_indices = {2, 3, 5}; + // Tensor data in a dense form, useful for debugging and reference. 
+ // constexpr std::array dense_data = { + // 0, 0, 1.764052391052246f, + // 0.40015721321105957f, 0, 0.978738009929657f, + // 0, 0, 0}; + + constexpr std::array expected_values = {1.764052391052246f, 0.40015721321105957f, 0.978738009929657f}; + constexpr std::array expected_linear_indices = {2, 3, 5}; // sparse_initializer_as_output.onnx SessionOptions so; @@ -515,14 +517,18 @@ TEST(ExecutionFrameTestInit, SparseInitializerAsOutput) { ASSERT_STATUS_OK(session.Initialize()); auto allocator = test::AllocatorManager::Instance().GetAllocator(CPU); - auto p_tensor = std::make_unique(); std::vector results; results.resize(1); - auto ml_type = DataTypeImpl::GetType(); - results[0].Init(p_tensor.release(), ml_type, ml_type->GetDeleteFunc()); + + // Initialize the output value as a SparseTensor with pre-allocated memory + // this is done here to test output types. + auto element_type = DataTypeImpl::GetSparseTensorType()->AsSparseTensorType()->GetElementType(); + SparseTensor::InitOrtValue(element_type, TensorShape(dense_shape), allocator, results[0]); + RunOptions ro; - ASSERT_STATUS_OK(session.Run(ro, EmptySpan(), EmptySpan(), AsSpan({"values"}), &results, nullptr)); + ASSERT_STATUS_OK(session.Run(ro, EmptySpan(), EmptySpan(), + AsSpan({"values"}), &results, nullptr)); ASSERT_TRUE(results[0].IsAllocated()); ASSERT_TRUE(results[0].IsSparseTensor()); diff --git a/onnxruntime/test/framework/float_8_test.cc b/onnxruntime/test/framework/float_8_test.cc new file mode 100644 index 0000000000000..948e0e05a9141 --- /dev/null +++ b/onnxruntime/test/framework/float_8_test.cc @@ -0,0 +1,96 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#if !defined(DISABLE_FLOAT8_TYPES) + +#include + +#include "core/framework/float8.h" +#include "test/capturing_sink.h" +#include "test/test_environment.h" +#include "test_utils.h" +#include "gtest/gtest.h" + +using namespace ONNX_NAMESPACE; +using namespace onnxruntime::common; + +namespace onnxruntime { +namespace test { + +TEST(Float8_Tests, CastE4M3FN) { + std::vector> cases{ + std::pair(0.00439453125, 0.00390625), + std::pair(0.005859375, 0.005859375), + std::pair(0.005759375, 0.005859375), + std::pair(0.0046875, 0.00390625), + std::pair(0.001953125, 0.001953125), + std::pair(0.0029296875, 0.00390625), + std::pair(0.002053125, 0.001953125), + std::pair(0.00234375, 0.001953125), + std::pair(0.0087890625, 0.0078125), + std::pair(0.001171875, 0.001953125), + std::pair(1.8131605, 1.875)}; + for (auto it : cases) { + auto f8 = onnxruntime::Float8E4M3FN(it.first); + auto f8_32 = f8.ToFloat(); + EXPECT_EQ(it.second, f8_32); + } +} + +union float_bits { + uint32_t bits; + float val; +}; + +TEST(Float8_Tests, NanE4M3FN) { + EXPECT_EQ(onnxruntime::Float8E4M3FN((float_bits{0x7F800000}).val).val, static_cast(0x7E)); + EXPECT_EQ(onnxruntime::Float8E4M3FN((float_bits{0xFF800000}).val).val, static_cast(0xFE)); + EXPECT_EQ(onnxruntime::Float8E4M3FN((float_bits{0x7F800000}).val, false).val, static_cast(0x7F)); + EXPECT_EQ(onnxruntime::Float8E4M3FN((float_bits{0xFF800000}).val, false).val, static_cast(0xFF)); + EXPECT_EQ(onnxruntime::Float8E4M3FN((float_bits{0x7F800001}).val).val, static_cast(0x7F)); + EXPECT_EQ(onnxruntime::Float8E4M3FN((float_bits{0xFF800001}).val).val, static_cast(0xFF)); + // 0x7FC00000 is the value used by numpy. 
+ EXPECT_EQ(onnxruntime::Float8E4M3FN((float_bits{0x7FC00000}).val).val, static_cast(0x7F)); + EXPECT_EQ(onnxruntime::Float8E4M3FN((float_bits{0xFFC00000}).val).val, static_cast(0xFF)); +} + +TEST(Float8_Tests, NanE4M3FNUZ) { + EXPECT_EQ(onnxruntime::Float8E4M3FNUZ((float_bits{0x7F800000}).val).val, static_cast(0x7F)); + EXPECT_EQ(onnxruntime::Float8E4M3FNUZ((float_bits{0xFF800000}).val).val, static_cast(0xFF)); + EXPECT_EQ(onnxruntime::Float8E4M3FNUZ((float_bits{0x7F800000}).val, false).val, static_cast(0x80)); + EXPECT_EQ(onnxruntime::Float8E4M3FNUZ((float_bits{0xFF800000}).val, false).val, static_cast(0x80)); + EXPECT_EQ(onnxruntime::Float8E4M3FNUZ((float_bits{0x7F800001}).val).val, static_cast(0x80)); + EXPECT_EQ(onnxruntime::Float8E4M3FNUZ((float_bits{0xFF800001}).val).val, static_cast(0x80)); + // 0x7FC00000 is the value used by numpy. + EXPECT_EQ(onnxruntime::Float8E4M3FNUZ((float_bits{0x7FC00000}).val).val, static_cast(0x80)); + EXPECT_EQ(onnxruntime::Float8E4M3FNUZ((float_bits{0xFFC00000}).val).val, static_cast(0x80)); +} + +TEST(Float8_Tests, NanE5M2) { + EXPECT_EQ(onnxruntime::Float8E5M2((float_bits{0x7F800000}).val).val, static_cast(0x7B)); + EXPECT_EQ(onnxruntime::Float8E5M2((float_bits{0xFF800000}).val).val, static_cast(0xFB)); + EXPECT_EQ(onnxruntime::Float8E5M2((float_bits{0x7F800000}).val, false).val, static_cast(0x7C)); + EXPECT_EQ(onnxruntime::Float8E5M2((float_bits{0xFF800000}).val, false).val, static_cast(0xFC)); + EXPECT_EQ(onnxruntime::Float8E5M2((float_bits{0x7F800001}).val).val, static_cast(0x7F)); + EXPECT_EQ(onnxruntime::Float8E5M2((float_bits{0xFF800001}).val).val, static_cast(0xFF)); + // 0x7FC00000 is the value used by numpy. + EXPECT_EQ(onnxruntime::Float8E5M2((float_bits{0x7FC00000}).val).val, static_cast(0x7F)); + EXPECT_EQ(onnxruntime::Float8E5M2((float_bits{0xFFC00000}).val).val, static_cast(0xFF)); +} + +TEST(Float8_Tests, NanE5M2FNUZ) { + EXPECT_EQ(onnxruntime::Float8E5M2FNUZ((float_bits{0x7F800000}).val).val, static_cast(0x7F)); + EXPECT_EQ(onnxruntime::Float8E5M2FNUZ((float_bits{0xFF800000}).val).val, static_cast(0xFF)); + EXPECT_EQ(onnxruntime::Float8E5M2FNUZ((float_bits{0x7F800000}).val, false).val, static_cast(0x80)); + EXPECT_EQ(onnxruntime::Float8E5M2FNUZ((float_bits{0xFF800000}).val, false).val, static_cast(0x80)); + EXPECT_EQ(onnxruntime::Float8E5M2FNUZ((float_bits{0x7F800001}).val).val, static_cast(0x80)); + EXPECT_EQ(onnxruntime::Float8E5M2FNUZ((float_bits{0xFF800001}).val).val, static_cast(0x80)); + // 0x7FC00000 is the value used by numpy. 
+ EXPECT_EQ(onnxruntime::Float8E5M2FNUZ((float_bits{0x7FC00000}).val).val, static_cast(0x80)); + EXPECT_EQ(onnxruntime::Float8E5M2FNUZ((float_bits{0xFFC00000}).val).val, static_cast(0x80)); +} + +} // namespace test +} // namespace onnxruntime + +#endif // DISABLE_FLOAT8_TYPES diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index fa3d61a28b658..077c6ff58e2da 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -1218,13 +1218,13 @@ TEST(InferenceSessionTests, TestOptionalInputs) { // required, optional and invalid input status = RunOptionalInputTest(true, true, true, version, sess_env); ASSERT_FALSE(status.IsOK()); - EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("Invalid Feed Input Name")); + EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("Invalid input name")); // missing required status = RunOptionalInputTest(false, true, false, version, sess_env); ASSERT_FALSE(status.IsOK()); if (version == 3) { - EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("Invalid Feed Input Name")); + EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("Invalid input name")); } else { EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("Missing Input:")); } diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index 8a6f3b1cd8416..062ca4ece86bf 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -1223,6 +1223,7 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); broken_tests.insert({"sce_sum_log_prob", "result differs"}); broken_tests.insert({"sce_sum_log_prob_expanded", "result differs"}); broken_tests.insert({"gridsample_reflection_padding", "result differs"}); + broken_tests.insert({"spacetodepth", "result differs"}); } #if defined(_WIN32) && !defined(_WIN64) broken_tests.insert({"vgg19", "failed: bad allocation"}); diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index 8e1511bcaafeb..553fcca92aa78 100755 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -613,6 +613,36 @@ TEST_F(GraphTransformationTests, ConstantFoldingNodesOnDifferentEP) { } } +TEST_F(GraphTransformationTests, ConstantFoldingUnsupportedFloat16) { + constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "constant_float16_mul.onnx"; + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(model_uri, model, nullptr, *logger_)); + Graph& graph = model->MainGraph(); + std::map op_to_count = CountOpsInGraph(graph); + ASSERT_TRUE(op_to_count["Mul"] == 1); + std::unique_ptr e = + std::make_unique(CPUExecutionProviderInfo()); + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + ASSERT_STATUS_OK(graph_transformation_mgr.Register( + std::make_unique(*e.get(), false /*skip_dequantize_linear*/), TransformerLevel::Level1)); + + // assign all nodes to CUDA. the constant folding should try folding the node on the CPU and fail, thus leaving the + // EP as CUDA and not constant folding the node. 
+ for (auto& node : graph.Nodes()) { + node.SetExecutionProviderType(kCudaExecutionProvider); + } + + ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_)); + + op_to_count = CountOpsInGraph(graph); + ASSERT_TRUE(op_to_count["Mul"] == 1); + + // all nodes should still be on CUDA + for (auto& node : graph.Nodes()) { + EXPECT_STREQ(node.GetExecutionProviderType().c_str(), kCudaExecutionProvider); + } +} + TEST_F(GraphTransformationTests, ConstantFoldingSubgraph) { TensorProto value_tensor; value_tensor.add_dims(1); diff --git a/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc b/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc index 2631f460f208e..63b92cfc187bd 100644 --- a/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc @@ -35,7 +35,10 @@ TEST(TensorOpTest, SpaceToDepthTest_1) { 1.1f, 1.3f, 3.1f, 3.3f}; test.AddOutput("output", {N, C * blocksize * blocksize, H / blocksize, W / blocksize}, result); - test.Run(); + + // TODO: Test is flaky on QNN EP (CPU backend). Re-enable when the QnnCPUBackendTests.DISABLED_SpaceToDepth_Flaky test + // is fixed. + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kQnnExecutionProvider}); } TEST(TensorOpTest, SpaceToDepthTest_1_double) { @@ -99,7 +102,10 @@ TEST(TensorOpTest, SpaceToDepthTest_2) { 98., 101., 66., 69., 84., 87., 102., 105., 67., 70., 85., 88., 103., 106., 68., 71., 86., 89., 104., 107.}; test.AddOutput("output", {2, 27, 1, 2}, result); - test.Run(); + + // TODO: Test is flaky on QNN EP (CPU backend). Re-enable when the QnnCPUBackendTests.DISABLED_SpaceToDepth_Flaky2 + // test is fixed. + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kQnnExecutionProvider}); } TEST(TensorOpTest, DepthToSpaceTest_1) { diff --git a/onnxruntime/test/providers/qnn/argmaxmin_op_test.cc b/onnxruntime/test/providers/qnn/argmaxmin_op_test.cc index e579e3274e699..eaeebba5bea5c 100644 --- a/onnxruntime/test/providers/qnn/argmaxmin_op_test.cc +++ b/onnxruntime/test/providers/qnn/argmaxmin_op_test.cc @@ -43,7 +43,7 @@ static GetTestQDQModelFn BuildQDQArgMxxTestCase(const std::string& op_typ return [op_type, input_def, attrs](ModelTestBuilder& builder, std::vector>& output_qparams) { ORT_UNUSED_PARAMETER(output_qparams); - QuantParams input_qparams = GetTestInputQuantParams(input_def); + QuantParams input_qparams = GetTestInputQuantParams(input_def); auto* input = MakeTestInput(builder, input_def); @@ -205,7 +205,7 @@ TEST_F(QnnHTPBackendTests, ArgMaxMin_AsGraphOutputUnsupported) { auto model_builder_func = [](const std::string& op_type, const TestInputDef& input_def, const std::vector& attrs) -> GetTestModelFn { return [op_type, input_def, attrs](ModelTestBuilder& builder) { - QuantParams input_qparams = GetTestInputQuantParams(input_def); + QuantParams input_qparams = GetTestInputQuantParams(input_def); auto* input = MakeTestInput(builder, input_def); auto* output = builder.MakeOutput(); diff --git a/onnxruntime/test/providers/qnn/average_pool_test.cc b/onnxruntime/test/providers/qnn/average_pool_test.cc index 114802d56cfd3..79ec07796c0e8 100644 --- a/onnxruntime/test/providers/qnn/average_pool_test.cc +++ b/onnxruntime/test/providers/qnn/average_pool_test.cc @@ -5,7 +5,9 @@ #include #include +#include +#include "core/graph/node_attr_utils.h" #include "test/optimizer/qdq_test_utils.h" #include "test/providers/qnn/qnn_test_utils.h" @@ -16,87 +18,11 @@ namespace onnxruntime { namespace test { -// Returns a
function that creates a graph with a single AveragePool operator. -static GetTestModelFn BuildAveragePoolTestCase(const TestInputDef& input_def, - const std::vector& kernel_shape, - const std::vector& strides, - const std::vector& pads, - int64_t count_include_pad, - const std::string& auto_pad = "NOTSET") { - return [input_def, kernel_shape, strides, pads, - count_include_pad, auto_pad](ModelTestBuilder& builder) { - auto* input = MakeTestInput(builder, input_def); - - auto* output = builder.MakeOutput(); - Node& pool_node = builder.AddNode("AveragePool", {input}, {output}); - - pool_node.AddAttribute("kernel_shape", kernel_shape); - - if (!strides.empty()) { - pool_node.AddAttribute("strides", strides); - } - - pool_node.AddAttribute("auto_pad", auto_pad); - - if (!pads.empty() && auto_pad == "NOTSET") { - pool_node.AddAttribute("pads", pads); - } - - if (count_include_pad > 0) { - pool_node.AddAttribute("count_include_pad", count_include_pad); - } - }; -} - -// Returns a function that creates a graph with a QDQ AveragePool operator. -template -GetTestQDQModelFn BuildAveragePoolQDQTestCase(const TestInputDef& input_def, - const std::vector& kernel_shape, - const std::vector& strides, - const std::vector& pads, - int64_t count_include_pad, - const std::string& auto_pad = "NOTSET") { - return [input_def, kernel_shape, strides, pads, - count_include_pad, auto_pad](ModelTestBuilder& builder, - std::vector>& output_qparams) { - auto* input_arg = MakeTestInput(builder, input_def); - - // add QDQ + AveragePool - QuantParams input_qparams = GetTestInputQuantParams(input_def); - auto* dq_output = AddQDQNodePair(builder, input_arg, input_qparams.scale, input_qparams.zero_point); - auto* averagepool_output = builder.MakeIntermediate(); - Node& pool_node = builder.AddNode("AveragePool", {dq_output}, {averagepool_output}); - - pool_node.AddAttribute("kernel_shape", kernel_shape); - - if (!strides.empty()) { - pool_node.AddAttribute("strides", strides); - } - - pool_node.AddAttribute("auto_pad", auto_pad); - - if (!pads.empty() && auto_pad == "NOTSET") { - pool_node.AddAttribute("pads", pads); - } - - if (count_include_pad > 0) { - pool_node.AddAttribute("count_include_pad", count_include_pad); - } - - // op_output -> Q -> DQ -> output - AddQDQNodePairWithOutputAsGraphOutput(builder, averagepool_output, - output_qparams[0].scale, output_qparams[0].zero_point); - }; -} - // Runs an AveragePool model on the QNN CPU backend. Checks the graph node assignment, and that inference // outputs for QNN and CPU match. -static void RunAveragePoolOpTest(const TestInputDef& input_def, - const std::vector& kernel_shape, - const std::vector& strides, - const std::vector& pads, - int64_t count_include_pad, - const std::string& auto_pad, +static void RunAveragePoolOpTest(const std::string& op_type, + const std::vector>& input_defs, + const std::vector& attrs, ExpectedEPNodeAssignment expected_ep_assignment, int opset = 18) { ProviderOptions provider_options; @@ -106,7 +32,7 @@ static void RunAveragePoolOpTest(const TestInputDef& input_def, provider_options["backend_path"] = "libQnnCpu.so"; #endif - RunQnnModelTest(BuildAveragePoolTestCase(input_def, kernel_shape, strides, pads, count_include_pad, auto_pad), + RunQnnModelTest(BuildOpTestCase(op_type, input_defs, attrs), provider_options, opset, expected_ep_assignment); @@ -115,14 +41,11 @@ static void RunAveragePoolOpTest(const TestInputDef& input_def, // Runs a QDQ AveragePool model on the QNN HTP backend. 
Checks the graph node assignment, and that accuracy // on QNN EP is at least as good as on CPU EP. template -static void RunQDQAveragePoolOpTest(const TestInputDef& input_def, - const std::vector& kernel_shape, - const std::vector& strides, - const std::vector& pads, - int64_t count_include_pad, - const std::string& auto_pad, +static void RunQDQAveragePoolOpTest(const std::string& op_type, + const std::vector>& input_defs, + const std::vector& attrs, ExpectedEPNodeAssignment expected_ep_assignment, - int opset = 18, float fp32_abs_err = 1e-5f) { + int opset = 18) { ProviderOptions provider_options; #if defined(_WIN32) provider_options["backend_path"] = "QnnHtp.dll"; @@ -130,13 +53,11 @@ static void RunQDQAveragePoolOpTest(const TestInputDef& input_def, provider_options["backend_path"] = "libQnnHtp.so"; #endif - TestQDQModelAccuracy(BuildAveragePoolTestCase(input_def, kernel_shape, strides, pads, count_include_pad, auto_pad), - BuildAveragePoolQDQTestCase(input_def, kernel_shape, strides, pads, count_include_pad, - auto_pad), + TestQDQModelAccuracy(BuildOpTestCase(op_type, input_defs, attrs), + BuildQDQOpTestCase(op_type, input_defs, attrs), provider_options, opset, - expected_ep_assignment, - fp32_abs_err); + expected_ep_assignment); } // @@ -144,46 +65,48 @@ static void RunQDQAveragePoolOpTest(const TestInputDef& input_def, // // AveragePool with kernel size equal to the spatial dimension of input tensor. -TEST_F(QnnCPUBackendTests, AveragePool_Global) { - RunAveragePoolOpTest(TestInputDef({1, 2, 3, 3}, false, -10.0f, 10.0f), // random input - {3, 3}, // kernel_shape - {3, 3}, // strides - {0, 0, 0, 0}, // pads - 0, // count_include_pad - "NOTSET", +TEST_F(QnnCPUBackendTests, AveragePool_AsGlobal) { + RunAveragePoolOpTest("AveragePool", + {TestInputDef({1, 2, 3, 3}, false, GetFloatDataInRange(-10.0f, 10.0f, 18))}, + {utils::MakeAttribute("kernel_shape", std::vector{3, 3}), + utils::MakeAttribute("strides", std::vector{3, 3})}, + ExpectedEPNodeAssignment::All); +} + +// Test GlobalAveragePool on QNN CPU backend. +TEST_F(QnnCPUBackendTests, GlobalAveragePool) { + RunAveragePoolOpTest("GlobalAveragePool", + {TestInputDef({1, 2, 3, 3}, false, GetFloatDataInRange(-10.0f, 10.0f, 18))}, + {}, ExpectedEPNodeAssignment::All); } // AveragePool that counts padding. TEST_F(QnnCPUBackendTests, AveragePool_CountIncludePad) { - RunAveragePoolOpTest(TestInputDef({1, 2, 3, 3}, false, -10.0f, 10.0f), // random input - {1, 1}, // kernel_shape - {1, 1}, // strides - {0, 0, 0, 0}, // pads - 1, // count_include_pad - "NOTSET", + RunAveragePoolOpTest("AveragePool", + {TestInputDef({1, 2, 3, 3}, false, GetFloatDataInRange(-10.0f, 10.0f, 18))}, + {utils::MakeAttribute("kernel_shape", std::vector{1, 1}), + utils::MakeAttribute("count_include_pad", static_cast(1))}, ExpectedEPNodeAssignment::All); } // AveragePool that use auto_pad 'SAME_UPPER'. TEST_F(QnnCPUBackendTests, AveragePool_AutopadSameUpper) { - RunAveragePoolOpTest(TestInputDef({1, 2, 3, 3}, false, -10.0f, 10.0f), // random input - {1, 1}, // kernel_shape - {1, 1}, // strides - {}, // pads - 1, // count_include_pad - "SAME_UPPER", + RunAveragePoolOpTest("AveragePool", + {TestInputDef({1, 2, 3, 3}, false, GetFloatDataInRange(-10.0f, 10.0f, 18))}, + {utils::MakeAttribute("kernel_shape", std::vector{1, 1}), + utils::MakeAttribute("count_include_pad", static_cast(1)), + utils::MakeAttribute("auto_pad", "SAME_UPPER")}, ExpectedEPNodeAssignment::All); } // AveragePool that use auto_pad 'SAME_LOWER'. 
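// The refactor above replaces the pool-specific positional parameters (kernel_shape, strides,
// pads, count_include_pad, auto_pad) with a generic list of ONNX attributes, so one helper can
// drive AveragePool, GlobalAveragePool, and other ops; random input ranges also become explicit
// evenly spaced data from GetFloatDataInRange. A minimal sketch of building such an attribute
// list (assuming the element type is ONNX_NAMESPACE::AttributeProto, as produced by
// utils::MakeAttribute from core/graph/node_attr_utils.h, which this patch includes above):
//
//   std::vector<ONNX_NAMESPACE::AttributeProto> attrs = {
//       utils::MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3}),
//       utils::MakeAttribute("strides", std::vector<int64_t>{3, 3}),
//       utils::MakeAttribute("auto_pad", "NOTSET")};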
TEST_F(QnnCPUBackendTests, AveragePool_AutopadSameLower) { - RunAveragePoolOpTest(TestInputDef({1, 2, 3, 3}, false, -10.0f, 10.0f), // random input - {1, 1}, // kernel_shape - {1, 1}, // strides - {}, // pads - 1, // count_include_pad - "SAME_LOWER", + RunAveragePoolOpTest("AveragePool", + {TestInputDef({1, 2, 3, 3}, false, GetFloatDataInRange(-10.0f, 10.0f, 18))}, + {utils::MakeAttribute("kernel_shape", std::vector{1, 1}), + utils::MakeAttribute("count_include_pad", static_cast(1)), + utils::MakeAttribute("auto_pad", "SAME_LOWER")}, ExpectedEPNodeAssignment::All); } @@ -193,15 +116,23 @@ TEST_F(QnnCPUBackendTests, AveragePool_AutopadSameLower) { // // QDQ AveragePool with kernel size equal to the spatial dimension of input tensor. -TEST_F(QnnHTPBackendTests, AveragePool_Global_HTP) { +TEST_F(QnnHTPBackendTests, AveragePool_AsGlobal) { std::vector input = {32.1289f, -59.981f, -17.2799f, 62.7263f, 33.6205f, -19.3515f, -54.0113f, 37.5648f, 61.5357f, -52.5769f, 27.3637f, -9.01382f, -65.5612f, 19.9497f, -47.9228f, 26.9813f, 83.064f, 0.362503f}; - RunQDQAveragePoolOpTest(TestInputDef({1, 2, 3, 3}, false, input), - {3, 3}, // kernel_shape - {3, 3}, // strides - {0, 0, 0, 0}, // pads - 0, // count_include_pad - "NOTSET", + RunQDQAveragePoolOpTest("AveragePool", + {TestInputDef({1, 2, 3, 3}, false, input)}, + {utils::MakeAttribute("kernel_shape", std::vector{3, 3}), + utils::MakeAttribute("strides", std::vector{3, 3})}, + ExpectedEPNodeAssignment::All); +} + +// Test accuracy for 8-bit QDQ GlobalAveragePool with input of rank 4. +TEST_F(QnnHTPBackendTests, GlobalAveragePool) { + std::vector input = GetFloatDataInRange(-32.0f, 32.0f, 18); + + RunQDQAveragePoolOpTest("GlobalAveragePool", + {TestInputDef({1, 2, 3, 3}, false, input)}, + {}, ExpectedEPNodeAssignment::All); } @@ -210,12 +141,10 @@ TEST_F(QnnHTPBackendTests, AveragePool_CountIncludePad_HTP_u8) { std::vector input = {-9.0f, -7.33f, -6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f}; - RunQDQAveragePoolOpTest(TestInputDef({1, 2, 3, 3}, false, input), - {1, 1}, // kernel_shape - {1, 1}, // strides - {0, 0, 0, 0}, // pads - 1, // count_include_pad - "NOTSET", + RunQDQAveragePoolOpTest("AveragePool", + {TestInputDef({1, 2, 3, 3}, false, input)}, + {utils::MakeAttribute("kernel_shape", std::vector{1, 1}), + utils::MakeAttribute("count_include_pad", static_cast(1))}, ExpectedEPNodeAssignment::All, 18); } @@ -225,12 +154,10 @@ TEST_F(QnnHTPBackendTests, AveragePool_AutopadSameUpper_HTP_u8) { std::vector input = {-9.0f, -7.33f, -6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f}; - RunQDQAveragePoolOpTest(TestInputDef({1, 2, 3, 3}, false, input), - {1, 1}, // kernel_shape - {1, 1}, // strides - {}, // pads - 0, // count_include_pad - "SAME_UPPER", + RunQDQAveragePoolOpTest("AveragePool", + {TestInputDef({1, 2, 3, 3}, false, input)}, + {utils::MakeAttribute("kernel_shape", std::vector{1, 1}), + utils::MakeAttribute("auto_pad", "SAME_UPPER")}, ExpectedEPNodeAssignment::All, 18); } @@ -240,12 +167,10 @@ TEST_F(QnnHTPBackendTests, AveragePool_AutopadSameLower_HTP_u8) { std::vector input = {-9.0f, -7.33f, -6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f}; - RunQDQAveragePoolOpTest(TestInputDef({1, 2, 3, 3}, false, input), - {1, 1}, // kernel_shape - {1, 1}, // strides - {}, // pads - 0, // count_include_pad - "SAME_LOWER", + RunQDQAveragePoolOpTest("AveragePool", + {TestInputDef({1, 2, 3, 3}, false, 
input)}, + {utils::MakeAttribute("kernel_shape", std::vector{1, 1}), + utils::MakeAttribute("auto_pad", "SAME_LOWER")}, ExpectedEPNodeAssignment::All, 18); } diff --git a/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc b/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc index 8e4a07e66624e..9b65ca7bda3e2 100644 --- a/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc +++ b/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc @@ -114,15 +114,15 @@ GetTestQDQModelFn BuildQDQBatchNormTestCase(const TestInputDef input_qparams = GetTestInputQuantParams(input_def); + QuantParams input_qparams = GetTestInputQuantParams(input_def); NodeArg* input_qdq = AddQDQNodePair(builder, input, input_qparams.scale, input_qparams.zero_point); NodeArg* scale = MakeTestInput(builder, scale_def); - QuantParams scale_qparams = GetTestInputQuantParams(scale_def); + QuantParams scale_qparams = GetTestInputQuantParams(scale_def); NodeArg* scale_qdq = AddQDQNodePair(builder, scale, scale_qparams.scale, scale_qparams.zero_point); NodeArg* bias = MakeTestInput(builder, bias_def); - QuantParams bias_qparams = GetTestInputQuantParams(bias_def); + QuantParams bias_qparams = GetTestInputQuantParams(bias_def); NodeArg* bias_qdq = AddQDQNodePair(builder, bias, bias_qparams.scale, bias_qparams.zero_point); std::vector mean_vals(num_channels); diff --git a/onnxruntime/test/providers/qnn/conv_test.cc b/onnxruntime/test/providers/qnn/conv_test.cc index c6ebaaf7ab7e4..e9e285411f0a7 100644 --- a/onnxruntime/test/providers/qnn/conv_test.cc +++ b/onnxruntime/test/providers/qnn/conv_test.cc @@ -13,70 +13,6 @@ namespace onnxruntime { namespace test { -// The bug is from a QDQ model, and Conv node gets processed before it's producer Mul node -// A Transpose node gets inserted between Mul and the dynamic weight tensor shape on Conv -// to make Conv weight with shape HWNC -// However it changes Mul output shape to HWNC and cause issue -// It has to be QDQ model, because the DQ node with initializer on Conv gets processed first -// and DQ node requires its node unit to be processed -// So, Conv gets processed before Mul node -TEST_F(QnnHTPBackendTests, Test_QDQConvWithDynamicWeightsFromMul) { - ProviderOptions provider_options; - -#if defined(_WIN32) - provider_options["backend_path"] = "QnnHtp.dll"; -#else - provider_options["backend_path"] = "libQnnHtp.so"; -#endif - - auto BuildConvMulGraph = [](ModelTestBuilder& builder) { - // DQ node for Conv input - auto* dq_i_output = builder.MakeIntermediate(); - auto* conv_dq_input = builder.MakeInitializer({1, 32, 16, 113}, static_cast(0), static_cast(127)); - - // DQ node for Conv bias - auto* dq_bias_output = builder.MakeIntermediate(); - auto* bias = builder.MakeInitializer({16}, static_cast(0), static_cast(127)); - - // Mul node - // DQ nodes for Mul - auto* mul_dq1_output = builder.MakeIntermediate(); - auto* mul_input1 = builder.MakeInput({16, 32, 1, 1}, static_cast(0), static_cast(127)); - - auto* mul_dq2_output = builder.MakeIntermediate(); - auto* mul_input2 = builder.MakeInitializer({16, 1, 1, 1}, static_cast(0), static_cast(127)); - builder.AddDequantizeLinearNode(mul_input1, .03f, 0, mul_dq1_output); - builder.AddDequantizeLinearNode(mul_input2, .03f, 0, mul_dq2_output); - - auto* mul_output = builder.MakeIntermediate(); - builder.AddNode("Mul", {mul_dq1_output, mul_dq2_output}, {mul_output}); - - auto* mul_dq_output = AddQDQNodePair(builder, mul_output, .03f, 0); - - builder.AddDequantizeLinearNode(conv_dq_input, .04f, 0, dq_i_output); - 
builder.AddDequantizeLinearNode(bias, .0012f, 0, dq_bias_output); - // Conv node - auto* conv_output = builder.MakeIntermediate(); - - Node& conv_node = builder.AddNode("Conv", {dq_i_output, mul_dq_output, dq_bias_output}, {conv_output}); - conv_node.AddAttribute("auto_pad", "NOTSET"); - conv_node.AddAttribute("pads", std::vector{0, 0, 0, 0}); - conv_node.AddAttribute("strides", std::vector{1, 1}); - conv_node.AddAttribute("dilations", std::vector{1, 1}); - - auto* q_output = builder.MakeIntermediate(); - builder.AddQuantizeLinearNode(conv_output, .039f, 0, q_output); - - auto* dq_output = builder.MakeOutput(); - builder.AddDequantizeLinearNode(q_output, .039f, 0, dq_output); - }; - - RunQnnModelTest(BuildConvMulGraph, - provider_options, - 13, - ExpectedEPNodeAssignment::All); -} - // Creates a graph with a single float32 Conv operator. Used for testing CPU backend. static GetTestModelFn BuildF32ConvTestCase(const std::string& conv_op_type, const TestInputDef& input_def, const TestInputDef& weights_def, @@ -156,13 +92,13 @@ static GetTestQDQModelFn BuildQDQConvTestCase(const std::string& con // input -> Q/DQ -> auto* input = MakeTestInput(builder, input_def); - QuantParams input_qparams = GetTestInputQuantParams(input_def); + QuantParams input_qparams = GetTestInputQuantParams(input_def); auto* input_qdq = AddQDQNodePair(builder, input, input_qparams.scale, input_qparams.zero_point); conv_inputs.push_back(input_qdq); // weights -> Q/DQ -> auto* weights = MakeTestInput(builder, weights_def); - QuantParams weights_qparams = GetTestInputQuantParams(weights_def); + QuantParams weights_qparams = GetTestInputQuantParams(weights_def); auto* weights_qdq = AddQDQNodePair(builder, weights, weights_qparams.scale, weights_qparams.zero_point); conv_inputs.push_back(weights_qdq); @@ -395,6 +331,70 @@ TEST_F(QnnCPUBackendTests, ConvTranspose1Df32_DynamicWeights_DefaultBias) { #if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) +// The bug is from a QDQ model, and Conv node gets processed before it's producer Mul node +// A Transpose node gets inserted between Mul and the dynamic weight tensor shape on Conv +// to make Conv weight with shape HWNC +// However it changes Mul output shape to HWNC and cause issue +// It has to be QDQ model, because the DQ node with initializer on Conv gets processed first +// and DQ node requires its node unit to be processed +// So, Conv gets processed before Mul node +TEST_F(QnnHTPBackendTests, Test_QDQConvWithDynamicWeightsFromMul) { + ProviderOptions provider_options; + +#if defined(_WIN32) + provider_options["backend_path"] = "QnnHtp.dll"; +#else + provider_options["backend_path"] = "libQnnHtp.so"; +#endif + + auto BuildConvMulGraph = [](ModelTestBuilder& builder) { + // DQ node for Conv input + auto* dq_i_output = builder.MakeIntermediate(); + auto* conv_dq_input = builder.MakeInitializer({1, 32, 16, 113}, static_cast(0), static_cast(127)); + + // DQ node for Conv bias + auto* dq_bias_output = builder.MakeIntermediate(); + auto* bias = builder.MakeInitializer({16}, static_cast(0), static_cast(127)); + + // Mul node + // DQ nodes for Mul + auto* mul_dq1_output = builder.MakeIntermediate(); + auto* mul_input1 = builder.MakeInput({16, 32, 1, 1}, static_cast(0), static_cast(127)); + + auto* mul_dq2_output = builder.MakeIntermediate(); + auto* mul_input2 = builder.MakeInitializer({16, 1, 1, 1}, static_cast(0), static_cast(127)); + builder.AddDequantizeLinearNode(mul_input1, .03f, 0, mul_dq1_output); + builder.AddDequantizeLinearNode(mul_input2, .03f, 0, 
mul_dq2_output); + + auto* mul_output = builder.MakeIntermediate(); + builder.AddNode("Mul", {mul_dq1_output, mul_dq2_output}, {mul_output}); + + auto* mul_dq_output = AddQDQNodePair(builder, mul_output, .03f, 0); + + builder.AddDequantizeLinearNode(conv_dq_input, .04f, 0, dq_i_output); + builder.AddDequantizeLinearNode(bias, .0012f, 0, dq_bias_output); + // Conv node + auto* conv_output = builder.MakeIntermediate(); + + Node& conv_node = builder.AddNode("Conv", {dq_i_output, mul_dq_output, dq_bias_output}, {conv_output}); + conv_node.AddAttribute("auto_pad", "NOTSET"); + conv_node.AddAttribute("pads", std::vector{0, 0, 0, 0}); + conv_node.AddAttribute("strides", std::vector{1, 1}); + conv_node.AddAttribute("dilations", std::vector{1, 1}); + + auto* q_output = builder.MakeIntermediate(); + builder.AddQuantizeLinearNode(conv_output, .039f, 0, q_output); + + auto* dq_output = builder.MakeOutput(); + builder.AddDequantizeLinearNode(q_output, .039f, 0, dq_output); + }; + + RunQnnModelTest(BuildConvMulGraph, + provider_options, + 13, + ExpectedEPNodeAssignment::All); +} + // Check that QNN compiles DQ -> Conv -> Q as a single unit. // Tests bias as a dynamic input. TEST_F(QnnHTPBackendTests, ConvU8S32_bias_dynamic_input) { diff --git a/onnxruntime/test/providers/qnn/gather_op_htp_test.cc b/onnxruntime/test/providers/qnn/gather_op_htp_test.cc index d2ca9d8ff71e0..5b05b39f34a27 100644 --- a/onnxruntime/test/providers/qnn/gather_op_htp_test.cc +++ b/onnxruntime/test/providers/qnn/gather_op_htp_test.cc @@ -37,7 +37,7 @@ static GetTestQDQModelFn BuildQDQGatherOpTestCase(const TestInputDef< return [input_def, indices_def, axis](ModelTestBuilder& builder, std::vector>& output_qparams) { NodeArg* input = MakeTestInput(builder, input_def); - QuantParams input_qparams = GetTestInputQuantParams(input_def); + QuantParams input_qparams = GetTestInputQuantParams(input_def); NodeArg* input_qdq = AddQDQNodePair(builder, input, input_qparams.scale, input_qparams.zero_point); NodeArg* indices = MakeTestInput(builder, indices_def); diff --git a/onnxruntime/test/providers/qnn/instance_norm_htp_test.cc b/onnxruntime/test/providers/qnn/instance_norm_htp_test.cc index 683c4d49fa99d..594973e37ef0b 100644 --- a/onnxruntime/test/providers/qnn/instance_norm_htp_test.cc +++ b/onnxruntime/test/providers/qnn/instance_norm_htp_test.cc @@ -45,12 +45,12 @@ static GetTestQDQModelFn BuildQDQInstanceNormTestCase(const TestInput std::vector>& output_qparams) { // input => Q => DQ => NodeArg* input = MakeTestInput(builder, input_def); - QuantParams input_qparams = GetTestInputQuantParams(input_def); + QuantParams input_qparams = GetTestInputQuantParams(input_def); NodeArg* input_qdq = AddQDQNodePair(builder, input, input_qparams.scale, input_qparams.zero_point); // scale => Q => DQ => NodeArg* scale = MakeTestInput(builder, scale_def); - QuantParams scale_qparams = GetTestInputQuantParams(scale_def); + QuantParams scale_qparams = GetTestInputQuantParams(scale_def); NodeArg* scale_qdq = AddQDQNodePair(builder, scale, scale_qparams.scale, scale_qparams.zero_point); // bias (as int32) => DQ => diff --git a/onnxruntime/test/providers/qnn/layer_norm_test.cc b/onnxruntime/test/providers/qnn/layer_norm_test.cc index e82e88d509019..aa6c6a142e6d1 100644 --- a/onnxruntime/test/providers/qnn/layer_norm_test.cc +++ b/onnxruntime/test/providers/qnn/layer_norm_test.cc @@ -5,6 +5,7 @@ #include #include "core/graph/graph.h" +#include "core/graph/node_attr_utils.h" #include "test/optimizer/qdq_test_utils.h" #include 
"test/providers/qnn/qnn_test_utils.h" @@ -15,7 +16,12 @@ namespace onnxruntime { namespace test { #if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) -static void RunLayerNormCpuTest(const std::vector& shape) { +// Runs an LayerNorm model on the QNN CPU backend. Checks the graph node assignment and that inference +// outputs for QNN and CPU match. +static void RunLayerNormCpuTest(const TestInputDef& input_def, + const TestInputDef& scale_def, + const std::vector& attrs, + ExpectedEPNodeAssignment expected_ep_assignment) { ProviderOptions provider_options; #if defined(_WIN32) provider_options["backend_path"] = "QnnCpu.dll"; @@ -23,88 +29,84 @@ static void RunLayerNormCpuTest(const std::vector& shape) { provider_options["backend_path"] = "libQnnCpu.so"; #endif - auto BuildLayerNormTestCase = [](const std::vector& shape) -> GetTestModelFn { - return [shape](ModelTestBuilder& builder) { - // Random input data - auto input = builder.MakeInput(shape, 0.0f, 10.0f); - auto scale = builder.MakeInput(shape, 0.0f, 10.0f); - - auto* output = builder.MakeOutput(); - Node& layer_norm_node = builder.AddNode("LayerNormalization", {input, scale}, {output}); - - layer_norm_node.AddAttribute("axis", static_cast(0)); - }; - }; - - constexpr int expected_nodes_in_partition = 1; - RunQnnModelTest(BuildLayerNormTestCase(shape), + RunQnnModelTest(BuildOpTestCase("LayerNormalization", {input_def, scale_def}, attrs), provider_options, - 13, - ExpectedEPNodeAssignment::All, - expected_nodes_in_partition); + 17, + expected_ep_assignment); +} + +TEST_F(QnnCPUBackendTests, LayerNorm) { + RunLayerNormCpuTest(TestInputDef({2, 3}, false, GetFloatDataInRange(0.0f, 10.0f, 6)), + TestInputDef({2, 3}, false, GetFloatDataInRange(0.0f, 10.0f, 6)), + {utils::MakeAttribute("axis", static_cast(0))}, + ExpectedEPNodeAssignment::All); } -TEST_F(QnnCPUBackendTests, TestLayerNorm) { - RunLayerNormCpuTest({2, 3}); +TEST_F(QnnCPUBackendTests, LayerNorm1D_Axis0) { + RunLayerNormCpuTest(TestInputDef({1, 2, 3}, false, GetFloatDataInRange(0.0f, 10.0f, 6)), + TestInputDef({1, 2, 3}, false, GetFloatDataInRange(0.0f, 10.0f, 6)), + {utils::MakeAttribute("axis", static_cast(0))}, + ExpectedEPNodeAssignment::All); } -TEST_F(QnnCPUBackendTests, TestLayerNorm1D) { - RunLayerNormCpuTest({1, 2, 3}); +TEST_F(QnnCPUBackendTests, LayerNorm1D_AxisLast) { + RunLayerNormCpuTest(TestInputDef({1, 2, 3}, false, GetFloatDataInRange(0.0f, 10.0f, 6)), + TestInputDef({3}, false, GetFloatDataInRange(0.0f, 10.0f, 3)), + {utils::MakeAttribute("axis", static_cast(-1))}, + ExpectedEPNodeAssignment::All); } -TEST_F(QnnCPUBackendTests, TestLayerNorm2D) { - RunLayerNormCpuTest({1, 2, 3, 3}); +TEST_F(QnnCPUBackendTests, LayerNorm2D) { + RunLayerNormCpuTest(TestInputDef({1, 2, 3, 3}, false, GetFloatDataInRange(0.0f, 10.0f, 18)), + TestInputDef({1, 2, 3, 3}, false, GetFloatDataInRange(0.0f, 10.0f, 18)), + {utils::MakeAttribute("axis", static_cast(0))}, + ExpectedEPNodeAssignment::All); } -TEST_F(QnnCPUBackendTests, TestLayerNorm3D) { - RunLayerNormCpuTest({1, 2, 3, 3, 4}); +TEST_F(QnnCPUBackendTests, LayerNorm3D) { + RunLayerNormCpuTest(TestInputDef({1, 2, 3, 3, 4}, false, GetFloatDataInRange(0.0f, 10.0f, 72)), + TestInputDef({1, 2, 3, 3, 4}, false, GetFloatDataInRange(0.0f, 10.0f, 72)), + {utils::MakeAttribute("axis", static_cast(0))}, + ExpectedEPNodeAssignment::All); } template -GetQDQTestCaseFn BuildQDQLayerNormTestCase(const std::vector& input_shape, - const std::vector& scale_shape, - int64_t axis_value = 0) { - return [input_shape, scale_shape, 
axis_value](ModelTestBuilder& builder) { - const InputQType quant_zero_point = 0; - // const float quant_scale = 1.0f; - - auto* input = builder.MakeInput(input_shape, std::numeric_limits::min(), - std::numeric_limits::max()); - auto* dq_input = builder.MakeIntermediate(); - builder.AddDequantizeLinearNode(input, 0.0039f, quant_zero_point, dq_input); - - auto* dq_scale_output = builder.MakeIntermediate(); - auto* scale = builder.MakeInitializer(scale_shape, static_cast(1), static_cast(127)); - builder.AddDequantizeLinearNode(scale, 0.0028f, quant_zero_point, dq_scale_output); - - auto* layernorm_output = builder.MakeIntermediate(); - Node& layer_norm_node = builder.AddNode("LayerNormalization", {dq_input, dq_scale_output}, {layernorm_output}); - layer_norm_node.AddAttribute("axis", axis_value); - - auto* q_output = builder.MakeIntermediate(); - builder.AddQuantizeLinearNode(layernorm_output, 0.00377f, quant_zero_point, q_output); - - auto* final_output = builder.MakeOutput(); - builder.AddDequantizeLinearNode(q_output, 0.00377f, - quant_zero_point, - final_output); +GetTestQDQModelFn BuildQDQLayerNormTestCase(const TestInputDef& input_def, + const TestInputDef& scale_def, + const std::vector& attrs) { + return [input_def, scale_def, attrs](ModelTestBuilder& builder, + std::vector>& output_qparams) { + // input -> Q -> DQ -> + NodeArg* input = MakeTestInput(builder, input_def); + QuantParams input_qparams = GetTestInputQuantParams(input_def); + NodeArg* input_qdq = AddQDQNodePair(builder, input, input_qparams.scale, input_qparams.zero_point); + + // scale input -> Q -> DQ -> + NodeArg* scale = MakeTestInput(builder, scale_def); + QuantParams scale_qparams = GetTestInputQuantParams(scale_def); + NodeArg* scale_qdq = AddQDQNodePair(builder, scale, scale_qparams.scale, scale_qparams.zero_point); + + // LayerNormalization + NodeArg* layer_norm_output = builder.MakeIntermediate(); + Node& layer_norm_node = builder.AddNode("LayerNormalization", {input_qdq, scale_qdq}, {layer_norm_output}); + + for (const auto& attr : attrs) { + layer_norm_node.AddAttributeProto(attr); + } + + // layer_norm_output -> Q -> DQ -> output + AddQDQNodePairWithOutputAsGraphOutput(builder, layer_norm_output, output_qparams[0].scale, + output_qparams[0].zero_point); }; } -/** - * Runs an LayerNormalization model on the QNN HTP backend. Checks the graph node assignment, and that inference - * outputs for QNN and CPU match. - * - * \param input_shape The input's shape. - * \param scale_shape The scale's shape. - * \param expected_ep_assignment How many nodes are expected to be assigned to QNN (All, Some, or None). - * \param num_modes_in_graph The number of expected nodes in the graph. - * \param axis_value The axis value. - */ -static void RunLayerNormQDQTest(const std::vector& input_shape, - const std::vector& scale_shape, - ExpectedEPNodeAssignment expected_ep_assignment, - int64_t axis_value = 0) { +// Runs a QDQ LayerNorm model on the QNN HTP backend. Checks the graph node assignment and that inference +// outputs for QNN are as accurate as CPU EP (compares against f32 model and QDQ model). 
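// A minimal standalone sketch of the "at least as accurate as CPU EP" criterion that
// TestQDQModelAccuracy applies (simplified: the real helper in qnn_test_utils.h also checks the
// expected EP node assignment and caps the number of reported errors; names below are
// hypothetical):

#include <cmath>
#include <cstddef>
#include <vector>

static bool QnnAtLeastAsAccurateAsCpu(const std::vector<float>& f32_baseline,  // float32 model on CPU EP
                                      const std::vector<float>& cpu_qdq,       // QDQ model on CPU EP
                                      const std::vector<float>& qnn_qdq,       // QDQ model on QNN EP
                                      float fp32_abs_err = 1e-4f) {
  for (size_t i = 0; i < f32_baseline.size(); ++i) {
    const float cpu_err = std::fabs(f32_baseline[i] - cpu_qdq[i]);
    const float qnn_err = std::fabs(f32_baseline[i] - qnn_qdq[i]);
    if (qnn_err > cpu_err && qnn_err > fp32_abs_err) {
      return false;  // QNN EP is less accurate than CPU EP for this element.
    }
  }
  return true;
}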
+template +static void RunLayerNormQDQTest(const TestInputDef& input_def, + const TestInputDef& scale_def, + const std::vector& attrs, + ExpectedEPNodeAssignment expected_ep_assignment) { ProviderOptions provider_options; #if defined(_WIN32) provider_options["backend_path"] = "QnnHtp.dll"; @@ -112,27 +114,40 @@ static void RunLayerNormQDQTest(const std::vector& input_shape, provider_options["backend_path"] = "libQnnHtp.so"; #endif - // Runs model with DQ-> InstanceNorm -> Q and compares the outputs of the CPU and QNN EPs. - // TODO: Use new QDQ accuracy testing approach (see TestQDQModelAccuracy) - RunQnnModelTest(BuildQDQLayerNormTestCase(input_shape, scale_shape, axis_value), - provider_options, - 11, - expected_ep_assignment); + TestQDQModelAccuracy(BuildOpTestCase("LayerNormalization", {input_def, scale_def}, attrs), + BuildQDQLayerNormTestCase(input_def, scale_def, attrs), + provider_options, + 17, // opset + expected_ep_assignment); +} + +// Test that QNN HTP only supports axis = -1 (i.e., last dimension). +TEST_F(QnnHTPBackendTests, LayerNorm1D_Axis0_Unsupported) { + RunLayerNormQDQTest(TestInputDef({1, 2, 3}, false, 0.0f, 10.0f), + TestInputDef({1, 2, 3}, true, 0.0f, 10.0f), + {utils::MakeAttribute("axis", static_cast(0))}, // Unsupported axis + ExpectedEPNodeAssignment::None); } -// Check that QNN compiles DQ -> LayerNormalization -> Q as a single unit. -// Use an input of rank 3. -// QNN HTP only supports axis = -1 -// TODO: Use new QDQ accuracy testing approach (see TestQDQModelAccuracy) -TEST_F(QnnHTPBackendTests, TestQDQLayerNorm1DAxis0) { - RunLayerNormQDQTest({1, 2, 3}, {1, 2, 3}, ExpectedEPNodeAssignment::None); +// Test accuracy of 8-bit QDQ LayerNorm with a static scale input. This used to fail on QNN DK 2.13, +// but was fixed in QNN SDK 2.14. +TEST_F(QnnHTPBackendTests, LayerNorm1D_LastAxis_StaticScale) { + RunLayerNormQDQTest(TestInputDef({1, 2, 3}, false, GetFloatDataInRange(0.0f, 10.0f, 6)), + TestInputDef({3}, true, GetFloatDataInRange(0.0f, 1.0f, 3)), // Static + {utils::MakeAttribute("axis", static_cast(-1))}, // Last axis + ExpectedEPNodeAssignment::All); } -// QNN v2.13: Failed QNN FinalizeGraphs: QnnDsp Failed to finalize graph (id: 1) with err 1002 -// -// TODO: Use new QDQ accuracy testing approach (see TestQDQModelAccuracy) -TEST_F(QnnHTPBackendTests, DISABLED_TestQDQLayerNorm1DAxis2) { - RunLayerNormQDQTest({1, 2, 3}, {3}, ExpectedEPNodeAssignment::All, -1); +// Test accuracy of 8-bit QDQ LayerNorm with a dynamic scale input. 
+// TODO(adrianlizarraga): Investigate graph finalization error in QNN SDK 2.14.1 +// Failed QNN FinalizeGraphs: QnnDsp Failed to finalize graph (id: 1) with err 1002 +// C:\qnn_src\QNN\HTP\HTP\src\hexagon\prepare\graph_prepare.cc:232:ERROR:could not create op: q::flat_from_vtcm +// C:\qnn_src\QNN\HTP\HTP\src\hexagon\prepare\graph_prepare.cc:1021:ERROR:Op 0x103d00000002 preparation failed with err:-1 +TEST_F(QnnHTPBackendTests, DISABLED_LayerNorm1D_LastAxis_DynamicScale) { + RunLayerNormQDQTest(TestInputDef({1, 2, 3}, false, GetFloatDataInRange(0.0f, 10.0f, 6)), + TestInputDef({3}, false, GetFloatDataInRange(0.0f, 1.0f, 3)), // Dynamic + {utils::MakeAttribute("axis", static_cast(-1))}, // Last axis + ExpectedEPNodeAssignment::All); } #endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) diff --git a/onnxruntime/test/providers/qnn/leakyrelu_op_htp_test.cc b/onnxruntime/test/providers/qnn/leakyrelu_op_htp_test.cc index 772476cb0d245..a8237817c71df 100644 --- a/onnxruntime/test/providers/qnn/leakyrelu_op_htp_test.cc +++ b/onnxruntime/test/providers/qnn/leakyrelu_op_htp_test.cc @@ -33,7 +33,7 @@ static GetTestQDQModelFn BuildQDQLeakyReluOpTestCase(const TestInputD std::vector>& output_qparams) { // input => Q => DQ => NodeArg* input = MakeTestInput(builder, input_def); - QuantParams input_qparams = GetTestInputQuantParams(input_def); + QuantParams input_qparams = GetTestInputQuantParams(input_def); NodeArg* input_qdq = AddQDQNodePair(builder, input, input_qparams.scale, input_qparams.zero_point); // LeakryRelu diff --git a/onnxruntime/test/providers/qnn/lrn_op_test.cc b/onnxruntime/test/providers/qnn/lrn_op_test.cc index 82f7b246aa5e4..4f64b4a7e0d3f 100644 --- a/onnxruntime/test/providers/qnn/lrn_op_test.cc +++ b/onnxruntime/test/providers/qnn/lrn_op_test.cc @@ -39,7 +39,7 @@ static GetTestQDQModelFn BuildQDQLRNTestCase(const TestInputDef>& output_qparams) { // input -> Q -> DQ -> NodeArg* input = MakeTestInput(builder, input_def); - QuantParams input_qparams = GetTestInputQuantParams(input_def); + QuantParams input_qparams = GetTestInputQuantParams(input_def); NodeArg* input_qdq = AddQDQNodePair(builder, input, input_qparams.scale, input_qparams.zero_point); // LRN diff --git a/onnxruntime/test/providers/qnn/matmul_test.cpp b/onnxruntime/test/providers/qnn/matmul_test.cpp index 00ba7bd7858c3..6edb6ecdcfb1a 100644 --- a/onnxruntime/test/providers/qnn/matmul_test.cpp +++ b/onnxruntime/test/providers/qnn/matmul_test.cpp @@ -34,12 +34,12 @@ static GetTestQDQModelFn BuildMatMulOpQDQTestCase(const TestInputDef< std::vector>& output_qparams) { // input1 -> Q -> DQ -> NodeArg* input1 = MakeTestInput(builder, input1_def); - QuantParams input1_qparams = GetTestInputQuantParams(input1_def); + QuantParams input1_qparams = GetTestInputQuantParams(input1_def); auto* input1_qdq = AddQDQNodePair(builder, input1, input1_qparams.scale, input1_qparams.zero_point); // input2 -> Q -> DQ -> NodeArg* input2 = MakeTestInput(builder, input2_def); - QuantParams input2_qparams = GetTestInputQuantParams(input2_def); + QuantParams input2_qparams = GetTestInputQuantParams(input2_def); auto* input2_qdq = AddQDQNodePair(builder, input2, input2_qparams.scale, input2_qparams.zero_point); // MatMul @@ -108,9 +108,9 @@ TEST_F(QnnCPUBackendTests, MatMulOp) { // Test MatMul broadcasting // Note slight inaccuracy in CPU backend: // Expected: contains 896 values, where each value and its corresponding value in 16-byte object -// <80-03 00-00 00-00 00-00 40-00 34-F0 5B-01 00-00> are an almost-equal pair -// 
Actual: 16-byte object <80-03 00-00 00-00 00-00 40-00 23-F0 5B-01 00-00>, -// where the value pair (148.536011, 148.536255) at index #4 don't match, which is 0.000244141 from 148.536 +// <80-03 00-00 00-00 00-00 40-00 34-DD F7-01 00-00> are an almost-equal pair +// Actual: 16-byte object <80-03 00-00 00-00 00-00 40-00 23-DD F7-01 00-00>, +// where the value pair (73.68116, 73.680809) at index #80 don't match, which is -0.000350952 from 73.6812 TEST_F(QnnCPUBackendTests, MatMulOp_Broadcast) { // Create two matrices with element values in the range [-10.0, 10.0]. std::vector input_a = GetFloatDataInRange(-10.0f, 10.0f, 28 * 64); @@ -118,7 +118,7 @@ TEST_F(QnnCPUBackendTests, MatMulOp_Broadcast) { RunMatMulOpOpTest(TestInputDef({28, 1, 64}, false, input_a), TestInputDef({64, 32}, false, input_b), - ExpectedEPNodeAssignment::All, 18, 0.00026f); + ExpectedEPNodeAssignment::All, 18, 0.0004f); } #if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) diff --git a/onnxruntime/test/providers/qnn/max_min_op_test.cc b/onnxruntime/test/providers/qnn/max_min_op_test.cc new file mode 100644 index 0000000000000..09ea71e5f03eb --- /dev/null +++ b/onnxruntime/test/providers/qnn/max_min_op_test.cc @@ -0,0 +1,135 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#if !defined(ORT_MINIMAL_BUILD) + +#include + +#include "test/providers/qnn/qnn_test_utils.h" + +#include "onnx/onnx_pb.h" +#include "gtest/gtest.h" + +namespace onnxruntime { +namespace test { + +// Runs an Max/Min model on the QNN CPU backend. Checks the graph node assignment, and that inference +// outputs for QNN EP and CPU EP match. +static void RunCPUMinOrMaxOpTest(const std::string& op_type, + const std::vector>& input_defs, + ExpectedEPNodeAssignment expected_ep_assignment, + int opset = 13) { + ProviderOptions provider_options; + +#if defined(_WIN32) + provider_options["backend_path"] = "QnnCpu.dll"; +#else + provider_options["backend_path"] = "libQnnCpu.so"; +#endif + + RunQnnModelTest(BuildOpTestCase(op_type, input_defs, {}, kOnnxDomain), + provider_options, + opset, + expected_ep_assignment); +} + +// Runs a QDQ Max/Min model on the QNN (HTP) EP and the ORT CPU EP. Checks the graph node assignment, and that inference +// running the QDQ model on QNN EP is at least as accurate as on ORT CPU EP (when compared to the baseline float32 model). +template +static void RunQDQMinOrMaxOpTest(const std::string& op_type, + const std::vector>& input_defs, + ExpectedEPNodeAssignment expected_ep_assignment, + int opset = 13) { + ProviderOptions provider_options; + +#if defined(_WIN32) + provider_options["backend_path"] = "QnnHtp.dll"; +#else + provider_options["backend_path"] = "libQnnHtp.so"; +#endif + + TestQDQModelAccuracy(BuildOpTestCase(op_type, input_defs, {}, kOnnxDomain), // baseline float32 model + BuildQDQOpTestCase(op_type, input_defs, {}, kOnnxDomain), // QDQ model + provider_options, + opset, + expected_ep_assignment, + 1e-4f); +} + +// +// CPU tests: +// + +// Test that Min with 1 input is *NOT* supported on CPU backend. +TEST_F(QnnCPUBackendTests, Min_1Input_NotSupported) { + RunCPUMinOrMaxOpTest("Min", + {TestInputDef({1, 3, 4, 4}, false, -10.0f, 10.0f)}, + ExpectedEPNodeAssignment::None, 13); +} + +// Test that Max with 1 input is *NOT* supported on CPU backend. 
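// For reference, the QDQ model that the shared BuildQDQOpTestCase helper (qnn_test_utils.h)
// constructs for a two-input op such as Min has the following shape: each float input gets its
// own Quantize -> Dequantize pair, and the op output is re-quantized before becoming the graph
// output.
//
//   input0 (f32) -> Q -> DQ --+
//                             +--> Min --> Q -> DQ -> output (f32)
//   input1 (f32) -> Q -> DQ --+
//
// A condensed sketch of the builder calls involved (the quant param names are placeholders; the
// real helper derives them from each input's range):
//
//   NodeArg* in0 = MakeTestInput(builder, input_defs[0]);
//   NodeArg* in0_qdq = AddQDQNodePair(builder, in0, qparams0.scale, qparams0.zero_point);
//   NodeArg* in1 = MakeTestInput(builder, input_defs[1]);
//   NodeArg* in1_qdq = AddQDQNodePair(builder, in1, qparams1.scale, qparams1.zero_point);
//   NodeArg* op_output = builder.MakeIntermediate();
//   builder.AddNode("Min", {in0_qdq, in1_qdq}, {op_output});
//   AddQDQNodePairWithOutputAsGraphOutput(builder, op_output, out_qparams.scale, out_qparams.zero_point);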
+TEST_F(QnnCPUBackendTests, Max_1Input_NotSupported) { + RunCPUMinOrMaxOpTest("Max", + {TestInputDef({1, 3, 4, 4}, false, -10.0f, 10.0f)}, + ExpectedEPNodeAssignment::None, 13); +} + +// Test Min with 2 inputs on CPU backend. +TEST_F(QnnCPUBackendTests, Min_2Inputs) { + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 48); + RunCPUMinOrMaxOpTest("Min", + {TestInputDef({1, 3, 4, 4}, false, input_data), + TestInputDef({1, 3, 4, 4}, false, input_data)}, + ExpectedEPNodeAssignment::All, 13); +} + +// Test Max with 2 inputs on CPU backend. +TEST_F(QnnCPUBackendTests, Max_2Inputs) { + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 48); + RunCPUMinOrMaxOpTest("Max", + {TestInputDef({1, 3, 4, 4}, false, input_data), + TestInputDef({1, 3, 4, 4}, false, input_data)}, + ExpectedEPNodeAssignment::All, 13); +} + +#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) +// +// HTP tests: +// + +// Test that Min with 1 input is *NOT* supported on HTP backend. +TEST_F(QnnHTPBackendTests, Min_1Input_NotSupported) { + RunQDQMinOrMaxOpTest("Min", + {TestInputDef({1, 3, 4, 4}, false, -10.0f, 10.0f)}, + ExpectedEPNodeAssignment::None, 13); +} + +// Test that Max with 1 input is *NOT* supported on HTP backend. +TEST_F(QnnHTPBackendTests, Max_1Input_NotSupported) { + RunQDQMinOrMaxOpTest("Max", + {TestInputDef({1, 3, 4, 4}, false, -10.0f, 10.0f)}, + ExpectedEPNodeAssignment::None, 13); +} + +// Test accuracy of 8-bit Q/DQ Min with 2 inputs on HTP backend. +TEST_F(QnnHTPBackendTests, Min_2Inputs) { + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 48); + RunQDQMinOrMaxOpTest("Min", + {TestInputDef({1, 3, 4, 4}, false, input_data), + TestInputDef({1, 3, 4, 4}, false, input_data)}, + ExpectedEPNodeAssignment::All, 13); +} + +// Test accuracy of 8-bit Q/DQ Max with 2 inputs on HTP backend. +TEST_F(QnnHTPBackendTests, Max_2Inputs) { + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 48); + RunQDQMinOrMaxOpTest("Max", + {TestInputDef({1, 3, 4, 4}, false, input_data), + TestInputDef({1, 3, 4, 4}, false, input_data)}, + ExpectedEPNodeAssignment::All, 13); +} + +#endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) +} // namespace test +} // namespace onnxruntime +#endif // !defined(ORT_MINIMAL_BUILD) diff --git a/onnxruntime/test/providers/qnn/pool_op_test.cpp b/onnxruntime/test/providers/qnn/pool_op_test.cpp index c6e8a032ca7f4..fee10a542fb82 100644 --- a/onnxruntime/test/providers/qnn/pool_op_test.cpp +++ b/onnxruntime/test/providers/qnn/pool_op_test.cpp @@ -41,7 +41,7 @@ GetTestQDQModelFn BuildPoolQDQTestCase(const std::string& op_type, std::vector>& output_qparams) { // input -> Q -> DQ -> NodeArg* input = MakeTestInput(builder, input_def); - QuantParams input_qparams = GetTestInputQuantParams(input_def); + QuantParams input_qparams = GetTestInputQuantParams(input_def); NodeArg* input_qdq = AddQDQNodePair(builder, input, input_qparams.scale, input_qparams.zero_point); // MaxPool @@ -233,7 +233,8 @@ TEST_F(QnnHTPBackendTests, DISABLED_MaxPool_Large_Input2_Ceil_HTP_u8) { } // QNN v2.13: Certain large input sizes cause the QNN graph to fail to finalize with error 1002 (QNN_COMMON_ERROR_MEM_ALLOC). -TEST_F(QnnHTPBackendTests, DISABLED_MaxPool_LargeInput_1Pads) { +// Fixed in QNN v2.14.1. 
+TEST_F(QnnHTPBackendTests, MaxPool_LargeInput_1Pads) { RunQDQPoolOpTest("MaxPool", TestInputDef({1, 64, 384, 576}, false, -10.0f, 10.0f), // Dynamic input with range [-10, 10] {utils::MakeAttribute("kernel_shape", std::vector{3, 3}), diff --git a/onnxruntime/test/providers/qnn/qnn_test_utils.cc b/onnxruntime/test/providers/qnn/qnn_test_utils.cc index feacdc54226b6..548f80675a622 100644 --- a/onnxruntime/test/providers/qnn/qnn_test_utils.cc +++ b/onnxruntime/test/providers/qnn/qnn_test_utils.cc @@ -21,19 +21,21 @@ std::vector GetFloatDataInRange(float min_val, float max_val, size_t num_ return {}; } + if (num_elems == 1) { + return {min_val}; + } + std::vector data; data.reserve(num_elems); - const float step_size = (max_val - min_val) / static_cast(num_elems); + const float step_size = (max_val - min_val) / static_cast(num_elems - 1); float val = min_val; for (size_t i = 0; i < num_elems; i++) { data.push_back(val); val += step_size; } - // Try to ensure that 0.0 and max_val are also included in the array. - // If num_elems is less than 3, then not all of min_val, 0, and max_val will be present. - data[num_elems / 2] = 0.0f; + // Ensure that max_val is included exactly (due to rounding from adding step sizes). data[num_elems - 1] = max_val; return data; diff --git a/onnxruntime/test/providers/qnn/qnn_test_utils.h b/onnxruntime/test/providers/qnn/qnn_test_utils.h index dd5e6fc23670a..1b0b85319918f 100644 --- a/onnxruntime/test/providers/qnn/qnn_test_utils.h +++ b/onnxruntime/test/providers/qnn/qnn_test_utils.h @@ -199,7 +199,7 @@ struct TestInputDef { std::pair range_override_; }; -template +template inline QuantParams GetTestInputQuantParams(const TestInputDef& input_def) { const std::pair frange = input_def.GetRange(); return QuantParams::Compute(frange.first, frange.second); @@ -239,10 +239,10 @@ void InferenceModel(const std::string& model_data, const char* log_id, * \param fp32_abs_err Small tolerance used for floating-point comparisons. * \param log_severity The logger's severity setting. */ -template +template inline void TestQDQModelAccuracy(const GetTestModelFn& f32_model_fn, const GetTestQDQModelFn& qdq_model_fn, const ProviderOptions& qnn_options, int opset_version, - ExpectedEPNodeAssignment expected_ep_assignment, float fp32_abs_err, + ExpectedEPNodeAssignment expected_ep_assignment, float fp32_abs_err = 1e-4f, logging::Severity log_severity = logging::Severity::kERROR) { // Add kMSDomain to cover contrib op like Gelu const std::unordered_map domain_to_version = {{"", opset_version}, {kMSDomain, 1}}; @@ -314,7 +314,8 @@ inline void TestQDQModelAccuracy(const GetTestModelFn& f32_model_fn, const GetTe // limit the error message count in case test with large data failed size_t max_error_count = 10; - int error_count = 0; + size_t error_count = 0; + // Compare accuracy of QDQ results with float model. // QNN EP must be at least as accurate as CPU EP when running the QDQ model. for (size_t i = 0; i < num_outputs; i++) { @@ -433,6 +434,79 @@ inline NodeArg* MakeTestInput(ModelTestBuilder& builder, const TestInputDef manual quantization (int32) => DQ => final float bias NodeArg* MakeTestQDQBiasInput(ModelTestBuilder& builder, const TestInputDef& bias_def, float bias_scale); +/** + * Returns a function that builds a model with a single operator with N inputs of the same element type. + * + * \param op_type The operator to instantiate. + * \param input_defs List of input definitions. + * \param attrs List of operator attributes. + * \param op_domain The operator's domain. 
Defaults to the ONNX domain (i.e., ""). + * \returns A model building function. + */ +template +inline GetTestModelFn BuildOpTestCase(const std::string& op_type, + const std::vector>& input_defs, + const std::vector& attrs, + const std::string& op_domain = kOnnxDomain) { + return [op_type, input_defs, attrs, op_domain](ModelTestBuilder& builder) { + std::vector op_inputs; + op_inputs.reserve(input_defs.size()); + + for (const auto& input_def : input_defs) { + NodeArg* input = MakeTestInput(builder, input_def); + op_inputs.push_back(input); + } + + auto* output = builder.MakeOutput(); + Node& onnx_node = builder.AddNode(op_type, op_inputs, {output}, op_domain); + + for (const auto& attr : attrs) { + onnx_node.AddAttributeProto(attr); + } + }; +} + +/** + * Returns a function that builds a model with a single QDQ operator with N inputs of the same element type. + * + * \param op_type The operator to instantiate. + * \param input_defs List of input definitions. + * \param attrs List of operator attributes. + * \param op_domain The operator's domain. Defaults to the ONNX domain (i.e., ""). + * \returns A model building function. + */ +template +inline GetTestQDQModelFn BuildQDQOpTestCase(const std::string& op_type, + const std::vector>& input_defs, + const std::vector& attrs, + const std::string& op_domain = kOnnxDomain) { + return [op_type, input_defs, attrs, op_domain](ModelTestBuilder& builder, + std::vector>& output_qparams) { + std::vector op_inputs; + op_inputs.reserve(input_defs.size()); + + for (const auto& input_def : input_defs) { + NodeArg* input = MakeTestInput(builder, input_def); + QuantParams input_qparams = GetTestInputQuantParams(input_def); + NodeArg* input_after_qdq = AddQDQNodePair(builder, input, input_qparams.scale, + input_qparams.zero_point); + op_inputs.push_back(input_after_qdq); + } + + // Op -> op_output + auto* op_output = builder.MakeIntermediate(); + Node& onnx_node = builder.AddNode(op_type, op_inputs, {op_output}, op_domain); + + for (const auto& attr : attrs) { + onnx_node.AddAttributeProto(attr); + } + + // op_output -> Q -> DQ -> output + AddQDQNodePairWithOutputAsGraphOutput(builder, op_output, output_qparams[0].scale, + output_qparams[0].zero_point); + }; +} + /** * Runs a test model on the QNN EP. Checks the graph node assignment, and that inference * outputs for QNN and CPU match. diff --git a/onnxruntime/test/providers/qnn/reduce_op_test.cc b/onnxruntime/test/providers/qnn/reduce_op_test.cc index 755f6b094df07..c3c2b578a1bd0 100644 --- a/onnxruntime/test/providers/qnn/reduce_op_test.cc +++ b/onnxruntime/test/providers/qnn/reduce_op_test.cc @@ -366,7 +366,7 @@ static void RunReduceOpQDQTest(const std::string& op_type, bool keepdims, int opset, ExpectedEPNodeAssignment expected_ep_assignment, - float fp32_abs_err = 1e-5f) { + float fp32_abs_err = 1e-4f) { ProviderOptions provider_options; #if defined(_WIN32) provider_options["backend_path"] = "QnnHtp.dll"; diff --git a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc index 4e7702bd84270..eed12af3c703c 100644 --- a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc +++ b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc @@ -16,151 +16,95 @@ namespace onnxruntime { namespace test { -#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) - -using UInt8Limits = std::numeric_limits; +// Runs a non-QDQ model on the QNN CPU backend and compares output to CPU EP. 
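// The op tests in this file feed deterministic inputs from GetFloatDataInRange, whose updated
// behavior is shown in the qnn_test_utils.cc hunk above. A minimal standalone sketch of that
// endpoint-inclusive behavior (EvenlySpacedFloats is a hypothetical name used only for
// illustration):

#include <cstddef>
#include <vector>

static std::vector<float> EvenlySpacedFloats(float min_val, float max_val, size_t num_elems) {
  if (num_elems == 0) return {};
  if (num_elems == 1) return {min_val};

  std::vector<float> data;
  data.reserve(num_elems);

  // Dividing by (num_elems - 1) keeps both endpoints in the sequence, e.g.,
  // EvenlySpacedFloats(-10.0f, 10.0f, 6) yields {-10, -6, -2, 2, 6, 10}.
  const float step_size = (max_val - min_val) / static_cast<float>(num_elems - 1);
  float val = min_val;
  for (size_t i = 0; i < num_elems; ++i) {
    data.push_back(val);
    val += step_size;
  }

  data[num_elems - 1] = max_val;  // Guard against accumulated floating-point rounding error.
  return data;
}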
template -static GetTestModelFn BuildUnaryOpTestCase(const std::string& op_type, const TestInputDef& input0_def, - const std::vector& attrs, - const std::string& domain = kOnnxDomain) { - return [op_type, input0_def, attrs, domain](ModelTestBuilder& builder) { - NodeArg* input0 = MakeTestInput(builder, input0_def); - - auto* output = builder.MakeOutput(); - auto& op_node = builder.AddNode(op_type, {input0}, {output}, domain); - for (const auto& attr : attrs) { - op_node.AddAttributeProto(attr); - } - }; -} - -// Creates the graph: -// _______________________ -// | | -// input_u8 -> DQ -> | SimpleOp | -> Q -> output_u8 -// |_______________________| -// -// Currently used to test QNN EP. -template -GetTestQDQModelFn BuildQDQUnaryOpTestCase(const TestInputDef& input_def, - const std::string& op_type, - const std::vector& attrs, - const std::string& domain = kOnnxDomain) { - return [input_def, op_type, attrs, domain](ModelTestBuilder& builder, - std::vector>& output_qparams) { - auto* input = MakeTestInput(builder, input_def); - QuantParams input_qparams = GetTestInputQuantParams(input_def); - auto* input_qdq = AddQDQNodePair(builder, input, input_qparams.scale, input_qparams.zero_point); - - auto* op_output = builder.MakeIntermediate(); - auto& op_node = builder.AddNode(op_type, {input_qdq}, {op_output}, domain); - - for (const auto& attr : attrs) { - op_node.AddAttributeProto(attr); - } - - // op_output -> Q -> DQ -> output - AddQDQNodePairWithOutputAsGraphOutput(builder, op_output, output_qparams[0].scale, output_qparams[0].zero_point); - }; -} - -/** - * Runs an Simple Op model on the QNN HTP backend. Checks the graph node assignment, and that inference - * outputs for QNN and CPU match. - * - * \param input_shape The input's shape. - * \param test_description Description of the test for error reporting. - * \param expected_ep_assignment How many nodes are expected to be assigned to QNN (All, Some, or None). - * \param num_modes_in_graph The number of expected nodes in the graph. - */ -template -static void RunQDQUnaryOpTest(const TestInputDef& input_def, const std::string& op_type, - const std::vector& attrs, - int opset_version, - ExpectedEPNodeAssignment expected_ep_assignment, - const std::string& domain = kOnnxDomain) { +static void RunOpTestOnCPU(const std::string& op_type, + const std::vector>& input_defs, + const std::vector& attrs, + int opset_version, + ExpectedEPNodeAssignment expected_ep_assignment, + const std::string& op_domain = kOnnxDomain) { ProviderOptions provider_options; #if defined(_WIN32) - provider_options["backend_path"] = "QnnHtp.dll"; + provider_options["backend_path"] = "QnnCpu.dll"; #else - provider_options["backend_path"] = "libQnnHtp.so"; + provider_options["backend_path"] = "libQnnCpu.so"; #endif - // Runs model with DQ-> Op -> Q and compares the outputs of the CPU and QNN EPs. 
- TestQDQModelAccuracy(BuildUnaryOpTestCase(op_type, input_def, attrs, domain), - BuildQDQUnaryOpTestCase(input_def, op_type, attrs, domain), - provider_options, - opset_version, - expected_ep_assignment, - 1e-5f); -} - -// TODO: share with other op tests -// Creates the graph with two inputs and attributes -template -static GetTestModelFn BuildOpTestCase(const std::string& op_type, - const TestInputDef& input0_def, - const TestInputDef& input1_def, - const std::vector& attrs) { - return [op_type, input0_def, input1_def, attrs](ModelTestBuilder& builder) { - NodeArg* input0 = MakeTestInput(builder, input0_def); - NodeArg* input1 = MakeTestInput(builder, input1_def); - - auto* output = builder.MakeOutput(); - Node& onnx_node = builder.AddNode(op_type, {input0, input1}, {output}); - - for (const auto& attr : attrs) { - onnx_node.AddAttributeProto(attr); - } - }; + RunQnnModelTest(BuildOpTestCase(op_type, input_defs, attrs, op_domain), + provider_options, + opset_version, + expected_ep_assignment); } -// Creates the graph with two inputs and attributes -// _______________________ -// | | -// input0_u8 -> DQ -> | SimpleOp | -> Q -> output_u8 -// input1_u8 -> DQ -> |_______________________| +// Test float DepthToSpace on the QNN CPU backend. +// TODO: Flaky test tails often. +// Value of: expected_tensor.DataAsSpan() +// Expected: contains 16 values, where each value and its corresponding value in 16-byte object +// <10-00 00-00 00-00 00-00 40-00 23-D1 82-02 00-00> are an almost-equal pair +// Actual: 16-byte object <10-00 00-00 00-00 00-00 40-00 12-D1 82-02 00-00>, where the value pair (2, 0.1) at +// index #2 don't match, which is -1.9 from 2 // -// Currently used to test QNN EP. -template -static GetTestQDQModelFn BuildQDQOpTestCase(const std::string& op_type, - const TestInputDef& input0_def, - const TestInputDef& input1_def, - const std::vector& attrs) { - return [op_type, input0_def, input1_def, attrs](ModelTestBuilder& builder, - std::vector>& output_qparams) { - NodeArg* input0 = MakeTestInput(builder, input0_def); - NodeArg* input1 = MakeTestInput(builder, input1_def); - - // input -> Q -> DQ -> Op - QuantParams input0_qparams = GetTestInputQuantParams(input0_def); - auto* qdq0_output = AddQDQNodePair(builder, input0, input0_qparams.scale, input0_qparams.zero_point); - - QuantParams input1_qparams = GetTestInputQuantParams(input1_def); - auto* qdq1_output = AddQDQNodePair(builder, input1, input1_qparams.scale, input1_qparams.zero_point); - - // Op -> op_output - auto* op_output = builder.MakeIntermediate(); - Node& onnx_node = builder.AddNode(op_type, {qdq0_output, qdq1_output}, {op_output}); - - for (const auto& attr : attrs) { - onnx_node.AddAttributeProto(attr); - } - - // op_output -> Q -> DQ -> output - AddQDQNodePairWithOutputAsGraphOutput(builder, op_output, output_qparams[0].scale, - output_qparams[0].zero_point); - }; +// If/when fixed, enable QNN EP in cpu test TensorOpTest.SpaceToDepthTest_1 +TEST_F(QnnCPUBackendTests, DISABLED_SpaceToDepth_Flaky) { + std::vector X = + {0.0f, 0.1f, 0.2f, 0.3f, + 1.0f, 1.1f, 1.2f, 1.3f, + + 2.0f, 2.1f, 2.2f, 2.3f, + 3.0f, 3.1f, 3.2f, 3.3f}; + + for (size_t i = 0; i < 4; i++) { + RunOpTestOnCPU("SpaceToDepth", + {TestInputDef({1, 2, 2, 4}, false, X)}, + {utils::MakeAttribute("blocksize", static_cast(2))}, + 7, + ExpectedEPNodeAssignment::All); + } +} + +// Value of: expected_tensor.DataAsSpan() +// Expected: contains 108 values, where each value and its corresponding value in 16-byte object +// <6C-00 00-00 00-00 00-00 40-00 23-BB 0E-02 00-00> 
are an almost-equal pair +// Actual: 16-byte object <6C-00 00-00 00-00 00-00 40-00 12-BB 0E-02 00-00>, where the value pair (18, 1) +// at index #2 don't match, which is -17 from 18 +// +// If/when fixed, enable QNN EP in cpu test TensorOpTest.SpaceToDepthTest_2 +TEST_F(QnnCPUBackendTests, DISABLED_SpaceToDepth_Flaky2) { + const std::vector X = { + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., + 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., + 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + 33., 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., + 44., 45., 46., 47., 48., 49., 50., 51., 52., 53., 54., + 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., + 66., 67., 68., 69., 70., 71., 72., 73., 74., 75., 76., + 77., 78., 79., 80., 81., 82., 83., 84., 85., 86., 87., + 88., 89., 90., 91., 92., 93., 94., 95., 96., 97., 98., + 99., 100., 101., 102., 103., 104., 105., 106., 107.}; + + for (size_t i = 0; i < 4; i++) { + RunOpTestOnCPU("SpaceToDepth", + {TestInputDef({2, 3, 3, 6}, false, X)}, + {utils::MakeAttribute("blocksize", static_cast(3))}, + 7, + ExpectedEPNodeAssignment::All); + } } +#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) + +// Tests the accuracy of a QDQ model on QNN EP by comparing to CPU EP, which runs both the fp32 model +// and the QDQ model. template static void RunQDQOpTest(const std::string& op_type, - const TestInputDef& input0_def, - const TestInputDef& input1_def, + const std::vector>& input_defs, const std::vector& attrs, int opset_version, - ExpectedEPNodeAssignment expected_ep_assignment) { + ExpectedEPNodeAssignment expected_ep_assignment, + const std::string& op_domain = kOnnxDomain, + float fp32_abs_err = 1e-4f) { ProviderOptions provider_options; #if defined(_WIN32) provider_options["backend_path"] = "QnnHtp.dll"; @@ -168,21 +112,22 @@ static void RunQDQOpTest(const std::string& op_type, provider_options["backend_path"] = "libQnnHtp.so"; #endif - TestQDQModelAccuracy(BuildOpTestCase(op_type, input0_def, input1_def, attrs), - BuildQDQOpTestCase(op_type, input0_def, input1_def, attrs), + TestQDQModelAccuracy(BuildOpTestCase(op_type, input_defs, attrs, op_domain), + BuildQDQOpTestCase(op_type, input_defs, attrs, op_domain), provider_options, opset_version, expected_ep_assignment, - 1e-5f); + fp32_abs_err); } +// Runs a non-QDQ model on HTP and compares output to CPU EP. template static void RunOpTest(const std::string& op_type, - const TestInputDef& input0_def, - const TestInputDef& input1_def, + const std::vector>& input_defs, const std::vector& attrs, int opset_version, - ExpectedEPNodeAssignment expected_ep_assignment) { + ExpectedEPNodeAssignment expected_ep_assignment, + const std::string& op_domain = kOnnxDomain) { ProviderOptions provider_options; #if defined(_WIN32) provider_options["backend_path"] = "QnnHtp.dll"; @@ -191,151 +136,307 @@ static void RunOpTest(const std::string& op_type, #endif // Runs model with a Q/DQ binary op and compares the outputs of the CPU and QNN EPs. - RunQnnModelTest(BuildOpTestCase(op_type, input0_def, input1_def, attrs), + RunQnnModelTest(BuildOpTestCase(op_type, input_defs, attrs, op_domain), provider_options, opset_version, expected_ep_assignment); } +// Test the accuracy of QDQ Sigmoid. +TEST_F(QnnHTPBackendTests, UnaryOp_Sigmoid) { + RunQDQOpTest("Sigmoid", + {TestInputDef({1, 2, 3}, false, GetFloatDataInRange(-10.0f, 10.0f, 6))}, + {}, + 13, + ExpectedEPNodeAssignment::All); +} + +// Test the accuracy of QDQ Tanh. 
+TEST_F(QnnHTPBackendTests, UnaryOp_Tanh) { + RunQDQOpTest("Tanh", + {TestInputDef({1, 2, 3}, false, GetFloatDataInRange(-10.0f, 10.0f, 6))}, + {}, + 13, + ExpectedEPNodeAssignment::All); +} + // Check that QNN compiles DQ -> Gelu -> Q as a single unit. // Use an input of rank 3. TEST_F(QnnHTPBackendTests, UnaryOp_Gelu) { - RunQDQUnaryOpTest(TestInputDef({1, 2, 3}, false, -10.0f, 10.0f), // Input range [-10.0, 10.0f] - "Gelu", - {}, - 11, - ExpectedEPNodeAssignment::All, - kMSDomain); // GeLu is a contrib op. + RunQDQOpTest("Gelu", + {TestInputDef({1, 2, 3}, false, GetFloatDataInRange(-10.0f, 10.0f, 6))}, + {}, + 11, + ExpectedEPNodeAssignment::All, + kMSDomain); // GeLu is a contrib op. } // Check that QNN compiles DQ -> Elu -> Q as a single unit. // Use an input of rank 3. TEST_F(QnnHTPBackendTests, UnaryOp_Elu) { - RunQDQUnaryOpTest(TestInputDef({1, 2, 3}, false, -10.0f, 10.0f), // Input range [-10.0, 10.0f] - "Elu", - {}, - 11, - ExpectedEPNodeAssignment::All); + RunQDQOpTest("Elu", + {TestInputDef({1, 2, 3}, false, GetFloatDataInRange(-10.0f, 10.0f, 6))}, + {}, + 11, + ExpectedEPNodeAssignment::All); +} + +// Tests accuracy of QDQ Relu +// TODO: Relu does not set negative values to zero! +// Could be due to ORT's ReluQuantFusion! +// +// Inaccuracy detected for output 'output', element 0. +// Output quant params: scale=0.039215687662363052, zero_point=0. +// Expected val: 0 +// QNN QDQ val: -10 (err 10) +// CPU QDQ val: 0 (err 0) +TEST_F(QnnHTPBackendTests, DISABLED_UnaryOp_Relu) { + RunQDQOpTest("Relu", + {TestInputDef({1, 2, 3}, false, GetFloatDataInRange(-10.0f, 10.0f, 6))}, + {}, + 14, + ExpectedEPNodeAssignment::All); } // Check that QNN compiles DQ -> HardSwish -> Q as a single unit. // Use an input of rank 3. TEST_F(QnnHTPBackendTests, UnaryOp_HardSwish) { - RunQDQUnaryOpTest(TestInputDef({1, 2, 3}, false, -10.0f, 10.0f), // Input range [-10.0, 10.0f] - "HardSwish", - {}, - 14, - ExpectedEPNodeAssignment::All); + RunQDQOpTest("HardSwish", + {TestInputDef({1, 2, 3}, false, GetFloatDataInRange(-10.0f, 10.0f, 6))}, + {}, + 14, + ExpectedEPNodeAssignment::All); } // Check that QNN compiles DQ -> Atan -> Q as a single unit. // Use an input of rank 3. TEST_F(QnnHTPBackendTests, UnaryOp_Atan) { - RunQDQUnaryOpTest(TestInputDef({1, 2, 3}, false, -10.0f, 10.0f), // Input range [-10.0, 10.0f] - "Atan", - {}, - 14, - ExpectedEPNodeAssignment::All); + RunQDQOpTest("Atan", + {TestInputDef({1, 2, 3}, false, GetFloatDataInRange(-10.0f, 10.0f, 6))}, + {}, + 14, + ExpectedEPNodeAssignment::All); } // Check that QNN compiles DQ -> Asin -> Q as a single unit. // Use an input of rank 3. TEST_F(QnnHTPBackendTests, UnaryOp_Asin) { - RunQDQUnaryOpTest(TestInputDef({1, 2, 3}, false, -0.5f, 0.5f), // input range -0.5 to 0.5 - "Asin", {}, - 13, ExpectedEPNodeAssignment::All); + RunQDQOpTest("Asin", + {TestInputDef({1, 2, 3}, false, GetFloatDataInRange(-0.5, 0.5, 6))}, + {}, + 13, + ExpectedEPNodeAssignment::All); } // Check that QNN compiles DQ -> Sign -> Q as a single unit. // Use an input of rank 3. TEST_F(QnnHTPBackendTests, UnaryOp_Sign) { - RunQDQUnaryOpTest(TestInputDef({1, 2, 3}, false, -10.0f, 10.0f), - "Sign", {}, - 13, ExpectedEPNodeAssignment::All); + RunQDQOpTest("Sign", + {TestInputDef({1, 2, 3}, false, GetFloatDataInRange(-10.0f, 10.0f, 6))}, + {}, + 13, + ExpectedEPNodeAssignment::All); } // Check that QNN compiles DQ -> Sin -> Q as a single unit. // Use an input of rank 3. 
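// The disabled Relu test above reports output quant params scale=0.0392157 (roughly 10/255) and
// zero_point=0, so a dequantized uint8 output can only land in [0, 10]; a reported QDQ value of
// -10 therefore suggests the Relu was effectively bypassed, which matches the ReluQuantFusion
// suspicion. A minimal sketch of the 8-bit quantize/dequantize arithmetic behind that reasoning:

#include <algorithm>
#include <cmath>
#include <cstdint>

static uint8_t QuantizeU8(float value, float scale, uint8_t zero_point) {
  const float q = std::round(value / scale) + static_cast<float>(zero_point);
  return static_cast<uint8_t>(std::clamp(q, 0.0f, 255.0f));
}

static float DequantizeU8(uint8_t value, float scale, uint8_t zero_point) {
  return (static_cast<float>(value) - static_cast<float>(zero_point)) * scale;
}

// With scale=0.0392157f and zero_point=0, QuantizeU8(-10.0f, ...) clamps to 0 and
// DequantizeU8(0, ...) returns 0.0f, so no correctly quantized output dequantizes to -10.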
TEST_F(QnnHTPBackendTests, UnaryOp_Sin) { - RunQDQUnaryOpTest(TestInputDef({1, 2, 3}, false, -3.14159f, 3.14159f), - "Sin", {}, - 11, ExpectedEPNodeAssignment::All); + RunQDQOpTest("Sin", + {TestInputDef({1, 2, 3}, false, -3.14159f, 3.14159f)}, + {}, + 11, + ExpectedEPNodeAssignment::All); } // Check that QNN compiles DQ -> Cos -> Q as a single unit. // Use an input of rank 3. TEST_F(QnnHTPBackendTests, UnaryOp_Cos) { - RunQDQUnaryOpTest(TestInputDef({1, 2, 3}, false, {-3.14159f, -1.5f, -0.5f, 0.0f, 1.5, 3.14159f}), - "Cos", {}, - 11, ExpectedEPNodeAssignment::All); + RunQDQOpTest("Cos", + {TestInputDef({1, 2, 3}, false, {-3.14159f, -1.5f, -0.5f, 0.0f, 1.5, 3.14159f})}, + {}, + 11, + ExpectedEPNodeAssignment::All); } // Check that QNN compiles DQ -> Cos -> Q as a single unit. // Use an input of rank 3. TEST_F(QnnHTPBackendTests, UnaryOp_Cos_Inaccurate) { - RunQDQUnaryOpTest(TestInputDef({1, 2, 3}, false, {-3.14159f, -1.88436f, -0.542863f, 0.0f, 1.05622f, 3.14159f}), - "Cos", {}, - 11, ExpectedEPNodeAssignment::All); + RunQDQOpTest("Cos", + {TestInputDef({1, 2, 3}, false, {-3.14159f, -1.88436f, -0.542863f, 0.0f, 1.05622f, 3.14159f})}, + {}, + 11, + ExpectedEPNodeAssignment::All); } // Check that QNN compiles DQ -> Log -> Q as a single unit. // Use an input of rank 3. TEST_F(QnnHTPBackendTests, UnaryOp_Log) { - RunQDQUnaryOpTest(TestInputDef({1, 2, 3}, false, {3.14159f, 100.88436f, 10.542863f, 9.1f, 1.05622f, 3.14159f}), - "Log", {}, - 11, ExpectedEPNodeAssignment::All); + RunQDQOpTest("Log", + {TestInputDef({1, 2, 3}, false, {3.14159f, 100.88436f, 10.542863f, 9.1f, 1.05622f, 3.14159f})}, + {}, + 11, ExpectedEPNodeAssignment::All); +} + +// Test accuracy of 8-bit QDQ Exp +TEST_F(QnnHTPBackendTests, UnaryOp_Exp) { + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 6); + RunQDQOpTest("Exp", + {TestInputDef({1, 2, 3}, false, input_data)}, + {}, + 13, + ExpectedEPNodeAssignment::All); +} + +// Test accuracy of 8-bit QDQ Sqrt +TEST_F(QnnHTPBackendTests, UnaryOp_Sqrt) { + std::vector input_data = GetFloatDataInRange(0.0f, 20.0f, 9); + RunQDQOpTest("Sqrt", + {TestInputDef({1, 3, 3}, false, input_data)}, + {}, + 13, + ExpectedEPNodeAssignment::All); +} + +// Test accuracy of 8-bit QDQ Neg +TEST_F(QnnHTPBackendTests, UnaryOp_Neg) { + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 6); + RunQDQOpTest("Neg", + {TestInputDef({1, 2, 3}, false, input_data)}, + {}, + 13, + ExpectedEPNodeAssignment::All); +} + +// Test Not operator on HTP backend. +TEST_F(QnnHTPBackendTests, UnaryOp_Not) { + RunOpTest("Not", + {TestInputDef({1, 4}, false, {false, false, true, true})}, + {}, + 17, + ExpectedEPNodeAssignment::All); +} + +// Test accuracy of 8-bit QDQ Round +TEST_F(QnnHTPBackendTests, UnaryOp_Round) { + std::vector input_data = GetFloatDataInRange(-9.0f, 9.0f, 6); + RunQDQOpTest("Round", + {TestInputDef({1, 2, 3}, false, input_data)}, + {}, + 11, + ExpectedEPNodeAssignment::All); } // Check that QNN compiles DQ -> Softmax -> Q as a single unit. // Test that the default axis (-1) for SoftMax opset 13 works. TEST_F(QnnHTPBackendTests, UnaryOp_Softmax13_DefaultAxis) { - RunQDQUnaryOpTest(TestInputDef({1, 2, 3}, false, -5.0f, 5.0f), - "Softmax", - {}, // Uses default axis of -1 for opset 13 - 13, ExpectedEPNodeAssignment::All); + RunQDQOpTest("Softmax", + {TestInputDef({1, 2, 3}, false, -5.0f, 5.0f)}, + {}, // Uses default axis of -1 for opset 13 + 13, + ExpectedEPNodeAssignment::All); } // Check that QNN compiles DQ -> Softmax -> Q as a single unit. 
// Test that an axis != -1 is not supported. TEST_F(QnnHTPBackendTests, UnaryOp_Softmax13_UnsupportedAxis) { - RunQDQUnaryOpTest(TestInputDef({1, 2, 3}, false, -5.0f, 5.0f), - "Softmax", - {utils::MakeAttribute("axis", static_cast(1))}, - 13, ExpectedEPNodeAssignment::None); + RunQDQOpTest("Softmax", + {TestInputDef({1, 2, 3}, false, -5.0f, 5.0f)}, + {utils::MakeAttribute("axis", static_cast(1))}, + 13, + ExpectedEPNodeAssignment::None); } // Check that QNN compiles DQ -> Softmax -> Q as a single unit. // Test that the default axis (1) for SoftMax opset < 13 does not work. TEST_F(QnnHTPBackendTests, UnaryOp_Softmax11_DefaultAxisFails) { - RunQDQUnaryOpTest(TestInputDef({1, 2, 3}, false, -5.0f, 5.0f), - "Softmax", - {}, // Uses default axis of 1 for opset < 13. - 11, ExpectedEPNodeAssignment::None); + RunQDQOpTest("Softmax", + {TestInputDef({1, 2, 3}, false, -5.0f, 5.0f)}, + {}, // Uses default axis of 1 for opset < 13. + 11, + ExpectedEPNodeAssignment::None); } // Check that QNN compiles DQ -> Softmax -> Q as a single unit. // Test that setting an axis value of -1 works for Softmax opset < 13. TEST_F(QnnHTPBackendTests, UnaryOp_Softmax11_SetValidAxis) { - RunQDQUnaryOpTest(TestInputDef({1, 2, 3}, false, -5.0f, 5.0f), - "Softmax", - {utils::MakeAttribute("axis", static_cast(-1))}, - 11, ExpectedEPNodeAssignment::All); + RunQDQOpTest("Softmax", + {TestInputDef({1, 2, 3}, false, -5.0f, 5.0f)}, + {utils::MakeAttribute("axis", static_cast(-1))}, + 11, + ExpectedEPNodeAssignment::All); +} + +// Check that QNN compiles DQ -> LogSoftmax -> Q as a single unit. +// Test that the default axis (-1) for LogSoftmax opset 13 works. +TEST_F(QnnHTPBackendTests, UnaryOp_LogSoftmax13_DefaultAxis) { + std::vector input_data = GetFloatDataInRange(-5.0f, 5.0f, 6); + RunQDQOpTest("LogSoftmax", + {TestInputDef({1, 2, 3}, false, input_data)}, + {}, // Uses default axis of -1 for opset 13 + 13, + ExpectedEPNodeAssignment::All); +} + +// Check that QNN compiles DQ -> LogSoftmax -> Q as a single unit. +// Test that an axis != -1 is not supported. +TEST_F(QnnHTPBackendTests, UnaryOp_LogSoftmax13_UnsupportedAxis) { + std::vector input_data = GetFloatDataInRange(-5.0f, 5.0f, 6); + RunQDQOpTest("LogSoftmax", + {TestInputDef({1, 2, 3}, false, input_data)}, + {utils::MakeAttribute("axis", static_cast(1))}, + 13, + ExpectedEPNodeAssignment::None); +} + +// Check that QNN compiles DQ -> LogSoftmax -> Q as a single unit. +// Test that the default axis (1) for LogSoftmax opset < 13 does not work. +TEST_F(QnnHTPBackendTests, UnaryOp_LogSoftmax11_DefaultAxisFails) { + std::vector input_data = GetFloatDataInRange(-5.0f, 5.0f, 6); + RunQDQOpTest("LogSoftmax", + {TestInputDef({1, 2, 3}, false, input_data)}, + {}, // Uses default axis of 1 for opset < 13. + 11, + ExpectedEPNodeAssignment::None); +} + +// Check that QNN compiles DQ -> LogSoftmax -> Q as a single unit. +// Test that setting an axis value of -1 works for LogSoftmax opset < 13. +TEST_F(QnnHTPBackendTests, UnaryOp_LogSoftmax11_SetValidAxis) { + std::vector input_data = GetFloatDataInRange(-5.0f, 5.0f, 6); + RunQDQOpTest("LogSoftmax", + {TestInputDef({1, 2, 3}, false, input_data)}, + {utils::MakeAttribute("axis", static_cast(-1))}, + 11, + ExpectedEPNodeAssignment::All); } // Test QDQ Abs op. 
TEST_F(QnnHTPBackendTests, UnaryOp_Abs) { - RunQDQUnaryOpTest(TestInputDef({1, 2, 3}, false, -10.0f, 10.0f), - "Abs", - {}, - 13, ExpectedEPNodeAssignment::All); + RunQDQOpTest("Abs", + {TestInputDef({1, 2, 3}, false, GetFloatDataInRange(-10.0f, 10.0f, 6))}, + {}, + 13, + ExpectedEPNodeAssignment::All); } // Test QDQ Ceil op. TEST_F(QnnHTPBackendTests, UnaryOp_Ceil) { - RunQDQUnaryOpTest(TestInputDef({1, 2, 3}, false, -100.0f, 100.0f), - "Ceil", - {}, - 13, ExpectedEPNodeAssignment::All); + const std::vector input_data = GetFloatDataInRange(-12.0f, 12.0f, 6); + RunQDQOpTest("Ceil", + {TestInputDef({1, 2, 3}, false, input_data)}, + {}, + 13, + ExpectedEPNodeAssignment::All); +} + +// Test QDQ Floor op. +TEST_F(QnnHTPBackendTests, UnaryOp_Floor) { + const std::vector input_data = GetFloatDataInRange(-12.0f, 12.0f, 6); + RunQDQOpTest("Floor", + {TestInputDef({1, 2, 3}, false, input_data)}, + {}, + 13, + ExpectedEPNodeAssignment::All); } // Test QDQ DepthToSpace. @@ -348,11 +449,12 @@ TEST_F(QnnHTPBackendTests, DepthToSpaceOp_CRD) { 21., 22., 23., 27., 28., 29., 30., 31., 32.}; - RunQDQUnaryOpTest(TestInputDef({1, 4, 2, 3}, false, X), - "DepthToSpace", - {utils::MakeAttribute("blocksize", static_cast(2)), - utils::MakeAttribute("mode", "CRD")}, - 11, ExpectedEPNodeAssignment::All); + RunQDQOpTest("DepthToSpace", + {TestInputDef({1, 4, 2, 3}, false, X)}, + {utils::MakeAttribute("blocksize", static_cast(2)), + utils::MakeAttribute("mode", "CRD")}, + 11, + ExpectedEPNodeAssignment::All); } // Test QDQ DepthToSpace. @@ -365,11 +467,12 @@ TEST_F(QnnHTPBackendTests, DepthToSpaceOp_DCR) { 21., 22., 23., 27., 28., 29., 30., 31., 32.}; - RunQDQUnaryOpTest(TestInputDef({1, 4, 2, 3}, false, X), - "DepthToSpace", - {utils::MakeAttribute("blocksize", static_cast(2)), - utils::MakeAttribute("mode", "DCR")}, - 11, ExpectedEPNodeAssignment::All); + RunQDQOpTest("DepthToSpace", + {TestInputDef({1, 4, 2, 3}, false, X)}, + {utils::MakeAttribute("blocksize", static_cast(2)), + utils::MakeAttribute("mode", "DCR")}, + 11, + ExpectedEPNodeAssignment::All); } // Test QDQ SpaceToDepth. @@ -379,10 +482,11 @@ TEST_F(QnnHTPBackendTests, SpaceToDepthOp) { 2.0f, 2.1f, 2.2f, 2.3f, 3.0f, 3.1f, 3.2f, 3.3f}; - RunQDQUnaryOpTest(TestInputDef({1, 2, 2, 4}, false, X), - "SpaceToDepth", - {utils::MakeAttribute("blocksize", static_cast(2))}, - 11, ExpectedEPNodeAssignment::All); + RunQDQOpTest("SpaceToDepth", + {TestInputDef({1, 2, 2, 4}, false, X)}, + {utils::MakeAttribute("blocksize", static_cast(2))}, + 11, + ExpectedEPNodeAssignment::All); } // Run QDQ model on HTP twice @@ -404,23 +508,21 @@ TEST_F(QnnHTPBackendTests, ContextBinaryCacheTest) { // Runs model with DQ-> Atan-> Q and compares the outputs of the CPU and QNN EPs. 
// 1st run will generate the Qnn context cache binary file - TestQDQModelAccuracy(BuildUnaryOpTestCase(op_type, input_def, {}), - BuildQDQUnaryOpTestCase(input_def, op_type, {}), + TestQDQModelAccuracy(BuildOpTestCase(op_type, {input_def}, {}), + BuildQDQOpTestCase(op_type, {input_def}, {}), provider_options, 14, - ExpectedEPNodeAssignment::All, - 1e-5f); + ExpectedEPNodeAssignment::All); // Make sure the Qnn context cache binary file is generated EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str())); // 2nd run will load and run from Qnn context cache binary file - TestQDQModelAccuracy(BuildUnaryOpTestCase(op_type, input_def, {}), - BuildQDQUnaryOpTestCase(input_def, op_type, {}), + TestQDQModelAccuracy(BuildOpTestCase(op_type, {input_def}, {}), + BuildQDQOpTestCase(op_type, {input_def}, {}), provider_options, 14, - ExpectedEPNodeAssignment::All, - 1e-5f); + ExpectedEPNodeAssignment::All); } TEST_F(QnnHTPBackendTests, QuantAccuracyTest) { @@ -439,7 +541,7 @@ TEST_F(QnnHTPBackendTests, QuantAccuracyTest) { // input -> Q -> Transpose -> DQ -> output NodeArg* input0 = MakeTestInput(builder, input0_def); - QuantParams qparams = GetTestInputQuantParams(input0_def); + QuantParams qparams = GetTestInputQuantParams(input0_def); auto* quant_input = builder.MakeIntermediate(); builder.AddQuantizeLinearNode(input0, qparams.scale, qparams.zero_point, quant_input); @@ -462,8 +564,8 @@ TEST_F(QnnHTPBackendTests, QuantAccuracyTest) { // Test QDQ Add TEST_F(QnnHTPBackendTests, BinaryOp_Add4D) { RunQDQOpTest("Add", - TestInputDef({1, 2, 2, 2}, false, -10.0f, 10.0f), - TestInputDef({1, 2, 2, 2}, false, -10.0f, 10.0f), + {TestInputDef({1, 2, 2, 2}, false, -10.0f, 10.0f), + TestInputDef({1, 2, 2, 2}, false, -10.0f, 10.0f)}, {}, 17, ExpectedEPNodeAssignment::All); @@ -472,8 +574,8 @@ TEST_F(QnnHTPBackendTests, BinaryOp_Add4D) { // Test QDQ Sub TEST_F(QnnHTPBackendTests, BinaryOp_Sub4D) { RunQDQOpTest("Sub", - TestInputDef({1, 3, 8, 8}, false, -10.0f, 10.0f), - TestInputDef({1, 3, 8, 8}, false, -10.0f, 10.0f), + {TestInputDef({1, 3, 8, 8}, false, -10.0f, 10.0f), + TestInputDef({1, 3, 8, 8}, false, -10.0f, 10.0f)}, {}, 17, ExpectedEPNodeAssignment::All); @@ -481,8 +583,8 @@ TEST_F(QnnHTPBackendTests, BinaryOp_Sub4D) { TEST_F(QnnHTPBackendTests, BinaryOp_Sub4D_LargeInputs) { RunQDQOpTest("Sub", - TestInputDef({1, 3, 768, 1152}, false, -1.0f, 1.0f), - TestInputDef({1, 3, 768, 1152}, false, -1.0f, 1.0f), + {TestInputDef({1, 3, 768, 1152}, false, -1.0f, 1.0f), + TestInputDef({1, 3, 768, 1152}, false, -1.0f, 1.0f)}, {}, 17, ExpectedEPNodeAssignment::All); @@ -490,17 +592,65 @@ TEST_F(QnnHTPBackendTests, BinaryOp_Sub4D_LargeInputs) { TEST_F(QnnHTPBackendTests, BinaryOp_Sub4D_Broadcast) { RunQDQOpTest("Sub", - TestInputDef({1, 3, 768, 1152}, false, -1.0f, 1.0f), - TestInputDef({3, 1, 1}, true, {1.0f, 0.5f, -0.3f}), + {TestInputDef({1, 3, 768, 1152}, false, -1.0f, 1.0f), + TestInputDef({3, 1, 1}, true, {1.0f, 0.5f, -0.3f})}, {}, 17, ExpectedEPNodeAssignment::All); } +// Test accuracy of QDQ Pow +#if defined(__linux__) +// TODO: This fails on Linux (HTP emulation). Works on Windows ARM64. +// Inaccuracy detected for output 'output', element 0. +// Output quant params: scale=0.051073111593723297, zero_point=2. 
+// Expected val: 0.0099999997764825821 +// QNN QDQ val: 12.921497344970703 (err 12.911497116088867) +// CPU QDQ val: -0.10214622318744659 (err 0.11214622110128403) +TEST_F(QnnHTPBackendTests, DISABLED_BinaryOp_Pow) { +#else +TEST_F(QnnHTPBackendTests, BinaryOp_Pow) { +#endif + std::vector bases_input = {-10.0f, -8.0f, -6.0f, 1.0f, 2.0f, 3.0f, 5.5f, 10.0f}; + std::vector exponents_input = {-2.0f, -1.0f, 0.0f, 0.5f, 1.0f, 2.0f, 1.5f, 0.2f}; + RunQDQOpTest("Pow", + {TestInputDef({1, 2, 2, 2}, false, bases_input), + TestInputDef({1, 2, 2, 2}, false, exponents_input)}, + {}, + 15, + ExpectedEPNodeAssignment::All); +} + +// Test accuracy of QDQ PRelu with dynamic slopes. +TEST_F(QnnHTPBackendTests, BinaryOp_PRelu_DynamicSlopes) { + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 8); + std::vector slopes_data = GetFloatDataInRange(-1.0f, 1.0f, 8); + RunQDQOpTest("PRelu", + {TestInputDef({1, 2, 2, 2}, false, input_data), + TestInputDef({1, 2, 2, 2}, false, slopes_data)}, + {}, + 16, + ExpectedEPNodeAssignment::All); +} + +// Test accuracy of QDQ PRelu with static slope weights. +TEST_F(QnnHTPBackendTests, BinaryOp_PRelu_StaticSlopes) { + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 8); + std::vector slopes_data = GetFloatDataInRange(-1.0f, 1.0f, 8); + RunQDQOpTest("PRelu", + {TestInputDef({1, 2, 2, 2}, false, input_data), + TestInputDef({1, 2, 2, 2}, true, slopes_data)}, + {}, + 16, + ExpectedEPNodeAssignment::All); +} + TEST_F(QnnHTPBackendTests, BinaryOp_Div4D_SmallInputs) { + std::vector input0_data = {-10.0f, -8.0f, -1.0f, 0.0f, 1.0f, 2.1f, 8.0f, 10.0f}; + std::vector input1_data = {5.0f, 4.0f, 1.0f, 1.0f, 1.0f, 4.0f, 4.0f, 5.0f}; RunQDQOpTest("Div", - TestInputDef({1, 2, 2, 2}, false, {-10.0f, -8.0f, -1.0f, 0.0f, 1.0f, 2.1f, 8.0f, 10.0f}), - TestInputDef({1, 2, 2, 2}, false, {5.0f, 4.0f, 1.0f, 1.0f, 1.0f, 4.0f, 4.0f, 5.0f}), + {TestInputDef({1, 2, 2, 2}, false, input0_data), + TestInputDef({1, 2, 2, 2}, false, input1_data)}, {}, 17, ExpectedEPNodeAssignment::All); @@ -514,8 +664,8 @@ TEST_F(QnnHTPBackendTests, BinaryOp_Div4D_SmallInputs) { // CPU QDQ val: -516716.71875 (err 238759.40625) TEST_F(QnnHTPBackendTests, DISABLED_BinaryOp_Div4D_LargeInputs) { RunQDQOpTest("Div", - TestInputDef({1, 3, 768, 1152}, false, -1.0f, 1.0f), - TestInputDef({1, 3, 768, 1152}, false, -1.0f, 1.0f), + {TestInputDef({1, 3, 768, 1152}, false, -1.0f, 1.0f), + TestInputDef({1, 3, 768, 1152}, false, -1.0f, 1.0f)}, {}, 17, ExpectedEPNodeAssignment::All); @@ -523,8 +673,8 @@ TEST_F(QnnHTPBackendTests, DISABLED_BinaryOp_Div4D_LargeInputs) { TEST_F(QnnHTPBackendTests, BinaryOp_Div4D_Broadcast) { RunQDQOpTest("Div", - TestInputDef({1, 3, 768, 1152}, false, -1.0f, 1.0f), - TestInputDef({3, 1, 1}, true, {1.0f, 0.5f, -0.3f}), + {TestInputDef({1, 3, 768, 1152}, false, -1.0f, 1.0f), + TestInputDef({3, 1, 1}, true, {1.0f, 0.5f, -0.3f})}, {}, 17, ExpectedEPNodeAssignment::All); @@ -532,29 +682,30 @@ TEST_F(QnnHTPBackendTests, BinaryOp_Div4D_Broadcast) { // Test QDQ Mul TEST_F(QnnHTPBackendTests, BinaryOp_Mul4D) { + std::vector input_data = GetFloatDataInRange(-10.0, 10.0f, 8); RunQDQOpTest("Mul", - TestInputDef({1, 2, 2, 2}, false, -10.0f, 10.0f), - TestInputDef({1, 2, 2, 2}, false, -10.0f, 10.0f), + {TestInputDef({1, 2, 2, 2}, false, input_data), + TestInputDef({1, 2, 2, 2}, false, input_data)}, {}, 17, ExpectedEPNodeAssignment::All); } // Test And -TEST_F(QnnCPUBackendTests, BinaryOp_And4D) { +TEST_F(QnnHTPBackendTests, BinaryOp_And4D) { RunOpTest("And", - TestInputDef({1, 4}, false, {false, false, 
true, true}), - TestInputDef({1, 4}, false, {false, true, false, true}), + {TestInputDef({1, 4}, false, {false, false, true, true}), + TestInputDef({1, 4}, false, {false, true, false, true})}, {}, 17, ExpectedEPNodeAssignment::All); } // Test that Or is not yet supported on CPU backend. -TEST_F(QnnCPUBackendTests, BinaryOp_HTP_Or_Unsupported) { +TEST_F(QnnHTPBackendTests, BinaryOp_HTP_Or_Unsupported) { RunOpTest("Or", - TestInputDef({1, 4}, false, {false, false, true, true}), - TestInputDef({1, 4}, false, {false, true, false, true}), + {TestInputDef({1, 4}, false, {false, false, true, true}), + TestInputDef({1, 4}, false, {false, true, false, true})}, {}, 17, ExpectedEPNodeAssignment::None); @@ -563,8 +714,8 @@ TEST_F(QnnCPUBackendTests, BinaryOp_HTP_Or_Unsupported) { // Test QDQ GridSample with bilinear TEST_F(QnnHTPBackendTests, GridSample_Bilinear) { RunQDQOpTest("GridSample", - TestInputDef({1, 1, 3, 2}, false, -10.0f, 10.0f), - TestInputDef({1, 2, 4, 2}, false, -10.0f, 10.0f), + {TestInputDef({1, 1, 3, 2}, false, GetFloatDataInRange(-10.0f, 10.0f, 6)), + TestInputDef({1, 2, 4, 2}, false, GetFloatDataInRange(-10.0f, 10.0f, 16))}, {utils::MakeAttribute("align_corners", static_cast(0)), utils::MakeAttribute("mode", "bilinear"), utils::MakeAttribute("padding_mode", "zeros")}, @@ -575,8 +726,8 @@ TEST_F(QnnHTPBackendTests, GridSample_Bilinear) { // Test QDQ GridSample with align corners TEST_F(QnnHTPBackendTests, GridSample_AlignCorners) { RunQDQOpTest("GridSample", - TestInputDef({1, 1, 3, 2}, false, -10.0f, 10.0f), - TestInputDef({1, 2, 4, 2}, false, -10.0f, 10.0f), + {TestInputDef({1, 1, 3, 2}, false, GetFloatDataInRange(-10.0f, 10.0f, 6)), + TestInputDef({1, 2, 4, 2}, false, GetFloatDataInRange(-10.0f, 10.0f, 16))}, {utils::MakeAttribute("align_corners", static_cast(1)), utils::MakeAttribute("mode", "bilinear"), utils::MakeAttribute("padding_mode", "zeros")}, @@ -592,8 +743,8 @@ TEST_F(QnnHTPBackendTests, GridSample_AlignCorners) { // CPU QDQ val: 3.3850328922271729 (err 0.022981882095336914) TEST_F(QnnHTPBackendTests, DISABLED_GridSample_BorderPadding) { RunQDQOpTest("GridSample", - TestInputDef({1, 1, 3, 2}, false, -10.0f, 10.0f), - TestInputDef({1, 2, 4, 2}, false, -10.0f, 10.0f), + {TestInputDef({1, 1, 3, 2}, false, -10.0f, 10.0f), + TestInputDef({1, 2, 4, 2}, false, -10.0f, 10.0f)}, {utils::MakeAttribute("mode", "bilinear"), utils::MakeAttribute("padding_mode", "border")}, 17, @@ -603,8 +754,8 @@ TEST_F(QnnHTPBackendTests, DISABLED_GridSample_BorderPadding) { // Test QDQ GridSample with nearest mode TEST_F(QnnHTPBackendTests, GridSample_Nearest) { RunQDQOpTest("GridSample", - TestInputDef({1, 1, 3, 2}, false, -10.0f, 10.0f), - TestInputDef({1, 2, 4, 2}, false, -10.0f, 10.0f), + {TestInputDef({1, 1, 3, 2}, false, GetFloatDataInRange(-10.0f, 10.0f, 6)), + TestInputDef({1, 2, 4, 2}, false, GetFloatDataInRange(-10.0f, 10.0f, 16))}, {utils::MakeAttribute("mode", "nearest")}, 17, ExpectedEPNodeAssignment::All); @@ -618,13 +769,33 @@ TEST_F(QnnHTPBackendTests, GridSample_Nearest) { // CPU QDQ val: 3.2036216259002686 (err 0.0092642307281494141) TEST_F(QnnHTPBackendTests, DISABLED_GridSample_ReflectionPaddingMode) { RunQDQOpTest("GridSample", - TestInputDef({1, 1, 3, 2}, false, -10.0f, 10.0f), - TestInputDef({1, 2, 4, 2}, false, -10.0f, 10.0f), + {TestInputDef({1, 1, 3, 2}, false, -10.0f, 10.0f), + TestInputDef({1, 2, 4, 2}, false, -10.0f, 10.0f)}, {utils::MakeAttribute("padding_mode", "reflection")}, 17, ExpectedEPNodeAssignment::All); } +// Test QDQ Concat: 3 inputs concatenated at the 
last axis. +TEST_F(QnnHTPBackendTests, VariadicOp_Concat_3Inputs_LastAxis) { + RunQDQOpTest("Concat", + {TestInputDef({1, 2, 2, 2}, false, -10.0f, 10.0f), + TestInputDef({1, 2, 2, 3}, false, -1.0f, 1.0f), + TestInputDef({1, 2, 2, 1}, false, -2.0f, 2.0f)}, + {utils::MakeAttribute("axis", static_cast(-1))}, + 13, + ExpectedEPNodeAssignment::All); +} + +// Test QDQ Concat: 2 inputs concatenated at the second axis. +TEST_F(QnnHTPBackendTests, VariadicOp_Concat_2Inputs_2ndAxis) { + RunQDQOpTest("Concat", + {TestInputDef({1, 2, 2, 2}, false, -10.0f, 10.0f), + TestInputDef({1, 3, 2, 2}, false, -2.0f, 2.0f)}, + {utils::MakeAttribute("axis", static_cast(1))}, + 13, + ExpectedEPNodeAssignment::All); +} #endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) } // namespace test diff --git a/onnxruntime/test/providers/qnn/slice_htp_test.cc b/onnxruntime/test/providers/qnn/slice_htp_test.cc index 23d817a69b89b..f7163f04736a5 100644 --- a/onnxruntime/test/providers/qnn/slice_htp_test.cc +++ b/onnxruntime/test/providers/qnn/slice_htp_test.cc @@ -45,7 +45,7 @@ static GetTestQDQModelFn BuildQDQSliceTestCase(const TestInputDef>& output_qparams) { NodeArg* data = MakeTestInput(builder, data_def); - QuantParams data_qparams = GetTestInputQuantParams(data_def); + QuantParams data_qparams = GetTestInputQuantParams(data_def); NodeArg* data_qdq = AddQDQNodePair(builder, data, data_qparams.scale, data_qparams.zero_point); NodeArg* starts = MakeTestInput(builder, starts_def); diff --git a/onnxruntime/test/providers/qnn/transpose_htp_test.cc b/onnxruntime/test/providers/qnn/transpose_htp_test.cc index adc0e7104b136..8d8c1ebb0fd15 100644 --- a/onnxruntime/test/providers/qnn/transpose_htp_test.cc +++ b/onnxruntime/test/providers/qnn/transpose_htp_test.cc @@ -38,7 +38,7 @@ static GetTestQDQModelFn BuildQDQTransposeTestCase(const TestInputDef const std::vector& attrs) { return [input_def, attrs](ModelTestBuilder& builder, std::vector>& output_qparams) { NodeArg* input = MakeTestInput(builder, input_def); - QuantParams input_qparams = GetTestInputQuantParams(input_def); + QuantParams input_qparams = GetTestInputQuantParams(input_def); NodeArg* input_qdq = AddQDQNodePair(builder, input, input_qparams.scale, input_qparams.zero_point); auto* output = builder.MakeIntermediate(); diff --git a/onnxruntime/test/providers/qnn/where_htp_test.cc b/onnxruntime/test/providers/qnn/where_htp_test.cc index 02238dad1c5dd..49f3ef0fd983a 100644 --- a/onnxruntime/test/providers/qnn/where_htp_test.cc +++ b/onnxruntime/test/providers/qnn/where_htp_test.cc @@ -42,12 +42,12 @@ static GetTestQDQModelFn BuildQDQWhereTestCase(const TestInputDef Q => DQ => NodeArg* x = MakeTestInput(builder, x_def); - QuantParams x_qparams = GetTestInputQuantParams(x_def); + QuantParams x_qparams = GetTestInputQuantParams(x_def); NodeArg* x_qdq = AddQDQNodePair(builder, x, x_qparams.scale, x_qparams.zero_point); // y => Q => DQ => NodeArg* y = MakeTestInput(builder, y_def); - QuantParams y_qparams = GetTestInputQuantParams(y_def); + QuantParams y_qparams = GetTestInputQuantParams(y_def); NodeArg* y_qdq = AddQDQNodePair(builder, y, y_qparams.scale, y_qparams.zero_point); // Where operator. 
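The QDQ accuracy tests above feed each float input through a quantize -> dequantize pair whose scale and zero point are derived from the input's value range, then compare the QNN EP output against the CPU EP within fp32_abs_err. As a rough illustration of why a nonzero tolerance is needed, the NumPy sketch below runs that uint8 round trip for the same [-10, 10] range used by many of the tests; the helper name and exact rounding here are illustrative assumptions, not the ORT GetTestInputQuantParams / TestQDQModelAccuracy implementations.

import numpy as np

def qdq_roundtrip_uint8(x):
    # Illustrative asymmetric uint8 quantization over a range forced to include zero.
    qmin, qmax = 0, 255
    rmin, rmax = min(float(x.min()), 0.0), max(float(x.max()), 0.0)
    scale = (rmax - rmin) / (qmax - qmin)
    zero_point = int(round(qmin - rmin / scale))
    # Quantize, clamp to the uint8 grid, then dequantize back to float.
    q = np.clip(np.rint(x / scale) + zero_point, qmin, qmax).astype(np.uint8)
    dq = (q.astype(np.float32) - zero_point) * scale
    return scale, zero_point, dq

x = np.linspace(-10.0, 10.0, 6, dtype=np.float32)  # same range as several tests above
scale, zp, dq = qdq_roundtrip_uint8(x)
print(scale, zp, float(np.max(np.abs(dq - x))))    # round-trip error is on the order of scale / 2

A single Q->DQ pair introduces an error of roughly half the quantization step, which is the kind of backend-to-backend difference these tests are written to tolerate (and why some ops widen fp32_abs_err or are disabled with recorded inaccuracy numbers).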
diff --git a/onnxruntime/test/python/onnxruntime_test_float8.py b/onnxruntime/test/python/onnxruntime_test_float8.py index 3f3180230f853..76ca5d9538374 100644 --- a/onnxruntime/test/python/onnxruntime_test_float8.py +++ b/onnxruntime/test/python/onnxruntime_test_float8.py @@ -8,9 +8,11 @@ import unittest import numpy as np +import packaging.version as pv import parameterized from numpy.testing import assert_allclose from onnx import TensorProto +from onnx import __version__ as onnx_version from onnx.checker import check_model from onnx.helper import make_graph, make_model, make_node, make_opsetid, make_tensor, make_tensor_value_info from onnx.reference import ReferenceEvaluator @@ -37,7 +39,7 @@ class TestInferenceSession(unittest.TestCase): `_. """ - dtypes = frozenset({"FLOAT": np.float32, "FLOAT16": np.float16}) + dtypes = {"FLOAT": np.float32, "FLOAT16": np.float16} # noqa: RUF012 x = np.array( [0.4068359375, 352, 416, 336, 304, 272, -248, -100, 1e-4, 1e-2, 416, 432, 1e5, np.inf, -np.inf, np.nan], dtype=np.float32, @@ -76,7 +78,7 @@ class TestInferenceSession(unittest.TestCase): 240.0, 240.0, -240.0, - -104.0, + -96.0, 0.0, 0.009765625, 240.0, @@ -113,7 +115,7 @@ class TestInferenceSession(unittest.TestCase): [ 0.4375, 384.0, - 448.0, + 384.0, 320.0, 320.0, 256.0, @@ -121,7 +123,7 @@ class TestInferenceSession(unittest.TestCase): -96.0, 0.0001068115234375, 0.009765625, - 448.0, + 384.0, 448.0, 57344.0, 57344.0, @@ -167,7 +169,7 @@ class TestInferenceSession(unittest.TestCase): np.nan, np.nan, np.nan, - -104.0, + -96.0, 0.0, 0.009765625, np.nan, @@ -204,7 +206,7 @@ class TestInferenceSession(unittest.TestCase): [ 0.4375, 384.0, - 448.0, + 384.0, 320.0, 320.0, 256.0, @@ -212,7 +214,7 @@ class TestInferenceSession(unittest.TestCase): -96.0, 0.0001068115234375, 0.009765625, - 448.0, + 384.0, 448.0, np.nan, np.nan, @@ -245,6 +247,7 @@ def model_cast_cast_f16_float(self, to, saturate, rev=False): check_model(onnx_model) return onnx_model + @unittest.skipIf(pv.Version(onnx_version) < pv.Version("1.15.0"), reason="needs onnx>=1.15.0") @parameterized.parameterized.expand( [ ("FLOAT8E4M3FN", "FLOAT", 1), @@ -429,6 +432,7 @@ def model_qdq(self, to, float_name, saturate, castq=False, castdq=False, like=Fa check_model(onnx_model) return onnx_model + @unittest.skipIf(pv.Version(onnx_version) < pv.Version("1.15.0"), reason="needs onnx>=1.15.0") @parameterized.parameterized.expand( [ ("FLOAT8E4M3FN", "FLOAT", 1), @@ -689,6 +693,18 @@ def test_model_qdq_cuda_ortvalue(self, name: str, float_name: str, saturate: int self.assertEqual(expect.shape, y.shape) self.assertEqual(expect.dtype, y.dtype) + @unittest.skipIf("CUDAExecutionProvider" not in available_providers, reason="Not running on CUDA.") + def test_compare_cpu_cuda_e4m3fn(self): + folder = os.path.join(os.path.dirname(__file__), "..", "testdata", "float8") + model = os.path.join(folder, "te.cast_fp8_1_fp32.onnx") + data = np.load(os.path.join(folder, "te.cast_fp8_1_fp32_input.npy")) + + sess_cpu = onnxruntime.InferenceSession(model, providers=["CPUExecutionProvider"]) + sess_cuda = onnxruntime.InferenceSession(model, providers=["CUDAExecutionProvider"]) + cpu_res = sess_cpu.run(None, {"input": data})[0] + cuda_res = sess_cuda.run(None, {"input": data})[0] + self.assertEqual(cuda_res.tolist(), cpu_res.tolist()) + if __name__ == "__main__": unittest.main(verbosity=2) diff --git a/onnxruntime/test/python/onnxruntime_test_python_iobinding.py b/onnxruntime/test/python/onnxruntime_test_python_iobinding.py index 8009d97ba34ce..56417f13fbea4 100644 --- 
a/onnxruntime/test/python/onnxruntime_test_python_iobinding.py +++ b/onnxruntime/test/python/onnxruntime_test_python_iobinding.py @@ -16,40 +16,43 @@ from onnxruntime.capi._pybind_state import OrtValue as C_OrtValue from onnxruntime.capi._pybind_state import OrtValueVector, SessionIOBinding +test_params = [ + ("cuda", "CUDAExecutionProvider", C_OrtDevice.cuda), + ("dml", "DmlExecutionProvider", C_OrtDevice.dml), +] + class TestIOBinding(unittest.TestCase): - def create_ortvalue_input_on_gpu(self): + def _create_ortvalue_input_on_gpu(self, device): return onnxrt.OrtValue.ortvalue_from_numpy( - np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32), "cuda", 0 + np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32), device, 0 ) - def create_ortvalue_alternate_input_on_gpu(self): + def _create_ortvalue_alternate_input_on_gpu(self, device): return onnxrt.OrtValue.ortvalue_from_numpy( np.array([[2.0, 4.0], [6.0, 8.0], [10.0, 12.0]], dtype=np.float32), - "cuda", + device, 0, ) - def create_uninitialized_ortvalue_input_on_gpu(self): - return onnxrt.OrtValue.ortvalue_from_shape_and_type([3, 2], np.float32, "cuda", 0) + def _create_uninitialized_ortvalue_input_on_gpu(self, device): + return onnxrt.OrtValue.ortvalue_from_shape_and_type([3, 2], np.float32, device, 0) - def create_numpy_input(self): + def _create_numpy_input(self): return np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) - def create_expected_output(self): + def _create_expected_output(self): return np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - def create_expected_output_alternate(self): + def _create_expected_output_alternate(self): return np.array([[2.0, 8.0], [18.0, 32.0], [50.0, 72.0]], dtype=np.float32) def test_bind_input_to_cpu_arr(self): - self.create_numpy_input() - session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) io_binding = session.io_binding() # Bind Numpy object (input) that's on CPU to wherever the model needs it - io_binding.bind_cpu_input("X", self.create_numpy_input()) + io_binding.bind_cpu_input("X", self._create_numpy_input()) # Bind output to CPU io_binding.bind_output("Y") @@ -57,254 +60,280 @@ def test_bind_input_to_cpu_arr(self): # Invoke Run session.run_with_iobinding(io_binding) - # Sync if different CUDA streams + # Sync if different streams io_binding.synchronize_outputs() - # Get outputs over to CPU (the outputs which were bound to CUDA will get copied over to the host here) + # Get outputs over to CPU (the outputs which were bound to the GPU will get copied over to the host here) ort_output = io_binding.copy_outputs_to_cpu()[0] # Validate results - self.assertTrue(np.array_equal(self.create_expected_output(), ort_output)) + self.assertTrue(np.array_equal(self._create_expected_output(), ort_output)) - @unittest.skip("Could not find an implementation for Identity(19) node with name ''") def test_bind_input_types(self): - opset = onnx_opset_version() - devices = [ - ( - C_OrtDevice(C_OrtDevice.cpu(), C_OrtDevice.default_memory(), 0), - ["CPUExecutionProvider"], - ) - ] - if "CUDAExecutionProvider" in onnxrt.get_all_providers(): - devices.append( - ( - C_OrtDevice(C_OrtDevice.cuda(), C_OrtDevice.default_memory(), 0), - ["CUDAExecutionProvider"], - ) - ) - - for device, provider in devices: - for dtype in [ - np.float32, - np.float64, - np.int32, - np.uint32, - np.int64, - np.uint64, - np.int16, - np.uint16, - np.int8, - np.uint8, - np.float16, - np.bool_, - ]: - with self.subTest(dtype=dtype, 
device=str(device)): - x = np.arange(8).reshape((-1, 2)).astype(dtype) - proto_dtype = NP_TYPE_TO_TENSOR_TYPE[x.dtype] - - X = helper.make_tensor_value_info("X", proto_dtype, [None, x.shape[1]]) # noqa: N806 - Y = helper.make_tensor_value_info("Y", proto_dtype, [None, x.shape[1]]) # noqa: N806 - - # inference - node_add = helper.make_node("Identity", ["X"], ["Y"]) - - # graph - graph_def = helper.make_graph([node_add], "lr", [X], [Y], []) - model_def = helper.make_model( - graph_def, - producer_name="dummy", - ir_version=7, - producer_version="0", - opset_imports=[helper.make_operatorsetid("", opset)], - ) - - sess = onnxrt.InferenceSession(model_def.SerializeToString(), providers=provider) - - bind = SessionIOBinding(sess._sess) - ort_value = C_OrtValue.ortvalue_from_numpy(x, device) - bind.bind_ortvalue_input("X", ort_value) - bind.bind_output("Y", device) - sess._sess.run_with_iobinding(bind, None) - ortvaluevector = bind.get_outputs() - self.assertIsInstance(ortvaluevector, OrtValueVector) - ortvalue = bind.get_outputs()[0] - y = ortvalue.numpy() - assert_almost_equal(x, y) - - bind = SessionIOBinding(sess._sess) - bind.bind_input("X", device, dtype, x.shape, ort_value.data_ptr()) - bind.bind_output("Y", device) - sess._sess.run_with_iobinding(bind, None) - ortvalue = bind.get_outputs()[0] - y = ortvalue.numpy() - assert_almost_equal(x, y) + for device, execution_provider, generate_device in test_params: + with self.subTest(execution_provider): + if execution_provider not in onnxrt.get_available_providers(): + self.skipTest(f"Skipping on {device.upper()}.") + + opset = onnx_opset_version() + devices = [ + ( + C_OrtDevice(C_OrtDevice.cpu(), C_OrtDevice.default_memory(), 0), + ["CPUExecutionProvider"], + ), + ( + C_OrtDevice(generate_device(), C_OrtDevice.default_memory(), 0), + [execution_provider], + ), + ] + + for inner_device, provider in devices: + for dtype in [ + np.float32, + np.float64, + np.int32, + np.uint32, + np.int64, + np.uint64, + np.int16, + np.uint16, + np.int8, + np.uint8, + np.float16, + np.bool_, + ]: + with self.subTest(dtype=dtype, inner_device=str(inner_device)): + x = np.arange(8).reshape((-1, 2)).astype(dtype) + proto_dtype = NP_TYPE_TO_TENSOR_TYPE[x.dtype] + + X = helper.make_tensor_value_info("X", proto_dtype, [None, x.shape[1]]) # noqa: N806 + Y = helper.make_tensor_value_info("Y", proto_dtype, [None, x.shape[1]]) # noqa: N806 + + # inference + node_add = helper.make_node("Identity", ["X"], ["Y"]) + + # graph + graph_def = helper.make_graph([node_add], "lr", [X], [Y], []) + model_def = helper.make_model( + graph_def, + producer_name="dummy", + ir_version=7, + producer_version="0", + opset_imports=[helper.make_operatorsetid("", opset)], + ) + + sess = onnxrt.InferenceSession(model_def.SerializeToString(), providers=provider) + + bind = SessionIOBinding(sess._sess) + ort_value = C_OrtValue.ortvalue_from_numpy(x, inner_device) + bind.bind_ortvalue_input("X", ort_value) + bind.bind_output("Y", inner_device) + sess._sess.run_with_iobinding(bind, None) + ortvaluevector = bind.get_outputs() + self.assertIsInstance(ortvaluevector, OrtValueVector) + ortvalue = bind.get_outputs()[0] + y = ortvalue.numpy() + assert_almost_equal(x, y) + + bind = SessionIOBinding(sess._sess) + bind.bind_input("X", inner_device, dtype, x.shape, ort_value.data_ptr()) + bind.bind_output("Y", inner_device) + sess._sess.run_with_iobinding(bind, None) + ortvalue = bind.get_outputs()[0] + y = ortvalue.numpy() + assert_almost_equal(x, y) def test_bind_input_only(self): - input = 
self.create_ortvalue_input_on_gpu() + for device, execution_provider, _ in test_params: + with self.subTest(execution_provider): + if execution_provider not in onnxrt.get_available_providers(): + self.skipTest(f"Skipping on {device.upper()}.") + input = self._create_ortvalue_input_on_gpu(device) - session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) - io_binding = session.io_binding() + session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) + io_binding = session.io_binding() - # Bind input to CUDA - io_binding.bind_input("X", "cuda", 0, np.float32, [3, 2], input.data_ptr()) + # Bind input to the GPU + io_binding.bind_input("X", device, 0, np.float32, [3, 2], input.data_ptr()) - # Sync if different CUDA streams - io_binding.synchronize_inputs() + # Sync if different streams + io_binding.synchronize_inputs() - # Bind output to CPU - io_binding.bind_output("Y") + # Bind output to CPU + io_binding.bind_output("Y") - # Invoke Run - session.run_with_iobinding(io_binding) + # Invoke Run + session.run_with_iobinding(io_binding) - # Sync if different CUDA streams - io_binding.synchronize_outputs() + # Sync if different streams + io_binding.synchronize_outputs() - # Get outputs over to CPU (the outputs which were bound to CUDA will get copied over to the host here) - ort_output = io_binding.copy_outputs_to_cpu()[0] + # Get outputs over to CPU (the outputs which were bound to the GPU will get copied over to the host + # here) + ort_output = io_binding.copy_outputs_to_cpu()[0] - # Validate results - self.assertTrue(np.array_equal(self.create_expected_output(), ort_output)) + # Validate results + self.assertTrue(np.array_equal(self._create_expected_output(), ort_output)) def test_bind_input_and_preallocated_output(self): - input = self.create_ortvalue_input_on_gpu() + for device, execution_provider, _ in test_params: + with self.subTest(execution_provider): + if execution_provider not in onnxrt.get_available_providers(): + self.skipTest(f"Skipping on {device.upper()}.") - session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) - io_binding = session.io_binding() - - # Bind input to CUDA - io_binding.bind_input("X", "cuda", 0, np.float32, [3, 2], input.data_ptr()) - - # Bind output to CUDA - output = self.create_uninitialized_ortvalue_input_on_gpu() - io_binding.bind_output("Y", "cuda", 0, np.float32, [3, 2], output.data_ptr()) - - # Sync if different CUDA streams - io_binding.synchronize_inputs() - - # Invoke Run - session.run_with_iobinding(io_binding) + input = self._create_ortvalue_input_on_gpu(device) - # Sync if different CUDA streams - io_binding.synchronize_outputs() + session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) + io_binding = session.io_binding() - # Get outputs over to CPU (the outputs which were bound to CUDA will get copied over to the host here) - ort_output_vals = io_binding.copy_outputs_to_cpu()[0] - # Validate results - self.assertTrue(np.array_equal(self.create_expected_output(), ort_output_vals)) + # Bind input to the GPU + io_binding.bind_input("X", device, 0, np.float32, [3, 2], input.data_ptr()) - # Validate if ORT actually wrote to pre-allocated buffer by copying the Torch allocated buffer - # to the host and validating its contents - ort_output_vals_in_cpu = output.numpy() - # Validate results - self.assertTrue(np.array_equal(self.create_expected_output(), ort_output_vals_in_cpu)) + 
# Bind output to the GPU + output = self._create_uninitialized_ortvalue_input_on_gpu(device) + io_binding.bind_output("Y", device, 0, np.float32, [3, 2], output.data_ptr()) - def test_bind_input_and_non_preallocated_output(self): - session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) - io_binding = session.io_binding() + # Sync if different streams + io_binding.synchronize_inputs() - # Bind input to CUDA - io_binding.bind_input( - "X", - "cuda", - 0, - np.float32, - [3, 2], - self.create_ortvalue_input_on_gpu().data_ptr(), - ) + # Invoke Run + session.run_with_iobinding(io_binding) - # Bind output to CUDA - io_binding.bind_output("Y", "cuda") + # Sync if different streams + io_binding.synchronize_outputs() - # Sync if different CUDA streams - io_binding.synchronize_inputs() + # Get outputs over to CPU (the outputs which were bound to the GPU will get copied over to the host + # here) + ort_output_vals = io_binding.copy_outputs_to_cpu()[0] + # Validate results + self.assertTrue(np.array_equal(self._create_expected_output(), ort_output_vals)) - # Invoke Run - session.run_with_iobinding(io_binding) + # Validate if ORT actually wrote to pre-allocated buffer by copying the allocated buffer + # to the host and validating its contents + ort_output_vals_in_cpu = output.numpy() + # Validate results + self.assertTrue(np.array_equal(self._create_expected_output(), ort_output_vals_in_cpu)) - # Sync if different CUDA streams - io_binding.synchronize_outputs() + def test_bind_input_and_non_preallocated_output(self): + for device, execution_provider, _ in test_params: + with self.subTest(execution_provider): + if execution_provider not in onnxrt.get_available_providers(): + self.skipTest(f"Skipping on {device.upper()}.") + + session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) + io_binding = session.io_binding() + + input = self._create_ortvalue_input_on_gpu(device) + + # Bind input to the GPU + io_binding.bind_input( + "X", + device, + 0, + np.float32, + [3, 2], + input.data_ptr(), + ) - # This call returns an OrtValue which has data allocated by ORT on CUDA - ort_outputs = io_binding.get_outputs() - self.assertEqual(len(ort_outputs), 1) - self.assertEqual(ort_outputs[0].device_name(), "cuda") - # Validate results (by copying results to CPU by creating a Numpy object) - self.assertTrue(np.array_equal(self.create_expected_output(), ort_outputs[0].numpy())) - - # We should be able to repeat the above process as many times as we want - try once more - ort_outputs = io_binding.get_outputs() - self.assertEqual(len(ort_outputs), 1) - self.assertEqual(ort_outputs[0].device_name(), "cuda") - # Validate results (by copying results to CPU by creating a Numpy object) - self.assertTrue(np.array_equal(self.create_expected_output(), ort_outputs[0].numpy())) - - # Change the bound input and validate the results in the same bound OrtValue - # Bind alternate input to CUDA - io_binding.bind_input( - "X", - "cuda", - 0, - np.float32, - [3, 2], - self.create_ortvalue_alternate_input_on_gpu().data_ptr(), - ) + # Bind output to the GPU + io_binding.bind_output("Y", device) + + # Sync if different streams + io_binding.synchronize_inputs() + + # Invoke Run + session.run_with_iobinding(io_binding) + + # Sync if different streams + io_binding.synchronize_outputs() + + # This call returns an OrtValue which has data allocated by ORT on the GPU + ort_outputs = io_binding.get_outputs() + self.assertEqual(len(ort_outputs), 1) + 
self.assertEqual(ort_outputs[0].device_name(), device) + # Validate results (by copying results to CPU by creating a Numpy object) + self.assertTrue(np.array_equal(self._create_expected_output(), ort_outputs[0].numpy())) + + # We should be able to repeat the above process as many times as we want - try once more + ort_outputs = io_binding.get_outputs() + self.assertEqual(len(ort_outputs), 1) + self.assertEqual(ort_outputs[0].device_name(), device) + # Validate results (by copying results to CPU by creating a Numpy object) + self.assertTrue(np.array_equal(self._create_expected_output(), ort_outputs[0].numpy())) + + input = self._create_ortvalue_alternate_input_on_gpu(device) + + # Change the bound input and validate the results in the same bound OrtValue + # Bind alternate input to the GPU + io_binding.bind_input( + "X", + device, + 0, + np.float32, + [3, 2], + input.data_ptr(), + ) - # Sync if different CUDA streams - io_binding.synchronize_inputs() + # Sync if different streams + io_binding.synchronize_inputs() - # Invoke Run - session.run_with_iobinding(io_binding) + # Invoke Run + session.run_with_iobinding(io_binding) - # Sync if different CUDA streams - io_binding.synchronize_outputs() + # Sync if different streams + io_binding.synchronize_outputs() - # This call returns an OrtValue which has data allocated by ORT on CUDA - ort_outputs = io_binding.get_outputs() - self.assertEqual(len(ort_outputs), 1) - self.assertEqual(ort_outputs[0].device_name(), "cuda") - # Validate results (by copying results to CPU by creating a Numpy object) - self.assertTrue(np.array_equal(self.create_expected_output_alternate(), ort_outputs[0].numpy())) + # This call returns an OrtValue which has data allocated by ORT on the GPU + ort_outputs = io_binding.get_outputs() + self.assertEqual(len(ort_outputs), 1) + self.assertEqual(ort_outputs[0].device_name(), device) + # Validate results (by copying results to CPU by creating a Numpy object) + self.assertTrue(np.array_equal(self._create_expected_output_alternate(), ort_outputs[0].numpy())) def test_bind_input_and_bind_output_with_ortvalues(self): - session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) - io_binding = session.io_binding() + for device, execution_provider, _ in test_params: + with self.subTest(execution_provider): + if execution_provider not in onnxrt.get_available_providers(): + self.skipTest(f"Skipping on {device.upper()}.") - # Bind ortvalue as input - input_ortvalue = self.create_ortvalue_input_on_gpu() - io_binding.bind_ortvalue_input("X", input_ortvalue) + session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) + io_binding = session.io_binding() - # Bind ortvalue as output - output_ortvalue = self.create_uninitialized_ortvalue_input_on_gpu() - io_binding.bind_ortvalue_output("Y", output_ortvalue) + # Bind ortvalue as input + input_ortvalue = self._create_ortvalue_input_on_gpu(device) + io_binding.bind_ortvalue_input("X", input_ortvalue) - # Sync if different CUDA streams - io_binding.synchronize_inputs() + # Bind ortvalue as output + output_ortvalue = self._create_uninitialized_ortvalue_input_on_gpu(device) + io_binding.bind_ortvalue_output("Y", output_ortvalue) - # Invoke Run - session.run_with_iobinding(io_binding) + # Sync if different streams + io_binding.synchronize_inputs() - # Sync if different CUDA streams - io_binding.synchronize_outputs() + # Invoke Run + session.run_with_iobinding(io_binding) - # Inspect contents of output_ortvalue and 
make sure that it has the right contents - self.assertTrue(np.array_equal(self.create_expected_output(), output_ortvalue.numpy())) + # Sync if different streams + io_binding.synchronize_outputs() - # Bind another ortvalue as input - input_ortvalue_2 = self.create_ortvalue_alternate_input_on_gpu() - io_binding.bind_ortvalue_input("X", input_ortvalue_2) + # Inspect contents of output_ortvalue and make sure that it has the right contents + self.assertTrue(np.array_equal(self._create_expected_output(), output_ortvalue.numpy())) - # Sync if different CUDA streams - io_binding.synchronize_inputs() + # Bind another ortvalue as input + input_ortvalue_2 = self._create_ortvalue_alternate_input_on_gpu(device) + io_binding.bind_ortvalue_input("X", input_ortvalue_2) - # Invoke Run - session.run_with_iobinding(io_binding) + # Sync if different streams + io_binding.synchronize_inputs() - # Sync if different CUDA streams - io_binding.synchronize_outputs() + # Invoke Run + session.run_with_iobinding(io_binding) + + # Sync if different streams + io_binding.synchronize_outputs() - # Inspect contents of output_ortvalue and make sure that it has the right contents - self.assertTrue(np.array_equal(self.create_expected_output_alternate(), output_ortvalue.numpy())) + # Inspect contents of output_ortvalue and make sure that it has the right contents + self.assertTrue(np.array_equal(self._create_expected_output_alternate(), output_ortvalue.numpy())) if __name__ == "__main__": diff --git a/onnxruntime/test/python/transformers/gpt2_model_generator.py b/onnxruntime/test/python/transformers/gpt2_model_generator.py index 6d4d6ea920e42..4a1b48d4d1b48 100644 --- a/onnxruntime/test/python/transformers/gpt2_model_generator.py +++ b/onnxruntime/test/python/transformers/gpt2_model_generator.py @@ -555,6 +555,8 @@ def create_gpt2_embedlayer( num_heads=4, epsilon=0.1, one_attention_node=False, + has_skip_layer_norm=True, + output_embedding_sum=False, ): # Construct input and output nodes inputs = [ @@ -564,21 +566,47 @@ def create_gpt2_embedlayer( helper.make_tensor_value_info("output_0", TensorProto.FLOAT, ["batch_size", "sequence_length", hidden_size]) ] + if output_embedding_sum: + outputs.append( + helper.make_tensor_value_info( + "embedding_sum", TensorProto.FLOAT, ["batch_size", "sequence_length", hidden_size] + ) + ) + # Construct graph nodes embed_layernorm_nodes = [ helper.make_node("Gather", ["word_embeddings_weight", "ids"], ["gather_0_out"], "gather_word_embeddings"), helper.make_node("Gather", ["pos_embeddings_weight", "ids"], ["gather_1_out"], "gather_position_embeddings"), helper.make_node("Add", ["gather_0_out", "gather_1_out"], ["add_0_out"], "add_before_layernorm"), helper.make_node("Gather", ["token_embeddings_weight", "ids"], ["gather_2_out"], "gather_token_embeddings"), - helper.make_node( - "SkipLayerNormalization", - ["add_0_out", "gather_2_out", "layernorm_weight", "layernorm_bias"], - ["skip_layernorm_out"], - "skip_layernorm", - domain="com.microsoft", - epsilon=epsilon, - ), ] + + if has_skip_layer_norm: + embed_layernorm_nodes.append( + helper.make_node( + "SkipLayerNormalization", + ["add_0_out", "gather_2_out", "layernorm_weight", "layernorm_bias"], + ["skip_layernorm_out"] if not output_embedding_sum else ["skip_layernorm_out", "", "", "embedding_sum"], + "skip_layernorm", + domain="com.microsoft", + epsilon=epsilon, + ) + ) + else: + embed_layernorm_nodes.append( + helper.make_node("Add", ["add_0_out", "gather_2_out"], ["embedding_sum"], "embedding_sum") + ) + + embed_layernorm_nodes.append( + 
helper.make_node( + "LayerNormalization", + ["embedding_sum", "layernorm_weight", "layernorm_bias"], + ["skip_layernorm_out"], + "layernorm", + epsilon=epsilon, + ) + ) + attention_nodes = ( [ helper.make_node("MatMul", ["skip_layernorm_out", "q_weight"], ["q_out"], "q_attn"), @@ -708,6 +736,7 @@ def create_gpt2_fused_embedlayer( num_heads=4, epsilon=0.1, one_attention_node=False, + output_embedding_sum=False, ): # Construct input and output nodes inputs = [ @@ -716,6 +745,12 @@ def create_gpt2_fused_embedlayer( outputs = [ helper.make_tensor_value_info("output_0", TensorProto.FLOAT, ["batch_size", "sequence_length", hidden_size]) ] + if output_embedding_sum: + outputs.append( + helper.make_tensor_value_info( + "embedding_sum", TensorProto.FLOAT, ["batch_size", "sequence_length", hidden_size] + ) + ) # Construct graph nodes embed_layernorm_nodes = [ @@ -732,7 +767,9 @@ def create_gpt2_fused_embedlayer( "", "ids", ], - ["EmbedLayerNormalization_0_output", "EmbedLayerNormalization_0_dummy_mask_index"], + ["EmbedLayerNormalization_0_output", "EmbedLayerNormalization_0_dummy_mask_index", "embedding_sum"] + if output_embedding_sum + else ["EmbedLayerNormalization_0_output", "EmbedLayerNormalization_0_dummy_mask_index"], "EmbedLayerNormalization_0", domain="com.microsoft", epsilon=epsilon, @@ -876,3 +913,9 @@ def create_gpt2_fused_embedlayer( model = create_gpt2_fused_embedlayer(one_attention_node=True) onnx.save(model, "./test_data/models/gpt2_embedlayer_one_attn_exp.onnx") + + model = create_gpt2_embedlayer(one_attention_node=True, output_embedding_sum=True) + onnx.save(model, "gpt2_embedlayer_one_attn_output_sum.onnx") + + model = create_gpt2_fused_embedlayer(one_attention_node=True, output_embedding_sum=True) + onnx.save(model, "./test_data/models/gpt2_embedlayer_one_attn_output_sum_exp.onnx") diff --git a/onnxruntime/test/python/transformers/test_attention_fusion.py b/onnxruntime/test/python/transformers/test_attention_fusion.py index 2edc2ec06d631..76d1dcf013321 100644 --- a/onnxruntime/test/python/transformers/test_attention_fusion.py +++ b/onnxruntime/test/python/transformers/test_attention_fusion.py @@ -31,7 +31,18 @@ def verify_fusion(self, optimized_model, expected_model_filename): expected_model = OnnxModel(onnx.load(expected_model_path)) expected_model.topological_sort(is_deterministic=True) - self.assertEqual(str(optimized_model.model.graph), str(expected_model.model.graph)) + nodes = optimized_model.model.graph.node + self.assertEqual(len(nodes), len(expected_model.model.graph.node)) + + for i in range(len(nodes)): + self.assertEqual(nodes[i], expected_model.model.graph.node[i]) + + for expected_initializer in expected_model.model.graph.initializer: + self.assertTrue( + OnnxModel.has_same_value( + optimized_model.get_initializer(expected_initializer.name), expected_initializer + ) + ) def test_multi_head_attention_fusion(self): model = create_bert_attention() diff --git a/onnxruntime/test/python/transformers/test_data/models/gpt2_embedlayer_one_attn_output_sum_exp.onnx b/onnxruntime/test/python/transformers/test_data/models/gpt2_embedlayer_one_attn_output_sum_exp.onnx new file mode 100644 index 0000000000000..853f3f5cf7f2c Binary files /dev/null and b/onnxruntime/test/python/transformers/test_data/models/gpt2_embedlayer_one_attn_output_sum_exp.onnx differ diff --git a/onnxruntime/test/python/transformers/test_embedlayer_fusion.py b/onnxruntime/test/python/transformers/test_embedlayer_fusion.py index 732833e5da27d..ccd367fdbbe01 100644 --- 
a/onnxruntime/test/python/transformers/test_embedlayer_fusion.py +++ b/onnxruntime/test/python/transformers/test_embedlayer_fusion.py @@ -74,6 +74,38 @@ def test_embedlayer_fusion_one_attn_node(self): os.remove(original_model_path) os.remove(optimized_model_path) + def test_embedlayer_fusion_with_embedding_sum_output(self): + model = create_gpt2_embedlayer(one_attention_node=True, output_embedding_sum=True) + path = "." + original_model_path = os.path.join(path, "gpt2_embedlayer_one_attn_output_sum.onnx") + optimized_model_path = os.path.join(path, "gpt2_embedlayer_one_attn_output_sum_opt.onnx") + expected_model_filename = "gpt2_embedlayer_one_attn_output_sum_exp.onnx" + + onnx.save(model, original_model_path) + optimized_model = optimize_model(original_model_path, model_type="gpt2") + optimized_model.save_model_to_file(optimized_model_path, use_external_data_format=True) + + self.verify_fusion(optimized_model, expected_model_filename) + self.verify_parity(optimized_model_path, expected_model_filename) + os.remove(original_model_path) + os.remove(optimized_model_path) + + def test_embedlayer_fusion_with_embedding_sum_output_no_sln(self): + model = create_gpt2_embedlayer(one_attention_node=True, has_skip_layer_norm=False, output_embedding_sum=True) + path = "." + original_model_path = os.path.join(path, "gpt2_embedlayer_one_attn_output_sum_no_sln.onnx") + optimized_model_path = os.path.join(path, "gpt2_embedlayer_one_attn_output_sum_no_sln_opt.onnx") + expected_model_filename = "gpt2_embedlayer_one_attn_output_sum_exp.onnx" + + onnx.save(model, original_model_path) + optimized_model = optimize_model(original_model_path, model_type="gpt2") + optimized_model.save_model_to_file(optimized_model_path, use_external_data_format=True) + + self.verify_fusion(optimized_model, expected_model_filename) + self.verify_parity(optimized_model_path, expected_model_filename) + os.remove(original_model_path) + os.remove(optimized_model_path) + if __name__ == "__main__": unittest.main() diff --git a/onnxruntime/test/python/transformers/test_optimizer_stable_diffusion.py b/onnxruntime/test/python/transformers/test_optimizer_stable_diffusion.py new file mode 100644 index 0000000000000..cde6b56a6648c --- /dev/null +++ b/onnxruntime/test/python/transformers/test_optimizer_stable_diffusion.py @@ -0,0 +1,156 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. 
+# -------------------------------------------------------------------------- +import os +import shutil +import unittest + +import pytest +from parity_utilities import find_transformers_source + +if find_transformers_source(): + from compare_bert_results import run_test + from fusion_options import FusionOptions + from optimizer import optimize_model +else: + from onnxruntime.transformers.compare_bert_results import run_test + from onnxruntime.transformers.fusion_options import FusionOptions + from onnxruntime.transformers.optimizer import optimize_model + +TINY_MODELS = { + "stable-diffusion": "hf-internal-testing/tiny-stable-diffusion-torch", + "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", +} + + +class TestStableDiffusionOptimization(unittest.TestCase): + def verify_node_count(self, onnx_model, expected_node_count, test_name): + for op_type, count in expected_node_count.items(): + if len(onnx_model.get_nodes_by_op_type(op_type)) != count: + print(f"Counters is not expected in test: {test_name}") + for op, counter in expected_node_count.items(): + print(f"{op}: {len(onnx_model.get_nodes_by_op_type(op))} expected={counter}") + + self.assertEqual(len(onnx_model.get_nodes_by_op_type(op_type)), count) + + def verify_clip_optimizer(self, clip_onnx_path, optimized_clip_onnx_path, expected_counters, float16=False): + fusion_options = FusionOptions("clip") + m = optimize_model( + clip_onnx_path, + model_type="clip", + num_heads=0, + hidden_size=0, + opt_level=0, + optimization_options=fusion_options, + use_gpu=True, + ) + self.verify_node_count(m, expected_counters, "test_clip") + + if float16: + m.convert_float_to_float16( + keep_io_types=True, + ) + print(m.get_operator_statistics()) + m.save_model_to_file(optimized_clip_onnx_path) + + threshold = 1e-2 if float16 else 3e-3 + max_abs_diff, passed = run_test( + clip_onnx_path, + optimized_clip_onnx_path, + output_dir=None, + batch_size=1, + sequence_length=77, + use_gpu=True, + test_cases=10, + seed=1, + verbose=False, + rtol=1e-1, + atol=threshold, + input_ids_name="input_ids", + segment_ids_name=None, + input_mask_name=None, + mask_type=0, + ) + + self.assertLess(max_abs_diff, threshold) + self.assertTrue(passed) + + @pytest.mark.slow + def test_clip_sd(self): + save_directory = "tiny-random-stable-diffusion" + if os.path.exists(save_directory): + shutil.rmtree(save_directory, ignore_errors=True) + + model_type = "stable-diffusion" + model_name = TINY_MODELS[model_type] + + from optimum.onnxruntime import ORTStableDiffusionPipeline + + base = ORTStableDiffusionPipeline.from_pretrained(model_name, export=True) + base.save_pretrained(save_directory) + + clip_onnx_path = os.path.join(save_directory, "text_encoder", "model.onnx") + optimized_clip_onnx_path = os.path.join(save_directory, "text_encoder", "opt.onnx") + self.verify_clip_optimizer( + clip_onnx_path, + optimized_clip_onnx_path, + expected_counters={ + "EmbedLayerNormalization": 0, + "Attention": 5, + "SkipLayerNormalization": 10, + "LayerNormalization": 1, + "Gelu": 0, + "BiasGelu": 0, + }, + float16=True, + ) + + @pytest.mark.slow + def test_clip_sdxl(self): + save_directory = "tiny-random-stable-diffusion-xl" + if os.path.exists(save_directory): + shutil.rmtree(save_directory, ignore_errors=True) + + model_type = "stable-diffusion-xl" + model_name = TINY_MODELS[model_type] + + from optimum.onnxruntime import ORTStableDiffusionXLPipeline + + base = ORTStableDiffusionXLPipeline.from_pretrained(model_name, export=True) + base.save_pretrained(save_directory) + + 
clip_onnx_path = os.path.join(save_directory, "text_encoder", "model.onnx") + optimized_clip_onnx_path = os.path.join(save_directory, "text_encoder", "opt.onnx") + self.verify_clip_optimizer( + clip_onnx_path, + optimized_clip_onnx_path, + expected_counters={ + "EmbedLayerNormalization": 0, + "Attention": 5, + "SkipLayerNormalization": 10, + "LayerNormalization": 1, + "Gelu": 0, + "BiasGelu": 5, + }, + ) + + clip_onnx_path = os.path.join(save_directory, "text_encoder_2", "model.onnx") + optimized_clip_onnx_path = os.path.join(save_directory, "text_encoder_2", "opt.onnx") + self.verify_clip_optimizer( + clip_onnx_path, + optimized_clip_onnx_path, + expected_counters={ + "EmbedLayerNormalization": 0, + "Attention": 5, + "SkipLayerNormalization": 10, + "LayerNormalization": 1, + "Gelu": 0, + "BiasGelu": 5, + }, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py b/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py index ad4117f997567..85b30bea4f0af 100644 --- a/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py +++ b/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py @@ -339,7 +339,7 @@ def verify_attention( ort_outputs = onnxruntime_inference(ort_session, input_hidden_states, attention_mask, layer_past) - tolerance = 1e-03 if float16 else 1e-05 + tolerance = 1e-02 if float16 else 1e-04 is_all_close, max_diff = compare_outputs(torch_outputs, ort_outputs, atol=tolerance, verbose=True) max_diffs.append(max_diff) if is_all_close: diff --git a/onnxruntime/test/python/transformers/test_whisper.py b/onnxruntime/test/python/transformers/test_whisper.py index a2aa6383c2fbe..ebda0bccaadcf 100644 --- a/onnxruntime/test/python/transformers/test_whisper.py +++ b/onnxruntime/test/python/transformers/test_whisper.py @@ -37,7 +37,18 @@ def verify_fusion(self, optimized_model, expected_model_filename): expected_model = OnnxModel(onnx.load(expected_model_path)) expected_model.topological_sort(is_deterministic=True) - self.assertEqual(str(optimized_model.model.graph), str(expected_model.model.graph)) + nodes = optimized_model.model.graph.node + self.assertEqual(len(nodes), len(expected_model.model.graph.node)) + + for i in range(len(nodes)): + self.assertEqual(nodes[i], expected_model.model.graph.node[i]) + + for expected_initializer in expected_model.model.graph.initializer: + self.assertTrue( + OnnxModel.has_same_value( + optimized_model.get_initializer(expected_initializer.name), expected_initializer + ) + ) # Attention type #1 in onnx_model_bart.py def test_encoder_attention_fusion_with_skiplayernorm(self): diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index f3a0058c6fc4e..8357ce22fb710 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -159,8 +159,8 @@ static void TestInference(Ort::Env& env, const std::basic_string& mod expected_values_y, nullptr); // with preallocated output tensor - Ort::Value value_y = Ort::Value::CreateTensor(default_allocator.get(), - expected_dims_y.data(), expected_dims_y.size()); + Ort::Value value_y = Ort::Value::CreateTensor(default_allocator.get(), + expected_dims_y.data(), expected_dims_y.size()); // test it twice for (int i = 0; i != 2; ++i) diff --git a/onnxruntime/test/testdata/float8/te.cast_fp8_1_fp32.onnx b/onnxruntime/test/testdata/float8/te.cast_fp8_1_fp32.onnx new file mode 
100644 index 0000000000000..1dec9910087fc Binary files /dev/null and b/onnxruntime/test/testdata/float8/te.cast_fp8_1_fp32.onnx differ diff --git a/onnxruntime/test/testdata/float8/te.cast_fp8_1_fp32_input.npy b/onnxruntime/test/testdata/float8/te.cast_fp8_1_fp32_input.npy new file mode 100644 index 0000000000000..706f508836888 Binary files /dev/null and b/onnxruntime/test/testdata/float8/te.cast_fp8_1_fp32_input.npy differ diff --git a/onnxruntime/test/testdata/transform/constant_float16_mul.onnx b/onnxruntime/test/testdata/transform/constant_float16_mul.onnx new file mode 100644 index 0000000000000..baa682edb7f56 --- /dev/null +++ b/onnxruntime/test/testdata/transform/constant_float16_mul.onnx @@ -0,0 +1,17 @@ +onnxruntime_test:° +2c1c1_node"Constant* +value*  +*€xBc1v  +3c2c2_node"Constant* +value* +*€€Bc2v  + +c1 +c2 +mul_outputmul"Mul float16_mulb + +mul_output + + + +B \ No newline at end of file diff --git a/onnxruntime/wasm/api.cc b/onnxruntime/wasm/api.cc index aabefeaa7a07c..174edabbc91fe 100644 --- a/onnxruntime/wasm/api.cc +++ b/onnxruntime/wasm/api.cc @@ -155,6 +155,12 @@ int OrtAppendExecutionProvider(ort_session_options_handle_t session_options, con return CHECK_STATUS(SessionOptionsAppendExecutionProvider, session_options, name, nullptr, nullptr, 0); } +int OrtAddFreeDimensionOverride(ort_session_options_handle_t session_options, + const char* dim_param_name, + int dim_value) { + return CHECK_STATUS(AddFreeDimensionOverrideByName, session_options, dim_param_name, dim_value); +} + int OrtAddSessionConfigEntry(OrtSessionOptions* session_options, const char* config_key, const char* config_value) { @@ -394,9 +400,11 @@ char* OrtEndProfiling(ort_session_handle_t session) { #define CHECK_TRAINING_STATUS(ORT_API_NAME, ...) \ CheckStatus(Ort::GetTrainingApi().ORT_API_NAME(__VA_ARGS__)) -ort_training_checkpoint_handle_t EMSCRIPTEN_KEEPALIVE OrtTrainingLoadCheckpoint(void* checkpoint_data_buffer, size_t checkpoint_size) { +ort_training_checkpoint_handle_t EMSCRIPTEN_KEEPALIVE OrtTrainingLoadCheckpoint(void* checkpoint_data_buffer, + size_t checkpoint_size) { OrtCheckpointState* checkpoint_state = nullptr; - return (CHECK_TRAINING_STATUS(LoadCheckpointFromBuffer, checkpoint_data_buffer, checkpoint_size, &checkpoint_state) == ORT_OK) + return (CHECK_TRAINING_STATUS(LoadCheckpointFromBuffer, checkpoint_data_buffer, + checkpoint_size, &checkpoint_state) == ORT_OK) ? checkpoint_state : nullptr; } @@ -414,7 +422,7 @@ ort_training_session_handle_t EMSCRIPTEN_KEEPALIVE OrtTrainingCreateSession(cons void* optimizer_model, size_t optimizer_size) { OrtTrainingSession* training_session = nullptr; - return (CHECK_TRAINING_STATUS(CreateTrainingSessionFromArray, g_env, options, + return (CHECK_TRAINING_STATUS(CreateTrainingSessionFromBuffer, g_env, options, training_checkpoint_state_handle, train_model, train_size, eval_model, eval_size, optimizer_model, optimizer_size, &training_session) == ORT_OK) diff --git a/onnxruntime/wasm/api.h b/onnxruntime/wasm/api.h index b9103414aae67..398c901e0e5ed 100644 --- a/onnxruntime/wasm/api.h +++ b/onnxruntime/wasm/api.h @@ -84,6 +84,13 @@ ort_session_options_handle_t EMSCRIPTEN_KEEPALIVE OrtCreateSessionOptions(size_t int EMSCRIPTEN_KEEPALIVE OrtAppendExecutionProvider(ort_session_options_handle_t session_options, const char* name); +/** + * add a free dimension override for one dimension of a session's input. 
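+ * this maps to AddFreeDimensionOverrideByName in the C API.
+ * @param session_options a handle to session options created by OrtCreateSessionOptions
+ * @param dim_param_name name of the symbolic dimension (dim_param) to override
+ * @param dim_value the fixed value to assign to that dimension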
+ */ +int EMSCRIPTEN_KEEPALIVE OrtAddFreeDimensionOverride(ort_session_options_handle_t session_options, + const char* dim_param_name, + int dim_value); + /** * store configurations for a session. * @param session_options a handle to session options created by OrtCreateSessionOptions diff --git a/orttraining/orttraining/test/graph/gradient_graph_builder_test.cc b/orttraining/orttraining/test/graph/gradient_graph_builder_test.cc index ceeedd23ad454..358deb421bc21 100644 --- a/orttraining/orttraining/test/graph/gradient_graph_builder_test.cc +++ b/orttraining/orttraining/test/graph/gradient_graph_builder_test.cc @@ -22,7 +22,6 @@ using namespace onnxruntime::logging; using namespace onnxruntime::training; -using namespace google::protobuf::util; using namespace onnxruntime::path_utils; using namespace onnxruntime::test::training_session_test_utils; diff --git a/orttraining/orttraining/test/python/how_to_add_ortmodule_ci_pipeline_tests.md b/orttraining/orttraining/test/python/how_to_add_ortmodule_ci_pipeline_tests.md index 32fc83b341174..3fbb6a819ab3c 100644 --- a/orttraining/orttraining/test/python/how_to_add_ortmodule_ci_pipeline_tests.md +++ b/orttraining/orttraining/test/python/how_to_add_ortmodule_ci_pipeline_tests.md @@ -5,7 +5,7 @@ This is a simple guide on how the ortmodule CI pipeline works and how it can be ### The Pipeline The ortmodule CI pipeline is intended for running tests related to the ```ORTModule``` class. -The pipeline ```yml``` file is defined in [```tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml```](https://github.com/microsoft/onnxruntime/blob/thiagofc/ortmodule-api/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml). +The pipeline ```yml``` file is defined in [```tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-test-ci-pipeline.yml```](https://github.com/microsoft/onnxruntime/blob/thiagofc/ortmodule-api/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-test-ci-pipeline.yml). The pipeline runs on every pull request commit to the branch ```thiagofc/ortmodule```. ## Running Locally @@ -41,4 +41,4 @@ Follow the below steps to add new ortmodule tests that will run in this pipeline > **Note**: If the test requires multiple ```run_subprocess()``` calls, restructure the test file(s) such that they have a single entry point. -Once the above has been tried and tested, submit a pull request and the tests should be executed in the ortmodule ci pipeline. Make sure to search for ```'Running: Dummy ortmodule tests'``` in the pipeline logs to ensure that the newly added tests were successfully run in the pipeline. \ No newline at end of file +Once the above has been tried and tested, submit a pull request and the tests should be executed in the ortmodule ci pipeline. Make sure to search for ```'Running: Dummy ortmodule tests'``` in the pipeline logs to ensure that the newly added tests were successfully run in the pipeline. 
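The guide above asks each new ortmodule test group to expose a single entry point. A minimal sketch of that pattern, modeled on the run_subprocess launchers elsewhere in this change; the file name, logger name, and pytest target below are hypothetical placeholders, not files in the repository:

```python
# run_dummy_ortmodule_tests.py -- hypothetical single entry point for a group of ortmodule tests.
import logging
import sys

from _test_commons import run_subprocess  # same helper used by orttraining_test_ort_apis.py

logging.basicConfig(format="%(asctime)s %(name)s [%(levelname)s] - %(message)s", level=logging.DEBUG)
log = logging.getLogger("ORTModuleTests")


def run_dummy_ortmodule_tests(cwd):
    # A single, greppable log line makes the run easy to find in the pipeline logs.
    log.debug("Running: Dummy ortmodule tests")
    command = [sys.executable, "-m", "pytest", "orttraining_test_ortmodule_dummy.py", "-v"]
    run_subprocess(command, cwd=cwd, log=log).check_returncode()


if __name__ == "__main__":
    run_dummy_ortmodule_tests(cwd=sys.argv[1] if len(sys.argv) > 1 else ".")
    sys.exit(0)
```
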
diff --git a/orttraining/orttraining/test/python/orttraining_test_ort_apis.py b/orttraining/orttraining/test/python/orttraining_test_ort_apis.py index ce96ee3da6658..506aafbe9f618 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ort_apis.py +++ b/orttraining/orttraining/test/python/orttraining_test_ort_apis.py @@ -8,7 +8,6 @@ import argparse import logging -import os import sys from _test_commons import run_subprocess @@ -43,31 +42,6 @@ def run_onnxblock_tests(cwd, log): run_subprocess(command, cwd=cwd, log=log).check_returncode() -def run_onnxruntime_test_all_ctest(cwd, log, filter): - """Calls onnxruntime_test_all gtest executable with the given filter.""" - - command = [os.path.join(cwd, "onnxruntime_test_all"), f"--gtest_filter={filter}"] - - run_subprocess(command, cwd=cwd, log=log).check_returncode() - - -def run_training_api_tests(cwd, log): - """Runs the onnxruntime_test_all executable with the TrainingApiTest* gtest filter.""" - - log.debug("Running: TrainingApi and TrainingCApi tests") - - run_onnxruntime_test_all_ctest(cwd, log, "TrainingApiTest*") - run_onnxruntime_test_all_ctest(cwd, log, "TrainingCApiTest*") - - -def run_checkpoint_api_tests(cwd, log): - """Runs the onnxruntime_test_all executable with the CheckpointApiTest* gtest filter.""" - - log.debug("Running: TrainingApi tests") - - run_onnxruntime_test_all_ctest(cwd, log, "CheckpointApiTest*") - - def main(): args = parse_arguments() cwd = args.cwd @@ -78,10 +52,6 @@ def main(): run_training_apis_python_api_tests(cwd, log) - run_training_api_tests(cwd, log) - - run_checkpoint_api_tests(cwd, log) - return 0 diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py index 64cdb957f4046..bf26fd1822dc4 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py @@ -4002,6 +4002,7 @@ def forward(self, bool_argument, input1): ], ) def test_unused_parameters(model, none_pt_params): + torch.manual_seed(2333) device = "cuda" N, D_in, H1, H2, D_out = 64, 784, 500, 400, 10 # noqa: F841, N806 diff --git a/orttraining/orttraining/test/session/training_session_test.cc b/orttraining/orttraining/test/session/training_session_test.cc index 7afe6cdef75a5..b6ed80c426afc 100644 --- a/orttraining/orttraining/test/session/training_session_test.cc +++ b/orttraining/orttraining/test/session/training_session_test.cc @@ -17,7 +17,6 @@ using namespace onnxruntime::logging; using namespace onnxruntime::training; -using namespace google::protobuf::util; using namespace onnxruntime::path_utils; using namespace onnxruntime::test::training_session_test_utils; diff --git a/orttraining/orttraining/test/session/training_session_test_utils.cc b/orttraining/orttraining/test/session/training_session_test_utils.cc index 59d58be59872b..868388d4b9a93 100644 --- a/orttraining/orttraining/test/session/training_session_test_utils.cc +++ b/orttraining/orttraining/test/session/training_session_test_utils.cc @@ -8,7 +8,6 @@ using namespace onnxruntime::logging; using namespace onnxruntime::training; -using namespace google::protobuf::util; using namespace onnxruntime::path_utils; namespace onnxruntime { diff --git a/rust/onnxruntime-sys/build.rs b/rust/onnxruntime-sys/build.rs index 82d1e4278015c..f59ee99fa29a7 100644 --- a/rust/onnxruntime-sys/build.rs +++ b/rust/onnxruntime-sys/build.rs @@ -105,7 +105,6 @@ fn generate_bindings(include_dir: &Path) { 
.expect("Unable to generate bindings"); let generated_file = PathBuf::from(env::var("OUT_DIR").unwrap()).join("bindings.rs"); - println!("cargo:rerun-if-changed={:?}", generated_file); bindings .write_to_file(&generated_file) .expect("Couldn't write bindings!"); diff --git a/setup.py b/setup.py index 8bd68f36f7454..13731eb4e76bb 100644 --- a/setup.py +++ b/setup.py @@ -7,6 +7,7 @@ import datetime import logging import platform +import shlex import subprocess import sys from glob import glob, iglob @@ -183,108 +184,37 @@ def run(self): dest = "onnxruntime/capi/onnxruntime_pybind11_state_manylinux1.so" logger.info("copying %s -> %s", source, dest) copyfile(source, dest) - result = subprocess.run( - ["patchelf", "--print-needed", dest], check=True, stdout=subprocess.PIPE, text=True - ) - dependencies = [ - "librccl.so", - "libamdhip64.so", - "librocblas.so", - "libMIOpen.so", - "libhsa-runtime64.so", - "libhsakmt.so", - ] + to_preload = [] to_preload_cuda = [] to_preload_tensorrt = [] to_preload_cann = [] - cuda_dependencies = [] - args = ["patchelf", "--debug"] - for line in result.stdout.split("\n"): - for dependency in dependencies: - if dependency in line: - to_preload.append(line) - args.extend(["--remove-needed", line]) - args.append(dest) - if len(args) > 3: - subprocess.run(args, check=True, stdout=subprocess.PIPE) - - dest = "onnxruntime/capi/libonnxruntime_providers_" + ("rocm.so" if is_rocm else "cuda.so") - if path.isfile(dest): - result = subprocess.run( - ["patchelf", "--print-needed", dest], - check=True, - stdout=subprocess.PIPE, - text=True, - ) - cuda_dependencies = [ - "libcublas.so", - "libcublasLt.so", - "libcudnn.so", - "libcudart.so", - "libcurand.so", - "libcufft.so", - "libnvToolsExt.so", - "libcupti.so", - ] - rocm_dependencies = [ - "librccl.so", - "libamdhip64.so", - "librocblas.so", - "libMIOpen.so", - "libhsa-runtime64.so", - "libhsakmt.so", - ] - args = ["patchelf", "--debug"] - for line in result.stdout.split("\n"): - for dependency in cuda_dependencies + rocm_dependencies: - if dependency in line: - if dependency not in to_preload: - to_preload_cuda.append(line) - args.extend(["--remove-needed", line]) - args.append(dest) - if len(args) > 3: - subprocess.run(args, check=True, stdout=subprocess.PIPE) - - dest = "onnxruntime/capi/libonnxruntime_providers_" + ("migraphx.so" if is_rocm else "tensorrt.so") - if path.isfile(dest): - result = subprocess.run( - ["patchelf", "--print-needed", dest], - check=True, - stdout=subprocess.PIPE, - text=True, - ) - tensorrt_dependencies = ["libnvinfer.so", "libnvinfer_plugin.so", "libnvonnxparser.so"] - args = ["patchelf", "--debug"] - for line in result.stdout.split("\n"): - for dependency in cuda_dependencies + tensorrt_dependencies: - if dependency in line: - if dependency not in (to_preload + to_preload_cuda): - to_preload_tensorrt.append(line) - args.extend(["--remove-needed", line]) - args.append(dest) - if len(args) > 3: - subprocess.run(args, check=True, stdout=subprocess.PIPE) - - dest = "onnxruntime/capi/libonnxruntime_providers_cann.so" - if path.isfile(dest): - result = subprocess.run( - ["patchelf", "--print-needed", dest], - check=True, - stdout=subprocess.PIPE, - text=True, - ) - cann_dependencies = ["libascendcl.so", "libacl_op_compiler.so", "libfmk_onnx_parser.so"] - args = ["patchelf", "--debug"] - for line in result.stdout.split("\n"): - for dependency in cann_dependencies: - if dependency in line: - if dependency not in to_preload: - to_preload_cann.append(line) - args.extend(["--remove-needed", line]) - 
args.append(dest) - if len(args) > 3: - subprocess.run(args, check=True, stdout=subprocess.PIPE) + + cuda_dependencies = [ + "libcublas.so.11", + "libcublasLt.so.11", + "libcudnn.so.8", + "libcudart.so.11.0", + "libcurand.so.10", + "libcufft.so.10", + ] + rocm_dependencies = [ + "librccl.so.1", + "libnuma.so.1", + "libamd_comgr.so.2", + "libdrm.so.2", + "librocblas.so.0", + "libdrm_amdgpu.so.1", + "libamdhip64.so.5", + "libroctracer64.so.4", + "libMIOpen.so.1", + "libtinfo.so.6", + "libelf.so.1", + "librocm_smi64.so.5", + "libhsa-runtime64.so.1", + ] + + tensorrt_dependencies = ["libnvinfer.so.8.6", "libnvinfer_plugin.so.8.6", "libnvonnxparser.so.8.6"] dest = "onnxruntime/capi/libonnxruntime_providers_openvino.so" if path.isfile(dest): @@ -308,10 +238,12 @@ def run(self): assert self.dist_dir is not None file = glob(path.join(self.dist_dir, "*linux*.whl"))[0] logger.info("repairing %s for manylinux1", file) + auditwheel_cmd = ["auditwheel", "-v", "repair", "-w", self.dist_dir, file] + for i in cuda_dependencies + rocm_dependencies + tensorrt_dependencies: + auditwheel_cmd += ["--exclude", i] + logger.info("Running {}".format(" ".join([shlex.quote(arg) for arg in auditwheel_cmd]))) try: - subprocess.run( - ["auditwheel", "repair", "-w", self.dist_dir, file], check=True, stdout=subprocess.PIPE - ) + subprocess.run(auditwheel_cmd, check=True, stdout=subprocess.PIPE) finally: logger.info("removing %s", file) remove(file) diff --git a/tools/android_custom_build/Dockerfile b/tools/android_custom_build/Dockerfile index 539badb36224d..bc50e4fb0a943 100644 --- a/tools/android_custom_build/Dockerfile +++ b/tools/android_custom_build/Dockerfile @@ -24,9 +24,9 @@ RUN apt-get update && apt-get install --yes --no-install-recommends \ unzip lsb-release # cmake -RUN CMAKE_VERSION=3.26.3 && \ +RUN CMAKE_VERSION=3.27.3 && \ aria2c -q -d /tmp -o cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz \ - --checksum=sha-256=28d4d1d0db94b47d8dfd4f7dec969a3c747304f4a28ddd6fd340f553f2384dc2 \ + --checksum=sha-256=62e7819fe0867658b6ea765a711686d637dce76cdf6eb0a6b0f1b879e0344fa7 \ https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz && \ tar -zxf /tmp/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz --strip=1 -C /usr diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 6ca9993a09d78..c4fb5499983cb 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -96,7 +96,7 @@ def _openvino_verify_device_type(device_read): break def invalid_hetero_build(): - print("\nIf trying to build Hetero/Multi/Auto, specifiy the supported devices along with it.\n") + print("\nIf trying to build Hetero/Multi/Auto, specify the supported devices along with it.\n") print("specify the keyword HETERO or MULTI or AUTO followed by the devices ") print("in the order of priority you want to build\n") print("The different hardware devices that can be added in HETERO or MULTI or AUTO") @@ -107,7 +107,7 @@ def invalid_hetero_build(): sys.exit("Wrong Build Type selected") if res is False: - print("\nYou have selcted wrong configuration for the build.") + print("\nYou have selected wrong configuration for the build.") print("pick the build type for specific Hardware Device from following options: ", choices) print("(or) from the following options with graph partitioning disabled: ", choices1) print("\n") @@ -171,8 +171,8 @@ def convert_arg_line_to_args(self, arg_line): nargs="?", default=-1, type=int, - help="Maximum number of NVCC threads to be used in parallel. 
" - "If the optional value is negative or unspecified, the value of --parallel is used.", + help="Maximum number of NVCC threads in each parallel job." + "If the value is unspecified, it will be computed based on available memory and number of parallel jobs.", ) parser.add_argument("--test", action="store_true", help="Run unit tests.") @@ -431,7 +431,7 @@ def convert_arg_line_to_args(self, arg_line): parser.add_argument("--wasm_run_tests_in_browser", action="store_true", help="Run WebAssembly tests in browser") parser.add_argument( - "--enable_wasm_profiling", action="store_true", help="Enable WebAsselby profiling and preserve function names" + "--enable_wasm_profiling", action="store_true", help="Enable WebAssembly profiling and preserve function names" ) parser.add_argument( "--enable_wasm_debug_info", action="store_true", help="Build WebAssembly with DWARF format debug info" @@ -528,7 +528,7 @@ def convert_arg_line_to_args(self, arg_line): "--llvm_config", type=str, default="", - help="Path to llvm-config.exe for LLVM buit from sources. It is strongly needed for build on Windows", + help="Path to llvm-config.exe for LLVM built from sources. It is strongly needed for build on Windows", ) parser.add_argument( "--skip_onnx_tests", @@ -875,6 +875,43 @@ def normalize_arg_list(nested_list): return [i for j in nested_list for i in j] if nested_list else [] +def number_of_parallel_jobs(args): + return os.cpu_count() if args.parallel == 0 else args.parallel + + +def number_of_nvcc_threads(args): + if args.nvcc_threads >= 0: + return args.nvcc_threads + + nvcc_threads = 1 + try: + import psutil + + available_memory = psutil.virtual_memory().available + if isinstance(available_memory, int) and available_memory > 0: + if available_memory > 60 * 1024 * 1024 * 1024: + # When available memory is large enough, chance of OOM is small. + nvcc_threads = 4 + else: + # NVCC need a lot of memory to compile 8 flash attention cu files in Linux or 4 cutlass fmha cu files in Windows. + # Here we select number of threads to ensure each thread has enough memory (>= 4 GB). For example, + # Standard_NC4as_T4_v3 has 4 CPUs and 28 GB memory. When parallel=4 and nvcc_threads=2, + # total nvcc threads is 4 * 2, which is barely able to build in 28 GB memory so we will use nvcc_threads=1. + memory_per_thread = 4 * 1024 * 1024 * 1024 + fmha_cu_files = 4 if is_windows() else 8 + fmha_parallel_jobs = min(fmha_cu_files, number_of_parallel_jobs(args)) + nvcc_threads = max(1, int(available_memory / (memory_per_thread * fmha_parallel_jobs))) + print( + f"nvcc_threads={nvcc_threads} to ensure memory per thread >= 4GB for available_memory={available_memory} and fmha_parallel_jobs={fmha_parallel_jobs}" + ) + except ImportError: + print( + "Failed to import psutil. Please `pip install psutil` for better estimation of nvcc threads. 
Use nvcc_threads=1" + ) + + return nvcc_threads + + def generate_build_tree( cmake_path, source_dir, @@ -1044,10 +1081,7 @@ def generate_build_tree( if args.use_migraphx: cmake_args.append("-Donnxruntime_MIGRAPHX_HOME=" + migraphx_home) if args.use_cuda: - if args.nvcc_threads >= 0: - nvcc_threads = args.nvcc_threads - else: - nvcc_threads = args.parallel + nvcc_threads = number_of_nvcc_threads(args) cmake_args.append("-Donnxruntime_NVCC_THREADS=" + str(nvcc_threads)) if args.use_rocm: cmake_args.append("-Donnxruntime_ROCM_HOME=" + rocm_home) @@ -1639,6 +1673,10 @@ def run_adb_shell(cmd): adb_shell(f"chmod +x {device_dir}/onnx_test_runner") run_adb_shell(f"{device_dir}/onnxruntime_test_all") + # remove onnxruntime_test_all as it takes up a _lot_ of space and can cause insufficient storage errors + # when we try to copy the java app to the device. + adb_shell(f"rm {device_dir}/onnxruntime_test_all") + if args.build_java: # use the gradle wrapper under /java gradle_executable = os.path.join(source_dir, "java", "gradlew.bat" if is_windows() else "gradlew") @@ -1802,13 +1840,12 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): [sys.executable, "onnxruntime_test_python_symbolic_shape_infer.py"], cwd=cwd, dll_path=dll_path ) - # For CUDA enabled builds test IOBinding feature - if args.use_cuda: - # We need to have Torch installed to test the IOBinding feature - # which currently uses Torch's allocator to allocate GPU memory for testing + # For CUDA or DML enabled builds test IOBinding feature + if args.use_cuda or args.use_dml: log.info("Testing IOBinding feature") run_subprocess([sys.executable, "onnxruntime_test_python_iobinding.py"], cwd=cwd, dll_path=dll_path) + if args.use_cuda: log.info("Testing CUDA Graph feature") run_subprocess([sys.executable, "onnxruntime_test_python_cudagraph.py"], cwd=cwd, dll_path=dll_path) @@ -2547,7 +2584,7 @@ def main(): if args.build: if args.parallel < 0: raise BuildError(f"Invalid parallel job count: {args.parallel}") - num_parallel_jobs = os.cpu_count() if args.parallel == 0 else args.parallel + num_parallel_jobs = number_of_parallel_jobs(args) build_targets(args, cmake_path, build_dir, configs, num_parallel_jobs, args.target) if args.test: diff --git a/tools/ci_build/github/android/build_aar_package.py b/tools/ci_build/github/android/build_aar_package.py index f6ebed007fa26..f9688a1453e12 100644 --- a/tools/ci_build/github/android/build_aar_package.py +++ b/tools/ci_build/github/android/build_aar_package.py @@ -95,42 +95,11 @@ def _build_aar(args): exe_dir = os.path.join(intermediates_dir, "executables", build_config) base_build_command = [sys.executable, BUILD_PY] + build_settings["build_params"] + ["--config=" + build_config] header_files_path = "" - # Build and install protoc - protobuf_installation_script = os.path.join( - REPO_DIR, - "tools", - "ci_build", - "github", - "linux", - "docker", - "inference", - "x64", - "python", - "cpu", - "scripts", - "install_protobuf.sh", - ) - subprocess.run( - [ - protobuf_installation_script, - "-p", - os.path.join(build_dir, "protobuf"), - "-d", - os.path.join(REPO_DIR, "cmake", "deps.txt"), - ], - shell=False, - check=True, - ) + # Build binary for each ABI, one by one for abi in build_settings["build_abis"]: abi_build_dir = os.path.join(intermediates_dir, abi) - abi_build_command = [ - *base_build_command, - "--android_abi=" + abi, - "--build_dir=" + abi_build_dir, - "--path_to_protoc_exe", - os.path.join(build_dir, "protobuf", "bin", "protoc"), - ] + abi_build_command = 
[*base_build_command, "--android_abi=" + abi, "--build_dir=" + abi_build_dir] if ops_config_path is not None: abi_build_command += ["--include_ops_by_config=" + ops_config_path] diff --git a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml index cab5a455c5ef7..bbab9f3d85abb 100644 --- a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml @@ -31,7 +31,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: qnn-v2.13.1.230730 + default: qnn-v2.14.1.230828 jobs: - job: Build_QNN_EP @@ -65,7 +65,9 @@ jobs: - script: | set -e -x - tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/install_protobuf.sh -p $(Build.BinariesDirectory)/installed -d cmake/deps.txt + rm -rf /tmp/scripts + cp -r tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts /tmp + /tmp/scripts/install_protobuf.sh -p $(Build.BinariesDirectory)/installed -d cmake/deps.txt python3 tools/ci_build/build.py \ --config Release \ --android \ diff --git a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml index 7994be8655f52..f6f6f52440534 100644 --- a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml @@ -79,14 +79,6 @@ stages: - template: "templates/use-android-ndk.yml" - # We build the host protoc to /protobuf_install - - script: | - /bin/bash $(Build.SourcesDirectory)/tools/ci_build/github/apple/build_host_protoc.sh \ - $(Build.SourcesDirectory) \ - $(Build.BinariesDirectory)/protobuf \ - $(Build.SourcesDirectory)/protobuf_install - displayName: Build Host Protoc - - script: | env | grep ANDROID displayName: View Android ENVs @@ -102,7 +94,6 @@ stages: --skip_submodule_sync \ --parallel \ --cmake_generator=Ninja \ - --path_to_protoc_exe $(Build.SourcesDirectory)/protobuf_install/bin/protoc \ --build_java \ --skip_tests displayName: CPU EP, Build @@ -172,14 +163,6 @@ stages: - template: "templates/use-android-ndk.yml" - # We build the host protoc to /protobuf_install - - script: | - /bin/bash $(Build.SourcesDirectory)/tools/ci_build/github/apple/build_host_protoc.sh \ - $(Build.SourcesDirectory) \ - $(Build.BinariesDirectory)/protobuf \ - $(Build.SourcesDirectory)/protobuf_install - displayName: Build Host Protoc - - script: | env | grep ANDROID displayName: View Android ENVs @@ -196,7 +179,6 @@ stages: --parallel \ --use_nnapi \ --cmake_generator=Ninja \ - --path_to_protoc_exe $(Build.SourcesDirectory)/protobuf_install/bin/protoc \ --build_java \ --skip_tests displayName: NNAPI EP, Build @@ -372,14 +354,6 @@ stages: - script: brew install coreutils ninja displayName: Install coreutils and ninja - # We build the host protoc to /protobuf_install - - script: | - /bin/bash $(Build.SourcesDirectory)/tools/ci_build/github/apple/build_host_protoc.sh \ - $(Build.SourcesDirectory) \ - $(Build.BinariesDirectory)/protobuf \ - $(Build.SourcesDirectory)/protobuf_install - displayName: Build Host Protoc - - script: /bin/bash tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh $(pwd) # Build Minimal ORT with NNAPI and reduced Ops, run unit tests on Android Emulator 
displayName: Build Minimal ORT with NNAPI and run tests @@ -462,7 +436,6 @@ stages: --use_nnapi \ --build_shared_lib \ --cmake_generator=Ninja \ - --path_to_protoc_exe $(Build.SourcesDirectory)/protobuf_install/bin/protoc \ --build_java \ --code_coverage displayName: NNAPI EP, Build, Test, CodeCoverage on Android Emulator diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index cb557dd612b01..09b2a0697447e 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -147,9 +147,9 @@ stages: - template: templates/set-version-number-variables-step.yml - template: templates/get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/inference/x64/default/cpu/Dockerfile - Context: tools/ci_build/github/linux/docker/inference/x64/default/cpu - DockerBuildArgs: "--build-arg BUILD_UID=$( id -u ) --build-arg BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-centos7" + Dockerfile: tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile + Context: tools/ci_build/github/linux/docker/inference/x64/default/gpu + DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" Repository: onnxruntimecuda11centosbuild - script: $(Build.SourcesDirectory)/tools/ci_build/github/linux/build_cuda_c_api_package.sh @@ -197,7 +197,7 @@ stages: buildArch: x64 msbuildPlatform: x64 packageName: x64-cuda - buildparameter: --use_cuda --cuda_version=11.8 --cuda_home=$(Agent.TempDirectory)\v11.8 --enable_onnx_tests --enable_wcos --build_java --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" ${{parameters.AdditionalBuildFlag}} + buildparameter: --use_cuda --cuda_version=11.8 --cuda_home=$(Agent.TempDirectory)\v11.8 --enable_onnx_tests --enable_wcos --build_java --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" ${{parameters.AdditionalBuildFlag}} runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: true java_artifact_id: onnxruntime_gpu @@ -213,7 +213,7 @@ stages: buildArch: x64 msbuildPlatform: x64 packageName: x64-tensorrt - buildparameter: --use_tensorrt --tensorrt_home="C:\local\TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8" --cuda_version=11.8 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8" --enable_onnx_tests --enable_wcos --build_java --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + buildparameter: --use_tensorrt --tensorrt_home="C:\local\TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8" --cuda_version=11.8 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8" --enable_onnx_tests --enable_wcos --build_java --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: true java_artifact_id: onnxruntime_gpu @@ -240,16 +240,16 @@ stages: # then rename $(Build.SourcesDirectory)/onnxruntime as $(Build.SourcesDirectory) - template: templates/get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_rocm + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm Context: tools/ci_build/github/linux/docker DockerBuildArgs: >- --build-arg INSTALL_DEPS_EXTRA_ARGS=-tmur --build-arg BUILD_UID=$(id -u) - --network=host --build-arg POLICY=manylinux2014 --build-arg PLATFORM=x86_64 + --network=host --build-arg POLICY=manylinux_2_28 --build-arg 
PLATFORM=x86_64 --build-arg ROCM_VERSION=$(RocmVersion) - --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/devtoolset-10/root - --build-arg PREPEND_PATH=/opt/rh/devtoolset-10/root/usr/bin: - --build-arg LD_LIBRARY_PATH_ARG=/opt/rh/devtoolset-10/root/usr/lib64:/opt/rh/devtoolset-10/root/usr/lib:/opt/rh/devtoolset-10/root/usr/lib64/dyninst:/opt/rh/devtoolset-10/root/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib + --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/gcc-toolset-12/root + --build-arg PREPEND_PATH=/opt/rh/gcc-toolset-12/root/usr/bin: + --build-arg LD_LIBRARY_PATH_ARG=/opt/rh/gcc-toolset-12/root/usr/lib64:/opt/rh/gcc-toolset-12/root/usr/lib:/opt/rh/gcc-toolset-12/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-12/root/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib Repository: onnxruntimetrainingrocmbuild-rocm$(RocmVersion) - template: templates/set-version-number-variables-step.yml @@ -475,13 +475,13 @@ stages: Steps: - script: | tools/ci_build/get_docker_image.py \ - --dockerfile tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11_8_tensorrt8_6 \ + --dockerfile tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6 \ --context tools/ci_build/github/linux/docker \ - --docker-build-args "--network=host --build-arg POLICY=manylinux2014 --build-arg PLATFORM=x86_64 --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/devtoolset-11/root --build-arg PREPEND_PATH=/opt/rh/devtoolset-11/root/usr/bin: --build-arg LD_LIBRARY_PATH_ARG=/opt/rh/devtoolset-11/root/usr/lib64:/opt/rh/devtoolset-11/root/usr/lib:/opt/rh/devtoolset-11/root/usr/lib64/dyninst:/opt/rh/devtoolset-11/root/usr/lib/dyninst:/usr/local/lib64 --build-arg BUILD_UID=$( id -u )" \ + --docker-build-args "--network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64 --build-arg PREPEND_PATH=/usr/local/cuda/bin --build-arg LD_LIBRARY_PATH_ARG=/usr/local/lib64 --build-arg DEVTOOLSET_ROOTPATH=/usr --build-arg BUILD_UID=$( id -u ) --build-arg BUILD_UID=$( id -u )" \ --container-registry onnxruntimebuildcache \ --multiple_repos \ --repository onnxruntimecuda118xtrt86build - displayName: "Get onnxruntimecuda118xtrt86build image for tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11_8_tensorrt8_6" + displayName: "Get onnxruntimecuda118xtrt86build image for tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6" workingDirectory: $(Build.SourcesDirectory)/onnxruntime ContainerRegistry: onnxruntimebuildcache @@ -532,7 +532,7 @@ stages: displayName: 'Test C API application for GPU package' inputs: script: | - docker run --gpus all -e CC=/opt/rh/devtoolset-11/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-11/root/usr/bin/c++ -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e NVIDIA_VISIBLE_DEVICES=all --rm --volume $(Build.SourcesDirectory):/src_dir \ + docker run --gpus all -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e NVIDIA_VISIBLE_DEVICES=all --rm --volume $(Build.SourcesDirectory):/src_dir \ --volume $(Build.ArtifactStagingDirectory):/artifact_src -e NIGHTLY_BUILD 
onnxruntimecuda118xtrt86build \ /src_dir/onnxruntime-inference-examples/c_cxx/squeezenet/run_capi_application.sh -o /src_dir/onnxruntime -p /artifact_src/onnxruntime-linux-x64-gpu-$(OnnxRuntimeVersion).tgz -w /src_dir/onnxruntime-inference-examples/c_cxx/squeezenet workingDirectory: '$(Build.ArtifactStagingDirectory)' diff --git a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml index 8d59874d1e464..21bc1c481b3e6 100644 --- a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml @@ -67,10 +67,10 @@ stages: - template: templates/get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/inference/x64/python/cpu/Dockerfile.manylinux2_28_cpu - Context: tools/ci_build/github/linux/docker/inference/x64/python/cpu - DockerBuildArgs: "--build-arg BUILD_UID=$( id -u ) --build-arg BASEIMAGE=amd64/almalinux:8 --build-arg PLATFORM=x86_64 --build-arg PREPEND_PATH=/opt/rh/gcc-toolset-12/root/usr/bin: --build-arg LD_LIBRARY_PATH_ARG=/opt/rh/gcc-toolset-12/root/usr/lib64:/opt/rh/gcc-toolset-12/root/usr/lib:/opt/rh/gcc-toolset-12/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-12/root/usr/lib/dyninst:/usr/local/lib64 --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/gcc-toolset-12/root" - Repository: onnxruntimecpubuildpythonx86_64 + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu + Context: tools/ci_build/github/linux/docker/ + DockerBuildArgs: "--build-arg BUILD_UID=$( id -u ) --build-arg BASEIMAGE=registry.access.redhat.com/ubi8/ubi" + Repository: onnxruntimecpubuild - template: templates/linux-build-step-with-cache.yml parameters: @@ -85,7 +85,6 @@ stages: inputs: script: | mkdir -p $HOME/.onnx - mkdir -p $(Pipeline.Workspace)/ccache docker run --rm \ --volume /data/onnx:/data/onnx:ro \ --volume $(Build.SourcesDirectory):/onnxruntime_src \ @@ -96,12 +95,12 @@ stages: -e NIGHTLY_BUILD \ -e BUILD_BUILDNUMBER \ -e CCACHE_DIR=/cache \ - onnxruntimecpubuildpythonx86_64 \ + onnxruntimecpubuild \ /bin/bash -c " set -ex; \ ccache -s; \ /opt/python/cp38-cp38/bin/python3 /onnxruntime_src/tools/ci_build/build.py \ - --build_dir /build --cmake_generator 'Unix Makefiles' \ + --build_dir /build --cmake_generator 'Ninja' \ --config Debug Release \ --skip_submodule_sync \ --build_shared_lib \ @@ -116,115 +115,65 @@ stages: ccache -z" workingDirectory: $(Build.SourcesDirectory) - - task: UseDotNet@2 - displayName: "Setup dotnet" - inputs: - version: '6.0.408' - - - task: DotNetCoreCLI@2 - displayName: "Restore C# packages" - inputs: - command: 'restore' - projects: '$(Build.SourcesDirectory)/csharp/OnnxRuntime.DesktopOnly.CSharp.sln' - - # the props file was generated with docker container paths. convert to the 'real' path by replacing the - # the container path of '/build'. The '>' prefix is to match the closing angle bracket of the tag. - # e.g. /build/... so we only match the start of a path. - # We use powershell so we don't need extra escaping of the '/' chars in the path. 
- - task: CmdLine@2 - displayName: 'Update props from docker path to local and create models link' - inputs: - script: | - pwsh -Command '(Get-Content $(Build.SourcesDirectory)/csharp/Directory.Build.props) -replace ">/build", ">$(Build.BinariesDirectory)" | Set-Content $(Build.SourcesDirectory)/csharp/Directory.Build.props' - cat $(Build.SourcesDirectory)/csharp/Directory.Build.props - ln -s /data/models $(Build.BinariesDirectory)/models - - - task: DotNetCoreCLI@2 - displayName: 'dotnet build C# sln' - inputs: - command: 'build' - projects: '$(Build.SourcesDirectory)/csharp/OnnxRuntime.DesktopOnly.CSharp.sln' - - - task: DotNetCoreCLI@2 - displayName: 'dotnet test C#' - inputs: - command: 'test' - projects: '$(Build.SourcesDirectory)/csharp/OnnxRuntime.DesktopOnly.CSharp.sln' - # extra logging so all tests are listed in output to validate what's actually run - arguments: '-f net6.0 --no-build -l "console;verbosity=normal"' - workingDirectory: $(Build.SourcesDirectory)/csharp - - - task: CmdLine@2 - displayName: 'Install python deps' - inputs: - script: | - set -e -x - python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml onnx -qq - cp $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt $(Build.BinariesDirectory)/requirements.txt - # Test ORT with the latest ONNX release. - sed -i "s/git+http:\/\/github\.com\/onnx\/onnx.*/onnx/" $(Build.BinariesDirectory)/requirements.txt - python3 -m pip install -r $(Build.BinariesDirectory)/requirements.txt - mkdir $(Build.BinariesDirectory)/requirements_torch_cpu/ - cp $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_cpu/requirements.txt $(Build.BinariesDirectory)/requirements_torch_cpu/requirements.txt - python3 -m pip install -r $(Build.BinariesDirectory)/requirements_torch_cpu/requirements.txt - - - task: CmdLine@2 - displayName: 'Install Release python package' - inputs: - script: | - rm -rf $(Build.BinariesDirectory)/Release/onnxruntime $(Build.BinariesDirectory)/Release/pybind11 - python3 -m pip install $(Build.BinariesDirectory)/Release/dist/*.whl - - - task: PythonScript@0 - displayName: 'Run Release unit tests' - inputs: - scriptPath: $(Build.SourcesDirectory)/tools/ci_build/build.py - workingDirectory: $(Build.BinariesDirectory)/Release - arguments: >- - --build_dir $(Build.BinariesDirectory) - --cmake_generator Ninja - --config Release - --test - --skip_submodule_sync - --build_shared_lib - --parallel - --build_wheel - --enable_onnx_tests - --enable_transformers_tool_test - --ctest_path "" - - - task: CmdLine@2 - displayName: 'Install Debug python package' - inputs: - script: | - set -e -x - rm -rf $(Build.BinariesDirectory)/Debug/onnxruntime $(Build.BinariesDirectory)/Debug/pybind11 - python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml -qq - python3 -m pip install $(Build.BinariesDirectory)/Debug/dist/*.whl - - - task: PythonScript@0 - displayName: 'Run Debug unit tests' - inputs: - scriptPath: $(Build.SourcesDirectory)/tools/ci_build/build.py - workingDirectory: $(Build.BinariesDirectory)/Debug - arguments: >- - --build_dir $(Build.BinariesDirectory) - --cmake_generator Ninja - --config Debug - --test - --skip_submodule_sync - --build_shared_lib - --parallel - --build_wheel - --enable_onnx_tests - --enable_transformers_tool_test - --ctest_path "" - 
- - task: PythonScript@0 - displayName: 'Symbolic shape infer' - inputs: - scriptPath: $(Build.BinariesDirectory)/Release/onnxruntime_test_python_symbolic_shape_infer.py - workingDirectory: $(Build.BinariesDirectory)/Release + - script: | + ln -s /data/models $(Build.BinariesDirectory)/models + displayName: link model dir + + - bash: | + mkdir -p $HOME/.onnx + docker run --rm \ + --volume /data/onnx:/data/onnx:ro \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --volume /data/models:/build/models:ro \ + --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ + -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \ + -e NIGHTLY_BUILD \ + -e BUILD_BUILDNUMBER \ + onnxruntimecpubuild \ + /bin/bash -c " + set -ex; \ + pushd /onnxruntime_src/csharp; \ + dotnet restore /onnxruntime_src/csharp/OnnxRuntime.DesktopOnly.CSharp.sln; \ + dotnet build /onnxruntime_src/csharp/OnnxRuntime.DesktopOnly.CSharp.sln; \ + dotnet test /onnxruntime_src/csharp/OnnxRuntime.DesktopOnly.CSharp.sln -f net6.0 --no-build -l \"console;verbosity=normal\"; \ + popd + " + displayName: 'Dotnet build C# sln and Test' + + - bash: | + mkdir -p $HOME/.onnx + docker run --rm \ + --volume /data/onnx:/data/onnx:ro \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --volume /data/models:/build/models:ro \ + --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ + -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \ + -e NIGHTLY_BUILD \ + -e BUILD_BUILDNUMBER \ + onnxruntimecpubuild \ + /bin/bash -c " + set -ex; \ + /bin/bash /onnxruntime_src/tools/scripts/python_test.sh /onnxruntime_src /build Release && \ + /bin/bash /onnxruntime_src/tools/scripts/symbolic_shape_infer_test.sh /build + " + displayName: 'Run Release tests and symbolic shape infer test' + + - bash: | + mkdir -p $HOME/.onnx + docker run --rm \ + --volume /data/onnx:/data/onnx:ro \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --volume /data/models:/build/models:ro \ + --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ + -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \ + -e NIGHTLY_BUILD \ + -e BUILD_BUILDNUMBER \ + onnxruntimecpubuild \ + /bin/bash /onnxruntime_src/tools/scripts/python_test.sh /onnxruntime_src /build Debug + displayName: 'Run Debug tests' - task: PublishTestResults@2 displayName: 'Publish unit test results' @@ -234,7 +183,6 @@ stages: testRunTitle: 'Unit Test Run' condition: succeededOrFailed() - - stage: arm64_build dependsOn: [] jobs: diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml index 0a1a8c10e46cd..981cbec4ef50f 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml @@ -58,9 +58,9 @@ jobs: - template: templates/get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11 + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11 Context: tools/ci_build/github/linux/docker - DockerBuildArgs: "--network=host --build-arg POLICY=manylinux2014 --build-arg PLATFORM=x86_64 --build-arg BASEIMAGE=${{variables.common_cuda_baseimg}} --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/devtoolset-11/root --build-arg PREPEND_PATH=/opt/rh/devtoolset-11/root/usr/bin: --build-arg 
LD_LIBRARY_PATH_ARG=/opt/rh/devtoolset-11/root/usr/lib64:/opt/rh/devtoolset-11/root/usr/lib:/opt/rh/devtoolset-11/root/usr/lib64/dyninst:/opt/rh/devtoolset-11/root/usr/lib/dyninst:/usr/local/lib64 --build-arg BUILD_UID=$( id -u )" + DockerBuildArgs: "--network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64 --build-arg PREPEND_PATH=/usr/local/cuda/bin --build-arg LD_LIBRARY_PATH_ARG=/usr/local/lib64 --build-arg DEVTOOLSET_ROOTPATH=/usr --build-arg BUILD_UID=$( id -u )" Repository: onnxruntimecuda11build - task: Cache@2 @@ -82,7 +82,7 @@ jobs: inputs: script: | mkdir -p $HOME/.onnx - docker run -e CC=/opt/rh/devtoolset-11/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-11/root/usr/bin/c++ -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" --rm \ + docker run -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" --rm \ --volume /data/onnx:/data/onnx:ro \ --volume $(Build.SourcesDirectory):/onnxruntime_src \ --volume $(Build.BinariesDirectory):/build \ @@ -109,7 +109,7 @@ jobs: --enable_cuda_profiling \ --enable_pybind --build_java \ --use_cache \ - --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-11/root/usr/bin/cc CMAKE_CUDA_ARCHITECTURES=75; \ + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75; \ ccache -sv; \ ccache -z" workingDirectory: $(Build.SourcesDirectory) @@ -154,9 +154,9 @@ jobs: - template: templates/get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11 + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11 Context: tools/ci_build/github/linux/docker - DockerBuildArgs: "--network=host --build-arg POLICY=manylinux2014 --build-arg PLATFORM=x86_64 --build-arg BASEIMAGE=${{variables.common_cuda_baseimg}} --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/devtoolset-11/root --build-arg PREPEND_PATH=/opt/rh/devtoolset-11/root/usr/bin: --build-arg LD_LIBRARY_PATH_ARG=/opt/rh/devtoolset-11/root/usr/lib64:/opt/rh/devtoolset-11/root/usr/lib:/opt/rh/devtoolset-11/root/usr/lib64/dyninst:/opt/rh/devtoolset-11/root/usr/lib/dyninst:/usr/local/lib64 --build-arg BUILD_UID=$( id -u )" + DockerBuildArgs: "--network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64 --build-arg PREPEND_PATH=/usr/local/cuda/bin --build-arg LD_LIBRARY_PATH_ARG=/usr/local/lib64 --build-arg DEVTOOLSET_ROOTPATH=/usr --build-arg BUILD_UID=$( id -u )" Repository: onnxruntimecuda11build - task: CmdLine@2 @@ -174,7 +174,6 @@ jobs: /bin/bash -c " set -ex; \ cp /onnxruntime_src/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt /tmp/requirements.txt; \ - sed -i \"s/git+http:\/\/github\.com\/onnx\/onnx.*/onnx/\" /tmp/requirements.txt; \ ln -s /opt/python/cp38-cp38/bin/python3 /tmp/python3; \ /tmp/python3 -m pip install -r /tmp/requirements.txt; \ /tmp/python3 -m pip install /build/Release/dist/*.whl; \ diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml index 5a43018c8023c..9450395f3cf79 100644 
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml @@ -57,9 +57,9 @@ jobs: - template: templates/get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11_8_tensorrt8_6 + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6 Context: tools/ci_build/github/linux/docker - DockerBuildArgs: "--network=host --build-arg POLICY=manylinux2014 --build-arg PLATFORM=x86_64 --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/devtoolset-11/root --build-arg PREPEND_PATH=/opt/rh/devtoolset-11/root/usr/bin: --build-arg LD_LIBRARY_PATH_ARG=/opt/rh/devtoolset-11/root/usr/lib64:/opt/rh/devtoolset-11/root/usr/lib:/opt/rh/devtoolset-11/root/usr/lib64/dyninst:/opt/rh/devtoolset-11/root/usr/lib/dyninst:/usr/local/lib64 --build-arg BUILD_UID=$( id -u )" + DockerBuildArgs: "--network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64 --build-arg PREPEND_PATH=/usr/local/cuda/bin --build-arg LD_LIBRARY_PATH_ARG=/usr/local/lib64 --build-arg DEVTOOLSET_ROOTPATH=/usr --build-arg BUILD_UID=$( id -u )" Repository: onnxruntimetensorrt86gpubuild - template: templates/linux-build-step-with-cache.yml @@ -72,7 +72,7 @@ jobs: - task: CmdLine@2 inputs: script: | - docker run --gpus all -e CC=/opt/rh/devtoolset-11/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-11/root/usr/bin/c++ -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" --rm \ + docker run --gpus all -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" --rm \ --volume /data/onnx:/data/onnx:ro \ --volume $(Build.SourcesDirectory):/onnxruntime_src \ --volume $(Build.BinariesDirectory):/build \ @@ -85,7 +85,8 @@ jobs: -e CCACHE_DIR=/cache \ onnxruntimetensorrt86gpubuild \ /bin/bash -c " - cccache -s; \ + set -ex; \ + ccache -s; \ /opt/python/cp38-cp38/bin/python3 /onnxruntime_src/tools/ci_build/build.py \ --build_dir /build --cmake_generator Ninja \ --config Release \ @@ -96,7 +97,7 @@ jobs: --enable_onnx_tests --use_cuda --cuda_version=11.8 --cuda_home=/usr/local/cuda-11.8 --cudnn_home=/usr/local/cuda-11.8 \ --enable_pybind --build_java \ --use_tensorrt --tensorrt_home /usr \ - --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-11/root/usr/bin/cc CMAKE_CUDA_ARCHITECTURES=75 \ + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75 \ --use_cache; \ ccache -sv; \ ccache -z" diff --git a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml index 340e22b474d61..f678b18ba9787 100644 --- a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml @@ -32,7 +32,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: qnn-v2.13.1.230730 + default: qnn-v2.14.1.230828 jobs: - job: Build_QNN_EP diff --git a/tools/ci_build/github/azure-pipelines/mac-ios-ci-pipeline.yml 
b/tools/ci_build/github/azure-pipelines/mac-ios-ci-pipeline.yml index 91031ca46020e..b1d7ede2843c8 100644 --- a/tools/ci_build/github/azure-pipelines/mac-ios-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/mac-ios-ci-pipeline.yml @@ -37,21 +37,6 @@ jobs: TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] timeoutInMinutes: 150 steps: - - template: templates/mac-build-step-with-cache.yml - parameters: - WithCache: true - Today: $(TODAY) - AdditionalKey: ' protobuf | "$(Agent.OS)" | $(Build.SourcesDirectory)/cmake/deps.txt, $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/install_protobuf.sh' - CacheDir: $(PROTO_CACHE_DIR) - ChangeEveryCommit: false - BuildStep: - - script: | - $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/install_protobuf.sh \ - -p $(Build.BinariesDirectory)/protobuf_install -d $(Build.SourcesDirectory)/cmake/deps.txt - displayName: Install protobuf - env: - CCACHE_DIR: $(PROTO_CACHE_DIR) - - template: templates/use-xcode-version.yml - template: templates/mac-build-step-with-cache.yml parameters: @@ -75,7 +60,6 @@ jobs: --use_xcode \ --config RelWithDebInfo \ --build_apple_framework \ - --path_to_protoc_exe $(Build.BinariesDirectory)/protobuf_install/bin/protoc \ --parallel displayName: (CPU, CoreML, XNNPACK EPs) Build onnxruntime for iOS x86_64 and run tests using simulator env: diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml index fb87d6150f39a..64fa29f06553e 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml @@ -51,19 +51,52 @@ stages: script: | ln -sf /data/models $(Build.BinariesDirectory) - - task: Bash@3 - displayName: 'Run Package Test' - inputs: - targetType: filePath - filePath: '$(Build.SourcesDirectory)/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh' - arguments: '$(Build.BinariesDirectory)/nuget-artifact $(NuGetPackageVersionNumber)' - workingDirectory: $(Build.BinariesDirectory) - env: - OnnxRuntimeBuildDirectory: $(Build.BinariesDirectory) - DisableContribOps: $(DisableContribOps) - DisableMlOps: $(DisableMlOps) - IsReleaseBuild: $(IsReleaseBuild) - PACKAGENAME: ${{ parameters.NugetPackageName }} + - ${{if contains(parameters.StageSuffix , 'GPU') }}: + - template: ../../templates/get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_cuda11_8_tensorrt8_6 + Context: tools/ci_build/github/linux/docker/ + DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" + Repository: onnxruntimepackagestest + - bash: | + docker run --rm \ + --gpus all \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --volume /data/models:/build/models:ro \ + -e BUILD_SOURCESDIRECTORY='/onnxruntime_src' \ + -e OnnxRuntimeBuildDirectory='/build' \ + -e DisableContribOps='$(DisableContribOps)' \ + -e DisableMlOps='$(DisableMlOps)' \ + -e IsReleaseBuild='$(IsReleaseBuild)' \ + -e PACKAGENAME='${{ parameters.NugetPackageName }}' \ + onnxruntimepackagestest \ + /bin/bash -c " + set -ex; \ + pushd /build; \ + bash /onnxruntime_src/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh /build/nuget-artifact $(NuGetPackageVersionNumber); \ + popd + " + displayName: 'Run Package Test' + - ${{ else }}: + - task: CmdLine@2 + displayName: 'Create 
symlink for test models' + inputs: + script: | + ln -sf /data/models $(Build.BinariesDirectory) + - task: Bash@3 + displayName: 'Run Package Test' + inputs: + targetType: filePath + filePath: '$(Build.SourcesDirectory)/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh' + arguments: '$(Build.BinariesDirectory)/nuget-artifact $(NuGetPackageVersionNumber)' + workingDirectory: $(Build.BinariesDirectory) + env: + OnnxRuntimeBuildDirectory: $(Build.BinariesDirectory) + DisableContribOps: $(DisableContribOps) + DisableMlOps: $(DisableMlOps) + IsReleaseBuild: $(IsReleaseBuild) + PACKAGENAME: ${{ parameters.NugetPackageName }} - template: ../../templates/component-governance-component-detection-steps.yml parameters: diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml index 9d27b3edca36b..007630edb25be 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml @@ -67,7 +67,7 @@ jobs: parameters: Dockerfile: tools/ci_build/github/linux/docker/inference/x64/python/cpu/Dockerfile.manylinux2_28_cpu Context: tools/ci_build/github/linux/docker/inference/x64/python/cpu - DockerBuildArgs: "--build-arg BUILD_UID=$( id -u ) --build-arg BASEIMAGE=amd64/almalinux:8 --build-arg PLATFORM=x86_64 --build-arg PREPEND_PATH=/opt/rh/gcc-toolset-12/root/usr/bin: --build-arg LD_LIBRARY_PATH_ARG=/opt/rh/gcc-toolset-12/root/usr/lib64:/opt/rh/gcc-toolset-12/root/usr/lib:/opt/rh/gcc-toolset-12/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-12/root/usr/lib/dyninst:/usr/local/lib64 --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/gcc-toolset-12/root" + DockerBuildArgs: "--build-arg BUILD_UID=$( id -u ) --build-arg BASEIMAGE=registry.access.redhat.com/ubi8/ubi --build-arg PLATFORM=x86_64 --build-arg PREPEND_PATH=/opt/rh/gcc-toolset-12/root/usr/bin: --build-arg LD_LIBRARY_PATH_ARG=/opt/rh/gcc-toolset-12/root/usr/lib64:/opt/rh/gcc-toolset-12/root/usr/lib:/opt/rh/gcc-toolset-12/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-12/root/usr/lib/dyninst:/usr/local/lib64 --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/gcc-toolset-12/root" Repository: onnxruntimecpubuildpythonx86_64 - task: Cache@2 diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml index 953e8b3d58c34..adf5695bd76eb 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml @@ -48,7 +48,7 @@ jobs: " DoNugetPack: 'false' RunInjectedPipeline: 'true' - InjectedPipeline: 'orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml' + InjectedPipeline: 'orttraining-linux-gpu-test-ci-pipeline.yml' DockerImageTag: 'onnxruntime_orttraining_ortmodule_tests_image' BuildConfig: $(buildConfig) ArtifactName: 'drop-linux' diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-training-apis.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-training-apis.yml deleted file mode 100644 index 1b456cdb13d27..0000000000000 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-training-apis.yml +++ /dev/null @@ -1,76 +0,0 @@ -##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py #### -trigger: - branches: - include: - - main - - rel-* - paths: - exclude: - - docs/** - - README.md - - CONTRIBUTING.md - - BUILD.md 
- - 'js/web' - - 'onnxruntime/core/providers/js' -pr: - branches: - include: - - main - - rel-* - paths: - exclude: - - docs/** - - README.md - - CONTRIBUTING.md - - BUILD.md - - 'js/web' - - 'onnxruntime/core/providers/js' -#### end trigger #### - -jobs: -- job: Onnxruntime_Linux_GPU_TrainingAPIs - - timeoutInMinutes: 120 - pool: 'Onnxruntime-Linux-GPU-NC6sv3' - - steps: - - checkout: self - clean: true - submodules: recursive - - - template: templates/run-docker-build-steps.yml - parameters: - RunDockerBuildArgs: | - -o ubuntu20.04 -d gpu -e \ - -t onnxruntime_training_apis_tests_image \ - -x " \ - --config RelWithDebInfo \ - --enable_training \ - --enable_training_apis \ - --use_cuda --cuda_version=11.8 --cuda_home=/usr/local/cuda-11.8 --cudnn_home=/usr/local/cuda-11.8 \ - --build_wheel \ - --build_java \ - --skip_tests \ - " \ - -u - DisplayName: 'Build onnxruntime' - - # Entry point for all ort training api tests - - script: | - docker run \ - --gpus all \ - --shm-size=1024m \ - --rm \ - --volume $(Build.SourcesDirectory):/onnxruntime_src \ - --volume $(Build.BinariesDirectory):/build \ - onnxruntime_training_apis_tests_image \ - /build/RelWithDebInfo/launch_test.py \ - --cwd /build/RelWithDebInfo --cmd_line_with_args "python orttraining_test_ort_apis.py --cwd /build/RelWithDebInfo" \ - displayName: 'Run ORT Training APIs Tests' - condition: succeededOrFailed() - timeoutInMinutes: 120 - - template: templates/component-governance-component-detection-steps.yml - parameters: - condition: 'succeeded' - - - template: templates/clean-agent-build-directory-step.yml diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-nightly-ortmodule-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-nightly-ortmodule-test-pipeline.yml index a41ca5f02467d..7824bf2203efe 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-nightly-ortmodule-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-nightly-ortmodule-test-pipeline.yml @@ -23,7 +23,7 @@ jobs: --rm \ --volume $(Build.SourcesDirectory)/orttraining/orttraining/test/python:/onnxruntime_src \ --volume $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_nightly:/requirements_torch_nightly \ - ptebic.azurecr.io/internal/azureml/aifx/nightly-ubuntu2004-cu118-py38-torch210dev \ + ptebic.azurecr.io/internal/aifx/acpt/nightly-ubuntu-cuda-torch-dev \ bash -c "python3 -m pip install -r /requirements_torch_nightly/requirements.txt && python3 -m pytest -sv /onnxruntime_src/orttraining_test_ortmodule_api.py" displayName: 'Run ORTModule Tests' condition: succeededOrFailed() diff --git a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda.yml b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda.yml index 9432abd473e27..b8dfb7f3c90a2 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda.yml @@ -15,22 +15,21 @@ stages: torch_version: '2.0.0' opset_version: '15' cuda_version: '11.8' - gcc_version: 11 - cmake_cuda_architectures: 50;52;60;61;70;75;80;86;87 - docker_file: Dockerfile.manylinux2014_training_cuda11_8 + cmake_cuda_architectures: 60;61;70;75;80;86;90 + docker_file: Dockerfile.manylinux2_28_training_cuda11_8 agent_pool: Onnxruntime-Linux-GPU upload_wheel: 'yes' debug_build: false +# Added for triton compiler team. Can be potentially removed. 
- template: templates/py-packaging-training-cuda-stage.yml parameters: build_py_parameters: --enable_training --update --build torch_version: '2.0.0' opset_version: '15' cuda_version: '11.8' - gcc_version: 11 - cmake_cuda_architectures: 50;52;60;61;70;75;80;86;87 - docker_file: Dockerfile.manylinux2014_training_cuda11_8 + cmake_cuda_architectures: 70;75;80;86 + docker_file: Dockerfile.manylinux2_28_training_cuda11_8 agent_pool: Onnxruntime-Linux-GPU upload_wheel: 'no' debug_build: true diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml index 113b24f7579ac..61f9b37d4ce78 100644 --- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml @@ -74,7 +74,6 @@ stages: isX86: false job_name_suffix: x64_RelWithDebInfo RunOnnxRuntimeTests: true - RunStaticCodeAnalysis: false ORT_EP_NAME: CUDA WITH_CACHE: true MachinePool: onnxruntime-Win2022-GPU-MultiA10 @@ -95,7 +94,6 @@ stages: isX86: false job_name_suffix: x64_RelWithDebInfo RunOnnxRuntimeTests: true - RunStaticCodeAnalysis: false ORT_EP_NAME: TRT WITH_CACHE: true MachinePool: onnxruntime-Win2022-GPU-MultiA10 @@ -114,7 +112,6 @@ stages: isX86: false job_name_suffix: x64_mimalloc RunOnnxRuntimeTests: true - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: CPU GenerateDocumentation: false @@ -134,7 +131,6 @@ stages: isX86: false job_name_suffix: x64_no_memory_profiling RunOnnxRuntimeTests: false - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: CPU GenerateDocumentation: false @@ -154,7 +150,6 @@ stages: isX86: false job_name_suffix: x64_minimal_no_exception RunOnnxRuntimeTests: true - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: CPU GenerateDocumentation: false @@ -174,7 +169,6 @@ stages: isX86: false job_name_suffix: x64_debug_node_input_output RunOnnxRuntimeTests: true - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: CPU GenerateDocumentation: false diff --git a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml index bc4aa63347e46..654ccad3af327 100644 --- a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml @@ -2,12 +2,12 @@ parameters: - name: qnn_sdk_path_win displayName: QNN Windows SDK path type: string - default: C:\data\qnnsdk\qnn-v2.13.1.230730_win + default: C:\data\qnnsdk\qnn-v2.14.1.230828_win - name: qnn_sdk_info displayName: QNN SDK Version Information type: string - default: qnn-v2.13.1.230730_win + default: qnn-v2.14.1.230828_win - name: ort_package_version displayName: OnnxRuntime Nuget package version diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index 830325b05d086..21cd3a44e8924 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -532,10 +532,9 @@ stages: - stage: Nodejs_Packaging_CPU dependsOn: - Linux_C_API_Packaging_CPU - - Linux_C_API_Packaging_GPU_TensorRT_x64 - MacOS_C_API_Package_Publish - - Windows_CI_GPU_DML_Dev - - Windows_CI_GPU_DML_Dev_arm64 + - Windows_Packaging_CPU_x64_${{ parameters.BuildVariant }} + - Windows_Packaging_CPU_arm64_${{ parameters.BuildVariant }} condition: succeeded() jobs: - job: @@ -565,13 +564,13 @@ stages: - task: 
DownloadPipelineArtifact@0 displayName: 'Download Pipeline Artifact - NuGet (Win x64)' inputs: - artifactName: 'drop-nuget-dml' + artifactName: 'onnxruntime-win-x64' targetPath: '$(Build.BinariesDirectory)/nuget-artifact' - task: DownloadPipelineArtifact@0 displayName: 'Download Pipeline Artifact - NuGet (Win ARM64)' inputs: - artifactName: 'drop-win-dml-arm64-zip' + artifactName: 'onnxruntime-win-arm64' targetPath: '$(Build.BinariesDirectory)/nuget-artifact' - task: DownloadPipelineArtifact@0 @@ -595,14 +594,14 @@ stages: - task: DownloadPipelineArtifact@0 displayName: 'Download Pipeline Artifact - Nodejs (Win x64)' inputs: - artifactName: 'drop-onnxruntime-nodejs-win-x64-dml' - targetPath: '$(Build.BinariesDirectory)/nodejs-artifacts/win32/' + artifactName: 'drop-onnxruntime-nodejs-win-x64' + targetPath: '$(Build.BinariesDirectory)/nodejs-artifacts/win32/x64/' - task: DownloadPipelineArtifact@0 displayName: 'Download Pipeline Artifact - Nodejs (Win ARM64)' inputs: - artifactName: 'drop-onnxruntime-nodejs-win-arm64-dml' - targetPath: '$(Build.BinariesDirectory)/nodejs-artifacts/win32/' + artifactName: 'drop-onnxruntime-nodejs-win-arm64' + targetPath: '$(Build.BinariesDirectory)/nodejs-artifacts/win32/arm64/' - task: DownloadPipelineArtifact@0 displayName: 'Download Pipeline Artifact - Nodejs (macOS x86_64)' @@ -619,7 +618,7 @@ stages: - task: DownloadPipelineArtifact@0 displayName: 'Download Pipeline Artifact - Nodejs (Linux x64)' inputs: - artifactName: 'drop-onnxruntime-nodejs-linux-x64-tensorrt' + artifactName: 'drop-onnxruntime-nodejs-linux-x64' targetPath: '$(Build.BinariesDirectory)/nodejs-artifacts/linux/x64/' - task: DownloadPipelineArtifact@0 diff --git a/tools/ci_build/github/azure-pipelines/templates/common-variables.yml b/tools/ci_build/github/azure-pipelines/templates/common-variables.yml index da6bfd5058177..e7f703fa592a3 100644 --- a/tools/ci_build/github/azure-pipelines/templates/common-variables.yml +++ b/tools/ci_build/github/azure-pipelines/templates/common-variables.yml @@ -1,3 +1,3 @@ variables: common_cuda_version: '11.8' - common_cuda_baseimg: 'nvidia/cuda:11.8.0-cudnn8-devel-centos7' + common_cuda_baseimg: 'nvidia/cuda:11.8.0-cudnn8-devel-ubi8' diff --git a/tools/ci_build/github/azure-pipelines/templates/compliance.yml b/tools/ci_build/github/azure-pipelines/templates/compliance.yml index f4bce8c53605b..cc451425be42a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/compliance.yml +++ b/tools/ci_build/github/azure-pipelines/templates/compliance.yml @@ -18,27 +18,6 @@ steps: AnalyzeTargetGlob: '+:file|$(Build.ArtifactStagingDirectory)\**\*.dll;-:file|$(Build.ArtifactStagingDirectory)\**\DirectML.dll' continueOnError: true -- task: DeleteFiles@1 - displayName: 'Delete files from $(Build.BinariesDirectory)\RelWithDebInfo' - inputs: - SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo' - Contents: | - **/*.obj - **/*.pdb - **/*.dll - -# Manually set msBuildCommandline so that we can also set CAExcludePath -- task: SDLNativeRules@3 - displayName: 'Run the PREfast SDL Native Rules for MSBuild' - inputs: - userProvideBuildInfo: msBuildInfo - msBuildArchitecture: x64 - msBuildVersion: 17.0 - msBuildCommandline: '"C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Current\Bin\amd64\msbuild.exe" "$(Build.BinariesDirectory)\RelWithDebInfo\onnxruntime.sln" /p:platform="${{parameters.msbuildPlatform}}" /p:configuration="RelWithDebInfo" /p:CAExcludePath="$(Build.BinariesDirectory);$(Build.SourcesDirectory)\cmake;C:\program files (x86)" 
/p:VisualStudioVersion="17.0" /m /p:PreferredToolArchitecture=x64' - excludedPaths: '$(Build.SourcesDirectory)\b#$(Build.SourcesDirectory)\cmake#C:\program files#C:\program files (x86)#C:\program files' - rulesetName: Custom - customRuleset: $(Build.SourcesDirectory)\cmake\Sdl.ruleset - - task: SdtReport@2 displayName: 'Create Security Analysis Report' inputs: diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index c5056a39e9605..f17bc8de5739b 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.78 + version: 1.0.81 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.78 + version: 1.0.81 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here. diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml index 67a03beab9362..46f2ae7b97acc 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml @@ -34,11 +34,6 @@ parameters: type: boolean default: true -- name: RunStaticCodeAnalysis - displayName: Run Static Code Analysis - type: boolean - default: true - - name: ORT_EP_NAME type: string @@ -220,49 +215,6 @@ jobs: workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}' displayName: 'Run tests' - - - ${{ if eq(parameters.RunStaticCodeAnalysis, true) }}: - - task: DeleteFiles@1 - displayName: 'Delete binaries files from $(Build.BinariesDirectory)\RelWithDebInfo' - inputs: - SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo' - Contents: | - **/*.obj - **/*.pdb - **/*.dll - - - # Manually set msBuildCommandline so that we can also set CAExcludePath - # build_dir must be a sub folder of $(Build.SourcesDirectory) - # TODO: move this step to a CPU-only machine to save GPU resources. 
- - task: SDLNativeRules@3 - displayName: 'Run the PREfast SDL Native Rules for MSBuild' - inputs: - msBuildArchitecture: amd64 - setupCommandlines: 'python $(Build.SourcesDirectory)\tools\ci_build\build.py --config RelWithDebInfo --build_dir $(Build.SourcesDirectory)\b --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 17 2022" --build_shared_lib --enable_onnx_tests ${{ parameters.additionalBuildFlags }} --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON onnxruntime_ENABLE_LTO=OFF' - msBuildCommandline: '"C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Current\Bin\amd64\msbuild.exe" "$(Build.SourcesDirectory)\b\RelWithDebInfo\onnxruntime.sln" /p:RunCodeAnalysis=true /p:platform=${{ parameters.msbuildPlatform }} /p:configuration=RelWithDebInfo /p:VisualStudioVersion="17.0" /m /p:PreferredToolArchitecture=x64' - excludedPaths: '$(Build.SourcesDirectory)\b#$(Build.SourcesDirectory)\cmake#C:\program files#C:\program files (x86)#C:\program files' - rulesetName: Custom - customRuleset: $(Build.SourcesDirectory)\cmake\Sdl.ruleset - publishXML: true - - - task: SdtReport@2 - displayName: 'Create Security Analysis Report' - inputs: - SDLNativeRules: true - - - task: PublishSecurityAnalysisLogs@3 - displayName: 'Publish Security Analysis Logs' - continueOnError: true - - - task: PostAnalysis@2 - displayName: 'Guardian Break v2' - inputs: - GdnBreakGdnToolSDLNativeRulesSeverity: Note - GdnBreakGdnToolSDLNativeRules: true - - - - ${{ if eq(parameters.RunOnnxRuntimeTests, true) }}: - task: PublishTestResults@2 displayName: 'Publish unit test results' inputs: diff --git a/tools/ci_build/github/azure-pipelines/templates/linux-cpu-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/linux-cpu-packaging-pipeline.yml index a0be955983aff..51d3a9ebc2187 100644 --- a/tools/ci_build/github/azure-pipelines/templates/linux-cpu-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/linux-cpu-packaging-pipeline.yml @@ -29,7 +29,7 @@ stages: - template: c-api-linux-cpu.yml parameters: AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} - BaseImage: 'amd64/almalinux:8' + BaseImage: 'registry.access.redhat.com/ubi8/ubi' OnnxruntimeArch: 'x64' OnnxruntimeCFlags: '-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all' OnnxruntimeCXXFlags: '-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all' diff --git a/tools/ci_build/github/azure-pipelines/templates/linux-gpu-tensorrt-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/linux-gpu-tensorrt-packaging-pipeline.yml index ec5b41fc1318a..445f739e81c45 100644 --- a/tools/ci_build/github/azure-pipelines/templates/linux-gpu-tensorrt-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/linux-gpu-tensorrt-packaging-pipeline.yml @@ -44,21 +44,15 @@ stages: submodules: recursive - template: get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11_8_tensorrt8_6 + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6 Context: tools/ci_build/github/linux/docker - DockerBuildArgs: "--network=host --build-arg POLICY=manylinux2014 --build-arg PLATFORM=x86_64 --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/devtoolset-11/root --build-arg 
PREPEND_PATH=/opt/rh/devtoolset-11/root/usr/bin: --build-arg LD_LIBRARY_PATH_ARG=/opt/rh/devtoolset-11/root/usr/lib64:/opt/rh/devtoolset-11/root/usr/lib:/opt/rh/devtoolset-11/root/usr/lib64/dyninst:/opt/rh/devtoolset-11/root/usr/lib/dyninst:/usr/local/lib64 --build-arg BUILD_UID=$( id -u )" + DockerBuildArgs: "--build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64 --build-arg PREPEND_PATH=/usr/local/cuda/bin --build-arg LD_LIBRARY_PATH_ARG=/usr/local/lib64 --build-arg DEVTOOLSET_ROOTPATH=/usr --build-arg BUILD_UID=$( id -u )" Repository: onnxruntimecuda118xtrt86build - template: set-version-number-variables-step.yml - - task: CmdLine@2 - inputs: - script: | - mkdir -p $HOME/.onnx - docker run --gpus all -e CC=/opt/rh/devtoolset-11/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-11/root/usr/bin/c++ -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e NVIDIA_VISIBLE_DEVICES=all --rm --volume /data/onnx:/data/onnx:ro --volume $(Build.SourcesDirectory):/onnxruntime_src --volume $(Build.BinariesDirectory):/build \ - --volume /data/models:/build/models:ro --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecuda118xtrt86build \ - /opt/python/cp38-cp38/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release \ - --skip_submodule_sync --parallel --build_shared_lib ${{ parameters.buildJavaOption }} ${{ parameters.buildNodejsOption }} --use_tensorrt --cuda_version=$(CUDA_VERSION) --cuda_home=/usr/local/cuda-$(CUDA_VERSION) --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-11/root/usr/bin/cc 'CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80' - workingDirectory: $(Build.SourcesDirectory) + - script: $(Build.SourcesDirectory)/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh + workingDirectory: $(Build.SourcesDirectory) + displayName: 'Build and Test' - ${{ if eq(parameters.buildJava, true) }}: - template: java-api-artifacts-package-and-publish-steps-posix.yml diff --git a/tools/ci_build/github/azure-pipelines/templates/linux-wasm-ci.yml b/tools/ci_build/github/azure-pipelines/templates/linux-wasm-ci.yml index 93945a1cb5e96..0e584b550f562 100644 --- a/tools/ci_build/github/azure-pipelines/templates/linux-wasm-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/linux-wasm-ci.yml @@ -31,6 +31,10 @@ parameters: type: boolean default: false +- name: BuildTraining + type: boolean + default: true + - name: WithCache type: boolean default: false @@ -146,6 +150,19 @@ jobs: DisplayName: 'Build and test (node) (simd)' WithCache: ${{ parameters.WithCache }} + - ${{ if eq(parameters.BuildTraining, true) }}: + - template: build-linux-wasm-step.yml + parameters: + Today: $(Today) + ${{ if eq(parameters.BuildStaticLib, true)}}: + AdditionalKey: training_wasm_simd | ${{ parameters.BuildConfig }} | static + ${{ else }}: + AdditionalKey: training_wasm_simd | ${{ parameters.BuildConfig }} + CacheDir: $(ORT_CACHE_DIR)/training_wasm_simd + Arguments: '$(CommonBuildArgs) --build_dir $(Build.BinariesDirectory)/training_wasm_simd --enable_training_apis --enable_wasm_simd --target onnxruntime_webassembly --skip_tests' + DisplayName: 'Build (training + simd)' + WithCache: ${{ parameters.WithCache }} + - ${{ if eq(parameters.BuildJsep, true) }}: - template: 
build-linux-wasm-step.yml parameters: @@ -185,6 +202,10 @@ jobs: cp $(Build.BinariesDirectory)/wasm_simd_threads_jsep/${{ parameters.BuildConfig }}/ort-wasm-simd-threaded.js $(Build.ArtifactStagingDirectory)/ort-wasm-simd-threaded.jsep.js cp $(Build.BinariesDirectory)/wasm_simd_threads_jsep/${{ parameters.BuildConfig }}/ort-wasm-simd-threaded.worker.js $(Build.ArtifactStagingDirectory)/ort-wasm-simd-threaded.jsep.worker.js fi + if [ -d $(Build.BinariesDirectory)/training_wasm_simd ]; then + cp $(Build.BinariesDirectory)/training_wasm_simd/${{ parameters.BuildConfig }}/ort-training-wasm-simd.wasm $(Build.ArtifactStagingDirectory)/ort-training-wasm-simd.wasm + cp $(Build.BinariesDirectory)/training_wasm_simd/${{ parameters.BuildConfig }}/ort-training-wasm-simd.js $(Build.ArtifactStagingDirectory)/ort-training-wasm-simd.js + fi displayName: 'Create Artifacts' - ${{ if eq(parameters.SkipPublish, false) }}: - task: PublishPipelineArtifact@0 diff --git a/tools/ci_build/github/azure-pipelines/templates/linux-web-ci.yml b/tools/ci_build/github/azure-pipelines/templates/linux-web-ci.yml deleted file mode 100644 index d2f5df1d17361..0000000000000 --- a/tools/ci_build/github/azure-pipelines/templates/linux-web-ci.yml +++ /dev/null @@ -1,147 +0,0 @@ -parameters: -- name: CommitOverride - type: boolean - default: false - -- name: BuildConfig - type: string - default: 'Debug' - -- name: NpmPackagingMode - type: string - default: 'dev' - -- name: PoolName - type: string - default: 'onnxruntime-Ubuntu2004-AMD-CPU' - -- name: PackageName - displayName: 'Package name' - type: string - default: 'NPM_packages' - -jobs: -- job: build_onnxruntime_web - pool: ${{ parameters.PoolName }} - - variables: - runCodesignValidationInjection: false - timeoutInMinutes: 60 - workspace: - clean: all - steps: - - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 - displayName: 'Clean Agent Directories' - condition: always() - - checkout: self - submodules: false - - task: DownloadPipelineArtifact@2 - inputs: - artifact: '__commit' - path: $(Pipeline.Workspace) - displayName: 'Get commit SHA' - condition: eq('${{ parameters.CommitOverride }}', 'true') - - script: | - export __commit__=<$(Pipeline.Workspace)/__commit.txt - git fetch origin +$__commit__:refs/remotes/origin/$__commit__ - git checkout --force $__commit__ - workingDirectory: '$(Build.SourcesDirectory)' - displayName: 'Read commit SHA and checkout' - condition: eq('${{ parameters.CommitOverride }}', 'true') - - script: | - echo.$(Build.SourceVersion)>$(Pipeline.Workspace)/__commit.txt - workingDirectory: '$(Build.SourcesDirectory)' - displayName: 'Write commit SHA to __commit.txt' - condition: ne('${{ parameters.CommitOverride }}', 'true') - - script: | - git submodule sync -- cmake/external/onnx - git submodule update --init -- cmake/external/onnx - workingDirectory: '$(Build.SourcesDirectory)' - displayName: 'Checkout submodule onnx' - - task: NodeTool@0 - inputs: - versionSpec: '16.x' - - task: DownloadPipelineArtifact@2 - inputs: - patterns: '${{ parameters.BuildConfig }}_*/**/*' - path: $(Pipeline.Workspace)/artifacts - displayName: 'Download WebAssembly artifacts' - - task: CopyFiles@2 - inputs: - sourceFolder: $(Pipeline.Workspace)/artifacts - contents: | - **/*.wasm - targetFolder: $(Build.SourcesDirectory)/js/web/dist - flattenFolders: true - displayName: 'Binplace dist files' - - task: CopyFiles@2 - inputs: - sourceFolder: $(Pipeline.Workspace)/artifacts - contents: | - **/*.js - targetFolder: 
$(Build.SourcesDirectory)/js/web/lib/wasm/binding - flattenFolders: true - displayName: 'Binplace js files' - - template: linux-web-init-and-check.yml - - task: Cache@2 - inputs: - key: onnxtestdata | $(Build.SourcesDirectory)\js\scripts\prepare-onnx-node-tests.ts - restoreKeys: | - onnxtestdata | $(Build.SourcesDirectory)\js\scripts\prepare-onnx-node-tests.ts - path: $(Build.SourcesDirectory)/js/test/ - cacheHitVar: CACHE_RESTORED - displayName: 'Cache ONNX node test data' - - task: Bash@3 - inputs: - targetType: 'inline' - script: find "$(Build.SourcesDirectory)/js/test/" -type f - condition: and(not(canceled()), eq(variables.CACHE_RESTORED, 'true')) - displayName: 'List ONNX node test data' - - task: PowerShell@2 - inputs: - filePath: '$(Build.SourcesDirectory)/tools/ci_build/github/js/pack-npm-packages.ps1' - arguments: '$(NpmPackagingMode) $(Build.SourcesDirectory) web' - workingDirectory: $(Build.BinariesDirectory) - errorActionPreference: stop - displayName: 'Pack NPM packages' - - script: | - npm test - workingDirectory: '$(Build.SourcesDirectory)/js/web' - displayName: 'Run ort-web tests' - - script: | - npm test -- --webgl-texture-pack-mode -b=webgl - workingDirectory: '$(Build.SourcesDirectory)/js/web' - displayName: 'Run ort-web tests - WebGL: packed mode' - - script: | - npm test -- --wasm-enable-proxy -b=wasm - workingDirectory: '$(Build.SourcesDirectory)/js/web' - displayName: 'Run ort-web tests - WebAssembly: proxy' - condition: and(succeeded(), eq('${{ parameters.BuildConfig }}', 'Release')) - - script: | - npm run test:e2e - workingDirectory: '$(Build.SourcesDirectory)/js/web' - displayName: 'E2E package consuming test' - condition: and(succeeded(), eq('${{ parameters.BuildConfig }}', 'Release')) - - task: CopyFiles@2 - inputs: - sourceFolder: $(Build.SourcesDirectory)/js/common - contents: onnxruntime-common-*.tgz - targetFolder: $(Build.ArtifactStagingDirectory) - displayName: 'Create Artifacts (onnxruntime-common)' - condition: and(succeeded(), eq('${{ parameters.BuildConfig }}', 'Release')) - - task: CopyFiles@2 - inputs: - sourceFolder: $(Build.SourcesDirectory)/js/web - contents: onnxruntime-web-*.tgz - targetFolder: $(Build.ArtifactStagingDirectory) - displayName: 'Create Artifacts (onnxruntime-web)' - condition: and(succeeded(), eq('${{ parameters.BuildConfig }}', 'Release')) - - task: PublishPipelineArtifact@0 - inputs: - artifactName: '${{ parameters.PackageName }}' - targetPath: '$(Build.ArtifactStagingDirectory)' - displayName: 'Publish Pipeline Artifact' - condition: and(succeeded(), eq('${{ parameters.BuildConfig }}', 'Release')) - - template: component-governance-component-detection-steps.yml - parameters : - condition : 'succeeded' diff --git a/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml similarity index 76% rename from tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml rename to tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml index 1c0a54b850c07..5dc156e301357 100644 --- a/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml @@ -37,3 +37,17 @@ steps: displayName: 'Run orttraining_ortmodule_tests.py' condition: succeededOrFailed() timeoutInMinutes: 60 + +# Entry point for all 
ort training api tests +- script: | + docker run \ + --gpus all \ + --shm-size=1024m \ + --rm \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory)/${{ parameters.BuildConfig }}:/build \ + ${{ parameters.DockerImageTag }} \ + bash -c "rm -rf /build/onnxruntime/ && python3 -m pip install /build/dist/onnxruntime*.whl && /build/launch_test.py --cmd_line_with_args 'python orttraining_test_ort_apis.py --cwd /build' --cwd /build" \ + displayName: 'Run ORT Training APIs Tests' + condition: succeededOrFailed() + timeoutInMinutes: 120 diff --git a/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml b/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml index 087d2cfee5f6b..3d5a71284fa6f 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml @@ -36,9 +36,9 @@ jobs: - template: get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11_8_tensorrt8_6 + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6 Context: tools/ci_build/github/linux/docker - DockerBuildArgs: "--network=host --build-arg POLICY=manylinux2014 --build-arg PLATFORM=x86_64 --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/devtoolset-11/root --build-arg PREPEND_PATH=/opt/rh/devtoolset-11/root/usr/bin: --build-arg LD_LIBRARY_PATH_ARG=/opt/rh/devtoolset-11/root/usr/lib64:/opt/rh/devtoolset-11/root/usr/lib:/opt/rh/devtoolset-11/root/usr/lib64/dyninst:/opt/rh/devtoolset-11/root/usr/lib/dyninst:/usr/local/lib64 --build-arg BUILD_UID=$( id -u ) --build-arg PLATFORM=${{ parameters.arch }}" + DockerBuildArgs: "--network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64 --build-arg PREPEND_PATH=/usr/local/cuda/bin --build-arg LD_LIBRARY_PATH_ARG=/usr/local/lib64 --build-arg DEVTOOLSET_ROOTPATH=/usr --build-arg BUILD_UID=$( id -u ) --build-arg PLATFORM=${{ parameters.arch }}" Repository: onnxruntimecuda118xtrt86build${{ parameters.arch }} diff --git a/tools/ci_build/github/azure-pipelines/templates/py-linux.yml b/tools/ci_build/github/azure-pipelines/templates/py-linux.yml index 8375ef4061302..0774c3350b9b1 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-linux.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-linux.yml @@ -64,7 +64,7 @@ jobs: parameters: Dockerfile: tools/ci_build/github/linux/docker/inference/x64/python/cpu/Dockerfile.manylinux2_28_cpu Context: tools/ci_build/github/linux/docker/inference/x64/python/cpu - DockerBuildArgs: "--build-arg BUILD_UID=$( id -u ) --build-arg BASEIMAGE=${{ parameters.base_image }} --build-arg PLATFORM=${{ parameters.arch }} --build-arg PREPEND_PATH=${{ parameters.prepend_path }} --build-arg LD_LIBRARY_PATH_ARG=${{ parameters.ld_library_path_arg }} --build-arg DEVTOOLSET_ROOTPATH=${{ parameters.devtoolset_rootpath }}" + DockerBuildArgs: "--build-arg POLICY=manylinux_2_28 --build-arg BUILD_UID=$( id -u ) --build-arg BASEIMAGE=${{ parameters.base_image }} --build-arg PLATFORM=${{ parameters.arch }} --build-arg PREPEND_PATH=${{ parameters.prepend_path }} --build-arg LD_LIBRARY_PATH_ARG=${{ parameters.ld_library_path_arg }} --build-arg DEVTOOLSET_ROOTPATH=${{ parameters.devtoolset_rootpath }}" Repository: onnxruntimecpubuildpython${{ parameters.arch }} ${{ if eq(parameters.arch, 'aarch64') }}: UpdateDepsTxt: false diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml 
b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml index 7ec41c8768998..1305f5ae21725 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml @@ -246,24 +246,6 @@ stages: workingDirectory: '$(Build.BinariesDirectory)\$(BuildConfig)\$(BuildConfig)' displayName: 'Run Python Tests' - #Skip it for 32 bits x86 build. Currently the scan tool has a bug: it doesn't allow me use 64 bits link.exe - #in 32 bits Win32 build. I tried all the settings but they all don't work. - - task: SDLNativeRules@3 - displayName: 'Run the PREfast SDL Native Rules for MSBuild' - condition: and (succeeded(), and(eq(variables['buildArch'], 'x64'), eq(variables['PythonVersion'], '3.8'))) - inputs: - msBuildArchitecture: amd64 - setupCommandlines: 'python $(Build.SourcesDirectory)\tools\ci_build\build.py --config Debug --build_dir $(Build.SourcesDirectory)\b --skip_submodule_sync --cmake_generator "Visual Studio 17 2022" --enable_pybind --enable_onnx_tests --parallel $(TelemetryOption) --update --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON onnxruntime_ENABLE_LTO=OFF' - msBuildCommandline: '"C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Current\Bin\amd64\msbuild.exe" "$(Build.SourcesDirectory)\b\Debug\onnxruntime.sln" /p:RunCodeAnalysis=true /p:platform="$(MsbuildPlatform)" /p:configuration=Debug /p:VisualStudioVersion="17.0" /m /p:PreferredToolArchitecture=x64' - excludedPaths: '$(Build.SourcesDirectory)\b#$(Build.SourcesDirectory)\cmake#C:\program files#C:\program files (x86)#C:\program files' - rulesetName: Custom - customRuleset: $(Build.SourcesDirectory)\cmake\Sdl.ruleset - - - task: SdtReport@2 - displayName: 'Create Security Analysis Report' - inputs: - SDLNativeRules: true - - task: TSAUpload@2 displayName: 'TSA upload' condition: and(and (succeeded(), and(eq(variables['buildArch'], 'x64'), eq(variables['PythonVersion'], '3.8'))), eq(variables['Build.SourceBranch'], 'refs/heads/main')) @@ -515,7 +497,7 @@ stages: parameters: arch: 'x86_64' machine_pool: 'onnxruntime-Ubuntu2004-AMD-CPU' - base_image: 'amd64/almalinux:8' + base_image: 'registry.access.redhat.com/ubi8/ubi' devtoolset_rootpath: /opt/rh/gcc-toolset-12/root ld_library_path_arg: /opt/rh/gcc-toolset-12/root/usr/lib64:/opt/rh/gcc-toolset-12/root/usr/lib:/opt/rh/gcc-toolset-12/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-12/root/usr/lib/dyninst:/usr/local/lib64 prepend_path: '/opt/rh/gcc-toolset-12/root/usr/bin:' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml index ee25ea0a08743..7fdd7e54e752d 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml @@ -25,11 +25,6 @@ parameters: cmake_cuda_architectures type: string -- name: gcc_version - displayName: > - gcc_version. - type: number - - name: docker_file displayName: > docker_file. 
@@ -84,28 +79,24 @@ stages: TorchVersion: ${{ parameters.torch_version }} OpsetVersion: ${{ parameters.opset_version }} CudaVersion: ${{ parameters.cuda_version }} - GccVersion: ${{ parameters.gcc_version }} UploadWheel: ${{ parameters.upload_wheel }} Python39: PythonVersion: '3.9' TorchVersion: ${{ parameters.torch_version }} OpsetVersion: ${{ parameters.opset_version }} CudaVersion: ${{ parameters.cuda_version }} - GccVersion: ${{ parameters.gcc_version }} UploadWheel: ${{ parameters.upload_wheel }} Python310: PythonVersion: '3.10' TorchVersion: ${{ parameters.torch_version }} OpsetVersion: ${{ parameters.opset_version }} CudaVersion: ${{ parameters.cuda_version }} - GccVersion: ${{ parameters.gcc_version }} UploadWheel: ${{ parameters.upload_wheel }} Python311: PythonVersion: '3.11' TorchVersion: ${{ parameters.torch_version }} OpsetVersion: ${{ parameters.opset_version }} CudaVersion: ${{ parameters.cuda_version }} - GccVersion: ${{ parameters.gcc_version }} UploadWheel: ${{ parameters.upload_wheel }} steps: @@ -133,10 +124,10 @@ stages: --build-arg PYTHON_VERSION=$(PythonVersion) --build-arg INSTALL_DEPS_EXTRA_ARGS=-tu --build-arg BUILD_UID=$(id -u) - --network=host --build-arg POLICY=manylinux2014 --build-arg PLATFORM=x86_64 - --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/devtoolset-$(GccVersion)/root - --build-arg PREPEND_PATH=/opt/rh/devtoolset-$(GccVersion)/root/usr/bin: - --build-arg LD_LIBRARY_PATH_ARG=/opt/rh/devtoolset-$(GccVersion)/root/usr/lib64:/opt/rh/devtoolset-$(GccVersion)/root/usr/lib:/opt/rh/devtoolset-$(GccVersion)/root/usr/lib64/dyninst:/opt/rh/devtoolset-$(GccVersion)/root/usr/lib/dyninst:/usr/local/lib64 + --network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64 + --build-arg DEVTOOLSET_ROOTPATH=/usr + --build-arg PREPEND_PATH=/usr/local/cuda/bin: + --build-arg LD_LIBRARY_PATH_ARG=/usr/local/lib64 Repository: onnxruntimetraininggpubuild - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/mnist" -d "/mnist" @@ -155,8 +146,9 @@ stages: displayName: 'build onnxruntime' inputs: script: | + set -e -x mkdir -p $HOME/.onnx - docker run --rm -e CC=/opt/rh/devtoolset-$(GccVersion)/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-$(GccVersion)/root/usr/bin/c++ -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" \ + docker run --rm -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" \ --volume /data/onnx:/data/onnx:ro \ --volume $(Build.SourcesDirectory):/onnxruntime_src \ --volume $(Build.BinariesDirectory):/build \ @@ -176,7 +168,7 @@ stages: --build_wheel \ --enable_onnx_tests \ ${{ parameters.build_py_parameters }} \ - --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-$(GccVersion)/root/usr/bin/cc 'CMAKE_CUDA_ARCHITECTURES=${{ parameters.cmake_cuda_architectures }}' onnxruntime_BUILD_UNIT_TESTS=OFF \ + --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=${{ parameters.cmake_cuda_architectures }}' onnxruntime_BUILD_UNIT_TESTS=OFF \ 
--use_cuda --cuda_version=$(CudaVersion) --cuda_home=/usr/local/cuda-$(CudaVersion) --cudnn_home=/usr/local/cuda-$(CudaVersion) ; workingDirectory: $(Build.SourcesDirectory) diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml index ef938a634554a..919749cac15b6 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml @@ -22,65 +22,6 @@ parameters: default: '' jobs: -- ${{ if eq(parameters.PYTHON_VERSION, '3.8') }}: - - job: Win_py_${{ parameters.EP_NAME }}_Wheels_StaticAnalysis - timeoutInMinutes: 240 - workspace: - clean: all - pool: onnxruntime-Win-CPU-2022 - steps: - - checkout: self - clean: true - submodules: none - - task: UsePythonVersion@0 - inputs: - versionSpec: 3.8 - addToPath: true - architecture: 'x64' - - task: onebranch.pipeline.tsaoptions@1 - displayName: 'OneBranch TSAOptions' - inputs: - tsaConfigFilePath: '$(Build.SourcesDirectory)\.config\tsaoptions.json' - appendSourceBranchName: false - - - template: download-deps.yml - - - template: jobs/set-winenv.yml - parameters: - EnvSetupScript: ${{ parameters.ENV_SETUP_SCRIPT }} - DownloadCUDA: true - - - task: PythonScript@0 - displayName: 'Update deps.txt' - inputs: - scriptPath: $(Build.SourcesDirectory)/tools/ci_build/replace_urls_in_deps.py - arguments: --new_dir $(Build.BinariesDirectory)/deps - workingDirectory: $(Build.BinariesDirectory) - - - task: SDLNativeRules@3 - displayName: 'Run the PREfast SDL Native Rules for MSBuild' - inputs: - msBuildArchitecture: amd64 - setupCommandlines: 'python $(Build.SourcesDirectory)\tools\ci_build\build.py --config Debug --build_dir $(Build.SourcesDirectory)\b --skip_submodule_sync --cmake_generator "Visual Studio 17 2022" --enable_pybind ${{ parameters.BUILD_PY_PARAMETERS }} ${{ parameters.EP_BUILD_FLAGS }} --update --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON onnxruntime_ENABLE_LTO=OFF' - msBuildCommandline: '"C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Current\Bin\amd64\msbuild.exe" "$(Build.SourcesDirectory)\b\Debug\onnxruntime.sln" /p:RunCodeAnalysis=true /p:platform=x64 /p:configuration=Debug /p:VisualStudioVersion="17.0" /m /p:PreferredToolArchitecture=x64' - excludedPaths: '$(Build.SourcesDirectory)\b#$(Build.SourcesDirectory)\cmake#C:\program files#C:\program files (x86)#C:\program files' - rulesetName: Custom - customRuleset: $(Build.SourcesDirectory)\cmake\Sdl.ruleset - publishXML: true - - - task: SdtReport@2 - displayName: 'Create Security Analysis Report' - inputs: - SDLNativeRules: true - - - task: TSAUpload@2 - displayName: 'TSA upload' - condition: and (succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main')) - inputs: - GdnPublishTsaOnboard: false - GdnPublishTsaConfigFile: '$(Build.sourcesDirectory)\.gdn\.gdntsa' - - - job: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }} timeoutInMinutes: 240 workspace: diff --git a/tools/ci_build/github/azure-pipelines/templates/rocm.yml b/tools/ci_build/github/azure-pipelines/templates/rocm.yml index 6d085472621e5..cc2e8745e8946 100644 --- a/tools/ci_build/github/azure-pipelines/templates/rocm.yml +++ b/tools/ci_build/github/azure-pipelines/templates/rocm.yml @@ -50,7 +50,11 @@ jobs: DockerBuildArgs: >- --build-arg INSTALL_DEPS_EXTRA_ARGS=-tmur --build-arg BUILD_UID=$(id -u) - --network=host + --network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64 
+ --build-arg ROCM_VERSION=$(RocmVersion) + --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/gcc-toolset-12/root + --build-arg PREPEND_PATH=/opt/rh/gcc-toolset-12/root/usr/bin: + --build-arg LD_LIBRARY_PATH_ARG=/opt/rh/gcc-toolset-12/root/usr/lib64:/opt/rh/gcc-toolset-12/root/usr/lib:/opt/rh/gcc-toolset-12/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-12/root/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib --build-arg ROCM_VERSION=${{ parameters.RocmVersion }} Repository: onnxruntimetrainingrocmbuild-rocm${{ parameters.RocmVersion }} @@ -63,7 +67,7 @@ jobs: --network=host \ --cap-add=SYS_PTRACE \ --security-opt seccomp=unconfined \ - -e CC=/opt/rh/devtoolset-10/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-10/root/usr/bin/c++ -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" \ + -e CC=/opt/rh/gcc-toolset-12/root/usr/bin/cc -e CXX=/opt/rh/gcc-toolset-12/root/usr/bin/c++ -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" \ --volume $(Build.SourcesDirectory):/onnxruntime_src \ --volume $(Build.BinariesDirectory):/build \ --workdir /onnxruntime_src \ @@ -87,7 +91,7 @@ jobs: --enable_training \ --cmake_extra_defines \ CMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \ - onnxruntime_BUILD_UNIT_TESTS=OFF \ + onnxruntime_BUILD_UNIT_TESTS=OFF FETCHCONTENT_TRY_FIND_PACKAGE_MODE=NEVER \ ${{ variables['EnableProfiling'] }} workingDirectory: $(Build.SourcesDirectory) displayName: 'Build onnxruntime (in container)' diff --git a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml index 15254ce4d1d5b..81f17a26b16a6 100644 --- a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml @@ -116,9 +116,11 @@ stages: xcodeDeveloperDir: '/Applications/Xcode_${{ variables.xcodeVersion }}.app/Contents/Developer' signingOption: 'manual' signingIdentity: '$(APPLE_CERTIFICATE_SIGNING_IDENTITY)' - provisioningProfileName: 'iOS Team Provisioning Profile' + provisioningProfileName: 'temporary *' # temporary name, change it back to the original below later + #provisioningProfileName: 'iOS Team Provisioning Profile' args: '-derivedDataPath $(Build.BinariesDirectory)/app_center_test/ios_package_test/DerivedData' workingDirectory: '$(Build.BinariesDirectory)/app_center_test/ios_package_test/' + useXcpretty: false # xcpretty can hide useful error output so we will disable it displayName: 'Build App Center iPhone arm64 tests' - script: | diff --git a/tools/ci_build/github/azure-pipelines/templates/web-ci.yml b/tools/ci_build/github/azure-pipelines/templates/web-ci.yml index e0a85cc1973bc..0b7bd3f645442 100644 --- a/tools/ci_build/github/azure-pipelines/templates/web-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/web-ci.yml @@ -27,14 +27,14 @@ parameters: - name: WASMTemplate type: string default: win-wasm-ci.yml -- name: WebTemplate - type: string - default: win-web-ci.yml # parameter couldn't 
be compared by string, so add one boolean parameter. - name: UseWebPoolName type: boolean default: false -- name: RunWebGpuTests +- name: RunWebGpuTestsForReleaseBuild + type: boolean + default: false +- name: RunWebGpuTestsForDebugBuild type: boolean default: false - name: WebGpuPoolName @@ -104,16 +104,20 @@ stages: - stage: Build_web_Debug dependsOn: Build_wasm_Debug jobs: - - template: ${{ parameters.WebTemplate }} + - template: win-web-ci.yml parameters: CommitOverride: true BuildConfig: 'Debug' NpmPackagingMode: ${{ parameters.NpmPackagingMode }} ${{ if eq(parameters.UseWebPoolName, true)}}: - PoolName: ${{ parameters.WebCpuPoolName }} + ${{ if eq(parameters.RunWebGpuTestsForDebugBuild, true)}}: + PoolName: ${{ parameters.WebGpuPoolName }} + ${{ else }}: + PoolName: ${{ parameters.WebCpuPoolName }} ${{ else }}: PoolName: ${{ parameters.PoolName }} PackageName: ${{ parameters.PackageName }} + RunWebGpuTests: ${{ parameters.RunWebGpuTestsForDebugBuild }} - stage: Build_wasm_Release dependsOn: Precheck_and_extract_commit @@ -145,20 +149,20 @@ stages: - stage: Build_web_Release dependsOn: Build_wasm_Release jobs: - - template: ${{ parameters.WebTemplate }} + - template: win-web-ci.yml parameters: CommitOverride: true BuildConfig: 'Release' NpmPackagingMode: ${{ parameters.NpmPackagingMode }} ${{ if eq(parameters.UseWebPoolName, true)}}: - ${{ if eq(parameters.RunWebGpuTests, true)}}: + ${{ if eq(parameters.RunWebGpuTestsForReleaseBuild, true)}}: PoolName: ${{ parameters.WebGpuPoolName }} ${{ else }}: PoolName: ${{ parameters.WebCpuPoolName }} ${{ else }}: PoolName: ${{ parameters.PoolName }} PackageName: ${{ parameters.PackageName }} - RunWebGpuTests: ${{ parameters.RunWebGpuTests }} + RunWebGpuTests: ${{ parameters.RunWebGpuTestsForReleaseBuild }} # Disable BrowserStack test # TODO: fix and re-enable in PostMerge test diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index f6da7bb857b7d..80d285f3fd3fb 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -263,25 +263,6 @@ stages: AnalyzeTargetGlob: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\**\*.dll' continueOnError: true - - task: DeleteFiles@1 - displayName: 'Delete files from $(Build.BinariesDirectory)\RelWithDebInfo' - inputs: - SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo' - Contents: | - **/*.obj - **/*.pdb - **/*.dll - - #Manually set msBuildCommandline so that we can also set CAExcludePath - - task: SDLNativeRules@3 - displayName: 'Run the PREfast SDL Native Rules for MSBuild' - condition: and (succeeded(), eq(variables['msbuildPlatform'], 'x64')) - inputs: - msBuildArchitecture: amd64 - setupCommandlines: 'python $(Build.SourcesDirectory)\tools\ci_build\build.py --config Debug --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "$(VSGenerator)" --enable_onnx_tests $(TelemetryOption) ${{ parameters.buildparameter }} --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON' - msBuildCommandline: '"C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Current\Bin\amd64\msbuild.exe" "$(Build.BinariesDirectory)\Debug\onnxruntime.sln" /p:platform="$(MsbuildPlatform)" /p:configuration=Debug /p:VisualStudioVersion="17.0" /m /p:PreferredToolArchitecture=x64' - excludedPaths: '$(Build.BinariesDirectory)#$(Build.SourcesDirectory)\cmake#C:\program files (x86)' - - 
task: PostAnalysis@2 inputs: GdnBreakAllTools: false diff --git a/tools/ci_build/github/azure-pipelines/web-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/web-ci-pipeline.yml index 38b4814a4cb0c..63dabf5eab9d9 100644 --- a/tools/ci_build/github/azure-pipelines/web-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/web-ci-pipeline.yml @@ -57,9 +57,9 @@ stages: BuildStaticLib: true ExtraBuildArgs: $(ExtraBuildArgs) WASMTemplate: linux-wasm-ci.yml - WebTemplate: win-web-ci.yml UseWebPoolName: true - RunWebGpuTests: true + RunWebGpuTestsForDebugBuild: false + RunWebGpuTestsForReleaseBuild: true WebGpuPoolName: 'onnxruntime-Win2022-webgpu-A10' WebCpuPoolName: 'onnxruntime-Win-CPU-2022-web' WithCache: true diff --git a/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml index b9b833a3155bf..2a5622faf2905 100644 --- a/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml @@ -47,7 +47,6 @@ stages: isX86: false job_name_suffix: x64_debug RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: CPU GenerateDocumentation: false @@ -69,7 +68,6 @@ stages: isX86: false job_name_suffix: x64_release RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: CPU GenerateDocumentation: false @@ -89,7 +87,6 @@ stages: isX86: false job_name_suffix: x64_release RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: DNNL GenerateDocumentation: false @@ -111,7 +108,6 @@ stages: isX86: false job_name_suffix: x64_release RunOnnxRuntimeTests: true - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: XNNPACK GenerateDocumentation: false @@ -132,7 +128,6 @@ stages: job_name_suffix: x64_release_winml RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} # WinML has many warnings - RunStaticCodeAnalysis: false EnablePython: false isTraining: false ORT_EP_NAME: CPU @@ -153,7 +148,6 @@ stages: isX86: true job_name_suffix: x86_release RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: CPU GenerateDocumentation: false @@ -173,7 +167,6 @@ stages: isX86: false job_name_suffix: training_x64_debug RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false isTraining: true ORT_EP_NAME: CPU GenerateDocumentation: false @@ -193,7 +186,6 @@ stages: isX86: false job_name_suffix: training_x64_release RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: true isTraining: true ORT_EP_NAME: CPU GenerateDocumentation: false @@ -213,7 +205,6 @@ stages: isX86: false job_name_suffix: ort_training_apis_x64_release RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false EnablePython: false isTraining: true ORT_EP_NAME: CPU @@ -234,7 +225,6 @@ stages: isX86: false job_name_suffix: x64_release_azure RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false EnablePython: false isTraining: false ORT_EP_NAME: CPU diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml index c7cfa31e53cc2..8796917afa37d 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml +++ 
b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml @@ -47,7 +47,6 @@ stages: isX86: false job_name_suffix: x64_RelWithDebInfo RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false ORT_EP_NAME: CUDA WITH_CACHE: true MachinePool: onnxruntime-Win2022-GPU-A10 @@ -65,10 +64,9 @@ stages: isX86: false job_name_suffix: x64_RelWithDebInfo RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false ORT_EP_NAME: CUDA WITH_CACHE: true - # Some unit tests crash on A10 GPUs. So this job still needs to use A10. + # Some unit tests crash on A10 GPUs. So this job still needs to use T4. MachinePool: onnxruntime-Win2022-GPU-T4 isTraining: true @@ -85,7 +83,6 @@ stages: isX86: false job_name_suffix: x64_RelWithDebInfo RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false ORT_EP_NAME: DML WITH_CACHE: true MachinePool: onnxruntime-Win2022-GPU-dml-A10 @@ -104,7 +101,6 @@ stages: isX86: false job_name_suffix: x64_RelWithDebInfo RunOnnxRuntimeTests: false - RunStaticCodeAnalysis: false GenerateDocumentation: true ORT_EP_NAME: CUDA # It doesn't really matter which EP is selected here since this stage is for documentation. WITH_CACHE: true diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml index 2a5cb722e2002..b36a25034b19e 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml @@ -32,7 +32,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: qnn-v2.13.1.230730_win + default: qnn-v2.14.1.230828_win jobs: - job: 'build' diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml index 64fd578b6591c..7db0c7302cd6f 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml @@ -32,7 +32,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: qnn-v2.13.1.230730_win + default: qnn-v2.14.1.230828_win jobs: - job: 'build' diff --git a/tools/ci_build/github/linux/build_cuda_c_api_package.sh b/tools/ci_build/github/linux/build_cuda_c_api_package.sh index ad37d6dbd3e4f..5cd1c8c243050 100755 --- a/tools/ci_build/github/linux/build_cuda_c_api_package.sh +++ b/tools/ci_build/github/linux/build_cuda_c_api_package.sh @@ -1,10 +1,11 @@ #!/bin/bash +set -e -x export CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" export CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" docker run --gpus all -e CFLAGS -e CXXFLAGS -e NVIDIA_VISIBLE_DEVICES=all --rm --volume \ $BUILD_SOURCESDIRECTORY:/onnxruntime_src --volume $BUILD_BINARIESDIRECTORY:/build \ --volume /data/models:/build/models:ro --volume /data/onnx:/data/onnx:ro -e NIGHTLY_BUILD onnxruntimecuda11centosbuild \ -python3 /onnxruntime_src/tools/ci_build/build.py --build_java --build_dir /build --config Release \ ---skip_submodule_sync --parallel --nvcc_threads=1 --build_shared_lib --use_cuda --cuda_version=$CUDA_VERSION \ +/usr/bin/python3.9 /onnxruntime_src/tools/ci_build/build.py --build_java --build_nodejs --build_dir /build --config Release \ +--skip_submodule_sync 
--parallel --build_shared_lib --use_cuda --cuda_version=$CUDA_VERSION \ --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr/local/cuda-$CUDA_VERSION \ ---cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80' +--cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80' diff --git a/tools/ci_build/github/linux/build_rocm_c_api_package.sh b/tools/ci_build/github/linux/build_rocm_c_api_package.sh index 4d0af63893643..957f1f8a812a5 100755 --- a/tools/ci_build/github/linux/build_rocm_c_api_package.sh +++ b/tools/ci_build/github/linux/build_rocm_c_api_package.sh @@ -40,7 +40,7 @@ docker run --rm \ --use_rocm --rocm_version=$ROCM_VERSION --rocm_home $ROCM_HOME --nccl_home $ROCM_HOME \ --build_shared_lib \ --skip_submodule_sync \ - --skip_tests \ + --skip_tests --cmake_extra_defines FETCHCONTENT_TRY_FIND_PACKAGE_MODE=NEVER EXIT_CODE=$? diff --git a/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh b/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh new file mode 100755 index 0000000000000..18a32e3599391 --- /dev/null +++ b/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -e -x +export CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" +export CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" +mkdir -p $HOME/.onnx +docker run --gpus all -e CFLAGS -e CXXFLAGS -e NVIDIA_VISIBLE_DEVICES=all --rm --volume /data/onnx:/data/onnx:ro --volume $BUILD_SOURCESDIRECTORY:/onnxruntime_src --volume $BUILD_BINARIESDIRECTORY:/build \ +--volume /data/models:/build/models:ro --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecuda118xtrt86build \ +/opt/python/cp38-cp38/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release \ +--skip_submodule_sync --parallel --build_shared_lib --build_java --build_nodejs --use_tensorrt --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80' diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu index 1895c75b3d2f1..a9a1e6b39a8cb 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu @@ -158,7 +158,8 @@ CMD ["/bin/bash"] ENV PATH ${DEVTOOLSET_ROOTPATH}/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin ADD scripts /tmp/scripts -RUN cd /tmp/scripts && /tmp/scripts/manylinux/install_centos.sh && /tmp/scripts/manylinux/install_deps.sh && rm -rf /tmp/scripts +RUN cd /tmp/scripts && /tmp/scripts/manylinux/install_centos.sh +RUN cd /tmp/scripts && /tmp/scripts/manylinux/install_deps.sh && rm -rf /tmp/scripts ARG BUILD_UID=1001 ARG BUILD_USER=onnxruntimedev diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11 similarity index 98% rename from tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11 rename to tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11 index dc52fb51d6389..dab8df6703c4f 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11 +++ 
b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11 @@ -1,5 +1,5 @@ -ARG BASEIMAGE=nvidia/cuda:11.4.2-cudnn8-devel-centos7 -ARG POLICY=manylinux2014 +ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 +ARG POLICY=manylinux_2_28 ARG PLATFORM=x86_64 ARG DEVTOOLSET_ROOTPATH= ARG LD_LIBRARY_PATH_ARG= diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11_6_tensorrt8_4 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_6_tensorrt8_4 similarity index 100% rename from tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11_6_tensorrt8_4 rename to tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_6_tensorrt8_4 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11_6_tensorrt8_5 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_6_tensorrt8_5 similarity index 100% rename from tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11_6_tensorrt8_5 rename to tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_6_tensorrt8_5 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11_8_tensorrt8_6 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6 similarity index 98% rename from tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11_8_tensorrt8_6 rename to tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6 index accdcbe2cc40d..3c0ac22e38b5a 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11_8_tensorrt8_6 +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6 @@ -1,5 +1,5 @@ -ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-centos7 -ARG POLICY=manylinux2014 +ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 +ARG POLICY=manylinux_2_28 ARG PLATFORM=x86_64 ARG DEVTOOLSET_ROOTPATH= ARG LD_LIBRARY_PATH_ARG= @@ -168,7 +168,7 @@ CMD ["/bin/bash"] #Install TensorRT 8.6.1.6 #RUN yum install -y wget RUN v="8.6.1.6-1.cuda11.8" &&\ - yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo &&\ + yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo &&\ yum -y install libnvinfer8-${v} libnvparsers8-${v} libnvonnxparsers8-${v} libnvinfer-plugin8-${v} libnvinfer-vc-plugin8-${v}\ libnvinfer-devel-${v} libnvparsers-devel-${v} libnvonnxparsers-devel-${v} libnvinfer-plugin-devel-${v} libnvinfer-vc-plugin-devel-${v} libnvinfer-headers-devel-${v} libnvinfer-headers-plugin-devel-${v} diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm index 57c2fd99b6d5c..10ce8f0ed65f7 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm @@ -8,16 +8,9 @@ ARG PREPEND_PATH=${DEVTOOLSET_ROOTPATH}/usr/bin: FROM $BASEIMAGE AS base_image ARG ROCM_VERSION=5.5 -# Enable epel-release repositories -RUN yum --enablerepo=extras install -y epel-release - -# Install the ROCm rpms -RUN yum clean all -RUN echo -e "[ROCm]\nname=ROCm\nbaseurl=https://repo.radeon.com/rocm/yum/$ROCM_VERSION/main\nenabled=1\ngpgcheck=0" >> /etc/yum.repos.d/rocm.repo - -RUN echo -e "[amdgpu]\nname=amdgpu\nbaseurl=https://repo.radeon.com/amdgpu/$ROCM_VERSION/rhel/7.9/main/x86_64\nenabled=1\ngpgcheck=0" >> /etc/yum.repos.d/amdgpu.repo - -RUN yum install -y rocm-dev +#Add our 
own dependencies +ADD scripts /tmp/scripts +RUN /tmp/scripts/setup_rocm_yum_repo.sh -r ${ROCM_VERSION} # Set ENV ENV PATH=/opt/rocm/hcc/bin:/opt/rocm/hip/bin:/opt/rocm/bin${PATH:+:${PATH}} @@ -52,7 +45,6 @@ COPY build_scripts/fixup-mirrors.sh /usr/local/sbin/fixup-mirrors # setup entrypoint, this will wrap commands with `linux32` with i686 images COPY build_scripts/install-entrypoint.sh \ - build_scripts/update-system-packages.sh \ build_scripts/build_utils.sh \ /build_scripts/ @@ -61,7 +53,6 @@ COPY manylinux-entrypoint /usr/local/bin/manylinux-entrypoint ENTRYPOINT ["manylinux-entrypoint"] COPY build_scripts/install-runtime-packages.sh \ - build_scripts/update-system-packages.sh \ build_scripts/build_utils.sh \ /build_scripts/ RUN manylinux-entrypoint /build_scripts/install-runtime-packages.sh && rm -rf /build_scripts/ @@ -164,7 +155,6 @@ COPY --from=build_git /manylinux-rootfs / COPY --from=build_cpython /manylinux-rootfs / COPY --from=all_python /opt/_internal /opt/_internal/ COPY build_scripts/finalize.sh \ - build_scripts/update-system-packages.sh \ build_scripts/python-tag-abi-tag.py \ build_scripts/requirements3.8.txt \ build_scripts/requirements3.9.txt \ @@ -185,8 +175,7 @@ ARG PYTHON_VERSION=3.8 ARG OPSET_VERSION=15 ARG INSTALL_DEPS_EXTRA_ARGS -#Add our own dependencies -ADD scripts /tmp/scripts + RUN cd /tmp/scripts && \ /tmp/scripts/manylinux/install_centos.sh && \ /tmp/scripts/install_os_deps.sh -d gpu $INSTALL_DEPS_EXTRA_ARGS && \ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda11_8 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8 similarity index 99% rename from tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda11_8 rename to tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8 index 5d774460073ed..326e15d58456a 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda11_8 +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8 @@ -1,4 +1,4 @@ -ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-centos7 +ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 ARG POLICY=manylinux2014 ARG PLATFORM=x86_64 ARG DEVTOOLSET_ROOTPATH= diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_cuda11_8_tensorrt8_6 b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_cuda11_8_tensorrt8_6 new file mode 100644 index 0000000000000..c211fa9b9e2b8 --- /dev/null +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_cuda11_8_tensorrt8_6 @@ -0,0 +1,53 @@ +# -------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------- +# Dockerfile to run ONNXRuntime with TensorRT integration + +# Build base image with required system packages +FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 AS base + +# The local directory into which to build and install CMAKE +ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code + +ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${ONNXRUNTIME_LOCAL_CODE_DIR}/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH} +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update &&\ + apt-get install -y sudo git bash unattended-upgrades wget +RUN unattended-upgrade + +# Install python3 +RUN apt-get install -y --no-install-recommends \ + python3 \ + python3-pip \ + python3-dev \ + python3-wheel &&\ + cd /usr/local/bin &&\ + ln -s /usr/bin/python3 python &&\ + ln -s /usr/bin/pip3 pip; + +RUN pip install --upgrade pip +RUN pip install setuptools>=41.0.0 + +# Install TensorRT +RUN v="8.6.1.6-1+cuda11.8" &&\ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ + apt-get update &&\ + sudo apt-get install -y libnvinfer8=${v} libnvonnxparsers8=${v} libnvparsers8=${v} libnvinfer-plugin8=${v} libnvinfer-lean8=${v} libnvinfer-vc-plugin8=${v} libnvinfer-dispatch8=${v}\ + libnvinfer-headers-dev=${v} libnvinfer-headers-plugin-dev=${v} libnvinfer-dev=${v} libnvonnxparsers-dev=${v} libnvparsers-dev=${v} libnvinfer-plugin-dev=${v} libnvinfer-lean-dev=${v} libnvinfer-vc-plugin-dev=${v} libnvinfer-dispatch-dev=${v}\ + python3-libnvinfer=${v} libnvinfer-samples=${v} tensorrt-dev=${v} tensorrt-libs=${v} + +# Install Valgrind +RUN apt-get install -y valgrind + +ADD scripts /tmp/scripts +RUN cd /tmp/scripts && /tmp/scripts/install_dotnet.sh && rm -rf /tmp/scripts + +# Build final image from base. 
+FROM base as final +ARG BUILD_USER=onnxruntimedev +ARG BUILD_UID=1000 +RUN adduser --uid $BUILD_UID $BUILD_USER +WORKDIR /home/$BUILD_USER +USER $BUILD_USER diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4 index dc616c9711f08..10f404c7c6a85 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4 +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4 @@ -12,7 +12,7 @@ ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime ARG ONNXRUNTIME_BRANCH=main ARG CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80 -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:/code/cmake-3.26.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH} +ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:/code/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH} ENV DEBIAN_FRONTEND=noninteractive diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5 index 0c57ed1463d27..cacc09f0c7455 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5 +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5 @@ -10,7 +10,7 @@ FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 AS base # The local directory into which to build and install CMAKE ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${ONNXRUNTIME_LOCAL_CODE_DIR}/cmake-3.26.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH} +ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${ONNXRUNTIME_LOCAL_CODE_DIR}/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH} ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update &&\ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6 index c79e1720f8794..0a4885e774047 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6 +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6 @@ -10,7 +10,7 @@ FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 AS base # The local directory into which to build and install CMAKE ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${ONNXRUNTIME_LOCAL_CODE_DIR}/cmake-3.26.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH} +ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${ONNXRUNTIME_LOCAL_CODE_DIR}/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH} ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update &&\ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino index 86d513a4f7677..a0ba5ea232ca3 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino @@ -35,8 +35,8 @@ RUN wget "https://github.com/intel/compute-runtime/releases/download/21.48.21782 sudo dpkg -i *.deb && rm -rf *.deb RUN mkdir -p /opt/cmake/bin && \ - wget https://github.com/Kitware/CMake/releases/download/v3.26.3/cmake-3.26.3-linux-x86_64.tar.gz && \ - tar -xf cmake-3.26.3-linux-x86_64.tar.gz --strip 1 -C /opt/cmake && rm -rf /cmake-3.26.3-linux-x86_64.tar.gz && \ + wget 
https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3-linux-x86_64.tar.gz && \ + tar -xf cmake-3.27.3-linux-x86_64.tar.gz --strip 1 -C /opt/cmake && rm -rf /cmake-3.27.3-linux-x86_64.tar.gz && \ ln -sf /opt/cmake/bin/* /usr/bin ARG BUILD_UID=1000 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin index 0071bf5013e7d..c9308ade37396 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin @@ -21,7 +21,7 @@ ARG TAR_CUDNN_VERSION # Directory containing TensorRT tar.gz installation package ARG TRT_BINS_DIR=. -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/code/cmake-3.26.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH} +ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/code/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH} ENV DEBIAN_FRONTEND=noninteractive diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile index fccc282446be7..2cd054e6246bc 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile @@ -6,8 +6,8 @@ ARG BASEIMAGE=arm64v8/almalinux:8 FROM $BASEIMAGE ENV PATH /opt/rh/gcc-toolset-12/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin -ENV LANG=en_US.utf8 -ENV LC_ALL=en_US.utf8 +ENV LANG=en_US.UTF-8 +ENV LC_ALL=en_US.UTF-8 ADD scripts /tmp/scripts RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_centos.sh b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_centos.sh index b85cf8e8a83f7..a1ade39e57e16 100755 --- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_centos.sh +++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_centos.sh @@ -4,7 +4,6 @@ set -e -x os_major_version=$(cat /etc/redhat-release | tr -dc '0-9.'|cut -d \. 
-f1) echo "installing for CentOS version : $os_major_version" -dnf install -y glibc-langpack-\* -yum install -y which gdb redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel java-11-openjdk-devel graphviz gcc-toolset-12-binutils gcc-toolset-12-gcc gcc-toolset-12-gcc-c++ gcc-toolset-12-gcc-gfortran -localedef -i en_US -f UTF-8 en_US.UTF-8 +dnf install -y glibc-langpack-\* glibc-locale-source which gdb redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel java-11-openjdk-devel graphviz gcc-toolset-12-binutils gcc-toolset-12-gcc gcc-toolset-12-gcc-c++ gcc-toolset-12-gcc-gfortran +locale \ No newline at end of file diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh index 61189b6277052..7ecd0525c7e7e 100755 --- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh +++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh @@ -39,8 +39,8 @@ mkdir -p /tmp/src cd /tmp/src echo "Installing cmake" -GetFile https://github.com/Kitware/CMake/releases/download/v3.26.3/cmake-3.26.3-linux-`uname -m`.tar.gz /tmp/src/cmake-3.26.3-linux-`uname -m`.tar.gz -tar -zxf /tmp/src/cmake-3.26.3-linux-`uname -m`.tar.gz --strip=1 -C /usr +GetFile https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3-linux-`uname -m`.tar.gz /tmp/src/cmake-3.27.3-linux-`uname -m`.tar.gz +tar -zxf /tmp/src/cmake-3.27.3-linux-`uname -m`.tar.gz --strip=1 -C /usr echo "Installing Ninja" GetFile https://github.com/ninja-build/ninja/archive/v1.10.0.tar.gz /tmp/src/ninja-linux.tar.gz diff --git a/tools/ci_build/github/linux/docker/inference/x64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x64/default/cpu/Dockerfile index 892fb19865ca3..0324f377b8e9e 100644 --- a/tools/ci_build/github/linux/docker/inference/x64/default/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x64/default/cpu/Dockerfile @@ -6,8 +6,8 @@ ARG BASEIMAGE=amd64/almalinux:8 FROM $BASEIMAGE ENV PATH /opt/rh/gcc-toolset-12/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin -ENV LANG=en_US.utf8 -ENV LC_ALL=en_US.utf8 +ENV LANG=en_US.UTF-8 +ENV LC_ALL=en_US.UTF-8 ADD scripts /tmp/scripts RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts diff --git a/tools/ci_build/github/linux/docker/inference/x64/default/cpu/scripts/install_centos.sh b/tools/ci_build/github/linux/docker/inference/x64/default/cpu/scripts/install_centos.sh index b85cf8e8a83f7..8e18a237a807e 100755 --- a/tools/ci_build/github/linux/docker/inference/x64/default/cpu/scripts/install_centos.sh +++ b/tools/ci_build/github/linux/docker/inference/x64/default/cpu/scripts/install_centos.sh @@ -4,7 +4,6 @@ set -e -x os_major_version=$(cat /etc/redhat-release | tr -dc '0-9.'|cut -d \. 
-f1) echo "installing for CentOS version : $os_major_version" -dnf install -y glibc-langpack-\* -yum install -y which gdb redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel java-11-openjdk-devel graphviz gcc-toolset-12-binutils gcc-toolset-12-gcc gcc-toolset-12-gcc-c++ gcc-toolset-12-gcc-gfortran -localedef -i en_US -f UTF-8 en_US.UTF-8 +dnf install -y python39-devel glibc-langpack-\* glibc-locale-source which gdb redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel java-11-openjdk-devel graphviz gcc-toolset-12-binutils gcc-toolset-12-gcc gcc-toolset-12-gcc-c++ gcc-toolset-12-gcc-gfortran +locale \ No newline at end of file diff --git a/tools/ci_build/github/linux/docker/inference/x64/default/cpu/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/inference/x64/default/cpu/scripts/install_deps.sh index 61189b6277052..3b05c6787ca3e 100755 --- a/tools/ci_build/github/linux/docker/inference/x64/default/cpu/scripts/install_deps.sh +++ b/tools/ci_build/github/linux/docker/inference/x64/default/cpu/scripts/install_deps.sh @@ -39,8 +39,8 @@ mkdir -p /tmp/src cd /tmp/src echo "Installing cmake" -GetFile https://github.com/Kitware/CMake/releases/download/v3.26.3/cmake-3.26.3-linux-`uname -m`.tar.gz /tmp/src/cmake-3.26.3-linux-`uname -m`.tar.gz -tar -zxf /tmp/src/cmake-3.26.3-linux-`uname -m`.tar.gz --strip=1 -C /usr +GetFile https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3-linux-`uname -m`.tar.gz /tmp/src/cmake-3.27.3-linux-`uname -m`.tar.gz +tar -zxf /tmp/src/cmake-3.27.3-linux-`uname -m`.tar.gz --strip=1 -C /usr echo "Installing Ninja" GetFile https://github.com/ninja-build/ninja/archive/v1.10.0.tar.gz /tmp/src/ninja-linux.tar.gz @@ -64,16 +64,5 @@ fi GetFile https://nodejs.org/dist/v18.17.1/node-v18.17.1-linux-${NODEJS_ARCH}.tar.gz /tmp/src/node-v18.17.1-linux-${NODEJS_ARCH}.tar.gz tar --strip 1 -xf /tmp/src/node-v18.17.1-linux-${NODEJS_ARCH}.tar.gz -C /usr -# The Python version in CentOS 7's python3 package is no longer supported (3.6) so we will build Python from source. -echo "Installing Python" -PYTHON_VERSION="3.8.17" -GetFile https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz /tmp/src/Python-${PYTHON_VERSION}.tgz -tar -zxf Python-${PYTHON_VERSION}.tgz -pushd Python-${PYTHON_VERSION} -./configure -make -make install -popd - cd / rm -rf /tmp/src diff --git a/tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile new file mode 100644 index 0000000000000..386759890d085 --- /dev/null +++ b/tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile @@ -0,0 +1,17 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+ +# This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline +FROM nvidia/cuda:11.8.0-cudnn8-devel-ubi8 + +ENV LANG=en_US.UTF-8 +ENV LC_ALL=en_US.UTF-8 + +ADD scripts /tmp/scripts +RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts + +ARG BUILD_UID=1001 +ARG BUILD_USER=onnxruntimedev +RUN adduser --uid $BUILD_UID $BUILD_USER +WORKDIR /home/$BUILD_USER +USER $BUILD_USER diff --git a/tools/ci_build/github/linux/docker/inference/x64/default/gpu/scripts/install_centos.sh b/tools/ci_build/github/linux/docker/inference/x64/default/gpu/scripts/install_centos.sh new file mode 100755 index 0000000000000..3cf259dc7240e --- /dev/null +++ b/tools/ci_build/github/linux/docker/inference/x64/default/gpu/scripts/install_centos.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -e -x + +os_major_version=$(cat /etc/redhat-release | tr -dc '0-9.'|cut -d \. -f1) + +echo "installing for CentOS version : $os_major_version" + +dnf install -y python39-devel python3-devel glibc-langpack-\* glibc-locale-source which gdb redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel java-11-openjdk-devel +locale \ No newline at end of file diff --git a/tools/ci_build/github/linux/docker/inference/x64/default/gpu/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/inference/x64/default/gpu/scripts/install_deps.sh new file mode 100755 index 0000000000000..eb6d3315b97ef --- /dev/null +++ b/tools/ci_build/github/linux/docker/inference/x64/default/gpu/scripts/install_deps.sh @@ -0,0 +1,68 @@ +#!/bin/bash +set -e -x + +# Download a file from internet +function GetFile { + local uri=$1 + local path=$2 + local force=${3:-false} + local download_retries=${4:-5} + local retry_wait_time_seconds=${5:-30} + + if [[ -f $path ]]; then + if [[ $force = false ]]; then + echo "File '$path' already exists. Skipping download" + return 0 + else + rm -rf $path + fi + fi + + if [[ -f $uri ]]; then + echo "'$uri' is a file path, copying file to '$path'" + cp $uri $path + return $? + fi + + echo "Downloading $uri" + # Use aria2c if available, otherwise use curl + if command -v aria2c > /dev/null; then + aria2c -q -d $(dirname $path) -o $(basename $path) "$uri" + else + curl "$uri" -sSL --retry $download_retries --retry-delay $retry_wait_time_seconds --create-dirs -o "$path" --fail + fi + + return $? +} +mkdir -p /tmp/src + +cd /tmp/src + +echo "Installing cmake" +GetFile https://github.com/Kitware/CMake/releases/download/v3.26.3/cmake-3.26.3-linux-`uname -m`.tar.gz /tmp/src/cmake-3.26.3-linux-`uname -m`.tar.gz +tar -zxf /tmp/src/cmake-3.26.3-linux-`uname -m`.tar.gz --strip=1 -C /usr + +echo "Installing Ninja" +GetFile https://github.com/ninja-build/ninja/archive/v1.10.0.tar.gz /tmp/src/ninja-linux.tar.gz +tar -zxf ninja-linux.tar.gz +pushd ninja-1.10.0 +cmake -Bbuild-cmake -H. 
+cmake --build build-cmake +mv ./build-cmake/ninja /usr/bin +popd + +echo "Installing Node.js" +CPU_ARCH=`uname -m` +if [[ "$CPU_ARCH" = "x86_64" ]]; then + NODEJS_ARCH=x64 +elif [[ "$CPU_ARCH" = "aarch64" ]]; then + NODEJS_ARCH=arm64 +else + NODEJS_ARCH=$CPU_ARCH +fi +# The EOL for nodejs v18.17.1 LTS is April 2025 +GetFile https://nodejs.org/dist/v18.17.1/node-v18.17.1-linux-${NODEJS_ARCH}.tar.gz /tmp/src/node-v18.17.1-linux-${NODEJS_ARCH}.tar.gz +tar --strip 1 -xf /tmp/src/node-v18.17.1-linux-${NODEJS_ARCH}.tar.gz -C /usr + +cd / +rm -rf /tmp/src diff --git a/tools/ci_build/github/linux/docker/manylinux-entrypoint b/tools/ci_build/github/linux/docker/manylinux-entrypoint deleted file mode 100755 index 06ea40efa998f..0000000000000 --- a/tools/ci_build/github/linux/docker/manylinux-entrypoint +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -set -eu - -if [ "${AUDITWHEEL_ARCH}" == "i686" ]; then - linux32 "$@" -else - exec "$@" -fi diff --git a/tools/ci_build/github/linux/docker/manylinux.patch b/tools/ci_build/github/linux/docker/manylinux.patch index 7750118d01bb6..f1821f9197525 100644 --- a/tools/ci_build/github/linux/docker/manylinux.patch +++ b/tools/ci_build/github/linux/docker/manylinux.patch @@ -50,6 +50,35 @@ index 961e34d..55ae11b 100755 make install > /dev/null } +diff --git a/finalize.sh b/finalize.sh +index 621eab9..4cbcf90 100755 +--- a/finalize.sh ++++ b/finalize.sh +@@ -86,6 +86,3 @@ clean_pyc /opt/_internal + rm -rf /root/.cache + + hardlink -cv /opt/_internal +- +-# update system packages +-LC_ALL=C ${MY_DIR}/update-system-packages.sh +diff --git a/install-build-packages.sh b/install-build-packages.sh +index 408bc33..b45ceba 100755 +--- a/install-build-packages.sh ++++ b/install-build-packages.sh +@@ -9,12 +9,11 @@ set -exuo pipefail + # make sure the corresponding library is added to RUNTIME_DEPS if applicable + + if [ "${AUDITWHEEL_POLICY}" == "manylinux2014" ] || [ "${AUDITWHEEL_POLICY}" == "manylinux_2_28" ]; then +- COMPILE_DEPS="bzip2-devel ncurses-devel readline-devel gdbm-devel libpcap-devel xz-devel openssl openssl-devel keyutils-libs-devel krb5-devel libcom_err-devel libidn-devel curl-devel uuid-devel libffi-devel kernel-headers libdb-devel" ++ COMPILE_DEPS="bzip2-devel ncurses-devel gdbm-devel xz-devel openssl openssl-devel keyutils-libs-devel krb5-devel libcom_err-devel curl-devel libffi-devel kernel-headers libdb-devel" + if [ "${AUDITWHEEL_POLICY}" == "manylinux2014" ]; then + PACKAGE_MANAGER=yum + else + PACKAGE_MANAGER=dnf +- COMPILE_DEPS="${COMPILE_DEPS} tk-devel" + fi + elif [ "${AUDITWHEEL_POLICY}" == "musllinux_1_1" ]; then + PACKAGE_MANAGER=apk diff --git a/install-entrypoint.sh b/install-entrypoint.sh index 9ef1e99..ec52833 100755 --- a/install-entrypoint.sh @@ -65,9 +94,27 @@ index 9ef1e99..ec52833 100755 +fi \ No newline at end of file diff --git a/install-runtime-packages.sh b/install-runtime-packages.sh -index 137d2e2..21b60a7 100755 +index 137d2e2..4269afb 100755 --- a/install-runtime-packages.sh +++ b/install-runtime-packages.sh +@@ -33,7 +33,7 @@ source $MY_DIR/build_utils.sh + + # MANYLINUX_DEPS: Install development packages (except for libgcc which is provided by gcc install) + if [ "${AUDITWHEEL_POLICY}" == "manylinux2014" ] || [ "${AUDITWHEEL_POLICY}" == "manylinux_2_28" ]; then +- MANYLINUX_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel mesa-libGL-devel libICE-devel libSM-devel zlib-devel expat-devel" ++ MANYLINUX_DEPS="glibc-devel libstdc++-devel glib2-devel zlib-devel expat-devel" + elif [ 
"${AUDITWHEEL_POLICY}" == "musllinux_1_1" ]; then + MANYLINUX_DEPS="musl-dev libstdc++ glib-dev libx11-dev libxext-dev libxrender-dev mesa-dev libice-dev libsm-dev zlib-dev expat-dev" + else +@@ -54,7 +54,7 @@ else + exit 1 + fi + +-BASETOOLS="autoconf automake bison bzip2 diffutils file make patch unzip" ++BASETOOLS="autoconf automake bzip2 diffutils file make patch unzip" + if [ "${AUDITWHEEL_POLICY}" == "manylinux2014" ]; then + PACKAGE_MANAGER=yum + BASETOOLS="${BASETOOLS} hardlink hostname which" @@ -73,9 +73,11 @@ if [ "${AUDITWHEEL_POLICY}" == "manylinux2014" ]; then if [ "${AUDITWHEEL_ARCH}" == "x86_64" ]; then # Software collection (for devtoolset-10) @@ -83,3 +130,41 @@ index 137d2e2..21b60a7 100755 elif [ "${AUDITWHEEL_ARCH}" == "aarch64" ] || [ "${AUDITWHEEL_ARCH}" == "ppc64le" ] || [ "${AUDITWHEEL_ARCH}" == "s390x" ]; then # Software collection (for devtoolset-10) yum -y install centos-release-scl-rh +@@ -86,19 +88,18 @@ if [ "${AUDITWHEEL_POLICY}" == "manylinux2014" ]; then + fi + elif [ "${AUDITWHEEL_POLICY}" == "manylinux_2_28" ]; then + PACKAGE_MANAGER=dnf +- BASETOOLS="${BASETOOLS} curl glibc-locale-source glibc-langpack-en hardlink hostname libcurl libnsl libxcrypt which" ++ BASETOOLS="${BASETOOLS} yum-utils curl glibc-locale-source glibc-langpack-en hardlink hostname libcurl libxcrypt which" + # See https://unix.stackexchange.com/questions/41784/can-yum-express-a-preference-for-x86-64-over-i386-packages + echo "multilib_policy=best" >> /etc/yum.conf + # Error out if requested packages do not exist + echo "skip_missing_names_on_install=False" >> /etc/yum.conf + # Make sure that locale will not be removed + sed -i '/^override_install_langs=/d' /etc/yum.conf +- dnf -y upgrade + dnf -y install dnf-plugins-core +- dnf config-manager --set-enabled powertools # for yasm +- TOOLCHAIN_DEPS="gcc-toolset-12-binutils gcc-toolset-12-gcc gcc-toolset-12-gcc-c++ gcc-toolset-12-gcc-gfortran" +- if [ "${AUDITWHEEL_ARCH}" == "x86_64" ]; then +- TOOLCHAIN_DEPS="${TOOLCHAIN_DEPS} yasm" ++ if [[ -d /usr/local/cuda ]]; then ++ TOOLCHAIN_DEPS="gcc gcc-c++" ++ else ++ TOOLCHAIN_DEPS="gcc-toolset-12-binutils gcc-toolset-12-gcc gcc-toolset-12-gcc-c++ gcc-toolset-12-gcc-gfortran" + fi + elif [ "${AUDITWHEEL_POLICY}" == "musllinux_1_1" ]; then + TOOLCHAIN_DEPS="binutils gcc g++ gfortran" +@@ -121,12 +122,6 @@ else + exit 1 + fi + +-# update system packages, we already updated them but +-# the following script takes care of cleaning-up some things +-# and since it's also needed in the finalize step, everything's +-# centralized in this script to avoid code duplication +-LC_ALL=C ${MY_DIR}/update-system-packages.sh +- + if [ "${BASE_POLICY}" == "manylinux" ]; then + # we'll be removing libcrypt.so.1 later on + # this is needed to ensure the new one will be found diff --git a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile index d1b1df39b4811..7d2c818d08920 100644 --- a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile +++ b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile @@ -45,10 +45,10 @@ ENV LANG C.UTF-8 WORKDIR /stage # Cmake -ENV CMAKE_VERSION=3.26.3 +ENV CMAKE_VERSION=3.27.3 RUN cd /usr/local && \ wget -q https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz && \ - tar -zxf /usr/local/cmake-3.26.3-Linux-x86_64.tar.gz --strip=1 -C /usr + tar -zxf /usr/local/cmake-3.27.3-Linux-x86_64.tar.gz --strip=1 
-C /usr # ccache RUN mkdir -p /tmp/ccache && \ diff --git a/tools/ci_build/github/linux/docker/scripts/install_dotnet.sh b/tools/ci_build/github/linux/docker/scripts/install_dotnet.sh new file mode 100755 index 0000000000000..b9accb134b26d --- /dev/null +++ b/tools/ci_build/github/linux/docker/scripts/install_dotnet.sh @@ -0,0 +1,21 @@ +#!/bin/bash +set -e -x + +if [ -f /etc/redhat-release ]; then + dnf update --refresh -y \ + && dnf install -y dotnet-sdk-6.0 +elif [ -f /etc/os-release ]; then + # Get Ubuntu version + declare repo_version=$(if command -v lsb_release &> /dev/null; then lsb_release -r -s; else grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"'; fi) + # Download Microsoft signing key and repository + wget https://packages.microsoft.com/config/ubuntu/$repo_version/packages-microsoft-prod.deb -O packages-microsoft-prod.deb + # Install Microsoft signing key and repository + dpkg -i packages-microsoft-prod.deb + # Clean up + rm packages-microsoft-prod.deb + # Update packages + apt-get update && apt-get install -y dotnet-sdk-6.0 +else + echo "Unsupported OS" + exit 1 +fi diff --git a/tools/ci_build/github/linux/docker/scripts/install_os_deps.sh b/tools/ci_build/github/linux/docker/scripts/install_os_deps.sh index 796adfea6c302..3e872d17504a1 100755 --- a/tools/ci_build/github/linux/docker/scripts/install_os_deps.sh +++ b/tools/ci_build/github/linux/docker/scripts/install_os_deps.sh @@ -71,18 +71,18 @@ if [[ $SYS_LONG_BIT = "64" && "$GLIBC_VERSION" -gt "9" ]]; then tar --strip 1 -xf /tmp/azcopy/azcopy.tar.gz -C /tmp/azcopy cp /tmp/azcopy/azcopy /usr/bin echo "Installing cmake" - GetFile https://github.com/Kitware/CMake/releases/download/v3.26.3/cmake-3.26.3-Linux-x86_64.tar.gz /tmp/src/cmake-3.26.3-Linux-x86_64.tar.gz - tar -zxf /tmp/src/cmake-3.26.3-Linux-x86_64.tar.gz --strip=1 -C /usr + GetFile https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3-Linux-x86_64.tar.gz /tmp/src/cmake-3.27.3-Linux-x86_64.tar.gz + tar -zxf /tmp/src/cmake-3.27.3-Linux-x86_64.tar.gz --strip=1 -C /usr echo "Installing Node.js" # The EOL for nodejs v18.17.1 LTS is April 2025 GetFile https://nodejs.org/dist/v18.17.1/node-v18.17.1-linux-x64.tar.xz /tmp/src/node-v18.17.1-linux-x64.tar.xz tar -xf /tmp/src/node-v18.17.1-linux-x64.tar.xz --strip=1 -C /usr else echo "Installing cmake" - GetFile https://github.com/Kitware/CMake/releases/download/v3.26.3/cmake-3.26.3.tar.gz /tmp/src/cmake-3.26.3.tar.gz - tar -xf /tmp/src/cmake-3.26.3.tar.gz -C /tmp/src + GetFile https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3.tar.gz /tmp/src/cmake-3.27.3.tar.gz + tar -xf /tmp/src/cmake-3.27.3.tar.gz -C /tmp/src pushd . - cd /tmp/src/cmake-3.26.3 + cd /tmp/src/cmake-3.27.3 ./bootstrap --prefix=/usr --parallel=$(getconf _NPROCESSORS_ONLN) --system-bzip2 --system-curl --system-zlib --system-expat make -j$(getconf _NPROCESSORS_ONLN) make install diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps.sh b/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps.sh index c34abbd2ba873..a1cb4be5b72c9 100755 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps.sh +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps.sh @@ -13,6 +13,9 @@ else exit 1 fi +# Install dotnet +source $(cd "$(dirname "${BASH_SOURCE[0]}")/.." &> /dev/null && pwd)/install_dotnet.sh + if [ ! 
-d "/opt/conda/bin" ]; then PYTHON_EXES=("/opt/python/cp38-cp38/bin/python3.8" "/opt/python/cp39-cp39/bin/python3.9" "/opt/python/cp310-cp310/bin/python3.10" "/opt/python/cp311-cp311/bin/python3.11") else diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/install_shared_deps.sh b/tools/ci_build/github/linux/docker/scripts/manylinux/install_shared_deps.sh old mode 100644 new mode 100755 diff --git a/tools/ci_build/github/linux/docker/scripts/setup_rocm_yum_repo.sh b/tools/ci_build/github/linux/docker/scripts/setup_rocm_yum_repo.sh new file mode 100755 index 0000000000000..fcd9086061227 --- /dev/null +++ b/tools/ci_build/github/linux/docker/scripts/setup_rocm_yum_repo.sh @@ -0,0 +1,43 @@ +#!/bin/bash +set -e -x + +# version +ROCM_VERSION=5.6 + +while getopts "r:" parameter_Option +do case "${parameter_Option}" +in +r) ROCM_VERSION=${OPTARG};; +esac +done + +tee /etc/yum.repos.d/amdgpu.repo < #include diff --git a/winml/test/model/model_tests.cpp b/winml/test/model/model_tests.cpp index 43b904ce77ad0..0b4c10eac9142 100644 --- a/winml/test/model/model_tests.cpp +++ b/winml/test/model/model_tests.cpp @@ -1,4 +1,5 @@ #include "testPch.h" +#include #include "test/onnx/TestCase.h" #include "test/onnx/heap_buffer.h" #include "test/util/include/test/compare_ortvalue.h" @@ -387,7 +388,7 @@ std::string GetFullNameOfTest(ITestCase* testCase, winml::LearningModelDeviceKin } std::replace_if( - name.begin(), name.end(), [](char c) { return !google::protobuf::ascii_isalnum(c); }, '_' + name.begin(), name.end(), [](char c) { return !absl::ascii_isalnum(c); }, '_' ); // Determine if test should be skipped, using the generic name (no CPU or GPU suffix yet).